diff options
-rw-r--r-- | CHANGELOG | 15 | ||||
-rw-r--r-- | beautifulsoup/__init__.py | 18 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 4 | ||||
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 7 | ||||
-rw-r--r-- | tests/test_lxml.py | 9 |
5 files changed, 45 insertions, 8 deletions
@@ -99,6 +99,21 @@ sections into ordinary text elements: In theory it's possible to preserve the CDATA sections when using the XML parser, but I don't see how to get it to work in practice. +== Miscellaneous other stuff == + +If the BeautifulSoup instance has .is_xml set to True, an appropriate +XML declaration will be emitted when the tree is transformed into a +string: + + <?xml version="1.0" encoding="utf-8"> + <markup> + ... + </markup> + +The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree +builders set it to False. If you want to parse XHTML with an HTML +parser, you can set it manually. + = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index ce39d33..cee55e7 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -66,7 +66,7 @@ import re from util import isList, buildSet from builder import builder_registry from dammit import UnicodeDammit -from element import NavigableString, Tag +from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag class BeautifulSoup(Tag): @@ -122,6 +122,7 @@ class BeautifulSoup(Tag): % ",".join(features)) builder = builder_class() self.builder = builder + self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only @@ -261,6 +262,21 @@ class BeautifulSoup(Tag): def handle_data(self, data): self.currentData.append(data) + def decode(self, pretty_print=False, indent_level=0, + eventual_encoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'<?xml version="1.0"%s>\n' % encoding_part + else: + prefix = u'' + return prefix + super(BeautifulSoup, self).decode( + pretty_print, indent_level, eventual_encoding) + class StopParsing(Exception): pass diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index fb10628..10c6b7f 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -77,7 +77,7 @@ class TreeBuilder(object): features = [] - assume_html = False + is_xml = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -185,8 +185,6 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - assume_html = True - preserve_whitespace_tags = set(['pre', 'textarea']) empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py index 4c7a826..23ac485 100644 --- a/beautifulsoup/builder/_lxml.py +++ b/beautifulsoup/builder/_lxml.py @@ -20,6 +20,8 @@ LXML = 'lxml' class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser + is_xml = True + # Well, it's permissive by XML parser standards. features = [LXML, XML, FAST, PERMISSIVE] @@ -87,10 +89,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.handle_data(content) self.soup.endData(Comment) + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment + class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = [LXML, HTML, FAST] + is_xml = False @property def default_parser(self): diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 8f3d798..a96fbbb 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -525,6 +525,7 @@ class TestLXMLXMLBuilder(SoupTest): def default_builder(self): return LXMLTreeBuilderForXML() + def test_cdata_becomes_text(self): # LXML sends CData sections as 'data' events, so we can't # create special CData objects for them. We have to use @@ -556,20 +557,20 @@ class TestLXMLXMLBuilder(SoupTest): self.assertTrue(soup.bar.is_empty_element) soup.bar.insert(1, "Contents") self.assertFalse(soup.bar.is_empty_element) - self.assertEquals(str(soup), "<bar>Contents</bar>") + self.assertEquals(str(soup), self.document_for("<bar>Contents</bar>")) def test_designated_empty_element_tag_has_no_closing_tag(self): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="<bar></bar>") self.assertTrue(soup.bar.is_empty_element) - self.assertEquals(str(soup), "<bar />") + self.assertEquals(str(soup), self.document_for("<bar />")) def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="<foo />") self.assertFalse(soup.foo.is_empty_element) - self.assertEquals(str(soup), "<foo></foo>") + self.assertEquals(str(soup), self.document_for("<foo></foo>")) def test_designated_empty_element_tag_does_not_change_parser_behavior(self): # The designated list of empty-element tags only affects how @@ -577,4 +578,4 @@ class TestLXMLXMLBuilder(SoupTest): # parsed--that's the parser's job. builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="<bar>contents</bar>") - self.assertEquals(str(soup), "<bar>contents</bar>") + self.assertEquals(str(soup), self.document_for("<bar>contents</bar>")) |