diff options
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/__init__.py | 18 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 4 | ||||
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 7 |
3 files changed, 25 insertions, 4 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index ce39d33..cee55e7 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -66,7 +66,7 @@ import re from util import isList, buildSet from builder import builder_registry from dammit import UnicodeDammit -from element import NavigableString, Tag +from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag class BeautifulSoup(Tag): @@ -122,6 +122,7 @@ class BeautifulSoup(Tag): % ",".join(features)) builder = builder_class() self.builder = builder + self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only @@ -261,6 +262,21 @@ class BeautifulSoup(Tag): def handle_data(self, data): self.currentData.append(data) + def decode(self, pretty_print=False, indent_level=0, + eventual_encoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'<?xml version="1.0"%s>\n' % encoding_part + else: + prefix = u'' + return prefix + super(BeautifulSoup, self).decode( + pretty_print, indent_level, eventual_encoding) + class StopParsing(Exception): pass diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index fb10628..10c6b7f 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -77,7 +77,7 @@ class TreeBuilder(object): features = [] - assume_html = False + is_xml = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -185,8 +185,6 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - assume_html = True - preserve_whitespace_tags = set(['pre', 'textarea']) empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py index 4c7a826..23ac485 100644 --- a/beautifulsoup/builder/_lxml.py +++ b/beautifulsoup/builder/_lxml.py @@ -20,6 +20,8 @@ LXML = 'lxml' class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser + is_xml = True + # Well, it's permissive by XML parser standards. features = [LXML, XML, FAST, PERMISSIVE] @@ -87,10 +89,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.handle_data(content) self.soup.endData(Comment) + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment + class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = [LXML, HTML, FAST] + is_xml = False @property def default_parser(self): |