summary | refs | log | tree | commit | diff
path: root/beautifulsoup
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- beautifulsoup/__init__.py 18
-rw-r--r-- beautifulsoup/builder/__init__.py 4
-rw-r--r-- beautifulsoup/builder/_lxml.py 7
-rw-r--r-- beautifulsoup/dammit.py 3
4 files changed, 27 insertions, 5 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index ce39d33..cee55e7 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -66,7 +66,7 @@ import re
from util import isList, buildSet
from builder import builder_registry
from dammit import UnicodeDammit
-from element import NavigableString, Tag
+from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
class BeautifulSoup(Tag):
@@ -122,6 +122,7 @@ class BeautifulSoup(Tag):
% ",".join(features))
builder = builder_class()
self.builder = builder
+ self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
@@ -261,6 +262,21 @@ class BeautifulSoup(Tag):
def handle_data(self, data):
self.currentData.append(data)
+    def decode(self, pretty_print=False, indent_level=0,
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+        """Returns a string or Unicode representation of this document.
+
+        When the document was built by an XML tree builder
+        (self.is_xml is true), the output is prefixed with an XML
+        declaration. The declaration names eventual_encoding; pass
+        None for eventual_encoding to leave the encoding attribute
+        out of the declaration.
+
+        All three arguments are forwarded unchanged to the
+        superclass's decode()."""
+        if self.is_xml:
+            # Print the XML declaration
+            encoding_part = ''
+            if eventual_encoding is not None:
+                encoding_part = ' encoding="%s"' % eventual_encoding
+            # An XML declaration must be closed with "?>"; a bare ">"
+            # does not form a well-formed declaration.
+            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+        else:
+            prefix = u''
+        return prefix + super(BeautifulSoup, self).decode(
+            pretty_print, indent_level, eventual_encoding)
+
class StopParsing(Exception):
pass
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index fb10628..10c6b7f 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -77,7 +77,7 @@ class TreeBuilder(object):
features = []
- assume_html = False
+ is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
@@ -185,8 +185,6 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- assume_html = True
-
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index 4c7a826..23ac485 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -20,6 +20,8 @@ LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
+ is_xml = True
+
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
@@ -87,10 +89,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup.handle_data(content)
self.soup.endData(Comment)
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`.
+
+        Turns a markup fragment into a document by prepending a
+        UTF-8 XML declaration and a newline to it."""
+        # The declaration is terminated with "?>", as XML requires.
+        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST]
+ is_xml = False
@property
def default_parser(self):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 788f72d..9833bd4 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -37,7 +37,8 @@ class EntitySubstitution(object):
for codepoint, name in codepoint2name.items():
if codepoint == 34:
# There's no point in turning the quotation mark into
- # &quot--even in attribute values we quote the
+ # &quot;, unless it happens in an attribute value, which
+ # is done elsewhere.
continue;
character = unichr(codepoint)
characters.append(character)