summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG15
-rw-r--r--beautifulsoup/__init__.py18
-rw-r--r--beautifulsoup/builder/__init__.py4
-rw-r--r--beautifulsoup/builder/_lxml.py7
-rw-r--r--tests/test_lxml.py9
5 files changed, 45 insertions, 8 deletions
diff --git a/CHANGELOG b/CHANGELOG
index abdf1b1..4449279 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -99,6 +99,21 @@ sections into ordinary text elements:
In theory it's possible to preserve the CDATA sections when using the
XML parser, but I don't see how to get it to work in practice.
+== Miscellaneous other stuff ==
+
+If the BeautifulSoup instance has .is_xml set to True, an appropriate
+XML declaration will be emitted when the tree is transformed into a
+string:
+
+ <?xml version="1.0" encoding="utf-8"?>
+ <markup>
+ ...
+ </markup>
+
+The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree
+builders set it to False. If you want to parse XHTML with an HTML
+parser, you can set it manually.
+
= 3.1.0 =
A hybrid version that supports 2.4 and can be automatically converted
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index ce39d33..cee55e7 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -66,7 +66,7 @@ import re
from util import isList, buildSet
from builder import builder_registry
from dammit import UnicodeDammit
-from element import NavigableString, Tag
+from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
class BeautifulSoup(Tag):
@@ -122,6 +122,7 @@ class BeautifulSoup(Tag):
% ",".join(features))
builder = builder_class()
self.builder = builder
+ self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
@@ -261,6 +262,21 @@ class BeautifulSoup(Tag):
def handle_data(self, data):
self.currentData.append(data)
+    def decode(self, pretty_print=False, indent_level=0,
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the document as a string, with an XML declaration prepended
+        if .is_xml is set; eventual_encoding=None omits its encoding."""
+        if self.is_xml:
+            # An XML declaration is only well-formed when it closes with '?>'.
+            encoding_part = ''
+            if eventual_encoding is not None:
+                encoding_part = ' encoding="%s"' % eventual_encoding
+            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+        else:
+            prefix = u''
+        return prefix + super(BeautifulSoup, self).decode(
+            pretty_print, indent_level, eventual_encoding)
+
class StopParsing(Exception):
pass
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index fb10628..10c6b7f 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -77,7 +77,7 @@ class TreeBuilder(object):
features = []
- assume_html = False
+ is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
@@ -185,8 +185,6 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- assume_html = True
-
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index 4c7a826..23ac485 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -20,6 +20,8 @@ LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
+ is_xml = True
+
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
@@ -87,10 +89,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup.handle_data(content)
self.soup.endData(Comment)
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST]
+ is_xml = False
@property
def default_parser(self):
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 8f3d798..a96fbbb 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -525,6 +525,7 @@ class TestLXMLXMLBuilder(SoupTest):
def default_builder(self):
return LXMLTreeBuilderForXML()
+
def test_cdata_becomes_text(self):
# LXML sends CData sections as 'data' events, so we can't
# create special CData objects for them. We have to use
@@ -556,20 +557,20 @@ class TestLXMLXMLBuilder(SoupTest):
self.assertTrue(soup.bar.is_empty_element)
soup.bar.insert(1, "Contents")
self.assertFalse(soup.bar.is_empty_element)
- self.assertEquals(str(soup), "<bar>Contents</bar>")
+ self.assertEquals(str(soup), self.document_for("<bar>Contents</bar>"))
def test_designated_empty_element_tag_has_no_closing_tag(self):
builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
soup = BeautifulSoup(builder=builder, markup="<bar></bar>")
self.assertTrue(soup.bar.is_empty_element)
- self.assertEquals(str(soup), "<bar />")
+ self.assertEquals(str(soup), self.document_for("<bar />"))
def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self):
builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
soup = BeautifulSoup(builder=builder, markup="<foo />")
self.assertFalse(soup.foo.is_empty_element)
- self.assertEquals(str(soup), "<foo></foo>")
+ self.assertEquals(str(soup), self.document_for("<foo></foo>"))
def test_designated_empty_element_tag_does_not_change_parser_behavior(self):
# The designated list of empty-element tags only affects how
@@ -577,4 +578,4 @@ class TestLXMLXMLBuilder(SoupTest):
# parsed--that's the parser's job.
builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
soup = BeautifulSoup(builder=builder, markup="<bar>contents</bar>")
- self.assertEquals(str(soup), "<bar>contents</bar>")
+ self.assertEquals(str(soup), self.document_for("<bar>contents</bar>"))