Preliminary work for getting XML parsing to work.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-19 21:21:14 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-19 21:21:14 -0500
commit: 22e7fc268c6150f812e9af55f28dba7aeda4d053 (patch)
tree: 766662556ae441c5474e754fe9d582ffce3ff257 /beautifulsoup/builder/lxml_builder.py
parent: 8249b803d9bab9c06be02a244e629cb732f4f5b1 (diff)
parent: 9a936b48fe05666780662c76d5df3b3de7b48074 (diff)
1 file changed, 14 insertions, 9 deletions
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 2c264b3..afdf760 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -1,15 +1,16 @@
 from lxml import etree
 from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import HTMLTreeBuilder
+from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
 from beautifulsoup.dammit import UnicodeDammit
 
-class LXMLTreeBuilder(HTMLTreeBuilder):
+class LXMLTreeBuilderForXML(TreeBuilder):
+    DEFAULT_PARSER_CLASS = etree.XMLParser
 
-    def __init__(self, parser_class=etree.HTMLParser):
-        # etree.HTMLParser's constructor has an argument strip_cdata,
-        # but it does nothing. CDATA sections are always stripped when
-        # passed through HTMLParser.
-        self.parser = parser_class(target=self)
+    def __init__(self, parser_class=None):
+        # strip_cdata only has an effect on XMLParser. HTMLParser's
+        # constructor accepts strip_cdata but ignores it.
+        parser_class = parser_class or self.DEFAULT_PARSER_CLASS
+        self.parser = parser_class(target=self, strip_cdata=False)
         self.soup = None
 
     def prepare_markup(self, markup, user_specified_encoding=None,
@@ -23,8 +24,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
-        return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
-
+        return (dammit.markup, dammit.original_encoding,
+                dammit.declared_html_encoding)
 
     def feed(self, markup):
         self.parser.feed(markup)
@@ -60,3 +61,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
         """See `TreeBuilder`."""
         return u'<html><body>%s</body></html>' % fragment
 
+
+class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder):
+
+    DEFAULT_PARSER_CLASS = etree.HTMLParser
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-19 21:21:14 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-19 21:21:14 -0500
commit	22e7fc268c6150f812e9af55f28dba7aeda4d053 (patch)
tree	766662556ae441c5474e754fe9d582ffce3ff257 /beautifulsoup/builder/lxml_builder.py
parent	8249b803d9bab9c06be02a244e629cb732f4f5b1 (diff)
parent	9a936b48fe05666780662c76d5df3b3de7b48074 (diff)