Discovered that html5lib can't be made to support SoupStrainers, and changed the test suite appropriately.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 15:10:48 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 15:10:48 -0500
commit: 784dca53bc03943472f58b05b6fcdff801343dac (patch)
tree: 76d78c8831f0acb5a3a41bc628ffbd62b0bbfc6d /beautifulsoup/builder/_lxml.py
parent: 232311a2f682e59078012e5b05e382982862f627 (diff)
parent: 89961f4de3ab1e88b15dd9c0aaa0af77a7c32262 (diff)
1 files changed, 88 insertions, 0 deletions
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
new file mode 100644
index 0000000..c2f368c
--- /dev/null
+++ b/beautifulsoup/builder/_lxml.py
@@ -0,0 +1,144 @@
+__all__ = [
+    'LXMLTreeBuilderForXML',
+    'LXMLTreeBuilder',
+    ]
+
+from lxml import etree
+from beautifulsoup.element import Comment, Doctype
+from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
+from beautifulsoup.dammit import UnicodeDammit
+import types
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+    """Tree builder that parses XML with lxml.
+
+    When the builder creates the parser itself, it registers itself as
+    the parser's ``target``. The target callbacks defined here
+    (``start``, ``end``, ``data``, ``doctype``, ``comment``) pass what
+    was parsed on to ``self.soup``. ``self.soup`` is None after
+    construction, so it has to be assigned before markup is fed in.
+    """
+    # Not referenced anywhere else in this module.
+    DEFAULT_PARSER_CLASS = etree.XMLParser
+
+    @property
+    def default_parser(self):
+        """The parser to use when the constructor is not given one.
+
+        This can either return a parser object or a class. A class is
+        instantiated by ``__init__`` with ``target=self,
+        strip_cdata=False``.
+        """
+        return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+
+    def __init__(self, parser=None, empty_element_tags=None):
+        """Store the lxml parser this builder will feed markup to.
+
+        :param parser: A parser object to use as-is, or a callable
+         (such as a parser class) that is called with ``target=self,
+         strip_cdata=False`` to create one. If None,
+         ``self.default_parser`` is used.
+        :param empty_element_tags: Optional iterable of tag names,
+         stored as the set ``self.empty_element_tags``. If None, the
+         attribute is not assigned here.
+        """
+        if empty_element_tags is not None:
+            self.empty_element_tags = set(empty_element_tags)
+        if parser is None:
+            # Use the default parser.
+            parser = self.default_parser
+        if callable(parser):
+            # Instantiate the parser with default arguments
+            parser = parser(target=self, strip_cdata=False)
+        self.parser = parser
+        self.soup = None
+
+    def prepare_markup(self, markup, user_specified_encoding=None,
+                       document_declared_encoding=None):
+        """Return the markup along with what is known of its encoding.
+
+        Unicode markup is returned unchanged, with None for both
+        encodings. Anything else is handed to UnicodeDammit together
+        with the two encodings given here.
+
+        :param markup: The markup to process.
+        :param user_specified_encoding: First encoding passed to
+         UnicodeDammit; may be None.
+        :param document_declared_encoding: Second encoding passed to
+         UnicodeDammit; may be None.
+        :return: A 3-tuple (markup, original encoding, encoding
+         declared within markup).
+        """
+        if isinstance(markup, unicode):
+            return markup, None, None
+
+        # NOTE(review): isHTML=True is passed even though this is the
+        # XML builder -- confirm that this is intended.
+        try_encodings = [user_specified_encoding, document_declared_encoding]
+        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+        return (dammit.markup, dammit.original_encoding,
+                dammit.declared_html_encoding)
+
+    def feed(self, markup):
+        """Feed the markup to the parser, then close the parser."""
+        self.parser.feed(markup)
+        self.parser.close()
+
+    def close(self):
+        """Parser target callback; there is nothing to do here."""
+        pass
+
+    def start(self, name, attrs):
+        """Parser target callback: pass an opening tag to the soup."""
+        self.soup.handle_starttag(name, attrs)
+
+    def end(self, name):
+        """Parser target callback: pass a closing tag to the soup.
+
+        ``endData()`` is called on the soup before the tag is closed.
+        """
+        self.soup.endData()
+        self.soup.handle_endtag(name)
+
+    def pi(self, target, data):
+        """Parser target callback: processing instructions are ignored."""
+        pass
+
+    def data(self, content):
+        """Parser target callback: pass text content to the soup."""
+        self.soup.handle_data(content)
+
+    def doctype(self, name, pubid, system):
+        """Parser target callback: add a Doctype object to the soup."""
+        self.soup.endData()
+        doctype = Doctype.for_name_and_ids(name, pubid, system)
+        self.soup.object_was_parsed(doctype)
+
+    def comment(self, content):
+        "Handle comments as Comment objects."
+        self.soup.endData()
+        # The comment text goes in as ordinary data; the Comment class
+        # is passed only when that data is ended.
+        self.soup.handle_data(content)
+        self.soup.endData(Comment)
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+    """Tree builder that parses HTML with lxml."""
+
+    @property
+    def default_parser(self):
+        """The parser class to use when the constructor is not given one.
+
+        The class itself is returned, not an instance.
+        LXMLTreeBuilderForXML.__init__ calls it to create the parser
+        (assuming HTMLTreeBuilder does not define its own __init__).
+        """
+        return etree.HTMLParser
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`.
+
+        Returns the fragment wrapped in <html><body>...</body></html>.
+        """
+        return u'<html><body>%s</body></html>' % fragment
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 15:10:48 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 15:10:48 -0500
commit	784dca53bc03943472f58b05b6fcdff801343dac (patch)
tree	76d78c8831f0acb5a3a41bc628ffbd62b0bbfc6d /beautifulsoup/builder/_lxml.py
parent	232311a2f682e59078012e5b05e382982862f627 (diff)
parent	89961f4de3ab1e88b15dd9c0aaa0af77a7c32262 (diff)