summary | refs | log | tree | commit | diff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 16:51:56 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 16:51:56 -0500
commit dc682f38b36294d9b102aed3be506d46f8d37b59 (patch)
tree 6f0d20e8440240ffe4b88dc2a48893431dc81240 /bs4/builder/_lxml.py
parent d9f49a66e4a7dfd93823f2396796ed6c55f69648 (diff)
Renamed the beautifulsoup module to bs4 to save typing.
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py 108
1 files changed, 108 insertions, 0 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
new file mode 100644
index 0000000..5c9bc57
--- /dev/null
+++ b/bs4/builder/_lxml.py
@@ -0,0 +1,108 @@
+# Public names of this module: the two lxml-backed tree builder classes.
+__all__ = [
+    'LXMLTreeBuilderForXML',
+    'LXMLTreeBuilder',
+    ]
+
+from lxml import etree
+from bs4.element import Comment, Doctype
+from bs4.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ TreeBuilder,
+ XML)
+from bs4.dammit import UnicodeDammit
+import types
+
+# Feature name that both tree builders in this module list in `features`.
+LXML = 'lxml'
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+    """Tree builder that parses XML with lxml.
+
+    The builder hands itself to the lxml parser as the parser
+    ``target`` and forwards the events it receives (``start``, ``end``,
+    ``data``, ``comment``, ``doctype``) to ``self.soup``.
+    """
+
+    # NOTE(review): not referenced anywhere in this class; presumably
+    # kept for callers or subclasses -- confirm before removing.
+    DEFAULT_PARSER_CLASS = etree.XMLParser
+
+    is_xml = True
+
+    # Well, it's permissive by XML parser standards.
+    features = [LXML, XML, FAST, PERMISSIVE]
+
+    @property
+    def default_parser(self):
+        """The parser used when the constructor is not given one.
+
+        This can either return a parser object or a class, which
+        will be instantiated with default arguments.
+        """
+        return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+
+    def __init__(self, parser=None, empty_element_tags=None):
+        """Set up the builder and the lxml parser it drives.
+
+        :param parser: A parser object, or a callable that is invoked
+            with ``target=self, strip_cdata=False`` to create one. If
+            None, ``self.default_parser`` is used.
+        :param empty_element_tags: Optional iterable of tag names; when
+            given it is stored on the instance as a set.
+        """
+        if empty_element_tags is not None:
+            self.empty_element_tags = set(empty_element_tags)
+        if parser is None:
+            # Use the default parser.
+            parser = self.default_parser
+        if callable(parser):
+            # Instantiate the parser with default arguments
+            parser = parser(target=self, strip_cdata=False)
+        self.parser = parser
+        # Nothing in this class assigns a real value to `soup`; the event
+        # handlers below expect it to have been set before feed() runs.
+        self.soup = None
+
+    def prepare_markup(self, markup, user_specified_encoding=None,
+                       document_declared_encoding=None):
+        """Convert incoming markup to Unicode.
+
+        Unicode input is returned untouched. Anything else is run
+        through `UnicodeDammit`, trying the user-specified encoding
+        first and the document-declared encoding second.
+
+        :return: A 3-tuple (markup, original encoding, encoding
+            declared within markup).
+        """
+        if isinstance(markup, unicode):
+            return markup, None, None
+
+        try_encodings = [user_specified_encoding, document_declared_encoding]
+        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+        return (dammit.markup, dammit.original_encoding,
+                dammit.declared_html_encoding)
+
+    def feed(self, markup):
+        """Feed the whole of `markup` to the parser, then close it."""
+        self.parser.feed(markup)
+        self.parser.close()
+
+    def close(self):
+        """Parser-target hook; this builder has nothing to do here."""
+        pass
+
+    def start(self, name, attrs):
+        """Forward an opening tag to the soup."""
+        self.soup.handle_starttag(name, attrs)
+
+    def end(self, name):
+        """End any pending text, then forward a closing tag to the soup."""
+        self.soup.endData()
+        self.soup.handle_endtag(name)
+
+    def pi(self, target, data):
+        """Processing instructions are ignored."""
+        pass
+
+    def data(self, content):
+        """Forward character data to the soup."""
+        self.soup.handle_data(content)
+
+    def doctype(self, name, pubid, system):
+        """Build a `Doctype` from the declaration and give it to the soup."""
+        self.soup.endData()
+        doctype = Doctype.for_name_and_ids(name, pubid, system)
+        self.soup.object_was_parsed(doctype)
+
+    def comment(self, content):
+        """Handle comments as Comment objects."""
+        # End whatever text came before, so the comment text is
+        # handled on its own and ended with the Comment class.
+        self.soup.endData()
+        self.soup.handle_data(content)
+        self.soup.endData(Comment)
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        # An XML declaration is terminated by "?>", not a bare ">".
+        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+    """HTML counterpart of `LXMLTreeBuilderForXML`, built on lxml's
+    HTML parser."""
+
+    is_xml = False
+    features = [LXML, HTML, FAST]
+
+    @property
+    def default_parser(self):
+        """The lxml HTML parser class (a class, not an instance)."""
+        html_parser_class = etree.HTMLParser
+        return html_parser_class
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        document_template = u'<html><body>%s</body></html>'
+        return document_template % fragment