diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-19 21:16:20 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-19 21:16:20 -0500 |
commit | 86ae2ed0a644f124475a4aff3b34e229f5b7ec8f (patch) | |
tree | 339d0a5724f97c1c406b3aaca042e8341c33fd3d /beautifulsoup/builder/lxml_builder.py | |
parent | e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff) |
Set up an lxml parser that only parses XML, though it's not very functional yet.
Diffstat (limited to 'beautifulsoup/builder/lxml_builder.py')
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 23 |
1 files changed, 14 insertions, 9 deletions
```diff
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 2c264b3..afdf760 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -1,15 +1,16 @@
 from lxml import etree
 from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import HTMLTreeBuilder
+from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
 from beautifulsoup.dammit import UnicodeDammit
 
-class LXMLTreeBuilder(HTMLTreeBuilder):
+class LXMLTreeBuilderForXML(TreeBuilder):
+    DEFAULT_PARSER_CLASS = etree.XMLParser
 
-    def __init__(self, parser_class=etree.HTMLParser):
-        # etree.HTMLParser's constructor has an argument strip_cdata,
-        # but it does nothing. CDATA sections are always stripped when
-        # passed through HTMLParser.
-        self.parser = parser_class(target=self)
+    def __init__(self, parser_class=None):
+        # strip_cdata only has an effect on XMLParser. HTMLParser's
+        # constructor accepts strip_cdata but ignores it.
+        parser_class = parser_class or self.DEFAULT_PARSER_CLASS
+        self.parser = parser_class(target=self, strip_cdata=False)
         self.soup = None
 
     def prepare_markup(self, markup, user_specified_encoding=None,
@@ -23,8 +24,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
-        return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
-
+        return (dammit.markup, dammit.original_encoding,
+                dammit.declared_html_encoding)
 
     def feed(self, markup):
         self.parser.feed(markup)
@@ -60,3 +61,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
         """See `TreeBuilder`."""
         return u'<html><body>%s</body></html>' % fragment
 
+
+class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder):
+
+    DEFAULT_PARSER_CLASS = etree.HTMLParser
```