diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 16:51:56 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 16:51:56 -0500 |
commit | dc682f38b36294d9b102aed3be506d46f8d37b59 (patch) | |
tree | 6f0d20e8440240ffe4b88dc2a48893431dc81240 /bs4/builder/_lxml.py | |
parent | d9f49a66e4a7dfd93823f2396796ed6c55f69648 (diff) |
Renamed the beautifulsoup module to bs4 to save typing.
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- | bs4/builder/_lxml.py | 108 |
1 files changed, 108 insertions, 0 deletions
# Contents of bs4/builder/_lxml.py (new file, blob 5c9bc57), restored from
# the collapsed diff text to conventional Python formatting.
"""Tree builders that use lxml parsers and forward their events to a soup."""

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from lxml import etree
from bs4.element import Comment, Doctype
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit
import types

# Feature name under which the builders in this module are advertised.
LXML = 'lxml'


class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that parses XML with lxml.

    The builder passes itself to the lxml parser as ``target``, so the
    parser calls back into `start`, `end`, `data`, `comment`, `doctype`,
    `pi` and `close` below. Those callbacks forward to ``self.soup``,
    which is initialised to None here and must be assigned by whoever
    drives the builder before `feed` is called.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    @property
    def default_parser(self):
        """Return the parser used when the constructor is given none."""
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the builder and its underlying lxml parser.

        :param parser: A parser object, or a callable (such as a parser
            class) that is invoked with ``target=self, strip_cdata=False``
            to create one. When None, `default_parser` is used.
        :param empty_element_tags: Optional iterable of tag names; when
            given it is stored as a set on ``self.empty_element_tags``.
        """
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if callable(parser):
            # Instantiate the parser with default arguments. This is the
            # path taken when `default_parser` returns a class rather
            # than an instance.
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Convert markup to Unicode before it is handed to the parser.

        :param markup: The document, as a unicode or byte string.
        :param user_specified_encoding: Encoding to try first, if any.
        :param document_declared_encoding: Encoding to try second, if any.
        :return: A 3-tuple (markup, original encoding, encoding
            declared within markup).
        """
        if isinstance(markup, unicode):
            # Already Unicode: nothing to detect or convert.
            return markup, None, None

        try_encodings = [user_specified_encoding, document_declared_encoding]
        # NOTE(review): isHTML=True is passed even though this class sets
        # is_xml = True -- confirm that is intended for XML documents.
        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding)

    def feed(self, markup):
        """Run the whole of `markup` through the parser, then close it."""
        self.parser.feed(markup)
        self.parser.close()

    def close(self):
        """Parser-target callback; nothing to do."""
        pass

    def start(self, name, attrs):
        """Parser-target callback: forward a start tag to the soup."""
        self.soup.handle_starttag(name, attrs)

    def end(self, name):
        """Parser-target callback: forward an end tag to the soup."""
        # Call endData() before the tag is closed.
        self.soup.endData()
        self.soup.handle_endtag(name)

    def pi(self, target, data):
        """Parser-target callback: processing instructions are ignored."""
        pass

    def data(self, content):
        """Parser-target callback: forward character data to the soup."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Parser-target callback: add a Doctype object to the soup."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        # endData() is called before and after the comment text is
        # handled; the second call is given the Comment class.
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # An XML declaration must be terminated by '?>'; a bare '>' makes
        # the generated test document malformed.
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that parses HTML with lxml.

    Reuses the parser-target callbacks of `LXMLTreeBuilderForXML` and
    only swaps the parser, the advertised features and the test wrapper.
    """

    features = [LXML, HTML, FAST]
    is_xml = False

    @property
    def default_parser(self):
        """Return the parser class used when the constructor is given none."""
        # A class, not an instance: the inherited constructor instantiates
        # it with target=self, strip_cdata=False.
        return etree.HTMLParser

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment