diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:12:38 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:12:38 -0500 |
commit | d9e7f8b0ec4d4c46fa2fa7f8eee59437e8fa46b8 (patch) | |
tree | 310e0b96ac566f3458f731937e03485e17684a9e /bs4/builder/_htmlparser.py | |
parent | bcf2351c87eb2535acfffdf9d2767c8847ca390c (diff) | |
parent | 683e1a1371065c0c98c8cab6c296ecff6e5f8ea3 (diff) |
Added a tree-builder and tests for the built-in HTMLParser.
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 94 |
1 files changed, 94 insertions, 0 deletions
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import HTMLParser
from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


# Feature name under which this builder is registered.
HTMLPARSER = 'html.parser'


class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
    """Tree builder that drives the soup object from HTMLParser callbacks.

    Every ``handle_*`` callback forwards to ``self.soup``; this module never
    assigns that attribute, so it must be set before ``feed`` is called.
    """

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Convert incoming markup to Unicode.

        :param markup: The document, as a ``unicode`` or byte string.
        :param user_specified_encoding: An encoding to try first, or None.
        :param document_declared_encoding: An encoding to try second, or None.
        :return: A 3-tuple (markup, original encoding, encoding
            declared within markup).
        """
        if isinstance(markup, unicode):
            # Already decoded; there is no encoding information to report.
            return markup, None, None

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding)

    def feed(self, markup):
        """Run the markup through HTMLParser, firing the handlers below."""
        super(HTMLParserTreeBuilder, self).feed(markup)

    def handle_starttag(self, name, attrs):
        """Forward a start tag, turning the (name, value) pairs into a dict."""
        self.soup.handle_starttag(name, dict(attrs))

    def handle_endtag(self, name):
        """Forward an end tag to the soup object."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Forward a run of text to the soup object."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Convert a numeric character reference to text and forward it.

        :param name: The reference without its ``&#`` prefix and ``;``
            suffix: decimal digits (``"65"``) or ``x``/``X`` followed by
            hexadecimal digits (``"x41"``).
        """
        # HTMLParser hands over hexadecimal references with their leading
        # 'x'/'X' intact; int(name) alone raises ValueError on those.
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        try:
            character = unichr(codepoint)
        except (ValueError, OverflowError):
            # The code point is outside what this interpreter can represent;
            # substitute U+FFFD rather than abort the whole parse.
            character = u"\ufffd"
        self.handle_data(character)

    def handle_entityref(self, name):
        """Convert a named entity reference to text and forward it.

        Names missing from ``EntitySubstitution.HTML_ENTITY_TO_CHARACTER``
        are passed through unchanged as ``&name;``.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Forward comment text, bracketed so it ends as a ``Comment``."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Forward a declaration as a ``Doctype``.

        A leading ``"DOCTYPE "`` (exact case) is stripped from the text.
        """
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Forward an unrecognized declaration.

        Text starting with ``CDATA[`` (any case) has that prefix removed and
        ends as ``CData``; anything else ends as ``Declaration``.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Forward processing-instruction text as ``ProcessingInstruction``."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)