diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
commit | 838c2b6aeda1b318061f843673fb09340aa97b85 (patch) | |
tree | f5b4f909936c9c398f5aaa821ea00414606581b5 /bs4/builder/_htmlparser.py | |
parent | dc682f38b36294d9b102aed3be506d46f8d37b59 (diff) |
Added a tree builder for the built-in HTMLParser, and tests.
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 94 |
1 files changed, 94 insertions, 0 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..c293d9e --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,94 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import HTMLParser +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). + """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) + + def feed(self, markup): + super(HTMLParserTreeBuilder, self).feed(markup) + + def handle_starttag(self, name, attrs): + self.soup.handle_starttag(name, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + |