summary | refs | log | tree | commit | diff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 18:08:59 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 18:08:59 -0500
commit 838c2b6aeda1b318061f843673fb09340aa97b85 (patch)
tree f5b4f909936c9c398f5aaa821ea00414606581b5 /bs4/builder/_htmlparser.py
parent dc682f38b36294d9b102aed3be506d46f8d37b59 (diff)
Added a tree builder for the built-in HTMLParser, and tests.
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- bs4/builder/_htmlparser.py 94
1 files changed, 94 insertions, 0 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000..c293d9e
--- /dev/null
+++ b/bs4/builder/_htmlparser.py
@@ -0,0 +1,94 @@
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+
+# Public API of this module: only the tree builder class is listed.
+__all__ = [
+ 'HTMLParserTreeBuilder',
+ ]
+
+from HTMLParser import HTMLParser
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+ )
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+ )
+
+
+# Feature name for this builder; it appears in the ``features`` list of
+# HTMLParserTreeBuilder alongside HTML and STRICT.
+HTMLPARSER = 'html.parser'
+
+class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
+
+ def feed(self, markup):
+ super(HTMLParserTreeBuilder, self).feed(markup)
+
+ def handle_starttag(self, name, attrs):
+ self.soup.handle_starttag(name, dict(attrs))
+
+ def handle_endtag(self, name):
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ self.handle_data(unichr(int(name)))
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ data = "&%s;" % name
+ self.handle_data(data)
+
+ def handle_comment(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data):
+ self.soup.endData()
+ if data.startswith("DOCTYPE "):
+ data = data[len("DOCTYPE "):]
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data):
+ if data.upper().startswith('CDATA['):
+ cls = CData
+ data = data[len('CDATA['):]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
+ def handle_pi(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(ProcessingInstruction)
+