diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2010-12-28 11:21:18 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2010-12-28 11:21:18 -0500 |
commit | 566b0e64379633f6029bd2cf0a53bd9cd265735d (patch) | |
tree | a8483d3bf02aaa5bbd275d7be2d40364dbabcb92 /src | |
parent | 30cb1ad76ee23fb89f0611db3f7b9a26cbbce06e (diff) |
Added the html5lib builder.
Diffstat (limited to 'src')
-rw-r--r-- | src/beautifulsoup/builder/html5lib_builder.py | 70 |
1 files changed, 70 insertions, 0 deletions
# Reconstructed from the diff that adds
# src/beautifulsoup/builder/html5lib_builder.py (new file in this commit).
from html5lib.treebuilders.dom import dom2sax
from html5lib import treewalkers
from beautifulsoup.element import Comment
from beautifulsoup.builder import HTMLTreeBuilder, TreeBuilder
import html5lib


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Element and character events are forwarded to ``self.soup``;
    namespace and document-boundary events are accepted and ignored.
    ``self.soup`` is presumably assigned by the owning soup object before
    any event arrives -- confirm against the caller.
    """

    def feed(self, markup):
        """Parse ``markup``. Subclasses must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def close(self):
        """No-op."""
        pass

    def startElement(self, name, attrs):
        """Report an opening tag to the soup.

        ``attrs`` is any mapping-like object with an ``items()`` method.
        Keys may be ``(namespace, localname)`` tuples (as forwarded by
        ``startElementNS``) or plain attribute-name strings (as passed to
        a non-namespaced SAX ``startElement``). Tuple keys are reduced to
        their local name; string keys are used unchanged.
        """
        # Only index into tuple keys: indexing a plain string key would
        # silently turn "href" into "r" (or raise IndexError for
        # one-character attribute names).
        plain_attrs = dict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in attrs.items())
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Report a closing tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced opening tag by delegating to startElement."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced closing tag by delegating to endElement."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Accept and ignore the start of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Accept and ignore the end of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Report character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Accept and ignore the start-of-document event."""
        pass

    def endDocument(self):
        """Accept and ignore the end-of-document event."""
        pass


class HTML5TreeBuilder(SAXTreeBuilder, HTMLTreeBuilder):
    """Use html5lib to build a tree, then turn the parsed tree into
    SAX events to build a Beautiful Soup tree.

    Eventually this will be replaced with something sane.
    """

    def __init__(self):
        """Create a builder that is not yet attached to a soup."""
        # NOTE(review): the base-class initialisers are not invoked here;
        # confirm that neither TreeBuilder nor HTMLTreeBuilder needs them.
        self.soup = None

    def feed(self, markup):
        """Parse ``markup`` with html5lib and replay it as SAX events.

        html5lib builds a DOM tree from ``markup``; ``dom2sax`` then walks
        that tree and calls this object's SAX handler methods, which in
        turn drive ``self.soup``.
        """
        builder = html5lib.treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tree=builder)
        doc = parser.parse(markup)
        dom2sax(doc, self)