from html5lib.treebuilders.dom import dom2sax
from html5lib import treewalkers
from beautifulsoup.element import Comment
from beautifulsoup.builder import HTMLTreeBuilder, TreeBuilder
import html5lib
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Element and character events are forwarded to the ``handle_*``
    methods of ``self.soup``. This class never assigns ``self.soup``
    itself, so it has to be set before any events are delivered.
    Namespace and prefix information is discarded for now.
    """

    def feed(self, markup):
        """Parse ``markup``. Not implemented at this level.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Report an opening tag to the soup.

        ``attrs`` is a mapping of attribute keys to values. When a key is
        a tuple (the form forwarded unchanged by ``startElementNS``), its
        second item is used as the attribute name; any other key is used
        as the attribute name as-is.
        """
        # Only index into tuple keys: indexing a plain string key would
        # silently turn the attribute name into its second character.
        attrs = dict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in attrs.items())
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Report a closing tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Namespace-aware opening tag; delegates to ``startElement``."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Namespace-aware closing tag; delegates to ``endElement``."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the start of a prefix mapping."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of a prefix mapping."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Pass character data on to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing at the start of the document."""
        pass

    def endDocument(self):
        """Do nothing at the end of the document."""
        pass
class HTML5TreeBuilder(SAXTreeBuilder, HTMLTreeBuilder):
    """Use html5lib to build a tree, then turn the parsed tree into
    SAX events to build a Beautiful Soup tree.

    Eventually this will be replaced with something sane.
    """

    def __init__(self):
        # NOTE(review): no base-class __init__ is called here; confirm
        # that is intentional.
        self.soup = None

    def feed(self, markup):
        """Parse ``markup`` with html5lib and replay the result as SAX events.

        The markup is parsed by an ``html5lib.HTMLParser`` configured with
        the "dom" tree builder, and the parsed document is handed to
        ``dom2sax`` with this object as the handler.
        """
        builder = html5lib.treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tree=builder)
        doc = parser.parse(markup)
        # The original also fetched a 'dom' tree walker here but never
        # used it; dom2sax walks the document itself.
        dom2sax(doc, self)