diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 06:43:46 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 06:43:46 -0500 |
commit | 2afd7cb857a01f5b59b806f68b06889764c1d754 (patch) | |
tree | ee67f336609f06ffe1b952f14b7b251c8d375ee1 /beautifulsoup | |
parent | f93d5bf81e56515098785068a18f2befec37857a (diff) |
Got rid of now-useless builder configuration.
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py new file mode 100644 index 0000000..86de5ec --- /dev/null +++ b/beautifulsoup/builder/__init__.py @@ -0,0 +1,98 @@ +from beautifulsoup.element import Entities + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + ] + + +class TreeBuilder(Entities): + """Turn a document into a Beautiful Soup object tree.""" + + assume_html = False + + def __init__(self): + self.soup = None + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def reset(self): + pass + + def feed(self, markup): + raise NotImplementedError() + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in attrs.items()) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are self-closing tags. + """ + + assume_html = True + + preserve_whitespace_tags = set(['pre', 'textarea']) + self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + |