diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-03 21:36:13 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-03 21:36:13 -0500 |
commit | 692fe5201d5dec15a3598578a6f403e67802de0d (patch) | |
tree | 7ab5c761163b5fae77304ff8b6769afd8a9d84e1 /src | |
parent | ea57d5122f1df133927e266e5fabbf0ef767f460 (diff) |
Brought in a treebuilder from html5lib and got it to work.
Diffstat (limited to 'src')
-rw-r--r-- | src/beautifulsoup/README.txt | 33 | ||||
-rw-r--r-- | src/beautifulsoup/builder/__init__.py | 2 | ||||
-rw-r--r-- | src/beautifulsoup/builder/html5lib_builder.py | 205 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_html5lib.py | 5 |
4 files changed, 208 insertions, 37 deletions
diff --git a/src/beautifulsoup/README.txt b/src/beautifulsoup/README.txt index d011072..ff83212 100644 --- a/src/beautifulsoup/README.txt +++ b/src/beautifulsoup/README.txt @@ -4,16 +4,19 @@ Introduction >>> from beautifulsoup import BeautifulSoup >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML") >>> print soup.prettify() - <p> - Some - <b> - bad - <i> - HTML - </i> - </b> - </p> - + <html> + <body> + <p> + Some + <b> + bad + <i> + HTML + </i> + </b> + </p> + </body> + </html> >>> soup.find(text="bad") u'bad' @@ -21,13 +24,3 @@ Introduction <i>HTML</i> -Python 3 -======== - -The canonical version of Beautiful Soup is the Python 2 version. You -can generate the Python 3 version by running to3.sh, or by doing what -to3.sh does: run 2to3 on BeautifulSoup.py and BeautifulSoupTests.py, -then applying the appropriate .3.diff file to each generated script. - -The testall.sh script tests both the Python 2 version and a freshly -generated Python 3 version. diff --git a/src/beautifulsoup/builder/__init__.py b/src/beautifulsoup/builder/__init__.py index 2d33a0b..fe80953 100644 --- a/src/beautifulsoup/builder/__init__.py +++ b/src/beautifulsoup/builder/__init__.py @@ -16,8 +16,6 @@ class TreeBuilder(Entities): def __init__(self): self.soup = None - self.self_closing_tags = set() - self.preserve_whitespace_tags = set() def isSelfClosingTag(self, name): return name in self.self_closing_tags diff --git a/src/beautifulsoup/builder/html5lib_builder.py b/src/beautifulsoup/builder/html5lib_builder.py index 80c3e6d..0a4b9e7 100644 --- a/src/beautifulsoup/builder/html5lib_builder.py +++ b/src/beautifulsoup/builder/html5lib_builder.py @@ -1,27 +1,202 @@ -from html5lib.treebuilders.dom import dom2sax -from html5lib import treewalkers from beautifulsoup.builder import HTMLTreeBuilder, SAXTreeBuilder import html5lib +from html5lib.constants import DataLossWarning +import warnings +from beautifulsoup.element import Tag, NavigableString, Comment, Declaration -class HTML5TreeBuilder(SAXTreeBuilder, HTMLTreeBuilder): - """Use html5lib to build a tree, then turn the parsed tree into - SAX events to build a Beautiful Soup tree. - - Eventually this will be replaced with something sane. - """ - - def __init__(self): - self.soup = None +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + # These methods are defined by Beautiful Soup. def feed(self, markup): - builder = html5lib.treebuilders.getTreeBuilder("dom") - parser = html5lib.HTMLParser(tree=builder) + parser = html5lib.HTMLParser(tree=self.create_treebuilder) doc = parser.parse(markup) - walker = treewalkers.getTreeWalker('dom') - dom2sax(doc, self) + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" return u'<html><head></head><body>%s</body></html>' % fragment + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + if namespaceHTMLElements: + warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning) + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if publicId: + self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or ""))) + elif systemId: + self.soup.insert(0, Declaration("%s SYSTEM \"%s\""% + (name, systemId))) + else: + self.soup.insert(0, Declaration(name)) + + def elementClass(self, name, namespace): + if namespace is not None: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + self.soup.insert(len(self.soup.contents), node.element) + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return self.attrs.items().__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return self.attrs.items() + def keys(self): + return self.attrs.keys() + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in self.attrs.keys() + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def _nodeIndex(self, node, refNode): + # Finds a node by identity rather than equality + for index in range(len(self.element.contents)): + if id(self.element.contents[index]) == id(refNode.element): + return index + return None + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...") + newStr = NavigableString(self.element.contents[-1]+node.element) + + # Remove the old text node + # (Can't simply use .extract() by itself, because it fails if + # an equal text node exists within the parent node) + oldElement = self.element.contents[-1] + del self.element.contents[-1] + oldElement.parent = None + oldElement.extract() + + self.element.insert(len(self.element.contents), newStr) + else: + self.element.insert(len(self.element.contents), node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + self.element[name] = value + + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(NavigableString(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self._nodeIndex(node, refNode) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + newStr = NavigableString(self.element.contents[index-1]+node.element) + oldNode = self.element.contents[index-1] + del self.element.contents[index-1] + oldNode.parent = None + oldNode.extract() + + self.element.insert(index-1, newStr) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + index = self._nodeIndex(node.parent, node) + del node.parent.element.contents[index] + node.element.parent = None + node.element.extract() + node.parent = None + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild(Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild(TextNode(child, self.soup)) + + def cloneNode(self): + node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/src/beautifulsoup/tests/test_html5lib.py b/src/beautifulsoup/tests/test_html5lib.py index 7164dac..131c999 100644 --- a/src/beautifulsoup/tests/test_html5lib.py +++ b/src/beautifulsoup/tests/test_html5lib.py @@ -11,6 +11,11 @@ class TestHTML5Builder(BuilderSmokeTest): def setUp(self): self.default_builder = HTML5TreeBuilder() + def test_collapsed_whitespace(self): + """Whitespace is preserved even in tags that don't require it.""" + self.assertSoupEquals("<p> </p>") + self.assertSoupEquals("<b> </b>") + class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): """See `BuilderInvalidMarkupSmokeTest`.""" |