diff options
field | value | date |
---|---|---|
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-13 20:00:33 -0500 |
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-13 20:00:33 -0500 |
commit | 5aa0e5ba8ec6a590c673db54c60ff2a76544f14f (patch) | |
tree | bf0637e79bc667a1b5ae08ca4794df414f169992 | |
parent | 87a55b145f0a73e6fc9ede9a762d81d2527161b6 (diff) | |
parent | bc97bb3a83ee9fb4c8e31d11069ccf1cda61d4ff (diff) | |
Fixed handling of doctypes and added tests for nonsensical declarations.
mode | file | lines changed |
---|---|---|
-rw-r--r-- | beautifulsoup/__init__.py | 14 |
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 202 |
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 8 |
-rw-r--r-- | beautifulsoup/element.py | 10 |
-rw-r--r-- | tests/test_html5lib.py | 13 |
-rw-r--r-- | tests/test_lxml.py | 43 |
6 files changed, 274 insertions, 16 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 4a7e18b..ddf51f9 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -222,11 +222,15 @@ class BeautifulStoneSoup(Tag): not self.parseOnlyThese.search(currentData)): return o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) + self.object_was_parsed(o) + + def object_was_parsed(self, o): + """Add an object to the parse tree.""" + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) def _popToTag(self, name, inclusivePop=True): diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py new file mode 100644 index 0000000..736889f --- /dev/null +++ b/beautifulsoup/builder/html5lib_builder.py @@ -0,0 +1,202 @@ +from beautifulsoup.builder import HTMLTreeBuilder, SAXTreeBuilder +import html5lib +from html5lib.constants import DataLossWarning +import warnings +from beautifulsoup.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + # These methods are defined by Beautiful Soup. 
+ def feed(self, markup): + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + doc = parser.parse(markup) + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><head></head><body>%s</body></html>' % fragment + + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + if namespaceHTMLElements: + warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning) + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + if namespace is not None: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + self.soup.insert(len(self.soup.contents), node.element) + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def 
__iter__(self): + return self.attrs.items().__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return self.attrs.items() + def keys(self): + return self.attrs.keys() + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in self.attrs.keys() + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def _nodeIndex(self, node, refNode): + # Finds a node by identity rather than equality + for index in range(len(self.element.contents)): + if id(self.element.contents[index]) == id(refNode.element): + return index + return None + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...") + newStr = NavigableString(self.element.contents[-1]+node.element) + + # Remove the old text node + # (Can't simply use .extract() by itself, because it fails if + # an equal text node exists within the parent node) + oldElement = self.element.contents[-1] + del self.element.contents[-1] + oldElement.parent = None + oldElement.extract() + + self.element.insert(len(self.element.contents), newStr) + else: + self.element.insert(len(self.element.contents), node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + self.element[name] = value + + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(NavigableString(data), self.soup) + if insertBefore: + 
self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self._nodeIndex(node, refNode) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + newStr = NavigableString(self.element.contents[index-1]+node.element) + oldNode = self.element.contents[index-1] + del self.element.contents[index-1] + oldNode.parent = None + oldNode.extract() + + self.element.insert(index-1, newStr) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + index = self._nodeIndex(node.parent, node) + del node.parent.element.contents[index] + node.element.parent = None + node.element.extract() + node.parent = None + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild(Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild(TextNode(child, self.soup)) + + def cloneNode(self): + node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 4e83bba..9ced9f0 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -6,8 
+6,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder): def __init__(self, parser_class=etree.HTMLParser): # etree.HTMLParser's constructor has an argument strip_cdata, - # but it does nothing. CDATA sections will become text when - # passed through etree.HTMLParser. + # but it does nothing. CDATA sections are always stripped when + # passed through HTMLParser. self.parser = parser_class(target=self) self.soup = None @@ -32,8 +32,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder): def doctype(self, name, pubid, system): self.soup.endData() - self.soup.handle_data(name) - self.soup.endData(Doctype) + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) def comment(self, content): "Handle comments as Comment objects." diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index b2e0e12..8749114 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -372,6 +372,16 @@ class Declaration(NavigableString): class Doctype(NavigableString): + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!DOCTYPE ' + self + u'>' diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 2d16bbb..48f27ae 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -95,6 +95,19 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): data = soup.find(text="[CDATA[foo]]") self.assertEquals(data.__class__, Comment) + def test_nonsensical_declaration(self): + # Declarations that don't make any sense are turned into comments. + soup = self.soup('<! Foo = -8><p>a</p>') + self.assertEquals(str(soup), + ("<!-- Foo = -8-->" + "<html><head></head><body><p>a</p></body></html>")) + + soup = self.soup('<p>a</p><! 
Foo = -8>') + self.assertEquals(str(soup), + ("<html><head></head><body><p>a</p>" + "<!-- Foo = -8--></body></html>")) + + def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" soup = self.soup(isolatin) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 207d141..cba5522 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -4,7 +4,7 @@ import re from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder -from beautifulsoup.element import Comment +from beautifulsoup.element import Comment, Doctype from beautifulsoup.testing import SoupTest @@ -196,15 +196,40 @@ class TestLXMLBuilder(SoupTest): soup = self.soup("<a> </a>") self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) - # Tests below this line need work. + def test_cdata_where_its_ok(self): + # lxml strips CDATA sections, no matter where they occur. + markup = "<svg><![CDATA[foobar]]>" + self.assertSoupEquals(markup, "<svg></svg>") + + def _test_doctype(self, doctype_fragment): + """Run a battery of assertions on a given doctype string.""" + doctype_str = '<!DOCTYPE %s>' % doctype_fragment + markup = doctype_str + '<p>foo</p>' + soup = self.soup(markup) + doctype = soup.contents[0] + self.assertEquals(doctype.__class__, Doctype) + self.assertEquals(doctype, doctype_fragment) + self.assertEquals(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEquals(soup.p.contents[0], 'foo') - #def test_doctype(self): - # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>' - # self.assertSoupEquals(xml) + def test_doctype(self): + # Test a normal HTML doctype you'll commonly see in a real document. 
+ self._test_doctype( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + def test_namespaced_system_doctype(self): + # Test a namespaced doctype with a system id. + self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_system_doctype(self): + # Test a namespaced doctype with a public id. + self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') + + # Tests below this line need work. - #def test_cdata(self): - # print self.soup("<div><![CDATA[foo]]></div>") def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" @@ -273,6 +298,10 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup) + def test_nonsensical_declaration(self): + # Declarations that don't make any sense are ignored. + self.assertSoupEquals('<! Foo = -8><p>a</p>', "<p>a</p>") + def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "<div><![CDATA[foo]]>" |