Fixed handling of doctypes and added tests for nonsensical declarations.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-13 20:00:33 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-13 20:00:33 -0500
commit: 5aa0e5ba8ec6a590c673db54c60ff2a76544f14f (patch)
tree: bf0637e79bc667a1b5ae08ca4794df414f169992
parent: 87a55b145f0a73e6fc9ede9a762d81d2527161b6 (diff)
parent: bc97bb3a83ee9fb4c8e31d11069ccf1cda61d4ff (diff)
6 files changed, 274 insertions, 16 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 4a7e18b..ddf51f9 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -222,11 +222,15 @@ class BeautifulStoneSoup(Tag):
                     not self.parseOnlyThese.search(currentData)):
                 return
             o = containerClass(currentData)
-            o.setup(self.currentTag, self.previous)
-            if self.previous:
-                self.previous.next = o
-            self.previous = o
-            self.currentTag.contents.append(o)
+            self.object_was_parsed(o)
+
+    def object_was_parsed(self, o):
+        """Add an object to the parse tree."""
+        o.setup(self.currentTag, self.previous)
+        if self.previous:
+            self.previous.next = o
+        self.previous = o
+        self.currentTag.contents.append(o)
 
 
     def _popToTag(self, name, inclusivePop=True):
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
new file mode 100644
index 0000000..736889f
--- /dev/null
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -0,0 +1,202 @@
+from beautifulsoup.builder import HTMLTreeBuilder, SAXTreeBuilder
+import html5lib
+from html5lib.constants import DataLossWarning
+import warnings
+from beautifulsoup.element import (
+    Comment,
+    Doctype,
+    NavigableString,
+    Tag,
+    )
+
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+    """Use html5lib to build a tree."""
+
+    # These methods are defined by Beautiful Soup.
+    def feed(self, markup):
+        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+        doc = parser.parse(markup)
+
+    def create_treebuilder(self, namespaceHTMLElements):
+        self.underlying_builder = TreeBuilderForHtml5lib(
+            self.soup, namespaceHTMLElements)
+        return self.underlying_builder
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<html><head></head><body>%s</body></html>' % fragment
+
+
+class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+
+    def __init__(self, soup, namespaceHTMLElements):
+        self.soup = soup
+        if namespaceHTMLElements:
+            warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning)
+        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+    def documentClass(self):
+        self.soup.reset()
+        return Element(self.soup, self.soup, None)
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+        self.soup.object_was_parsed(doctype)
+
+    def elementClass(self, name, namespace):
+        if namespace is not None:
+            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
+        return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
+
+    def commentClass(self, data):
+        return TextNode(Comment(data), self.soup)
+
+    def fragmentClass(self):
+        self.soup = BeautifulSoup("")
+        self.soup.name = "[document_fragment]"
+        return Element(self.soup, self.soup, None)
+
+    def appendChild(self, node):
+        self.soup.insert(len(self.soup.contents), node.element)
+
+    def testSerializer(self, element):
+        return testSerializer(element)
+
+    def getDocument(self):
+        return self.soup
+
+    def getFragment(self):
+        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+
+class AttrList(object):
+    def __init__(self, element):
+        self.element = element
+        self.attrs = dict(self.element.attrs)
+    def __iter__(self):
+        return self.attrs.items().__iter__()
+    def __setitem__(self, name, value):
+        """Store `value` under `name` on the wrapped element."""
+        self.element[name] = value
+    def items(self):
+        return self.attrs.items()
+    def keys(self):
+        return self.attrs.keys()
+    def __getitem__(self, name):
+        return self.attrs[name]
+    def __contains__(self, name):
+        return name in self.attrs.keys()
+
+
+class Element(html5lib.treebuilders._base.Node):
+    def __init__(self, element, soup, namespace):
+        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        self.element = element
+        self.soup = soup
+        self.namespace = namespace
+
+    def _nodeIndex(self, node, refNode):
+        # Finds a node by identity rather than equality
+        for index in range(len(self.element.contents)):
+            if id(self.element.contents[index]) == id(refNode.element):
+                return index
+        return None
+
+    def appendChild(self, node):
+        if (node.element.__class__ == NavigableString and self.element.contents
+            and self.element.contents[-1].__class__ == NavigableString):
+            # Concatenate new text onto old text node
+            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
+            newStr = NavigableString(self.element.contents[-1]+node.element)
+
+            # Remove the old text node
+            # (Can't simply use .extract() by itself, because it fails if
+            # an equal text node exists within the parent node)
+            oldElement = self.element.contents[-1]
+            del self.element.contents[-1]
+            oldElement.parent = None
+            oldElement.extract()
+
+            self.element.insert(len(self.element.contents), newStr)
+        else:
+            self.element.insert(len(self.element.contents), node.element)
+            node.parent = self
+
+    def getAttributes(self):
+        return AttrList(self.element)
+
+    def setAttributes(self, attributes):
+        if attributes:
+            for name, value in attributes.items():
+                self.element[name] =  value
+
+    attributes = property(getAttributes, setAttributes)
+    
+    def insertText(self, data, insertBefore=None):
+        text = TextNode(NavigableString(data), self.soup)
+        if insertBefore:
+            self.insertBefore(text, insertBefore)
+        else:
+            self.appendChild(text)
+
+    def insertBefore(self, node, refNode):
+        """Insert node before refNode, merging adjacent text (cf. appendChild)."""
+        index = self._nodeIndex(node, refNode)
+        # index must be > 0, else contents[index-1] wraps to the last child.
+        if (node.element.__class__ == NavigableString and index
+            and self.element.contents[index-1].__class__ == NavigableString):
+            newStr = NavigableString(self.element.contents[index-1]+node.element)
+            oldNode = self.element.contents[index-1]
+            del self.element.contents[index-1]
+            oldNode.parent = None
+            oldNode.extract()
+            self.element.insert(index-1, newStr)
+        else:
+            self.element.insert(index, node.element)
+            node.parent = self
+
+    def removeChild(self, node):
+        index = self._nodeIndex(node.parent, node)
+        del node.parent.element.contents[index]
+        node.element.parent = None
+        node.element.extract()
+        node.parent = None
+
+    def reparentChildren(self, newParent):
+        """Detach each child of this node, in order, and append it to newParent."""
+        html_ns = html5lib.constants.namespaces["html"]
+        while self.element.contents:
+            child = self.element.contents[0]
+            child.extract()
+            wrapped = Element(child, self.soup, html_ns) if isinstance(child, Tag) else TextNode(child, self.soup)
+            newParent.appendChild(wrapped)
+
+    def cloneNode(self):
+        node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
+        for key,value in self.attributes:
+            node.attributes[key] = value
+        return node
+
+    def hasContent(self):
+        return self.element.contents
+
+    def getNameTuple(self):
+        """Return (namespace, name), using the HTML namespace when none is set."""
+        if self.namespace is None:
+            return html5lib.constants.namespaces["html"], self.name
+        return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
+class TextNode(Element):
+    def __init__(self, element, soup):
+        html5lib.treebuilders._base.Node.__init__(self, None)
+        self.element = element
+        self.soup = soup
+    
+    def cloneNode(self):
+        raise NotImplementedError
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 4e83bba..9ced9f0 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -6,8 +6,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
     def __init__(self, parser_class=etree.HTMLParser):
         # etree.HTMLParser's constructor has an argument strip_cdata,
-        # but it does nothing. CDATA sections will become text when
-        # passed through etree.HTMLParser.
+        # but it does nothing. CDATA sections are always stripped when
+        # passed through HTMLParser.
         self.parser = parser_class(target=self)
         self.soup = None
 
@@ -32,8 +32,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
     def doctype(self, name, pubid, system):
         self.soup.endData()
-        self.soup.handle_data(name)
-        self.soup.endData(Doctype)
+        doctype = Doctype.for_name_and_ids(name, pubid, system)
+        self.soup.object_was_parsed(doctype)
 
     def comment(self, content):
         "Handle comments as Comment objects."
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index b2e0e12..8749114 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -372,6 +372,16 @@ class Declaration(NavigableString):
 
 class Doctype(NavigableString):
 
+    @classmethod
+    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Build a doctype from a name plus optional public and system ids."""
+        value = name
+        if pub_id is not None:
+            value += ' PUBLIC "%s"' % pub_id
+        if system_id is not None:
+            value += '%s "%s"' % ('' if pub_id is not None else ' SYSTEM', system_id)
+        return cls(value)
+
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<!DOCTYPE ' + self + u'>'
 
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 2d16bbb..48f27ae 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -95,6 +95,19 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
         data = soup.find(text="[CDATA[foo]]")
         self.assertEquals(data.__class__, Comment)
 
+    def test_nonsensical_declaration(self):
+        # Declarations that don't make any sense are turned into comments.
+        soup = self.soup('<! Foo = -8><p>a</p>')
+        self.assertEquals(str(soup),
+                          ("<!-- Foo = -8-->"
+                           "<html><head></head><body><p>a</p></body></html>"))
+
+        soup = self.soup('<p>a</p><! Foo = -8>')
+        self.assertEquals(str(soup),
+                          ("<html><head></head><body><p>a</p>"
+                           "<!-- Foo = -8--></body></html>"))
+
+
     def test_foo(self):
         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
         soup = self.soup(isolatin)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 207d141..cba5522 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -4,7 +4,7 @@ import re
 
 from beautifulsoup import BeautifulSoup
 from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
-from beautifulsoup.element import Comment
+from beautifulsoup.element import Comment, Doctype
 from beautifulsoup.testing import SoupTest
 
 
@@ -196,15 +196,40 @@ class TestLXMLBuilder(SoupTest):
         soup = self.soup("<a>&nbsp;&nbsp;</a>")
         self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
 
-    # Tests below this line need work.
+    def test_cdata_where_its_ok(self):
+        # lxml strips CDATA sections, no matter where they occur.
+        markup = "<svg><![CDATA[foobar]]>"
+        self.assertSoupEquals(markup, "<svg></svg>")
+
+    def _test_doctype(self, doctype_fragment):
+        """Run a battery of assertions on a given doctype string."""
+        doctype_str = '<!DOCTYPE %s>' % doctype_fragment
+        markup = doctype_str + '<p>foo</p>'
+        soup = self.soup(markup)
+        doctype = soup.contents[0]
+        self.assertEquals(doctype.__class__, Doctype)
+        self.assertEquals(doctype, doctype_fragment)
+        self.assertEquals(str(soup)[:len(doctype_str)], doctype_str)
+
+        # Make sure that the doctype was correctly associated with the
+        # parse tree and that the rest of the document parsed.
+        self.assertEquals(soup.p.contents[0], 'foo')
 
-    #def test_doctype(self):
-    #    xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>'
-    #    self.assertSoupEquals(xml)
+    def test_doctype(self):
+        # Test a normal HTML doctype you'll commonly see in a real document.
+        self._test_doctype(
+            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
 
+    def test_namespaced_system_doctype(self):
+        # Test a namespaced doctype with a system id.
+        self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"')
+
+    def test_namespaced_public_doctype(self):
+        # Test a namespaced doctype with a public id.
+        self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')
+
+    # Tests below this line need work.
 
-    #def test_cdata(self):
-    #    print self.soup("<div><![CDATA[foo]]></div>")
 
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
@@ -273,6 +298,10 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
         self.assertSoupEquals(markup)
 
+    def test_nonsensical_declaration(self):
+        # Declarations that don't make any sense are ignored.
+        self.assertSoupEquals('<! Foo = -8><p>a</p>', "<p>a</p>")
+
     def test_cdata_where_it_doesnt_belong(self):
         #CDATA sections are ignored.
         markup = "<div><![CDATA[foo]]>"
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-13 20:00:33 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-13 20:00:33 -0500
commit	5aa0e5ba8ec6a590c673db54c60ff2a76544f14f (patch)
tree	bf0637e79bc667a1b5ae08ca4794df414f169992
parent	87a55b145f0a73e6fc9ede9a762d81d2527161b6 (diff)
parent	bc97bb3a83ee9fb4c8e31d11069ccf1cda61d4ff (diff)