Added some elementary doctype handling.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-10 16:41:10 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-10 16:41:10 -0500
commit: d89c8878ea86a2575c87e9fad8081cfcd81e0bcd (patch)
tree: cdad3f97812e658d84a611b6017b7198fd97d818
parent: e1ad4220e5ca00ec0e7f77ce5087845fcb356a0e (diff)
4 files changed, 63 insertions, 0 deletions
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
new file mode 100644
index 0000000..8336ab4
--- /dev/null
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -0,0 +1,41 @@
+from lxml import etree
+from beautifulsoup.element import Comment, Doctype
+from beautifulsoup.builder import HTMLTreeBuilder
+
+class LXMLTreeBuilder(HTMLTreeBuilder):
+
+    def __init__(self, parser_class=etree.HTMLParser):
+        self.parser = parser_class(target=self)
+        self.soup = None
+
+    def feed(self, markup):
+        self.parser.feed(markup)
+        self.parser.close()
+
+    def close(self):
+        pass
+
+    def start(self, name, attrs):
+        self.soup.handle_starttag(name, attrs)
+
+    def end(self, name):
+        self.soup.handle_endtag(name)
+
+    def data(self, content):
+        self.soup.handle_data(content)
+
+    def doctype(self, name, pubid, system):
+        self.soup.endData()
+        self.soup.handle_data(name)
+        self.soup.endData(Doctype)
+
+    def comment(self, content):
+        "Handle comments as Comment objects."
+        self.soup.endData()
+        self.soup.handle_data(content)
+        self.soup.endData(Comment)
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<html><body>%s</body></html>' % fragment
+
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 7ecd482..b2e0e12 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -370,6 +370,11 @@ class Declaration(NavigableString):
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<!' + self + u'>'
 
+class Doctype(NavigableString):
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!DOCTYPE ' + self + u'>'
+
 class Tag(PageElement, Entities):
 
     """Represents a found HTML tag with its attributes and contents."""
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3e35949..dada900 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -43,6 +43,8 @@ class TestHTML5Builder(TestLXMLBuilder):
         self.assertSoupEquals("<p>   </p>")
         self.assertSoupEquals("<b>   </b>")
 
+    def test_cdata(self):
+        print self.soup("<div><![CDATA[foo]]></div>")
 
 class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
     """See `BuilderInvalidMarkupSmokeTest`."""
@@ -70,6 +72,10 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
             ('<table><tbody><tr></tr></tbody></table>'
              '<table><tbody><tr id="nested"></tr></tbody></table>'))
 
+    def test_doctype_in_body(self):
+        markup = "<p>one<!DOCTYPE foobar>two</p>"
+        self.assertSoupEquals(markup, "<p>onetwo</p>")
+
     def test_foo(self):
         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
         soup = self.soup(isolatin)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 2af952f..9a65f6a 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -198,6 +198,14 @@ class TestLXMLBuilder(SoupTest):
 
     # Tests below this line need work.
 
+    #def test_doctype(self):
+    #    xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>'
+    #    self.assertSoupEquals(xml)
+
+
+    #def test_cdata(self):
+    #    print self.soup("<div><![CDATA[foo]]></div>")
+
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
         expected = u"&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;".encode("utf-8")
@@ -261,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
             '<table><tr><table><tr id="nested">',
             '<table><tr><table><tr id="nested"></tr></table></tr></table>')
 
+    def test_doctype_in_body(self):
+        markup = "<p>one<!DOCTYPE foobar>two</p>"
+        self.assertSoupEquals(markup)
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-10 16:41:10 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-10 16:41:10 -0500
commit	d89c8878ea86a2575c87e9fad8081cfcd81e0bcd (patch)
tree	cdad3f97812e658d84a611b6017b7198fd97d818
parent	e1ad4220e5ca00ec0e7f77ce5087845fcb356a0e (diff)