From d89c8878ea86a2575c87e9fad8081cfcd81e0bcd Mon Sep 17 00:00:00 2001
From: Leonard Richardson
Date: Thu, 10 Feb 2011 16:41:10 -0500
Subject: Added some elementary doctype handling.
---
beautifulsoup/builder/lxml_builder.py | 41 +++++++++++++++++++++++++++++++++++
beautifulsoup/element.py | 5 +++++
tests/test_html5lib.py | 6 +++++
tests/test_lxml.py | 11 ++++++++++
4 files changed, 63 insertions(+)
create mode 100644 beautifulsoup/builder/lxml_builder.py
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
new file mode 100644
index 0000000..8336ab4
--- /dev/null
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -0,0 +1,41 @@
+from lxml import etree
+from beautifulsoup.element import Comment, Doctype
+from beautifulsoup.builder import HTMLTreeBuilder
+
+class LXMLTreeBuilder(HTMLTreeBuilder):
+    """Tree builder that installs itself as the ``target`` of an lxml parser."""
+    def __init__(self, parser_class=etree.HTMLParser):
+        self.parser = parser_class(target=self)  # parser events arrive via the methods below
+        self.soup = None  # not assigned here; every handler uses it, so set it before feed()
+
+    def feed(self, markup):
+        self.parser.feed(markup)
+        self.parser.close()  # the parser is closed after every feed() call
+
+    def close(self):
+        pass  # nothing to finalize on the builder side
+
+    def start(self, name, attrs):
+        self.soup.handle_starttag(name, attrs)  # forwarded to the soup unchanged
+
+    def end(self, name):
+        self.soup.handle_endtag(name)  # forwarded to the soup unchanged
+
+    def data(self, content):
+        self.soup.handle_data(content)  # forwarded to the soup unchanged
+
+    def doctype(self, name, pubid, system):  # same three-step pattern as comment()
+        self.soup.endData()  # presumably closes out pending text -- confirm in the soup class
+        self.soup.handle_data(name)  # only the name is passed on; pubid and system are unused
+        self.soup.endData(Doctype)
+
+    def comment(self, content):
+        """Handle comments as Comment objects."""
+        self.soup.endData()
+        self.soup.handle_data(content)
+        self.soup.endData(Comment)
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'%s' % fragment
+
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 7ecd482..b2e0e12 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -370,6 +370,11 @@ class Declaration(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
return u''
+class Doctype(NavigableString):
+    """NavigableString subclass used for the document type declaration."""
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u''  # always the empty unicode string; eventualEncoding is ignored
+
class Tag(PageElement, Entities):
"""Represents a found HTML tag with its attributes and contents."""
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3e35949..dada900 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -43,6 +43,8 @@ class TestHTML5Builder(TestLXMLBuilder):
self.assertSoupEquals("
")
self.assertSoupEquals(" ")
+    def test_cdata(self):  # NOTE(review): prints the soup and asserts nothing -- looks like a debugging stub
+        print self.soup("")  # NOTE(review): markup literal is empty in this copy; possibly lost -- confirm
class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
"""See `BuilderInvalidMarkupSmokeTest`."""
@@ -70,6 +72,10 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
(''
''))
+ def test_doctype_in_body(self):  # NOTE(review): both literals below are split across lines and appear to have lost their markup in this copy -- restore from the upstream commit
+ markup = "onetwo
"
+ self.assertSoupEquals(markup, "onetwo
")  # two-argument form: parsed markup is compared against a separate expected string
+
def test_foo(self):
isolatin = """Sacr\xe9 bleu!"""
soup = self.soup(isolatin)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 2af952f..9a65f6a 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -198,6 +198,14 @@ class TestLXMLBuilder(SoupTest):
# Tests below this line need work.
+ #def test_doctype(self):
+ # xml = 'foo
'
+ # self.assertSoupEquals(xml)
+
+
+ #def test_cdata(self):
+ # print self.soup("")
+
def test_entities_converted_on_the_way_out(self):
text = "<<sacré bleu!>>
"
expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8")
@@ -261,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
'',
'')
+ def test_doctype_in_body(self):  # NOTE(review): the literal below is split across lines and appears to have lost its markup in this copy -- restore from the upstream commit
+ markup = "onetwo
"
+ self.assertSoupEquals(markup)  # presumably expects the markup to come back unchanged -- confirm in SoupTest
--
cgit v1.2.3
|