diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 16:41:10 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 16:41:10 -0500 |
commit | d89c8878ea86a2575c87e9fad8081cfcd81e0bcd (patch) | |
tree | cdad3f97812e658d84a611b6017b7198fd97d818 | |
parent | e1ad4220e5ca00ec0e7f77ce5087845fcb356a0e (diff) | |
Added some elementary doctype handling.
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 41 |
-rw-r--r-- | beautifulsoup/element.py | 5 |
-rw-r--r-- | tests/test_html5lib.py | 6 |
-rw-r--r-- | tests/test_lxml.py | 11 |
4 files changed, 63 insertions, 0 deletions
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py new file mode 100644 index 0000000..8336ab4 --- /dev/null +++ b/beautifulsoup/builder/lxml_builder.py @@ -0,0 +1,41 @@ +from lxml import etree +from beautifulsoup.element import Comment, Doctype +from beautifulsoup.builder import HTMLTreeBuilder + +class LXMLTreeBuilder(HTMLTreeBuilder): + + def __init__(self, parser_class=etree.HTMLParser): + self.parser = parser_class(target=self) + self.soup = None + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def close(self): + pass + + def start(self, name, attrs): + self.soup.handle_starttag(name, attrs) + + def end(self, name): + self.soup.handle_endtag(name) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + self.soup.handle_data(name) + self.soup.endData(Doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><body>%s</body></html>' % fragment + diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 7ecd482..b2e0e12 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -370,6 +370,11 @@ class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!' 
+ self + u'>' +class Doctype(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'<!DOCTYPE ' + self + u'>' + class Tag(PageElement, Entities): """Represents a found HTML tag with its attributes and contents.""" diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 3e35949..dada900 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -43,6 +43,8 @@ class TestHTML5Builder(TestLXMLBuilder): self.assertSoupEquals("<p> </p>") self.assertSoupEquals("<b> </b>") + def test_cdata(self): + print self.soup("<div><![CDATA[foo]]></div>") class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @@ -70,6 +72,10 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): ('<table><tbody><tr></tr></tbody></table>' '<table><tbody><tr id="nested"></tr></tbody></table>')) + def test_doctype_in_body(self): + markup = "<p>one<!DOCTYPE foobar>two</p>" + self.assertSoupEquals(markup, "<p>onetwo</p>") + def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" soup = self.soup(isolatin) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 2af952f..9a65f6a 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -198,6 +198,14 @@ class TestLXMLBuilder(SoupTest): # Tests below this line need work. 
+ #def test_doctype(self): + # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>' + # self.assertSoupEquals(xml) + + + #def test_cdata(self): + # print self.soup("<div><![CDATA[foo]]></div>") + def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") @@ -261,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '<table><tr><table><tr id="nested">', '<table><tr><table><tr id="nested"></tr></table></tr></table>') + def test_doctype_in_body(self): + markup = "<p>one<!DOCTYPE foobar>two</p>" + self.assertSoupEquals(markup) |