diff options
field | value | date |
---|---|---|
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-11 09:10:56 -0500 |
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-11 09:10:56 -0500 |
commit | d0531c4204a67a4289025bf7108a922f680fa057 (patch) | |
tree | cdad3f97812e658d84a611b6017b7198fd97d818 | |
parent | 3366ad67dc2dfdd508267efc87dfc851b612fb0d (diff) | |
parent | d89c8878ea86a2575c87e9fad8081cfcd81e0bcd (diff) | |
Ported some more tests, fixed an encoding problem, and added rudimentary doctype handling.
-rw-r--r-- | TODO | 5 |
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 41 |
-rw-r--r-- | beautifulsoup/element.py | 8 |
-rw-r--r-- | tests/test_html5lib.py | 32 |
-rw-r--r-- | tests/test_lxml.py | 119 |
-rw-r--r-- | tests/test_tree.py | 16 |
6 files changed, 215 insertions, 6 deletions
@@ -2,6 +2,11 @@ html5lib has its own Unicode, Dammit-like system. Converting the input to Unicode should be up to the builder. The lxml builder would use Unicode, Dammit, and the html5lib builder would be a no-op. +Bare ampersands should be converted to HTML entities upon output. + +It should also be possible to convert certain Unicode characters to +HTML entities upon output. + --- Here are some unit tests that fail with HTMLParser. diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py new file mode 100644 index 0000000..8336ab4 --- /dev/null +++ b/beautifulsoup/builder/lxml_builder.py @@ -0,0 +1,41 @@ +from lxml import etree +from beautifulsoup.element import Comment, Doctype +from beautifulsoup.builder import HTMLTreeBuilder + +class LXMLTreeBuilder(HTMLTreeBuilder): + + def __init__(self, parser_class=etree.HTMLParser): + self.parser = parser_class(target=self) + self.soup = None + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def close(self): + pass + + def start(self, name, attrs): + self.soup.handle_starttag(name, attrs) + + def end(self, name): + self.soup.handle_endtag(name) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + self.soup.handle_data(name) + self.soup.endData(Doctype) + + def comment(self, content): + "Handle comments as Comment objects." 
+ self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><body>%s</body></html>' % fragment + diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index bd9bcbf..b2e0e12 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -346,9 +346,6 @@ class NavigableString(unicode, PageElement): else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.decode().encode(encoding) - def decodeGivenEventualEncoding(self, eventualEncoding): return self @@ -373,6 +370,11 @@ class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!' + self + u'>' +class Doctype(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'<!DOCTYPE ' + self + u'>' + class Tag(PageElement, Entities): """Represents a found HTML tag with its attributes and contents.""" diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 3a4ee27..dada900 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -19,11 +19,32 @@ class TestHTML5Builder(TestLXMLBuilder): self.assertSoupEquals( "A bare string", "A bare string") + def test_correctly_nested_tables(self): + markup = ('<table id="1">' + '<tr>' + "<td>Here's another table:" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + + self.assertSoupEquals( + markup, + '<table id="1"><tbody><tr><td>Here\'s another table:' + '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>' + '</td></tr></tbody></table>') + + self.assertSoupEquals( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_collapsed_whitespace(self): """Whitespace is preserved even in tags that don't require it.""" 
self.assertSoupEquals("<p> </p>") self.assertSoupEquals("<b> </b>") + def test_cdata(self): + print self.soup("<div><![CDATA[foo]]></div>") class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @@ -40,12 +61,21 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): '<blockquote><p><b>Foo</blockquote><p>Bar', '<blockquote><p><b>Foo</b></p></blockquote><p><b>Bar</b></p>') + def test_table_containing_bare_markup(self): + # Markup should be in table cells, not directly in the table. + self.assertSoupEquals("<table><div>Foo</div></table>", + "<div>Foo</div><table></table>") + def test_incorrectly_nested_tables(self): self.assertSoupEquals( '<table><tr><table><tr id="nested">', ('<table><tbody><tr></tr></tbody></table>' '<table><tbody><tr id="nested"></tr></tbody></table>')) + def test_doctype_in_body(self): + markup = "<p>one<!DOCTYPE foobar>two</p>" + self.assertSoupEquals(markup, "<p>onetwo</p>") + def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" soup = self.soup(isolatin) @@ -53,4 +83,4 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) utf8 = utf8.replace("\xe9", "\xc3\xa9") - print soup + #print soup diff --git a/tests/test_lxml.py b/tests/test_lxml.py index d16e8d9..9a65f6a 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -1,5 +1,7 @@ """Tests to ensure that the lxml tree builder generates good trees.""" +import re + from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder from beautifulsoup.element import Comment @@ -65,6 +67,34 @@ class TestLXMLBuilder(SoupTest): self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') + # This is a <table> tag containing another <table> tag in one of its + # cells. 
+ TABLE_MARKUP_1 = ('<table id="1">' + '<tr>' + "<td>Here's another table:" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + + def test_correctly_nested_tables(self): + markup = ('<table id="1">' + '<tr>' + "<td>Here's another table:" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + + self.assertSoupEquals( + markup, + '<table id="1"><tr><td>Here\'s another table:' + '<table id="2"><tr><td>foo</td></tr></table>' + '</td></tr></table>') + + self.assertSoupEquals( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_collapsed_whitespace(self): """In most tags, whitespace is collapsed.""" self.assertSoupEquals("<p> </p>", "<p> </p>") @@ -114,14 +144,82 @@ class TestLXMLBuilder(SoupTest): soup = BeautifulSoup('<script>%s</script>' % javascript) self.assertEquals(soup.script.string, javascript) + def test_naked_ampersands(self): + # Ampersands are left alone. + text = "<p>AT&T</p>" + soup = self.soup(text) + self.assertEquals(soup.p.string, "AT&T") + + # Even if they're in attribute values. + invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>' + soup = self.soup(invalid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. 
+ text = "<p><<sacré bleu!>></p>" + expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" + self.assertSoupEquals(text, expected) + + def test_entities_in_attribute_values_converted_during_parsing(self): + text = '<x t="piñata">' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '<x t="piñata">' + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '<x t="sacré bleu">' + soup = self.soup(text) + self.assertEquals( + soup.x['t'], + u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") + + # This can cause valid HTML to become invalid. + valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>' + soup = self.soup(valid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = "<p>\x91Foo\x92</p>" + soup = self.soup(quote) + self.assertEquals( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("<a> </a>") + self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + + # Tests below this line need work. 
+ + #def test_doctype(self): + # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>' + # self.assertSoupEquals(xml) + + + #def test_cdata(self): + # print self.soup("<div><![CDATA[foo]]></div>") + + def test_entities_converted_on_the_way_out(self): + text = "<p><<sacré bleu!>></p>" + expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") + soup = BeautifulSoup(text) + str = soup.p.string + #self.assertEquals(str.encode("utf-8"), expected) + def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" soup = self.soup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) utf8 = utf8.replace("\xe9", "\xc3\xa9") - - print soup + #print soup class TestLXMLBuilderInvalidMarkup(SoupTest): @@ -134,6 +232,20 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup at all. """ + def test_table_containing_bare_markup(self): + # Markup should be in table cells, not directly in the table. + self.assertSoupEquals("<table><div>Foo</div></table>") + + def test_incorrectly_nested_table(self): + # The second <table> tag is floating in the <tr> tag + # rather than being inside a <td>. + bad_markup = ('<table id="1">' + '<tr>' + "<td>Here's another table:</td>" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. 
self.assertSoupEquals( @@ -157,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '<table><tr><table><tr id="nested">', '<table><tr><table><tr id="nested"></tr></table></tr></table>') + def test_doctype_in_body(self): + markup = "<p>one<!DOCTYPE foobar>two</p>" + self.assertSoupEquals(markup) diff --git a/tests/test_tree.py b/tests/test_tree.py index eac4e72..367489e 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -815,3 +815,19 @@ class TestPersistence(SoupTest): dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) + + +class TestEncoding(SoupTest): + """Test the ability to encode objects into strings.""" + + def test_unicode_string_can_be_encoded(self): + html = u"<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + self.assertEquals(soup.b.string.encode("utf-8"), + u"\N{SNOWMAN}".encode("utf-8")) + + def test_tag_containing_unicode_string_can_be_encoded(self): + html = u"<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + self.assertEquals( + soup.b.encode("utf-8"), html.encode("utf-8")) |