diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 14:24:42 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 14:24:42 -0500 |
commit | 0c9e690dedf720c7c34cc2433f0ccd03f7eb2a85 (patch) | |
tree | 399e4aba45e8488914731b609ee8890680aa4535 | |
parent | b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (diff) |
Ported tests of bad markup that were lying around the TODO.
-rw-r--r-- | TODO | 29 |
-rw-r--r-- | tests/test_html5lib.py | 54 |
-rw-r--r-- | tests/test_lxml.py | 49 |
3 files changed, 97 insertions, 35 deletions
@@ -26,35 +26,6 @@ objects instead of Comment objects in this situation. --- -Here are some unit tests that fail with HTMLParser. - - def testValidButBogusDeclarationFAILS(self): - self.assertSoupEquals('<! Foo >a', '<!Foo >a') - - def testIncompleteDeclarationAtEndFAILS(self): - self.assertSoupEquals('a<!b') - - def testIncompleteEntityAtEndFAILS(self): - self.assertSoupEquals('<Hello>') - - # This is not what the original author had in mind, but it's - # a legitimate interpretation of what they wrote. - self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""", - '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>') - # SGMLParser generates bogus parse events when attribute values - # contain embedded brackets, but at least Beautiful Soup fixes - # it up a little. - self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>') - self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah', - """<a href='"http://foo.com/'></a><a> and blah and blah</a>""") - - invalidEntity = "foo&#bar;baz" - soup = BeautifulStoneSoup\ - (invalidEntity, - convertEntities=htmlEnt) - self.assertEquals(str(soup), invalidEntity) - - Tag names that contain Unicode characters crash the parser: def testUnicodeTagNamesFAILS(self): self.assertSoupEquals("<デダ芻デダtext>2PM</デダ芻デダtext>") diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 5aeac76..3045b02 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -131,14 +131,56 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): self.assertEquals(comment, 'b <p') self.assertEquals(str2, 'c') - def test_foo(self): - isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" - soup = self.soup(isolatin) + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo >a') + # 'Foo' becomes a comment that appears before the HTML. 
+ comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'Foo') - utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) - utf8 = utf8.replace("\xe9", "\xc3\xa9") + self.assertEquals(self.find(text="a") == "a") - #print soup + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """<a href="foo</a>, </a><a href="bar">baz</a>""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=') + + #The string after the second quote (bar"), was treated as an + #empty attribute called bar". + self.assertEquals(soup.a['bar"'], '') + self.assertEquals(soup.a.string, "baz") + + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo ><p>a</p>') + # The declaration becomes a comment. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, ' Foo ') + self.assertEquals(soup.p.string, 'a') + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('<p>a<!b') + # This becomes a string 'a'. The incomplete declaration is ignored. + # Compare html5lib, which turns it into a comment. + s, comment = soup.p.contents + self.assertEquals(s, 'a') + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'b') + + def test_entity_was_not_finished(self): + soup = self.soup("<p><Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "<Hello>") + + def test_nonexistent_entity(self): + soup = self.soup("<p>foo&#bar;baz</p>") + self.assertEquals(soup.p.string, "foo&#bar;baz") + + # Compare a real entity. 
+ soup = self.soup("<p>foodbaz</p>") + self.assertEquals(soup.p.string, "foodbaz") class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 11ef15a..85c6a1b 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -376,6 +376,55 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup = "<div><![CDATA[foo]]>" self.assertSoupEquals(markup, "<div></div>") + def test_attribute_value_never_got_closed(self): + markup = '<a href="http://foo.com/</a> and blah and blah' + soup = self.soup(markup) + self.assertEquals( + soup.a['href'], "http://foo.com/</a> and blah and blah") + + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """<a href="foo</a>, </a><a href="bar">baz</a>""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=') + + #The string after the second quote (bar"), was treated as an + #empty attribute called bar. + self.assertEquals(soup.a['bar'], '') + self.assertEquals(soup.a.string, "baz") + + def test_attribute_value_with_embedded_brackets(self): + soup = self.soup('<a b="<a>">') + self.assertEquals(soup.a['b'], '<a>') + + def test_nonexistent_entity(self): + soup = self.soup("<p>foo&#bar;baz</p>") + self.assertEquals(soup.p.string, "foobar;baz") + + # Compare a real entity. + soup = self.soup("<p>foodbaz</p>") + self.assertEquals(soup.p.string, "foodbaz") + + # Also compare html5lib, which preserves the &# before the + # entity name. + + def test_entity_was_not_finished(self): + soup = self.soup("<p><Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "<Hello>") + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('<p>a<!b') + # This becomes a string 'a'. The incomplete declaration is ignored. + # Compare html5lib, which turns it into a comment. 
+ self.assertEquals(soup.p.contents, ['a']) + + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo ><p>a</p>') + # The declaration is ignored altogether. + self.assertEquals(soup.encode(), "<html><body><p>a</p></body></html>") + class TestLXMLBuilderEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various |