Ported the bad-markup tests that were lying around in the TODO.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 14:24:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 14:24:42 -0500
commit: 0c9e690dedf720c7c34cc2433f0ccd03f7eb2a85 (patch)
tree: 399e4aba45e8488914731b609ee8890680aa4535
parent: b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (diff)
3 files changed, 97 insertions, 35 deletions
diff --git a/TODO b/TODO
index 887c426..74ce8bd 100644
--- a/TODO
+++ b/TODO
@@ -26,35 +26,6 @@ objects instead of Comment objects in this situation.
 
 ---
 
-Here are some unit tests that fail with HTMLParser.
-
-    def testValidButBogusDeclarationFAILS(self):
-        self.assertSoupEquals('<! Foo >a', '<!Foo >a')
-
-    def testIncompleteDeclarationAtEndFAILS(self):
-        self.assertSoupEquals('a<!b')
-
-    def testIncompleteEntityAtEndFAILS(self):
-        self.assertSoupEquals('&lt;Hello&gt')
-
-        # This is not what the original author had in mind, but it's
-        # a legitimate interpretation of what they wrote.
-        self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
-        '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')
-        # SGMLParser generates bogus parse events when attribute values
-        # contain embedded brackets, but at least Beautiful Soup fixes
-        # it up a little.
-        self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>')
-        self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
-                              """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
-
-        invalidEntity = "foo&#bar;baz"
-        soup = BeautifulStoneSoup\
-               (invalidEntity,
-                convertEntities=htmlEnt)
-        self.assertEquals(str(soup), invalidEntity)
-
-
 Tag names that contain Unicode characters crash the parser:
     def testUnicodeTagNamesFAILS(self):
 	self.assertSoupEquals("<デダ芻デダtext>2PM</デダ芻デダtext>")
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 5aeac76..3045b02 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -131,14 +131,56 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
         self.assertEquals(comment, 'b <p')
         self.assertEquals(str2, 'c')
 
-    def test_foo(self):
-        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
-        soup = self.soup(isolatin)
+    def test_document_starts_with_bogus_declaration(self):
+        soup = self.soup('<! Foo >a')
+        # 'Foo' becomes a comment that appears before the HTML.
+        comment = soup.contents[0]
+        self.assertTrue(isinstance(comment, Comment))
+        self.assertEquals(comment, 'Foo')
 
-        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
-        utf8 = utf8.replace("\xe9", "\xc3\xa9")
+        self.assertEquals(self.find(text="a") == "a")
 
-        #print soup
+    def test_attribute_value_was_closed_by_subsequent_tag(self):
+        markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+        soup = self.soup(markup)
+        # The string between the first and second quotes was interpreted
+        # as the value of the 'href' attribute.
+        self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+        #The string after the second quote (bar"), was treated as an
+        #empty attribute called bar".
+        self.assertEquals(soup.a['bar"'], '')
+        self.assertEquals(soup.a.string, "baz")
+
+    def test_document_starts_with_bogus_declaration(self):
+        soup = self.soup('<! Foo ><p>a</p>')
+        # The declaration becomes a comment.
+        comment = soup.contents[0]
+        self.assertTrue(isinstance(comment, Comment))
+        self.assertEquals(comment, ' Foo ')
+        self.assertEquals(soup.p.string, 'a')
+
+    def test_document_ends_with_incomplete_declaration(self):
+        soup = self.soup('<p>a<!b')
+        # This becomes a string 'a' followed by a comment 'b'.
+        # Compare lxml, which ignores the incomplete declaration.
+        s, comment = soup.p.contents
+        self.assertEquals(s, 'a')
+        self.assertTrue(isinstance(comment, Comment))
+        self.assertEquals(comment, 'b')
+
+    def test_entity_was_not_finished(self):
+        soup = self.soup("<p>&lt;Hello&gt")
+        # html5lib completes the entity. Compare lxml, which leaves it unfinished.
+        self.assertEquals(soup.p.string, "<Hello>")
+
+    def test_nonexistent_entity(self):
+        soup = self.soup("<p>foo&#bar;baz</p>")
+        self.assertEquals(soup.p.string, "foo&#bar;baz")
+
+        # Compare a real entity.
+        soup = self.soup("<p>foo&#100;baz</p>")
+        self.assertEquals(soup.p.string, "foodbaz")
 
 
 class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 11ef15a..85c6a1b 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -376,6 +376,55 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
         markup = "<div><![CDATA[foo]]>"
         self.assertSoupEquals(markup, "<div></div>")
 
+    def test_attribute_value_never_got_closed(self):
+        markup = '<a href="http://foo.com/</a> and blah and blah'
+        soup = self.soup(markup)
+        self.assertEquals(
+            soup.a['href'], "http://foo.com/</a> and blah and blah")
+
+    def test_attribute_value_was_closed_by_subsequent_tag(self):
+        markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+        soup = self.soup(markup)
+        # The string between the first and second quotes was interpreted
+        # as the value of the 'href' attribute.
+        self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+        #The string after the second quote (bar"), was treated as an
+        #empty attribute called bar.
+        self.assertEquals(soup.a['bar'], '')
+        self.assertEquals(soup.a.string, "baz")
+
+    def test_attribute_value_with_embedded_brackets(self):
+        soup = self.soup('<a b="<a>">')
+        self.assertEquals(soup.a['b'], '<a>')
+
+    def test_nonexistent_entity(self):
+        soup = self.soup("<p>foo&#bar;baz</p>")
+        self.assertEquals(soup.p.string, "foobar;baz")
+
+        # Compare a real entity.
+        soup = self.soup("<p>foo&#100;baz</p>")
+        self.assertEquals(soup.p.string, "foodbaz")
+
+        # Also compare html5lib, which preserves the &# before the
+        # entity name.
+
+    def test_entity_was_not_finished(self):
+        soup = self.soup("<p>&lt;Hello&gt")
+        # Compare html5lib, which completes the entity.
+        self.assertEquals(soup.p.string, "<Hello&gt")
+
+    def test_document_ends_with_incomplete_declaration(self):
+        soup = self.soup('<p>a<!b')
+        # This becomes a string 'a'. The incomplete declaration is ignored.
+        # Compare html5lib, which turns it into a comment.
+        self.assertEquals(soup.p.contents, ['a'])
+
+    def test_document_starts_with_bogus_declaration(self):
+        soup = self.soup('<! Foo ><p>a</p>')
+        # The declaration is ignored altogether.
+        self.assertEquals(soup.encode(), "<html><body><p>a</p></body></html>")
+
 
 class TestLXMLBuilderEncodingConversion(SoupTest):
     # Test Beautiful Soup's ability to decode and encode from various
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 14:24:42 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 14:24:42 -0500
commit	0c9e690dedf720c7c34cc2433f0ccd03f7eb2a85 (patch)
tree	399e4aba45e8488914731b609ee8890680aa4535
parent	b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (diff)