diff options
| | | |
|---|---|---|
| author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 15:13:41 -0500 |
| committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 15:13:41 -0500 |
| commit | 8249b803d9bab9c06be02a244e629cb732f4f5b1 (patch) | |
| tree | 447cddabac142fefd583df1acd6268f6abcb8f5c /tests | |
| parent | 0dda99b15112df7225e647db9702fbd62dcc8ea8 (diff) | |
| parent | e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff) | |
Ported the rest of the HTML tests, including tests of broken HTML from the TODO. Made Unicode, Dammit PEP-8 compliant.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test_html5lib.py | 56 |
-rw-r--r-- | tests/test_lxml.py | 65 |
-rw-r--r-- | tests/test_soup.py | 20 |
3 files changed, 121 insertions, 20 deletions
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 59d84a3..3045b02 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -131,14 +131,56 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): self.assertEquals(comment, 'b <p') self.assertEquals(str2, 'c') - def test_foo(self): - isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" - soup = self.soup(isolatin) + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo >a') + # 'Foo' becomes a comment that appears before the HTML. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'Foo') - utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) - utf8 = utf8.replace("\xe9", "\xc3\xa9") + self.assertEquals(self.find(text="a") == "a") - #print soup + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """<a href="foo</a>, </a><a href="bar">baz</a>""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=') + + #The string after the second quote (bar"), was treated as an + #empty attribute called bar". + self.assertEquals(soup.a['bar"'], '') + self.assertEquals(soup.a.string, "baz") + + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo ><p>a</p>') + # The declaration becomes a comment. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, ' Foo ') + self.assertEquals(soup.p.string, 'a') + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('<p>a<!b') + # This becomes a string 'a'. The incomplete declaration is ignored. + # Compare html5lib, which turns it into a comment. 
+ s, comment = soup.p.contents + self.assertEquals(s, 'a') + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'b') + + def test_entity_was_not_finished(self): + soup = self.soup("<p><Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "<Hello>") + + def test_nonexistent_entity(self): + soup = self.soup("<p>foo&#bar;baz</p>") + self.assertEquals(soup.p.string, "foo&#bar;baz") + + # Compare a real entity. + soup = self.soup("<p>foodbaz</p>") + self.assertEquals(soup.p.string, "foodbaz") class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): @@ -151,7 +193,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, fromEncoding="iso-8859-8") - self.assertEquals(soup.originalEncoding, 'iso8859-8') + self.assertEquals(soup.original_encoding, 'iso8859-8') self.assertEquals( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 4c11b1d..7e15dcf 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -376,6 +376,59 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup = "<div><![CDATA[foo]]>" self.assertSoupEquals(markup, "<div></div>") + def test_attribute_value_never_got_closed(self): + markup = '<a href="http://foo.com/</a> and blah and blah' + soup = self.soup(markup) + self.assertEquals( + soup.a['href'], "http://foo.com/</a> and blah and blah") + + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """<a href="foo</a>, </a><a href="bar">baz</a>""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=') + + #The string after the second quote (bar"), was treated as an + #empty attribute called bar. 
+ self.assertEquals(soup.a['bar'], '') + self.assertEquals(soup.a.string, "baz") + + def test_attribute_value_with_embedded_brackets(self): + soup = self.soup('<a b="<a>">') + self.assertEquals(soup.a['b'], '<a>') + + def test_nonexistent_entity(self): + soup = self.soup("<p>foo&#bar;baz</p>") + self.assertEquals(soup.p.string, "foobar;baz") + + # Compare a real entity. + soup = self.soup("<p>foodbaz</p>") + self.assertEquals(soup.p.string, "foodbaz") + + # Also compare html5lib, which preserves the &# before the + # entity name. + + def test_entity_was_not_finished(self): + soup = self.soup("<p><Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "<Hello>") + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('<p>a<!b') + # This becomes a string 'a'. The incomplete declaration is ignored. + # Compare html5lib, which turns it into a comment. + self.assertEquals(soup.p.contents, ['a']) + + def test_document_starts_with_bogus_declaration(self): + soup = self.soup('<! Foo ><p>a</p>') + # The declaration is ignored altogether. + self.assertEquals(soup.encode(), "<html><body><p>a</p></body></html>") + + def test_tag_name_contains_unicode(self): + # Unicode characters in tag names are stripped. + tag_name = u"<our\N{SNOWMAN}>Joe</our\N{SNOWMAN}>" + self.assertSoupEquals("<our>Joe</our>") class TestLXMLBuilderEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various @@ -391,25 +444,25 @@ class TestLXMLBuilderEncodingConversion(SoupTest): "<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>") def test_ascii_in_unicode_out(self): - # ASCII input is converted to Unicode. The originalEncoding + # ASCII input is converted to Unicode. The original_encoding # attribute is set. 
ascii = "<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEquals(unicode_output, self.document_for(ascii)) - self.assertEquals(soup_from_ascii.originalEncoding, "ascii") + self.assertEquals(soup_from_ascii.original_encoding, "ascii") def test_unicode_in_unicode_out(self): - # Unicode input is left alone. The originalEncoding attribute + # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEquals(soup_from_unicode.decode(), self.unicode_data) self.assertEquals(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') - self.assertEquals(soup_from_unicode.originalEncoding, None) + self.assertEquals(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): - # UTF-8 input is converted to Unicode. The originalEncoding + # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEquals(soup_from_utf8.decode(), self.unicode_data) @@ -427,7 +480,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest): # Hebrew encoding) to UTF-8. 
soup = self.soup(self.HEBREW_DOCUMENT, fromEncoding="iso-8859-8") - self.assertEquals(soup.originalEncoding, 'iso-8859-8') + self.assertEquals(soup.original_encoding, 'iso-8859-8') self.assertEquals( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) diff --git a/tests/test_soup.py b/tests/test_soup.py index 4fb2142..01dff53 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -19,15 +19,21 @@ class TestSelectiveParsing(SoupTest): class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" - def test_smart_quotes_to_xml_entities(self): + def test_smart_quotes_to_unicode(self): markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEquals( + dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>") + + def test_smart_quotes_to_xml_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEquals( dammit.unicode, "<foo>‘’“”</foo>") def test_smart_quotes_to_html_entities(self): markup = "<foo>\x91\x92\x93\x94</foo>" - dammit = UnicodeDammit(markup, smartQuotesTo="html") + dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEquals( dammit.unicode, "<foo>‘’“”</foo>") @@ -35,27 +41,27 @@ class TestUnicodeDammit(unittest.TestCase): utf8 = "\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEquals(dammit.unicode, u'\xe9') - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') def test_convert_hebrew(self): hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEquals(dammit.originalEncoding, 'iso-8859-8') + self.assertEquals(dammit.original_encoding, 'iso-8859-8') self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) - self.assertEquals(dammit.originalEncoding, 
'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') self.assertEquals(dammit.unicode.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') |