summary refs log tree commit diff
path: root/tests
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 15:13:41 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 15:13:41 -0500
commit 8249b803d9bab9c06be02a244e629cb732f4f5b1 (patch)
tree 447cddabac142fefd583df1acd6268f6abcb8f5c /tests
parent 0dda99b15112df7225e647db9702fbd62dcc8ea8 (diff)
parent e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff)
Ported the rest of the HTML tests, including tests of broken HTML from the TODO. Made Unicode, Dammit PEP-8 compliant.
Diffstat (limited to 'tests')
-rw-r--r-- tests/test_html5lib.py 56
-rw-r--r-- tests/test_lxml.py 65
-rw-r--r-- tests/test_soup.py 20
3 files changed, 121 insertions, 20 deletions
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 59d84a3..3045b02 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -131,14 +131,56 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
self.assertEquals(comment, 'b <p')
self.assertEquals(str2, 'c')
- def test_foo(self):
- isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
- soup = self.soup(isolatin)
+ def test_document_starts_with_bogus_declaration(self):
+ soup = self.soup('<! Foo >a')
+ # 'Foo' becomes a comment that appears before the HTML.
+ comment = soup.contents[0]
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, 'Foo')
- utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
- utf8 = utf8.replace("\xe9", "\xc3\xa9")
+ self.assertEquals(self.find(text="a") == "a")
- #print soup
+ def test_attribute_value_was_closed_by_subsequent_tag(self):
+ markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+ soup = self.soup(markup)
+ # The string between the first and second quotes was interpreted
+ # as the value of the 'href' attribute.
+ self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+ #The string after the second quote (bar"), was treated as an
+ #empty attribute called bar".
+ self.assertEquals(soup.a['bar"'], '')
+ self.assertEquals(soup.a.string, "baz")
+
+ def test_document_starts_with_bogus_declaration(self):
+ soup = self.soup('<! Foo ><p>a</p>')
+ # The declaration becomes a comment.
+ comment = soup.contents[0]
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, ' Foo ')
+ self.assertEquals(soup.p.string, 'a')
+
+ def test_document_ends_with_incomplete_declaration(self):
+ soup = self.soup('<p>a<!b')
+ # This becomes a string 'a'. The incomplete declaration is ignored.
+ # Compare html5lib, which turns it into a comment.
+ s, comment = soup.p.contents
+ self.assertEquals(s, 'a')
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, 'b')
+
+ def test_entity_was_not_finished(self):
+ soup = self.soup("<p>&lt;Hello&gt")
+ # Compare html5lib, which completes the entity.
+ self.assertEquals(soup.p.string, "<Hello>")
+
+ def test_nonexistent_entity(self):
+ soup = self.soup("<p>foo&#bar;baz</p>")
+ self.assertEquals(soup.p.string, "foo&#bar;baz")
+
+ # Compare a real entity.
+ soup = self.soup("<p>foo&#100;baz</p>")
+ self.assertEquals(soup.p.string, "foodbaz")
class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
@@ -151,7 +193,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
fromEncoding="iso-8859-8")
- self.assertEquals(soup.originalEncoding, 'iso8859-8')
+ self.assertEquals(soup.original_encoding, 'iso8859-8')
self.assertEquals(
soup.encode('utf-8'),
self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 4c11b1d..7e15dcf 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -376,6 +376,59 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
markup = "<div><![CDATA[foo]]>"
self.assertSoupEquals(markup, "<div></div>")
+ def test_attribute_value_never_got_closed(self):
+ markup = '<a href="http://foo.com/</a> and blah and blah'
+ soup = self.soup(markup)
+ self.assertEquals(
+ soup.a['href'], "http://foo.com/</a> and blah and blah")
+
+ def test_attribute_value_was_closed_by_subsequent_tag(self):
+ markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+ soup = self.soup(markup)
+ # The string between the first and second quotes was interpreted
+ # as the value of the 'href' attribute.
+ self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+ #The string after the second quote (bar"), was treated as an
+ #empty attribute called bar.
+ self.assertEquals(soup.a['bar'], '')
+ self.assertEquals(soup.a.string, "baz")
+
+ def test_attribute_value_with_embedded_brackets(self):
+ soup = self.soup('<a b="<a>">')
+ self.assertEquals(soup.a['b'], '<a>')
+
+ def test_nonexistent_entity(self):
+ soup = self.soup("<p>foo&#bar;baz</p>")
+ self.assertEquals(soup.p.string, "foobar;baz")
+
+ # Compare a real entity.
+ soup = self.soup("<p>foo&#100;baz</p>")
+ self.assertEquals(soup.p.string, "foodbaz")
+
+ # Also compare html5lib, which preserves the &# before the
+ # entity name.
+
+ def test_entity_was_not_finished(self):
+ soup = self.soup("<p>&lt;Hello&gt")
+ # Compare html5lib, which completes the entity.
+ self.assertEquals(soup.p.string, "<Hello&gt")
+
+ def test_document_ends_with_incomplete_declaration(self):
+ soup = self.soup('<p>a<!b')
+ # This becomes a string 'a'. The incomplete declaration is ignored.
+ # Compare html5lib, which turns it into a comment.
+ self.assertEquals(soup.p.contents, ['a'])
+
+ def test_document_starts_with_bogus_declaration(self):
+ soup = self.soup('<! Foo ><p>a</p>')
+ # The declaration is ignored altogether.
+ self.assertEquals(soup.encode(), "<html><body><p>a</p></body></html>")
+
+ def test_tag_name_contains_unicode(self):
+ # Unicode characters in tag names are stripped.
+ tag_name = u"<our\N{SNOWMAN}>Joe</our\N{SNOWMAN}>"
+ self.assertSoupEquals("<our>Joe</our>")
class TestLXMLBuilderEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
@@ -391,25 +444,25 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
def test_ascii_in_unicode_out(self):
- # ASCII input is converted to Unicode. The originalEncoding
+ # ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = "<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEquals(unicode_output, self.document_for(ascii))
- self.assertEquals(soup_from_ascii.originalEncoding, "ascii")
+ self.assertEquals(soup_from_ascii.original_encoding, "ascii")
def test_unicode_in_unicode_out(self):
- # Unicode input is left alone. The originalEncoding attribute
+ # Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEquals(soup_from_unicode.decode(), self.unicode_data)
self.assertEquals(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
- self.assertEquals(soup_from_unicode.originalEncoding, None)
+ self.assertEquals(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
- # UTF-8 input is converted to Unicode. The originalEncoding
+ # UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEquals(soup_from_utf8.decode(), self.unicode_data)
@@ -427,7 +480,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
fromEncoding="iso-8859-8")
- self.assertEquals(soup.originalEncoding, 'iso-8859-8')
+ self.assertEquals(soup.original_encoding, 'iso-8859-8')
self.assertEquals(
soup.encode('utf-8'),
self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
diff --git a/tests/test_soup.py b/tests/test_soup.py
index 4fb2142..01dff53 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,15 +19,21 @@ class TestSelectiveParsing(SoupTest):
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
- def test_smart_quotes_to_xml_entities(self):
+ def test_smart_quotes_to_unicode(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEquals(
+ dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+ def test_smart_quotes_to_xml_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
+ dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+ self.assertEquals(
dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup, smartQuotesTo="html")
+ dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEquals(
dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
@@ -35,27 +41,27 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = "\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEquals(dammit.unicode, u'\xe9')
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
def test_convert_hebrew(self):
hebrew = "\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
- self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
+ self.assertEquals(dammit.original_encoding, 'iso-8859-8')
self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
self.assertEquals(dammit.unicode.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')