diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 08:53:17 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 08:53:17 -0500 |
commit | 2208a31babdd6ec331bde1ae82b83b35553cb0ce (patch) | |
tree | 12c57880f0f5cff39b31e4e3e6e4735594b65e69 | |
parent | 4eda5e1cf72ddb1b6a857d5bf9c8c655000d4990 (diff) |
Ported the encoding tests, and split them up into logical chunks. The html5lib writer isn't setting up the charset substitution.
-rw-r--r-- | tests/test_lxml.py | 66 | ||||
-rw-r--r-- | tests/test_tree.py | 41 |
2 files changed, 107 insertions, 0 deletions
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 6adc2b3..e32872e 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -228,8 +228,74 @@ class TestLXMLBuilder(SoupTest):
         # Test a namespaced doctype with a public id.
         self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')
+    def test_real_iso_latin_document(self):
+        # Smoke test of interrelated functionality, using an
+        # easy-to-understand document.
+
+        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+        # That's because we're going to encode it into ISO-Latin-1, and use
+        # that to test.
+        iso_latin_html = unicode_html.encode("iso-8859-1")
+
+        # Parse the ISO-Latin-1 HTML.
+        soup = self.soup(iso_latin_html)
+        # Encode it to UTF-8.
+        result = soup.encode("utf-8")
+
+        # What do we expect the result to look like? Well, it would
+        # look like unicode_html, except that the META tag would say
+        # UTF-8 instead of ISO-Latin-1.
+        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+        # And, of course, it would be in UTF-8, not Unicode.
+        expected = expected.encode("utf-8")
+
+        # Ta-da!
+        self.assertEquals(result, expected)
+
+    def test_real_shift_jis_document(self):
+        # Smoke test to make sure the parser can handle a document in
+        # Shift-JIS encoding, without choking.
+        shift_jis_html = (
+            '<html><head></head><body><pre>'
+            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            '</pre></body></html>')
+        unicode_html = shift_jis_html.decode("shift-jis")
+        soup = self.soup(shift_jis_html)
+
+        # Make sure the parse tree is correctly encoded to various
+        # encodings.
+        self.assertEquals(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+        self.assertEquals(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+    # Tests below this line need work.
+    def test_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja" />'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+        self.assertEquals(parsed_meta['content'],
+                          'text/html; charset=%SOUP-ENCODING%')
+        self.assertEquals(parsed_meta.containsSubstitutions, True)
+
+        # For the rest of the story, see TestSubstitutions in
+        # test_tree.py.
     def test_entities_converted_on_the_way_out(self):
         text = "<p><<sacré bleu!>></p>"
diff --git a/tests/test_tree.py b/tests/test_tree.py
index e424e0b..02efead 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -817,6 +817,47 @@ class TestPersistence(SoupTest):
         self.assertEqual(loaded.decode(), soup.decode())
+class TestSubstitutions(SoupTest):
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" />')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        self.assertEquals(soup.meta['content'],
+                          'text/html; charset=%SOUP-ENCODING%')
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        self.assertTrue("charset=utf-8" in utf_8)
+
+        euc_jp = soup.encode("euc_jp")
+        self.assertTrue("charset=euc_jp" in euc_jp)
+
+        shift_jis = soup.encode("shift-jis")
+        self.assertTrue("charset=shift-jis" in shift_jis)
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        self.assertTrue("charset=utf-16" in utf_16_u)
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                  'http-equiv="Content-type" /></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = BeautifulSoup(markup, parseOnlyThese=strainer)
+        self.assertEquals(soup.contents[0].name, 'pre')
+
+
+
 class TestEncoding(SoupTest):
     """Test the ability to encode objects into strings."""