Ported the encoding tests, and split them up into logical chunks. The html5lib writer isn't setting up the charset substitution.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 08:53:17 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 08:53:17 -0500
commit: 2208a31babdd6ec331bde1ae82b83b35553cb0ce (patch)
tree: 12c57880f0f5cff39b31e4e3e6e4735594b65e69
parent: 4eda5e1cf72ddb1b6a857d5bf9c8c655000d4990 (diff)
2 files changed, 107 insertions, 0 deletions
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 6adc2b3..e32872e 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -228,8 +228,74 @@ class TestLXMLBuilder(SoupTest):
         # Test a namespaced doctype with a public id.
         self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')
 
+    def test_real_iso_latin_document(self):
+        # Smoke test of interrelated functionality, using an
+        # easy-to-understand document.
+
+        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+        # That's because we're going to encode it into ISO-Latin-1, and use
+        # that to test.
+        iso_latin_html = unicode_html.encode("iso-8859-1")
+
+        # Parse the ISO-Latin-1 HTML.
+        soup = self.soup(iso_latin_html)
+        # Encode it to UTF-8.
+        result = soup.encode("utf-8")
+
+        # What do we expect the result to look like? Well, it would
+        # look like unicode_html, except that the META tag would say
+        # UTF-8 instead of ISO-Latin-1.
+        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+        # And, of course, it would be in UTF-8, not Unicode.
+        expected = expected.encode("utf-8")
+
+        # Ta-da!
+        self.assertEquals(result, expected)
+
+    def test_real_shift_jis_document(self):
+        # Smoke test to make sure the parser can handle a document in
+        # Shift-JIS encoding, without choking.
+        shift_jis_html = (
+            '<html><head></head><body><pre>'
+            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            '</pre></body></html>')
+        unicode_html = shift_jis_html.decode("shift-jis")
+        soup = self.soup(shift_jis_html)
+
+        # Make sure the parse tree is correctly encoded to various
+        # encodings.
+        self.assertEquals(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+        self.assertEquals(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+
     # Tests below this line need work.
 
+    def test_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja" />'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+        self.assertEquals(parsed_meta['content'],
+                          'text/html; charset=%SOUP-ENCODING%')
+        self.assertEquals(parsed_meta.containsSubstitutions, True)
+
+        # For the rest of the story, see TestSubstitutions in
+        # test_tree.py.
 
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
diff --git a/tests/test_tree.py b/tests/test_tree.py
index e424e0b..02efead 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -817,6 +817,47 @@ class TestPersistence(SoupTest):
         self.assertEqual(loaded.decode(), soup.decode())
 
 
+class TestSubstitutions(SoupTest):
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" />')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        self.assertEquals(soup.meta['content'],
+                          'text/html; charset=%SOUP-ENCODING%')
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        self.assertTrue("charset=utf-8" in utf_8)
+
+        euc_jp = soup.encode("euc_jp")
+        self.assertTrue("charset=euc_jp" in euc_jp)
+
+        shift_jis = soup.encode("shift-jis")
+        self.assertTrue("charset=shift-jis" in shift_jis)
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        self.assertTrue("charset=utf-16" in utf_16_u)
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" /></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = BeautifulSoup(markup, parseOnlyThese=strainer)
+        self.assertEquals(soup.contents[0].name, 'pre')
+
+
+
 class TestEncoding(SoupTest):
     """Test the ability to encode objects into strings."""
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 08:53:17 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 08:53:17 -0500
commit	2208a31babdd6ec331bde1ae82b83b35553cb0ce (patch)
tree	12c57880f0f5cff39b31e4e3e6e4735594b65e69
parent	4eda5e1cf72ddb1b6a857d5bf9c8c655000d4990 (diff)