Added experimental support for fixing Windows-1252 characters embedded in UTF-8 documents.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-27 18:54:22 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-27 18:54:22 -0400
commit: ccc48d4a84eaa0f35a982f297d5f68b114745832 (patch)
tree: 2303eec4589bc34ac940a53d6abca54399eb6def /bs4/tests/test_soup.py
parent: 561a1b479bc17bdd28b11dffa0085de8b2508444 (diff)
1 files changed, 38 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index bb97e52..ef58521 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -286,6 +286,44 @@ class TestUnicodeDammit(unittest.TestCase):
         self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
         self.assertEqual("utf-16le", dammit.original_encoding)
 
+    def test_fix_embedded_windows_1252(self):
+        # Here's a UTF8 document.
+        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+
+        # Here's a Windows-1252 document.
+        windows_1252 = (
+            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+
+        # Through some unholy alchemy, they've been stuck together.
+        doc = utf8 + windows_1252 + utf8
+
+        # The document can't be turned into UTF-8:
+        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
+
+        # Unicode, Dammit thinks the whole document is Windows-1252,
+        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"
+
+        # But if we run it through fix_embedded_windows_1252, it's fixed:
+
+        fixed = UnicodeDammit.fix_embedded_windows_1252(doc)
+        self.assertEqual(
+            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+
+    def test_fix_embedded_windows_1252_ignores_multibyte_characters(self):
+        # Each of these characters has a UTF-8 representation ending
+        # in \x93. \x93 is a smart quote if interpreted as
+        # Windows-1252. But our code knows to skip over multibyte
+        # UTF-8 characters, so they'll survive the process unscathed.
+        for tricky_unicode_char in (
+            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+            ):
+            input = tricky_unicode_char.encode("utf8")
+            self.assertTrue(input.endswith(b'\x93'))
+            output = UnicodeDammit.fix_embedded_windows_1252(input)
+            self.assertEqual(output, input)
 
 class TestNamedspacedAttribute(SoupTest):
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-27 18:54:22 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-27 18:54:22 -0400
commit	ccc48d4a84eaa0f35a982f297d5f68b114745832 (patch)
tree	2303eec4589bc34ac940a53d6abca54399eb6def /bs4/tests/test_soup.py
parent	561a1b479bc17bdd28b11dffa0085de8b2508444 (diff)