summaryrefslogtreecommitdiff
path: root/bs4/tests/test_soup.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2012-04-27 18:54:22 -0400
committerLeonard Richardson <leonard.richardson@canonical.com>2012-04-27 18:54:22 -0400
commitccc48d4a84eaa0f35a982f297d5f68b114745832 (patch)
tree2303eec4589bc34ac940a53d6abca54399eb6def /bs4/tests/test_soup.py
parent561a1b479bc17bdd28b11dffa0085de8b2508444 (diff)
Added experimental support for fixing Windows-1252 characters embedded in UTF-8 documents.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r--bs4/tests/test_soup.py38
1 files changed, 38 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index bb97e52..ef58521 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -286,6 +286,44 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
+ def test_fix_embedded_windows_1252(self):
+ # Here's a UTF8 document.
+ utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+
+ # Here's a Windows-1252 document.
+ windows_1252 = (
+ u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+ u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+
+ # Through some unholy alchemy, they've been stuck together.
+ doc = utf8 + windows_1252 + utf8
+
+ # The document can't be turned into UTF-8:
+ self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
+
+ # Unicode, Dammit thinks the whole document is Windows-1252,
+ # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
+
+ # But if we run it through fix_embedded_windows_1252, it's fixed:
+
+ fixed = UnicodeDammit.fix_embedded_windows_1252(doc)
+ self.assertEqual(
+ u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+
+ def test_fix_embedded_windows_1252_ignores_multibyte_characters(self):
+ # Each of these characters has a UTF-8 representation ending
+ # in \x93. \x93 is a smart quote if interpreted as
+ # Windows-1252. But our code knows to skip over multibyte
+ # UTF-8 characters, so they'll survive the process unscathed.
+ for tricky_unicode_char in (
+ u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+ u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+ u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+ ):
+ input = tricky_unicode_char.encode("utf8")
+ self.assertTrue(input.endswith(b'\x93'))
+ output = UnicodeDammit.fix_embedded_windows_1252(input)
+ self.assertEqual(output, input)
class TestNamedspacedAttribute(SoupTest):