From 4aff2ee4d6f077e06159c92ab05c0f2ea527c6fa Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 9 Feb 2012 16:15:56 -0500 Subject: As a last-ditch attempt to turn data into Unicode, use errors=replace instead of errors=strict. --- bs4/tests/test_soup.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'bs4/tests/test_soup.py') diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index ddfc68c..d744694 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -2,6 +2,7 @@ """Tests of Beautiful Soup as a whole.""" import unittest +from bs4 import BeautifulSoup from bs4.element import SoupStrainer from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.testing import SoupTest @@ -162,3 +163,23 @@ class TestUnicodeDammit(unittest.TestCase): dammit = UnicodeDammit(data, is_html=True) self.assertEquals( "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (encoded with some other + # encoding). + # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually decode the document as UTF-8 + # and replace the incompatible bytes with REPLACEMENT + # CHARACTER. + + doc = b"""\357\273\277 +\330\250\330\252\330\261 +\310\322\321\220\312\321\355\344""" + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue(u"\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc) + self.assertTrue(soup.contains_replacement_characters) -- cgit v1.2.3