summary | refs | log | tree | commit | diff
path: root/bs4/tests/test_soup.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 16:15:56 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 16:15:56 -0500
commit: 4aff2ee4d6f077e06159c92ab05c0f2ea527c6fa (patch)
tree: 40951a60046f184794a011a498187053e8ad2a92 /bs4/tests/test_soup.py
parent: caeb168dc47470607b3cd091e1d35db45c089385 (diff)
As a last-ditch attempt to turn data into Unicode, use errors=replace instead of errors=strict.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- bs4/tests/test_soup.py | 21
1 files changed, 21 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddfc68c..d744694 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -2,6 +2,7 @@
"""Tests of Beautiful Soup as a whole."""
import unittest
+from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import SoupTest
@@ -162,3 +163,23 @@ class TestUnicodeDammit(unittest.TestCase):
dammit = UnicodeDammit(data, is_html=True)
self.assertEquals(
"euc-jp", dammit.original_encoding)
+
+ def test_last_ditch_entity_replacement(self):
+ # This is a UTF-8 document that contains bytestrings
+ # completely incompatible with UTF-8 (encoded with some other
+ # encoding).
+ #
+ # Since there is no consistent encoding for the document,
+ # Unicode, Dammit will eventually decode the document as UTF-8
+ # and replace the incompatible bytes with U+FFFD REPLACEMENT
+ # CHARACTER.
+
+ doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
+<html><b>\330\250\330\252\330\261</b>
+<i>\310\322\321\220\312\321\355\344</i></html>"""
+ dammit = UnicodeDammit(doc)
+ self.assertEqual(True, dammit.contains_replacement_characters)
+ self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+ soup = BeautifulSoup(doc)
+ self.assertTrue(soup.contains_replacement_characters)