summaryrefslogtreecommitdiff
path: root/bs4/tests/test_soup.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2015-06-27 09:55:40 -0400
committerLeonard Richardson <leonardr@segfault.org>2015-06-27 09:55:40 -0400
commitfeffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch)
tree6dce892919c201b629628647f86843382b29a60a /bs4/tests/test_soup.py
parentd728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff)
Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of an encoding that you know is wrong. [bug=1469408]
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r--bs4/tests/test_soup.py19
1 files changed, 19 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index e2e2c30..3643aed 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -50,6 +50,11 @@ class TestConstructor(SoupTest):
soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string)
+ def test_exclude_encodings(self):
+ utf8_data = u"Räksmörgås".encode("utf-8")
+ soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+ self.assertEqual("windows-1252", soup.original_encoding)
+
class TestWarnings(SoupTest):
@@ -322,6 +327,20 @@ class TestUnicodeDammit(unittest.TestCase):
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+ def test_exclude_encodings(self):
+ # This is UTF-8.
+ utf8_data = u"Räksmörgås".encode("utf-8")
+
+ # But if we exclude UTF-8 from consideration, the guess is
+ # Windows-1252.
+ dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+ self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+ # And if we exclude that, there is no valid guess at all.
+ dammit = UnicodeDammit(
+ utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+ self.assertEqual(dammit.original_encoding, None)
+
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')