diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
commit | feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch) | |
tree | 6dce892919c201b629628647f86843382b29a60a /bs4/tests/test_soup.py | |
parent | d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff) |
Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of
an encoding that you know is wrong. [bug=1469408]
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- | bs4/tests/test_soup.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index e2e2c30..3643aed 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -50,6 +50,11 @@ class TestConstructor(SoupTest): soup = self.soup(data) self.assertEqual(u"foo\0bar", soup.h1.string) + def test_exclude_encodings(self): + utf8_data = u"Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + class TestWarnings(SoupTest): @@ -322,6 +327,20 @@ class TestUnicodeDammit(unittest.TestCase): dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + def test_exclude_encodings(self): + # This is UTF-8. + utf8_data = u"Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. + dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): detected = EncodingDetector( b'<?xml version="1.0" encoding="UTF-\xdb" ?>') |