diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 22:19:37 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 22:19:37 -0400 |
commit | 4a9444ac0b74fbf84cf86b9fcf6055c85e65f62a (patch) | |
tree | 570cbcb2c9ab9cf458edee87490afeffd8377560 /bs4/tests/test_soup.py | |
parent | 11dad27424b319a2034f59f5a7f48286551102d0 (diff) | |
parent | 4f9a654766df9ddd05e3ef274b4715b42668724f (diff) |
Merged in big encoding-detection refactoring branch.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- | bs4/tests/test_soup.py | 36 |
1 files changed, 24 insertions, 12 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b127716..0b69318 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -15,7 +15,10 @@ from bs4.element import ( NamespacedAttribute, ) import bs4.dammit -from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.dammit import ( + EntitySubstitution, + UnicodeDammit, +) from bs4.testing import ( SoupTest, skipIf, @@ -156,13 +159,23 @@ class TestEncodingConversion(SoupTest): def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding - # attribute is set. - ascii = b"<foo>a</foo>" - soup_from_ascii = self.soup(ascii) - unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) - self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") + # attribute is set to 'utf-8', a superset of ASCII. + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + # Disable chardet, which will realize that the ASCII is ASCII. + bs4.dammit.chardet_dammit = noop + ascii = b"<foo>a</foo>" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute @@ -192,7 +205,7 @@ class TestEncodingConversion(SoupTest): self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of Unicode, Dammit.""" + """Standalone tests of UnicodeDammit.""" def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" @@ -293,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase): logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet - def test_sniffed_xml_encoding(self): - # A document written in UTF-16LE will be converted by a different - # code path that sniffs the byte order markers. + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) |