summary | refs | log | tree | commit | diff
path: root/bs4/tests/test_soup.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 22:19:37 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 22:19:37 -0400
commit: 4a9444ac0b74fbf84cf86b9fcf6055c85e65f62a (patch)
tree: 570cbcb2c9ab9cf458edee87490afeffd8377560 /bs4/tests/test_soup.py
parent: 11dad27424b319a2034f59f5a7f48286551102d0 (diff)
parent: 4f9a654766df9ddd05e3ef274b4715b42668724f (diff)
Merged in big encoding-detection refactoring branch.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- bs4/tests/test_soup.py 36
1 files changed, 24 insertions, 12 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b127716..0b69318 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -15,7 +15,10 @@ from bs4.element import (
NamespacedAttribute,
)
import bs4.dammit
-from bs4.dammit import EntitySubstitution, UnicodeDammit
+from bs4.dammit import (
+ EntitySubstitution,
+ UnicodeDammit,
+)
from bs4.testing import (
SoupTest,
skipIf,
@@ -156,13 +159,23 @@ class TestEncodingConversion(SoupTest):
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
- # attribute is set.
- ascii = b"<foo>a</foo>"
- soup_from_ascii = self.soup(ascii)
- unicode_output = soup_from_ascii.decode()
- self.assertTrue(isinstance(unicode_output, unicode))
- self.assertEqual(unicode_output, self.document_for(ascii.decode()))
- self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii")
+ # attribute is set to 'utf-8', a superset of ASCII.
+ chardet = bs4.dammit.chardet_dammit
+ logging.disable(logging.WARNING)
+ try:
+ def noop(str):
+ return None
+ # Disable chardet, which will realize that the ASCII is ASCII.
+ bs4.dammit.chardet_dammit = noop
+ ascii = b"<foo>a</foo>"
+ soup_from_ascii = self.soup(ascii)
+ unicode_output = soup_from_ascii.decode()
+ self.assertTrue(isinstance(unicode_output, unicode))
+ self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+ self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+ finally:
+ logging.disable(logging.NOTSET)
+ bs4.dammit.chardet_dammit = chardet
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
@@ -192,7 +205,7 @@ class TestEncodingConversion(SoupTest):
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
- """Standalone tests of Unicode, Dammit."""
+ """Standalone tests of UnicodeDammit."""
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -293,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase):
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
- def test_sniffed_xml_encoding(self):
- # A document written in UTF-16LE will be converted by a different
- # code path that sniffs the byte order markers.
+ def test_byte_order_mark_removed(self):
+ # A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)