summaryrefslogtreecommitdiff
path: root/bs4/tests/test_soup.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 08:10:44 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 08:10:44 -0500
commit a4ffd587fad7d2442a9ccdcdd0a8f6df347f39eb (patch)
tree 268a3c4fb76c48bacf1cd9173bfd34938795860a /bs4/tests/test_soup.py
parent 60cb51632dce022d1a4aff18500d286e58e0bd5c (diff)
parent 0aa065cb9ef6ae76640b6b0e7d43d687d9db39cd (diff)
Removed unit tests that test different parsers' behavior on invalid markup, and replaced them with informative comparisons generated by demonstrate_parser_differences.py.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- bs4/tests/test_soup.py 45
1 files changed, 45 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 896e914..2b7c003 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -103,6 +103,51 @@ class TestEntitySubstitution(unittest.TestCase):
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
+
+class TestEncodingConversion(SoupTest):
+ # Test Beautiful Soup's ability to decode and encode from various
+ # encodings.
+
+ def setUp(self):
+ super(TestEncodingConversion, self).setUp()
+ self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+ self.utf8_data = self.unicode_data.encode("utf-8")
+ # Just so you know what it looks like.
+ self.assertEqual(
+ self.utf8_data,
+ b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+
+ def test_ascii_in_unicode_out(self):
+ # ASCII input is converted to Unicode. The original_encoding
+ # attribute is set.
+ ascii = b"<foo>a</foo>"
+ soup_from_ascii = self.soup(ascii)
+ unicode_output = soup_from_ascii.decode()
+ self.assertTrue(isinstance(unicode_output, unicode))
+ self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+ self.assertEqual(soup_from_ascii.original_encoding, "ascii")
+
+ def test_unicode_in_unicode_out(self):
+ # Unicode input is left alone. The original_encoding attribute
+ # is not set.
+ soup_from_unicode = self.soup(self.unicode_data)
+ self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
+ self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_unicode.original_encoding, None)
+
+ def test_utf8_in_unicode_out(self):
+ # UTF-8 input is converted to Unicode. The original_encoding
+ # attribute is set.
+ soup_from_utf8 = self.soup(self.utf8_data)
+ self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
+ self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
+
+ def test_utf8_out(self):
+ # The internal data structures can be encoded as UTF-8.
+ soup_from_unicode = self.soup(self.unicode_data)
+ self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
+
+
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""