diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 08:10:44 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 08:10:44 -0500 |
commit | a4ffd587fad7d2442a9ccdcdd0a8f6df347f39eb (patch) | |
tree | 268a3c4fb76c48bacf1cd9173bfd34938795860a /bs4/tests/test_soup.py | |
parent | 60cb51632dce022d1a4aff18500d286e58e0bd5c (diff) | |
parent | 0aa065cb9ef6ae76640b6b0e7d43d687d9db39cd (diff) |
Removed unit tests that test different parsers' behavior on invalid markup, and replaced them with informative comparisons generated by demonstrate_parser_differences.py.
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- | bs4/tests/test_soup.py | 45 |
1 files changed, 45 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 896e914..2b7c003 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -103,6 +103,51 @@ class TestEntitySubstitution(unittest.TestCase): text = 'Bob\'s "bar"' self.assertEqual(self.sub.substitute_html(text), text) + +class TestEncodingConversion(SoupTest): + # Test Beautiful Soup's ability to decode and encode from various + # encodings. + + def setUp(self): + super(TestEncodingConversion, self).setUp() + self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>" + self.utf8_data = self.unicode_data.encode("utf-8") + # Just so you know what it looks like. + self.assertEqual( + self.utf8_data, + b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>") + + def test_ascii_in_unicode_out(self): + # ASCII input is converted to Unicode. The original_encoding + # attribute is set. + ascii = b"<foo>a</foo>" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding, "ascii") + + def test_unicode_in_unicode_out(self): + # Unicode input is left alone. The original_encoding attribute + # is not set. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.decode(), self.unicode_data) + self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.original_encoding, None) + + def test_utf8_in_unicode_out(self): + # UTF-8 input is converted to Unicode. The original_encoding + # attribute is set. + soup_from_utf8 = self.soup(self.utf8_data) + self.assertEqual(soup_from_utf8.decode(), self.unicode_data) + self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + + def test_utf8_out(self): + # The internal data structures can be encoded as UTF-8. 
+ soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) + + class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" |