diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/dammit.py | 15 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 5 |
2 files changed, 14 insertions, 6 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index c859066..59640b7 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- """Beautiful Soup bonus library: Unicode, Dammit -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. """ import codecs @@ -339,12 +339,15 @@ class UnicodeDammit: self.detector = EncodingDetector(markup, override_encodings, is_html) - # Is the data in Unicode to begin with? + # Short-circuit if the data is in Unicode to begin with. if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) + self.original_encoding = None + return - # As a first step, the encoding detector may strip a byte-order mark. + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. self.markup = self.detector.markup u = None diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b0247fe..47ac245 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -241,6 +241,11 @@ class TestEncodingConversion(SoupTest): class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" + def test_unicode_input(self): + markup = u"I'm already Unicode! \N{SNOWMAN}" + dammit = UnicodeDammit(markup) + self.assertEqual(dammit.unicode_markup, markup) + def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) |