diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/dammit.py | 15 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 5 |
3 files changed, 17 insertions, 6 deletions
@@ -7,6 +7,9 @@ * Fixed a crash when a short input contains data not valid in filenames. [bug=1232604] +* Fixed a bug that caused Unicode data put into UnicodeDammit to + return None instead of the original data. [bug=1214983] + * Combined two tests to stop a spurious test failure when tests are run by nosetests. [bug=1212445] diff --git a/bs4/dammit.py b/bs4/dammit.py index c859066..59640b7 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- """Beautiful Soup bonus library: Unicode, Dammit -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. """ import codecs @@ -339,12 +339,15 @@ class UnicodeDammit: self.detector = EncodingDetector(markup, override_encodings, is_html) - # Is the data in Unicode to begin with? + # Short-circuit if the data is in Unicode to begin with. if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) + self.original_encoding = None + return - # As a first step, the encoding detector may strip a byte-order mark. + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. 
self.markup = self.detector.markup u = None diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b0247fe..47ac245 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -241,6 +241,11 @@ class TestEncodingConversion(SoupTest): class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" + def test_unicode_input(self): + markup = u"I'm already Unicode! \N{SNOWMAN}" + dammit = UnicodeDammit(markup) + self.assertEqual(dammit.unicode_markup, markup) + def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) |