summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/dammit.py 15
-rw-r--r-- bs4/tests/test_soup.py 5
3 files changed, 17 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 259771e..d8819e2 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,6 +7,9 @@
* Fixed a crash when a short input contains data not valid in
filenames. [bug=1232604]
+* Fixed a bug that caused Unicode data put into UnicodeDammit to
+ return None instead of the original data. [bug=1214983]
+
* Combined two tests to stop a spurious test failure when tests are
run by nosetests. [bug=1212445]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index c859066..59640b7 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
-This class forces XML data into a standard format (usually to UTF-8 or
-Unicode). It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's the tree builder's job.
+This library converts a bytestream to Unicode through any means
+necessary. It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
+XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
import codecs
@@ -339,12 +339,15 @@ class UnicodeDammit:
self.detector = EncodingDetector(markup, override_encodings, is_html)
- # Is the data in Unicode to begin with?
+ # Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
+ self.original_encoding = None
+ return
- # As a first step, the encoding detector may strip a byte-order mark.
+ # The encoding detector may have stripped a byte-order mark.
+ # Use the stripped markup from this point on.
self.markup = self.detector.markup
u = None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b0247fe..47ac245 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -241,6 +241,11 @@ class TestEncodingConversion(SoupTest):
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
+ def test_unicode_input(self):
+ markup = u"I'm already Unicode! \N{SNOWMAN}"
+ dammit = UnicodeDammit(markup)
+ self.assertEqual(dammit.unicode_markup, markup)
+
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)