From d69a8433c010ad5c790566bd4d4e47a1db81988c Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 2 Oct 2013 08:18:54 -0400 Subject: Fixed a bug that caused Unicode data put into UnicodeDammit to return None instead of the original data. [bug=1214983] --- bs4/dammit.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index c859066..59640b7 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- """Beautiful Soup bonus library: Unicode, Dammit -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. """ import codecs @@ -339,12 +339,15 @@ class UnicodeDammit: self.detector = EncodingDetector(markup, override_encodings, is_html) - # Is the data in Unicode to begin with? + # Short-circuit if the data is in Unicode to begin with. if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) + self.original_encoding = None + return - # As a first step, the encoding detector may strip a byte-order mark. + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. self.markup = self.detector.markup u = None -- cgit v1.2.3