Fixed a bug in decoding data that contained a byte-order mark, such as data encoded in UTF-16LE. [bug=988980]

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-26 12:39:44 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-26 12:39:44 -0400
commit: 12f37383078c18a37968a8446961eff7a4e77e75 (patch)
tree: b1479338383f51fa0c32ba250ebdba4c261893d1 /bs4
parent: c244fa5be5185a23addb98da68f937fd4be6f582 (diff)
2 files changed, 36 insertions, 20 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 824c4c0..ec62b99 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -187,16 +187,24 @@ class UnicodeDammit:
             self.original_encoding = None
             return
 
-        self.markup, document_encoding, sniffed_encoding = \
-                     self._detectEncoding(markup, is_html)
+        new_markup, document_encoding, sniffed_encoding = \
+            self._detectEncoding(markup, is_html)
+        self.markup = new_markup
 
         u = None
-        for proposed_encoding in (
-            override_encodings + [document_encoding, sniffed_encoding]):
-            if proposed_encoding is not None:
-                u = self._convert_from(proposed_encoding)
-                if u:
-                    break
+        if new_markup != markup:
+            # _detectEncoding modified the markup, then converted it to
+            # Unicode and then to UTF-8. So convert it from UTF-8.
+            u = self._convert_from("utf8")
+            self.original_encoding = sniffed_encoding
+
+        if not u:
+            for proposed_encoding in (
+                override_encodings + [document_encoding, sniffed_encoding]):
+                if proposed_encoding is not None:
+                    u = self._convert_from(proposed_encoding)
+                    if u:
+                        break
 
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
@@ -305,44 +313,44 @@ class UnicodeDammit:
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
-            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                 # EBCDIC
                 xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                 # UTF-16BE
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
-                     and (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
+                     and (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16BE with BOM
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                 # UTF-16LE
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
-                     (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
+                     (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16LE with BOM
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\x00\x3c':
+            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                 # UTF-32BE
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x00\x00':
+            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                 # UTF-32LE
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\xfe\xff':
+            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                 # UTF-32BE with BOM
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\xff\xfe\x00\x00':
+            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                 # UTF-32LE with BOM
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == '\xef\xbb\xbf':
+            elif xml_data[:3] == b'\xef\xbb\xbf':
                 # UTF-8 with BOM
                 sniffed_xml_encoding = 'utf-8'
                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 94f325e..bb97e52 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -278,6 +278,14 @@ class TestUnicodeDammit(unittest.TestCase):
         finally:
             bs4.dammit.chardet = chardet
 
+    def test_sniffed_xml_encoding(self):
+        # A document written in UTF-16LE will be converted by a different
+        # code path that sniffs the byte order markers.
+        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+        dammit = UnicodeDammit(data)
+        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("utf-16le", dammit.original_encoding)
+
 
 class TestNamedspacedAttribute(SoupTest):
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-26 12:39:44 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-26 12:39:44 -0400
commit: 12f37383078c18a37968a8446961eff7a4e77e75 (patch)
tree: b1479338383f51fa0c32ba250ebdba4c261893d1 /bs4
parent: c244fa5be5185a23addb98da68f937fd4be6f582 (diff)