diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-26 12:39:44 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-26 12:39:44 -0400 |
commit | 12f37383078c18a37968a8446961eff7a4e77e75 (patch) | |
tree | b1479338383f51fa0c32ba250ebdba4c261893d1 /bs4 | |
parent | c244fa5be5185a23addb98da68f937fd4be6f582 (diff) |
Fixed a bug in decoding data that contained a byte-order mark, such as data encoded in UTF-16LE. [bug=988980]
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/dammit.py | 48 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 8 |
2 files changed, 36 insertions, 20 deletions
```diff
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 824c4c0..ec62b99 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -187,16 +187,24 @@ class UnicodeDammit:
             self.original_encoding = None
             return
 
-        self.markup, document_encoding, sniffed_encoding = \
-            self._detectEncoding(markup, is_html)
+        new_markup, document_encoding, sniffed_encoding = \
+            self._detectEncoding(markup, is_html)
+        self.markup = new_markup
 
         u = None
-        for proposed_encoding in (
-            override_encodings + [document_encoding, sniffed_encoding]):
-            if proposed_encoding is not None:
-                u = self._convert_from(proposed_encoding)
-                if u:
-                    break
+        if new_markup != markup:
+            # _detectEncoding modified the markup, then converted it to
+            # Unicode and then to UTF-8. So convert it from UTF-8.
+            u = self._convert_from("utf8")
+            self.original_encoding = sniffed_encoding
+
+        if not u:
+            for proposed_encoding in (
+                override_encodings + [document_encoding, sniffed_encoding]):
+                if proposed_encoding is not None:
+                    u = self._convert_from(proposed_encoding)
+                    if u:
+                        break
 
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
@@ -305,44 +313,44 @@ class UnicodeDammit:
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
-            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                 # EBCDIC
                 xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                 # UTF-16BE
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
-                     and (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
+                     and (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16BE with BOM
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                 # UTF-16LE
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
-                     (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
+                     (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16LE with BOM
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\x00\x3c':
+            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                 # UTF-32BE
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x00\x00':
+            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                 # UTF-32LE
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\xfe\xff':
+            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                 # UTF-32BE with BOM
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\xff\xfe\x00\x00':
+            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                 # UTF-32LE with BOM
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == '\xef\xbb\xbf':
+            elif xml_data[:3] == b'\xef\xbb\xbf':
                 # UTF-8 with BOM
                 sniffed_xml_encoding = 'utf-8'
                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 94f325e..bb97e52 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -278,6 +278,14 @@ class TestUnicodeDammit(unittest.TestCase):
         finally:
             bs4.dammit.chardet = chardet
 
+    def test_sniffed_xml_encoding(self):
+        # A document written in UTF-16LE will be converted by a different
+        # code path that sniffs the byte order markers.
+        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+        dammit = UnicodeDammit(data)
+        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("utf-16le", dammit.original_encoding)
+
 class TestNamedspacedAttribute(SoupTest):
```