diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/dammit.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 8 |
3 files changed, 12 insertions, 1 deletion
@@ -50,6 +50,9 @@ * Improved docstring for encode_contents() and decode_contents(). [bug=1441543] +* Fixed a crash in Unicode, Dammit's encoding detector when the name + of the encoding itself contained invalid bytes. [bug=1360913] + = 4.3.2 (20131002) = * Fixed a bug in which short Unicode input was improperly encoded to diff --git a/bs4/dammit.py b/bs4/dammit.py index 59640b7..68ed81f 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -306,7 +306,7 @@ class EncodingDetector: declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii') + 'ascii', 'replace') if declared_encoding: return declared_encoding.lower() return None diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b74a246..e2e2c30 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" +from pdb import set_trace import logging import unittest import sys @@ -20,6 +21,7 @@ import bs4.dammit from bs4.dammit import ( EntitySubstitution, UnicodeDammit, + EncodingDetector, ) from bs4.testing import ( SoupTest, @@ -320,6 +322,12 @@ class TestUnicodeDammit(unittest.TestCase): dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): + detected = EncodingDetector( + b'<?xml version="1.0" encoding="UTF-\xdb" ?>') + encodings = list(detected.encodings) + assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings + def test_detect_html5_style_meta_tag(self): for data in ( |