diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/dammit.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 8 |
2 files changed, 9 insertions, 1 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 59640b7..68ed81f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -306,7 +306,7 @@ class EncodingDetector:
             declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
         if declared_encoding_match is not None:
             declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii')
+                'ascii', 'replace')
         if declared_encoding:
             return declared_encoding.lower()
         return None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b74a246..e2e2c30 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Tests of Beautiful Soup as a whole."""
 
+from pdb import set_trace
 import logging
 import unittest
 import sys
@@ -20,6 +21,7 @@ import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
     UnicodeDammit,
+    EncodingDetector,
 )
 from bs4.testing import (
     SoupTest,
@@ -320,6 +322,12 @@ class TestUnicodeDammit(unittest.TestCase):
         dammit = UnicodeDammit(utf8_data, [bad_encoding])
         self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
 
+    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+        detected = EncodingDetector(
+            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+        encodings = list(detected.encodings)
+        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+
     def test_detect_html5_style_meta_tag(self):
 
         for data in (