summary refs log tree commit diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/dammit.py 2
-rw-r--r-- bs4/tests/test_soup.py 8
2 files changed, 9 insertions, 1 deletion
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 59640b7..68ed81f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -306,7 +306,7 @@ class EncodingDetector:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii')
+ 'ascii', 'replace')
if declared_encoding:
return declared_encoding.lower()
return None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b74a246..e2e2c30 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
+from pdb import set_trace
import logging
import unittest
import sys
@@ -20,6 +21,7 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
+ EncodingDetector,
)
from bs4.testing import (
SoupTest,
@@ -320,6 +322,12 @@ class TestUnicodeDammit(unittest.TestCase):
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+ def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+ detected = EncodingDetector(
+ b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+ encodings = list(detected.encodings)
+ assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+
def test_detect_html5_style_meta_tag(self):
for data in (