summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/dammit.py 2
-rw-r--r-- bs4/tests/test_soup.py 8
3 files changed, 12 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index e3c5938..ed2d89d 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -50,6 +50,9 @@
* Improved docstring for encode_contents() and
decode_contents(). [bug=1441543]
+* Fixed a crash in Unicode, Dammit's encoding detector when the name
+ of the encoding itself contained invalid bytes. [bug=1360913]
+
= 4.3.2 (20131002) =
* Fixed a bug in which short Unicode input was improperly encoded to
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 59640b7..68ed81f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -306,7 +306,7 @@ class EncodingDetector:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii')
+ 'ascii', 'replace')
if declared_encoding:
return declared_encoding.lower()
return None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b74a246..e2e2c30 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
+from pdb import set_trace
import logging
import unittest
import sys
@@ -20,6 +21,7 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
+ EncodingDetector,
)
from bs4.testing import (
SoupTest,
@@ -320,6 +322,12 @@ class TestUnicodeDammit(unittest.TestCase):
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+ def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+ detected = EncodingDetector(
+ b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+ encodings = list(detected.encodings)
+ assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+
def test_detect_html5_style_meta_tag(self):
for data in (