summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
committer Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
commit ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/dammit.py
parent cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser completely rejects the incoming markup. [bug=1838877]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 37
1 files changed, 29 insertions, 8 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 08109f2..74fa7f0 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -22,6 +22,8 @@ try:
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
+ if isinstance(s, unicode):
+ return None
return cchardet.detect(s)['encoding']
except ImportError:
try:
@@ -30,6 +32,8 @@ except ImportError:
# PyPI package: chardet
import chardet
def chardet_dammit(s):
+ if isinstance(s, unicode):
+ return None
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
@@ -44,10 +48,19 @@ try:
except ImportError:
pass
-xml_encoding_re = re.compile(
- '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
- '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+ 'html' : re.compile(html_meta.encode("ascii"), re.I),
+ 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[unicode] = {
+ 'html' : re.compile(html_meta, re.I),
+ 'xml' : re.compile(xml_encoding, re.I)
+}
class EntitySubstitution(object):
@@ -319,14 +332,22 @@ class EncodingDetector:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
+ if isinstance(markup, bytes):
+ res = encoding_res[bytes]
+ else:
+ res = encoding_res[unicode]
+
+ xml_re = res['xml']
+ html_re = res['html']
declared_encoding = None
- declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
- declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ declared_encoding_match = html_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
- declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii', 'replace')
+ declared_encoding = declared_encoding_match.groups()[0]
if declared_encoding:
+ if isinstance(declared_encoding, bytes):
+ declared_encoding = declared_encoding.decode('ascii', 'replace')
return declared_encoding.lower()
return None