Avoid a crash when trying to detect the declared encoding of a

Unicode document. Raise an explanatory exception when the underlying parser completely rejects the incoming markup. [bug=1838877]
author: Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
commit: ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree: bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/dammit.py
parent: cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
1 files changed, 29 insertions, 8 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 08109f2..74fa7f0 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -22,6 +22,8 @@ try:
     #  PyPI package: cchardet
     import cchardet
     def chardet_dammit(s):
+        if isinstance(s, unicode):
+            return None
         return cchardet.detect(s)['encoding']
 except ImportError:
     try:
@@ -30,6 +32,8 @@ except ImportError:
         #  PyPI package: chardet
         import chardet
         def chardet_dammit(s):
+            if isinstance(s, unicode):
+                return None
             return chardet.detect(s)['encoding']
         #import chardet.constants
         #chardet.constants._debug = 1
@@ -44,10 +48,19 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile(
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+    'html' : re.compile(html_meta.encode("ascii"), re.I),
+    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[unicode] = {
+    'html' : re.compile(html_meta, re.I),
+    'xml' : re.compile(xml_encoding, re.I)
+}
 
 class EntitySubstitution(object):
 
@@ -319,14 +332,22 @@ class EncodingDetector:
             xml_endpos = 1024
             html_endpos = max(2048, int(len(markup) * 0.05))
 
+        if isinstance(markup, bytes):
+            res = encoding_res[bytes]
+        else:
+            res = encoding_res[unicode]
+
+        xml_re = res['xml']
+        html_re = res['html']
         declared_encoding = None
-        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
         if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
         if declared_encoding_match is not None:
-            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii', 'replace')
+            declared_encoding = declared_encoding_match.groups()[0]
         if declared_encoding:
+            if isinstance(declared_encoding, bytes):
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
             return declared_encoding.lower()
         return None
author	Leonard Richardson <leonardr@segfault.org>	2019-09-02 13:01:06 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2019-09-02 13:01:06 -0400
commit	ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree	bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/dammit.py
parent	cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)