author     Leonard Richardson <leonardr@segfault.org>   2019-09-02 13:01:06 -0400
committer  Leonard Richardson <leonardr@segfault.org>   2019-09-02 13:01:06 -0400
commit     ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree       bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa
parent     cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser
completely rejects the incoming markup. [bug=1838877]
-rw-r--r--   CHANGELOG                |  6
-rw-r--r--   bs4/__init__.py          | 12
-rw-r--r--   bs4/builder/__init__.py  | 15
-rw-r--r--   bs4/builder/_lxml.py     |  4
-rw-r--r--   bs4/dammit.py            | 37
-rw-r--r--   bs4/tests/test_soup.py   | 88

6 files changed, 144 insertions, 18 deletions
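For context on the first fix: when every attempt to prepare and feed the markup fails, BeautifulSoup now raises ParserRejectedMarkup carrying the original parser error(s), instead of failing silently. Below is a rough, hypothetical sketch of the user-visible effect; the helper name parse_or_explain and the choice of the lxml-xml parser are illustrative assumptions, not part of this commit.

    from bs4 import BeautifulSoup
    from bs4.builder import ParserRejectedMarkup

    def parse_or_explain(markup, features="lxml-xml"):
        # After this commit, markup that every prepare_markup() attempt
        # fails to feed raises ParserRejectedMarkup instead of quietly
        # producing an unusable soup.
        try:
            return BeautifulSoup(markup, features)
        except ParserRejectedMarkup as e:
            # The exception text explains the failure and appends the
            # original exception(s) raised by the underlying parser.
            print(e)
            return None

Ordinary markup parses as before; only input that the chosen parser rejects outright takes the except branch. Catching bs4.builder.ParserRejectedMarkup is the condition exercised by the new test_parser_markup_rejection test in the diff below.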
diff --git a/CHANGELOG b/CHANGELOG
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -20,6 +20,12 @@
   or string ('string_class') encountered during parsing, rather than
   using the default Tag and NavigableString objects.
 
+* Raise an explanatory exception when the underlying parser
+  completely rejects the incoming markup. [bug=1838877]
+
+* Avoid a crash when trying to detect the declared encoding of a
+  Unicode document. [bug=1838877]
+
 = 4.8.0 (20190720, "One Small Soup")
 
 This release focuses on making it easier to customize Beautiful Soup's
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e27ca6f..e85a0bf 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -302,6 +302,8 @@ class BeautifulSoup(Tag):
                     ' Beautiful Soup.' % markup)
         self._check_markup_is_url(markup)
 
+        rejections = []
+        success = False
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
              self.builder.prepare_markup(
@@ -309,10 +311,18 @@ class BeautifulSoup(Tag):
             self.reset()
             try:
                 self._feed()
+                success = True
                 break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                 pass
 
+        if not success:
+            other_exceptions = [unicode(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
         # Clear out the markup and remove the builder's circular
         # reference to this object.
         self.markup = None
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e28242b..7efbf89 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -175,8 +175,8 @@ class TreeBuilder(object):
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        return markup, None, None, False
+                       document_declared_encoding=None, exclude_encodings=None):
+        yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
         """Wrap an HTML fragment to make it look like a document.
@@ -363,8 +363,15 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
-    pass
-
+    def __init__(self, message_or_exception):
+        """Explain why the parser rejected the given markup, either
+        with a textual explanation or another exception.
+        """
+        if isinstance(message_or_exception, Exception):
+            e = message_or_exception
+            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+        super(ParserRejectedMarkup, self).__init__(message_or_exception)
+
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 85be1b5..ea66d8b 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -175,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.parser.feed(data)
             self.parser.close()
         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
 
     def close(self):
         self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@@ -294,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser.feed(markup)
             self.parser.close()
         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
 
     def test_fragment_to_document(self, fragment):
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 08109f2..74fa7f0 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -22,6 +22,8 @@ try:
     # PyPI package: cchardet
     import cchardet
     def chardet_dammit(s):
+        if isinstance(s, unicode):
+            return None
         return cchardet.detect(s)['encoding']
 except ImportError:
     try:
@@ -30,6 +32,8 @@ except ImportError:
         # PyPI package: chardet
         import chardet
         def chardet_dammit(s):
+            if isinstance(s, unicode):
+                return None
             return chardet.detect(s)['encoding']
         #import chardet.constants
         #chardet.constants._debug = 1
@@ -44,10 +48,19 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile(
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+    'html' : re.compile(html_meta.encode("ascii"), re.I),
+    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[unicode] = {
+    'html' : re.compile(html_meta, re.I),
+    'xml' : re.compile(xml_encoding, re.I)
+}
 
 
 class EntitySubstitution(object):
@@ -319,14 +332,22 @@ class EncodingDetector:
             xml_endpos = 1024
             html_endpos = max(2048, int(len(markup) * 0.05))
 
+        if isinstance(markup, bytes):
+            res = encoding_res[bytes]
+        else:
+            res = encoding_res[unicode]
+
+        xml_re = res['xml']
+        html_re = res['html']
         declared_encoding = None
-        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
         if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
         if declared_encoding_match is not None:
-            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii', 'replace')
+            declared_encoding = declared_encoding_match.groups()[0]
         if declared_encoding:
+            if isinstance(declared_encoding, bytes):
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
             return declared_encoding.lower()
         return None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index af5f791..3603e81 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -11,6 +11,10 @@ from bs4 import (
     BeautifulSoup,
     BeautifulStoneSoup,
 )
+from bs4.builder import (
+    TreeBuilder,
+    ParserRejectedMarkup,
+)
 from bs4.element import (
     CharsetMetaAttributeValue,
     Comment,
@@ -20,6 +24,7 @@ from bs4.element import (
     Tag,
     NavigableString,
 )
+
 import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
@@ -36,7 +41,7 @@ import warnings
 try:
     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
     LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
 
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -65,10 +70,20 @@ class TestConstructor(SoupTest):
             def __init__(self, **kwargs):
                 self.called_with = kwargs
                 self.is_xml = True
+                self.store_line_numbers = False
+                self.cdata_list_attributes = []
+                self.preserve_whitespace_tags = []
             def initialize_soup(self, soup):
                 pass
+            def feed(self, markup):
+                self.fed = markup
+            def reset(self):
+                pass
+            def ignore(self, ignore):
+                pass
+            set_up_substitutions = can_be_empty_element = ignore
             def prepare_markup(self, *args, **kwargs):
-                return ''
+                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
 
         kwargs = dict(
             var="value",
@@ -80,7 +95,8 @@ class TestConstructor(SoupTest):
         soup = BeautifulSoup('', builder=Mock, **kwargs)
         assert isinstance(soup.builder, Mock)
         self.assertEqual(dict(var="value"), soup.builder.called_with)
-
+        self.assertEqual("prepared markup", soup.builder.fed)
+
         # You can also instantiate the TreeBuilder yourself. In this
         # case, that specific object is used and any keyword arguments
         # to the BeautifulSoup constructor are ignored.
@@ -94,6 +110,26 @@ class TestConstructor(SoupTest):
         self.assertEqual(builder, soup.builder)
         self.assertEqual(kwargs, builder.called_with)
 
+    def test_parser_markup_rejection(self):
+        # If markup is completely rejected by the parser, an
+        # explanatory ParserRejectedMarkup exception is raised.
+        class Mock(TreeBuilder):
+            def feed(self, *args, **kwargs):
+                raise ParserRejectedMarkup("Nope.")
+
+            def prepare_markup(self, *args, **kwargs):
+                # We're going to try two different ways of preparing this markup,
+                # but feed() will reject both of them.
+                yield markup, None, None, False
+                yield markup, None, None, False
+
+        import re
+        self.assertRaisesRegexp(
+            ParserRejectedMarkup,
+            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
+            BeautifulSoup, '', builder=Mock,
+        )
+
     def test_cdata_list_attributes(self):
         # Most attribute values are represented as scalars, but the
         # HTML standard says that some attributes, like 'class' have
@@ -554,6 +590,52 @@ class TestUnicodeDammit(unittest.TestCase):
         output = UnicodeDammit.detwingle(input)
         self.assertEqual(output, input)
 
+    def test_find_declared_encoding(self):
+        # Test our ability to find a declared encoding inside an
+        # XML or HTML document.
+        #
+        # Even if the document comes in as Unicode, it may be
+        # interesting to know what encoding was claimed
+        # originally.
+
+        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
+        html_bytes = html_unicode.encode("ascii")
+
+        xml_unicode= u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
+        xml_bytes = xml_unicode.encode("ascii")
+
+        m = EncodingDetector.find_declared_encoding
+        self.assertEquals(None, m(html_unicode, is_html=False))
+        self.assertEquals("utf-8", m(html_unicode, is_html=True))
+        self.assertEquals("utf-8", m(html_bytes, is_html=True))
+
+        self.assertEquals("iso-8859-1", m(xml_unicode))
+        self.assertEquals("iso-8859-1", m(xml_bytes))
+
+        # Normally, only the first few kilobytes of a document are checked for
+        # an encoding.
+        spacer = b' ' * 5000
+        self.assertEquals(None, m(spacer + html_bytes))
+        self.assertEquals(None, m(spacer + xml_bytes))
+
+        # But you can tell find_declared_encoding to search an entire
+        # HTML document.
+        self.assertEquals(
+            "utf-8",
+            m(spacer + html_bytes, is_html=True, search_entire_document=True)
+        )
+
+        # The XML encoding declaration has to be the very first thing
+        # in the document. We'll allow whitespace before the document
+        # starts, but nothing else.
+        self.assertEquals(
+            "iso-8859-1",
+            m(xml_bytes, search_entire_document=True)
+        )
+        self.assertEquals(
+            None, m(b'a' + xml_bytes, search_entire_document=True)
+        )
+
 class TestNamedspacedAttribute(SoupTest):
 
     def test_name_may_be_none(self):
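As a usage note on the second fix, here is a minimal sketch that mirrors the new test_find_declared_encoding test above: EncodingDetector.find_declared_encoding no longer crashes when handed an already-decoded Unicode document, and still reports whatever encoding the document claims. The sample documents are taken from that test; the printed values are assumptions based on its assertions.

    from bs4.dammit import EncodingDetector

    # A Unicode document used to trip the bytes-only regular expressions;
    # with this commit the declared encoding (or None) is returned instead.
    xml = u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
    html = u'<html><head><meta charset="utf-8"></head></html>'

    print(EncodingDetector.find_declared_encoding(xml))                 # iso-8859-1
    print(EncodingDetector.find_declared_encoding(html, is_html=True))  # utf-8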