When the html.parser parser decides it can't parse a document, Beautiful Soup
now consistently propagates this fact by raising a ParserRejectedMarkup error. [bug=2007343]
author: Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
commit: e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree: d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/builder/_htmlparser.py
parent: 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)
1 file changed, 24 insertions, 1 deletion
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e48b6a0..e065096 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
     DetectsXMLParsedAsHTML,
+    ParserRejectedMarkup,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
 
         self._initialize_xml_detector()
 
+    def error(self, message):
+        # NOTE: This method is required so long as Python 3.9 is
+        # supported. The corresponding code was removed from HTMLParser
+        # in 3.5, but not removed from ParserBase until 3.10.
+        # https://github.com/python/cpython/issues/76025
+        #
+        # The original implementation turned the error into a warning,
+        # but in every case I discovered, this made HTMLParser
+        # immediately crash with an error message that was less
+        # helpful than the warning. The new implementation makes it
+        # more clear that html.parser just can't parse this
+        # markup. The 3.10 implementation does the same, though it
+        # raises AssertionError rather than calling a method. (We
+        # catch this error and wrap it in a ParserRejectedMarkup.)
+        raise ParserRejectedMarkup(message)
+
     def handle_startendtag(self, name, attrs):
         """Handle an incoming empty-element tag.
 
@@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
-        parser.feed(markup)
+        try:
+            parser.feed(markup)
+        except AssertionError as e:
+            # html.parser raises AssertionError in rare cases to
+            # indicate a fatal problem with the markup, especially
+            # when there's an error in the doctype declaration.
+            raise ParserRejectedMarkup(e)
         parser.close()
         parser.already_closed_empty_element = []
author	Leonard Richardson <leonardr@segfault.org>	2023-02-15 20:37:18 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2023-02-15 20:37:18 -0500
commit	e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree	d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/builder/_htmlparser.py
parent	8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)