From e0bbee776ca241d908af36e4e5ce0d0b1bedceaf Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Wed, 15 Feb 2023 20:37:18 -0500
Subject: When the html.parser parser decides it can't parse a document,
 Beautiful Soup now consistently propagates this fact by raising a
 ParserRejectedMarkup error. [bug=2007343]

---
 bs4/builder/_htmlparser.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'bs4/builder/_htmlparser.py')

diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e48b6a0..e065096 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
     DetectsXMLParsedAsHTML,
+    ParserRejectedMarkup,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
 
         self._initialize_xml_detector()
 
+    def error(self, message):
+        # NOTE: This method is required so long as Python 3.9 is
+        # supported. The corresponding code was removed from HTMLParser
+        # in 3.5, but not removed from ParserBase until 3.10.
+        # https://github.com/python/cpython/issues/76025
+        #
+        # The original implementation turned the error into a warning,
+        # but in every case I discovered, this made HTMLParser
+        # immediately crash with an error message that was less
+        # helpful than the warning. The new implementation makes it
+        # clearer that html.parser just can't parse this
+        # markup. The 3.10 implementation does the same, though it
+        # raises AssertionError rather than calling a method. (We
+        # catch that AssertionError and wrap it in ParserRejectedMarkup.)
+        raise ParserRejectedMarkup(message)
+
     def handle_startendtag(self, name, attrs):
         """Handle an incoming empty-element tag.
 
@@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
-        parser.feed(markup)
+        try:
+            parser.feed(markup)
+        except AssertionError as e:
+            # html.parser raises AssertionError in rare cases to
+            # indicate a fatal problem with the markup, especially
+            # when there's an error in the doctype declaration.
+            raise ParserRejectedMarkup(e)
         parser.close()
         parser.already_closed_empty_element = []
-- 
cgit v1.2.3