4 files changed, 57 insertions, 32 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d213459..4960e1e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -29,6 +29,10 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
 
   [bug=2003677]
 
+* When the html.parser parser decides it can't parse a document, Beautiful
+  Soup now consistently propagates this fact by raising a
+  ParserRejectedMarkup error. [bug=2007343]
+
 = 4.11.2 (20230131)
 
 * Fixed test failures caused by nondeterministic behavior of
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e48b6a0..e065096 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
     DetectsXMLParsedAsHTML,
+    ParserRejectedMarkup,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
 
         self._initialize_xml_detector()
 
+    def error(self, message):
+        """Reject the markup by raising ParserRejectedMarkup(message)."""
+        # NOTE: This method is required so long as Python 3.9 is
+        # supported. The corresponding code was removed from HTMLParser
+        # in 3.5, but not removed from ParserBase until 3.10.
+        # https://github.com/python/cpython/issues/76025
+        #
+        # The original implementation turned the error into a warning,
+        # but in every case I discovered, this made HTMLParser crash
+        # immediately with an error message less helpful than the
+        # warning. Raising makes it clearer that html.parser just can't
+        # parse this markup. The 3.10 implementation does the same, but
+        # raises AssertionError rather than calling a method. (We
+        # catch that error and wrap it in a ParserRejectedMarkup.)
+        raise ParserRejectedMarkup(message)
+
     def handle_startendtag(self, name, attrs):
         """Handle an incoming empty-element tag.
 
@@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
-        parser.feed(markup)
-        parser.close()
+        try:
+            parser.feed(markup)
+            parser.close()
+        except AssertionError as e:
+            # html.parser raises AssertionError in rare cases to
+            # signal fatal markup problems (often a bad doctype);
+            # close() parses buffered input, so it is guarded too.
+            raise ParserRejectedMarkup(e) from e
         parser.already_closed_empty_element = []
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index f4d62db..d8b3b9b 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -297,37 +297,11 @@ class TreeBuilderSmokeTest(object):
             markup, multi_valued_attributes=multi_valued_attributes
         )
         assert soup.a['class'] == ['a', 'b', 'c']
-        
-    def test_fuzzed_input(self):
-        # This test centralizes in one place the various fuzz tests
-        # for Beautiful Soup created by the oss-fuzz project.
-        
-        # These strings superficially resemble markup, but they
-        # generally can't be parsed into anything. The best we can
-        # hope for is that parsing these strings won't crash the
-        # parser.
-        #
-        # n.b. This markup is commented out because these fuzz tests
-        # _do_ crash the parser. However the crashes are due to bugs
-        # in html.parser, not Beautiful Soup -- otherwise I'd fix the
-        # bugs!
-        
-        bad_markup = [
-            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
-            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
-            # https://bugs.python.org/issue37747
-            #
-            #b'\n<![\xff\xfe\xfe\xcd\x00',
-
-            #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
-            # https://bugs.python.org/issue34480
-            #
-            #b'<![n\x00'
-        ]
-        for markup in bad_markup:
-            with warnings.catch_warnings(record=False):
-                soup = self.soup(markup)
-        
+
+    def test_invalid_doctype(self):
+        # Parsing either malformed declaration must not raise.
+        for markup in ('<![if word]>content<![endif]>', '<!DOCTYPE html]ff>'):
+            self.soup(markup)
 
 class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
 
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 470d393..a1195d8 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -3,9 +3,11 @@ trees."""
 
 from pdb import set_trace
 import pickle
+import pytest
 import warnings
 from bs4.builder import (
     HTMLParserTreeBuilder,
+    ParserRejectedMarkup,
     XMLParsedAsHTMLWarning,
 )
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
 
     default_builder = HTMLParserTreeBuilder
 
+    def test_rejected_input(self):
+        # Python's html.parser occasionally refuses markup outright,
+        # most often because the initial DOCTYPE declaration is
+        # broken. Each Python version signals this differently, but
+        # Beautiful Soup should always surface the refusal as a
+        # ParserRejectedMarkup exception.
+        rejected = (
+            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+            # https://github.com/python/cpython/issues/81928
+            b'\n<![\xff\xfe\xfe\xcd\x00',
+
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+            # https://github.com/python/cpython/issues/78661
+            b'<![n\x00',
+
+            b"<![UNKNOWN[]]>",
+        )
+        for document in rejected:
+            with pytest.raises(ParserRejectedMarkup):
+                self.soup(document)
+    
     def test_namespaced_system_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass