When the html.parser parser decides it can't parse a document, Beautiful Soup now consistently propagates this fact by raising a ParserRejectedMarkup error. [bug=2007343]
author: Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
commit: e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree: d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/tests/test_htmlparser.py
parent: 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)
1 files changed, 24 insertions, 0 deletions
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 470d393..a1195d8 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -3,9 +3,11 @@ trees."""
 
 from pdb import set_trace
 import pickle
+import pytest
 import warnings
 from bs4.builder import (
     HTMLParserTreeBuilder,
+    ParserRejectedMarkup,
     XMLParsedAsHTMLWarning,
 )
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
 
     default_builder = HTMLParserTreeBuilder
 
+    def test_rejected_input(self):
+        # Python's html.parser will occasionally reject markup,
+        # especially when there is a problem with the initial DOCTYPE
+        # declaration. Different versions of Python sound the alarm in
+        # different ways, but Beautiful Soup consistently raises
+        # errors as ParserRejectedMarkup exceptions.
+        bad_markup = [
+            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+            # https://github.com/python/cpython/issues/81928
+            b'\n<![\xff\xfe\xfe\xcd\x00',
+
+            #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+            # https://github.com/python/cpython/issues/78661
+            #
+            b'<![n\x00',
+            b"<![UNKNOWN[]]>",
+        ]
+        for markup in bad_markup:
+            with pytest.raises(ParserRejectedMarkup):
+                soup = self.soup(markup)
+    
     def test_namespaced_system_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
author	Leonard Richardson <leonardr@segfault.org>	2023-02-15 20:37:18 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2023-02-15 20:37:18 -0500
commit	e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree	d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/tests/test_htmlparser.py
parent	8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)