summary | refs | log | tree | commit | diff
path: root/bs4/tests
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
committer Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
commit e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/tests
parent 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)
When the html.parser parser decides it can't parse a document, Beautiful
Soup now consistently propagates this fact by raising a ParserRejectedMarkup error. [bug=2007343]
Diffstat (limited to 'bs4/tests')
-rw-r--r-- bs4/tests/__init__.py 36
-rw-r--r-- bs4/tests/test_htmlparser.py 24
2 files changed, 29 insertions, 31 deletions
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index f4d62db..d8b3b9b 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -297,37 +297,11 @@ class TreeBuilderSmokeTest(object):
markup, multi_valued_attributes=multi_valued_attributes
)
assert soup.a['class'] == ['a', 'b', 'c']
-
- def test_fuzzed_input(self):
- # This test centralizes in one place the various fuzz tests
- # for Beautiful Soup created by the oss-fuzz project.
-
- # These strings superficially resemble markup, but they
- # generally can't be parsed into anything. The best we can
- # hope for is that parsing these strings won't crash the
- # parser.
- #
- # n.b. This markup is commented out because these fuzz tests
- # _do_ crash the parser. However the crashes are due to bugs
- # in html.parser, not Beautiful Soup -- otherwise I'd fix the
- # bugs!
-
- bad_markup = [
- # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
- # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
- # https://bugs.python.org/issue37747
- #
- #b'\n<![\xff\xfe\xfe\xcd\x00',
-
- #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
- # https://bugs.python.org/issue34480
- #
- #b'<![n\x00'
- ]
- for markup in bad_markup:
- with warnings.catch_warnings(record=False):
- soup = self.soup(markup)
-
+
+ def test_invalid_doctype(self):
+ markup = '<![if word]>content<![endif]>'
+ markup = '<!DOCTYPE html]ff>'
+ soup = self.soup(markup)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 470d393..a1195d8 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -3,9 +3,11 @@ trees."""
from pdb import set_trace
import pickle
+import pytest
import warnings
from bs4.builder import (
HTMLParserTreeBuilder,
+ ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
)
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
+ def test_rejected_input(self):
+ # Python's html.parser will occasionally reject markup,
+ # especially when there is a problem with the initial DOCTYPE
+ # declaration. Different versions of Python sound the alarm in
+ # different ways, but Beautiful Soup consistently raises
+ # errors as ParserRejectedMarkup exceptions.
+ bad_markup = [
+ # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+ # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+ # https://github.com/python/cpython/issues/81928
+ b'\n<![\xff\xfe\xfe\xcd\x00',
+
+ #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+ # https://github.com/python/cpython/issues/78661
+ #
+ b'<![n\x00',
+ b"<![UNKNOWN[]]>",
+ ]
+ for markup in bad_markup:
+ with pytest.raises(ParserRejectedMarkup):
+ soup = self.soup(markup)
+
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass