diff options
Diffstat (limited to 'bs4/tests/test_fuzz.py')
-rw-r--r-- | bs4/tests/test_fuzz.py | 97 |
1 files changed, 92 insertions, 5 deletions
diff --git a/bs4/tests/test_fuzz.py b/bs4/tests/test_fuzz.py
index f778539..92728c0 100644
--- a/bs4/tests/test_fuzz.py
+++ b/bs4/tests/test_fuzz.py
@@ -14,13 +14,54 @@ from bs4 import (
     BeautifulSoup,
     ParserRejectedMarkup,
 )
+try:
+    from soupsieve.util import SelectorSyntaxError
+    import lxml
+    import html5lib
+    fully_fuzzable = True
+except ImportError:
+    fully_fuzzable = False
+
 
+@pytest.mark.skipif(not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed.")
 class TestFuzz(object):
 
     # Test case markup files from fuzzers are given this extension so
     # they can be included in builds.
     TESTCASE_SUFFIX = ".testcase"
 
+    # Copied 20230512 from
+    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
+    #
+    # Copying the code lets us precisely duplicate the behavior of
+    # oss-fuzz. The downside is that this code changes over time, so
+    # multiple copies of the code must be kept around to run against
+    # older tests. I'm not sure what to do about this, but I may
+    # retire old tests after a time.
+    def fuzz_test_with_css(self, filename):
+        data = self.__markup(filename)
+        parsers = ['lxml-xml', 'html5lib', 'html.parser', 'lxml']
+        try:
+            idx = int(data[0]) % len(parsers)
+        except ValueError:
+            return
+
+        css_selector, data = data[1:10], data[10:]
+
+        try:
+            soup = BeautifulSoup(data[1:], features=parsers[idx])
+        except ParserRejectedMarkup:
+            return
+        except ValueError:
+            return
+
+        list(soup.find_all(True))
+        try:
+            soup.css.select(css_selector.decode('utf-8', 'replace'))
+        except SelectorSyntaxError:
+            return
+        soup.prettify()
+
     # This class of error has been fixed by catching a less helpful
     # exception from html.parser and raising ParserRejectedMarkup
     # instead.
@@ -33,11 +74,14 @@ class TestFuzz(object):
         markup = self.__markup(filename)
         with pytest.raises(ParserRejectedMarkup):
             BeautifulSoup(markup, 'html.parser')
-
+
     # This class of error has to do with very deeply nested documents
     # which overflow the Python call stack when the tree is converted
     # to a string. This is an issue with Beautiful Soup which was fixed
     # as part of [bug=1471755].
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
     @pytest.mark.parametrize(
         "filename", [
             "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
@@ -46,18 +90,44 @@ class TestFuzz(object):
             "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
         ]
     )
-    def test_deeply_nested_document(self, filename):
+    def test_deeply_nested_document_without_css(self, filename):
         # Parsing the document and encoding it back to a string is
         # sufficient to demonstrate that the overflow problem has
         # been fixed.
         markup = self.__markup(filename)
         BeautifulSoup(markup, 'html.parser').encode()
 
+    # This class of error has to do with very deeply nested documents
+    # which overflow the Python call stack when the tree is converted
+    # to a string. This is an issue with Beautiful Soup which was fixed
+    # as part of [bug=1471755].
+    @pytest.mark.parametrize(
+        "filename", [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
+        ]
+    )
+    def test_deeply_nested_document(self, filename):
+        self.fuzz_test_with_css(filename)
+
+    @pytest.mark.parametrize(
+        "filename", [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
+        ]
+    )
+    def test_soupsieve_errors(self, filename):
+        self.fuzz_test_with_css(filename)
+
     # This class of error represents problems with html5lib's parser,
     # not Beautiful Soup. I use
     # https://github.com/html5lib/html5lib-python/issues/568 to notify
     # the html5lib developers of these issues.
-    @pytest.mark.skip("html5lib problems")
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
+    @pytest.mark.skip(reason="html5lib-specific problems")
     @pytest.mark.parametrize(
         "filename", [
             # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
@@ -68,7 +138,7 @@ class TestFuzz(object):
 
             # b'-<math><sElect><mi><sElect><sElect>'
             "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",
-
+
             # b'ñ<table><svg><html>'
             "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",
 
@@ -79,10 +149,27 @@ class TestFuzz(object):
             "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
         ]
     )
-    def test_html5lib_parse_errors(self, filename):
+    def test_html5lib_parse_errors_without_css(self, filename):
         markup = self.__markup(filename)
         print(BeautifulSoup(markup, 'html5lib').encode())
 
+    # This class of error represents problems with html5lib's parser,
+    # not Beautiful Soup. I use
+    # https://github.com/html5lib/html5lib-python/issues/568 to notify
+    # the html5lib developers of these issues.
+    @pytest.mark.skip(reason="html5lib-specific problems")
+    @pytest.mark.parametrize(
+        "filename", [
+            # b'- \xff\xff <math>\x10<select><mi><select><select>t'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
+
+            # b'\xb1<a>\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6401239223762944"
+        ]
+    )
+    def test_html5lib_parse_errors(self, filename):
+        self.fuzz_test_with_css(filename)
+
     def __markup(self, filename):
         if not filename.endswith(self.TESTCASE_SUFFIX):
             filename += self.TESTCASE_SUFFIX