diff options
-rw-r--r-- | CHANGELOG | 4 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 25 | ||||
-rw-r--r-- | bs4/tests/__init__.py | 36 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 24 |
4 files changed, 57 insertions, 32 deletions
@@ -29,6 +29,10 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7. [bug=2003677] +* When the html.parser parser decides it can't parse a document, Beautiful + Soup now consistently propagates this fact by raising a + ParserRejectedMarkup error. [bug=2007343] + = 4.11.2 (20230131) * Fixed test failures caused by nondeterministic behavior of diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index e48b6a0..e065096 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.builder import ( DetectsXMLParsedAsHTML, + ParserRejectedMarkup, HTML, HTMLTreeBuilder, STRICT, @@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): self._initialize_xml_detector() + def error(self, message): + # NOTE: This method is required so long as Python 3.9 is + # supported. The corresponding code is removed from HTMLParser + # in 3.5, but not removed from ParserBase until 3.10. + # https://github.com/python/cpython/issues/76025 + # + # The original implementation turned the error into a warning, + # but in every case I discovered, this made HTMLParser + # immediately crash with an error message that was less + # helpful than the warning. The new implementation makes it + # more clear that html.parser just can't parse this + # markup. The 3.10 implementation does the same, though it + # raises AssertionError rather than calling a method. (We + # catch this error and wrap it in a ParserRejectedMarkup.) + raise ParserRejectedMarkup(message) + def handle_startendtag(self, name, attrs): """Handle an incoming empty-element tag. @@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup - parser.feed(markup) + try: + parser.feed(markup) + except AssertionError as e: + # html.parser raises AssertionError in rare cases to + # indicate a fatal problem with the markup, especially + # when there's an error in the doctype declaration. + raise ParserRejectedMarkup(e) parser.close() parser.already_closed_empty_element = [] diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py index f4d62db..d8b3b9b 100644 --- a/bs4/tests/__init__.py +++ b/bs4/tests/__init__.py @@ -297,37 +297,11 @@ class TreeBuilderSmokeTest(object): markup, multi_valued_attributes=multi_valued_attributes ) assert soup.a['class'] == ['a', 'b', 'c'] - - def test_fuzzed_input(self): - # This test centralizes in one place the various fuzz tests - # for Beautiful Soup created by the oss-fuzz project. - - # These strings superficially resemble markup, but they - # generally can't be parsed into anything. The best we can - # hope for is that parsing these strings won't crash the - # parser. - # - # n.b. This markup is commented out because these fuzz tests - # _do_ crash the parser. However the crashes are due to bugs - # in html.parser, not Beautiful Soup -- otherwise I'd fix the - # bugs! - - bad_markup = [ - # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873 - # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700 - # https://bugs.python.org/issue37747 - # - #b'\n<![\xff\xfe\xfe\xcd\x00', - - #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8 - # https://bugs.python.org/issue34480 - # - #b'<![n\x00' - ] - for markup in bad_markup: - with warnings.catch_warnings(record=False): - soup = self.soup(markup) - + + def test_invalid_doctype(self): + markup = '<![if word]>content<![endif]>' + markup = '<!DOCTYPE html]ff>' + soup = self.soup(markup) class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest): diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index 470d393..a1195d8 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -3,9 +3,11 @@ trees.""" from pdb import set_trace import pickle +import pytest import warnings from bs4.builder import ( HTMLParserTreeBuilder, + ParserRejectedMarkup, XMLParsedAsHTMLWarning, ) from bs4.builder._htmlparser import BeautifulSoupHTMLParser @@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): default_builder = HTMLParserTreeBuilder + def test_rejected_input(self): + # Python's html.parser will occasionally reject markup, + # especially when there is a problem with the initial DOCTYPE + # declaration. Different versions of Python sound the alarm in + # different ways, but Beautiful Soup consistently raises + # errors as ParserRejectedMarkup exceptions. + bad_markup = [ + # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873 + # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700 + # https://github.com/python/cpython/issues/81928 + b'\n<![\xff\xfe\xfe\xcd\x00', + + #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8 + # https://github.com/python/cpython/issues/78661 + # + b'<![n\x00', + b"<![UNKNOWN[]]>", + ] + for markup in bad_markup: + with pytest.raises(ParserRejectedMarkup): + soup = self.soup(markup) + def test_namespaced_system_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass |