diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-02-15 20:37:18 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-02-15 20:37:18 -0500 |
commit | e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch) | |
tree | d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/builder/_htmlparser.py | |
parent | 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff) |
When the html.parser parser decides it can't parse a document, Beautiful
Soup now consistently propagates this fact by raising a
ParserRejectedMarkup error. [bug=2007343]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 25 |
1 files changed, 24 insertions, 1 deletions
```diff
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e48b6a0..e065096 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
 
 from bs4.builder import (
     DetectsXMLParsedAsHTML,
+    ParserRejectedMarkup,
     HTML,
     HTMLTreeBuilder,
     STRICT,
@@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
 
         self._initialize_xml_detector()
 
+    def error(self, message):
+        # NOTE: This method is required so long as Python 3.9 is
+        # supported. The corresponding code is removed from HTMLParser
+        # in 3.5, but not removed from ParserBase until 3.10.
+        # https://github.com/python/cpython/issues/76025
+        #
+        # The original implementation turned the error into a warning,
+        # but in every case I discovered, this made HTMLParser
+        # immediately crash with an error message that was less
+        # helpful than the warning. The new implementation makes it
+        # more clear that html.parser just can't parse this
+        # markup. The 3.10 implementation does the same, though it
+        # raises AssertionError rather than calling a method. (We
+        # catch this error and wrap it in a ParserRejectedMarkup.)
+        raise ParserRejectedMarkup(message)
+
     def handle_startendtag(self, name, attrs):
         """Handle an incoming empty-element tag.
 
@@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
-        parser.feed(markup)
+        try:
+            parser.feed(markup)
+        except AssertionError as e:
+            # html.parser raises AssertionError in rare cases to
+            # indicate a fatal problem with the markup, especially
+            # when there's an error in the doctype declaration.
+            raise ParserRejectedMarkup(e)
         parser.close()
         parser.already_closed_empty_element = []
```