summary refs log tree commit diff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
committer Leonard Richardson <leonardr@segfault.org> 2023-02-15 20:37:18 -0500
commit e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch)
tree d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/builder/_htmlparser.py
parent 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff)
When the html.parser parser decides it can't parse a document, Beautiful
Soup now consistently propagates this fact by raising a ParserRejectedMarkup error. [bug=2007343]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- bs4/builder/_htmlparser.py 25
1 files changed, 24 insertions, 1 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e48b6a0..e065096 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
DetectsXMLParsedAsHTML,
+ ParserRejectedMarkup,
HTML,
HTMLTreeBuilder,
STRICT,
@@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self._initialize_xml_detector()
+ def error(self, message):
+ # NOTE: This method is required so long as Python 3.9 is
+ # supported. The corresponding code is removed from HTMLParser
+ # in 3.5, but not removed from ParserBase until 3.10.
+ # https://github.com/python/cpython/issues/76025
+ #
+ # The original implementation turned the error into a warning,
+ # but in every case I discovered, this made HTMLParser
+ # immediately crash with an error message that was less
+ # helpful than the warning. The new implementation makes it
+ # more clear that html.parser just can't parse this
+ # markup. The 3.10 implementation does the same, though it
+ # raises AssertionError rather than calling a method. (We
+ # catch this error and wrap it in a ParserRejectedMarkup.)
+ raise ParserRejectedMarkup(message)
+
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
- parser.feed(markup)
+ try:
+ parser.feed(markup)
+ except AssertionError as e:
+ # html.parser raises AssertionError in rare cases to
+ # indicate a fatal problem with the markup, especially
+ # when there's an error in the doctype declaration.
+ raise ParserRejectedMarkup(e)
parser.close()
parser.already_closed_empty_element = []