From ab0626db2a60f4f22b97ece310d92038b3da5cc1 Mon Sep 17 00:00:00 2001
From: Leonard Richardson
Date: Mon, 2 Sep 2019 13:01:06 -0400
Subject: Avoid a crash when trying to detect the declared encoding of a Unicode document. Raise an explanatory exception when the underlying parser completely rejects the incoming markup. [bug=1838877]

---
 bs4/__init__.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'bs4/__init__.py')

diff --git a/bs4/__init__.py b/bs4/__init__.py
index e27ca6f..e85a0bf 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -302,6 +302,8 @@ class BeautifulSoup(Tag):
                     ' Beautiful Soup.' % markup)
             self._check_markup_is_url(markup)
 
+        rejections = []
+        success = False
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
              self.builder.prepare_markup(
@@ -309,10 +311,18 @@ class BeautifulSoup(Tag):
             self.reset()
             try:
                 self._feed()
+                success = True
                 break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                 pass
 
+        if not success:
+            other_exceptions = [unicode(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
         # Clear out the markup and remove the builder's circular
         # reference to this object.
         self.markup = None
-- 
cgit v1.2.3