summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
committer Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
commit ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/__init__.py
parent cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser completely rejects the incoming markup. [bug=1838877]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 12
1 files changed, 11 insertions, 1 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e27ca6f..e85a0bf 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -302,6 +302,8 @@ class BeautifulSoup(Tag):
' Beautiful Soup.' % markup)
self._check_markup_is_url(markup)
+ rejections = []
+ success = False
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(
@@ -309,10 +311,18 @@ class BeautifulSoup(Tag):
self.reset()
try:
self._feed()
+ success = True
break
- except ParserRejectedMarkup:
+ except ParserRejectedMarkup as e:
+ rejections.append(e)
pass
+ if not success:
+ other_exceptions = [unicode(e) for e in rejections]
+ raise ParserRejectedMarkup(
+ u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+ )
+
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None