diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-09-02 13:01:06 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-09-02 13:01:06 -0400 |
commit | ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch) | |
tree | bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/__init__.py | |
parent | cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff) |
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser
completely rejects the incoming markup. [bug=1838877]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 12 |
1 file changed, 11 insertions, 1 deletion
```diff
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e27ca6f..e85a0bf 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -302,6 +302,8 @@ class BeautifulSoup(Tag):
                     ' Beautiful Soup.' % markup)
             self._check_markup_is_url(markup)
 
+        rejections = []
+        success = False
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
              self.builder.prepare_markup(
@@ -309,10 +311,18 @@ class BeautifulSoup(Tag):
             self.reset()
             try:
                 self._feed()
+                success = True
                 break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                 pass
 
+        if not success:
+            other_exceptions = [unicode(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
         # Clear out the markup and remove the builder's circular
         # reference to this object.
         self.markup = None
```