summary | refs | log | tree | commit | diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-09-02 13:01:06 -0400
commit: ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree: bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/builder/__init__.py
parent: cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser completely rejects the incoming markup. [bug=1838877]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e28242b..7efbf89 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -175,8 +175,8 @@ class TreeBuilder(object):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- return markup, None, None, False
+ document_declared_encoding=None, exclude_encodings=None):
+ yield markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
@@ -363,8 +363,15 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
- pass
-
+ def __init__(self, message_or_exception):
+ """Explain why the parser rejected the given markup, either
+ with a textual explanation or another exception.
+ """
+ if isinstance(message_or_exception, Exception):
+ e = message_or_exception
+ message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+ super(ParserRejectedMarkup, self).__init__(message_or_exception)
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only