diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-09-02 13:01:06 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-09-02 13:01:06 -0400 |
commit | ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch) | |
tree | bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa /bs4/builder/__init__.py | |
parent | cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff) |
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser
completely rejects the incoming markup. [bug=1838877]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 15 |
1 file changed, 11 insertions, 4 deletions
```diff
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e28242b..7efbf89 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -175,8 +175,8 @@ class TreeBuilder(object):
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        return markup, None, None, False
+                       document_declared_encoding=None, exclude_encodings=None):
+        yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
         """Wrap an HTML fragment to make it look like a document.
@@ -363,8 +363,15 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
-    pass
-
+    def __init__(self, message_or_exception):
+        """Explain why the parser rejected the given markup, either
+        with a textual explanation or another exception.
+        """
+        if isinstance(message_or_exception, Exception):
+            e = message_or_exception
+            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+        super(ParserRejectedMarkup, self).__init__(message_or_exception)
+
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
```