diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
commit | feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch) | |
tree | 6dce892919c201b629628647f86843382b29a60a /bs4/builder | |
parent | d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff) |
Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of
an encoding that you know is wrong. [bug=1469408]
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/_html5lib.py | 9 |
-rw-r--r-- | bs4/builder/_htmlparser.py | 5 |
-rw-r--r-- | bs4/builder/_lxml.py | 4 |
3 files changed, 14 insertions, 4 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0778dde..7788063 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -29,9 +29,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     features = [NAME, PERMISSIVE, HTML_5, HTML]
 
-    def prepare_markup(self, markup, user_specified_encoding):
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
         yield (markup, None, None, False)
 
     # These methods are defined by Beautiful Soup.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index b2cd467..25811f1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -138,7 +138,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         self.parser_args = (args, kwargs)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
         """
         :return: A 4-tuple (markup, original encoding, encoding
         declared within markup, whether any characters had to be
@@ -149,7 +149,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
             return
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
         yield (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index b0bc8a0..2e33386 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -77,6 +77,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         return (None, tag)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                        document_declared_encoding=None):
         """
         :yield: A series of 4-tuples.
@@ -102,7 +103,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # the document as each one in turn.
         is_html = not self.is_xml
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
         for encoding in detector.encodings:
             yield (detector.markup, encoding, document_declared_encoding, False)
 