diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-27 09:55:40 -0400 |
commit | feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch) | |
tree | 6dce892919c201b629628647f86843382b29a60a /bs4/dammit.py | |
parent | d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff) |
Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of
an encoding that you know is wrong. [bug=1469408]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 12 |
1 file changed, 9 insertions, 3 deletions
```diff
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 7ced3a5..8e6b347 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -213,8 +213,11 @@ class EncodingDetector:
     5. Windows-1252.
     """
 
-    def __init__(self, markup, override_encodings=None, is_html=False):
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
         self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
         self.chardet_encoding = None
         self.is_html = is_html
         self.declared_encoding = None
@@ -225,6 +228,8 @@ class EncodingDetector:
     def _usable(self, encoding, tried):
         if encoding is not None:
             encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
             if encoding not in tried:
                 tried.add(encoding)
                 return True
@@ -332,13 +337,14 @@ class UnicodeDammit:
         ]
 
     def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
         self.is_html = is_html
 
-        self.detector = EncodingDetector(markup, override_encodings, is_html)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)
 
         # Short-circuit if the data is in Unicode to begin with.
         if isinstance(markup, unicode) or markup == '':
```