summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2015-06-27 09:55:40 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2015-06-27 09:55:40 -0400
commit: feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch)
tree: 6dce892919c201b629628647f86843382b29a60a /bs4/dammit.py
parent: d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff)
Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of an encoding that you know is wrong. [bug=1469408]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 12
1 file changed, 9 insertions, 3 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 7ced3a5..8e6b347 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -213,8 +213,11 @@ class EncodingDetector:
5. Windows-1252.
"""
- def __init__(self, markup, override_encodings=None, is_html=False):
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
self.is_html = is_html
self.declared_encoding = None
@@ -225,6 +228,8 @@ class EncodingDetector:
def _usable(self, encoding, tried):
if encoding is not None:
encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
if encoding not in tried:
tried.add(encoding)
return True
@@ -332,13 +337,14 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False):
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
- self.detector = EncodingDetector(markup, override_encodings, is_html)
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':