Added an exclude_encodings argument to UnicodeDammit and to the
Beautiful Soup constructor, which lets you prohibit the detection of
an encoding that you know is wrong. [bug=1469408]
author: Leonard Richardson <leonardr@segfault.org> 2015-06-27 09:55:40 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2015-06-27 09:55:40 -0400
commit: feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch)
tree: 6dce892919c201b629628647f86843382b29a60a
parent: d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff)
8 files changed, 63 insertions, 9 deletions
diff --git a/NEWS.txt b/NEWS.txt
index cc3c17f..bcd4223 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -18,6 +18,10 @@
   argument described in the documentation. `text` may eventually
   change its meaning, but not for a very long time. [bug=1366856]
 
+* Added an `exclude_encodings` argument to UnicodeDammit and to the
+  Beautiful Soup constructor, which lets you prohibit the detection of
+  an encoding that you know is wrong. [bug=1469408]
+
 * Fixed yet another problem that caused the html5lib tree builder to
   create a disconnected parse tree. [bug=1237763]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 4b92152..e167544 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -80,7 +80,8 @@ class BeautifulSoup(Tag):
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
 
     def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
+                 parse_only=None, from_encoding=None, exclude_encodings=None,
+                 **kwargs):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser."""
@@ -202,7 +203,8 @@ class BeautifulSoup(Tag):
 
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
-            self.builder.prepare_markup(markup, from_encoding)):
+             self.builder.prepare_markup(
+                 markup, from_encoding, exclude_encodings=exclude_encodings)):
             self.reset()
             try:
                 self._feed()
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0778dde..7788063 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -29,9 +29,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     features = [NAME, PERMISSIVE, HTML_5, HTML]
 
-    def prepare_markup(self, markup, user_specified_encoding):
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings are
+        # currently ignored here, because the html5lib TreeBuilder
+        # doesn't use UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
         yield (markup, None, None, False)
 
     # These methods are defined by Beautiful Soup.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index b2cd467..25811f1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -138,7 +138,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         self.parser_args = (args, kwargs)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
         """
         :return: A 4-tuple (markup, original encoding, encoding
         declared within markup, whether any characters had to be
@@ -149,7 +149,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
             return
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
         yield (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index b0bc8a0..2e33386 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -77,6 +77,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             return (None, tag)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                        document_declared_encoding=None):
         """
         :yield: A series of 4-tuples.
@@ -102,7 +103,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # the document as each one in turn.
         is_html = not self.is_xml
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
         for encoding in detector.encodings:
             yield (detector.markup, encoding, document_declared_encoding, False)
 
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 7ced3a5..8e6b347 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -213,8 +213,11 @@ class EncodingDetector:
 
     5. Windows-1252.
     """
-    def __init__(self, markup, override_encodings=None, is_html=False):
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
         self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
         self.chardet_encoding = None
         self.is_html = is_html
         self.declared_encoding = None
@@ -225,6 +228,8 @@ class EncodingDetector:
     def _usable(self, encoding, tried):
         if encoding is not None:
             encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
             if encoding not in tried:
                 tried.add(encoding)
                 return True
@@ -332,13 +337,14 @@ class UnicodeDammit:
         ]
 
     def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
         self.is_html = is_html
 
-        self.detector = EncodingDetector(markup, override_encodings, is_html)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)
 
         # Short-circuit if the data is in Unicode to begin with.
         if isinstance(markup, unicode) or markup == '':
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index e2e2c30..3643aed 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -50,6 +50,11 @@ class TestConstructor(SoupTest):
         soup = self.soup(data)
         self.assertEqual(u"foo\0bar", soup.h1.string)
 
+    def test_exclude_encodings(self):
+        utf8_data = u"Räksmörgås".encode("utf-8")
+        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual("windows-1252", soup.original_encoding)
+
 
 class TestWarnings(SoupTest):
 
@@ -322,6 +327,20 @@ class TestUnicodeDammit(unittest.TestCase):
             dammit = UnicodeDammit(utf8_data, [bad_encoding])
             self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
 
+    def test_exclude_encodings(self):
+        # This is UTF-8.
+        utf8_data = u"Räksmörgås".encode("utf-8")
+
+        # But if we exclude UTF-8 from consideration, the guess is
+        # Windows-1252.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+        # And if we exclude that, there is no valid guess at all.
+        dammit = UnicodeDammit(
+            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+        self.assertEqual(dammit.original_encoding, None)
+
     def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
         detected = EncodingDetector(
             b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 1b7b1e6..821dad4 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2397,6 +2397,19 @@ We can fix this by passing in the correct ``from_encoding``::
  soup.original_encoding
  'iso8859-8'
 
+If you don't know what the correct encoding is, but you know that
+Unicode, Dammit is guessing wrong, you can pass the wrong guesses in
+as ``exclude_encodings``::
+
+ soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
+ soup.h1
+ <h1>םולש</h1>
+ soup.original_encoding
+ 'WINDOWS-1255'
+
+(The reported encoding isn't exactly right, but Windows-1255 is a
+compatible superset of ISO-8859-8, so it's close enough.)
+
 In rare cases (usually when a UTF-8 document contains text written in
 a completely different encoding), the only way to get Unicode may be
 to replace some characters with the special Unicode character
author	Leonard Richardson <leonardr@segfault.org>	2015-06-27 09:55:40 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2015-06-27 09:55:40 -0400
commit	feffc5a1146e2520c90682bc2c33f5fa7d3943f0 (patch)
tree	6dce892919c201b629628647f86843382b29a60a
parent	d728b9cbd6cd5954acf7c9c32fe2f1878809d6e8 (diff)