Added a second way to specify encodings to UnicodeDammit and

EncodingDetector, based on the order of precedence defined in the HTML5 spec, starting at: <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>. Encodings in 'known_definite_encodings' are tried first, then byte-order-mark sniffing is run, then encodings in 'user_encodings' are tried. The old argument, 'override_encodings', is now a deprecated alias for 'known_definite_encodings'. This changes the default behavior of the html.parser and lxml tree builders, in a way that may slightly improve encoding detection but will probably have no effect. [bug=1889014]
author: Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
commit: 8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree: b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/dammit.py
parent: 4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
1 files changed, 81 insertions, 17 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 33f7b7d..7e0a7f8 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -228,32 +228,65 @@ class EncodingDetector:
     Order of precedence:
 
     1. Encodings you specifically tell EncodingDetector to try first
-    (the override_encodings argument to the constructor).
+    (the known_definite_encodings argument to the constructor).
 
-    2. An encoding declared within the bytestring itself, either in an
+    2. An encoding determined by sniffing the document's byte-order mark.
+
+    3. Encodings you specifically tell EncodingDetector to try if
+    byte-order mark sniffing fails (the user_encodings argument to the
+    constructor).
+
+    4. An encoding declared within the bytestring itself, either in an
     XML declaration (if the bytestring is to be interpreted as an XML
     document), or in a <meta> tag (if the bytestring is to be
     interpreted as an HTML document.)
 
-    3. An encoding detected through textual analysis by chardet,
+    5. An encoding detected through textual analysis by chardet,
     cchardet, or a similar external library.
 
     4. UTF-8.
 
     5. Windows-1252.
+
     """
-    def __init__(self, markup, override_encodings=None, is_html=False,
-                 exclude_encodings=None):
+    def __init__(self, markup, known_definite_encodings=None,
+                 is_html=False, exclude_encodings=None,
+                 user_encodings=None, override_encodings=None):
         """Constructor.
 
         :param markup: Some markup in an unknown encoding.
-        :param override_encodings: These encodings will be tried first.
-        :param is_html: If True, this markup is considered to be HTML. Otherwise
-            it's assumed to be XML.
-        :param exclude_encodings: These encodings will not be tried, even
-            if they otherwise would be.
+
+        :param known_definite_encodings: When determining the encoding
+            of `markup`, these encodings will be tried first, in
+            order. In HTML terms, this corresponds to the "known
+            definite encoding" step defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+        :param user_encodings: These encodings will be tried after the
+            `known_definite_encodings` have been tried and failed, and
+            after an attempt to sniff the encoding by looking at a
+            byte order mark has failed. In HTML terms, this
+            corresponds to the step "user has explicitly instructed
+            the user agent to override the document's character
+            encoding", defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+        :param override_encodings: A deprecated alias for
+            known_definite_encodings. Any encodings here will be tried
+            immediately after the encodings in
+            known_definite_encodings.
+
+        :param is_html: If True, this markup is considered to be
+            HTML. Otherwise it's assumed to be XML.
+
+        :param exclude_encodings: These encodings will not be tried,
+            even if they otherwise would be.
+
         """
-        self.override_encodings = override_encodings or []
+        self.known_definite_encodings = list(known_definite_encodings or [])
+        if override_encodings:
+            self.known_definite_encodings += override_encodings
+        self.user_encodings = user_encodings or []
         exclude_encodings = exclude_encodings or []
         self.exclude_encodings = set([x.lower() for x in exclude_encodings])
         self.chardet_encoding = None
@@ -286,7 +319,9 @@ class EncodingDetector:
         :yield: A sequence of strings.
         """
         tried = set()
-        for e in self.override_encodings:
+
+        # First, try the known definite encodings
+        for e in self.known_definite_encodings:
             if self._usable(e, tried):
                 yield e
 
@@ -295,6 +330,12 @@ class EncodingDetector:
         if self._usable(self.sniffed_encoding, tried):
             yield self.sniffed_encoding
 
+        # Sniffing the byte-order mark did nothing; try the user
+        # encodings.
+        for e in self.user_encodings:
+            if self._usable(e, tried):
+                yield e
+            
         # Look within the document for an XML or HTML encoding
         # declaration.
         if self.declared_encoding is None:
@@ -405,13 +446,33 @@ class UnicodeDammit:
         "iso-8859-2",
         ]
 
-    def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+    def __init__(self, markup, known_definite_encodings=[],
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[],
+                 user_encodings=None, override_encodings=None
+    ):
         """Constructor.
 
         :param markup: A bytestring representing markup in an unknown encoding.
-        :param override_encodings: These encodings will be tried first,
-           before any sniffing code is run.
+
+        :param known_definite_encodings: When determining the encoding
+            of `markup`, these encodings will be tried first, in
+            order. In HTML terms, this corresponds to the "known
+            definite encoding" step defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+        :param user_encodings: These encodings will be tried after the
+            `known_definite_encodings` have been tried and failed, and
+            after an attempt to sniff the encoding by looking at a
+            byte order mark has failed. In HTML terms, this
+            corresponds to the step "user has explicitly instructed
+            the user agent to override the document's character
+            encoding", defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+        :param override_encodings: A deprecated alias for
+            known_definite_encodings. Any encodings here will be tried
+            immediately after the encodings in
+            known_definite_encodings.
 
         :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
            to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
@@ -421,6 +482,7 @@ class UnicodeDammit:
             it's assumed to be XML.
         :param exclude_encodings: These encodings will not be considered, even
             if the sniffing code thinks they might make sense.
+
         """
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
@@ -428,7 +490,9 @@ class UnicodeDammit:
         self.is_html = is_html
         self.log = logging.getLogger(__name__)
         self.detector = EncodingDetector(
-            markup, override_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings, is_html, exclude_encodings,
+            user_encodings, override_encodings
+        )
 
         # Short-circuit if the data is in Unicode to begin with.
         if isinstance(markup, unicode) or markup == '':
author	Leonard Richardson <leonardr@segfault.org>	2021-02-13 11:51:13 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-02-13 11:51:13 -0500
commit	8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree	b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/dammit.py
parent	4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)