Added a second way to specify encodings to UnicodeDammit and

EncodingDetector, based on the order of precedence defined in the HTML5 spec (starting at https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding). Encodings in 'known_definite_encodings' are tried first, then byte-order-mark sniffing is run, then encodings in 'user_encodings' are tried. The old argument, 'override_encodings', is now a deprecated alias for 'known_definite_encodings'. This changes the default behavior of the html.parser and lxml tree builders, in a way that may slightly improve encoding detection but will probably have no effect. [bug=1889014]
author: Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
commit: 8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree: b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/builder
parent: 4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
3 files changed, 31 insertions, 5 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 03da4c6..b6e2c37 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -234,7 +234,8 @@ class TreeBuilder(object):
         :param markup: Some markup -- probably a bytestring.
         :param user_specified_encoding: The user asked to try this encoding.
         :param document_declared_encoding: The markup itself claims to be
-            in this encoding.
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
         :param exclude_encodings: The user asked _not_ to try any of
             these encodings.
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 96a7b7d..2f2bf1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -359,9 +359,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
             return
 
         # Ask UnicodeDammit to sniff the most likely encoding.
+
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec.  (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
+
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
-                               exclude_encodings=exclude_encodings)
+        dammit = UnicodeDammit(
+            markup,
+            known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings,
+            is_html=True,
+            exclude_encodings=exclude_encodings
+        )
         yield (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1b44d75..c670b84 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        try_encodings = [user_specified_encoding, document_declared_encoding]
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec.  (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
         detector = EncodingDetector(
-            markup, try_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings, is_html=is_html,
+            exclude_encodings=exclude_encodings
+        )
         for encoding in detector.encodings:
             yield (detector.markup, encoding, document_declared_encoding, False)
author	Leonard Richardson <leonardr@segfault.org>	2021-02-13 11:51:13 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-02-13 11:51:13 -0500
commit	8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree	b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/builder
parent	4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)