From 8f763297abc8bb598c3aca25eccaef6db7f7c987 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sat, 13 Feb 2021 11:51:13 -0500
Subject: Added a second way to specify encodings to UnicodeDammit and
 EncodingDetector, based on the order of precedence defined in the   HTML5
 spec, starting at:  
 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

  Encodings in 'known_definite_encodings' are tried first, then
  byte-order-mark sniffing is run, then encodings in 'user_encodings'
  are tried. The old argument, 'override_encodings', is now a
  deprecated alias for 'known_definite_encodings'.

  This changes the default behavior of the html.parser and lxml tree
  builders, in a way that may slightly improve encoding
  detection but will probably have no effect. [bug=1889014]
---
 bs4/builder/_lxml.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'bs4/builder/_lxml.py')

diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1b44d75..c670b84 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        try_encodings = [user_specified_encoding, document_declared_encoding]
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec.  (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
         detector = EncodingDetector(
-            markup, try_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings, is_html=is_html,
+            exclude_encodings=exclude_encodings
+        )
         for encoding in detector.encodings:
             yield (detector.markup, encoding, document_declared_encoding, False)
 
-- 
cgit v1.2.3