From 8f763297abc8bb598c3aca25eccaef6db7f7c987 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 13 Feb 2021 11:51:13 -0500 Subject: Added a second way to specify encodings to UnicodeDammit and EncodingDetector, based on the order of precedence defined in the HTML5 spec, starting at: https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding Encodings in 'known_definite_encodings' are tried first, then byte-order-mark sniffing is run, then encodings in 'user_encodings' are tried. The old argument, 'override_encodings', is now a deprecated alias for 'known_definite_encodings'. This changes the default behavior of the html.parser and lxml tree builders, in a way that may slightly improve encoding detection but will probably have no effect. [bug=1889014] --- bs4/builder/_lxml.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 1b44d75..c670b84 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - try_encodings = [user_specified_encoding, document_declared_encoding] + # This was provided by the end-user; treat it as a known + # definite encoding per the algorithm laid out in the HTML5 + # spec. (See the EncodingDetector class for details.) + known_definite_encodings = [user_specified_encoding] + + # This was found in the document; treat it as a slightly lower-priority + # user encoding. 
+ user_encodings = [document_declared_encoding] detector = EncodingDetector( - markup, try_encodings, is_html, exclude_encodings) + markup, known_definite_encodings=known_definite_encodings, + user_encodings=user_encodings, is_html=is_html, + exclude_encodings=exclude_encodings + ) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) -- cgit v1.2.3