summaryrefslogtreecommitdiff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
committer Leonard Richardson <leonardr@segfault.org> 2021-02-13 11:51:13 -0500
commit 8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/builder/_htmlparser.py
parent 4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
Added a second way to specify encodings to UnicodeDammit and
EncodingDetector, based on the order of precedence defined in the HTML5 spec, starting at: https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding Encodings in 'known_definite_encodings' are tried first, then byte-order-mark sniffing is run, then encodings in 'user_encodings' are tried. The old argument, 'override_encodings', is now a deprecated alias for 'known_definite_encodings'. This changes the default behavior of the html.parser and lxml tree builders, in a way that may slightly improve encoding detection but will probably have no effect. [bug=1889014]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r--bs4/builder/_htmlparser.py19
1 files changed, 17 insertions, 2 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 96a7b7d..2f2bf1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -359,9 +359,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
return
# Ask UnicodeDammit to sniff the most likely encoding.
+
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the HTML5
+ # spec. (See the EncodingDetector class for details.)
+ known_definite_encodings = [user_specified_encoding]
+
+ # This was found in the document; treat it as a slightly lower-priority
+ # user encoding.
+ user_encodings = [document_declared_encoding]
+
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True,
- exclude_encodings=exclude_encodings)
+ dammit = UnicodeDammit(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=True,
+ exclude_encodings=exclude_encodings
+ )
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)