summaryrefslogtreecommitdiff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2021-02-13 11:51:13 -0500
committerLeonard Richardson <leonardr@segfault.org>2021-02-13 11:51:13 -0500
commit8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
treeb0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/dammit.py
parent4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
Added a second way to specify encodings to UnicodeDammit and
EncodingDetector, based on the order of precedence defined in the HTML5 spec, starting at: https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding Encodings in 'known_definite_encodings' are tried first, then byte-order-mark sniffing is run, then encodings in 'user_encodings' are tried. The old argument, 'override_encodings', is now a deprecated alias for 'known_definite_encodings'. This changes the default behavior of the html.parser and lxml tree builders, in a way that may slightly improve encoding detection but will probably have no effect. [bug=1889014]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r--bs4/dammit.py98
1 files changed, 81 insertions, 17 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 33f7b7d..7e0a7f8 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -228,32 +228,65 @@ class EncodingDetector:
Order of precedence:
1. Encodings you specifically tell EncodingDetector to try first
- (the override_encodings argument to the constructor).
+ (the known_definite_encodings argument to the constructor).
- 2. An encoding declared within the bytestring itself, either in an
+ 2. An encoding determined by sniffing the document's byte-order mark.
+
+ 3. Encodings you specifically tell EncodingDetector to try if
+ byte-order mark sniffing fails (the user_encodings argument to the
+ constructor).
+
+ 4. An encoding declared within the bytestring itself, either in an
XML declaration (if the bytestring is to be interpreted as an XML
document), or in a <meta> tag (if the bytestring is to be
interpreted as an HTML document.)
- 3. An encoding detected through textual analysis by chardet,
+ 5. An encoding detected through textual analysis by chardet,
cchardet, or a similar external library.
6. UTF-8.
7. Windows-1252.
+
"""
- def __init__(self, markup, override_encodings=None, is_html=False,
- exclude_encodings=None):
+ def __init__(self, markup, known_definite_encodings=None,
+ is_html=False, exclude_encodings=None,
+ user_encodings=None, override_encodings=None):
"""Constructor.
:param markup: Some markup in an unknown encoding.
- :param override_encodings: These encodings will be tried first.
- :param is_html: If True, this markup is considered to be HTML. Otherwise
- it's assumed to be XML.
- :param exclude_encodings: These encodings will not be tried, even
- if they otherwise would be.
+
+ :param known_definite_encodings: When determining the encoding
+ of `markup`, these encodings will be tried first, in
+ order. In HTML terms, this corresponds to the "known
+ definite encoding" step defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+ :param user_encodings: These encodings will be tried after the
+ `known_definite_encodings` have been tried and failed, and
+ after an attempt to sniff the encoding by looking at a
+ byte order mark has failed. In HTML terms, this
+ corresponds to the step "user has explicitly instructed
+ the user agent to override the document's character
+ encoding", defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+ :param override_encodings: A deprecated alias for
+ known_definite_encodings. Any encodings here will be tried
+ immediately after the encodings in
+ known_definite_encodings.
+
+ :param is_html: If True, this markup is considered to be
+ HTML. Otherwise it's assumed to be XML.
+
+ :param exclude_encodings: These encodings will not be tried,
+ even if they otherwise would be.
+
"""
- self.override_encodings = override_encodings or []
+ self.known_definite_encodings = list(known_definite_encodings or [])
+ if override_encodings:
+ self.known_definite_encodings += override_encodings
+ self.user_encodings = user_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
@@ -286,7 +319,9 @@ class EncodingDetector:
:yield: A sequence of strings.
"""
tried = set()
- for e in self.override_encodings:
+
+ # First, try the known definite encodings
+ for e in self.known_definite_encodings:
if self._usable(e, tried):
yield e
@@ -295,6 +330,12 @@ class EncodingDetector:
if self._usable(self.sniffed_encoding, tried):
yield self.sniffed_encoding
+ # Sniffing the byte-order mark did nothing; try the user
+ # encodings.
+ for e in self.user_encodings:
+ if self._usable(e, tried):
+ yield e
+
# Look within the document for an XML or HTML encoding
# declaration.
if self.declared_encoding is None:
@@ -405,13 +446,33 @@ class UnicodeDammit:
"iso-8859-2",
]
- def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+ def __init__(self, markup, known_definite_encodings=[],
+ smart_quotes_to=None, is_html=False, exclude_encodings=[],
+ user_encodings=None, override_encodings=None
+ ):
"""Constructor.
:param markup: A bytestring representing markup in an unknown encoding.
- :param override_encodings: These encodings will be tried first,
- before any sniffing code is run.
+
+ :param known_definite_encodings: When determining the encoding
+ of `markup`, these encodings will be tried first, in
+ order. In HTML terms, this corresponds to the "known
+ definite encoding" step defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+ :param user_encodings: These encodings will be tried after the
+ `known_definite_encodings` have been tried and failed, and
+ after an attempt to sniff the encoding by looking at a
+ byte order mark has failed. In HTML terms, this
+ corresponds to the step "user has explicitly instructed
+ the user agent to override the document's character
+ encoding", defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+ :param override_encodings: A deprecated alias for
+ known_definite_encodings. Any encodings here will be tried
+ immediately after the encodings in
+ known_definite_encodings.
:param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
@@ -421,6 +482,7 @@ class UnicodeDammit:
it's assumed to be XML.
:param exclude_encodings: These encodings will not be considered, even
if the sniffing code thinks they might make sense.
+
"""
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
@@ -428,7 +490,9 @@ class UnicodeDammit:
self.is_html = is_html
self.log = logging.getLogger(__name__)
self.detector = EncodingDetector(
- markup, override_encodings, is_html, exclude_encodings)
+ markup, known_definite_encodings, is_html, exclude_encodings,
+ user_encodings, override_encodings
+ )
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':