summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2021-12-19 13:56:48 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-12-19 13:56:48 -0500
commit: 497bf714d07e491bc811abd64e5fe3391564998d (patch)
tree: fdc945b11e121fb21b8d65284d0db5a9ebc43d30
parent: 854fd52ad616d8e9c0860bba2eb4ddd93eb2dc79 (diff)
If the charset-normalizer Python module
(https://pypi.org/project/charset-normalizer/) is installed, Beautiful Soup will use it to detect the character sets of incoming documents. This is also the module used by newer versions of the Requests library. For the sake of backwards compatibility, chardet and cchardet both take precedence if installed. [bug=1955346]
-rw-r--r-- CHANGELOG 13
-rw-r--r-- bs4/dammit.py 41
-rw-r--r-- doc/source/index.rst 8
3 files changed, 37 insertions, 25 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 3c807d5..25fa1c4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,8 +11,12 @@ Python 2 was revision 605.
to make it possible to treat ruby text specially in get_text() calls.
[bug=1941980]
-* Fixed a crash when overriding multi_valued_attributes and using the
- html5lib parser. [bug=1948488]
+* If the charset-normalizer Python module
+ (https://pypi.org/project/charset-normalizer/) is installed, Beautiful
+ Soup will use it to detect the character sets of incoming documents.
+ This is also the module used by newer versions of the Requests library.
+ For the sake of backwards compatibility, chardet and cchardet both take
+ precedence if installed. [bug=1955346]
* Added a workaround for an lxml bug
(https://bugs.launchpad.net/lxml/+bug/1948551) that causes
@@ -33,9 +37,12 @@ Python 2 was revision 605.
version. "text" still works, but will give a DeprecationWarning.
[bug=1947038]
-* Fix a crash when pickling a BeautifulSoup object that has no
+* Fixed a crash when pickling a BeautifulSoup object that has no
tree builder. [bug=1934003]
+* Fixed a crash when overriding multi_valued_attributes and using the
+ html5lib parser. [bug=1948488]
+
= 4.10.0 (20210907)
* This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/dammit.py b/bs4/dammit.py
index e017408..de016ca 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -16,32 +16,37 @@ import re
import logging
import string
-# Import a library to autodetect character encodings.
-chardet_type = None
+# Import a library to autodetect character encodings. We'll support
+# any of a number of libraries that all support the same API:
+#
+# * cchardet
+# * chardet
+# * charset-normalizer
+chardet_module = None
try:
- # First try the fast C implementation.
# PyPI package: cchardet
- import cchardet
- def chardet_dammit(s):
- if isinstance(s, str):
- return None
- return cchardet.detect(s)['encoding']
+ import cchardet as chardet_module
except ImportError:
try:
- # Fall back to the pure Python implementation
# Debian package: python-chardet
# PyPI package: chardet
- import chardet
- def chardet_dammit(s):
- if isinstance(s, str):
- return None
- return chardet.detect(s)['encoding']
- #import chardet.constants
- #chardet.constants._debug = 1
+ import chardet as chardet_module
except ImportError:
- # No chardet available.
- def chardet_dammit(s):
+ try:
+ # PyPI package: charset-normalizer
+ import charset_normalizer as chardet_module
+ except ImportError:
+ # No chardet available.
+ chardet_module = None
+
+if chardet_module:
+ def chardet_dammit(s):
+ if isinstance(s, str):
return None
+ return chardet_module.detect(s)['encoding']
+else:
+ def chardet_dammit(s):
+ return None
# Available from http://cjkpython.i18n.org/.
#
diff --git a/doc/source/index.rst b/doc/source/index.rst
index d81fccd..66bd03e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2676,10 +2676,10 @@ become Unicode::
# 'utf-8'
Unicode, Dammit's guesses will get a lot more accurate if you install
-the ``chardet`` or ``cchardet`` Python libraries. The more data you
-give Unicode, Dammit, the more accurately it will guess. If you have
-your own suspicions as to what the encoding might be, you can pass
-them in as a list::
+one of these Python libraries: ``charset-normalizer``, ``chardet``, or
+``cchardet``. The more data you give Unicode, Dammit, the more
+accurately it will guess. If you have your own suspicions as to what
+the encoding might be, you can pass them in as a list::
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)