summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 13
-rw-r--r-- bs4/dammit.py 41
-rw-r--r-- doc/source/index.rst 8
3 files changed, 37 insertions, 25 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 3c807d5..25fa1c4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,8 +11,12 @@ Python 2 was revision 605.
to make it possible to treat ruby text specially in get_text() calls.
[bug=1941980]
-* Fixed a crash when overriding multi_valued_attributes and using the
- html5lib parser. [bug=1948488]
+* If the charset-normalizer Python module
+ (https://pypi.org/project/charset-normalizer/) is installed, Beautiful
+ Soup will use it to detect the character sets of incoming documents.
+ This is also the module used by newer versions of the Requests library.
+ For the sake of backwards compatibility, chardet and cchardet both take
+ precedence if installed. [bug=1955346]
* Added a workaround for an lxml bug
(https://bugs.launchpad.net/lxml/+bug/1948551) that causes
@@ -33,9 +37,12 @@ Python 2 was revision 605.
version. "text" still works, but will give a DeprecationWarning.
[bug=1947038]
-* Fix a crash when pickling a BeautifulSoup object that has no
+* Fixed a crash when pickling a BeautifulSoup object that has no
tree builder. [bug=1934003]
+* Fixed a crash when overriding multi_valued_attributes and using the
+ html5lib parser. [bug=1948488]
+
= 4.10.0 (20210907)
* This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/dammit.py b/bs4/dammit.py
index e017408..de016ca 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -16,32 +16,37 @@ import re
import logging
import string
-# Import a library to autodetect character encodings.
-chardet_type = None
+# Import a library to autodetect character encodings. We'll support
+# any of a number of libraries that all support the same API:
+#
+# * cchardet
+# * chardet
+# * charset-normalizer
+chardet_module = None
try:
- # First try the fast C implementation.
# PyPI package: cchardet
- import cchardet
- def chardet_dammit(s):
- if isinstance(s, str):
- return None
- return cchardet.detect(s)['encoding']
+ import cchardet as chardet_module
except ImportError:
try:
- # Fall back to the pure Python implementation
# Debian package: python-chardet
# PyPI package: chardet
- import chardet
- def chardet_dammit(s):
- if isinstance(s, str):
- return None
- return chardet.detect(s)['encoding']
- #import chardet.constants
- #chardet.constants._debug = 1
+ import chardet as chardet_module
except ImportError:
- # No chardet available.
- def chardet_dammit(s):
+ try:
+ # PyPI package: charset-normalizer
+ import charset_normalizer as chardet_module
+ except ImportError:
+ # No chardet available.
+ chardet_module = None
+
+if chardet_module:
+ def chardet_dammit(s):
+ if isinstance(s, str):
return None
+ return chardet_module.detect(s)['encoding']
+else:
+ def chardet_dammit(s):
+ return None
# Available from http://cjkpython.i18n.org/.
#
diff --git a/doc/source/index.rst b/doc/source/index.rst
index d81fccd..66bd03e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2676,10 +2676,10 @@ become Unicode::
# 'utf-8'
Unicode, Dammit's guesses will get a lot more accurate if you install
-the ``chardet`` or ``cchardet`` Python libraries. The more data you
-give Unicode, Dammit, the more accurately it will guess. If you have
-your own suspicions as to what the encoding might be, you can pass
-them in as a list::
+one of these Python libraries: ``charset-normalizer``, ``chardet``, or
+``cchardet``. The more data you give Unicode, Dammit, the more
+accurately it will guess. If you have your own suspicions as to what
+the encoding might be, you can pass them in as a list::
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)