diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-12-19 13:56:48 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-12-19 13:56:48 -0500 |
commit | 497bf714d07e491bc811abd64e5fe3391564998d (patch) | |
tree | fdc945b11e121fb21b8d65284d0db5a9ebc43d30 /bs4/dammit.py | |
parent | 854fd52ad616d8e9c0860bba2eb4ddd93eb2dc79 (diff) |
If the charset-normalizer Python module
(https://pypi.org/project/charset-normalizer/) is installed, Beautiful
Soup will use it to detect the character sets of incoming documents.
This is also the module used by newer versions of the Requests library.
For the sake of backwards compatibility, chardet and cchardet both take
precedence if installed. [bug=1955346]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 41 |
1 file changed, 23 insertions, 18 deletions
```diff
diff --git a/bs4/dammit.py b/bs4/dammit.py
index e017408..de016ca 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -16,32 +16,37 @@ import re
 import logging
 import string
 
-# Import a library to autodetect character encodings.
-chardet_type = None
+# Import a library to autodetect character encodings. We'll support
+# any of a number of libraries that all support the same API:
+#
+# * cchardet
+# * chardet
+# * charset-normalizer
+chardet_module = None
 try:
-    # First try the fast C implementation.
     # PyPI package: cchardet
-    import cchardet
-    def chardet_dammit(s):
-        if isinstance(s, str):
-            return None
-        return cchardet.detect(s)['encoding']
+    import cchardet as chardet_module
 except ImportError:
     try:
-        # Fall back to the pure Python implementation
         # Debian package: python-chardet
         # PyPI package: chardet
-        import chardet
-        def chardet_dammit(s):
-            if isinstance(s, str):
-                return None
-            return chardet.detect(s)['encoding']
-        #import chardet.constants
-        #chardet.constants._debug = 1
+        import chardet as chardet_module
     except ImportError:
-        # No chardet available.
-        def chardet_dammit(s):
+        try:
+            # PyPI package: charset-normalizer
+            import charset_normalizer as chardet_module
+        except ImportError:
+            # No chardet available.
+            chardet_module = None
+
+if chardet_module:
+    def chardet_dammit(s):
+        if isinstance(s, str):
             return None
+        return chardet_module.detect(s)['encoding']
+else:
+    def chardet_dammit(s):
+        return None
 
 # Available from http://cjkpython.i18n.org/.
 #
```