diff options
-rw-r--r-- | CHANGELOG | 13 | ||||
-rw-r--r-- | bs4/dammit.py | 41 | ||||
-rw-r--r-- | doc/source/index.rst | 8 |
3 files changed, 37 insertions, 25 deletions
@@ -11,8 +11,12 @@ Python 2 was revision 605.
   to make it possible to treat ruby text specially in get_text()
   calls. [bug=1941980]
 
-* Fixed a crash when overriding multi_valued_attributes and using the
-  html5lib parser. [bug=1948488]
+* If the charset-normalizer Python module
+  (https://pypi.org/project/charset-normalizer/) is installed, Beautiful
+  Soup will use it to detect the character sets of incoming documents.
+  This is also the module used by newer versions of the Requests library.
+  For the sake of backwards compatibility, chardet and cchardet both take
+  precedence if installed. [bug=1955346]
 
 * Added a workaround for an lxml bug
   (https://bugs.launchpad.net/lxml/+bug/1948551) that causes
@@ -33,9 +37,12 @@ Python 2 was revision 605.
   version. "text" still works, but will give a DeprecationWarning.
   [bug=1947038]
 
-* Fix a crash when pickling a BeautifulSoup object that has no
+* Fixed a crash when pickling a BeautifulSoup object that has no
   tree builder. [bug=1934003]
 
+* Fixed a crash when overriding multi_valued_attributes and using the
+  html5lib parser. [bug=1948488]
+
 = 4.10.0 (20210907)
 
 * This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/dammit.py b/bs4/dammit.py
index e017408..de016ca 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -16,32 +16,37 @@ import re
 import logging
 import string
 
-# Import a library to autodetect character encodings.
-chardet_type = None
+# Import a library to autodetect character encodings. We'll support
+# any of a number of libraries that all support the same API:
+#
+# * cchardet
+# * chardet
+# * charset-normalizer
+chardet_module = None
 try:
-    # First try the fast C implementation.
     # PyPI package: cchardet
-    import cchardet
-    def chardet_dammit(s):
-        if isinstance(s, str):
-            return None
-        return cchardet.detect(s)['encoding']
+    import cchardet as chardet_module
 except ImportError:
     try:
-        # Fall back to the pure Python implementation
         # Debian package: python-chardet
         # PyPI package: chardet
-        import chardet
-        def chardet_dammit(s):
-            if isinstance(s, str):
-                return None
-            return chardet.detect(s)['encoding']
-        #import chardet.constants
-        #chardet.constants._debug = 1
+        import chardet as chardet_module
     except ImportError:
-        # No chardet available.
-        def chardet_dammit(s):
+        try:
+            # PyPI package: charset-normalizer
+            import charset_normalizer as chardet_module
+        except ImportError:
+            # No chardet available.
+            chardet_module = None
+
+if chardet_module:
+    def chardet_dammit(s):
+        if isinstance(s, str):
             return None
+        return chardet_module.detect(s)['encoding']
+else:
+    def chardet_dammit(s):
+        return None
 
 # Available from http://cjkpython.i18n.org/.
 #
diff --git a/doc/source/index.rst b/doc/source/index.rst
index d81fccd..66bd03e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2676,10 +2676,10 @@ become Unicode::
  # 'utf-8'
 
 Unicode, Dammit's guesses will get a lot more accurate if you install
-the ``chardet`` or ``cchardet`` Python libraries. The more data you
-give Unicode, Dammit, the more accurately it will guess. If you have
-your own suspicions as to what the encoding might be, you can pass
-them in as a list::
+one of these Python libraries: ``charset-normalizer``, ``chardet``, or
+``cchardet``. The more data you give Unicode, Dammit, the more
+accurately it will guess. If you have your own suspicions as to what
+the encoding might be, you can pass them in as a list::
 
  dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
  print(dammit.unicode_markup)