summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
commit: f0102682ece130382500f0ee58fbc3340f221d54 (patch)
tree: c00c22affc4889f062fe93c16fd9ea9399bf9c42 /bs4/dammit.py
parent: ef51996386270f8bc3d7b4e4272d8b117b4f41af (diff)
When sniffing encodings, if the cchardet library is installed, use it instead of chardet. It's much faster. [bug=1020748]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 32
1 files changed, 22 insertions, 10 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 67ce66c..39ea9ee 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -12,16 +12,28 @@ from htmlentitydefs import codepoint2name
 import re
 import logging

-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
 try:
-    import chardet
-    #import chardet.constants
-    #chardet.constants._debug = 1
+    # First try the fast C implementation.
+    # PyPI package: cchardet
+    import cchardet
+    def chardet_dammit(s):
+        return cchardet.detect(s).lower()
 except ImportError:
-    chardet = None
+    try:
+        # Fall back to the pure Python implementation
+        # Debian package: python-chardet
+        # PyPI package: chardet
+        import chardet
+        #import chardet.constants
+        #chardet.constants._debug = 1
+        def chardet_dammit(s):
+            return chardet.detect(s)['encoding']
+    except ImportError:
+        # No chardet available.
+        def chardet_dammit(s):
+            return None

 # Available from http://cjkpython.i18n.org/.
 try:
@@ -207,8 +219,8 @@ class UnicodeDammit:
break
# If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
+ if not u and not isinstance(self.markup, unicode):
+ u = self._convert_from(chardet_dammit(self.markup))
# As a last resort, try utf-8 and windows-1252:
if not u: