When sniffing encodings, if the cchardet library is installed, use it instead of chardet. It's much faster. [bug=1020748]

author: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
commit: f0102682ece130382500f0ee58fbc3340f221d54 (patch)
tree: c00c22affc4889f062fe93c16fd9ea9399bf9c42
parent: ef51996386270f8bc3d7b4e4272d8b117b4f41af (diff)
3 files changed, 32 insertions, 15 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 6b21baa..043d79e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.1.2 (Unreleased) =
 
+* When sniffing encodings, if the cchardet library is installed, use
+  it instead of chardet. It's much faster. [bug=1020748]
+
 * Use logging.warning() instead of warning.warn() to notify the user
   that characters were replaced with REPLACEMENT
   CHARACTER. [bug=1013862]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 67ce66c..39ea9ee 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -12,16 +12,28 @@ from htmlentitydefs import codepoint2name
 import re
 import logging
 
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-#  or 'apt-get install python-chardet'
-#  or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
 try:
-    import chardet
-    #import chardet.constants
-    #chardet.constants._debug = 1
+    # First try the fast C implementation.
+    #  PyPI package: cchardet
+    import cchardet
+    def chardet_dammit(s):
+        return cchardet.detect(s).lower()
 except ImportError:
-    chardet = None
+    try:
+        # Fall back to the pure Python implementation
+        #  Debian package: python-chardet
+        #  PyPI package: chardet
+        import chardet
+        #import chardet.constants
+        #chardet.constants._debug = 1
+        def chardet_dammit(s):
+            return chardet.detect(s)['encoding']
+    except ImportError:
+        # No chardet available.
+        def chardet_dammit(s):
+            return None
 
 # Available from http://cjkpython.i18n.org/.
 try:
@@ -207,8 +219,8 @@ class UnicodeDammit:
                         break
 
         # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convert_from(chardet.detect(self.markup)['encoding'])
+        if not u and not isinstance(self.markup, unicode):
+            u = self._convert_from(chardet_dammit(self.markup))
 
         # As a last resort, try utf-8 and windows-1252:
         if not u:
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 4b5bab8..0d04558 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -138,12 +138,12 @@ class TestEncodingConversion(SoupTest):
 
     def setUp(self):
         super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
         self.utf8_data = self.unicode_data.encode("utf-8")
         # Just so you know what it looks like.
         self.assertEqual(
             self.utf8_data,
-            b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
 
     def test_ascii_in_unicode_out(self):
         # ASCII input is converted to Unicode. The original_encoding
@@ -262,10 +262,12 @@ class TestUnicodeDammit(unittest.TestCase):
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet
+        chardet = bs4.dammit.chardet_dammit
         logging.disable(logging.WARNING)
         try:
-            bs4.dammit.chardet = None
+            def noop(str):
+                return None
+            bs4.dammit.chardet_dammit = noop
             dammit = UnicodeDammit(doc)
             self.assertEqual(True, dammit.contains_replacement_characters)
             self.assertTrue(u"\ufffd" in dammit.unicode_markup)
@@ -274,7 +276,7 @@ class TestUnicodeDammit(unittest.TestCase):
             self.assertTrue(soup.contains_replacement_characters)
         finally:
             logging.disable(logging.NOTSET)
-            bs4.dammit.chardet = chardet
+            bs4.dammit.chardet_dammit = chardet
 
     def test_sniffed_xml_encoding(self):
         # A document written in UTF-16LE will be converted by a different
author	Leonard Richardson <leonardr@segfault.org>	2012-07-03 17:25:28 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2012-07-03 17:25:28 -0400
commit	f0102682ece130382500f0ee58fbc3340f221d54 (patch)
tree	c00c22affc4889f062fe93c16fd9ea9399bf9c42
parent	ef51996386270f8bc3d7b4e4272d8b117b4f41af (diff)