diff options
author | Leonard Richardson <leonardr@segfault.org> | 2012-07-03 17:59:25 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2012-07-03 17:59:25 -0400 |
commit | 96eaf6e8f54d84b02e0c3c8c334e7cfd29ef343c (patch) | |
tree | 7896fbad9bee2fac1f8e89df3d9235cfd4945e40 | |
parent | f0102682ece130382500f0ee58fbc3340f221d54 (diff) |
Mentioned cchardet in docs.
-rw-r--r-- | bs4/dammit.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 12 | ||||
-rw-r--r-- | doc/source/index.rst | 10 |
3 files changed, 14 insertions, 10 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index 39ea9ee..b8828cb 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -19,7 +19,7 @@ try: # PyPI package: cchardet import cchardet def chardet_dammit(s): - return cchardet.detect(s).lower() + return cchardet.detect(s) except ImportError: try: # Fall back to the pure Python implementation diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 0d04558..a10a89e 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -153,7 +153,7 @@ class TestEncodingConversion(SoupTest): unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding, "ascii") + self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute @@ -207,30 +207,30 @@ class TestUnicodeDammit(unittest.TestCase): utf8 = b"\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.unicode_markup, u'\xe9') - self.assertEqual(dammit.original_encoding, 'utf-8') + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding, 'iso-8859-8') + self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) - self.assertEqual(dammit.original_encoding, 'utf-8') + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding, 'utf-8') + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) - self.assertEqual(dammit.original_encoding, 'utf-8') + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_detect_html5_style_meta_tag(self): diff --git a/doc/source/index.rst b/doc/source/index.rst index e5e3fbc..7c4b847 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2685,14 +2685,18 @@ you're not using lxml as the underlying parser, my advice is to :ref:`start <parser-installation>`. Beautiful Soup parses documents significantly faster using lxml than using html.parser or html5lib. +You can speed up encoding detection significantly by installing the +`cchardet <http://pypi.python.org/pypi/cchardet/>`_ library. + Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by doing a byte-by-byte examination of the file. This slows Beautiful Soup to a crawl. My tests indicate that this only happened on 2.x versions of Python, and that it happened most often with documents using Russian or Chinese encodings. If this is happening to you, you -can fix it by using Python 3 for your script. Or, if you happen to -know a document's encoding, you can pass it into the -``BeautifulSoup`` constructor as ``from_encoding``. +can fix it by installing cchardet, or by using Python 3 for your +script. If you happen to know a document's encoding, you can pass +it into the ``BeautifulSoup`` constructor as ``from_encoding``, and +bypass encoding detection altogether. `Parsing only part of a document`_ won't save you much time parsing the document, but it can save a lot of memory, and it'll make