summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2012-07-03 17:25:28 -0400
commit: f0102682ece130382500f0ee58fbc3340f221d54 (patch)
tree: c00c22affc4889f062fe93c16fd9ea9399bf9c42
parent: ef51996386270f8bc3d7b4e4272d8b117b4f41af (diff)
When sniffing encodings, if the cchardet library is installed, use it instead of chardet. It's much faster. [bug=1020748]
-rw-r--r--  NEWS.txt  3
-rw-r--r--  bs4/dammit.py  32
-rw-r--r--  bs4/tests/test_soup.py  12
3 files changed, 32 insertions, 15 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 6b21baa..043d79e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= 4.1.2 (Unreleased) =
+* When sniffing encodings, if the cchardet library is installed, use
+ it instead of chardet. It's much faster. [bug=1020748]
+
* Use logging.warning() instead of warning.warn() to notify the user
that characters were replaced with REPLACEMENT
CHARACTER. [bug=1013862]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 67ce66c..39ea9ee 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -12,16 +12,28 @@ from htmlentitydefs import codepoint2name
import re
import logging
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
try:
- import chardet
- #import chardet.constants
- #chardet.constants._debug = 1
+ # First try the fast C implementation.
+ # PyPI package: cchardet
+ import cchardet
+ def chardet_dammit(s):
+ return cchardet.detect(s).lower()
except ImportError:
- chardet = None
+ try:
+ # Fall back to the pure Python implementation
+ # Debian package: python-chardet
+ # PyPI package: chardet
+ import chardet
+ #import chardet.constants
+ #chardet.constants._debug = 1
+ def chardet_dammit(s):
+ return chardet.detect(s)['encoding']
+ except ImportError:
+ # No chardet available.
+ def chardet_dammit(s):
+ return None
# Available from http://cjkpython.i18n.org/.
try:
@@ -207,8 +219,8 @@ class UnicodeDammit:
break
# If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
+ if not u and not isinstance(self.markup, unicode):
+ u = self._convert_from(chardet_dammit(self.markup))
# As a last resort, try utf-8 and windows-1252:
if not u:
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 4b5bab8..0d04558 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -138,12 +138,12 @@ class TestEncodingConversion(SoupTest):
def setUp(self):
super(TestEncodingConversion, self).setUp()
- self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+ self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
- b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+ b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
@@ -262,10 +262,12 @@ class TestUnicodeDammit(unittest.TestCase):
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
- chardet = bs4.dammit.chardet
+ chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
- bs4.dammit.chardet = None
+ def noop(str):
+ return None
+ bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
@@ -274,7 +276,7 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
- bs4.dammit.chardet = chardet
+ bs4.dammit.chardet_dammit = chardet
def test_sniffed_xml_encoding(self):
# A document written in UTF-16LE will be converted by a different