diff options
author | Leonard Richardson <leonardr@segfault.org> | 2012-07-03 17:25:28 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2012-07-03 17:25:28 -0400 |
commit | f0102682ece130382500f0ee58fbc3340f221d54 (patch) | |
tree | c00c22affc4889f062fe93c16fd9ea9399bf9c42 | |
parent | ef51996386270f8bc3d7b4e4272d8b117b4f41af (diff) |
When sniffing encodings, if the cchardet library is installed, use it instead of chardet. It's much faster. [bug=1020748]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/dammit.py | 32 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 12 |
3 files changed, 32 insertions, 15 deletions
@@ -1,5 +1,8 @@
 = 4.1.2 (Unreleased) =
 
+* When sniffing encodings, if the cchardet library is installed, use
+  it instead of chardet. It's much faster. [bug=1020748]
+
 * Use logging.warning() instead of warning.warn() to notify the user
   that characters were replaced with REPLACEMENT CHARACTER.
   [bug=1013862]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 67ce66c..39ea9ee 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -12,16 +12,28 @@
 from htmlentitydefs import codepoint2name
 import re
 import logging
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
 try:
-    import chardet
-    #import chardet.constants
-    #chardet.constants._debug = 1
+    # First try the fast C implementation.
+    # PyPI package: cchardet
+    import cchardet
+    def chardet_dammit(s):
+        return cchardet.detect(s).lower()
 except ImportError:
-    chardet = None
+    try:
+        # Fall back to the pure Python implementation
+        # Debian package: python-chardet
+        # PyPI package: chardet
+        import chardet
+        #import chardet.constants
+        #chardet.constants._debug = 1
+        def chardet_dammit(s):
+            return chardet.detect(s)['encoding']
+    except ImportError:
+        # No chardet available.
+        def chardet_dammit(s):
+            return None
 
 # Available from http://cjkpython.i18n.org/.
 try:
@@ -207,8 +219,8 @@ class UnicodeDammit:
                     break
 
         # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convert_from(chardet.detect(self.markup)['encoding'])
+        if not u and not isinstance(self.markup, unicode):
+            u = self._convert_from(chardet_dammit(self.markup))
 
         # As a last resort, try utf-8 and windows-1252:
         if not u:
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 4b5bab8..0d04558 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -138,12 +138,12 @@ class TestEncodingConversion(SoupTest):
 
     def setUp(self):
         super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
         self.utf8_data = self.unicode_data.encode("utf-8")
         # Just so you know what it looks like.
         self.assertEqual(
             self.utf8_data,
-            b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
 
     def test_ascii_in_unicode_out(self):
         # ASCII input is converted to Unicode. The original_encoding
@@ -262,10 +262,12 @@ class TestUnicodeDammit(unittest.TestCase):
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet
+        chardet = bs4.dammit.chardet_dammit
         logging.disable(logging.WARNING)
         try:
-            bs4.dammit.chardet = None
+            def noop(str):
+                return None
+            bs4.dammit.chardet_dammit = noop
             dammit = UnicodeDammit(doc)
             self.assertEqual(True, dammit.contains_replacement_characters)
             self.assertTrue(u"\ufffd" in dammit.unicode_markup)
@@ -274,7 +276,7 @@ class TestUnicodeDammit(unittest.TestCase):
             self.assertTrue(soup.contains_replacement_characters)
         finally:
             logging.disable(logging.NOTSET)
-            bs4.dammit.chardet = chardet
+            bs4.dammit.chardet_dammit = chardet
 
     def test_sniffed_xml_encoding(self):
         # A document written in UTF-16LE will be converted by a different