summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/tests/test_soup.py 22
2 files changed, 22 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1c32f28..0d50c80 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,9 @@
* Renamed Tag.nsprefix to Tag.prefix, for consistency with
NamespacedAttribute.
+* Fixed a test failure that occurred on Python 3.x when chardet was
+ installed.
+
= 4.0.0b8 (20110224) =
* All tree builders now preserve namespace information in the
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 33ab0fa..d8584b7 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -8,9 +8,18 @@ from bs4.element import (
NamespacedAttribute,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
-from bs4.testing import SoupTest
+from bs4.testing import (
+ SoupTest,
+ skipIf,
+)
import warnings
+try:
+ import chardet
+ CHARDET_PRESENT = True
+except ImportError, e:
+ CHARDET_PRESENT = False
+
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -212,16 +221,23 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(
"euc-jp", dammit.original_encoding)
+ @skipIf(
+ CHARDET_PRESENT,
+ "Not testing last-ditch entity replacement because chardet is present and will find an encoding.")
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
- # completely incompatible with UTF-8 (encoded with some other
+ # completely incompatible with UTF-8 (i.e. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
-
+ #
+ # If chardet is installed, it will detect that the document
+ # can be converted into ISO-8859-1 without errors. This happens
+ # to be the wrong encoding, but it is a consistent encoding, so the
+ # code we're testing here won't run.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""