summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2022-05-15 15:08:17 -0400
committer Leonard Richardson <leonardr@segfault.org> 2022-05-15 15:08:17 -0400
commit d4189dccc847d87eed573c7ab2948db588aed4c0 (patch)
tree 53987f4962660c1dcd7cb2b07578110be6c09e3b
parent 92b50fc6ae57038a09e9cace20604613b1b345e8 (diff)
Fixed a test failure when cchardet is not installed but
charset_normalizer is. [bug=1973072]
-rw-r--r-- CHANGELOG 3
-rw-r--r-- bs4/tests/test_dammit.py 67
2 files changed, 36 insertions, 34 deletions
diff --git a/CHANGELOG b/CHANGELOG
index b1dbf7d..ac7882b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,9 @@ Python 2 was revision 605.
= Unreleased
+* Fixed a test failure when cchardet is not installed but
+ charset_normalizer is. [bug=1973072]
+
* Fixed another crash when overriding multi_valued_attributes and using the
html5lib parser. [bug=1948488]
diff --git a/bs4/tests/test_dammit.py b/bs4/tests/test_dammit.py
index 9971234..9aad0ac 100644
--- a/bs4/tests/test_dammit.py
+++ b/bs4/tests/test_dammit.py
@@ -17,26 +17,24 @@ class TestUnicodeDammit(object):
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == markup
- def test_smart_quotes_to_unicode(self):
+ @pytest.mark.parametrize(
+ "smart_quotes_to,expect_converted",
+ [(None, "\u2018\u2019\u201c\u201d"),
+ ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
+ ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
+ ("ascii", "''" + '""'),
+ ]
+ )
+ def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
+ """Verify the functionality of the smart_quotes_to argument
+ to the UnicodeDammit constructor."""
markup = b"<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup)
- assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
-
- def test_smart_quotes_to_xml_entities(self):
- markup = b"<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup, smart_quotes_to="xml")
- assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
-
- def test_smart_quotes_to_html_entities(self):
- markup = b"<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup, smart_quotes_to="html")
- assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
-
- def test_smart_quotes_to_ascii(self):
- markup = b"<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
- assert dammit.unicode_markup == """<foo>''""</foo>"""
-
+ converted = UnicodeDammit(
+ markup, known_definite_encodings=["windows-1252"],
+ smart_quotes_to=smart_quotes_to
+ ).unicode_markup
+ assert converted == "<foo>{}</foo>".format(expect_converted)
+
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
@@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
def setup_method(self):
self.sub = EntitySubstitution
- def test_simple_html_substitution(self):
- # Unicode characters corresponding to named HTML entites
- # are substituted, and no others.
- s = "foo\u2200\N{SNOWMAN}\u00f5bar"
- assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
-
- def test_smart_quote_substitution(self):
- # MS smart quotes are a common source of frustration, so we
- # give them a special test.
- quotes = b"\x91\x92foo\x93\x94"
- dammit = UnicodeDammit(quotes)
- assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
+ @pytest.mark.parametrize(
+ "original,substituted",
+ [
+ # Basic case. Unicode characters corresponding to named
+ # HTML entites are substituted; others are not.
+ ("foo\u2200\N{SNOWMAN}\u00f5bar",
+ "foo&forall;\N{SNOWMAN}&otilde;bar"),
+
+ # MS smart quotes are a common source of frustration, so we
+ # give them a special test.
+ ('‘’foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
+ ]
+ )
+ def test_substitute_html(self, original, substituted):
+ assert self.sub.substitute_html(original) == substituted
+
def test_html5_entity(self):
- # Some HTML5 entities correspond to single- or multi-character
- # Unicode sequences.
-
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them