diff options
author | Leonard Richardson <leonardr@segfault.org> | 2022-05-15 15:08:17 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2022-05-15 15:08:17 -0400 |
commit | d4189dccc847d87eed573c7ab2948db588aed4c0 (patch) | |
tree | 53987f4962660c1dcd7cb2b07578110be6c09e3b | |
parent | 92b50fc6ae57038a09e9cace20604613b1b345e8 (diff) |
Fixed a test failure when cchardet is not installed but
charset_normalizer is. [bug=1973072]
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/tests/test_dammit.py | 67 |
2 files changed, 36 insertions, 34 deletions
@@ -5,6 +5,9 @@ Python 2 was revision 605.
 
 = Unreleased
 
+* Fixed a test failure when cchardet is not installed but
+  charset_normalizer is. [bug=1973072]
+
 * Fixed another crash when overriding multi_valued_attributes and using the
   html5lib parser. [bug=1948488]
 
diff --git a/bs4/tests/test_dammit.py b/bs4/tests/test_dammit.py
index 9971234..9aad0ac 100644
--- a/bs4/tests/test_dammit.py
+++ b/bs4/tests/test_dammit.py
@@ -17,26 +17,24 @@ class TestUnicodeDammit(object):
         dammit = UnicodeDammit(markup)
         assert dammit.unicode_markup == markup
 
-    def test_smart_quotes_to_unicode(self):
+    @pytest.mark.parametrize(
+        "smart_quotes_to,expect_converted",
+        [(None, "\u2018\u2019\u201c\u201d"),
+         ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
+         ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
+         ("ascii", "''" + '""'),
+        ]
+    )
+    def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
+        """Verify the functionality of the smart_quotes_to argument
+        to the UnicodeDammit constructor."""
         markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        assert dammit.unicode_markup == """<foo>''""</foo>"""
-
+        converted = UnicodeDammit(
+            markup, known_definite_encodings=["windows-1252"],
+            smart_quotes_to=smart_quotes_to
+        ).unicode_markup
+        assert converted == "<foo>{}</foo>".format(expect_converted)
+
     def test_detect_utf8(self):
         utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
         dammit = UnicodeDammit(utf8)
@@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
     def setup_method(self):
         self.sub = EntitySubstitution
 
-    def test_simple_html_substitution(self):
-        # Unicode characters corresponding to named HTML entites
-        # are substituted, and no others.
-        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
-        assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
-
-    def test_smart_quote_substitution(self):
-        # MS smart quotes are a common source of frustration, so we
-        # give them a special test.
-        quotes = b"\x91\x92foo\x93\x94"
-        dammit = UnicodeDammit(quotes)
-        assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
+    @pytest.mark.parametrize(
+        "original,substituted",
+        [
+            # Basic case. Unicode characters corresponding to named
+            # HTML entites are substituted; others are not.
+            ("foo\u2200\N{SNOWMAN}\u00f5bar",
+             "foo&forall;\N{SNOWMAN}&otilde;bar"),
+
+            # MS smart quotes are a common source of frustration, so we
+            # give them a special test.
+            ('‘’foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
+        ]
+    )
+    def test_substitute_html(self, original, substituted):
+        assert self.sub.substitute_html(original) == substituted
+
 
     def test_html5_entity(self):
-        # Some HTML5 entities correspond to single- or multi-character
-        # Unicode sequences.
-
         for entity, u in (
             # A few spot checks of our ability to recognize
             # special character sequences and convert them