author     Leonard Richardson <leonardr@segfault.org>   2021-02-13 11:51:13 -0500
committer  Leonard Richardson <leonardr@segfault.org>   2021-02-13 11:51:13 -0500
commit     8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree       b0ded4fe88e1c10883d13d0c2000bd9f9374f53e /bs4/tests/test_soup.py
parent     4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
Added a second way to specify encodings to UnicodeDammit and
EncodingDetector, based on the order of precedence defined in the
HTML5 spec, starting at:
https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
Encodings in 'known_definite_encodings' are tried first, then
byte-order-mark sniffing is run, then encodings in 'user_encodings'
are tried. The old argument, 'override_encodings', is now a
deprecated alias for 'known_definite_encodings'.
This changes the default behavior of the html.parser and lxml tree
builders, in a way that may slightly improve encoding
detection but will probably have no effect. [bug=1889014]
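
As a quick illustration of the precedence order described above — a minimal sketch, not taken from this commit's diff, assuming only the keyword arguments the commit message names (`known_definite_encodings`, `user_encodings`, and the deprecated alias `override_encodings`) and an illustrative byte string:

```python
from bs4.dammit import UnicodeDammit

# Bytes that decode cleanly as ISO-8859-1 but are not valid UTF-8.
data = u"Sacr\xe9 bleu!".encode("iso-8859-1")

# Encodings the caller knows for certain are tried first; encodings that
# merely come from user input are tried only after byte-order-mark sniffing.
dammit = UnicodeDammit(
    data,
    known_definite_encodings=["iso-8859-1"],  # tried before BOM sniffing
    user_encodings=["utf-8"],                 # tried after BOM sniffing
)
print(dammit.original_encoding)  # expected: 'iso-8859-1'

# The old argument is still accepted as a deprecated alias
# for known_definite_encodings.
legacy = UnicodeDammit(data, override_encodings=["iso-8859-1"])
```

Here the first known definite encoding decodes the input cleanly, so byte-order-mark sniffing and the user encodings are never consulted.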
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r--  bs4/tests/test_soup.py  220
1 file changed, 0 insertions, 220 deletions
```diff
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 0603ce7..ddb6446 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -32,7 +32,6 @@ import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
     UnicodeDammit,
-    EncodingDetector,
 )
 from bs4.testing import (
     default_builder,
@@ -478,226 +477,7 @@ class TestEncodingConversion(SoupTest):
         markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
         self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 
-class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of UnicodeDammit."""
-
-    def test_unicode_input(self):
-        markup = u"I'm already Unicode! \N{SNOWMAN}"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(dammit.unicode_markup, markup)
-
-    def test_smart_quotes_to_unicode(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(
-            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        self.assertEqual(
-            dammit.unicode_markup, """<foo>''""</foo>""")
-
-    def test_detect_utf8(self):
-        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
-        dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
-
-
-    def test_convert_hebrew(self):
-        hebrew = b"\xed\xe5\xec\xf9"
-        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
-        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
-
-    def test_dont_see_smart_quotes_where_there_are_none(self):
-        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
-        dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
-
-    def test_ignore_inappropriate_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
-        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_ignore_invalid_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
-        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
-            dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_exclude_encodings(self):
-        # This is UTF-8.
-        utf8_data = u"Räksmörgås".encode("utf-8")
-
-        # But if we exclude UTF-8 from consideration, the guess is
-        # Windows-1252.
-        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
-
-        # And if we exclude that, there is no valid guess at all.
-        dammit = UnicodeDammit(
-            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
-        self.assertEqual(dammit.original_encoding, None)
-
-    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
-        detected = EncodingDetector(
-            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
-        encodings = list(detected.encodings)
-        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
-
-    def test_detect_html5_style_meta_tag(self):
-
-        for data in (
-            b'<html><meta charset="euc-jp" /></html>',
-            b"<html><meta charset='euc-jp' /></html>",
-            b"<html><meta charset=euc-jp /></html>",
-            b"<html><meta charset=euc-jp/></html>"):
-            dammit = UnicodeDammit(data, is_html=True)
-            self.assertEqual(
-                "euc-jp", dammit.original_encoding)
-
-    def test_last_ditch_entity_replacement(self):
-        # This is a UTF-8 document that contains bytestrings
-        # completely incompatible with UTF-8 (ie. encoded with some other
-        # encoding).
-        #
-        # Since there is no consistent encoding for the document,
-        # Unicode, Dammit will eventually encode the document as UTF-8
-        # and encode the incompatible characters as REPLACEMENT
-        # CHARACTER.
-        #
-        # If chardet is installed, it will detect that the document
-        # can be converted into ISO-8859-1 without errors. This happens
-        # to be the wrong encoding, but it is a consistent encoding, so the
-        # code we're testing here won't run.
-        #
-        # So we temporarily disable chardet if it's present.
-        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
-<html><b>\330\250\330\252\330\261</b>
-<i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet_dammit
-        logging.disable(logging.WARNING)
-        try:
-            def noop(str):
-                return None
-            bs4.dammit.chardet_dammit = noop
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-        finally:
-            logging.disable(logging.NOTSET)
-            bs4.dammit.chardet_dammit = chardet
-
-    def test_byte_order_mark_removed(self):
-        # A document written in UTF-16LE will have its byte order marker stripped.
-        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
-        dammit = UnicodeDammit(data)
-        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
-        self.assertEqual("utf-16le", dammit.original_encoding)
-
-    def test_detwingle(self):
-        # Here's a UTF8 document.
-        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
-
-        # Here's a Windows-1252 document.
-        windows_1252 = (
-            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
-
-        # Through some unholy alchemy, they've been stuck together.
-        doc = utf8 + windows_1252 + utf8
-
-        # The document can't be turned into UTF-8:
-        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
-
-        # Unicode, Dammit thinks the whole document is Windows-1252,
-        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
-
-        # But if we run it through fix_embedded_windows_1252, it's fixed:
-
-        fixed = UnicodeDammit.detwingle(doc)
-        self.assertEqual(
-            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
-
-    def test_detwingle_ignores_multibyte_characters(self):
-        # Each of these characters has a UTF-8 representation ending
-        # in \x93. \x93 is a smart quote if interpreted as
-        # Windows-1252. But our code knows to skip over multibyte
-        # UTF-8 characters, so they'll survive the process unscathed.
-        for tricky_unicode_char in (
-            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
-            ):
-            input = tricky_unicode_char.encode("utf8")
-            self.assertTrue(input.endswith(b'\x93'))
-            output = UnicodeDammit.detwingle(input)
-            self.assertEqual(output, input)
-
-    def test_find_declared_encoding(self):
-        # Test our ability to find a declared encoding inside an
-        # XML or HTML document.
-        #
-        # Even if the document comes in as Unicode, it may be
-        # interesting to know what encoding was claimed
-        # originally.
-
-        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
-        html_bytes = html_unicode.encode("ascii")
-
-        xml_unicode= u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
-        xml_bytes = xml_unicode.encode("ascii")
-
-        m = EncodingDetector.find_declared_encoding
-        self.assertEquals(None, m(html_unicode, is_html=False))
-        self.assertEquals("utf-8", m(html_unicode, is_html=True))
-        self.assertEquals("utf-8", m(html_bytes, is_html=True))
-
-        self.assertEquals("iso-8859-1", m(xml_unicode))
-        self.assertEquals("iso-8859-1", m(xml_bytes))
-
-        # Normally, only the first few kilobytes of a document are checked for
-        # an encoding.
-        spacer = b' ' * 5000
-        self.assertEquals(None, m(spacer + html_bytes))
-        self.assertEquals(None, m(spacer + xml_bytes))
-
-        # But you can tell find_declared_encoding to search an entire
-        # HTML document.
-        self.assertEquals(
-            "utf-8",
-            m(spacer + html_bytes, is_html=True, search_entire_document=True)
-        )
-
-        # The XML encoding declaration has to be the very first thing
-        # in the document. We'll allow whitespace before the document
-        # starts, but nothing else.
-        self.assertEquals(
-            "iso-8859-1",
-            m(xml_bytes, search_entire_document=True)
-        )
-        self.assertEquals(
-            None, m(b'a' + xml_bytes, search_entire_document=True)
-        )
-
 
 class TestNamedspacedAttribute(SoupTest):
 
     def test_name_may_be_none_or_missing(self):
```