author     Leonard Richardson <leonardr@segfault.org>    2021-02-13 11:51:13 -0500
committer  Leonard Richardson <leonardr@segfault.org>    2021-02-13 11:51:13 -0500
commit     8f763297abc8bb598c3aca25eccaef6db7f7c987 (patch)
tree       b0ded4fe88e1c10883d13d0c2000bd9f9374f53e
parent     4d8d9af1c841d1eec0e9e838a467579831268b8b (diff)
Added a second way to specify encodings to UnicodeDammit and
EncodingDetector, based on the order of precedence defined in the
HTML5 spec, starting at:
https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
Encodings in 'known_definite_encodings' are tried first, then
byte-order-mark sniffing is run, then encodings in 'user_encodings'
are tried. The old argument, 'override_encodings', is now a
deprecated alias for 'known_definite_encodings'.
This changes the default behavior of the html.parser and lxml tree
builders, in a way that may slightly improve encoding
detection but will probably have no effect. [bug=1889014]
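
The new precedence is easiest to see in miniature. The sketch below is
illustrative only, not part of the patch; it mirrors
test_known_definite_versus_user_encodings from the new
bs4/tests/test_dammit.py, using a UTF-16LE document with a byte-order
mark:

    from bs4.dammit import UnicodeDammit

    # A document in UTF-16LE, complete with a byte-order mark.
    data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'

    # A known definite encoding is tried *before* byte-order-mark
    # sniffing, so "utf-16" wins here.
    before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
    assert before.original_encoding == "utf-16"

    # A user encoding is only tried *after* sniffing, so the sniffed
    # "utf-16le" takes precedence and "utf-8" is never tried.
    after = UnicodeDammit(data, user_encodings=["utf-8"])
    assert after.original_encoding == "utf-16le"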
-rw-r--r--  CHANGELOG                  |  14
-rw-r--r--  bs4/builder/__init__.py    |   3
-rw-r--r--  bs4/builder/_htmlparser.py |  19
-rw-r--r--  bs4/builder/_lxml.py       |  14
-rw-r--r--  bs4/dammit.py              |  98
-rw-r--r--  bs4/tests/test_dammit.py   | 286
-rw-r--r--  bs4/tests/test_soup.py     | 220
7 files changed, 412 insertions, 242 deletions
@@ -7,6 +7,20 @@
 * Performance improvement when processing tags that speeds up overall
   tree construction by 2%. Patch by Morotti. [bug=1899358]
 
+* Added a second way to specify encodings to UnicodeDammit and
+  EncodingDetector, based on the order of precedence defined in the
+  HTML5 spec, starting at:
+  https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+  Encodings in 'known_definite_encodings' are tried first, then
+  byte-order-mark sniffing is run, then encodings in 'user_encodings'
+  are tried. The old argument, 'override_encodings', is now a
+  deprecated alias for 'known_definite_encodings'.
+
+  This changes the default behavior of the html.parser and lxml tree
+  builders, in a way that may slightly improve encoding
+  detection but will probably have no effect. [bug=1889014]
+
 * Improve the warning issued when a directory name (as opposed to the
   name of a regular file) is passed as markup into the BeautifulSoup
   constructor. [bug=1913628]
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 03da4c6..b6e2c37 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -234,7 +234,8 @@ class TreeBuilder(object):
         :param markup: Some markup -- probably a bytestring.
         :param user_specified_encoding: The user asked to try this encoding.
         :param document_declared_encoding: The markup itself claims to be
-            in this encoding.
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
         :param exclude_encodings: The user asked _not_ to try any of these
             encodings.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 96a7b7d..2f2bf1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -359,9 +359,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
             return
 
         # Ask UnicodeDammit to sniff the most likely encoding.
+
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec. (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
+
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
-                               exclude_encodings=exclude_encodings)
+        dammit = UnicodeDammit(
+            markup,
+            known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings,
+            is_html=True,
+            exclude_encodings=exclude_encodings
+        )
         yield (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1b44d75..c670b84 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        try_encodings = [user_specified_encoding, document_declared_encoding]
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec. (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
 
         detector = EncodingDetector(
-            markup, try_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings, is_html=is_html,
+            exclude_encodings=exclude_encodings
+        )
         for encoding in detector.encodings:
             yield (detector.markup, encoding, document_declared_encoding, False)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 33f7b7d..7e0a7f8 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -228,32 +228,65 @@ class EncodingDetector:
 
     Order of precedence:
 
     1. Encodings you specifically tell EncodingDetector to try first
-    (the override_encodings argument to the constructor).
+    (the known_definite_encodings argument to the constructor).
 
-    2. An encoding declared within the bytestring itself, either in an
+    2. An encoding determined by sniffing the document's byte-order mark.
+
+    3. Encodings you specifically tell EncodingDetector to try if
+    byte-order mark sniffing fails (the user_encodings argument to the
+    constructor).
+
+    4. An encoding declared within the bytestring itself, either in an
     XML declaration (if the bytestring is to be interpreted as an XML
     document), or in a <meta> tag (if the bytestring is to be
     interpreted as an HTML document.)
 
-    3. An encoding detected through textual analysis by chardet,
+    5. An encoding detected through textual analysis by chardet,
     cchardet, or a similar external library.
 
-    4. UTF-8.
+    6. UTF-8.
 
-    5. Windows-1252.
+    7. Windows-1252.
+
     """
 
-    def __init__(self, markup, override_encodings=None, is_html=False,
-                 exclude_encodings=None):
+    def __init__(self, markup, known_definite_encodings=None,
+                 is_html=False, exclude_encodings=None,
+                 user_encodings=None, override_encodings=None):
         """Constructor.
 
         :param markup: Some markup in an unknown encoding.
-        :param override_encodings: These encodings will be tried first.
-        :param is_html: If True, this markup is considered to be HTML. Otherwise
-        it's assumed to be XML.
-        :param exclude_encodings: These encodings will not be tried, even
-        if they otherwise would be.
+
+        :param known_definite_encodings: When determining the encoding
+            of `markup`, these encodings will be tried first, in
+            order. In HTML terms, this corresponds to the "known
+            definite encoding" step defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+        :param user_encodings: These encodings will be tried after the
+            `known_definite_encodings` have been tried and failed, and
+            after an attempt to sniff the encoding by looking at a
+            byte order mark has failed. In HTML terms, this
+            corresponds to the step "user has explicitly instructed
+            the user agent to override the document's character
+            encoding", defined here:
+            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+        :param override_encodings: A deprecated alias for
+            known_definite_encodings. Any encodings here will be tried
+            immediately after the encodings in
+            known_definite_encodings.
+
+        :param is_html: If True, this markup is considered to be
+            HTML. Otherwise it's assumed to be XML.
+
+        :param exclude_encodings: These encodings will not be tried,
+            even if they otherwise would be.
+ """ - self.override_encodings = override_encodings or [] + self.known_definite_encodings = list(known_definite_encodings or []) + if override_encodings: + self.known_definite_encodings += override_encodings + self.user_encodings = user_encodings or [] exclude_encodings = exclude_encodings or [] self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None @@ -286,7 +319,9 @@ class EncodingDetector: :yield: A sequence of strings. """ tried = set() - for e in self.override_encodings: + + # First, try the known definite encodings + for e in self.known_definite_encodings: if self._usable(e, tried): yield e @@ -295,6 +330,12 @@ class EncodingDetector: if self._usable(self.sniffed_encoding, tried): yield self.sniffed_encoding + # Sniffing the byte-order mark did nothing; try the user + # encodings. + for e in self.user_encodings: + if self._usable(e, tried): + yield e + # Look within the document for an XML or HTML encoding # declaration. if self.declared_encoding is None: @@ -405,13 +446,33 @@ class UnicodeDammit: "iso-8859-2", ] - def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False, exclude_encodings=[]): + def __init__(self, markup, known_definite_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[], + user_encodings=None, override_encodings=None + ): """Constructor. :param markup: A bytestring representing markup in an unknown encoding. - :param override_encodings: These encodings will be tried first, - before any sniffing code is run. + + :param known_definite_encodings: When determining the encoding + of `markup`, these encodings will be tried first, in + order. In HTML terms, this corresponds to the "known + definite encoding" step defined here: + https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding + + :param user_encodings: These encodings will be tried after the + `known_definite_encodings` have been tried and failed, and + after an attempt to sniff the encoding by looking at a + byte order mark has failed. In HTML terms, this + corresponds to the step "user has explicitly instructed + the user agent to override the document's character + encoding", defined here: + https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding + + :param override_encodings: A deprecated alias for + known_definite_encodings. Any encodings here will be tried + immediately after the encodings in + known_definite_encodings. :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. @@ -421,6 +482,7 @@ class UnicodeDammit: it's assumed to be XML. :param exclude_encodings: These encodings will not be considered, even if the sniffing code thinks they might make sense. + """ self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] @@ -428,7 +490,9 @@ class UnicodeDammit: self.is_html = is_html self.log = logging.getLogger(__name__) self.detector = EncodingDetector( - markup, override_encodings, is_html, exclude_encodings) + markup, known_definite_encodings, is_html, exclude_encodings, + user_encodings, override_encodings + ) # Short-circuit if the data is in Unicode to begin with. 
         if isinstance(markup, unicode) or markup == '':
diff --git a/bs4/tests/test_dammit.py b/bs4/tests/test_dammit.py
new file mode 100644
index 0000000..b477df8
--- /dev/null
+++ b/bs4/tests/test_dammit.py
@@ -0,0 +1,286 @@
+# encoding: utf-8
+import logging
+import unittest
+import bs4
+from bs4 import BeautifulSoup
+from bs4.dammit import (
+    EncodingDetector,
+    UnicodeDammit,
+)
+
+class TestUnicodeDammit(unittest.TestCase):
+    """Standalone tests of UnicodeDammit."""
+
+    def test_unicode_input(self):
+        markup = u"I'm already Unicode! \N{SNOWMAN}"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(dammit.unicode_markup, markup)
+
+    def test_smart_quotes_to_unicode(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(
+            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+    def test_smart_quotes_to_xml_entities(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+        self.assertEqual(
+            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
+
+    def test_smart_quotes_to_html_entities(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="html")
+        self.assertEqual(
+            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
+
+    def test_smart_quotes_to_ascii(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
+        self.assertEqual(
+            dammit.unicode_markup, """<foo>''""</foo>""")
+
+    def test_detect_utf8(self):
+        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
+        dammit = UnicodeDammit(utf8)
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+
+
+    def test_convert_hebrew(self):
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
+        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
+
+    def test_dont_see_smart_quotes_where_there_are_none(self):
+        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
+        dammit = UnicodeDammit(utf_8)
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
+
+    def test_ignore_inappropriate_codecs(self):
+        utf8_data = u"Räksmörgås".encode("utf-8")
+        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_ignore_invalid_codecs(self):
+        utf8_data = u"Räksmörgås".encode("utf-8")
+        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
+            dammit = UnicodeDammit(utf8_data, [bad_encoding])
+            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_exclude_encodings(self):
+        # This is UTF-8.
+        utf8_data = u"Räksmörgås".encode("utf-8")
+
+        # But if we exclude UTF-8 from consideration, the guess is
+        # Windows-1252.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+        # And if we exclude that, there is no valid guess at all.
+        dammit = UnicodeDammit(
+            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+        self.assertEqual(dammit.original_encoding, None)
+
+class TestEncodingDetector(unittest.TestCase):
+
+    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+        detected = EncodingDetector(
+            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+        encodings = list(detected.encodings)
+        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+
+    def test_detect_html5_style_meta_tag(self):
+
+        for data in (
+            b'<html><meta charset="euc-jp" /></html>',
+            b"<html><meta charset='euc-jp' /></html>",
+            b"<html><meta charset=euc-jp /></html>",
+            b"<html><meta charset=euc-jp/></html>"):
+            dammit = UnicodeDammit(data, is_html=True)
+            self.assertEqual(
+                "euc-jp", dammit.original_encoding)
+
+    def test_last_ditch_entity_replacement(self):
+        # This is a UTF-8 document that contains bytestrings
+        # completely incompatible with UTF-8 (ie. encoded with some other
+        # encoding).
+        #
+        # Since there is no consistent encoding for the document,
+        # Unicode, Dammit will eventually encode the document as UTF-8
+        # and encode the incompatible characters as REPLACEMENT
+        # CHARACTER.
+        #
+        # If chardet is installed, it will detect that the document
+        # can be converted into ISO-8859-1 without errors. This happens
+        # to be the wrong encoding, but it is a consistent encoding, so the
+        # code we're testing here won't run.
+        #
+        # So we temporarily disable chardet if it's present.
+        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
+<html><b>\330\250\330\252\330\261</b>
+<i>\310\322\321\220\312\321\355\344</i></html>"""
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            bs4.dammit.chardet_dammit = noop
+            dammit = UnicodeDammit(doc)
+            self.assertEqual(True, dammit.contains_replacement_characters)
+            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+            soup = BeautifulSoup(doc, "html.parser")
+            self.assertTrue(soup.contains_replacement_characters)
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
+
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
+        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+        dammit = UnicodeDammit(data)
+        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("utf-16le", dammit.original_encoding)
+
+    def test_known_definite_versus_user_encodings(self):
+        # The known_definite_encodings are used before sniffing the
+        # byte-order mark; the user_encodings are used afterwards.
+
+        # Here's a document in UTF-16LE.
+        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+        dammit = UnicodeDammit(data)
+
+        # We can process it as UTF-16 by passing it in as a known
+        # definite encoding.
+        before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
+        self.assertEqual("utf-16", before.original_encoding)
+
+        # If we pass UTF-8 as a user encoding, it's not even
+        # tried--the encoding sniffed from the byte-order mark takes
+        # precedence.
+        after = UnicodeDammit(data, user_encodings=["utf-8"])
+        self.assertEqual("utf-16le", after.original_encoding)
+        self.assertEqual(
+            ["utf-16le"], [x[0] for x in dammit.tried_encodings]
+        )
+
+        # Here's a document in ISO-8859-8.
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(hebrew, known_definite_encodings=["utf-8"],
+                               user_encodings=["iso-8859-8"])
+
+        # The known_definite_encodings don't work, BOM sniffing does
+        # nothing (it only works for a few UTF encodings), but one of
+        # the user_encodings does work.
+        self.assertEqual("iso-8859-8", dammit.original_encoding)
+        self.assertEqual(
+            ["utf-8", "iso-8859-8"], [x[0] for x in dammit.tried_encodings]
+        )
+
+    def test_deprecated_override_encodings(self):
+        # override_encodings is a deprecated alias for
+        # known_definite_encodings.
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(
+            hebrew,
+            known_definite_encodings=["shift-jis"],
+            override_encodings=["utf-8"],
+            user_encodings=["iso-8859-8"],
+        )
+        self.assertEqual("iso-8859-8", dammit.original_encoding)
+
+        # known_definite_encodings and override_encodings were tried
+        # before user_encodings.
+        self.assertEqual(
+            ["shift-jis", "utf-8", "iso-8859-8"],
+            [x[0] for x in dammit.tried_encodings]
+        )
+
+    def test_detwingle(self):
+        # Here's a UTF8 document.
+        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+
+        # Here's a Windows-1252 document.
+        windows_1252 = (
+            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+
+        # Through some unholy alchemy, they've been stuck together.
+        doc = utf8 + windows_1252 + utf8
+
+        # The document can't be turned into UTF-8:
+        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
+
+        # Unicode, Dammit thinks the whole document is Windows-1252,
+        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
+
+        # But if we run it through fix_embedded_windows_1252, it's fixed:
+
+        fixed = UnicodeDammit.detwingle(doc)
+        self.assertEqual(
+            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+
+    def test_detwingle_ignores_multibyte_characters(self):
+        # Each of these characters has a UTF-8 representation ending
+        # in \x93. \x93 is a smart quote if interpreted as
+        # Windows-1252. But our code knows to skip over multibyte
+        # UTF-8 characters, so they'll survive the process unscathed.
+        for tricky_unicode_char in (
+            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+        ):
+            input = tricky_unicode_char.encode("utf8")
+            self.assertTrue(input.endswith(b'\x93'))
+            output = UnicodeDammit.detwingle(input)
+            self.assertEqual(output, input)
+
+    def test_find_declared_encoding(self):
+        # Test our ability to find a declared encoding inside an
+        # XML or HTML document.
+        #
+        # Even if the document comes in as Unicode, it may be
+        # interesting to know what encoding was claimed
+        # originally.
+
+        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
+        html_bytes = html_unicode.encode("ascii")
+
+        xml_unicode = u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
+        xml_bytes = xml_unicode.encode("ascii")
+
+        m = EncodingDetector.find_declared_encoding
+        self.assertEquals(None, m(html_unicode, is_html=False))
+        self.assertEquals("utf-8", m(html_unicode, is_html=True))
+        self.assertEquals("utf-8", m(html_bytes, is_html=True))
+
+        self.assertEquals("iso-8859-1", m(xml_unicode))
+        self.assertEquals("iso-8859-1", m(xml_bytes))
+
+        # Normally, only the first few kilobytes of a document are checked for
+        # an encoding.
+        spacer = b' ' * 5000
+        self.assertEquals(None, m(spacer + html_bytes))
+        self.assertEquals(None, m(spacer + xml_bytes))
+
+        # But you can tell find_declared_encoding to search an entire
+        # HTML document.
+        self.assertEquals(
+            "utf-8",
+            m(spacer + html_bytes, is_html=True, search_entire_document=True)
+        )
+
+        # The XML encoding declaration has to be the very first thing
+        # in the document. We'll allow whitespace before the document
+        # starts, but nothing else.
+        self.assertEquals(
+            "iso-8859-1",
+            m(xml_bytes, search_entire_document=True)
+        )
+        self.assertEquals(
+            None, m(b'a' + xml_bytes, search_entire_document=True)
+        )
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 0603ce7..ddb6446 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -32,7 +32,6 @@ import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
     UnicodeDammit,
-    EncodingDetector,
 )
 from bs4.testing import (
     default_builder,
@@ -478,226 +477,7 @@ class TestEncodingConversion(SoupTest):
         markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
         self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 
-class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of UnicodeDammit."""
-
-    def test_unicode_input(self):
-        markup = u"I'm already Unicode! \N{SNOWMAN}"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(dammit.unicode_markup, markup)
-
-    def test_smart_quotes_to_unicode(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(
-            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        self.assertEqual(
-            dammit.unicode_markup, """<foo>''""</foo>""")
-
-    def test_detect_utf8(self):
-        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
-        dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
-
-
-    def test_convert_hebrew(self):
-        hebrew = b"\xed\xe5\xec\xf9"
-        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
-        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
-
-    def test_dont_see_smart_quotes_where_there_are_none(self):
-        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
-        dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
-
-    def test_ignore_inappropriate_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
-        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_ignore_invalid_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
-        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
-            dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_exclude_encodings(self):
-        # This is UTF-8.
-        utf8_data = u"Räksmörgås".encode("utf-8")
-
-        # But if we exclude UTF-8 from consideration, the guess is
-        # Windows-1252.
-        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
-
-        # And if we exclude that, there is no valid guess at all.
-        dammit = UnicodeDammit(
-            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
-        self.assertEqual(dammit.original_encoding, None)
-
-    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
-        detected = EncodingDetector(
-            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
-        encodings = list(detected.encodings)
-        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
-
-    def test_detect_html5_style_meta_tag(self):
-
-        for data in (
-            b'<html><meta charset="euc-jp" /></html>',
-            b"<html><meta charset='euc-jp' /></html>",
-            b"<html><meta charset=euc-jp /></html>",
-            b"<html><meta charset=euc-jp/></html>"):
-            dammit = UnicodeDammit(data, is_html=True)
-            self.assertEqual(
-                "euc-jp", dammit.original_encoding)
-
-    def test_last_ditch_entity_replacement(self):
-        # This is a UTF-8 document that contains bytestrings
-        # completely incompatible with UTF-8 (ie. encoded with some other
-        # encoding).
-        #
-        # Since there is no consistent encoding for the document,
-        # Unicode, Dammit will eventually encode the document as UTF-8
-        # and encode the incompatible characters as REPLACEMENT
-        # CHARACTER.
-        #
-        # If chardet is installed, it will detect that the document
-        # can be converted into ISO-8859-1 without errors. This happens
-        # to be the wrong encoding, but it is a consistent encoding, so the
-        # code we're testing here won't run.
-        #
-        # So we temporarily disable chardet if it's present.
-        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
-<html><b>\330\250\330\252\330\261</b>
-<i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet_dammit
-        logging.disable(logging.WARNING)
-        try:
-            def noop(str):
-                return None
-            bs4.dammit.chardet_dammit = noop
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-        finally:
-            logging.disable(logging.NOTSET)
-            bs4.dammit.chardet_dammit = chardet
-
-    def test_byte_order_mark_removed(self):
-        # A document written in UTF-16LE will have its byte order marker stripped.
-        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
-        dammit = UnicodeDammit(data)
-        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
-        self.assertEqual("utf-16le", dammit.original_encoding)
-
-    def test_detwingle(self):
-        # Here's a UTF8 document.
-        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
-
-        # Here's a Windows-1252 document.
-        windows_1252 = (
-            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
-
-        # Through some unholy alchemy, they've been stuck together.
-        doc = utf8 + windows_1252 + utf8
-
-        # The document can't be turned into UTF-8:
-        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
-
-        # Unicode, Dammit thinks the whole document is Windows-1252,
-        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
-
-        # But if we run it through fix_embedded_windows_1252, it's fixed:
-
-        fixed = UnicodeDammit.detwingle(doc)
-        self.assertEqual(
-            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
-
-    def test_detwingle_ignores_multibyte_characters(self):
-        # Each of these characters has a UTF-8 representation ending
-        # in \x93. \x93 is a smart quote if interpreted as
-        # Windows-1252. But our code knows to skip over multibyte
-        # UTF-8 characters, so they'll survive the process unscathed.
-        for tricky_unicode_char in (
-            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
-        ):
-            input = tricky_unicode_char.encode("utf8")
-            self.assertTrue(input.endswith(b'\x93'))
-            output = UnicodeDammit.detwingle(input)
-            self.assertEqual(output, input)
-
-    def test_find_declared_encoding(self):
-        # Test our ability to find a declared encoding inside an
-        # XML or HTML document.
-        #
-        # Even if the document comes in as Unicode, it may be
-        # interesting to know what encoding was claimed
-        # originally.
-
-        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
-        html_bytes = html_unicode.encode("ascii")
-
-        xml_unicode = u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
-        xml_bytes = xml_unicode.encode("ascii")
-
-        m = EncodingDetector.find_declared_encoding
-        self.assertEquals(None, m(html_unicode, is_html=False))
-        self.assertEquals("utf-8", m(html_unicode, is_html=True))
-        self.assertEquals("utf-8", m(html_bytes, is_html=True))
-
-        self.assertEquals("iso-8859-1", m(xml_unicode))
-        self.assertEquals("iso-8859-1", m(xml_bytes))
-
-        # Normally, only the first few kilobytes of a document are checked for
-        # an encoding.
-        spacer = b' ' * 5000
-        self.assertEquals(None, m(spacer + html_bytes))
-        self.assertEquals(None, m(spacer + xml_bytes))
-
-        # But you can tell find_declared_encoding to search an entire
-        # HTML document.
-        self.assertEquals(
-            "utf-8",
-            m(spacer + html_bytes, is_html=True, search_entire_document=True)
-        )
-
-        # The XML encoding declaration has to be the very first thing
-        # in the document. We'll allow whitespace before the document
-        # starts, but nothing else.
-        self.assertEquals(
-            "iso-8859-1",
-            m(xml_bytes, search_entire_document=True)
-        )
-        self.assertEquals(
-            None, m(b'a' + xml_bytes, search_entire_document=True)
-        )
-
 class TestNamedspacedAttribute(SoupTest):
 
     def test_name_may_be_none_or_missing(self):
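
For callers still passing override_encodings, the constructor change in
bs4/dammit.py (self.known_definite_encodings += override_encodings)
keeps the old argument working: those encodings are tried immediately
after any explicit known definite encodings, and before byte-order-mark
sniffing. A minimal sketch of the compatibility path, mirroring
test_deprecated_override_encodings above:

    from bs4.dammit import UnicodeDammit

    hebrew = b"\xed\xe5\xec\xf9"  # ISO-8859-8 bytes
    dammit = UnicodeDammit(
        hebrew,
        known_definite_encodings=["shift-jis"],  # tried first; fails
        override_encodings=["utf-8"],            # deprecated alias; tried second, fails
        user_encodings=["iso-8859-8"],           # tried after BOM sniffing; succeeds
    )
    assert dammit.original_encoding == "iso-8859-8"
    assert [x[0] for x in dammit.tried_encodings] == ["shift-jis", "utf-8", "iso-8859-8"]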