path: root/bs4/tests/test_dammit.py
Diffstat (limited to 'bs4/tests/test_dammit.py')
-rw-r--r--  bs4/tests/test_dammit.py  58
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/bs4/tests/test_dammit.py b/bs4/tests/test_dammit.py
index b477df8..5177503 100644
--- a/bs4/tests/test_dammit.py
+++ b/bs4/tests/test_dammit.py
@@ -12,7 +12,7 @@ class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
- markup = u"I'm already Unicode! \N{SNOWMAN}"
+ markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
@@ -20,7 +20,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
- dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+ dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -44,14 +44,14 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
- self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+ self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
- self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
+ self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -60,19 +60,19 @@ class TestUnicodeDammit(unittest.TestCase):
         self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
 
     def test_ignore_inappropriate_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
         dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
         self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
 
     def test_ignore_invalid_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
             dammit = UnicodeDammit(utf8_data, [bad_encoding])
             self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
 
     def test_exclude_encodings(self):
         # This is UTF-8.
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
 
         # But if we exclude UTF-8 from consideration, the guess is
         # Windows-1252.
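
The test_exclude_encodings hunk above relies on UnicodeDammit's exclude_encodings keyword. A sketch of the asserted behavior, assuming a bs4 version that supports that argument:

    from bs4.dammit import UnicodeDammit

    utf8_data = "Räksmörgås".encode("utf-8")
    # Left alone, the byte string is recognized as UTF-8 ...
    assert UnicodeDammit(utf8_data).original_encoding.lower() == "utf-8"
    # ... but ruling UTF-8 out pushes the guess to Windows-1252,
    # exactly as the comment in the hunk above says.
    dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
    assert dammit.original_encoding.lower() == "windows-1252"
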
@@ -90,7 +90,7 @@ class TestEncodingDetector(unittest.TestCase):
         detected = EncodingDetector(
             b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
         encodings = list(detected.encodings)
-        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
 
     def test_detect_html5_style_meta_tag(self):
@@ -130,7 +130,7 @@ class TestEncodingDetector(unittest.TestCase):
             bs4.dammit.chardet_dammit = noop
             dammit = UnicodeDammit(doc)
             self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+            self.assertTrue("\ufffd" in dammit.unicode_markup)
 
             soup = BeautifulSoup(doc, "html.parser")
             self.assertTrue(soup.contains_replacement_characters)
@@ -142,7 +142,7 @@ class TestEncodingDetector(unittest.TestCase):
         # A document written in UTF-16LE will have its byte order marker stripped.
         data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
         dammit = UnicodeDammit(data)
-        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
         self.assertEqual("utf-16le", dammit.original_encoding)
 
     def test_known_definite_versus_user_encodings(self):
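
A sketch of the byte-order-marker behavior asserted in the BOM hunk above: the UTF-16LE BOM both identifies the encoding and is stripped from the decoded markup. Names and expected values are taken from the test.

    from bs4.dammit import UnicodeDammit

    # \xff\xfe is the UTF-16LE BOM; the rest is "<a>áé</a>" in UTF-16LE.
    data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
    dammit = UnicodeDammit(data)
    assert dammit.original_encoding == "utf-16le"
    assert dammit.unicode_markup == "<a>áé</a>"  # the BOM is gone
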
@@ -201,12 +201,12 @@ class TestEncodingDetector(unittest.TestCase):
     def test_detwingle(self):
         # Here's a UTF8 document.
-        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
 
         # Here's a Windows-1252 document.
         windows_1252 = (
-            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
 
         # Through some unholy alchemy, they've been stuck together.
         doc = utf8 + windows_1252 + utf8
@@ -221,7 +221,7 @@ class TestEncodingDetector(unittest.TestCase):
         fixed = UnicodeDammit.detwingle(doc)
         self.assertEqual(
-            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
 
     def test_detwingle_ignores_multibyte_characters(self):
         # Each of these characters has a UTF-8 representation ending
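
UnicodeDammit.detwingle, exercised in the hunk above, takes a byte string that mixes UTF-8 with embedded Windows-1252 and rewrites the cp1252 stretches as UTF-8 so the whole document decodes cleanly. A sketch mirroring the test's setup:

    from bs4.dammit import UnicodeDammit

    utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
    windows_1252 = (
        "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
        "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
    doc = utf8 + windows_1252 + utf8       # mixed-encoding byte string
    fixed = UnicodeDammit.detwingle(doc)   # cp1252 runs re-encoded as UTF-8
    print(fixed.decode("utf8"))            # ☃☃☃“Hi, I like Windows!”☃☃☃
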
@@ -229,9 +229,9 @@ class TestEncodingDetector(unittest.TestCase):
         # Windows-1252. But our code knows to skip over multibyte
         # UTF-8 characters, so they'll survive the process unscathed.
         for tricky_unicode_char in (
-            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
             ):
             input = tricky_unicode_char.encode("utf8")
             self.assertTrue(input.endswith(b'\x93'))
@@ -246,29 +246,29 @@ class TestEncodingDetector(unittest.TestCase):
         # interesting to know what encoding was claimed
         # originally.
-        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
+        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
         html_bytes = html_unicode.encode("ascii")
 
-        xml_unicode= u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
+        xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>'
         xml_bytes = xml_unicode.encode("ascii")
 
         m = EncodingDetector.find_declared_encoding
-        self.assertEquals(None, m(html_unicode, is_html=False))
-        self.assertEquals("utf-8", m(html_unicode, is_html=True))
-        self.assertEquals("utf-8", m(html_bytes, is_html=True))
+        self.assertEqual(None, m(html_unicode, is_html=False))
+        self.assertEqual("utf-8", m(html_unicode, is_html=True))
+        self.assertEqual("utf-8", m(html_bytes, is_html=True))
 
-        self.assertEquals("iso-8859-1", m(xml_unicode))
-        self.assertEquals("iso-8859-1", m(xml_bytes))
+        self.assertEqual("iso-8859-1", m(xml_unicode))
+        self.assertEqual("iso-8859-1", m(xml_bytes))
 
         # Normally, only the first few kilobytes of a document are checked for
         # an encoding.
         spacer = b' ' * 5000
-        self.assertEquals(None, m(spacer + html_bytes))
-        self.assertEquals(None, m(spacer + xml_bytes))
+        self.assertEqual(None, m(spacer + html_bytes))
+        self.assertEqual(None, m(spacer + xml_bytes))
 
         # But you can tell find_declared_encoding to search an entire
         # HTML document.
-        self.assertEquals(
+        self.assertEqual(
             "utf-8",
             m(spacer + html_bytes, is_html=True, search_entire_document=True)
         )
@@ -276,11 +276,11 @@ class TestEncodingDetector(unittest.TestCase):
         # The XML encoding declaration has to be the very first thing
         # in the document. We'll allow whitespace before the document
         # starts, but nothing else.
-        self.assertEquals(
+        self.assertEqual(
             "iso-8859-1",
             m(xml_bytes, search_entire_document=True)
         )
-        self.assertEquals(
+        self.assertEqual(
             None, m(b'a' + xml_bytes, search_entire_document=True)
         )