diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/beautifulsoup/builder.py.3.diff | 4 | ||||
-rw-r--r-- | src/beautifulsoup/dammit.py.3.diff | 70 | ||||
-rw-r--r-- | src/beautifulsoup/element.py.3.diff | 8 | ||||
-rw-r--r-- | src/beautifulsoup/python3.diff | 208 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_soup.py.3.diff | 122 |
5 files changed, 208 insertions, 204 deletions
diff --git a/src/beautifulsoup/builder.py.3.diff b/src/beautifulsoup/builder.py.3.diff deleted file mode 100644 index 91b510d..0000000 --- a/src/beautifulsoup/builder.py.3.diff +++ /dev/null @@ -1,4 +0,0 @@ -90c90 -< from HTMLParser import HTMLParser, HTMLParseError ---- -> from html.parser import HTMLParser, HTMLParseError diff --git a/src/beautifulsoup/dammit.py.3.diff b/src/beautifulsoup/dammit.py.3.diff deleted file mode 100644 index f6bab68..0000000 --- a/src/beautifulsoup/dammit.py.3.diff +++ /dev/null @@ -1,70 +0,0 @@ -1800c1800 -< smart_quotes_re = "([\x80-\x9f])" ---- -> smart_quotes_re = b"([\x80-\x9f])" -1952,1983c1952,1983 -< MS_CHARS = { '\x80' : ('euro', '20AC'), -< '\x81' : ' ', -< '\x82' : ('sbquo', '201A'), -< '\x83' : ('fnof', '192'), -< '\x84' : ('bdquo', '201E'), -< '\x85' : ('hellip', '2026'), -< '\x86' : ('dagger', '2020'), -< '\x87' : ('Dagger', '2021'), -< '\x88' : ('circ', '2C6'), -< '\x89' : ('permil', '2030'), -< '\x8A' : ('Scaron', '160'), -< '\x8B' : ('lsaquo', '2039'), -< '\x8C' : ('OElig', '152'), -< '\x8D' : '?', -< '\x8E' : ('#x17D', '17D'), -< '\x8F' : '?', -< '\x90' : '?', -< '\x91' : ('lsquo', '2018'), -< '\x92' : ('rsquo', '2019'), -< '\x93' : ('ldquo', '201C'), -< '\x94' : ('rdquo', '201D'), -< '\x95' : ('bull', '2022'), -< '\x96' : ('ndash', '2013'), -< '\x97' : ('mdash', '2014'), -< '\x98' : ('tilde', '2DC'), -< '\x99' : ('trade', '2122'), -< '\x9a' : ('scaron', '161'), -< '\x9b' : ('rsaquo', '203A'), -< '\x9c' : ('oelig', '153'), -< '\x9d' : '?', -< '\x9e' : ('#x17E', '17E'), -< '\x9f' : ('Yuml', ''),} ---- -> MS_CHARS = { b'\x80' : ('euro', '20AC'), -> b'\x81' : ' ', -> b'\x82' : ('sbquo', '201A'), -> b'\x83' : ('fnof', '192'), -> b'\x84' : ('bdquo', '201E'), -> b'\x85' : ('hellip', '2026'), -> b'\x86' : ('dagger', '2020'), -> b'\x87' : ('Dagger', '2021'), -> b'\x88' : ('circ', '2C6'), -> b'\x89' : ('permil', '2030'), -> b'\x8A' : ('Scaron', '160'), -> b'\x8B' : ('lsaquo', '2039'), -> b'\x8C' : ('OElig', '152'), -> b'\x8D' : '?', -> b'\x8E' : ('#x17D', '17D'), -> b'\x8F' : '?', -> b'\x90' : '?', -> b'\x91' : ('lsquo', '2018'), -> b'\x92' : ('rsquo', '2019'), -> b'\x93' : ('ldquo', '201C'), -> b'\x94' : ('rdquo', '201D'), -> b'\x95' : ('bull', '2022'), -> b'\x96' : ('ndash', '2013'), -> b'\x97' : ('mdash', '2014'), -> b'\x98' : ('tilde', '2DC'), -> b'\x99' : ('trade', '2122'), -> b'\x9a' : ('scaron', '161'), -> b'\x9b' : ('rsaquo', '203A'), -> b'\x9c' : ('oelig', '153'), -> b'\x9d' : '?', -> b'\x9e' : ('#x17E', '17E'), -> b'\x9f' : ('Yuml', ''),} diff --git a/src/beautifulsoup/element.py.3.diff b/src/beautifulsoup/element.py.3.diff deleted file mode 100644 index 4549edd..0000000 --- a/src/beautifulsoup/element.py.3.diff +++ /dev/null @@ -1,8 +0,0 @@ -92c92 -< from htmlentitydefs import name2codepoint ---- -> from html.entities import name2codepoint -337c337 -< i = g.next() ---- -> i = g.__next__() diff --git a/src/beautifulsoup/python3.diff b/src/beautifulsoup/python3.diff new file mode 100644 index 0000000..142f2b1 --- /dev/null +++ b/src/beautifulsoup/python3.diff @@ -0,0 +1,208 @@ +=== modified file 'src/beautifulsoup/builder.py' +--- src/beautifulsoup/builder.py 2009-04-10 15:22:53 +0000 ++++ src/beautifulsoup/builder.py 2009-04-10 17:12:49 +0000 +@@ -6,7 +6,7 @@ + from element import name2codepoint + from element import ( + CData, Comment, Declaration, Entities, ProcessingInstruction) +-from HTMLParser import HTMLParser, HTMLParseError ++from html.parser import HTMLParser, HTMLParseError + + __all__ = ['TreeBuilder', + 'HTMLParserXMLTreeBuilder', + +=== modified file 'src/beautifulsoup/element.py' +--- src/beautifulsoup/element.py 2009-04-10 15:22:53 +0000 ++++ src/beautifulsoup/element.py 2009-04-10 17:12:49 +0000 +@@ -1,7 +1,7 @@ + import re + import types + try: +- from htmlentitydefs import name2codepoint ++ from html.entities import name2codepoint + except ImportError: + name2codepoint = {} + +@@ -254,7 +254,7 @@ + g = generator() + while True: + try: +- i = g.next() ++ i = g.__next__() + except StopIteration: + break + if i: + +=== modified file 'src/beautifulsoup/tests/test_soup.py' +--- src/beautifulsoup/tests/test_soup.py 2009-04-10 15:45:04 +0000 ++++ src/beautifulsoup/tests/test_soup.py 2009-04-10 17:15:31 +0000 +@@ -635,9 +635,9 @@ + self.assertSoupEquals('<b>hello there</b>') + + def testEntitiesInAttributeValues(self): +- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', ++ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', + encoding='utf-8') +- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', ++ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', + encoding='utf-8') + + builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) +@@ -681,11 +681,11 @@ + smart quote fixes.""" + + def testUnicodeDammitStandalone(self): +- markup = "<foo>\x92</foo>" ++ markup = b"<foo>\x92</foo>" + dammit = UnicodeDammit(markup) + self.assertEquals(dammit.unicode, "<foo>’</foo>") + +- hebrew = "\xed\xe5\xec\xf9" ++ hebrew = b"\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.originalEncoding, 'iso-8859-8') +@@ -697,7 +697,7 @@ + + unicodeData = u"<foo>\u00FC</foo>" + utf8 = unicodeData.encode("utf-8") +- self.assertEquals(utf8, '<foo>\xc3\xbc</foo>') ++ self.assertEquals(utf8, b'<foo>\xc3\xbc</foo>') + + unicodeSoup = BeautifulStoneSoup(unicodeData) + self.assertEquals(unicodeData, unicodeSoup.decode()) +@@ -724,8 +724,8 @@ + self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') + + def testRewrittenXMLHeader(self): +- euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' +- utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" ++ euc_jp = b'<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' ++ utf8 = b"<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" + soup = BeautifulStoneSoup(euc_jp) + if soup.originalEncoding != "euc-jp": + raise Exception("Test failed when parsing euc-jp document. " +@@ -736,12 +736,12 @@ + self.assertEquals(soup.originalEncoding, "euc-jp") + self.assertEquals(soup.renderContents('utf-8'), utf8) + +- old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>" ++ old_text = b"<?xml encoding='windows-1252'><foo>\x92</foo>" + new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>" + self.assertSoupEquals(old_text, new_text) + + def testRewrittenMetaTag(self): +- no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' ++ no_shift_jis_html = b'''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' + soup = BeautifulSoup(no_shift_jis_html) + + # Beautiful Soup used to try to rewrite the meta tag even if the +@@ -751,16 +751,16 @@ + soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) + self.assertEquals(soup.contents[0].name, 'pre') + +- meta_tag = ('<meta content="text/html; charset=x-sjis" ' +- 'http-equiv="Content-type" />') ++ meta_tag = (b'<meta content="text/html; charset=x-sjis" ' ++ b'http-equiv="Content-type" />') + shift_jis_html = ( +- '<html><head>\n%s\n' +- '<meta http-equiv="Content-language" content="ja" />' +- '</head><body><pre>\n' +- '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' +- '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' +- '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' +- '</pre></body></html>') % meta_tag ++ b'<html><head>\n' + meta_tag + b'\n' ++ b'<meta http-equiv="Content-language" content="ja" />' ++ b'</head><body><pre>\n' ++ b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' ++ b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' ++ b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' ++ b'</pre></body></html>') + soup = BeautifulSoup(shift_jis_html) + if soup.originalEncoding != "shift-jis": + raise Exception("Test failed when parsing shift-jis document " +@@ -773,60 +773,60 @@ + content_type_tag = soup.meta['content'] + self.assertEquals(content_type_tag[content_type_tag.find('charset='):], + 'charset=%SOUP-ENCODING%') +- content_type = str(soup.meta) ++ content_type = soup.meta.decode() + index = content_type.find('charset=') + self.assertEqual(content_type[index:index+len('charset=utf8')+1], + 'charset=utf-8') + content_type = soup.meta.encode('shift-jis') +- index = content_type.find('charset=') ++ index = content_type.find(b'charset=') + self.assertEqual(content_type[index:index+len('charset=shift-jis')], + 'charset=shift-jis'.encode()) + + self.assertEquals(soup.encode('utf-8'), ( +- '<html><head>\n' +- '<meta content="text/html; charset=utf-8" ' +- 'http-equiv="Content-type" />\n' +- '<meta http-equiv="Content-language" content="ja" />' +- '</head><body><pre>\n' +- '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' +- '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' +- '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' +- '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' +- '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' +- '</pre></body></html>')) ++ b'<html><head>\n' ++ b'<meta content="text/html; charset=utf-8" ' ++ b'http-equiv="Content-type" />\n' ++ b'<meta http-equiv="Content-language" content="ja" />' ++ b'</head><body><pre>\n' ++ b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' ++ b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' ++ b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' ++ b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' ++ b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' ++ b'</pre></body></html>')) + self.assertEquals(soup.encode("shift-jis"), + shift_jis_html.replace('x-sjis'.encode(), + 'shift-jis'.encode())) + +- isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" ++ isolatin = b"""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" + soup = BeautifulSoup(isolatin) + + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) +- utf8 = utf8.replace("\xe9", "\xc3\xa9") ++ utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") + self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') + + def testHebrew(self): +- iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' +- utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' ++ iso_8859_8= b'<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' ++ utf8 = b'<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' + soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") + self.assertEquals(soup.encode('utf-8'), utf8) + + def testSmartQuotesNotSoSmartAnymore(self): +- self.assertSoupEquals("\x91Foo\x92 <!--blah-->", ++ self.assertSoupEquals(b"\x91Foo\x92 <!--blah-->", + '‘Foo’ <!--blah-->') + + def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): +- smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" ++ smartQuotes = b"Il a dit, \x8BSacré bleu!\x9b" + soup = BeautifulSoup(smartQuotes) + self.assertEquals(soup.decode(), + 'Il a dit, ‹Sacré bleu!›') + builder = HTMLParserTreeBuilder(convertEntities="html") + soup = BeautifulSoup(smartQuotes, builder) + self.assertEquals(soup.encode('utf-8'), +- 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') ++ b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') + + def testDontSeeSmartQuotesWhereThereAreNone(self): +- utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" ++ utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + self.assertSoupEquals(utf_8, encoding='utf-8') + + + diff --git a/src/beautifulsoup/tests/test_soup.py.3.diff b/src/beautifulsoup/tests/test_soup.py.3.diff deleted file mode 100644 index fc9636c..0000000 --- a/src/beautifulsoup/tests/test_soup.py.3.diff +++ /dev/null @@ -1,122 +0,0 @@ -433c433 -< self.assertTrue('attr' in BeautifulSoup(text).foo) ---- -> self.assertTrue(BeautifulSoup(text).foo.has_key('attr')) -622c622 -< self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', ---- -> self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', -624c624 -< self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', ---- -> self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', -671c671 -< markup = "<foo>\x92</foo>" ---- -> markup = b"<foo>\x92</foo>" -675c675 -< hebrew = "\xed\xe5\xec\xf9" ---- -> hebrew = b"\xed\xe5\xec\xf9" -687c687 -< self.assertEquals(utf8, '<foo>\xc3\xbc</foo>') ---- -> self.assertEquals(utf8, b'<foo>\xc3\xbc</foo>') -714,715c714,715 -< euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' -< utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" ---- -> euc_jp = b'<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' -> utf8 = b"<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" -726c726 -< old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>" ---- -> old_text = b"<?xml encoding='windows-1252'><foo>\x92</foo>" -731c731 -< no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' ---- -> no_shift_jis_html = b'''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' -741,742c741,742 -< meta_tag = ('<meta content="text/html; charset=x-sjis" ' -< 'http-equiv="Content-type" />') ---- -> meta_tag = (b'<meta content="text/html; charset=x-sjis" ' -> b'http-equiv="Content-type" />') -744,750c744,750 -< '<html><head>\n%s\n' -< '<meta http-equiv="Content-language" content="ja" />' -< '</head><body><pre>\n' -< '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' -< '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' -< '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' -< '</pre></body></html>') % meta_tag ---- -> b'<html><head>\n' + meta_tag + b'\n' -> b'<meta http-equiv="Content-language" content="ja" />' -> b'</head><body><pre>\n' -> b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' -> b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' -> b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' -> b'</pre></body></html>') -763c763 -< content_type = str(soup.meta) ---- -> content_type = soup.meta.decode() -768c768 -< index = content_type.find('charset=') ---- -> index = content_type.find(b'charset=') -773,783c773,783 -< '<html><head>\n' -< '<meta content="text/html; charset=utf-8" ' -< 'http-equiv="Content-type" />\n' -< '<meta http-equiv="Content-language" content="ja" />' -< '</head><body><pre>\n' -< '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' -< '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' -< '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' -< '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' -< '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' -< '</pre></body></html>')) ---- -> b'<html><head>\n' -> b'<meta content="text/html; charset=utf-8" ' -> b'http-equiv="Content-type" />\n' -> b'<meta http-equiv="Content-language" content="ja" />' -> b'</head><body><pre>\n' -> b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' -> b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' -> b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' -> b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' -> b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' -> b'</pre></body></html>')) -788c788 -< isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" ---- -> isolatin = b"""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" -792c792 -< utf8 = utf8.replace("\xe9", "\xc3\xa9") ---- -> utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") -796,797c796,797 -< iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' -< utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' ---- -> iso_8859_8= b'<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' -> utf8 = b'<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' -802c802 -< self.assertSoupEquals("\x91Foo\x92 <!--blah-->", ---- -> self.assertSoupEquals(b"\x91Foo\x92 <!--blah-->", -806c806 -< smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" ---- -> smartQuotes = b"Il a dit, \x8BSacré bleu!\x9b" -812c812 -< 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') ---- -> b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') -815c815 -< utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" ---- -> utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |