=== modified file 'src/beautifulsoup/builder.py' --- src/beautifulsoup/builder.py 2009-04-10 15:22:53 +0000 +++ src/beautifulsoup/builder.py 2009-04-10 17:12:49 +0000 @@ -6,7 +6,7 @@ from element import name2codepoint from element import ( CData, Comment, Declaration, Entities, ProcessingInstruction) -from HTMLParser import HTMLParser, HTMLParseError +from html.parser import HTMLParser, HTMLParseError __all__ = ['TreeBuilder', 'HTMLParserXMLTreeBuilder', === modified file 'src/beautifulsoup/element.py' --- src/beautifulsoup/element.py 2009-04-10 15:22:53 +0000 +++ src/beautifulsoup/element.py 2009-04-10 17:12:49 +0000 @@ -1,7 +1,7 @@ import re import types try: - from htmlentitydefs import name2codepoint + from html.entities import name2codepoint except ImportError: name2codepoint = {} @@ -254,7 +254,7 @@ g = generator() while True: try: - i = g.next() + i = g.__next__() except StopIteration: break if i: === modified file 'src/beautifulsoup/tests/test_soup.py' --- src/beautifulsoup/tests/test_soup.py 2009-04-10 15:45:04 +0000 +++ src/beautifulsoup/tests/test_soup.py 2009-04-10 17:15:31 +0000 @@ -635,9 +635,9 @@ self.assertSoupEquals('hello there') def testEntitiesInAttributeValues(self): - self.assertSoupEquals('', '', + self.assertSoupEquals('', b'', encoding='utf-8') - self.assertSoupEquals('', '', + self.assertSoupEquals('', b'', encoding='utf-8') builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES) @@ -681,11 +681,11 @@ smart quote fixes.""" def testUnicodeDammitStandalone(self): - markup = "\x92" + markup = b"\x92" dammit = UnicodeDammit(markup) self.assertEquals(dammit.unicode, "") - hebrew = "\xed\xe5\xec\xf9" + hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') self.assertEquals(dammit.originalEncoding, 'iso-8859-8') @@ -697,7 +697,7 @@ unicodeData = u"\u00FC" utf8 = unicodeData.encode("utf-8") - self.assertEquals(utf8, '\xc3\xbc') + self.assertEquals(utf8, b'\xc3\xbc') unicodeSoup = BeautifulStoneSoup(unicodeData) self.assertEquals(unicodeData, unicodeSoup.decode()) @@ -724,8 +724,8 @@ self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') def testRewrittenXMLHeader(self): - euc_jp = '\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' - utf8 = "\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" + euc_jp = b'\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' + utf8 = b"\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" soup = BeautifulStoneSoup(euc_jp) if soup.originalEncoding != "euc-jp": raise Exception("Test failed when parsing euc-jp document. " @@ -736,12 +736,12 @@ self.assertEquals(soup.originalEncoding, "euc-jp") self.assertEquals(soup.renderContents('utf-8'), utf8) - old_text = "\x92" + old_text = b"\x92" new_text = "" self.assertSoupEquals(old_text, new_text) def testRewrittenMetaTag(self): - no_shift_jis_html = '''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' + no_shift_jis_html = b'''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' soup = BeautifulSoup(no_shift_jis_html) # Beautiful Soup used to try to rewrite the meta tag even if the @@ -751,16 +751,16 @@ soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) self.assertEquals(soup.contents[0].name, 'pre') - meta_tag = ('') + meta_tag = (b'') shift_jis_html = ( - '\n%s\n' - '' - '
\n'
-            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
-            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
-            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
-            '
') % meta_tag + b'\n' + meta_tag + b'\n' + b'' + b'
\n'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
+            b'
') soup = BeautifulSoup(shift_jis_html) if soup.originalEncoding != "shift-jis": raise Exception("Test failed when parsing shift-jis document " @@ -773,60 +773,60 @@ content_type_tag = soup.meta['content'] self.assertEquals(content_type_tag[content_type_tag.find('charset='):], 'charset=%SOUP-ENCODING%') - content_type = str(soup.meta) + content_type = soup.meta.decode() index = content_type.find('charset=') self.assertEqual(content_type[index:index+len('charset=utf8')+1], 'charset=utf-8') content_type = soup.meta.encode('shift-jis') - index = content_type.find('charset=') + index = content_type.find(b'charset=') self.assertEqual(content_type[index:index+len('charset=shift-jis')], 'charset=shift-jis'.encode()) self.assertEquals(soup.encode('utf-8'), ( - '\n' - '\n' - '' - '
\n'
-                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
-                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
-                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
-                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
-                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
-                '
')) + b'\n' + b'\n' + b'' + b'
\n'
+                b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
+                b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
+                b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
+                b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
+                b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
+                b'
')) self.assertEquals(soup.encode("shift-jis"), shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode())) - isolatin = """Sacr\xe9 bleu!""" + isolatin = b"""Sacr\xe9 bleu!""" soup = BeautifulSoup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) - utf8 = utf8.replace("\xe9", "\xc3\xa9") + utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') def testHebrew(self): - iso_8859_8= '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xed\xe5\xec\xf9\n\n' - utf8 = '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' + iso_8859_8= b'\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xed\xe5\xec\xf9\n\n' + utf8 = b'\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") self.assertEquals(soup.encode('utf-8'), utf8) def testSmartQuotesNotSoSmartAnymore(self): - self.assertSoupEquals("\x91Foo\x92 ", + self.assertSoupEquals(b"\x91Foo\x92 ", '‘Foo’ ') def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): - smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" + smartQuotes = b"Il a dit, \x8BSacré bleu!\x9b" soup = BeautifulSoup(smartQuotes) self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›') builder = HTMLParserTreeBuilder(convertEntities="html") soup = BeautifulSoup(smartQuotes, builder) self.assertEquals(soup.encode('utf-8'), - 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') + b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') def testDontSeeSmartQuotesWhereThereAreNone(self): - utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" self.assertSoupEquals(utf_8, encoding='utf-8')