diff options
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | bs4/dammit.py | 13 | ||||
-rw-r--r-- | tests/test_soup.py | 12 |
3 files changed, 13 insertions, 13 deletions
@@ -38,6 +38,7 @@ work. Here are the renames: Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element + * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup So have some arguments to popular methods: diff --git a/bs4/dammit.py b/bs4/dammit.py index 75d445e..4aafe81 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -9,7 +9,6 @@ encoding; that's the tree builder's job. import codecs from htmlentitydefs import codepoint2name import re -import types # Autodetects character encodings. Very useful. # Download from http://chardet.feedparser.org/ @@ -37,7 +36,7 @@ class EntitySubstitution(object): lookup = {} reverse_lookup = {} characters = [] - for codepoint, name in codepoint2name.items(): + for codepoint, name in list(codepoint2name.items()): if codepoint == 34: # There's no point in turning the quotation mark into # &quot;, unless it happens within an attribute value, which @@ -175,7 +174,7 @@ class UnicodeDammit: self.tried_encodings = [] if markup == '' or isinstance(markup, unicode): self.original_encoding = None - self.unicode = unicode(markup) + self.unicode_markup = unicode(markup) return u = None @@ -197,7 +196,7 @@ class UnicodeDammit: if u: break - self.unicode = u + self.unicode_markup = u if not u: self.original_encoding = None def _sub_ms_char(self, match): @@ -205,7 +204,7 @@ class UnicodeDammit: entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: + if type(sub) == tuple: if self.smart_quotes_to == 'xml': sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: @@ -234,7 +233,7 @@ class UnicodeDammit: u = self._to_unicode(markup, proposed) self.markup = u self.original_encoding = proposed - except Exception, e: + except Exception as e: # print "That didn't work!" 
# print e return None @@ -375,7 +374,7 @@ class UnicodeDammit: 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), diff --git a/tests/test_soup.py b/tests/test_soup.py index d283b8a..87d6f3b 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -86,37 +86,37 @@ class TestUnicodeDammit(unittest.TestCase): markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEquals( - dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>") + dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") def test_smart_quotes_to_xml_entities(self): markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEquals( - dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>") + dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>") def test_smart_quotes_to_html_entities(self): markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEquals( - dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>") + dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>") def test_detect_utf8(self): utf8 = "\xc3\xa9" dammit = UnicodeDammit(utf8) - self.assertEquals(dammit.unicode, u'\xe9') + self.assertEquals(dammit.unicode_markup, u'\xe9') self.assertEquals(dammit.original_encoding, 'utf-8') def test_convert_hebrew(self): hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEquals(dammit.original_encoding, 'iso-8859-8') - self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEquals(dammit.original_encoding, 'utf-8') - 
self.assertEquals(dammit.unicode.encode("utf-8"), utf_8) + self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") |