summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  CHANGELOG           1
-rw-r--r--  bs4/dammit.py      13
-rw-r--r--  tests/test_soup.py 12
3 files changed, 13 insertions, 13 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 00d80da..cd01b3b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -38,6 +38,7 @@ work. Here are the renames:
Some attributes have also been renamed:
* Tag.isSelfClosing -> Tag.is_empty_element
+ * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup
So have some arguments to popular methods:
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 75d445e..4aafe81 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -9,7 +9,6 @@ encoding; that's the tree builder's job.
import codecs
from htmlentitydefs import codepoint2name
import re
-import types
# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
@@ -37,7 +36,7 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters = []
- for codepoint, name in codepoint2name.items():
+ for codepoint, name in list(codepoint2name.items()):
if codepoint == 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
@@ -175,7 +174,7 @@ class UnicodeDammit:
self.tried_encodings = []
if markup == '' or isinstance(markup, unicode):
self.original_encoding = None
- self.unicode = unicode(markup)
+ self.unicode_markup = unicode(markup)
return
u = None
@@ -197,7 +196,7 @@ class UnicodeDammit:
if u:
break
- self.unicode = u
+ self.unicode_markup = u
if not u: self.original_encoding = None
def _sub_ms_char(self, match):
@@ -205,7 +204,7 @@ class UnicodeDammit:
entity."""
orig = match.group(1)
sub = self.MS_CHARS.get(orig)
- if type(sub) == types.TupleType:
+ if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
@@ -234,7 +233,7 @@ class UnicodeDammit:
u = self._to_unicode(markup, proposed)
self.markup = u
self.original_encoding = proposed
- except Exception, e:
+ except Exception as e:
# print "That didn't work!"
# print e
return None
@@ -375,7 +374,7 @@ class UnicodeDammit:
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
MS_CHARS = { '\x80' : ('euro', '20AC'),
diff --git a/tests/test_soup.py b/tests/test_soup.py
index d283b8a..87d6f3b 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -86,37 +86,37 @@ class TestUnicodeDammit(unittest.TestCase):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEquals(
- dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+ dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEquals(
- dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
+ dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEquals(
- dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
+ dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_detect_utf8(self):
utf8 = "\xc3\xa9"
dammit = UnicodeDammit(utf8)
- self.assertEquals(dammit.unicode, u'\xe9')
+ self.assertEquals(dammit.unicode_markup, u'\xe9')
self.assertEquals(dammit.original_encoding, 'utf-8')
def test_convert_hebrew(self):
hebrew = "\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEquals(dammit.original_encoding, 'iso-8859-8')
- self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
+ self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEquals(dammit.original_encoding, 'utf-8')
- self.assertEquals(dammit.unicode.encode("utf-8"), utf_8)
+ self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")