summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG4
-rw-r--r--TODO59
-rw-r--r--beautifulsoup/__init__.py2
-rw-r--r--beautifulsoup/builder/__init__.py10
-rw-r--r--beautifulsoup/builder/html5lib_builder.py4
-rw-r--r--beautifulsoup/builder/lxml_builder.py2
-rw-r--r--beautifulsoup/dammit.py73
-rw-r--r--tests/test_html5lib.py56
-rw-r--r--tests/test_lxml.py65
-rw-r--r--tests/test_soup.py20
10 files changed, 178 insertions, 117 deletions
diff --git a/CHANGELOG b/CHANGELOG
index dffab7c..5d13a6d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo".
An HTML or XML entity is always converted into the corresponding
Unicode character. There are no longer any smartQuotesTo or
-convertEntities arguments. (Unicode Dammit still has smartQuotesTo,
-though that may change.)
+convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
+but the default is now to turn smart quotes into Unicode.)
= 3.1.0 =
diff --git a/TODO b/TODO
index ea32bbb..a799bbb 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,11 @@
-html5lib has its own Unicode, Dammit-like system. Converting the input
-to Unicode should be up to the builder. The lxml builder would use
-Unicode, Dammit, and the html5lib builder would be a no-op.
-
Bare ampersands should be converted to HTML entities upon output.
-It should also be possible to convert certain Unicode characters to
-HTML entities upon output.
+It should also be possible to, on output, convert to HTML entities any
+Unicode characters found in htmlentitydefs.codepoint2name. (This
+algorithm would allow me to simplify Unicode, Dammit--convert
+everything to Unicode, and then convert to entities upon output, not
+treating smart quotes differently from any other Unicode character
+that can be represented as an entity.)
XML handling:
@@ -21,50 +21,3 @@ as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.
-
-
-
----
-
-Here are some unit tests that fail with HTMLParser.
-
- def testValidButBogusDeclarationFAILS(self):
- self.assertSoupEquals('<! Foo >a', '<!Foo >a')
-
- def testIncompleteDeclarationAtEndFAILS(self):
- self.assertSoupEquals('a<!b')
-
- def testIncompleteEntityAtEndFAILS(self):
- self.assertSoupEquals('&lt;Hello&gt')
-
- # This is not what the original author had in mind, but it's
- # a legitimate interpretation of what they wrote.
- self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
- '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')
- # SGMLParser generates bogus parse events when attribute values
- # contain embedded brackets, but at least Beautiful Soup fixes
- # it up a little.
- self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>')
- self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
- """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
-
- invalidEntity = "foo&#bar;baz"
- soup = BeautifulStoneSoup\
- (invalidEntity,
- convertEntities=htmlEnt)
- self.assertEquals(str(soup), invalidEntity)
-
-
-Tag names that contain Unicode characters crash the parser:
- def testUnicodeTagNamesFAILS(self):
- self.assertSoupEquals("<デダ芻デダtext>2PM</デダ芻デダtext>")
-
-Here's the implementation of NavigableString.__unicode__:
-
- def __unicode__(self):
- return unicode(str(self))
-
-It converts the Unicode to a string, and then back to Unicode. I can't
-find any other way of turning an element of a Unicode subclass into a
-normal Unicode object. This is pretty bad and a better technique is
-welcome.
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 32ea73f..5d66bc7 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -149,7 +149,7 @@ class BeautifulStoneSoup(Tag):
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- self.markup, self.originalEncoding, self.declaredHTMLEncoding = (
+ self.markup, self.original_encoding, self.declared_html_encoding = (
self.builder.prepare_markup(markup, fromEncoding))
try:
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5bf5929..5c275d7 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -120,8 +120,8 @@ class HTMLTreeBuilder(TreeBuilder):
# This is an interesting meta tag.
match = self.CHARSET_RE.search(content)
if match:
- if (self.soup.declaredHTMLEncoding is not None or
- self.soup.originalEncoding == self.soup.fromEncoding):
+ if (self.soup.declared_html_encoding is not None or
+ self.soup.original_encoding == self.soup.fromEncoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -136,9 +136,9 @@ class HTMLTreeBuilder(TreeBuilder):
# Go through it again with the encoding information.
new_charset = match.group(3)
if (new_charset is not None
- and new_charset != self.soup.originalEncoding):
- self.soup.declaredHTMLEncoding = new_charset
- self.soup._feed(self.soup.declaredHTMLEncoding)
+ and new_charset != self.soup.original_encoding):
+ self.soup.declared_html_encoding = new_charset
+ self.soup._feed(self.soup.declared_html_encoding)
raise StopParsing
pass
return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 95151da..0a24ce1 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -27,9 +27,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
- doc.originalEncoding = None
+ doc.original_encoding = None
else:
- doc.originalEncoding = parser.tokenizer.stream.charEncoding[0]
+ doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index a1f8c1e..2c264b3 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -23,7 +23,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
- return dammit.markup, dammit.originalEncoding, dammit.declaredHTMLEncoding
+ return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
def feed(self, markup):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 954ca54..455b0bf 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -3,23 +3,24 @@
This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's Beautiful Soup's job.
+encoding; that's the tree builder's job.
"""
import codecs
import re
import types
-# Autodetects character encodings.
+# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
+# or 'apt-get install python-chardet'
+# or 'easy_install chardet'
try:
import chardet
-# import chardet.constants
-# chardet.constants._debug = 1
+ #import chardet.constants
+ #chardet.constants._debug = 1
except ImportError:
chardet = None
-# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
@@ -45,46 +46,53 @@ class UnicodeDammit:
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
- def __init__(self, markup, overrideEncodings=[],
- smartQuotesTo='xml', isHTML=False):
- self.declaredHTMLEncoding = None
- self.markup, documentEncoding, sniffedEncoding = \
+ ENCODINGS_WITH_SMART_QUOTES = [
+ "windows-1252",
+ "iso-8859-1",
+ "iso-8859-2",
+ ]
+
+ def __init__(self, markup, override_encodings=[],
+ smart_quotes_to=None, isHTML=False):
+ self.declared_html_encoding = None
+ self.markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, isHTML)
- self.smartQuotesTo = smartQuotesTo
- self.triedEncodings = []
+ self.smart_quotes_to = smart_quotes_to
+ self.tried_encodings = []
if markup == '' or isinstance(markup, unicode):
- self.originalEncoding = None
+ self.original_encoding = None
self.unicode = unicode(markup)
return
u = None
- for proposedEncoding in (
- overrideEncodings + [documentEncoding, sniffedEncoding]):
- if proposedEncoding is not None:
- u = self._convertFrom(proposedEncoding)
+ for proposed_encoding in (
+ override_encodings + [document_encoding, sniffed_encoding]):
+ if proposed_encoding is not None:
+ u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+ u = self._convert_from(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
- u = self._convertFrom(proposed_encoding)
- if u: break
+ u = self._convert_from(proposed_encoding)
+ if u:
+ break
self.unicode = u
- if not u: self.originalEncoding = None
+ if not u: self.original_encoding = None
- def _subMSChar(self, match):
+ def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity."""
orig = match.group(1)
sub = self.MS_CHARS.get(orig)
if type(sub) == types.TupleType:
- if self.smartQuotesTo == 'xml':
+ if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
@@ -92,27 +100,26 @@ class UnicodeDammit:
sub = sub.encode()
return sub
- def _convertFrom(self, proposed):
+ def _convert_from(self, proposed):
proposed = self.find_codec(proposed)
- if not proposed or proposed in self.triedEncodings:
+ if not proposed or proposed in self.tried_encodings:
return None
- self.triedEncodings.append(proposed)
+ self.tried_encodings.append(proposed)
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
- if self.smartQuotesTo and proposed.lower() in("windows-1252",
- "iso-8859-1",
- "iso-8859-2"):
+ if (self.smart_quotes_to is not None
+ and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = "([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
- markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+ markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
# print "Trying to convert document to %s" % proposed
- u = self._toUnicode(markup, proposed)
+ u = self._to_unicode(markup, proposed)
self.markup = u
- self.originalEncoding = proposed
+ self.original_encoding = proposed
except Exception, e:
# print "That didn't work!"
# print e
@@ -120,7 +127,7 @@ class UnicodeDammit:
#print "Correct encoding: %s" % proposed
return self.markup
- def _toUnicode(self, data, encoding):
+ def _to_unicode(self, data, encoding):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
@@ -205,7 +212,7 @@ class UnicodeDammit:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if isHTML:
- self.declaredHTMLEncoding = xml_encoding
+ self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 59d84a3..3045b02 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -131,14 +131,56 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
self.assertEquals(comment, 'b <p')
self.assertEquals(str2, 'c')
- def test_foo(self):
- isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
- soup = self.soup(isolatin)
+    def test_document_starts_with_bogus_declaration_and_text(self):
+ soup = self.soup('<! Foo >a')
+ # 'Foo' becomes a comment that appears before the HTML.
+ comment = soup.contents[0]
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, 'Foo')
- utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
- utf8 = utf8.replace("\xe9", "\xc3\xa9")
+        self.assertEquals(soup.find(text="a"), "a")
- #print soup
+ def test_attribute_value_was_closed_by_subsequent_tag(self):
+ markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+ soup = self.soup(markup)
+ # The string between the first and second quotes was interpreted
+ # as the value of the 'href' attribute.
+ self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+ #The string after the second quote (bar"), was treated as an
+ #empty attribute called bar".
+ self.assertEquals(soup.a['bar"'], '')
+ self.assertEquals(soup.a.string, "baz")
+
+ def test_document_starts_with_bogus_declaration(self):
+ soup = self.soup('<! Foo ><p>a</p>')
+ # The declaration becomes a comment.
+ comment = soup.contents[0]
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, ' Foo ')
+ self.assertEquals(soup.p.string, 'a')
+
+ def test_document_ends_with_incomplete_declaration(self):
+ soup = self.soup('<p>a<!b')
+        # html5lib turns the incomplete declaration into a comment, so
+        # the <p> contains the string 'a' followed by a Comment 'b'.
+ s, comment = soup.p.contents
+ self.assertEquals(s, 'a')
+ self.assertTrue(isinstance(comment, Comment))
+ self.assertEquals(comment, 'b')
+
+ def test_entity_was_not_finished(self):
+ soup = self.soup("<p>&lt;Hello&gt")
+        # html5lib completes the unfinished entity (unlike lxml).
+ self.assertEquals(soup.p.string, "<Hello>")
+
+ def test_nonexistent_entity(self):
+ soup = self.soup("<p>foo&#bar;baz</p>")
+ self.assertEquals(soup.p.string, "foo&#bar;baz")
+
+ # Compare a real entity.
+ soup = self.soup("<p>foo&#100;baz</p>")
+ self.assertEquals(soup.p.string, "foodbaz")
class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
@@ -151,7 +193,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
fromEncoding="iso-8859-8")
- self.assertEquals(soup.originalEncoding, 'iso8859-8')
+ self.assertEquals(soup.original_encoding, 'iso8859-8')
self.assertEquals(
soup.encode('utf-8'),
self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 4c11b1d..7e15dcf 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -376,6 +376,59 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
markup = "<div><![CDATA[foo]]>"
self.assertSoupEquals(markup, "<div></div>")
+ def test_attribute_value_never_got_closed(self):
+ markup = '<a href="http://foo.com/</a> and blah and blah'
+ soup = self.soup(markup)
+ self.assertEquals(
+ soup.a['href'], "http://foo.com/</a> and blah and blah")
+
+ def test_attribute_value_was_closed_by_subsequent_tag(self):
+ markup = """<a href="foo</a>, </a><a href="bar">baz</a>"""
+ soup = self.soup(markup)
+ # The string between the first and second quotes was interpreted
+ # as the value of the 'href' attribute.
+ self.assertEquals(soup.a['href'], 'foo</a>, </a><a href=')
+
+ #The string after the second quote (bar"), was treated as an
+ #empty attribute called bar.
+ self.assertEquals(soup.a['bar'], '')
+ self.assertEquals(soup.a.string, "baz")
+
+ def test_attribute_value_with_embedded_brackets(self):
+ soup = self.soup('<a b="<a>">')
+ self.assertEquals(soup.a['b'], '<a>')
+
+ def test_nonexistent_entity(self):
+ soup = self.soup("<p>foo&#bar;baz</p>")
+ self.assertEquals(soup.p.string, "foobar;baz")
+
+ # Compare a real entity.
+ soup = self.soup("<p>foo&#100;baz</p>")
+ self.assertEquals(soup.p.string, "foodbaz")
+
+ # Also compare html5lib, which preserves the &# before the
+ # entity name.
+
+ def test_entity_was_not_finished(self):
+ soup = self.soup("<p>&lt;Hello&gt")
+ # Compare html5lib, which completes the entity.
+ self.assertEquals(soup.p.string, "<Hello&gt")
+
+ def test_document_ends_with_incomplete_declaration(self):
+ soup = self.soup('<p>a<!b')
+ # This becomes a string 'a'. The incomplete declaration is ignored.
+ # Compare html5lib, which turns it into a comment.
+ self.assertEquals(soup.p.contents, ['a'])
+
+ def test_document_starts_with_bogus_declaration(self):
+ soup = self.soup('<! Foo ><p>a</p>')
+ # The declaration is ignored altogether.
+ self.assertEquals(soup.encode(), "<html><body><p>a</p></body></html>")
+
+ def test_tag_name_contains_unicode(self):
+ # Unicode characters in tag names are stripped.
+ tag_name = u"<our\N{SNOWMAN}>Joe</our\N{SNOWMAN}>"
+        self.assertSoupEquals(tag_name, "<our>Joe</our>")
class TestLXMLBuilderEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
@@ -391,25 +444,25 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
def test_ascii_in_unicode_out(self):
- # ASCII input is converted to Unicode. The originalEncoding
+ # ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = "<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEquals(unicode_output, self.document_for(ascii))
- self.assertEquals(soup_from_ascii.originalEncoding, "ascii")
+ self.assertEquals(soup_from_ascii.original_encoding, "ascii")
def test_unicode_in_unicode_out(self):
- # Unicode input is left alone. The originalEncoding attribute
+ # Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEquals(soup_from_unicode.decode(), self.unicode_data)
self.assertEquals(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
- self.assertEquals(soup_from_unicode.originalEncoding, None)
+ self.assertEquals(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
- # UTF-8 input is converted to Unicode. The originalEncoding
+ # UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEquals(soup_from_utf8.decode(), self.unicode_data)
@@ -427,7 +480,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
fromEncoding="iso-8859-8")
- self.assertEquals(soup.originalEncoding, 'iso-8859-8')
+ self.assertEquals(soup.original_encoding, 'iso-8859-8')
self.assertEquals(
soup.encode('utf-8'),
self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
diff --git a/tests/test_soup.py b/tests/test_soup.py
index 4fb2142..01dff53 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,15 +19,21 @@ class TestSelectiveParsing(SoupTest):
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
- def test_smart_quotes_to_xml_entities(self):
+ def test_smart_quotes_to_unicode(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEquals(
+ dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+ def test_smart_quotes_to_xml_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
+ dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+ self.assertEquals(
dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
- dammit = UnicodeDammit(markup, smartQuotesTo="html")
+ dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEquals(
dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
@@ -35,27 +41,27 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = "\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEquals(dammit.unicode, u'\xe9')
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
def test_convert_hebrew(self):
hebrew = "\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
- self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
+ self.assertEquals(dammit.original_encoding, 'iso-8859-8')
self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
self.assertEquals(dammit.unicode.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
- self.assertEquals(dammit.originalEncoding, 'utf-8')
+ self.assertEquals(dammit.original_encoding, 'utf-8')