diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 12:23:37 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 12:23:37 -0500 |
commit | d35e92875c62cf43227ccc6fca75b5e74a6350e8 (patch) | |
tree | 03267eb13dcaf17614a4475cf40e8b7e7552d698 | |
parent | 845dbe03bee981bcc5d24ef06ca868042968aa4c (diff) |
Made Unicode, Dammit more PEP-8 compliant.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | beautifulsoup/__init__.py | 2 |
| -rw-r--r-- | beautifulsoup/builder/__init__.py | 4 |
| -rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 4 |
| -rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 2 |
| -rw-r--r-- | beautifulsoup/dammit.py | 39 |
| -rw-r--r-- | tests/test_html5lib.py | 2 |
| -rw-r--r-- | tests/test_lxml.py | 12 |
| -rw-r--r-- | tests/test_soup.py | 10 |
8 files changed, 40 insertions, 35 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 9ae87a6..5d66bc7 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -149,7 +149,7 @@ class BeautifulStoneSoup(Tag): if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - self.markup, self.originalEncoding, self.declared_html_encoding = ( + self.markup, self.original_encoding, self.declared_html_encoding = ( self.builder.prepare_markup(markup, fromEncoding)) try: diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index a5e1b06..5c275d7 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -121,7 +121,7 @@ class HTMLTreeBuilder(TreeBuilder): match = self.CHARSET_RE.search(content) if match: if (self.soup.declared_html_encoding is not None or - self.soup.originalEncoding == self.soup.fromEncoding): + self.soup.original_encoding == self.soup.fromEncoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the @@ -136,7 +136,7 @@ class HTMLTreeBuilder(TreeBuilder): # Go through it again with the encoding information. new_charset = match.group(3) if (new_charset is not None - and new_charset != self.soup.originalEncoding): + and new_charset != self.soup.original_encoding): self.soup.declared_html_encoding = new_charset self.soup._feed(self.soup.declared_html_encoding) raise StopParsing diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py index 95151da..0a24ce1 100644 --- a/beautifulsoup/builder/html5lib_builder.py +++ b/beautifulsoup/builder/html5lib_builder.py @@ -27,9 +27,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if isinstance(markup, unicode): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. 
- doc.originalEncoding = None + doc.original_encoding = None else: - doc.originalEncoding = parser.tokenizer.stream.charEncoding[0] + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 360e37d..2c264b3 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -23,7 +23,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder): try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, isHTML=True) - return dammit.markup, dammit.originalEncoding, dammit.declared_html_encoding + return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding def feed(self, markup): diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index f810d15..09a37e3 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -45,38 +45,44 @@ class UnicodeDammit: CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + def __init__(self, markup, override_encodings=[], smart_quotes_to='xml', isHTML=False): self.declared_html_encoding = None - self.markup, documentEncoding, sniffedEncoding = \ + self.markup, document_encoding, sniffed_encoding = \ self._detectEncoding(markup, isHTML) self.smart_quotes_to = smart_quotes_to - self.triedEncodings = [] + self.tried_encodings = [] if markup == '' or isinstance(markup, unicode): - self.originalEncoding = None + self.original_encoding = None self.unicode = unicode(markup) return u = None - for proposedEncoding in ( - override_encodings + [documentEncoding, sniffedEncoding]): - if proposedEncoding is not None: - u = self._convertFrom(proposedEncoding) + for proposed_encoding in ( + override_encodings + [document_encoding, 
sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) if u: break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): - u = self._convertFrom(chardet.detect(self.markup)['encoding']) + u = self._convert_from(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: if not u: for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convertFrom(proposed_encoding) + u = self._convert_from(proposed_encoding) if u: break self.unicode = u - if not u: self.originalEncoding = None + if not u: self.original_encoding = None def _subMSChar(self, match): """Changes a MS smart quote character to an XML or HTML @@ -92,18 +98,17 @@ class UnicodeDammit: sub = sub.encode() return sub - def _convertFrom(self, proposed): + def _convert_from(self, proposed): proposed = self.find_codec(proposed) - if not proposed or proposed in self.triedEncodings: + if not proposed or proposed in self.tried_encodings: return None - self.triedEncodings.append(proposed) + self.tried_encodings.append(proposed) markup = self.markup # Convert smart quotes to HTML if coming from an encoding # that might have them. - if self.smart_quotes_to and proposed.lower() in("windows-1252", - "iso-8859-1", - "iso-8859-2"): + if (self.smart_quotes_to is not None + and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): smart_quotes_re = "([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._subMSChar, markup) @@ -112,7 +117,7 @@ class UnicodeDammit: # print "Trying to convert document to %s" % proposed u = self._toUnicode(markup, proposed) self.markup = u - self.originalEncoding = proposed + self.original_encoding = proposed except Exception, e: # print "That didn't work!" 
# print e diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 59d84a3..5aeac76 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -151,7 +151,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, fromEncoding="iso-8859-8") - self.assertEquals(soup.originalEncoding, 'iso8859-8') + self.assertEquals(soup.original_encoding, 'iso8859-8') self.assertEquals( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 4c11b1d..11ef15a 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -391,25 +391,25 @@ class TestLXMLBuilderEncodingConversion(SoupTest): "<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>") def test_ascii_in_unicode_out(self): - # ASCII input is converted to Unicode. The originalEncoding + # ASCII input is converted to Unicode. The original_encoding # attribute is set. ascii = "<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEquals(unicode_output, self.document_for(ascii)) - self.assertEquals(soup_from_ascii.originalEncoding, "ascii") + self.assertEquals(soup_from_ascii.original_encoding, "ascii") def test_unicode_in_unicode_out(self): - # Unicode input is left alone. The originalEncoding attribute + # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEquals(soup_from_unicode.decode(), self.unicode_data) self.assertEquals(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') - self.assertEquals(soup_from_unicode.originalEncoding, None) + self.assertEquals(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): - # UTF-8 input is converted to Unicode. The originalEncoding + # UTF-8 input is converted to Unicode. 
The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEquals(soup_from_utf8.decode(), self.unicode_data) @@ -427,7 +427,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest): # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, fromEncoding="iso-8859-8") - self.assertEquals(soup.originalEncoding, 'iso-8859-8') + self.assertEquals(soup.original_encoding, 'iso-8859-8') self.assertEquals( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) diff --git a/tests/test_soup.py b/tests/test_soup.py index 7414403..c3a19e1 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -35,27 +35,27 @@ class TestUnicodeDammit(unittest.TestCase): utf8 = "\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEquals(dammit.unicode, u'\xe9') - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') def test_convert_hebrew(self): hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEquals(dammit.originalEncoding, 'iso-8859-8') + self.assertEquals(dammit.original_encoding, 'iso-8859-8') self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') self.assertEquals(dammit.unicode.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) - 
self.assertEquals(dammit.originalEncoding, 'utf-8') + self.assertEquals(dammit.original_encoding, 'utf-8') |