Ported the rest of the HTML tests, including the tests of broken HTML listed in the TODO. Made Unicode, Dammit PEP 8 compliant (camelCase attributes and methods renamed to snake_case).

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 15:13:41 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 15:13:41 -0500
commit: 8249b803d9bab9c06be02a244e629cb732f4f5b1 (patch)
tree: 447cddabac142fefd583df1acd6268f6abcb8f5c /beautifulsoup
parent: 0dda99b15112df7225e647db9702fbd62dcc8ea8 (diff)
parent: e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff)
5 files changed, 49 insertions, 42 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 32ea73f..5d66bc7 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -149,7 +149,7 @@ class BeautifulStoneSoup(Tag):
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        self.markup, self.originalEncoding, self.declaredHTMLEncoding = (
+        self.markup, self.original_encoding, self.declared_html_encoding = (
             self.builder.prepare_markup(markup, fromEncoding))
 
         try:
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5bf5929..5c275d7 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -120,8 +120,8 @@ class HTMLTreeBuilder(TreeBuilder):
             # This is an interesting meta tag.
             match = self.CHARSET_RE.search(content)
             if match:
-                if (self.soup.declaredHTMLEncoding is not None or
-                    self.soup.originalEncoding == self.soup.fromEncoding):
+                if (self.soup.declared_html_encoding is not None or
+                    self.soup.original_encoding == self.soup.fromEncoding):
                     # An HTML encoding was sniffed while converting
                     # the document to Unicode, or an HTML encoding was
                     # sniffed during a previous pass through the
@@ -136,9 +136,9 @@ class HTMLTreeBuilder(TreeBuilder):
                     # Go through it again with the encoding information.
                     new_charset = match.group(3)
                     if (new_charset is not None
-                        and new_charset != self.soup.originalEncoding):
-                        self.soup.declaredHTMLEncoding = new_charset
-                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        and new_charset != self.soup.original_encoding):
+                        self.soup.declared_html_encoding = new_charset
+                        self.soup._feed(self.soup.declared_html_encoding)
                         raise StopParsing
                     pass
         return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 95151da..0a24ce1 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -27,9 +27,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if isinstance(markup, unicode):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
-            doc.originalEncoding = None
+            doc.original_encoding = None
         else:
-            doc.originalEncoding = parser.tokenizer.stream.charEncoding[0]
+            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
 
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index a1f8c1e..2c264b3 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -23,7 +23,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
-        return dammit.markup, dammit.originalEncoding, dammit.declaredHTMLEncoding
+        return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
 
 
     def feed(self, markup):
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 954ca54..455b0bf 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -3,23 +3,24 @@
 This class forces XML data into a standard format (usually to UTF-8 or
 Unicode).  It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's Beautiful Soup's job.
+encoding; that's the tree builder's job.
 """
 
 import codecs
 import re
 import types
 
-# Autodetects character encodings.
+# Autodetects character encodings. Very useful.
 # Download from http://chardet.feedparser.org/
+#  or 'apt-get install python-chardet'
+#  or 'easy_install chardet'
 try:
     import chardet
-#    import chardet.constants
-#    chardet.constants._debug = 1
+    #import chardet.constants
+    #chardet.constants._debug = 1
 except ImportError:
     chardet = None
 
-# cjkcodecs and iconv_codec make Python know about more character encodings.
 # Both are available from http://cjkpython.i18n.org/
 # They're built in if you use Python 2.4.
 try:
@@ -45,46 +46,53 @@ class UnicodeDammit:
     CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
 
-    def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml', isHTML=False):
-        self.declaredHTMLEncoding = None
-        self.markup, documentEncoding, sniffedEncoding = \
+    ENCODINGS_WITH_SMART_QUOTES = [
+        "windows-1252",
+        "iso-8859-1",
+        "iso-8859-2",
+        ]
+
+    def __init__(self, markup, override_encodings=[],
+                 smart_quotes_to=None, isHTML=False):
+        self.declared_html_encoding = None
+        self.markup, document_encoding, sniffed_encoding = \
                      self._detectEncoding(markup, isHTML)
-        self.smartQuotesTo = smartQuotesTo
-        self.triedEncodings = []
+        self.smart_quotes_to = smart_quotes_to
+        self.tried_encodings = []
         if markup == '' or isinstance(markup, unicode):
-            self.originalEncoding = None
+            self.original_encoding = None
             self.unicode = unicode(markup)
             return
 
         u = None
-        for proposedEncoding in (
-            overrideEncodings + [documentEncoding, sniffedEncoding]):
-            if proposedEncoding is not None:
-                u = self._convertFrom(proposedEncoding)
+        for proposed_encoding in (
+            override_encodings + [document_encoding, sniffed_encoding]):
+            if proposed_encoding is not None:
+                u = self._convert_from(proposed_encoding)
                 if u:
                     break
 
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+            u = self._convert_from(chardet.detect(self.markup)['encoding'])
 
         # As a last resort, try utf-8 and windows-1252:
         if not u:
             for proposed_encoding in ("utf-8", "windows-1252"):
-                u = self._convertFrom(proposed_encoding)
-                if u: break
+                u = self._convert_from(proposed_encoding)
+                if u:
+                    break
 
         self.unicode = u
-        if not u: self.originalEncoding = None
+        if not u: self.original_encoding = None
 
-    def _subMSChar(self, match):
+    def _sub_ms_char(self, match):
         """Changes a MS smart quote character to an XML or HTML
         entity."""
         orig = match.group(1)
         sub = self.MS_CHARS.get(orig)
         if type(sub) == types.TupleType:
-            if self.smartQuotesTo == 'xml':
+            if self.smart_quotes_to == 'xml':
                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
             else:
                 sub = '&'.encode() + sub[0].encode() + ';'.encode()
@@ -92,27 +100,26 @@ class UnicodeDammit:
             sub = sub.encode()
         return sub
 
-    def _convertFrom(self, proposed):
+    def _convert_from(self, proposed):
         proposed = self.find_codec(proposed)
-        if not proposed or proposed in self.triedEncodings:
+        if not proposed or proposed in self.tried_encodings:
             return None
-        self.triedEncodings.append(proposed)
+        self.tried_encodings.append(proposed)
         markup = self.markup
 
         # Convert smart quotes to HTML if coming from an encoding
         # that might have them.
-        if self.smartQuotesTo and proposed.lower() in("windows-1252",
-                                                      "iso-8859-1",
-                                                      "iso-8859-2"):
+        if (self.smart_quotes_to is not None
+            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
             smart_quotes_re = "([\x80-\x9f])"
             smart_quotes_compiled = re.compile(smart_quotes_re)
-            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
 
         try:
             # print "Trying to convert document to %s" % proposed
-            u = self._toUnicode(markup, proposed)
+            u = self._to_unicode(markup, proposed)
             self.markup = u
-            self.originalEncoding = proposed
+            self.original_encoding = proposed
         except Exception, e:
             # print "That didn't work!"
             # print e
@@ -120,7 +127,7 @@ class UnicodeDammit:
         #print "Correct encoding: %s" % proposed
         return self.markup
 
-    def _toUnicode(self, data, encoding):
+    def _to_unicode(self, data, encoding):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
 
@@ -205,7 +212,7 @@ class UnicodeDammit:
             xml_encoding = xml_encoding_match.groups()[0].decode(
                 'ascii').lower()
             if isHTML:
-                self.declaredHTMLEncoding = xml_encoding
+                self.declared_html_encoding = xml_encoding
             if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 15:13:41 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 15:13:41 -0500
commit	8249b803d9bab9c06be02a244e629cb732f4f5b1 (patch)
tree	447cddabac142fefd583df1acd6268f6abcb8f5c /beautifulsoup
parent	0dda99b15112df7225e647db9702fbd62dcc8ea8 (diff)
parent	e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff)