Minor Unicode, Dammit cleanup.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:13:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:13:42 -0500
commit: 274ceca63fe55336201cd611d897662e5d000e8f (patch)
tree: fd18ec6b193c4b31d4e1a49af5a65a408f46102e /bs4/dammit.py
parent: 3c9e9c3f780d59f2ef7927fe8246ab78fe4f6572 (diff)
1 file changed, 11 insertions, 11 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 2b681e8..09ac89e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,6 +27,8 @@ try:
 except ImportError:
     pass
 
+xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
 
 class EntitySubstitution(object):
 
@@ -165,18 +167,19 @@ class UnicodeDammit:
         ]
 
     def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, isHTML=False):
+                 smart_quotes_to=None, is_html=False):
         self.declared_html_encoding = None
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
 
         if markup == '' or isinstance(markup, unicode):
-            self.original_encoding = None
+            self.markup = markup
             self.unicode_markup = unicode(markup)
+            self.original_encoding = None
             return
 
         self.markup, document_encoding, sniffed_encoding = \
-                     self._detectEncoding(markup, isHTML)
+                     self._detectEncoding(markup, is_html)
 
         u = None
         for proposed_encoding in (
@@ -267,7 +270,7 @@ class UnicodeDammit:
         newdata = unicode(data, encoding)
         return newdata
 
-    def _detectEncoding(self, xml_data, isHTML=False):
+    def _detectEncoding(self, xml_data, is_html=False):
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
@@ -317,16 +320,13 @@ class UnicodeDammit:
                 pass
         except:
             xml_encoding_match = None
-        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
-        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
-        if not xml_encoding_match and isHTML:
-            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
-            regexp = re.compile(meta_re, re.I)
-            xml_encoding_match = regexp.search(xml_data)
+        xml_encoding_match = xml_encoding_re.match(xml_data)
+        if not xml_encoding_match and is_html:
+            xml_encoding_match = html_meta_re.search(xml_data)
         if xml_encoding_match is not None:
             xml_encoding = xml_encoding_match.groups()[0].decode(
                 'ascii').lower()
-            if isHTML:
+            if is_html:
                 self.declared_html_encoding = xml_encoding
             if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-09 10:13:42 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-09 10:13:42 -0500
commit	274ceca63fe55336201cd611d897662e5d000e8f (patch)
tree	fd18ec6b193c4b31d4e1a49af5a65a408f46102e /bs4/dammit.py
parent	3c9e9c3f780d59f2ef7927fe8246ab78fe4f6572 (diff)