summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:13:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:13:42 -0500
commit: 274ceca63fe55336201cd611d897662e5d000e8f (patch)
tree: fd18ec6b193c4b31d4e1a49af5a65a408f46102e /bs4/dammit.py
parent: 3c9e9c3f780d59f2ef7927fe8246ab78fe4f6572 (diff)
Minor Unicode, Dammit cleanup.
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 22
1 files changed, 11 insertions, 11 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 2b681e8..09ac89e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,6 +27,8 @@ try:
except ImportError:
pass
+xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@@ -165,18 +167,19 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, isHTML=False):
+ smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
if markup == '' or isinstance(markup, unicode):
- self.original_encoding = None
+ self.markup = markup
self.unicode_markup = unicode(markup)
+ self.original_encoding = None
return
self.markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, is_html)
u = None
for proposed_encoding in (
@@ -267,7 +270,7 @@ class UnicodeDammit:
newdata = unicode(data, encoding)
return newdata
- def _detectEncoding(self, xml_data, isHTML=False):
+ def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
@@ -317,16 +320,13 @@ class UnicodeDammit:
pass
except:
xml_encoding_match = None
- xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
- xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
- if not xml_encoding_match and isHTML:
- meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
- regexp = re.compile(meta_re, re.I)
- xml_encoding_match = regexp.search(xml_data)
+ xml_encoding_match = xml_encoding_re.match(xml_data)
+ if not xml_encoding_match and is_html:
+ xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
- if isHTML:
+ if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',