summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bs4/builder/_htmlparser.py 2
-rw-r--r-- bs4/builder/_lxml.py 2
-rw-r--r-- bs4/dammit.py 22
3 files changed, 13 insertions, 13 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 6762a00..edd0bfb 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -58,7 +58,7 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
return markup, None, None
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index b330979..7219e49 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -53,7 +53,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return markup, None, None
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 2b681e8..09ac89e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,6 +27,8 @@ try:
except ImportError:
pass
+xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@@ -165,18 +167,19 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, isHTML=False):
+ smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
if markup == '' or isinstance(markup, unicode):
- self.original_encoding = None
+ self.markup = markup
self.unicode_markup = unicode(markup)
+ self.original_encoding = None
return
self.markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, is_html)
u = None
for proposed_encoding in (
@@ -267,7 +270,7 @@ class UnicodeDammit:
newdata = unicode(data, encoding)
return newdata
- def _detectEncoding(self, xml_data, isHTML=False):
+ def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
@@ -317,16 +320,13 @@ class UnicodeDammit:
pass
except:
xml_encoding_match = None
- xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
- xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
- if not xml_encoding_match and isHTML:
- meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
- regexp = re.compile(meta_re, re.I)
- xml_encoding_match = regexp.search(xml_data)
+ xml_encoding_match = xml_encoding_re.match(xml_data)
+ if not xml_encoding_match and is_html:
+ xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
- if isHTML:
+ if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',