summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2013-05-31 09:17:11 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-05-31 09:17:11 -0400
commit: 19f05a586c79b86be8ebe06a3728ab9a94162bee (patch)
tree: 295326e49419a40a8942dc3b0552e51f97e18abb /bs4/dammit.py
parent: 342da7818966498e1fc2100c0b920cbc242c9831 (diff)
Create a new lxml parser object for every new parsing strategy.
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 21
1 files changed, 16 insertions, 5 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index cb6d354..a8acef9 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -224,9 +224,11 @@ class EncodingDetector:
self.sniffed_encoding = None
def _usable(self, encoding, tried):
- if encoding not in tried and encoding is not None:
- tried.add(encoding)
- return True
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
return False
@property
@@ -386,18 +388,17 @@ class UnicodeDammit:
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
- self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
+ self.detector = EncodingDetector(markup, override_encodings, is_html)
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
- self.detector = EncodingDetector(markup, override_encodings, is_html)
self.markup, ignore = self.detector.strip_byte_order_mark(markup)
u = None
@@ -496,6 +497,16 @@ class UnicodeDammit:
newdata = unicode(data, encoding, errors)
return newdata
+ @property
+ def declared_html_encoding(self):
+ if not self.is_html:
+ return None
+ return self.detector.declared_encoding
+
+ @property
+ def is_html(self):
+ return self.detector.is_html
+
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \