diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-31 09:17:11 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-31 09:17:11 -0400 |
commit | 19f05a586c79b86be8ebe06a3728ab9a94162bee (patch) | |
tree | 295326e49419a40a8942dc3b0552e51f97e18abb /bs4/dammit.py | |
parent | 342da7818966498e1fc2100c0b920cbc242c9831 (diff) |
Create a new lxml parser object for every new parsing strategy.
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 21 |
1 files changed, 16 insertions, 5 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index cb6d354..a8acef9 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -224,9 +224,11 @@ class EncodingDetector:
         self.sniffed_encoding = None
 
     def _usable(self, encoding, tried):
-        if encoding not in tried and encoding is not None:
-            tried.add(encoding)
-            return True
+        if encoding is not None:
+            encoding = encoding.lower()
+            if encoding not in tried:
+                tried.add(encoding)
+                return True
         return False
 
     @property
@@ -386,18 +388,17 @@ class UnicodeDammit:
 
     def __init__(self, markup, override_encodings=[],
                  smart_quotes_to=None, is_html=False):
-        self.declared_html_encoding = None
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
+        self.detector = EncodingDetector(markup, override_encodings, is_html)
 
         if markup == '' or isinstance(markup, unicode):
             self.markup = markup
             self.unicode_markup = unicode(markup)
             self.original_encoding = None
             return
 
-        self.detector = EncodingDetector(markup, override_encodings, is_html)
         self.markup, ignore = self.detector.strip_byte_order_mark(markup)
 
         u = None
@@ -496,6 +497,16 @@ class UnicodeDammit:
             newdata = unicode(data, encoding, errors)
         return newdata
 
+    @property
+    def declared_html_encoding(self):
+        if not self.is_html:
+            return None
+        return self.detector.declared_encoding
+
+    @property
+    def is_html(self):
+        return self.detector.is_html
+
     def find_codec(self, charset):
         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
                or (charset and self._codec(charset.replace("-", ""))) \