From ea23194367fb36d201cf6b8134601a73070dff63 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 May 2013 11:33:00 -0400 Subject: Split out the code that guesses at encodings from the code that tries to decode a bytestring based on those encodings. This is necessary because lxml wants to do the decoding itself. --- bs4/dammit.py | 317 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 189 insertions(+), 128 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index a733cad..71281d2 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -11,6 +11,7 @@ import codecs from htmlentitydefs import codepoint2name import re import logging +import string # Import a library to autodetect character encodings. chardet_type = None @@ -175,7 +176,6 @@ class EntitySubstitution(object): value = cls.quoted_attribute_value(value) return value - @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. @@ -192,6 +192,180 @@ class EntitySubstitution(object): cls._substitute_html_entity, s) +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding implied by a short substring at the beginning of + the bytestring, such as a byte-order mark. + + 3. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 4. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 5. UTF-8. + + 6. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False): + self.markup = markup + self.override_encodings = override_encodings or [] + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + self.sniffed_encoding = None + + def _yield(self, encoding, tried): + if encoding not in tried and encoding is not None: + yield encoding + tried.add(encoding) + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + for x in self._yield(e, tried): + yield x + + if self.sniffed_encoding is None: + (self.markup, + self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) + for x in self._yield(self.sniffed_encoding, tried): + yield x + + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + + if self.sniffed_encoding and self.declared_encoding in ( + 'iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16'): + # We were able to sniff an encoding by looking at the + # first part of the document. The declared encoding is redundant + # with the sniffed encoding. + # + # TODO: Is it really? How do we know? What if the BOM + # says UTF-32 and the declaration says UTF-16? In real + # usage this doesn't matter because this method is + # only called if the sniffed encoding didn't work. + self.declared_encoding = self.sniffed_encoding + + if self.declared_encoding is not None: + for x in self._yield(self.declared_encoding, tried): + yield x + + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self.chardet_encoding is not None: + for x in self._yield(self.chardet_encoding, tried): + yield x + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + for x in self._yield(e, tried): + yield x + + @classmethod + def strip_byte_order_mark(cls, markup): + "Remove a byte-order mark from a document, and guess at its encoding." + if markup[:4] == b'\x4c\x6f\xa7\x94': + # EBCDIC + # There's no 'ebcdic' codec, so just convert the ebsdic to ASCII. + markup = self.ebcdic_to_ascii(markup) + elif markup[:4] == b'\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_encoding = 'utf-16be' + elif (len(markup) >= 4) and (markup[:2] == b'\xfe\xff') \ + and (markup[2:4] != b'\x00\x00'): + # UTF-16BE with BOM + sniffed_encoding = 'utf-16be' + markup = markup[2:] + elif markup[:4] == b'\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_encoding = 'utf-16le' + elif (len(markup) >= 4) and (markup[:2] == b'\xff\xfe') and \ + (markup[2:4] != b'\x00\x00'): + # UTF-16LE with BOM + sniffed_encoding = 'utf-16le' + markup = markup[2:] + elif markup[:4] == b'\x00\x00\x00\x3c': + # UTF-32BE + sniffed_encoding = 'utf-32be' + elif markup[:4] == b'\x3c\x00\x00\x00': + # UTF-32LE + sniffed_encoding = 'utf-32le' + elif markup[:4] == b'\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_encoding = 'utf-32be' + markup = markup[4:] + elif markup[:4] == b'\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_encoding = 'utf-32le' + markup = markup[4:] + elif markup[:3] == b'\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_encoding = 'utf-8' + markup = markup[3:] + else: + # No idea. + sniffed_encoding = None + return markup, sniffed_encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a tag. + """ + declared_encoding = None + declared_encoding_match = xml_encoding_re.match(markup) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii').lower() + return declared_encoding + + EBCDIC_TO_ASCII_MAP = None + + @classmethod + def ebcdic_to_ascii(cls, s): + if not cls.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + cls.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) + return s.translate(cls.EBCDIC_TO_ASCII_MAP) + + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -224,44 +398,22 @@ class UnicodeDammit: self.original_encoding = None return - new_markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) - self.markup = new_markup + self.detector = EncodingDetector(markup, override_encodings, is_html) + self.markup, ignore = self.detector.strip_byte_order_mark(markup) u = None - if new_markup != markup: - # _detectEncoding modified the markup, then converted it to - # Unicode and then to UTF-8. So convert it from UTF-8. - u = self._convert_from("utf8") - self.original_encoding = sniffed_encoding - - if not u: - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break - - # If no luck and we have auto-detection library, try that: - if not u and not isinstance(self.markup, unicode): - u = self._convert_from(chardet_dammit(self.markup)) + for encoding in self.detector.encodings: + u = self._convert_from(encoding) + if u is not None: + break - # As a last resort, try utf-8 and windows-1252: if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. - # As an absolute last resort, try the encodings again with - # character replacement. - if not u: - for proposed_encoding in ( - override_encodings + [ - document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): - if proposed_encoding != "ascii": - u = self._convert_from(proposed_encoding, "replace") + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") if u is not None: logging.warning( "Some characters could not be decoded, and were " @@ -269,8 +421,9 @@ class UnicodeDammit: self.contains_replacement_characters = True break - # We could at this point force it to ASCII, but that would - # destroy so much data that I think giving up is better + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. self.unicode_markup = u if not u: self.original_encoding = None @@ -344,72 +497,6 @@ class UnicodeDammit: newdata = unicode(data, encoding, errors) return newdata - def _detectEncoding(self, xml_data, is_html=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ - and (xml_data[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ - (xml_data[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_match = xml_encoding_re.match(xml_data) - if not xml_encoding_match and is_html: - xml_encoding_match = html_meta_re.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if is_html: - self.declared_html_encoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding - def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ @@ -427,32 +514,6 @@ class UnicodeDammit: pass return codec - EBCDIC_TO_ASCII_MAP = None - - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. MS_CHARS = {b'\x80': ('euro', '20AC'), -- cgit v1.2.3 From 342da7818966498e1fc2100c0b920cbc242c9831 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 May 2013 12:43:22 -0400 Subject: Refactored code a bit. --- bs4/dammit.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index 71281d2..cb6d354 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -223,24 +223,25 @@ class EncodingDetector: self.declared_encoding = None self.sniffed_encoding = None - def _yield(self, encoding, tried): + def _usable(self, encoding, tried): if encoding not in tried and encoding is not None: - yield encoding tried.add(encoding) + return True + return False @property def encodings(self): """Yield a number of encodings that might work for this markup.""" tried = set() for e in self.override_encodings: - for x in self._yield(e, tried): - yield x + if self._usable(e, tried): + yield e if self.sniffed_encoding is None: (self.markup, self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) - for x in self._yield(self.sniffed_encoding, tried): - yield x + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding if self.declared_encoding is None: self.declared_encoding = self.find_declared_encoding( @@ -261,20 +262,18 @@ class EncodingDetector: # only called if the sniffed encoding didn't work. self.declared_encoding = self.sniffed_encoding - if self.declared_encoding is not None: - for x in self._yield(self.declared_encoding, tried): - yield x + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding if self.chardet_encoding is None: self.chardet_encoding = chardet_dammit(self.markup) - if self.chardet_encoding is not None: - for x in self._yield(self.chardet_encoding, tried): - yield x + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding # As a last-ditch effort, try utf-8 and windows-1252. for e in ('utf-8', 'windows-1252'): - for x in self._yield(e, tried): - yield x + if self._usable(e, tried): + yield e @classmethod def strip_byte_order_mark(cls, markup): -- cgit v1.2.3 From 19f05a586c79b86be8ebe06a3728ab9a94162bee Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 31 May 2013 09:17:11 -0400 Subject: Create a new lxml parser object for every new parsing strategy. --- bs4/dammit.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index cb6d354..a8acef9 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -224,9 +224,11 @@ class EncodingDetector: self.sniffed_encoding = None def _usable(self, encoding, tried): - if encoding not in tried and encoding is not None: - tried.add(encoding) - return True + if encoding is not None: + encoding = encoding.lower() + if encoding not in tried: + tried.add(encoding) + return True return False @property @@ -386,18 +388,17 @@ class UnicodeDammit: def __init__(self, markup, override_encodings=[], smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.detector = EncodingDetector(markup, override_encodings, is_html) if markup == '' or isinstance(markup, unicode): self.markup = markup self.unicode_markup = unicode(markup) self.original_encoding = None return - self.detector = EncodingDetector(markup, override_encodings, is_html) self.markup, ignore = self.detector.strip_byte_order_mark(markup) u = None @@ -496,6 +497,16 @@ class UnicodeDammit: newdata = unicode(data, encoding, errors) return newdata + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + @property + def is_html(self): + return self.detector.is_html + def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ -- cgit v1.2.3 From 6d9221a42dd3fdd679b8e222cb9c73065eeeb747 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 2 Jun 2013 18:28:03 -0400 Subject: It turns out most of the untested code wasn't doing anything useful. --- bs4/dammit.py | 128 +++++++++------------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index a8acef9..a5558d7 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -200,20 +200,17 @@ class EncodingDetector: 1. Encodings you specifically tell EncodingDetector to try first (the override_encodings argument to the constructor). - 2. An encoding implied by a short substring at the beginning of - the bytestring, such as a byte-order mark. - - 3. An encoding declared within the bytestring itself, either in an + 2. An encoding declared within the bytestring itself, either in an XML declaration (if the bytestring is to be interpreted as an XML document), or in a tag (if the bytestring is to be interpreted as an HTML document.) - 4. An encoding detected through textual analysis by chardet, + 3. An encoding detected through textual analysis by chardet, cchardet, or a similar external library. - 5. UTF-8. + 4. UTF-8. - 6. Windows-1252. + 5. Windows-1252. """ def __init__(self, markup, override_encodings=None, is_html=False): self.markup = markup @@ -239,31 +236,12 @@ class EncodingDetector: if self._usable(e, tried): yield e - if self.sniffed_encoding is None: - (self.markup, - self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) - if self._usable(self.sniffed_encoding, tried): - yield self.sniffed_encoding - if self.declared_encoding is None: + # Look within the document for an XML or HTML encoding + # declaration. self.declared_encoding = self.find_declared_encoding( self.markup, self.is_html) - if self.sniffed_encoding and self.declared_encoding in ( - 'iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16'): - # We were able to sniff an encoding by looking at the - # first part of the document. The declared encoding is redundant - # with the sniffed encoding. - # - # TODO: Is it really? How do we know? What if the BOM - # says UTF-32 and the declaration says UTF-16? In real - # usage this doesn't matter because this method is - # only called if the sniffed encoding didn't work. - self.declared_encoding = self.sniffed_encoding - if self._usable(self.declared_encoding, tried): yield self.declared_encoding @@ -277,52 +255,6 @@ class EncodingDetector: if self._usable(e, tried): yield e - @classmethod - def strip_byte_order_mark(cls, markup): - "Remove a byte-order mark from a document, and guess at its encoding." - if markup[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - # There's no 'ebcdic' codec, so just convert the ebsdic to ASCII. - markup = self.ebcdic_to_ascii(markup) - elif markup[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_encoding = 'utf-16be' - elif (len(markup) >= 4) and (markup[:2] == b'\xfe\xff') \ - and (markup[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_encoding = 'utf-16be' - markup = markup[2:] - elif markup[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_encoding = 'utf-16le' - elif (len(markup) >= 4) and (markup[:2] == b'\xff\xfe') and \ - (markup[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_encoding = 'utf-16le' - markup = markup[2:] - elif markup[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_encoding = 'utf-32be' - elif markup[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_encoding = 'utf-32le' - elif markup[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_encoding = 'utf-32be' - markup = markup[4:] - elif markup[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_encoding = 'utf-32le' - markup = markup[4:] - elif markup[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_encoding = 'utf-8' - markup = markup[3:] - else: - # No idea. - sniffed_encoding = None - return markup, sniffed_encoding - @classmethod def find_declared_encoding(cls, markup, is_html=False): """Given a document, tries to find its declared encoding. @@ -337,35 +269,10 @@ class EncodingDetector: declared_encoding_match = html_meta_re.search(markup) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii').lower() - return declared_encoding - - EBCDIC_TO_ASCII_MAP = None - - @classmethod - def ebcdic_to_ascii(cls, s): - if not cls.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - cls.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(cls.EBCDIC_TO_ASCII_MAP) - + 'ascii') + if declared_encoding: + return declared_encoding.lower() + return None class UnicodeDammit: """A class for detecting the encoding of a *ML document and @@ -399,7 +306,7 @@ class UnicodeDammit: self.original_encoding = None return - self.markup, ignore = self.detector.strip_byte_order_mark(markup) + self.markup = markup u = None for encoding in self.detector.encodings: @@ -454,7 +361,7 @@ class UnicodeDammit: # Convert smart quotes to HTML if coming from an encoding # that might have them. if (self.smart_quotes_to is not None - and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): smart_quotes_re = b"([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) @@ -508,10 +415,15 @@ class UnicodeDammit: return self.detector.is_html def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) or charset + ) + if value: + return value.lower() + return None def _codec(self, charset): if not charset: -- cgit v1.2.3 From 4f9a654766df9ddd05e3ef274b4715b42668724f Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 2 Jun 2013 19:12:07 -0400 Subject: Turns out we had two bits of code to strip byte-order marks. --- bs4/dammit.py | 77 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 34 deletions(-) (limited to 'bs4/dammit.py') diff --git a/bs4/dammit.py b/bs4/dammit.py index a5558d7..9ea432f 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -213,12 +213,13 @@ class EncodingDetector: 5. Windows-1252. """ def __init__(self, markup, override_encodings=None, is_html=False): - self.markup = markup self.override_encodings = override_encodings or [] self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None - self.sniffed_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) def _usable(self, encoding, tried): if encoding is not None: @@ -236,15 +237,21 @@ class EncodingDetector: if self._usable(e, tried): yield e + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. if self.declared_encoding is None: - # Look within the document for an XML or HTML encoding - # declaration. self.declared_encoding = self.find_declared_encoding( self.markup, self.is_html) - if self._usable(self.declared_encoding, tried): yield self.declared_encoding + # Use third-party character set detection to guess at the + # encoding. if self.chardet_encoding is None: self.chardet_encoding = chardet_dammit(self.markup) if self._usable(self.chardet_encoding, tried): @@ -255,6 +262,29 @@ class EncodingDetector: if self._usable(e, tried): yield e + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + @classmethod def find_declared_encoding(cls, markup, is_html=False): """Given a document, tries to find its declared encoding. @@ -298,18 +328,21 @@ class UnicodeDammit: self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.is_html = is_html self.detector = EncodingDetector(markup, override_encodings, is_html) - if markup == '' or isinstance(markup, unicode): + + # Is the data in Unicode to begin with? + if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) - self.original_encoding = None - return - self.markup = markup + # As a first step, the encoding detector may strip a byte-order mark. + self.markup = self.detector.markup u = None for encoding in self.detector.encodings: + markup = self.detector.markup u = self._convert_from(encoding) if u is not None: break @@ -382,27 +415,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding, errors) - return newdata + return unicode(data, encoding, errors) @property def declared_html_encoding(self): @@ -410,10 +423,6 @@ class UnicodeDammit: return None return self.detector.declared_encoding - @property - def is_html(self): - return self.detector.is_html - def find_codec(self, charset): value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) or (charset and self._codec(charset.replace("-", ""))) -- cgit v1.2.3