"""Beautiful Soup bonus library: Unicode, Dammit This class forces XML data into a standard format (usually to UTF-8 or Unicode). It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It does not rewrite the XML or HTML to reflect a new encoding; that's Beautiful Soup's job. """ import codecs import re import types # Autodetects character encodings. # Download from http://chardet.feedparser.org/ try: import chardet # import chardet.constants # chardet.constants._debug = 1 except ImportError: chardet = None # cjkcodecs and iconv_codec make Python know about more character encodings. # Both are available from http://cjkpython.i18n.org/ # They're built in if you use Python 2.4. try: import cjkcodecs.aliases except ImportError: pass try: import iconv_codec except ImportError: pass class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is windows-1252, can replace MS smart quotes with their HTML or XML equivalents.""" # This dictionary maps commonly seen values for "charset" in HTML # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False): self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): self.originalEncoding = None self.unicode = unicode(markup) return u = None for proposedEncoding in overrideEncodings: u = self._convertFrom(proposedEncoding) if u: break if not u: for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) if u: break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): u = self._convertFrom(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: if not u: for proposed_encoding in ("utf-8", "windows-1252"): u = self._convertFrom(proposed_encoding) if u: break self.unicode = u if not u: self.originalEncoding = None def _subMSChar(self, match): """Changes a MS smart quote character to an XML or HTML entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) if type(sub) == types.TupleType: if self.smartQuotesTo == 'xml': sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: sub = '&'.encode() + sub[0].encode() + ';'.encode() else: sub = sub.encode() return sub def _convertFrom(self, proposed): proposed = self.find_codec(proposed) if not proposed or proposed in self.triedEncodings: return None self.triedEncodings.append(proposed) markup = self.markup # Convert smart quotes to HTML if coming from an encoding # that might have them. if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): smart_quotes_re = "([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._subMSChar, markup) try: # print "Trying to convert document to %s" % proposed u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed except Exception, e: # print "That didn't work!" # print e return None #print "Correct encoding: %s" % proposed return self.markup def _toUnicode(self, data, encoding): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' # strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' data = data[2:] elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': encoding = 'utf-8' data = data[3:] elif data[:4] == '\x00\x00\xfe\xff': encoding = 'utf-32be' data = data[4:] elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] newdata = unicode(data, encoding) return newdata def _detectEncoding(self, xml_data, isHTML=False): """Given a document, tries to detect its XML encoding.""" xml_encoding = sniffed_xml_encoding = None try: if xml_data[:4] == '\x4c\x6f\xa7\x94': # EBCDIC xml_data = self._ebcdic_to_ascii(xml_data) elif xml_data[:4] == '\x00\x3c\x00\x3f': # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x3f\x00': # UTF-16LE sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x00\x00': # UTF-32LE sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' pass except: xml_encoding_match = None xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) if not xml_encoding_match and isHTML: meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() regexp = re.compile(meta_re, re.I) xml_encoding_match = regexp.search(xml_data) if xml_encoding_match is not None: xml_encoding = xml_encoding_match.groups()[0].decode( 'ascii').lower() if isHTML: self.declaredHTMLEncoding = xml_encoding if sniffed_xml_encoding and \ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ or (charset and self._codec(charset.replace("-", "_"))) \ or charset def _codec(self, charset): if not charset: return charset codec = None try: codecs.lookup(charset) codec = charset except (LookupError, ValueError): pass return codec EBCDIC_TO_ASCII_MAP = None def _ebcdic_to_ascii(self, s): c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ ''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), '\x81' : ' ', '\x82' : ('sbquo', '201A'), '\x83' : ('fnof', '192'), '\x84' : ('bdquo', '201E'), '\x85' : ('hellip', '2026'), '\x86' : ('dagger', '2020'), '\x87' : ('Dagger', '2021'), '\x88' : ('circ', '2C6'), '\x89' : ('permil', '2030'), '\x8A' : ('Scaron', '160'), '\x8B' : ('lsaquo', '2039'), '\x8C' : ('OElig', '152'), '\x8D' : '?', '\x8E' : ('#x17D', '17D'), '\x8F' : '?', '\x90' : '?', '\x91' : ('lsquo', '2018'), '\x92' : ('rsquo', '2019'), '\x93' : ('ldquo', '201C'), '\x94' : ('rdquo', '201D'), '\x95' : ('bull', '2022'), '\x96' : ('ndash', '2013'), '\x97' : ('mdash', '2014'), '\x98' : ('tilde', '2DC'), '\x99' : ('trade', '2122'), '\x9a' : ('scaron', '161'), '\x9b' : ('rsaquo', '203A'), '\x9c' : ('oelig', '153'), '\x9d' : '?', '\x9e' : ('#x17E', '17E'), '\x9f' : ('Yuml', ''),}