diff options
-rw-r--r-- | BeautifulSoup.py | 305 | ||||
-rw-r--r-- | BeautifulSoup.py.3.diff | 70 | ||||
-rw-r--r-- | dammit.py | 292 | ||||
-rw-r--r-- | dammit.py.3.diff | 70 | ||||
-rwxr-xr-x | to3.sh | 2 |
5 files changed, 367 insertions, 372 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py index c0f7482..4d40c34 100644 --- a/BeautifulSoup.py +++ b/BeautifulSoup.py @@ -32,17 +32,13 @@ Beautiful Soup defines classes for two main parsing strategies: or invalid. This class has web browser-like heuristics for obtaining a sensible parse tree in the face of common HTML errors. -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting -the encoding of an HTML or XML document, and converting it to -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. - For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/documentation.html Here, have some legalese: -Copyright (c) 2004-2008, Leonard Richardson +Copyright (c) 2004-2009, Leonard Richardson All rights reserved. @@ -79,11 +75,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.1.0" -__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson" +__version__ = "4.0.0" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" __license__ = "New-style BSD" -import codecs import markupbase import types import re @@ -96,6 +91,7 @@ try: set except NameError: from sets import Set as set +from dammit import UnicodeDammit #These hacks make Beautiful Soup able to parse XML with namespaces markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match @@ -1709,299 +1705,6 @@ class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup): return ICantBelieveItsValidHTMLBuilder() -###################################################### -# -# Bonus library: Unicode, Dammit -# -# This class forces XML data into a standard format (usually to UTF-8 -# or Unicode). It is heavily based on code from Mark Pilgrim's -# Universal Feed Parser. It does not rewrite the XML or HTML to -# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi -# (XML) and BeautifulSoup.handleSpecialMetaTag (HTML). - -# Autodetects character encodings. -# Download from http://chardet.feedparser.org/ -try: - import chardet -# import chardet.constants -# chardet.constants._debug = 1 -except ImportError: - chardet = None - -# cjkcodecs and iconv_codec make Python know about more character encodings. -# Both are available from http://cjkpython.i18n.org/ -# They're built in if you use Python 2.4. -try: - import cjkcodecs.aliases -except ImportError: - pass -try: - import iconv_codec -except ImportError: - pass - -class UnicodeDammit: - """A class for detecting the encoding of a *ML document and - converting it to a Unicode string. If the source encoding is - windows-1252, can replace MS smart quotes with their HTML or XML - equivalents.""" - - # This dictionary maps commonly seen values for "charset" in HTML - # meta tags to the corresponding Python codec names. It only covers - # values that aren't in Python's aliases and can't be determined - # by the heuristics in find_codec. - CHARSET_ALIASES = { "macintosh" : "mac-roman", - "x-sjis" : "shift-jis" } - - def __init__(self, markup, overrideEncodings=[], - smartQuotesTo='xml', isHTML=False): - self.declaredHTMLEncoding = None - self.markup, documentEncoding, sniffedEncoding = \ - self._detectEncoding(markup, isHTML) - self.smartQuotesTo = smartQuotesTo - self.triedEncodings = [] - if markup == '' or isinstance(markup, unicode): - self.originalEncoding = None - self.unicode = unicode(markup) - return - - u = None - for proposedEncoding in overrideEncodings: - u = self._convertFrom(proposedEncoding) - if u: break - if not u: - for proposedEncoding in (documentEncoding, sniffedEncoding): - u = self._convertFrom(proposedEncoding) - if u: break - - # If no luck and we have auto-detection library, try that: - if not u and chardet and not isinstance(self.markup, unicode): - u = self._convertFrom(chardet.detect(self.markup)['encoding']) - - # As a last resort, try utf-8 and windows-1252: - if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convertFrom(proposed_encoding) - if u: break - - self.unicode = u - if not u: self.originalEncoding = None - - def _subMSChar(self, match): - """Changes a MS smart quote character to an XML or HTML - entity.""" - orig = match.group(1) - sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: - if self.smartQuotesTo == 'xml': - sub = '&#x'.encode() + sub[1].encode() + ';'.encode() - else: - sub = '&'.encode() + sub[0].encode() + ';'.encode() - else: - sub = sub.encode() - return sub - - def _convertFrom(self, proposed): - proposed = self.find_codec(proposed) - if not proposed or proposed in self.triedEncodings: - return None - self.triedEncodings.append(proposed) - markup = self.markup - - # Convert smart quotes to HTML if coming from an encoding - # that might have them. - if self.smartQuotesTo and proposed.lower() in("windows-1252", - "iso-8859-1", - "iso-8859-2"): - smart_quotes_re = "([\x80-\x9f])" - smart_quotes_compiled = re.compile(smart_quotes_re) - markup = smart_quotes_compiled.sub(self._subMSChar, markup) - - try: - # print "Trying to convert document to %s" % proposed - u = self._toUnicode(markup, proposed) - self.markup = u - self.originalEncoding = proposed - except Exception, e: - # print "That didn't work!" - # print e - return None - #print "Correct encoding: %s" % proposed - return self.markup - - def _toUnicode(self, data, encoding): - '''Given a string and its encoding, decodes the string into Unicode. - %encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding) - return newdata - - def _detectEncoding(self, xml_data, isHTML=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ - (xml_data[2:4] != '\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() - xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) - if not xml_encoding_match and isHTML: - meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() - regexp = re.compile(meta_re, re.I) - xml_encoding_match = regexp.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if isHTML: - self.declaredHTMLEncoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding - - - def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ - or charset - - def _codec(self, charset): - if not charset: return charset - codec = None - try: - codecs.lookup(charset) - codec = charset - except (LookupError, ValueError): - pass - return codec - - EBCDIC_TO_ASCII_MAP = None - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) - - MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} - -####################################################################### - - #By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys diff --git a/BeautifulSoup.py.3.diff b/BeautifulSoup.py.3.diff index 3943cb0..ad3d6e1 100644 --- a/BeautifulSoup.py.3.diff +++ b/BeautifulSoup.py.3.diff @@ -10,73 +10,3 @@ < i = g.next() --- > i = g.__next__() -1800c1800 -< smart_quotes_re = "([\x80-\x9f])" ---- -> smart_quotes_re = b"([\x80-\x9f])" -1952,1983c1952,1983 -< MS_CHARS = { '\x80' : ('euro', '20AC'), -< '\x81' : ' ', -< '\x82' : ('sbquo', '201A'), -< '\x83' : ('fnof', '192'), -< '\x84' : ('bdquo', '201E'), -< '\x85' : ('hellip', '2026'), -< '\x86' : ('dagger', '2020'), -< '\x87' : ('Dagger', '2021'), -< '\x88' : ('circ', '2C6'), -< '\x89' : ('permil', '2030'), -< '\x8A' : ('Scaron', '160'), -< '\x8B' : ('lsaquo', '2039'), -< '\x8C' : ('OElig', '152'), -< '\x8D' : '?', -< '\x8E' : ('#x17D', '17D'), -< '\x8F' : '?', -< '\x90' : '?', -< '\x91' : ('lsquo', '2018'), -< '\x92' : ('rsquo', '2019'), -< '\x93' : ('ldquo', '201C'), -< '\x94' : ('rdquo', '201D'), -< '\x95' : ('bull', '2022'), -< '\x96' : ('ndash', '2013'), -< '\x97' : ('mdash', '2014'), -< '\x98' : ('tilde', '2DC'), -< '\x99' : ('trade', '2122'), -< '\x9a' : ('scaron', '161'), -< '\x9b' : ('rsaquo', '203A'), -< '\x9c' : ('oelig', '153'), -< '\x9d' : '?', -< '\x9e' : ('#x17E', '17E'), -< '\x9f' : ('Yuml', ''),} ---- -> MS_CHARS = { b'\x80' : ('euro', '20AC'), -> b'\x81' : ' ', -> b'\x82' : ('sbquo', '201A'), -> b'\x83' : ('fnof', '192'), -> b'\x84' : ('bdquo', '201E'), -> b'\x85' : ('hellip', '2026'), -> b'\x86' : ('dagger', '2020'), -> b'\x87' : ('Dagger', '2021'), -> b'\x88' : ('circ', '2C6'), -> b'\x89' : ('permil', '2030'), -> b'\x8A' : ('Scaron', '160'), -> b'\x8B' : ('lsaquo', '2039'), -> b'\x8C' : ('OElig', '152'), -> b'\x8D' : '?', -> b'\x8E' : ('#x17D', '17D'), -> b'\x8F' : '?', -> b'\x90' : '?', -> b'\x91' : ('lsquo', '2018'), -> b'\x92' : ('rsquo', '2019'), -> b'\x93' : ('ldquo', '201C'), -> b'\x94' : ('rdquo', '201D'), -> b'\x95' : ('bull', '2022'), -> b'\x96' : ('ndash', '2013'), -> b'\x97' : ('mdash', '2014'), -> b'\x98' : ('tilde', '2DC'), -> b'\x99' : ('trade', '2122'), -> b'\x9a' : ('scaron', '161'), -> b'\x9b' : ('rsaquo', '203A'), -> b'\x9c' : ('oelig', '153'), -> b'\x9d' : '?', -> b'\x9e' : ('#x17E', '17E'), -> b'\x9f' : ('Yuml', ''),} diff --git a/dammit.py b/dammit.py new file mode 100644 index 0000000..78bd4b2 --- /dev/null +++ b/dammit.py @@ -0,0 +1,292 @@ +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's Beautiful Soup's job. +""" + +import codecs +import re +import types + +# Autodetects character encodings. +# Download from http://chardet.feedparser.org/ +try: + import chardet +# import chardet.constants +# chardet.constants._debug = 1 +except ImportError: + chardet = None + +# cjkcodecs and iconv_codec make Python know about more character encodings. +# Both are available from http://cjkpython.i18n.org/ +# They're built in if you use Python 2.4. +try: + import cjkcodecs.aliases +except ImportError: + pass +try: + import iconv_codec +except ImportError: + pass + + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = { "macintosh" : "mac-roman", + "x-sjis" : "shift-jis" } + + def __init__(self, markup, overrideEncodings=[], + smartQuotesTo='xml', isHTML=False): + self.declaredHTMLEncoding = None + self.markup, documentEncoding, sniffedEncoding = \ + self._detectEncoding(markup, isHTML) + self.smartQuotesTo = smartQuotesTo + self.triedEncodings = [] + if markup == '' or isinstance(markup, unicode): + self.originalEncoding = None + self.unicode = unicode(markup) + return + + u = None + for proposedEncoding in overrideEncodings: + u = self._convertFrom(proposedEncoding) + if u: break + if not u: + for proposedEncoding in (documentEncoding, sniffedEncoding): + u = self._convertFrom(proposedEncoding) + if u: break + + # If no luck and we have auto-detection library, try that: + if not u and chardet and not isinstance(self.markup, unicode): + u = self._convertFrom(chardet.detect(self.markup)['encoding']) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convertFrom(proposed_encoding) + if u: break + + self.unicode = u + if not u: self.originalEncoding = None + + def _subMSChar(self, match): + """Changes a MS smart quote character to an XML or HTML + entity.""" + orig = match.group(1) + sub = self.MS_CHARS.get(orig) + if type(sub) == types.TupleType: + if self.smartQuotesTo == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convertFrom(self, proposed): + proposed = self.find_codec(proposed) + if not proposed or proposed in self.triedEncodings: + return None + self.triedEncodings.append(proposed) + markup = self.markup + + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if self.smartQuotesTo and proposed.lower() in("windows-1252", + "iso-8859-1", + "iso-8859-2"): + smart_quotes_re = "([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._subMSChar, markup) + + try: + # print "Trying to convert document to %s" % proposed + u = self._toUnicode(markup, proposed) + self.markup = u + self.originalEncoding = proposed + except Exception, e: + # print "That didn't work!" + # print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _toUnicode(self, data, encoding): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding) + return newdata + + def _detectEncoding(self, xml_data, isHTML=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == '\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == '\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ + and (xml_data[2:4] != '\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ + (xml_data[2:4] != '\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == '\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() + xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) + if not xml_encoding_match and isHTML: + meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() + regexp = re.compile(meta_re, re.I) + xml_encoding_match = regexp.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if isHTML: + self.declaredHTMLEncoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, charset): + if not charset: return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ + ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + MS_CHARS = { '\x80' : ('euro', '20AC'), + '\x81' : ' ', + '\x82' : ('sbquo', '201A'), + '\x83' : ('fnof', '192'), + '\x84' : ('bdquo', '201E'), + '\x85' : ('hellip', '2026'), + '\x86' : ('dagger', '2020'), + '\x87' : ('Dagger', '2021'), + '\x88' : ('circ', '2C6'), + '\x89' : ('permil', '2030'), + '\x8A' : ('Scaron', '160'), + '\x8B' : ('lsaquo', '2039'), + '\x8C' : ('OElig', '152'), + '\x8D' : '?', + '\x8E' : ('#x17D', '17D'), + '\x8F' : '?', + '\x90' : '?', + '\x91' : ('lsquo', '2018'), + '\x92' : ('rsquo', '2019'), + '\x93' : ('ldquo', '201C'), + '\x94' : ('rdquo', '201D'), + '\x95' : ('bull', '2022'), + '\x96' : ('ndash', '2013'), + '\x97' : ('mdash', '2014'), + '\x98' : ('tilde', '2DC'), + '\x99' : ('trade', '2122'), + '\x9a' : ('scaron', '161'), + '\x9b' : ('rsaquo', '203A'), + '\x9c' : ('oelig', '153'), + '\x9d' : '?', + '\x9e' : ('#x17E', '17E'), + '\x9f' : ('Yuml', ''),} diff --git a/dammit.py.3.diff b/dammit.py.3.diff new file mode 100644 index 0000000..f6bab68 --- /dev/null +++ b/dammit.py.3.diff @@ -0,0 +1,70 @@ +1800c1800 +< smart_quotes_re = "([\x80-\x9f])" +--- +> smart_quotes_re = b"([\x80-\x9f])" +1952,1983c1952,1983 +< MS_CHARS = { '\x80' : ('euro', '20AC'), +< '\x81' : ' ', +< '\x82' : ('sbquo', '201A'), +< '\x83' : ('fnof', '192'), +< '\x84' : ('bdquo', '201E'), +< '\x85' : ('hellip', '2026'), +< '\x86' : ('dagger', '2020'), +< '\x87' : ('Dagger', '2021'), +< '\x88' : ('circ', '2C6'), +< '\x89' : ('permil', '2030'), +< '\x8A' : ('Scaron', '160'), +< '\x8B' : ('lsaquo', '2039'), +< '\x8C' : ('OElig', '152'), +< '\x8D' : '?', +< '\x8E' : ('#x17D', '17D'), +< '\x8F' : '?', +< '\x90' : '?', +< '\x91' : ('lsquo', '2018'), +< '\x92' : ('rsquo', '2019'), +< '\x93' : ('ldquo', '201C'), +< '\x94' : ('rdquo', '201D'), +< '\x95' : ('bull', '2022'), +< '\x96' : ('ndash', '2013'), +< '\x97' : ('mdash', '2014'), +< '\x98' : ('tilde', '2DC'), +< '\x99' : ('trade', '2122'), +< '\x9a' : ('scaron', '161'), +< '\x9b' : ('rsaquo', '203A'), +< '\x9c' : ('oelig', '153'), +< '\x9d' : '?', +< '\x9e' : ('#x17E', '17E'), +< '\x9f' : ('Yuml', ''),} +--- +> MS_CHARS = { b'\x80' : ('euro', '20AC'), +> b'\x81' : ' ', +> b'\x82' : ('sbquo', '201A'), +> b'\x83' : ('fnof', '192'), +> b'\x84' : ('bdquo', '201E'), +> b'\x85' : ('hellip', '2026'), +> b'\x86' : ('dagger', '2020'), +> b'\x87' : ('Dagger', '2021'), +> b'\x88' : ('circ', '2C6'), +> b'\x89' : ('permil', '2030'), +> b'\x8A' : ('Scaron', '160'), +> b'\x8B' : ('lsaquo', '2039'), +> b'\x8C' : ('OElig', '152'), +> b'\x8D' : '?', +> b'\x8E' : ('#x17D', '17D'), +> b'\x8F' : '?', +> b'\x90' : '?', +> b'\x91' : ('lsquo', '2018'), +> b'\x92' : ('rsquo', '2019'), +> b'\x93' : ('ldquo', '201C'), +> b'\x94' : ('rdquo', '201D'), +> b'\x95' : ('bull', '2022'), +> b'\x96' : ('ndash', '2013'), +> b'\x97' : ('mdash', '2014'), +> b'\x98' : ('tilde', '2DC'), +> b'\x99' : ('trade', '2122'), +> b'\x9a' : ('scaron', '161'), +> b'\x9b' : ('rsaquo', '203A'), +> b'\x9c' : ('oelig', '153'), +> b'\x9d' : '?', +> b'\x9e' : ('#x17E', '17E'), +> b'\x9f' : ('Yuml', ''),} @@ -1,6 +1,6 @@ #!/bin/sh mkdir python3 -for i in BeautifulSoup*.py +for i in BeautifulSoup*.py dammit.py do cp $i python3/ 2to3-3.0 -x next $i | patch -p0 python3/$i |