Split Unicode Dammit into a separate library.

author: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-10 09:10:10 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-10 09:10:10 -0400
commit: 20268a5bc881fc21af089047e03a78870e15d112 (patch)
tree: 3233c14a829e0ad064f72c90ac0989eaee527708 /BeautifulSoup.py
parent: dea80d87dfa914565de15da2b06d00e620a82ae3 (diff)
1 files changed, 4 insertions, 301 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index c0f7482..4d40c34 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -32,17 +32,13 @@ Beautiful Soup defines classes for two main parsing strategies:
    or invalid. This class has web browser-like heuristics for
    obtaining a sensible parse tree in the face of common HTML errors.
 
-Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
-the encoding of an HTML or XML document, and converting it to
-Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
-
 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/documentation.html
 
 Here, have some legalese:
 
-Copyright (c) 2004-2008, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
 
 All rights reserved.
 
@@ -79,11 +75,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
 from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "3.1.0"
-__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
+__version__ = "4.0.0"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
 __license__ = "New-style BSD"
 
-import codecs
 import markupbase
 import types
 import re
@@ -96,6 +91,7 @@ try:
     set
 except NameError:
     from sets import Set as set
+from dammit import UnicodeDammit
 
 #These hacks make Beautiful Soup able to parse XML with namespaces
 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
@@ -1709,299 +1705,6 @@ class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup):
         return ICantBelieveItsValidHTMLBuilder()
 
 
-######################################################
-#
-# Bonus library: Unicode, Dammit
-#
-# This class forces XML data into a standard format (usually to UTF-8
-# or Unicode).  It is heavily based on code from Mark Pilgrim's
-# Universal Feed Parser. It does not rewrite the XML or HTML to
-# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
-# (XML) and BeautifulSoup.handleSpecialMetaTag (HTML).
-
-# Autodetects character encodings.
-# Download from http://chardet.feedparser.org/
-try:
-    import chardet
-#    import chardet.constants
-#    chardet.constants._debug = 1
-except ImportError:
-    chardet = None
-
-# cjkcodecs and iconv_codec make Python know about more character encodings.
-# Both are available from http://cjkpython.i18n.org/
-# They're built in if you use Python 2.4.
-try:
-    import cjkcodecs.aliases
-except ImportError:
-    pass
-try:
-    import iconv_codec
-except ImportError:
-    pass
-
-class UnicodeDammit:
-    """A class for detecting the encoding of a *ML document and
-    converting it to a Unicode string. If the source encoding is
-    windows-1252, can replace MS smart quotes with their HTML or XML
-    equivalents."""
-
-    # This dictionary maps commonly seen values for "charset" in HTML
-    # meta tags to the corresponding Python codec names. It only covers
-    # values that aren't in Python's aliases and can't be determined
-    # by the heuristics in find_codec.
-    CHARSET_ALIASES = { "macintosh" : "mac-roman",
-                        "x-sjis" : "shift-jis" }
-
-    def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml', isHTML=False):
-        self.declaredHTMLEncoding = None
-        self.markup, documentEncoding, sniffedEncoding = \
-                     self._detectEncoding(markup, isHTML)
-        self.smartQuotesTo = smartQuotesTo
-        self.triedEncodings = []
-        if markup == '' or isinstance(markup, unicode):
-            self.originalEncoding = None
-            self.unicode = unicode(markup)
-            return
-
-        u = None
-        for proposedEncoding in overrideEncodings:
-            u = self._convertFrom(proposedEncoding)
-            if u: break
-        if not u:
-            for proposedEncoding in (documentEncoding, sniffedEncoding):
-                u = self._convertFrom(proposedEncoding)
-                if u: break
-
-        # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
-
-        # As a last resort, try utf-8 and windows-1252:
-        if not u:
-            for proposed_encoding in ("utf-8", "windows-1252"):
-                u = self._convertFrom(proposed_encoding)
-                if u: break
-
-        self.unicode = u
-        if not u: self.originalEncoding = None
-
-    def _subMSChar(self, match):
-        """Changes a MS smart quote character to an XML or HTML
-        entity."""
-        orig = match.group(1)
-        sub = self.MS_CHARS.get(orig)
-        if type(sub) == types.TupleType:
-            if self.smartQuotesTo == 'xml':
-                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
-            else:
-                sub = '&'.encode() + sub[0].encode() + ';'.encode()
-        else:
-            sub = sub.encode()
-        return sub
-
-    def _convertFrom(self, proposed):
-        proposed = self.find_codec(proposed)
-        if not proposed or proposed in self.triedEncodings:
-            return None
-        self.triedEncodings.append(proposed)
-        markup = self.markup
-
-        # Convert smart quotes to HTML if coming from an encoding
-        # that might have them.
-        if self.smartQuotesTo and proposed.lower() in("windows-1252",
-                                                      "iso-8859-1",
-                                                      "iso-8859-2"):
-            smart_quotes_re = "([\x80-\x9f])"
-            smart_quotes_compiled = re.compile(smart_quotes_re)
-            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
-
-        try:
-            # print "Trying to convert document to %s" % proposed
-            u = self._toUnicode(markup, proposed)
-            self.markup = u
-            self.originalEncoding = proposed
-        except Exception, e:
-            # print "That didn't work!"
-            # print e
-            return None
-        #print "Correct encoding: %s" % proposed
-        return self.markup
-
-    def _toUnicode(self, data, encoding):
-        '''Given a string and its encoding, decodes the string into Unicode.
-        %encoding is a string recognized by encodings.aliases'''
-
-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding)
-        return newdata
-
-    def _detectEncoding(self, xml_data, isHTML=False):
-        """Given a document, tries to detect its XML encoding."""
-        xml_encoding = sniffed_xml_encoding = None
-        try:
-            if xml_data[:4] == '\x4c\x6f\xa7\x94':
-                # EBCDIC
-                xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == '\x00\x3c\x00\x3f':
-                # UTF-16BE
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
-                     and (xml_data[2:4] != '\x00\x00'):
-                # UTF-16BE with BOM
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x3f\x00':
-                # UTF-16LE
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
-                     (xml_data[2:4] != '\x00\x00'):
-                # UTF-16LE with BOM
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\x00\x3c':
-                # UTF-32BE
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x00\x00':
-                # UTF-32LE
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\xfe\xff':
-                # UTF-32BE with BOM
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\xff\xfe\x00\x00':
-                # UTF-32LE with BOM
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == '\xef\xbb\xbf':
-                # UTF-8 with BOM
-                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-            else:
-                sniffed_xml_encoding = 'ascii'
-                pass
-        except:
-            xml_encoding_match = None
-        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
-        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
-        if not xml_encoding_match and isHTML:
-            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
-            regexp = re.compile(meta_re, re.I)
-            xml_encoding_match = regexp.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if isHTML:
-                self.declaredHTMLEncoding = xml_encoding
-            if sniffed_xml_encoding and \
-               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
-                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
-                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
-                                 'utf16', 'u16')):
-                xml_encoding = sniffed_xml_encoding
-        return xml_data, xml_encoding, sniffed_xml_encoding
-
-
-    def find_codec(self, charset):
-        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
-               or (charset and self._codec(charset.replace("-", ""))) \
-               or (charset and self._codec(charset.replace("-", "_"))) \
-               or charset
-
-    def _codec(self, charset):
-        if not charset: return charset
-        codec = None
-        try:
-            codecs.lookup(charset)
-            codec = charset
-        except (LookupError, ValueError):
-            pass
-        return codec
-
-    EBCDIC_TO_ASCII_MAP = None
-    def _ebcdic_to_ascii(self, s):
-        c = self.__class__
-        if not c.EBCDIC_TO_ASCII_MAP:
-            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
-                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
-                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
-                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
-                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
-                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
-                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
-                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
-                    250,251,252,253,254,255)
-            import string
-            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
-            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
-        return s.translate(c.EBCDIC_TO_ASCII_MAP)
-
-    MS_CHARS = { '\x80' : ('euro', '20AC'),
-                 '\x81' : ' ',
-                 '\x82' : ('sbquo', '201A'),
-                 '\x83' : ('fnof', '192'),
-                 '\x84' : ('bdquo', '201E'),
-                 '\x85' : ('hellip', '2026'),
-                 '\x86' : ('dagger', '2020'),
-                 '\x87' : ('Dagger', '2021'),
-                 '\x88' : ('circ', '2C6'),
-                 '\x89' : ('permil', '2030'),
-                 '\x8A' : ('Scaron', '160'),
-                 '\x8B' : ('lsaquo', '2039'),
-                 '\x8C' : ('OElig', '152'),
-                 '\x8D' : '?',
-                 '\x8E' : ('#x17D', '17D'),
-                 '\x8F' : '?',
-                 '\x90' : '?',
-                 '\x91' : ('lsquo', '2018'),
-                 '\x92' : ('rsquo', '2019'),
-                 '\x93' : ('ldquo', '201C'),
-                 '\x94' : ('rdquo', '201D'),
-                 '\x95' : ('bull', '2022'),
-                 '\x96' : ('ndash', '2013'),
-                 '\x97' : ('mdash', '2014'),
-                 '\x98' : ('tilde', '2DC'),
-                 '\x99' : ('trade', '2122'),
-                 '\x9a' : ('scaron', '161'),
-                 '\x9b' : ('rsaquo', '203A'),
-                 '\x9c' : ('oelig', '153'),
-                 '\x9d' : '?',
-                 '\x9e' : ('#x17E', '17E'),
-                 '\x9f' : ('Yuml', ''),}
-
-#######################################################################
-
-
 #By default, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
author	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-10 09:10:10 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-10 09:10:10 -0400
commit	20268a5bc881fc21af089047e03a78870e15d112 (patch)
tree	3233c14a829e0ad064f72c90ac0989eaee527708 /BeautifulSoup.py
parent	dea80d87dfa914565de15da2b06d00e620a82ae3 (diff)