5 files changed, 367 insertions, 372 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index c0f7482..4d40c34 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -32,17 +32,13 @@ Beautiful Soup defines classes for two main parsing strategies:
    or invalid. This class has web browser-like heuristics for
    obtaining a sensible parse tree in the face of common HTML errors.
 
-Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
-the encoding of an HTML or XML document, and converting it to
-Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
-
 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/documentation.html
 
 Here, have some legalese:
 
-Copyright (c) 2004-2008, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
 
 All rights reserved.
 
@@ -79,11 +75,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
 from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "3.1.0"
-__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
+__version__ = "4.0.0"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
 __license__ = "New-style BSD"
 
-import codecs
 import markupbase
 import types
 import re
@@ -96,6 +91,7 @@ try:
     set
 except NameError:
     from sets import Set as set
+from dammit import UnicodeDammit
 
 #These hacks make Beautiful Soup able to parse XML with namespaces
 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
@@ -1709,299 +1705,6 @@ class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup):
         return ICantBelieveItsValidHTMLBuilder()
 
 
-######################################################
-#
-# Bonus library: Unicode, Dammit
-#
-# This class forces XML data into a standard format (usually to UTF-8
-# or Unicode).  It is heavily based on code from Mark Pilgrim's
-# Universal Feed Parser. It does not rewrite the XML or HTML to
-# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
-# (XML) and BeautifulSoup.handleSpecialMetaTag (HTML).
-
-# Autodetects character encodings.
-# Download from http://chardet.feedparser.org/
-try:
-    import chardet
-#    import chardet.constants
-#    chardet.constants._debug = 1
-except ImportError:
-    chardet = None
-
-# cjkcodecs and iconv_codec make Python know about more character encodings.
-# Both are available from http://cjkpython.i18n.org/
-# They're built in if you use Python 2.4.
-try:
-    import cjkcodecs.aliases
-except ImportError:
-    pass
-try:
-    import iconv_codec
-except ImportError:
-    pass
-
-class UnicodeDammit:
-    """A class for detecting the encoding of a *ML document and
-    converting it to a Unicode string. If the source encoding is
-    windows-1252, can replace MS smart quotes with their HTML or XML
-    equivalents."""
-
-    # This dictionary maps commonly seen values for "charset" in HTML
-    # meta tags to the corresponding Python codec names. It only covers
-    # values that aren't in Python's aliases and can't be determined
-    # by the heuristics in find_codec.
-    CHARSET_ALIASES = { "macintosh" : "mac-roman",
-                        "x-sjis" : "shift-jis" }
-
-    def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml', isHTML=False):
-        self.declaredHTMLEncoding = None
-        self.markup, documentEncoding, sniffedEncoding = \
-                     self._detectEncoding(markup, isHTML)
-        self.smartQuotesTo = smartQuotesTo
-        self.triedEncodings = []
-        if markup == '' or isinstance(markup, unicode):
-            self.originalEncoding = None
-            self.unicode = unicode(markup)
-            return
-
-        u = None
-        for proposedEncoding in overrideEncodings:
-            u = self._convertFrom(proposedEncoding)
-            if u: break
-        if not u:
-            for proposedEncoding in (documentEncoding, sniffedEncoding):
-                u = self._convertFrom(proposedEncoding)
-                if u: break
-
-        # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
-
-        # As a last resort, try utf-8 and windows-1252:
-        if not u:
-            for proposed_encoding in ("utf-8", "windows-1252"):
-                u = self._convertFrom(proposed_encoding)
-                if u: break
-
-        self.unicode = u
-        if not u: self.originalEncoding = None
-
-    def _subMSChar(self, match):
-        """Changes a MS smart quote character to an XML or HTML
-        entity."""
-        orig = match.group(1)
-        sub = self.MS_CHARS.get(orig)
-        if type(sub) == types.TupleType:
-            if self.smartQuotesTo == 'xml':
-                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
-            else:
-                sub = '&'.encode() + sub[0].encode() + ';'.encode()
-        else:
-            sub = sub.encode()
-        return sub
-
-    def _convertFrom(self, proposed):
-        proposed = self.find_codec(proposed)
-        if not proposed or proposed in self.triedEncodings:
-            return None
-        self.triedEncodings.append(proposed)
-        markup = self.markup
-
-        # Convert smart quotes to HTML if coming from an encoding
-        # that might have them.
-        if self.smartQuotesTo and proposed.lower() in("windows-1252",
-                                                      "iso-8859-1",
-                                                      "iso-8859-2"):
-            smart_quotes_re = "([\x80-\x9f])"
-            smart_quotes_compiled = re.compile(smart_quotes_re)
-            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
-
-        try:
-            # print "Trying to convert document to %s" % proposed
-            u = self._toUnicode(markup, proposed)
-            self.markup = u
-            self.originalEncoding = proposed
-        except Exception, e:
-            # print "That didn't work!"
-            # print e
-            return None
-        #print "Correct encoding: %s" % proposed
-        return self.markup
-
-    def _toUnicode(self, data, encoding):
-        '''Given a string and its encoding, decodes the string into Unicode.
-        %encoding is a string recognized by encodings.aliases'''
-
-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding)
-        return newdata
-
-    def _detectEncoding(self, xml_data, isHTML=False):
-        """Given a document, tries to detect its XML encoding."""
-        xml_encoding = sniffed_xml_encoding = None
-        try:
-            if xml_data[:4] == '\x4c\x6f\xa7\x94':
-                # EBCDIC
-                xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == '\x00\x3c\x00\x3f':
-                # UTF-16BE
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
-                     and (xml_data[2:4] != '\x00\x00'):
-                # UTF-16BE with BOM
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x3f\x00':
-                # UTF-16LE
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
-                     (xml_data[2:4] != '\x00\x00'):
-                # UTF-16LE with BOM
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\x00\x3c':
-                # UTF-32BE
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x00\x00':
-                # UTF-32LE
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\xfe\xff':
-                # UTF-32BE with BOM
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\xff\xfe\x00\x00':
-                # UTF-32LE with BOM
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == '\xef\xbb\xbf':
-                # UTF-8 with BOM
-                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-            else:
-                sniffed_xml_encoding = 'ascii'
-                pass
-        except:
-            xml_encoding_match = None
-        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
-        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
-        if not xml_encoding_match and isHTML:
-            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
-            regexp = re.compile(meta_re, re.I)
-            xml_encoding_match = regexp.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if isHTML:
-                self.declaredHTMLEncoding = xml_encoding
-            if sniffed_xml_encoding and \
-               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
-                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
-                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
-                                 'utf16', 'u16')):
-                xml_encoding = sniffed_xml_encoding
-        return xml_data, xml_encoding, sniffed_xml_encoding
-
-
-    def find_codec(self, charset):
-        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
-               or (charset and self._codec(charset.replace("-", ""))) \
-               or (charset and self._codec(charset.replace("-", "_"))) \
-               or charset
-
-    def _codec(self, charset):
-        if not charset: return charset
-        codec = None
-        try:
-            codecs.lookup(charset)
-            codec = charset
-        except (LookupError, ValueError):
-            pass
-        return codec
-
-    EBCDIC_TO_ASCII_MAP = None
-    def _ebcdic_to_ascii(self, s):
-        c = self.__class__
-        if not c.EBCDIC_TO_ASCII_MAP:
-            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
-                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
-                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
-                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
-                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
-                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
-                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
-                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
-                    250,251,252,253,254,255)
-            import string
-            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
-            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
-        return s.translate(c.EBCDIC_TO_ASCII_MAP)
-
-    MS_CHARS = { '\x80' : ('euro', '20AC'),
-                 '\x81' : ' ',
-                 '\x82' : ('sbquo', '201A'),
-                 '\x83' : ('fnof', '192'),
-                 '\x84' : ('bdquo', '201E'),
-                 '\x85' : ('hellip', '2026'),
-                 '\x86' : ('dagger', '2020'),
-                 '\x87' : ('Dagger', '2021'),
-                 '\x88' : ('circ', '2C6'),
-                 '\x89' : ('permil', '2030'),
-                 '\x8A' : ('Scaron', '160'),
-                 '\x8B' : ('lsaquo', '2039'),
-                 '\x8C' : ('OElig', '152'),
-                 '\x8D' : '?',
-                 '\x8E' : ('#x17D', '17D'),
-                 '\x8F' : '?',
-                 '\x90' : '?',
-                 '\x91' : ('lsquo', '2018'),
-                 '\x92' : ('rsquo', '2019'),
-                 '\x93' : ('ldquo', '201C'),
-                 '\x94' : ('rdquo', '201D'),
-                 '\x95' : ('bull', '2022'),
-                 '\x96' : ('ndash', '2013'),
-                 '\x97' : ('mdash', '2014'),
-                 '\x98' : ('tilde', '2DC'),
-                 '\x99' : ('trade', '2122'),
-                 '\x9a' : ('scaron', '161'),
-                 '\x9b' : ('rsaquo', '203A'),
-                 '\x9c' : ('oelig', '153'),
-                 '\x9d' : '?',
-                 '\x9e' : ('#x17E', '17E'),
-                 '\x9f' : ('Yuml', ''),}
-
-#######################################################################
-
-
 #By default, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
diff --git a/BeautifulSoup.py.3.diff b/BeautifulSoup.py.3.diff
index 3943cb0..ad3d6e1 100644
--- a/BeautifulSoup.py.3.diff
+++ b/BeautifulSoup.py.3.diff
@@ -10,73 +10,3 @@
 <                 i = g.next()
 ---
 >                 i = g.__next__()
-1800c1800
-<             smart_quotes_re = "([\x80-\x9f])"
----
->             smart_quotes_re = b"([\x80-\x9f])"
-1952,1983c1952,1983
-<     MS_CHARS = { '\x80' : ('euro', '20AC'),
-<                  '\x81' : ' ',
-<                  '\x82' : ('sbquo', '201A'),
-<                  '\x83' : ('fnof', '192'),
-<                  '\x84' : ('bdquo', '201E'),
-<                  '\x85' : ('hellip', '2026'),
-<                  '\x86' : ('dagger', '2020'),
-<                  '\x87' : ('Dagger', '2021'),
-<                  '\x88' : ('circ', '2C6'),
-<                  '\x89' : ('permil', '2030'),
-<                  '\x8A' : ('Scaron', '160'),
-<                  '\x8B' : ('lsaquo', '2039'),
-<                  '\x8C' : ('OElig', '152'),
-<                  '\x8D' : '?',
-<                  '\x8E' : ('#x17D', '17D'),
-<                  '\x8F' : '?',
-<                  '\x90' : '?',
-<                  '\x91' : ('lsquo', '2018'),
-<                  '\x92' : ('rsquo', '2019'),
-<                  '\x93' : ('ldquo', '201C'),
-<                  '\x94' : ('rdquo', '201D'),
-<                  '\x95' : ('bull', '2022'),
-<                  '\x96' : ('ndash', '2013'),
-<                  '\x97' : ('mdash', '2014'),
-<                  '\x98' : ('tilde', '2DC'),
-<                  '\x99' : ('trade', '2122'),
-<                  '\x9a' : ('scaron', '161'),
-<                  '\x9b' : ('rsaquo', '203A'),
-<                  '\x9c' : ('oelig', '153'),
-<                  '\x9d' : '?',
-<                  '\x9e' : ('#x17E', '17E'),
-<                  '\x9f' : ('Yuml', ''),}
----
->     MS_CHARS = { b'\x80' : ('euro', '20AC'),
->                  b'\x81' : ' ',
->                  b'\x82' : ('sbquo', '201A'),
->                  b'\x83' : ('fnof', '192'),
->                  b'\x84' : ('bdquo', '201E'),
->                  b'\x85' : ('hellip', '2026'),
->                  b'\x86' : ('dagger', '2020'),
->                  b'\x87' : ('Dagger', '2021'),
->                  b'\x88' : ('circ', '2C6'),
->                  b'\x89' : ('permil', '2030'),
->                  b'\x8A' : ('Scaron', '160'),
->                  b'\x8B' : ('lsaquo', '2039'),
->                  b'\x8C' : ('OElig', '152'),
->                  b'\x8D' : '?',
->                  b'\x8E' : ('#x17D', '17D'),
->                  b'\x8F' : '?',
->                  b'\x90' : '?',
->                  b'\x91' : ('lsquo', '2018'),
->                  b'\x92' : ('rsquo', '2019'),
->                  b'\x93' : ('ldquo', '201C'),
->                  b'\x94' : ('rdquo', '201D'),
->                  b'\x95' : ('bull', '2022'),
->                  b'\x96' : ('ndash', '2013'),
->                  b'\x97' : ('mdash', '2014'),
->                  b'\x98' : ('tilde', '2DC'),
->                  b'\x99' : ('trade', '2122'),
->                  b'\x9a' : ('scaron', '161'),
->                  b'\x9b' : ('rsaquo', '203A'),
->                  b'\x9c' : ('oelig', '153'),
->                  b'\x9d' : '?',
->                  b'\x9e' : ('#x17E', '17E'),
->                  b'\x9f' : ('Yuml', ''),}
diff --git a/dammit.py b/dammit.py
new file mode 100644
index 0000000..78bd4b2
--- /dev/null
+++ b/dammit.py
@@ -0,0 +1,292 @@
+"""Beautiful Soup bonus library: Unicode, Dammit
+
+This class forces XML data into a standard format (usually to UTF-8 or
+Unicode).  It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It does not rewrite the XML or HTML to reflect a new
+encoding; that's Beautiful Soup's job.
+"""
+
+import codecs
+import re
+import types
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+    import chardet
+#    import chardet.constants
+#    chardet.constants._debug = 1
+except ImportError:
+    chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+    import cjkcodecs.aliases
+except ImportError:
+    pass
+try:
+    import iconv_codec
+except ImportError:
+    pass
+
+
+class UnicodeDammit:
+    """A class for detecting the encoding of a *ML document and
+    converting it to a Unicode string. If the source encoding is
+    windows-1252, can replace MS smart quotes with their HTML or XML
+    equivalents."""
+
+    # This dictionary maps commonly seen values for "charset" in HTML
+    # meta tags to the corresponding Python codec names. It only covers
+    # values that aren't in Python's aliases and can't be determined
+    # by the heuristics in find_codec.
+    CHARSET_ALIASES = { "macintosh" : "mac-roman",
+                        "x-sjis" : "shift-jis" }
+
+    def __init__(self, markup, overrideEncodings=[],
+                 smartQuotesTo='xml', isHTML=False):
+        self.declaredHTMLEncoding = None
+        self.markup, documentEncoding, sniffedEncoding = \
+                     self._detectEncoding(markup, isHTML)
+        self.smartQuotesTo = smartQuotesTo
+        self.triedEncodings = []
+        if markup == '' or isinstance(markup, unicode):
+            self.originalEncoding = None
+            self.unicode = unicode(markup)
+            return
+
+        u = None
+        for proposedEncoding in overrideEncodings:
+            u = self._convertFrom(proposedEncoding)
+            if u: break
+        if not u:
+            for proposedEncoding in (documentEncoding, sniffedEncoding):
+                u = self._convertFrom(proposedEncoding)
+                if u: break
+
+        # If no luck and we have auto-detection library, try that:
+        if not u and chardet and not isinstance(self.markup, unicode):
+            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+        # As a last resort, try utf-8 and windows-1252:
+        if not u:
+            for proposed_encoding in ("utf-8", "windows-1252"):
+                u = self._convertFrom(proposed_encoding)
+                if u: break
+
+        self.unicode = u
+        if not u: self.originalEncoding = None
+
+    def _subMSChar(self, match):
+        """Changes a MS smart quote character to an XML or HTML
+        entity."""
+        orig = match.group(1)
+        sub = self.MS_CHARS.get(orig)
+        if type(sub) == types.TupleType:
+            if self.smartQuotesTo == 'xml':
+                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+            else:
+                sub = '&'.encode() + sub[0].encode() + ';'.encode()
+        else:
+            sub = sub.encode()
+        return sub
+
+    def _convertFrom(self, proposed):
+        proposed = self.find_codec(proposed)
+        if not proposed or proposed in self.triedEncodings:
+            return None
+        self.triedEncodings.append(proposed)
+        markup = self.markup
+
+        # Convert smart quotes to HTML if coming from an encoding
+        # that might have them.
+        if self.smartQuotesTo and proposed.lower() in("windows-1252",
+                                                      "iso-8859-1",
+                                                      "iso-8859-2"):
+            smart_quotes_re = "([\x80-\x9f])"
+            smart_quotes_compiled = re.compile(smart_quotes_re)
+            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+
+        try:
+            # print "Trying to convert document to %s" % proposed
+            u = self._toUnicode(markup, proposed)
+            self.markup = u
+            self.originalEncoding = proposed
+        except Exception, e:
+            # print "That didn't work!"
+            # print e
+            return None
+        #print "Correct encoding: %s" % proposed
+        return self.markup
+
+    def _toUnicode(self, data, encoding):
+        '''Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases'''
+
+        # strip Byte Order Mark (if present)
+        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == '\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == '\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == '\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        newdata = unicode(data, encoding)
+        return newdata
+
+    def _detectEncoding(self, xml_data, isHTML=False):
+        """Given a document, tries to detect its XML encoding."""
+        xml_encoding = sniffed_xml_encoding = None
+        try:
+            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+                # EBCDIC
+                xml_data = self._ebcdic_to_ascii(xml_data)
+            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+                # UTF-16BE
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+                     and (xml_data[2:4] != '\x00\x00'):
+                # UTF-16BE with BOM
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+                # UTF-16LE
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+                     (xml_data[2:4] != '\x00\x00'):
+                # UTF-16LE with BOM
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\x00\x3c':
+                # UTF-32BE
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x00\x00':
+                # UTF-32LE
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\xfe\xff':
+                # UTF-32BE with BOM
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\xff\xfe\x00\x00':
+                # UTF-32LE with BOM
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+            elif xml_data[:3] == '\xef\xbb\xbf':
+                # UTF-8 with BOM
+                sniffed_xml_encoding = 'utf-8'
+                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+            else:
+                sniffed_xml_encoding = 'ascii'
+                pass
+        except:
+            xml_encoding_match = None
+        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+        if not xml_encoding_match and isHTML:
+            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+            regexp = re.compile(meta_re, re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].decode(
+                'ascii').lower()
+            if isHTML:
+                self.declaredHTMLEncoding = xml_encoding
+            if sniffed_xml_encoding and \
+               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+                                 'utf16', 'u16')):
+                xml_encoding = sniffed_xml_encoding
+        return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+    def find_codec(self, charset):
+        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+               or (charset and self._codec(charset.replace("-", ""))) \
+               or (charset and self._codec(charset.replace("-", "_"))) \
+               or charset
+
+    def _codec(self, charset):
+        if not charset: return charset
+        codec = None
+        try:
+            codecs.lookup(charset)
+            codec = charset
+        except (LookupError, ValueError):
+            pass
+        return codec
+
+    EBCDIC_TO_ASCII_MAP = None
+    def _ebcdic_to_ascii(self, s):
+        c = self.__class__
+        if not c.EBCDIC_TO_ASCII_MAP:
+            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+                    250,251,252,253,254,255)
+            import string
+            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+        return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+    MS_CHARS = { '\x80' : ('euro', '20AC'),
+                 '\x81' : ' ',
+                 '\x82' : ('sbquo', '201A'),
+                 '\x83' : ('fnof', '192'),
+                 '\x84' : ('bdquo', '201E'),
+                 '\x85' : ('hellip', '2026'),
+                 '\x86' : ('dagger', '2020'),
+                 '\x87' : ('Dagger', '2021'),
+                 '\x88' : ('circ', '2C6'),
+                 '\x89' : ('permil', '2030'),
+                 '\x8A' : ('Scaron', '160'),
+                 '\x8B' : ('lsaquo', '2039'),
+                 '\x8C' : ('OElig', '152'),
+                 '\x8D' : '?',
+                 '\x8E' : ('#x17D', '17D'),
+                 '\x8F' : '?',
+                 '\x90' : '?',
+                 '\x91' : ('lsquo', '2018'),
+                 '\x92' : ('rsquo', '2019'),
+                 '\x93' : ('ldquo', '201C'),
+                 '\x94' : ('rdquo', '201D'),
+                 '\x95' : ('bull', '2022'),
+                 '\x96' : ('ndash', '2013'),
+                 '\x97' : ('mdash', '2014'),
+                 '\x98' : ('tilde', '2DC'),
+                 '\x99' : ('trade', '2122'),
+                 '\x9a' : ('scaron', '161'),
+                 '\x9b' : ('rsaquo', '203A'),
+                 '\x9c' : ('oelig', '153'),
+                 '\x9d' : '?',
+                 '\x9e' : ('#x17E', '17E'),
+                 '\x9f' : ('Yuml', ''),}
diff --git a/dammit.py.3.diff b/dammit.py.3.diff
new file mode 100644
index 0000000..f6bab68
--- /dev/null
+++ b/dammit.py.3.diff
@@ -0,0 +1,70 @@
+1800c1800
+<             smart_quotes_re = "([\x80-\x9f])"
+---
+>             smart_quotes_re = b"([\x80-\x9f])"
+1952,1983c1952,1983
+<     MS_CHARS = { '\x80' : ('euro', '20AC'),
+<                  '\x81' : ' ',
+<                  '\x82' : ('sbquo', '201A'),
+<                  '\x83' : ('fnof', '192'),
+<                  '\x84' : ('bdquo', '201E'),
+<                  '\x85' : ('hellip', '2026'),
+<                  '\x86' : ('dagger', '2020'),
+<                  '\x87' : ('Dagger', '2021'),
+<                  '\x88' : ('circ', '2C6'),
+<                  '\x89' : ('permil', '2030'),
+<                  '\x8A' : ('Scaron', '160'),
+<                  '\x8B' : ('lsaquo', '2039'),
+<                  '\x8C' : ('OElig', '152'),
+<                  '\x8D' : '?',
+<                  '\x8E' : ('#x17D', '17D'),
+<                  '\x8F' : '?',
+<                  '\x90' : '?',
+<                  '\x91' : ('lsquo', '2018'),
+<                  '\x92' : ('rsquo', '2019'),
+<                  '\x93' : ('ldquo', '201C'),
+<                  '\x94' : ('rdquo', '201D'),
+<                  '\x95' : ('bull', '2022'),
+<                  '\x96' : ('ndash', '2013'),
+<                  '\x97' : ('mdash', '2014'),
+<                  '\x98' : ('tilde', '2DC'),
+<                  '\x99' : ('trade', '2122'),
+<                  '\x9a' : ('scaron', '161'),
+<                  '\x9b' : ('rsaquo', '203A'),
+<                  '\x9c' : ('oelig', '153'),
+<                  '\x9d' : '?',
+<                  '\x9e' : ('#x17E', '17E'),
+<                  '\x9f' : ('Yuml', ''),}
+---
+>     MS_CHARS = { b'\x80' : ('euro', '20AC'),
+>                  b'\x81' : ' ',
+>                  b'\x82' : ('sbquo', '201A'),
+>                  b'\x83' : ('fnof', '192'),
+>                  b'\x84' : ('bdquo', '201E'),
+>                  b'\x85' : ('hellip', '2026'),
+>                  b'\x86' : ('dagger', '2020'),
+>                  b'\x87' : ('Dagger', '2021'),
+>                  b'\x88' : ('circ', '2C6'),
+>                  b'\x89' : ('permil', '2030'),
+>                  b'\x8A' : ('Scaron', '160'),
+>                  b'\x8B' : ('lsaquo', '2039'),
+>                  b'\x8C' : ('OElig', '152'),
+>                  b'\x8D' : '?',
+>                  b'\x8E' : ('#x17D', '17D'),
+>                  b'\x8F' : '?',
+>                  b'\x90' : '?',
+>                  b'\x91' : ('lsquo', '2018'),
+>                  b'\x92' : ('rsquo', '2019'),
+>                  b'\x93' : ('ldquo', '201C'),
+>                  b'\x94' : ('rdquo', '201D'),
+>                  b'\x95' : ('bull', '2022'),
+>                  b'\x96' : ('ndash', '2013'),
+>                  b'\x97' : ('mdash', '2014'),
+>                  b'\x98' : ('tilde', '2DC'),
+>                  b'\x99' : ('trade', '2122'),
+>                  b'\x9a' : ('scaron', '161'),
+>                  b'\x9b' : ('rsaquo', '203A'),
+>                  b'\x9c' : ('oelig', '153'),
+>                  b'\x9d' : '?',
+>                  b'\x9e' : ('#x17E', '17E'),
+>                  b'\x9f' : ('Yuml', ''),}
diff --git a/to3.sh b/to3.sh
index 60c9b7d..f24a121 100755
--- a/to3.sh
+++ b/to3.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 mkdir python3
-for i in BeautifulSoup*.py
+for i in BeautifulSoup*.py dammit.py
 do
     cp $i python3/
     2to3-3.0 -x next $i | patch -p0 python3/$i