1 files changed, 44 insertions, 0 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..eb92e6b 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        return False  # base builder performs no substitution on any tag
+
 
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,43 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                             'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions to detect the charset in a META tag
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        """Rewrite a META tag's charset, or reparse using that charset.
+
+        Returns True only if tag['content'] was rewritten, else False.
+        Raises StopParsing once soup._feed() ran with the declared charset."""
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declaredHTMLEncoding is not None or
+                    self.soup.originalEncoding == self.soup.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    newCharset = match.group(3)
+                    if newCharset and newCharset != self.soup.originalEncoding:
+                        self.soup.declaredHTMLEncoding = newCharset
+                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        raise StopParsing  # NOTE(review): import not visible here
+        return False