2 files changed, 52 insertions, 3 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..cf5e6c6 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        return False  # The base builder never rewrites a tag.
+
 
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,41 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                             'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions; group 3 is the charset named in a META tag.
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        """Mark a Content-Type META tag's charset for substitution.
+
+        Returns True if the charset became %SOUP-ENCODING%, else False.
+        Raises StopParsing after re-feeding with a new charset.
+        """
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+        if (http_equiv is None or content is None
+            or http_equiv.lower() != 'content-type'):
+            return False
+        # This is an interesting meta tag.
+        match = self.CHARSET_RE.search(content)
+        if not match:
+            return False
+
+        if (self.soup.declaredHTMLEncoding is not None or
+            self.soup.originalEncoding == self.soup.fromEncoding):
+            # The encoding is already settled (sniffed now or on an
+            # earlier pass, or given explicitly): rewrite the tag.
+            tag['content'] = self.CHARSET_RE.sub(
+                r"\g<1>%SOUP-ENCODING%", content)
+            return True
+
+        # First pass: parse again using the charset declared here.
+        new_charset = match.group(3)
+        if (new_charset is not None
+            and new_charset != self.soup.originalEncoding):
+            self.soup.declaredHTMLEncoding = new_charset
+            self.soup._feed(self.soup.declaredHTMLEncoding)
+            raise StopParsing
+        return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 736889f..dc95493 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -130,12 +130,19 @@ class Element(html5lib.treebuilders._base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
-        if attributes:
+        if attributes is not None and attributes != {}:
             for name, value in attributes.items():
                 self.element[name] =  value
-
+            # html5lib creates the Tag object first and only then sets
+            # its attributes, so the set_up_substitutions call that the
+            # Tag constructor makes automatically never saw these values.
+            # Call it manually now and store its result on the element
+            # as contains_substitutions.
+            self.element.contains_substitutions = (
+                self.soup.builder.set_up_substitutions(
+                    self.element))
     attributes = property(getAttributes, setAttributes)
-    
+
     def insertText(self, data, insertBefore=None):
         text = TextNode(NavigableString(data), self.soup)
         if insertBefore: