4 files changed, 51 insertions, 65 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index e23c9d9..f2c20de 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -120,9 +120,6 @@ class BeautifulStoneSoup(Tag):
     """
     ROOT_TAG_NAME = u'[document]'
 
-    # Used to detect the charset in a META tag; see handleSpecialMetaTag
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
     # Used when determining whether a text node is all whitespace and
     # can be replaced with a single space. A text node that contains
     # fancy Unicode spaces (usually non-breaking) should be left
@@ -272,13 +269,8 @@ class BeautifulStoneSoup(Tag):
                  or not self.parseOnlyThese.searchTag(name, attrs))):
             return None
 
-        containsSubstitutions = False
-        if name == 'meta' and self.builder.assume_html:
-            containsSubstitutions = self.handleSpecialMetaTag(attrs)
-
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
                   self.previous)
-        tag.containsSubstitutions = containsSubstitutions
         if self.previous:
             self.previous.next = tag
         self.previous = tag
@@ -293,60 +285,6 @@ class BeautifulStoneSoup(Tag):
     def handle_data(self, data):
         self.currentData.append(data)
 
-    def handleSpecialMetaTag(self, attrs):
-        """Beautiful Soup can detect a charset included in a META tag,
-        try to convert the document to that charset, and re-parse the
-        document from the beginning. Neither lxml nor html5lib does
-        this, so the feature is still here."""
-        httpEquiv = None
-        contentType = None
-        contentTypeIndex = None
-        tagNeedsEncodingSubstitution = False
-
-        if isinstance(attrs, dict):
-            httpEquiv = attrs.get('http-equiv')
-            contentType = attrs.get('content')
-        else:
-            # XXX do we need this?
-            for i in range(0, len(attrs)):
-                key, value = attrs[i]
-                key = key.lower()
-                if key == 'http-equiv':
-                    httpEquiv = value
-                elif key == 'content':
-                    contentType = value
-                    contentTypeIndex = i
-
-        if httpEquiv and contentType: # It's an interesting meta tag.
-            match = self.CHARSET_RE.search(contentType)
-            if match:
-                if (self.declaredHTMLEncoding is not None or
-                    self.originalEncoding == self.fromEncoding):
-                    # An HTML encoding was sniffed while converting
-                    # the document to Unicode, or an HTML encoding was
-                    # sniffed during a previous pass through the
-                    # document, or an encoding was specified
-                    # explicitly and it worked. Rewrite the meta tag.
-                    def rewrite(match):
-                        return match.group(1) + "%SOUP-ENCODING%"
-                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
-                    if isinstance(attrs, dict):
-                        attrs['content'] = newAttr
-                    else:
-                        attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
-                                                   newAttr)
-                    tagNeedsEncodingSubstitution = True
-                else:
-                    # This is our first pass through the document.
-                    # Go through it again with the encoding information.
-                    newCharset = match.group(3)
-                    if newCharset and newCharset != self.originalEncoding:
-                        self.declaredHTMLEncoding = newCharset
-                        self._feed(self.declaredHTMLEncoding)
-                        raise StopParsing
-                    pass
-        return tagNeedsEncodingSubstitution
-
 
 class BeautifulSoup(BeautifulStoneSoup):
     """A convenience class for parsing HTML without creating a builder."""
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..eb92e6b 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        return False  # The base builder performs no substitutions.
+
 
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,43 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                             'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions to detect the charset in a META tag
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        """Prepare a <meta> Content-Type tag for charset substitution.
+
+        Returns True if the charset in `tag` was replaced with the
+        %SOUP-ENCODING% placeholder, False otherwise. On the first
+        pass, a newly declared charset re-feeds the document and
+        raises StopParsing."""
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+        if (http_equiv is None or content is None
+            or http_equiv.lower() != 'content-type'):
+            return False
+        match = self.CHARSET_RE.search(content)
+        if not match:
+            return False
+
+        soup = self.soup
+        if (soup.declaredHTMLEncoding is not None or
+            soup.originalEncoding == soup.fromEncoding):
+            # An encoding was sniffed (now or on a previous pass) or
+            # was given explicitly and worked. Rewrite the meta tag.
+            tag['content'] = self.CHARSET_RE.sub(
+                lambda m: m.group(1) + "%SOUP-ENCODING%", content)
+            return True
+
+        # First pass: re-parse with the declared charset, if new.
+        newCharset = match.group(3)
+        if newCharset and newCharset != soup.originalEncoding:
+            # StopParsing isn't imported at module level here.
+            from beautifulsoup import StopParsing
+            soup.declaredHTMLEncoding = newCharset
+            soup._feed(soup.declaredHTMLEncoding)
+            raise StopParsing
+        return False
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 39e0e06..6e2bada 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -438,13 +438,16 @@ class Tag(PageElement, Entities):
         self.contents = []
         self.setup(parent, previous)
         self.hidden = False
-        self.containsSubstitutions = False
 
         if isinstance(attrs, types.DictType):
             self.attrs = [kv for kv in attrs.items()]
         else:
             self.attrs = list(attrs)
 
+        # Set up any substitutions, such as the charset in a META tag.
+        self.contains_substitutions = builder.set_up_substitutions(self)
+
+
     @property
     def string(self):
         """Convenience property to get the single string within this tag.
@@ -581,7 +584,7 @@ class Tag(PageElement, Entities):
             for key, val in self.attrs:
                 fmt = '%s="%s"'
                 if isString(val):
-                    if (self.containsSubstitutions
+                    if (self.contains_substitutions
                         and eventualEncoding is not None
                         and '%SOUP-ENCODING%' in val):
                         val = self.substituteEncoding(val, eventualEncoding)
@@ -882,6 +885,7 @@ class SoupStrainer:
                 result = matchAgainst == markup
         return result
 
+
 class ResultSet(list):
     """A ResultSet is just a list that keeps track of the SoupStrainer
     that created it."""
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index ab5b219..b002227 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -292,7 +292,7 @@ class TestLXMLBuilder(SoupTest):
         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
         self.assertEquals(parsed_meta['content'],
                           'text/html; charset=%SOUP-ENCODING%')
-        self.assertEquals(parsed_meta.containsSubstitutions, True)
+        self.assertEquals(parsed_meta.contains_substitutions, True)
 
         # For the rest of the story, see TestSubstitutions in
         # test_tree.py.