Refactored the code that sets up substitutions in attribute values, and made content-type substitution work with html5lib.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 09:58:07 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 09:58:07 -0500
commit: 66cbef12d959149746b3361f227f2a0328a31469 (patch)
tree: c6772c648933ad1477dc642aa598f34508870fff /beautifulsoup
parent: cb85520f7627a914e10e2d3ea52d7066bdf3984d (diff)
parent: d9462ef1b2760ccb6273903abcd7d253445716a4 (diff)
4 files changed, 61 insertions, 70 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index e23c9d9..f2c20de 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -120,9 +120,6 @@ class BeautifulStoneSoup(Tag):
     """
     ROOT_TAG_NAME = u'[document]'
 
-    # Used to detect the charset in a META tag; see handleSpecialMetaTag
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
     # Used when determining whether a text node is all whitespace and
     # can be replaced with a single space. A text node that contains
     # fancy Unicode spaces (usually non-breaking) should be left
@@ -272,13 +269,8 @@ class BeautifulStoneSoup(Tag):
                  or not self.parseOnlyThese.searchTag(name, attrs))):
             return None
 
-        containsSubstitutions = False
-        if name == 'meta' and self.builder.assume_html:
-            containsSubstitutions = self.handleSpecialMetaTag(attrs)
-
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
                   self.previous)
-        tag.containsSubstitutions = containsSubstitutions
         if self.previous:
             self.previous.next = tag
         self.previous = tag
@@ -293,60 +285,6 @@ class BeautifulStoneSoup(Tag):
     def handle_data(self, data):
         self.currentData.append(data)
 
-    def handleSpecialMetaTag(self, attrs):
-        """Beautiful Soup can detect a charset included in a META tag,
-        try to convert the document to that charset, and re-parse the
-        document from the beginning. Neither lxml nor html5lib does
-        this, so the feature is still here."""
-        httpEquiv = None
-        contentType = None
-        contentTypeIndex = None
-        tagNeedsEncodingSubstitution = False
-
-        if isinstance(attrs, dict):
-            httpEquiv = attrs.get('http-equiv')
-            contentType = attrs.get('content')
-        else:
-            # XXX do we need this?
-            for i in range(0, len(attrs)):
-                key, value = attrs[i]
-                key = key.lower()
-                if key == 'http-equiv':
-                    httpEquiv = value
-                elif key == 'content':
-                    contentType = value
-                    contentTypeIndex = i
-
-        if httpEquiv and contentType: # It's an interesting meta tag.
-            match = self.CHARSET_RE.search(contentType)
-            if match:
-                if (self.declaredHTMLEncoding is not None or
-                    self.originalEncoding == self.fromEncoding):
-                    # An HTML encoding was sniffed while converting
-                    # the document to Unicode, or an HTML encoding was
-                    # sniffed during a previous pass through the
-                    # document, or an encoding was specified
-                    # explicitly and it worked. Rewrite the meta tag.
-                    def rewrite(match):
-                        return match.group(1) + "%SOUP-ENCODING%"
-                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
-                    if isinstance(attrs, dict):
-                        attrs['content'] = newAttr
-                    else:
-                        attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
-                                                   newAttr)
-                    tagNeedsEncodingSubstitution = True
-                else:
-                    # This is our first pass through the document.
-                    # Go through it again with the encoding information.
-                    newCharset = match.group(3)
-                    if newCharset and newCharset != self.originalEncoding:
-                        self.declaredHTMLEncoding = newCharset
-                        self._feed(self.declaredHTMLEncoding)
-                        raise StopParsing
-                    pass
-        return tagNeedsEncodingSubstitution
-
 
 class BeautifulSoup(BeautifulStoneSoup):
     """A convenience class for parsing HTML without creating a builder."""
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..cf5e6c6 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        pass
+
 
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,41 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                             'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions to detect the charset in a META tag
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declaredHTMLEncoding is not None or
+                    self.soup.originalEncoding == self.soup.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    new_charset = match.group(3)
+                    if (new_charset is not None
+                        and new_charset != self.soup.originalEncoding):
+                        self.soup.declaredHTMLEncoding = new_charset
+                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        raise StopParsing
+                    pass
+        return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 736889f..dc95493 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -130,12 +130,19 @@ class Element(html5lib.treebuilders._base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
-        if attributes:
+        if attributes is not None and attributes != {}:
             for name, value in attributes.items():
                 self.element[name] =  value
-
+            # The attributes may contain variables that need substitution.
+            # The Tag constructor normally calls set_up_substitutions,
+            # but html5lib creates the Tag object before its attributes
+            # are set, so that call could not see them. Call it again
+            # manually now that the attributes are in place.
+            self.element.contains_substitutions = (
+                self.soup.builder.set_up_substitutions(
+                    self.element))
     attributes = property(getAttributes, setAttributes)
-    
+
     def insertText(self, data, insertBefore=None):
         text = TextNode(NavigableString(data), self.soup)
         if insertBefore:
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 39e0e06..5793d59 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -9,7 +9,7 @@ from util import isString, isList
 
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
-class Entities:
+class Entities(object):
     """A mixin class that knows about XML entities."""
 
     HTML_ENTITIES = "html"
@@ -31,7 +31,7 @@ class Entities:
 
     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 
-class PageElement:
+class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
 
@@ -438,13 +438,16 @@ class Tag(PageElement, Entities):
         self.contents = []
         self.setup(parent, previous)
         self.hidden = False
-        self.containsSubstitutions = False
 
         if isinstance(attrs, types.DictType):
             self.attrs = [kv for kv in attrs.items()]
         else:
             self.attrs = list(attrs)
 
+        # Set up any substitutions, such as the charset in a META tag.
+        self.contains_substitutions = builder.set_up_substitutions(self)
+
+
     @property
     def string(self):
         """Convenience property to get the single string within this tag.
@@ -581,7 +584,7 @@ class Tag(PageElement, Entities):
             for key, val in self.attrs:
                 fmt = '%s="%s"'
                 if isString(val):
-                    if (self.containsSubstitutions
+                    if (self.contains_substitutions
                         and eventualEncoding is not None
                         and '%SOUP-ENCODING%' in val):
                         val = self.substituteEncoding(val, eventualEncoding)
@@ -762,7 +765,7 @@ class Tag(PageElement, Entities):
 
 
 # Next, a couple classes to represent queries and their results.
-class SoupStrainer:
+class SoupStrainer(object):
     """Encapsulates a number of ways of matching a markup element (tag or
     text)."""
 
@@ -882,6 +885,7 @@ class SoupStrainer:
                 result = matchAgainst == markup
         return result
 
+
 class ResultSet(list):
     """A ResultSet is just a list that keeps track of the SoupStrainer
     that created it."""
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 09:58:07 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 09:58:07 -0500
commit	66cbef12d959149746b3361f227f2a0328a31469 (patch)
tree	c6772c648933ad1477dc642aa598f34508870fff /beautifulsoup
parent	cb85520f7627a914e10e2d3ea52d7066bdf3984d (diff)
parent	d9462ef1b2760ccb6273903abcd7d253445716a4 (diff)