Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%).

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 14:37:44 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 14:37:44 -0400
commit: 7b9d05ec019d59575a0280c6e109e794e142f8cf (patch)
tree: d9d204abf24e9947e70b40e7ec233c3d1cc58efe
parent: ce805a11981bf58b7b005b81f56a80ea1a1bb8f9 (diff)
6 files changed, 104 insertions, 32 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 13b49fc..72c388b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.0.5 (unreleased) =
 
+* Made encoding substitution in <meta> tags completely transparent (no
+  more %SOUP-ENCODING%).
+
 * Fixed a bug that made the HTMLParser treebuilder generate XML
   definitions ending with two question marks instead of
   one. [bug=984258]
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 84b5289..9f4f59e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,6 +1,9 @@
 from collections import defaultdict
-import re
 import sys
+from bs4.element import (
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
+    )
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -218,9 +221,6 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
         }
 
-    # Used by set_up_substitutions to detect the charset in a META tag
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
@@ -235,27 +235,22 @@ class HTMLTreeBuilder(TreeBuilder):
         # tags that provide the "charset" attribute. It also means
         # HTML 4-style <meta> tags that provide the "content"
         # attribute and have "http-equiv" set to "content-type".
+        #
+        # In both cases we will replace the value of the appropriate
+        # attribute with a standin object that can take on any
+        # encoding.
         meta_encoding = None
         if charset is not None:
             # HTML 5 style:
             # <meta charset="utf8">
             meta_encoding = charset
-
-            # Modify the tag.
-            tag['charset'] = "%SOUP-ENCODING%"
+            tag['charset'] = CharsetMetaAttributeValue(charset)
 
         elif (content is not None and http_equiv is not None
               and http_equiv.lower() == 'content-type'):
             # HTML 4 style:
             # <meta http-equiv="content-type" content="text/html; charset=utf8">
-            match = self.CHARSET_RE.search(content)
-            if match is not None:
-                meta_encoding = match.group(3)
-
-                # Modify the tag.
-                def rewrite(match):
-                    return match.group(1) + "%SOUP-ENCODING%"
-                tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+            tag['content'] = ContentMetaAttributeValue(content)
 
         return (meta_encoding is not None)
 
diff --git a/bs4/element.py b/bs4/element.py
index 684da38..3729789 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -34,6 +34,51 @@ class NamespacedAttribute(unicode):
         obj.namespace = namespace
         return obj
 
+class AttributeValueWithCharsetSubstitution(unicode):
+    """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+    When Beautiful Soup parses the markup '<meta charset="utf8">', the
+    value of the 'charset' attribute will be one of these objects.
+    """
+
+    def __new__(cls, original_value):
+        obj = unicode.__new__(cls, original_value)
+        obj.original_value = original_value
+        return obj
+
+    def encode(self, encoding):
+        return encoding
+
+
+class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'content' attribute.
+
+    When Beautiful Soup parses the markup:
+     <meta http-equiv="content-type" content="text/html; charset=utf8">
+
+    The value of the 'content' attribute will be one of these objects.
+    """
+
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def __new__(cls, original_value):
+        match = cls.CHARSET_RE.search(original_value)
+        if match is None:
+            # No substitution necessary.
+            return unicode.__new__(unicode, original_value)
+
+        obj = unicode.__new__(cls, original_value)
+        obj.original_value = original_value
+        return obj
+
+    def encode(self, encoding):
+        def rewrite(match):
+            return match.group(1) + encoding
+        return self.CHARSET_RE.sub(rewrite, self.original_value)
+
 
 class PageElement(object):
     """Contains the navigational information for some part of the page
@@ -950,10 +995,10 @@ class Tag(PageElement):
                         val = ' '.join(val)
                     elif not isinstance(val, basestring):
                         val = str(val)
-                    if (self.contains_substitutions
-                        and eventual_encoding is not None
-                        and '%SOUP-ENCODING%' in val):
-                        val = self.substitute_encoding(val, eventual_encoding)
+                    elif (
+                        isinstance(val, AttributeValueWithCharsetSubstitution)
+                        and eventual_encoding is not None):
+                        val = val.encode(eventual_encoding)
 
                     text = self.format_string(val, formatter)
                     decoded = (
diff --git a/bs4/testing.py b/bs4/testing.py
index 41c8783..94c87c9 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -6,7 +6,9 @@ import unittest
 from unittest import TestCase
 from bs4 import BeautifulSoup
 from bs4.element import (
+    CharsetMetaAttributeValue,
     Comment,
+    ContentMetaAttributeValue,
     Doctype,
     SoupStrainer,
 )
@@ -371,12 +373,17 @@ class HTMLTreeBuilderSmokeTest(object):
             '</head><body>Shift-JIS markup goes here.') % meta_tag
         soup = self.soup(shift_jis_html)
 
-        # Parse the document, and the charset is replaced with a
-        # generic value.
+        # Parse the document, and the charset is seemingly unaffected.
         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
-        self.assertEqual(parsed_meta['content'],
-                          'text/html; charset=%SOUP-ENCODING%')
-        self.assertEqual(parsed_meta.contains_substitutions, True)
+        content = parsed_meta['content']
+        self.assertEqual('text/html; charset=x-sjis', content)
+
+        # But that value is actually a ContentMetaAttributeValue object.
+        self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
 
         # For the rest of the story, see TestSubstitutions in
         # test_tree.py.
@@ -393,11 +400,17 @@ class HTMLTreeBuilderSmokeTest(object):
             '</head><body>Shift-JIS markup goes here.') % meta_tag
         soup = self.soup(shift_jis_html)
 
-        # Parse the document, and the charset is replaced with a
-        # generic value.
+        # Parse the document, and the charset is seemingly unaffected.
         parsed_meta = soup.find('meta', id="encoding")
-        self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset'])
-        self.assertEqual(True, parsed_meta.contains_substitutions)
+        charset = parsed_meta['charset']
+        self.assertEqual('x-sjis', charset)
+
+        # But that value is actually a CharsetMetaAttributeValue object.
+        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('utf8', charset.encode("utf8"))
 
 class XMLTreeBuilderSmokeTest(object):
 
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddbffd4..39e1964 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -7,6 +7,8 @@ from bs4 import (
     BeautifulStoneSoup,
 )
 from bs4.element import (
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
     SoupStrainer,
     NamespacedAttribute,
     )
@@ -299,3 +301,19 @@ class TestNamedspacedAttribute(SoupTest):
 
         e = NamespacedAttribute("z", "b", "c")
         self.assertNotEqual(a, e)
+
+
+class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
+
+    def test_charset_meta_attribute_value(self):
+        value = CharsetMetaAttributeValue("euc-jp")
+        self.assertEqual("euc-jp", value)
+        self.assertEqual("euc-jp", value.original_value)
+        self.assertEqual("utf8", value.encode("utf8"))
+
+
+    def test_content_meta_attribute_value(self):
+        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
+        self.assertEqual("text/html; charset=euc-jp", value)
+        self.assertEqual("text/html; charset=euc-jp", value.original_value)
+        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 661decb..6cb1b7f 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1262,10 +1262,8 @@ class TestSubstitutions(SoupTest):
                     'http-equiv="Content-type"/>')
         soup = self.soup(meta_tag)
 
-        # Parse the document, and the charset is replaced with a
-        # generic value.
-        self.assertEqual(soup.meta['content'],
-                          'text/html; charset=%SOUP-ENCODING%')
+        # Parse the document, and the charset appears unchanged.
+        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
 
         # Encode the document into some encoding, and the encoding is
         # substituted into the meta tag.
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-18 14:37:44 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-18 14:37:44 -0400
commit	7b9d05ec019d59575a0280c6e109e794e142f8cf (patch)
tree	d9d204abf24e9947e70b40e7ec233c3d1cc58efe
parent	ce805a11981bf58b7b005b81f56a80ea1a1bb8f9 (diff)