summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2012-04-18 14:37:44 -0400
committerLeonard Richardson <leonard.richardson@canonical.com>2012-04-18 14:37:44 -0400
commit7b9d05ec019d59575a0280c6e109e794e142f8cf (patch)
treed9d204abf24e9947e70b40e7ec233c3d1cc58efe
parentce805a11981bf58b7b005b81f56a80ea1a1bb8f9 (diff)
Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%).
-rw-r--r--NEWS.txt3
-rw-r--r--bs4/builder/__init__.py25
-rw-r--r--bs4/element.py53
-rw-r--r--bs4/testing.py31
-rw-r--r--bs4/tests/test_soup.py18
-rw-r--r--bs4/tests/test_tree.py6
6 files changed, 104 insertions, 32 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 13b49fc..72c388b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= 4.0.5 (unreleased) =
+* Made encoding substitution in <meta> tags completely transparent (no
+ more %SOUP-ENCODING%).
+
* Fixed a bug that made the HTMLParser treebuilder generate XML
definitions ending with two question marks instead of
one. [bug=984258]
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 84b5289..9f4f59e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,6 +1,9 @@
from collections import defaultdict
-import re
import sys
+from bs4.element import (
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ )
__all__ = [
'HTMLTreeBuilder',
@@ -218,9 +221,6 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
- # Used by set_up_substitutions to detect the charset in a META tag
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
@@ -235,27 +235,22 @@ class HTMLTreeBuilder(TreeBuilder):
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a stand-in object that can take on any
+ # encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
-
- # Modify the tag.
- tag['charset'] = "%SOUP-ENCODING%"
+ tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
- match = self.CHARSET_RE.search(content)
- if match is not None:
- meta_encoding = match.group(3)
-
- # Modify the tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+ tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
diff --git a/bs4/element.py b/bs4/element.py
index 684da38..3729789 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -34,6 +34,51 @@ class NamespacedAttribute(unicode):
obj.namespace = namespace
return obj
+class AttributeValueWithCharsetSubstitution(unicode):
+ """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+ """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+ When Beautiful Soup parses the markup '<meta charset="utf8">', the
+ value of the 'charset' attribute will be one of these objects.
+ """
+
+ def __new__(cls, original_value):
+ obj = unicode.__new__(cls, original_value)
+ obj.original_value = original_value
+ return obj
+
+ def encode(self, encoding):
+ return encoding
+
+
+class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+ """A generic stand-in for the value of a meta tag's 'content' attribute.
+
+ When Beautiful Soup parses the markup:
+ <meta http-equiv="content-type" content="text/html; charset=utf8">
+
+ The value of the 'content' attribute will be one of these objects.
+ """
+
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+ def __new__(cls, original_value):
+ match = cls.CHARSET_RE.search(original_value)
+ if match is None:
+ # No substitution necessary.
+ return unicode.__new__(unicode, original_value)
+
+ obj = unicode.__new__(cls, original_value)
+ obj.original_value = original_value
+ return obj
+
+ def encode(self, encoding):
+ def rewrite(match):
+ return match.group(1) + encoding
+ return self.CHARSET_RE.sub(rewrite, self.original_value)
+
class PageElement(object):
"""Contains the navigational information for some part of the page
@@ -950,10 +995,10 @@ class Tag(PageElement):
val = ' '.join(val)
elif not isinstance(val, basestring):
val = str(val)
- if (self.contains_substitutions
- and eventual_encoding is not None
- and '%SOUP-ENCODING%' in val):
- val = self.substitute_encoding(val, eventual_encoding)
+ elif (
+ isinstance(val, AttributeValueWithCharsetSubstitution)
+ and eventual_encoding is not None):
+ val = val.encode(eventual_encoding)
text = self.format_string(val, formatter)
decoded = (
diff --git a/bs4/testing.py b/bs4/testing.py
index 41c8783..94c87c9 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -6,7 +6,9 @@ import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
+ CharsetMetaAttributeValue,
Comment,
+ ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
@@ -371,12 +373,17 @@ class HTMLTreeBuilderSmokeTest(object):
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
- # Parse the document, and the charset is replaced with a
- # generic value.
+ # Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
- self.assertEqual(parsed_meta['content'],
- 'text/html; charset=%SOUP-ENCODING%')
- self.assertEqual(parsed_meta.contains_substitutions, True)
+ content = parsed_meta['content']
+ self.assertEqual('text/html; charset=x-sjis', content)
+
+ # But that value is actually a ContentMetaAttributeValue object.
+ self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+ # And it will take on a value that reflects its current
+ # encoding.
+ self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
@@ -393,11 +400,17 @@ class HTMLTreeBuilderSmokeTest(object):
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
- # Parse the document, and the charset is replaced with a
- # generic value.
+ # Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
- self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset'])
- self.assertEqual(True, parsed_meta.contains_substitutions)
+ charset = parsed_meta['charset']
+ self.assertEqual('x-sjis', charset)
+
+ # But that value is actually a CharsetMetaAttributeValue object.
+ self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+ # And it will take on a value that reflects its current
+ # encoding.
+ self.assertEqual('utf8', charset.encode("utf8"))
class XMLTreeBuilderSmokeTest(object):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddbffd4..39e1964 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -7,6 +7,8 @@ from bs4 import (
BeautifulStoneSoup,
)
from bs4.element import (
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
@@ -299,3 +301,19 @@ class TestNamedspacedAttribute(SoupTest):
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
+
+
+class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
+
+ def test_charset_meta_attribute_value(self):
+ value = CharsetMetaAttributeValue("euc-jp")
+ self.assertEqual("euc-jp", value)
+ self.assertEqual("euc-jp", value.original_value)
+ self.assertEqual("utf8", value.encode("utf8"))
+
+
+ def test_content_meta_attribute_value(self):
+ value = ContentMetaAttributeValue("text/html; charset=euc-jp")
+ self.assertEqual("text/html; charset=euc-jp", value)
+ self.assertEqual("text/html; charset=euc-jp", value.original_value)
+ self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 661decb..6cb1b7f 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1262,10 +1262,8 @@ class TestSubstitutions(SoupTest):
'http-equiv="Content-type"/>')
soup = self.soup(meta_tag)
- # Parse the document, and the charset is replaced with a
- # generic value.
- self.assertEqual(soup.meta['content'],
- 'text/html; charset=%SOUP-ENCODING%')
+ # Parse the document, and the charset appears unchanged.
+ self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
# Encode the document into some encoding, and the encoding is
# substituted into the meta tag.