diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 25 | ||||
-rw-r--r-- | bs4/element.py | 53 | ||||
-rw-r--r-- | bs4/testing.py | 31 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 18 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 6 |
6 files changed, 104 insertions, 32 deletions
@@ -1,5 +1,8 @@ = 4.0.5 (unreleased) = +* Made encoding substitution in <meta> tags completely transparent (no + more %SOUP-ENCODING%). + * Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258] diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 84b5289..9f4f59e 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -1,6 +1,9 @@ from collections import defaultdict -import re import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + ) __all__ = [ 'HTMLTreeBuilder', @@ -218,9 +221,6 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } - # Used by set_up_substitutions to detect the charset in a META tag - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - def set_up_substitutions(self, tag): # We are only interested in <meta> tags if tag.name != 'meta': @@ -235,27 +235,22 @@ class HTMLTreeBuilder(TreeBuilder): # tags that provide the "charset" attribute. It also means # HTML 4-style <meta> tags that provide the "content" # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. meta_encoding = None if charset is not None: # HTML 5 style: # <meta charset="utf8"> meta_encoding = charset - - # Modify the tag. - tag['charset'] = "%SOUP-ENCODING%" + tag['charset'] = CharsetMetaAttributeValue(charset) elif (content is not None and http_equiv is not None and http_equiv.lower() == 'content-type'): # HTML 4 style: # <meta http-equiv="content-type" content="text/html; charset=utf8"> - match = self.CHARSET_RE.search(content) - if match is not None: - meta_encoding = match.group(3) - - # Modify the tag. 
- def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - tag['content'] = self.CHARSET_RE.sub(rewrite, content) + tag['content'] = ContentMetaAttributeValue(content) return (meta_encoding is not None) diff --git a/bs4/element.py b/bs4/element.py index 684da38..3729789 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -34,6 +34,51 @@ class NamespacedAttribute(unicode): obj.namespace = namespace return obj +class AttributeValueWithCharsetSubstitution(unicode): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '<meta charset="utf8">', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + <meta http-equiv="content-type" content="text/html; charset=utf8"> + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. 
+ return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + class PageElement(object): """Contains the navigational information for some part of the page @@ -950,10 +995,10 @@ class Tag(PageElement): val = ' '.join(val) elif not isinstance(val, basestring): val = str(val) - if (self.contains_substitutions - and eventual_encoding is not None - and '%SOUP-ENCODING%' in val): - val = self.substitute_encoding(val, eventual_encoding) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None): + val = val.encode(eventual_encoding) text = self.format_string(val, formatter) decoded = ( diff --git a/bs4/testing.py b/bs4/testing.py index 41c8783..94c87c9 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -6,7 +6,9 @@ import unittest from unittest import TestCase from bs4 import BeautifulSoup from bs4.element import ( + CharsetMetaAttributeValue, Comment, + ContentMetaAttributeValue, Doctype, SoupStrainer, ) @@ -371,12 +373,17 @@ class HTMLTreeBuilderSmokeTest(object): '</head><body>Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) - # Parse the document, and the charset is replaced with a - # generic value. + # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) - self.assertEqual(parsed_meta['content'], - 'text/html; charset=%SOUP-ENCODING%') - self.assertEqual(parsed_meta.contains_substitutions, True) + content = parsed_meta['content'] + self.assertEqual('text/html; charset=x-sjis', content) + + # But that value is actually a ContentMetaAttributeValue object. + self.assertTrue(isinstance(content, ContentMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. 
+ self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. @@ -393,11 +400,17 @@ class HTMLTreeBuilderSmokeTest(object): '</head><body>Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) - # Parse the document, and the charset is replaced with a - # generic value. + # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") - self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset']) - self.assertEqual(True, parsed_meta.contains_substitutions) + charset = parsed_meta['charset'] + self.assertEqual('x-sjis', charset) + + # But that value is actually a CharsetMetaAttributeValue object. + self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('utf8', charset.encode("utf8")) class XMLTreeBuilderSmokeTest(object): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index ddbffd4..39e1964 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -7,6 +7,8 @@ from bs4 import ( BeautifulStoneSoup, ) from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, SoupStrainer, NamespacedAttribute, ) @@ -299,3 +301,19 @@ class TestNamedspacedAttribute(SoupTest): e = NamespacedAttribute("z", "b", "c") self.assertNotEqual(a, e) + + +class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): + + def test_content_meta_attribute_value(self): + value = CharsetMetaAttributeValue("euc-jp") + self.assertEqual("euc-jp", value) + self.assertEqual("euc-jp", value.original_value) + self.assertEqual("utf8", value.encode("utf8")) + + + def test_content_meta_attribute_value(self): + value = ContentMetaAttributeValue("text/html; charset=euc-jp") + self.assertEqual("text/html; charset=euc-jp", value) + self.assertEqual("text/html; charset=euc-jp", value.original_value) + 
self.assertEqual("text/html; charset=utf8", value.encode("utf8")) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 661decb..6cb1b7f 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1262,10 +1262,8 @@ class TestSubstitutions(SoupTest): 'http-equiv="Content-type"/>') soup = self.soup(meta_tag) - # Parse the document, and the charset is replaced with a - # generic value. - self.assertEqual(soup.meta['content'], - 'text/html; charset=%SOUP-ENCODING%') + # Parse the document, and the charset appears unchanged. + self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') # Encode the document into some encoding, and the encoding is # substituted into the meta tag. |