diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 14:37:44 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 14:37:44 -0400 |
commit | 7b9d05ec019d59575a0280c6e109e794e142f8cf (patch) | |
tree | d9d204abf24e9947e70b40e7ec233c3d1cc58efe /bs4/builder/__init__.py | |
parent | ce805a11981bf58b7b005b81f56a80ea1a1bb8f9 (diff) |
Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%).
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 25 |
1 files changed, 10 insertions, 15 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 84b5289..9f4f59e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,6 +1,9 @@
 from collections import defaultdict
-import re
 import sys
+from bs4.element import (
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
+    )
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -218,9 +221,6 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
         }
 
-    # Used by set_up_substitutions to detect the charset in a META tag
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
@@ -235,27 +235,22 @@ class HTMLTreeBuilder(TreeBuilder):
         # tags that provide the "charset" attribute. It also means
         # HTML 4-style <meta> tags that provide the "content"
         # attribute and have "http-equiv" set to "content-type".
+        #
+        # In both cases we will replace the value of the appropriate
+        # attribute with a standin object that can take on any
+        # encoding.
         meta_encoding = None
         if charset is not None:
             # HTML 5 style:
             # <meta charset="utf8">
             meta_encoding = charset
-
-            # Modify the tag.
-            tag['charset'] = "%SOUP-ENCODING%"
+            tag['charset'] = CharsetMetaAttributeValue(charset)
 
         elif (content is not None and
               http_equiv is not None and
               http_equiv.lower() == 'content-type'):
             # HTML 4 style:
             # <meta http-equiv="content-type" content="text/html; charset=utf8">
-            match = self.CHARSET_RE.search(content)
-            if match is not None:
-                meta_encoding = match.group(3)
-
-                # Modify the tag.
-                def rewrite(match):
-                    return match.group(1) + "%SOUP-ENCODING%"
-                tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+            tag['content'] = ContentMetaAttributeValue(content)
 
         return (meta_encoding is not None)