summary | refs | log | tree | commit | diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 14:37:44 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 14:37:44 -0400
commit 7b9d05ec019d59575a0280c6e109e794e142f8cf (patch)
tree d9d204abf24e9947e70b40e7ec233c3d1cc58efe /bs4/builder/__init__.py
parent ce805a11981bf58b7b005b81f56a80ea1a1bb8f9 (diff)
Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%).
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 25
1 files changed, 10 insertions, 15 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 84b5289..9f4f59e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,6 +1,9 @@
from collections import defaultdict
-import re
import sys
+from bs4.element import (
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ )
__all__ = [
'HTMLTreeBuilder',
@@ -218,9 +221,6 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
- # Used by set_up_substitutions to detect the charset in a META tag
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
@@ -235,27 +235,22 @@ class HTMLTreeBuilder(TreeBuilder):
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a standin object that can take on any
+ # encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
-
- # Modify the tag.
- tag['charset'] = "%SOUP-ENCODING%"
+ tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
- match = self.CHARSET_RE.search(content)
- if match is not None:
- meta_encoding = match.group(3)
-
- # Modify the tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+ tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)