summary | refs | log | tree | commit | diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
commit e533b599457f713944a17d29739308f09bfd5aef (patch)
tree 70fa05e8de9e6457c0df6bfb594ac3cf04abd38d /bs4/builder/__init__.py
parent 483286bfbb40bfabe4c48c9f31c59ef7449d64bb (diff)
In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. [bug=942714]
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 59
1 files changed, 30 insertions, 29 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4e31572..a38a98f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -135,7 +135,7 @@ class TreeBuilder(object):
return fragment
def set_up_substitutions(self, tag):
- pass
+ return False
class SAXTreeBuilder(TreeBuilder):
@@ -222,41 +222,42 @@ class HTMLTreeBuilder(TreeBuilder):
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def set_up_substitutions(self, tag):
+ # We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
-
- if (http_equiv is not None
- and content is not None
- and http_equiv.lower() == 'content-type'):
- # This is an interesting meta tag.
+ charset = tag.get('charset')
+
+ # We are interested in <meta> tags that say what encoding the
+ # document was originally in. This means HTML 5-style <meta>
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style <meta> tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ meta_encoding = None
+ if charset is not None:
+ # HTML 5 style:
+ # <meta charset="utf8">
+ meta_encoding = charset
+
+ # Modify the tag.
+ tag['charset'] = "%SOUP-ENCODING%"
+
+ elif (content is not None and http_equiv is not None
+ and http_equiv.lower() == 'content-type'):
+ # HTML 4 style:
+ # <meta http-equiv="content-type" content="text/html; charset=utf8">
match = self.CHARSET_RE.search(content)
- if match:
- if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.from_encoding):
- # An HTML encoding was sniffed while converting
- # the document to Unicode, or an HTML encoding was
- # sniffed during a previous pass through the
- # document, or an encoding was specified
- # explicitly and it worked. Rewrite the meta tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
- return True
- else:
- # This is our first pass through the document.
- # Go through it again with the encoding information.
- new_charset = match.group(3)
- if (new_charset is not None
- and new_charset != self.soup.original_encoding):
- self.soup.declared_html_encoding = new_charset
- self.soup._feed(self.soup.declared_html_encoding)
- raise StopParsing
- pass
- return False
+ if match is not None:
+ meta_encoding = match.group(3)
+
+ # Modify the tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+ return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""