diff options
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 44 |
1 files changed, 44 insertions, 0 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..eb92e6b 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        """Hook for subclasses; by default no substitution is set up."""
+
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
 
@@ -96,3 +100,43 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions to detect the charset in a META tag
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        """Swap a <meta> tag's charset for %SOUP-ENCODING%; True if swapped.
+        May instead re-feed the soup and raise StopParsing (see below)."""
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declaredHTMLEncoding is not None or
+                    self.soup.originalEncoding == self.soup.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    newAttr = self.CHARSET_RE.sub(rewrite, content)
+                    tag['content'] = newAttr
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    newCharset = match.group(3)
+                    if newCharset and newCharset != self.soup.originalEncoding:
+                        self.soup.declaredHTMLEncoding = newCharset
+                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        raise StopParsing
+                    # No new charset to act on; fall through to return False.
+        return False