diff options
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 42 | ||||
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 13 |
2 files changed, 52 insertions, 3 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..cf5e6c6 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
 from beautifulsoup.element import Entities
 
 __all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
         """
         return fragment
 
+    def set_up_substitutions(self, tag):
+        pass
+
 class SAXTreeBuilder(TreeBuilder):
     """A Beautiful Soup treebuilder that listens for SAX events."""
 
@@ -96,3 +100,41 @@ class HTMLTreeBuilder(TreeBuilder):
     self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                             'spacer', 'link', 'frame', 'base'])
 
+    # Used by set_up_substitutions to detect the charset in a META tag
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declaredHTMLEncoding is not None or
+                    self.soup.originalEncoding == self.soup.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    new_charset = match.group(3)
+                    if (new_charset is not None
+                        and new_charset != self.soup.originalEncoding):
+                        self.soup.declaredHTMLEncoding = new_charset
+                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        raise StopParsing
+                    pass
+        return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 736889f..dc95493 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -130,12 +130,19 @@ class Element(html5lib.treebuilders._base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
-        if attributes:
+        if attributes is not None and attributes != {}:
             for name, value in attributes.items():
                 self.element[name] = value
-    
+            # The attributes may contain variables that need substitution.
+            # Call set_up_substitutions manually.
+            # The Tag constructor calls this method automatically,
+            # but html5lib creates a Tag object before setting up
+            # the attributes.
+            self.element.contains_substitutions = (
+                self.soup.builder.set_up_substitutions(
+                    self.element))
     attributes = property(getAttributes, setAttributes)
-    
+
     def insertText(self, data, insertBefore=None):
         text = TextNode(NavigableString(data), self.soup)
         if insertBefore: