summary | refs | log | tree | commit | diff
path: root/beautifulsoup/builder
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- beautifulsoup/builder/__init__.py 42
-rw-r--r-- beautifulsoup/builder/html5lib_builder.py 13
2 files changed, 52 insertions, 3 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..cf5e6c6 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
from beautifulsoup.element import Entities
__all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
"""
return fragment
+    def set_up_substitutions(self, tag):
+        """Hook that lets a builder prepare `tag` for value substitution.
+
+        The base builder sets up no substitutions, so it always returns
+        False. Builders that do rewrite attribute values override this
+        and return True when they changed the tag.
+        """
+        return False
+
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,41 @@ class HTMLTreeBuilder(TreeBuilder):
self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+    # Used by set_up_substitutions to detect the charset in a META tag.
+    # Group 1 is everything up to and including "charset=", group 3 is
+    # the declared value. A raw string keeps "\s" from being read as a
+    # (invalid) string escape; the compiled pattern is unchanged.
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        """Prepare a Content-Type <meta> tag for charset substitution.
+
+        Returns True if the tag's 'content' attribute was rewritten so
+        that its charset value is the %SOUP-ENCODING% placeholder, and
+        False if the tag was left untouched.
+
+        When no encoding has been settled yet and the tag declares a
+        non-empty charset that differs from the soup's original
+        encoding, the declared charset is fed back to the soup and
+        StopParsing is raised.
+
+        NOTE(review): block nesting was reconstructed from a diff whose
+        leading whitespace had been collapsed -- confirm against the
+        repository before applying.
+        """
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declaredHTMLEncoding is not None or
+                    self.soup.originalEncoding == self.soup.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    new_charset = match.group(3)
+                    # Group 3 always participates in a successful match,
+                    # so it is never None. Test for emptiness instead,
+                    # so a bare "charset=" does not force a re-parse
+                    # with an empty encoding name.
+                    if (new_charset
+                        and new_charset != self.soup.originalEncoding):
+                        self.soup.declaredHTMLEncoding = new_charset
+                        self.soup._feed(self.soup.declaredHTMLEncoding)
+                        raise StopParsing
+        return False
diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py
index 736889f..dc95493 100644
--- a/beautifulsoup/builder/html5lib_builder.py
+++ b/beautifulsoup/builder/html5lib_builder.py
@@ -130,12 +130,19 @@ class Element(html5lib.treebuilders._base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
- if attributes:
+ if attributes is not None and attributes != {}:
for name, value in attributes.items():
self.element[name] = value
-
+ # The attributes may contain variables that need substitution,
+ # so call the builder's set_up_substitutions manually here.
+ # The Tag constructor calls set_up_substitutions automatically,
+ # but html5lib creates a Tag object before setting up
+ # the attributes.
+ # The builder's return value is recorded on the element as
+ # contains_substitutions.
+ self.element.contains_substitutions = (
+ self.soup.builder.set_up_substitutions(
+ self.element))
attributes = property(getAttributes, setAttributes)
-
+
def insertText(self, data, insertBefore=None):
text = TextNode(NavigableString(data), self.soup)
if insertBefore: