summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--beautifulsoup/__init__.py62
-rw-r--r--beautifulsoup/builder/__init__.py44
-rw-r--r--beautifulsoup/element.py8
-rw-r--r--tests/test_lxml.py2
4 files changed, 51 insertions, 65 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index e23c9d9..f2c20de 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -120,9 +120,6 @@ class BeautifulStoneSoup(Tag):
"""
ROOT_TAG_NAME = u'[document]'
- # Used to detect the charset in a META tag; see handleSpecialMetaTag
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
@@ -272,13 +269,8 @@ class BeautifulStoneSoup(Tag):
or not self.parseOnlyThese.searchTag(name, attrs))):
return None
- containsSubstitutions = False
- if name == 'meta' and self.builder.assume_html:
- containsSubstitutions = self.handleSpecialMetaTag(attrs)
-
tag = Tag(self, self.builder, name, attrs, self.currentTag,
self.previous)
- tag.containsSubstitutions = containsSubstitutions
if self.previous:
self.previous.next = tag
self.previous = tag
@@ -293,60 +285,6 @@ class BeautifulStoneSoup(Tag):
def handle_data(self, data):
self.currentData.append(data)
- def handleSpecialMetaTag(self, attrs):
- """Beautiful Soup can detect a charset included in a META tag,
- try to convert the document to that charset, and re-parse the
- document from the beginning. Neither lxml nor html5lib does
- this, so the feature is still here."""
- httpEquiv = None
- contentType = None
- contentTypeIndex = None
- tagNeedsEncodingSubstitution = False
-
- if isinstance(attrs, dict):
- httpEquiv = attrs.get('http-equiv')
- contentType = attrs.get('content')
- else:
- # XXX do we need this?
- for i in range(0, len(attrs)):
- key, value = attrs[i]
- key = key.lower()
- if key == 'http-equiv':
- httpEquiv = value
- elif key == 'content':
- contentType = value
- contentTypeIndex = i
-
- if httpEquiv and contentType: # It's an interesting meta tag.
- match = self.CHARSET_RE.search(contentType)
- if match:
- if (self.declaredHTMLEncoding is not None or
- self.originalEncoding == self.fromEncoding):
- # An HTML encoding was sniffed while converting
- # the document to Unicode, or an HTML encoding was
- # sniffed during a previous pass through the
- # document, or an encoding was specified
- # explicitly and it worked. Rewrite the meta tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- newAttr = self.CHARSET_RE.sub(rewrite, contentType)
- if isinstance(attrs, dict):
- attrs['content'] = newAttr
- else:
- attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
- newAttr)
- tagNeedsEncodingSubstitution = True
- else:
- # This is our first pass through the document.
- # Go through it again with the encoding information.
- newCharset = match.group(3)
- if newCharset and newCharset != self.originalEncoding:
- self.declaredHTMLEncoding = newCharset
- self._feed(self.declaredHTMLEncoding)
- raise StopParsing
- pass
- return tagNeedsEncodingSubstitution
-
class BeautifulSoup(BeautifulStoneSoup):
"""A convenience class for parsing HTML without creating a builder."""
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 86de5ec..eb92e6b 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+import re
from beautifulsoup.element import Entities
__all__ = [
@@ -37,6 +38,9 @@ class TreeBuilder(Entities):
"""
return fragment
+ def set_up_substitutions(self, tag):
+ pass
+
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
@@ -96,3 +100,43 @@ class HTMLTreeBuilder(TreeBuilder):
self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ # Used by set_up_substitutions to detect the charset in a META tag
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+ def set_up_substitutions(self, tag):
+ if tag.name != 'meta':
+ return False
+
+ httpEquiv = None
+ contentType = None
+ http_equiv = tag.get('http-equiv')
+ content = tag.get('content')
+
+ if (http_equiv is not None
+ and content is not None
+ and http_equiv.lower() == 'content-type'):
+ # This is an interesting meta tag.
+ match = self.CHARSET_RE.search(content)
+ if match:
+ if (self.soup.declaredHTMLEncoding is not None or
+ self.soup.originalEncoding == self.soup.fromEncoding):
+ # An HTML encoding was sniffed while converting
+ # the document to Unicode, or an HTML encoding was
+ # sniffed during a previous pass through the
+ # document, or an encoding was specified
+ # explicitly and it worked. Rewrite the meta tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ newAttr = self.CHARSET_RE.sub(rewrite, content)
+ tag['content'] = newAttr
+ return True
+ else:
+ # This is our first pass through the document.
+ # Go through it again with the encoding information.
+ newCharset = match.group(3)
+ if newCharset and newCharset != self.soup.originalEncoding:
+ self.soup.declaredHTMLEncoding = newCharset
+ self.soup._feed(self.soup.declaredHTMLEncoding)
+ raise StopParsing
+ pass
+ return False
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 39e0e06..6e2bada 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -438,13 +438,16 @@ class Tag(PageElement, Entities):
self.contents = []
self.setup(parent, previous)
self.hidden = False
- self.containsSubstitutions = False
if isinstance(attrs, types.DictType):
self.attrs = [kv for kv in attrs.items()]
else:
self.attrs = list(attrs)
+ # Set up any substitutions, such as the charset in a META tag.
+ self.contains_substitutions = builder.set_up_substitutions(self)
+
+
@property
def string(self):
"""Convenience property to get the single string within this tag.
@@ -581,7 +584,7 @@ class Tag(PageElement, Entities):
for key, val in self.attrs:
fmt = '%s="%s"'
if isString(val):
- if (self.containsSubstitutions
+ if (self.contains_substitutions
and eventualEncoding is not None
and '%SOUP-ENCODING%' in val):
val = self.substituteEncoding(val, eventualEncoding)
@@ -882,6 +885,7 @@ class SoupStrainer:
result = matchAgainst == markup
return result
+
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index ab5b219..b002227 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -292,7 +292,7 @@ class TestLXMLBuilder(SoupTest):
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
self.assertEquals(parsed_meta['content'],
'text/html; charset=%SOUP-ENCODING%')
- self.assertEquals(parsed_meta.containsSubstitutions, True)
+ self.assertEquals(parsed_meta.contains_substitutions, True)
# For the rest of the story, see TestSubstitutions in
# test_tree.py.