diff options
-rw-r--r-- | beautifulsoup/__init__.py | 62 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 44 | ||||
-rw-r--r-- | beautifulsoup/element.py | 8 | ||||
-rw-r--r-- | tests/test_lxml.py | 2 |
4 files changed, 51 insertions, 65 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index e23c9d9..f2c20de 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -120,9 +120,6 @@ class BeautifulStoneSoup(Tag): """ ROOT_TAG_NAME = u'[document]' - # Used to detect the charset in a META tag; see handleSpecialMetaTag - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - # Used when determining whether a text node is all whitespace and # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left @@ -272,13 +269,8 @@ class BeautifulStoneSoup(Tag): or not self.parseOnlyThese.searchTag(name, attrs))): return None - containsSubstitutions = False - if name == 'meta' and self.builder.assume_html: - containsSubstitutions = self.handleSpecialMetaTag(attrs) - tag = Tag(self, self.builder, name, attrs, self.currentTag, self.previous) - tag.containsSubstitutions = containsSubstitutions if self.previous: self.previous.next = tag self.previous = tag @@ -293,60 +285,6 @@ class BeautifulStoneSoup(Tag): def handle_data(self, data): self.currentData.append(data) - def handleSpecialMetaTag(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning. Neither lxml nor html5lib does - this, so the feature is still here.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - if isinstance(attrs, dict): - httpEquiv = attrs.get('http-equiv') - contentType = attrs.get('content') - else: - # XXX do we need this? - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. - match = self.CHARSET_RE.search(contentType) - if match: - if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - newAttr = self.CHARSET_RE.sub(rewrite, contentType) - if isinstance(attrs, dict): - attrs['content'] = newAttr - else: - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. - # Go through it again with the encoding information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - pass - return tagNeedsEncodingSubstitution - class BeautifulSoup(BeautifulStoneSoup): """A convenience class for parsing HTML without creating a builder.""" diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 86de5ec..eb92e6b 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -1,3 +1,4 @@ +import re from beautifulsoup.element import Entities __all__ = [ @@ -37,6 +38,9 @@ class TreeBuilder(Entities): """ return fragment + def set_up_substitutions(self, tag): + pass + class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events.""" @@ -96,3 +100,43 @@ class HTMLTreeBuilder(TreeBuilder): self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) + # Used by set_up_substitutions to detect the charset in a META tag + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def set_up_substitutions(self, tag): + if tag.name != 'meta': + return False + + httpEquiv = None + contentType = None + http_equiv = tag.get('http-equiv') + content = tag.get('content') + + if (http_equiv is not None + and content is not None + and http_equiv.lower() == 'content-type'): + # This is an interesting meta tag. + match = self.CHARSET_RE.search(content) + if match: + if (self.soup.declaredHTMLEncoding is not None or + self.soup.originalEncoding == self.soup.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, content) + tag['content'] = newAttr + return True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. + newCharset = match.group(3) + if newCharset and newCharset != self.soup.originalEncoding: + self.soup.declaredHTMLEncoding = newCharset + self.soup._feed(self.soup.declaredHTMLEncoding) + raise StopParsing + pass + return False diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 39e0e06..6e2bada 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -438,13 +438,16 @@ class Tag(PageElement, Entities): self.contents = [] self.setup(parent, previous) self.hidden = False - self.containsSubstitutions = False if isinstance(attrs, types.DictType): self.attrs = [kv for kv in attrs.items()] else: self.attrs = list(attrs) + # Set up any substitutions, such as the charset in a META tag. + self.contains_substitutions = builder.set_up_substitutions(self) + + @property def string(self): """Convenience property to get the single string within this tag. @@ -581,7 +584,7 @@ class Tag(PageElement, Entities): for key, val in self.attrs: fmt = '%s="%s"' if isString(val): - if (self.containsSubstitutions + if (self.contains_substitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventualEncoding) @@ -882,6 +885,7 @@ class SoupStrainer: result = matchAgainst == markup return result + class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" diff --git a/tests/test_lxml.py b/tests/test_lxml.py index ab5b219..b002227 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -292,7 +292,7 @@ class TestLXMLBuilder(SoupTest): parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) self.assertEquals(parsed_meta['content'], 'text/html; charset=%SOUP-ENCODING%') - self.assertEquals(parsed_meta.containsSubstitutions, True) + self.assertEquals(parsed_meta.contains_substitutions, True) # For the rest of the story, see TestSubstitutions in # test_tree.py. |