diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 09:58:07 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 09:58:07 -0500 |
commit | 66cbef12d959149746b3361f227f2a0328a31469 (patch) | |
tree | c6772c648933ad1477dc642aa598f34508870fff /beautifulsoup | |
parent | cb85520f7627a914e10e2d3ea52d7066bdf3984d (diff) | |
parent | d9462ef1b2760ccb6273903abcd7d253445716a4 (diff) |
Refactored the code that sets up substitutions in attribute values, and made content-type substitution work with html5lib.
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/__init__.py | 62 |
-rw-r--r-- | beautifulsoup/builder/__init__.py | 42 |
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 13 |
-rw-r--r-- | beautifulsoup/element.py | 14 |
4 files changed, 61 insertions, 70 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index e23c9d9..f2c20de 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -120,9 +120,6 @@ class BeautifulStoneSoup(Tag): """ ROOT_TAG_NAME = u'[document]' - # Used to detect the charset in a META tag; see handleSpecialMetaTag - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - # Used when determining whether a text node is all whitespace and # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left @@ -272,13 +269,8 @@ class BeautifulStoneSoup(Tag): or not self.parseOnlyThese.searchTag(name, attrs))): return None - containsSubstitutions = False - if name == 'meta' and self.builder.assume_html: - containsSubstitutions = self.handleSpecialMetaTag(attrs) - tag = Tag(self, self.builder, name, attrs, self.currentTag, self.previous) - tag.containsSubstitutions = containsSubstitutions if self.previous: self.previous.next = tag self.previous = tag @@ -293,60 +285,6 @@ class BeautifulStoneSoup(Tag): def handle_data(self, data): self.currentData.append(data) - def handleSpecialMetaTag(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning. Neither lxml nor html5lib does - this, so the feature is still here.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - if isinstance(attrs, dict): - httpEquiv = attrs.get('http-equiv') - contentType = attrs.get('content') - else: - # XXX do we need this? - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. 
- match = self.CHARSET_RE.search(contentType) - if match: - if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - newAttr = self.CHARSET_RE.sub(rewrite, contentType) - if isinstance(attrs, dict): - attrs['content'] = newAttr - else: - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. - # Go through it again with the encoding information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - pass - return tagNeedsEncodingSubstitution - class BeautifulSoup(BeautifulStoneSoup): """A convenience class for parsing HTML without creating a builder.""" diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 86de5ec..cf5e6c6 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -1,3 +1,4 @@ +import re from beautifulsoup.element import Entities __all__ = [ @@ -37,6 +38,9 @@ class TreeBuilder(Entities): """ return fragment + def set_up_substitutions(self, tag): + pass + class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events.""" @@ -96,3 +100,41 @@ class HTMLTreeBuilder(TreeBuilder): self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) + # Used by set_up_substitutions to detect the charset in a META tag + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def set_up_substitutions(self, tag): + if tag.name != 'meta': + return 
False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + + if (http_equiv is not None + and content is not None + and http_equiv.lower() == 'content-type'): + # This is an interesting meta tag. + match = self.CHARSET_RE.search(content) + if match: + if (self.soup.declaredHTMLEncoding is not None or + self.soup.originalEncoding == self.soup.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + tag['content'] = self.CHARSET_RE.sub(rewrite, content) + return True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. + new_charset = match.group(3) + if (new_charset is not None + and new_charset != self.soup.originalEncoding): + self.soup.declaredHTMLEncoding = new_charset + self.soup._feed(self.soup.declaredHTMLEncoding) + raise StopParsing + pass + return False diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py index 736889f..dc95493 100644 --- a/beautifulsoup/builder/html5lib_builder.py +++ b/beautifulsoup/builder/html5lib_builder.py @@ -130,12 +130,19 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): - if attributes: + if attributes is not None and attributes != {}: for name, value in attributes.items(): self.element[name] = value - + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # The Tag constructor calls this method automatically, + # but html5lib creates a Tag object before setting up + # the attributes. 
+ self.element.contains_substitutions = ( + self.soup.builder.set_up_substitutions( + self.element)) attributes = property(getAttributes, setAttributes) - + def insertText(self, data, insertBefore=None): text = TextNode(NavigableString(data), self.soup) if insertBefore: diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 39e0e06..5793d59 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -9,7 +9,7 @@ from util import isString, isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class Entities: +class Entities(object): """A mixin class that knows about XML entities.""" HTML_ENTITIES = "html" @@ -31,7 +31,7 @@ class Entities: XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) -class PageElement: +class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -438,13 +438,16 @@ class Tag(PageElement, Entities): self.contents = [] self.setup(parent, previous) self.hidden = False - self.containsSubstitutions = False if isinstance(attrs, types.DictType): self.attrs = [kv for kv in attrs.items()] else: self.attrs = list(attrs) + # Set up any substitutions, such as the charset in a META tag. + self.contains_substitutions = builder.set_up_substitutions(self) + + @property def string(self): """Convenience property to get the single string within this tag. @@ -581,7 +584,7 @@ class Tag(PageElement, Entities): for key, val in self.attrs: fmt = '%s="%s"' if isString(val): - if (self.containsSubstitutions + if (self.contains_substitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventualEncoding) @@ -762,7 +765,7 @@ class Tag(PageElement, Entities): # Next, a couple classes to represent queries and their results. 
-class SoupStrainer: +class SoupStrainer(object): """Encapsulates a number of ways of matching a markup element (tag or text).""" @@ -882,6 +885,7 @@ class SoupStrainer: result = matchAgainst == markup return result + class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" |