import re import types try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} from util import isString, isList DEFAULT_OUTPUT_ENCODING = "utf-8" class Entities: """A mixin class that knows about XML entities.""" HTML_ENTITIES = "html" XML_ENTITIES = "xml" XHTML_ENTITIES = "xhtml" def _invert(h): "Cheap function to invert a hash." i = {} for k,v in h.items(): i[v] = k return i XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", "quot" : '"', "amp" : "&", "lt" : "<", "gt" : ">" } XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) class PageElement: """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def setup(self, parent=None, previous=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent self.previous = previous self.next = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.contents.index(self) if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: # We're replacing this element with one of its siblings. index = self.parent.contents.index(replaceWith) if index and index < myIndex: # Furthermore, it comes before this element. That # means that when we extract it, the index of this # element will change. myIndex = myIndex - 1 self.extract() oldParent.insert(myIndex, replaceWith) def extract(self): """Destructively rips this element out of the tree.""" if self.parent: try: self.parent.contents.remove(self) except ValueError: pass #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect #the two. lastChild = self._lastRecursiveChild() nextElement = lastChild.next if self.previous: self.previous.next = nextElement if nextElement: nextElement.previous = self.previous self.previous = None lastChild.next = None self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling if self.nextSibling: self.nextSibling.previousSibling = self.previousSibling self.previousSibling = self.nextSibling = None return self def _lastRecursiveChild(self): "Finds the last element beneath this object to be parsed." lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: lastChild = lastChild.contents[-1] return lastChild def insert(self, position, newChild): if (isinstance(newChild, basestring) or isinstance(newChild, unicode)) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent != None: # We're 'inserting' an element that's already one # of this object's children. if newChild.parent == self: index = self.find(newChild) if index and index < position: # Furthermore we're moving it further down the # list of this object's children. That means that # when we extract this element, our target index # will jump down one. position = position - 1 newChild.extract() newChild.parent = self previousChild = None if position == 0: newChild.previousSibling = None newChild.previous = self else: previousChild = self.contents[position-1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild newChild.previous = previousChild._lastRecursiveChild() if newChild.previous: newChild.previous.next = newChild newChildsLastElement = newChild._lastRecursiveChild() if position >= len(self.contents): newChild.nextSibling = None parent = self parentsNextSibling = None while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent if not parent: # This is the last element in the document. break if parentsNextSibling: newChildsLastElement.next = parentsNextSibling else: newChildsLastElement.next = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild if newChildsLastElement.next: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def findNext(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findAllNext, name, attrs, text, **kwargs) def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findNextSiblings, name, attrs, text, **kwargs) def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousGenerator, **kwargs) fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs) def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x def findParent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given criteria.""" # NOTE: We can't use _findOne because findParents takes a different # set of arguments. r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r def findParents(self, name=None, attrs={}, limit=None, **kwargs): """Returns the parents of this Tag that match the given criteria.""" return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) fetchParents = findParents # Compatibility with pre-3.x #These methods do the real heavy lifting. def _findOne(self, method, name, attrs, text, **kwargs): r = None l = method(name, attrs, text, 1, **kwargs) if l: r = l[0] return r def _findAll(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." if isinstance(name, SoupStrainer): strainer = name else: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) g = generator() while True: try: i = g.next() except StopIteration: break if i: found = strainer.search(i) if found: results.append(found) if limit and len(results) >= limit: break return results #These Generators can be used to navigate starting from both #NavigableStrings and Tags. def nextGenerator(self): i = self while i: i = i.next yield i def nextSiblingGenerator(self): i = self while i: i = i.nextSibling yield i def previousGenerator(self): i = self while i: i = i.previous yield i def previousSiblingGenerator(self): i = self while i: i = i.previousSibling yield i def parentGenerator(self): i = self while i: i = i.parent yield i # Utility methods def substituteEncoding(self, str, encoding=None): encoding = encoding or "utf-8" return str.replace("%SOUP-ENCODING%", encoding) def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. .""" if isinstance(s, unicode): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: s = unicode(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s class NavigableString(unicode, PageElement): def __new__(cls, value): """Create a new NavigableString. When unpickling a NavigableString, this method is called with the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ if isinstance(value, unicode): return unicode.__new__(cls, value) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (unicode(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards compatibility for Navigable*String, but for CData* it lets you get the string without the CData wrapper.""" if attr == 'string': return self else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.decode().encode(encoding) def decodeGivenEventualEncoding(self, eventualEncoding): return self class CData(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'' class ProcessingInstruction(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): output = self if u'%SOUP-ENCODING%' in output: output = self.substituteEncoding(output, eventualEncoding) return u'' class Comment(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'' class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'' class Tag(PageElement, Entities): """Represents a found HTML tag with its attributes and contents.""" def _convertEntities(self, builder, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if builder.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if builder.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x def __init__(self, parser, builder, name, attrs=None, parent=None, previous=None): "Basic constructor." # We don't actually store the parser object: that lets extracted # chunks be garbage-collected self.parserClass = parser.__class__ self.name = name self.isSelfClosing = builder.isSelfClosingTag(name) if attrs == None: attrs = [] if isinstance(attrs, types.DictType): self.attrMap = attrs self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False self.containsSubstitutions = False self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # Convert any HTML, XML, or numeric entities in the attribute values. convert_one = lambda x: self._convertEntities(parser.builder, x) def convert(kval): k, val = kval if val is None: return kval return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", convert_one, val)) if isinstance(attrs, types.DictType): self.attrs = [convert(kv) for kv in attrs.items()] else: self.attrs = map(convert, attrs) def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" return self._getAttrMap().get(key, default) def has_key(self, key): return self._getAttrMap().has_key(key) def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" return self._getAttrMap()[key] def __iter__(self): "Iterating over a tag iterates over its contents." return iter(self.contents) def __len__(self): "The length of a tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): "A tag is non-None even if it has no contents." return True def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self._getAttrMap() self.attrMap[key] = value found = False for i in range(0, len(self.attrs)): if self.attrs[i][0] == key: self.attrs[i] = (key, value) found = True if not found: self.attrs.append((key, value)) self._getAttrMap()[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." for item in self.attrs: if item[0] == key: self.attrs.remove(item) #We don't break because bad HTML can define the same #attribute multiple times. self._getAttrMap() if self.attrMap.has_key(key): del self.attrMap[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return apply(self.findAll, args, kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag. NOTE: right now this will return false if two tags have the same attributes in a different order. Should this be fixed?""" if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): if self.contents[i] != other.contents[i]: return False return True def __ne__(self, other): """Returns true iff this tag is not identical to the other tag, as defined in __eq__.""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" return self.decode(eventualEncoding=encoding) BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") def _sub_entity(self, x): """Used with a regular expression to substitute the appropriate XML entity for an XML special character.""" return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" def __unicode__(self): return self.decode() def __str__(self): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) def decode(self, prettyPrint=False, indentLevel=0, eventualEncoding=DEFAULT_OUTPUT_ENCODING): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding.""" attrs = [] if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' if isString(val): if (self.containsSubstitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventualEncoding) # The attribute value either: # # * Contains no embedded double quotes or single quotes. # No problem: we enclose it in double quotes. # * Contains embedded single quotes. No problem: # double quotes work here too. # * Contains embedded double quotes. No problem: # we enclose it in single quotes. # * Embeds both single _and_ double quotes. This # can't happen naturally, but it can happen if # you modify an attribute value after parsing # the document. Now we have a bit of a # problem. We solve it by enclosing the # attribute in single quotes, and escaping any # embedded single quotes to XML entities. if '"' in val: fmt = "%s='%s'" if "'" in val: # TODO: replace with apos when # appropriate. val = val.replace("'", "&squot;") # Now we're okay w/r/t quotes. But the attribute # value might also contain angle brackets, or # ampersands that aren't part of entities. We need # to escape those to XML entities too. val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) if val is None: # Handle boolean attributes. decoded = key else: decoded = fmt % (key, val) attrs.append(decoded) close = '' closeTag = '' if self.isSelfClosing: close = ' /' else: closeTag = '' % self.name indentTag, indentContents = 0, 0 if prettyPrint: indentTag = indentLevel space = (' ' * (indentTag-1)) indentContents = indentTag + 1 contents = self.decodeContents(prettyPrint, indentContents, eventualEncoding) if self.hidden: s = contents else: s = [] attributeString = '' if attrs: attributeString = ' ' + ' '.join(attrs) if prettyPrint: s.append(space) s.append('<%s%s%s>' % (self.name, attributeString, close)) if prettyPrint: s.append("\n") s.append(contents) if prettyPrint and contents and contents[-1] != "\n": s.append("\n") if prettyPrint and closeTag: s.append(space) s.append(closeTag) if prettyPrint and closeTag and self.nextSibling: s.append("\n") s = ''.join(s) return s def decompose(self): """Recursively destroys the contents of this tree.""" contents = [i for i in self.contents] for i in contents: if isinstance(i, Tag): i.decompose() else: i.extract() self.extract() def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): return self.decodeContents(prettyPrint, indentLevel).encode(encoding) def decodeContents(self, prettyPrint=False, indentLevel=0, eventualEncoding=DEFAULT_OUTPUT_ENCODING): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" s=[] for c in self: text = None if isinstance(c, NavigableString): text = c.decodeGivenEventualEncoding(eventualEncoding) elif isinstance(c, Tag): s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) if text and prettyPrint: text = text.strip() if text: if prettyPrint: s.append(" " * (indentLevel-1)) s.append(text) if prettyPrint: s.append("\n") return ''.join(s) #Soup methods def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): """Return only the first child of this Tag matching the given criteria.""" r = None l = self.findAll(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. The value of a key-value pair in the 'attrs' map can be a string, a list of strings, a regular expression object, or a callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. The same is true of the tag name.""" generator = self.recursiveChildGenerator if not recursive: generator = self.childGenerator return self._findAll(name, attrs, text, limit, generator, **kwargs) findChildren = findAll #Private methods def _getAttrMap(self): """Initializes a map representation of this tag's attributes, if not already initialized.""" if not getattr(self, 'attrMap'): self.attrMap = {} for (key, value) in self.attrs: self.attrMap[key] = value return self.attrMap #Generator methods def childGenerator(self): for i in range(0, len(self.contents)): yield self.contents[i] raise StopIteration def recursiveChildGenerator(self): if not len(self.contents): raise StopIteration stopNode = self._lastRecursiveChild().next current = self.contents[0] while current is not stopNode: yield current current = current.next # Next, a couple classes to represent queries and their results. class SoupStrainer: """Encapsulates a number of ways of matching a markup element (tag or text).""" def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name if isString(attrs): kwargs['class'] = attrs attrs = None if kwargs: if attrs: attrs = attrs.copy() attrs.update(kwargs) else: attrs = kwargs self.attrs = attrs self.text = text def __str__(self): if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def searchTag(self, markupName=None, markupAttrs={}): found = None markup = None if isinstance(markupName, Tag): markup = markupName markupAttrs = markup callFunctionWithTagData = callable(self.name) \ and not isinstance(markupName, Tag) if (not self.name) \ or callFunctionWithTagData \ or (markup and self._matches(markup, self.name)) \ or (not markup and self._matches(markupName, self.name)): if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: match = True markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs else: markupAttrMap = {} for k,v in markupAttrs: markupAttrMap[k] = v attrValue = markupAttrMap.get(attr) if not self._matches(attrValue, matchAgainst): match = False break if match: if markup: found = markup else: found = markupName return found def search(self, markup): #print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. if isList(markup) and not isinstance(markup, Tag): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): if not self.text: found = self.searchTag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ isString(markup): if self._matches(markup, self.text): found = markup else: raise Exception, "I don't know how to match against a %s" \ % markup.__class__ return found def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: result = markup != None elif callable(matchAgainst): result = matchAgainst(markup) else: #Custom match methods take the tag as an argument, but all #other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name if markup is not None and not isString(markup): markup = unicode(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. result = markup and matchAgainst.search(markup) elif (isList(matchAgainst) and (markup is not None or not isString(matchAgainst))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) elif matchAgainst and isString(markup): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) else: matchAgainst = str(matchAgainst) if not result: result = matchAgainst == markup return result class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source): list.__init__([]) self.source = source