diff options
Diffstat (limited to 'beautifulsoup/element.py')
-rw-r--r-- | beautifulsoup/element.py | 855 |
1 files changed, 0 insertions, 855 deletions
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py deleted file mode 100644 index 61ed4ab..0000000 --- a/beautifulsoup/element.py +++ /dev/null @@ -1,855 +0,0 @@ -import re -import types -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} -from beautifulsoup.dammit import EntitySubstitution - -from util import isList - -DEFAULT_OUTPUT_ENCODING = "utf-8" - - -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent=None, previous=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.contents.index(self) - if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: - # We're replacing this element with one of its siblings. - index = self.parent.contents.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - self.parent.contents.remove(self) - except ValueError: - pass - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - return self - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." - lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent != None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent == self: - index = self.find(newChild) - if index and index < position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position-1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. - break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.insert(len(self.contents), tag) - - def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.find_all_next, name, attrs, text, **kwargs) - findNext = find_next # BS3 - - def find_all_next(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.next_elements, - **kwargs) - findAllNext = find_all_next # BS3 - - def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.find_next_siblings, name, attrs, text, - **kwargs) - findNextSibling = find_next_sibling # BS3 - - def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.next_siblings, **kwargs) - findNextSiblings = find_next_siblings # BS3 - fetchNextSiblings = find_next_siblings # BS2 - - def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne( - self.find_all_previous, name, attrs, text, **kwargs) - findPrevious = find_previous # BS3 - - def find_all_previous(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.previous_elements, - **kwargs) - findAllPrevious = find_all_previous # BS3 - fetchPrevious = find_all_previous # BS2 - - def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.find_previous_siblings, name, attrs, text, - **kwargs) - findPreviousSibling = find_previous_sibling # BS3 - - def find_previous_siblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.previous_siblings, **kwargs) - findPreviousSiblings = find_previous_siblings # BS3 - fetchPreviousSiblings = find_previous_siblings # BS2 - - def find_parent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. - r = None - l = self.find_parents(name, attrs, 1) - if l: - r = l[0] - return r - findParent = find_parent # BS3 - - def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) - findParents = find_parents # BS3 - fetchParents = find_parents # BS2 - - #These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _find_all(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - else: - # Build a SoupStrainer - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - while True: - try: - i = generator.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These generators can be used to navigate starting from both - #NavigableStrings and Tags. - @property - def next_elements(self): - i = self - while i: - i = i.next - yield i - - @property - def next_siblings(self): - i = self - while i: - i = i.nextSibling - yield i - - @property - def previous_elements(self): - i = self - while i: - i = i.previous - yield i - - @property - def previous_siblings(self): - i = self - while i: - i = i.previousSibling - yield i - - @property - def parents(self): - i = self - while i: - i = i.parent - yield i - - # Old non-property versions of the generators, for backwards - # compatibility with BS3. - def nextGenerator(self): - return self.next_elements - - def nextSiblingGenerator(self): - return self.next_siblings - - def previousGenerator(self): - return self.previous_elements - - def previousSiblingGenerator(self): - return self.previous_siblings - - def parentGenerator(self): - return self.parents - - # Utility methods - def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - -class NavigableString(unicode, PageElement): - - PREFIX = '' - SUFFIX = '' - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - - def __getnewargs__(self): - return (unicode(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def output_ready(self, substitute_html_entities=False): - if substitute_html_entities: - output = EntitySubstitution.substitute_html(self) - else: - output = self - return self.PREFIX + output + self.SUFFIX - - -class CData(NavigableString): - - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' - - -class ProcessingInstruction(NavigableString): - - PREFIX = u'<?' - SUFFIX = u'?>' - - -class Comment(NavigableString): - - PREFIX = u'<!--' - SUFFIX = u'-->' - -class Declaration(NavigableString): - PREFIX = u'<!' - SUFFIX = u'!>' - - -class Doctype(NavigableString): - - @classmethod - def for_name_and_ids(cls, name, pub_id, system_id): - value = name - if pub_id is not None: - value += ' PUBLIC "%s"' % pub_id - if system_id is not None: - value += ' SYSTEM "%s"' % system_id - - return Doctype(value) - - PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>' - - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def __init__(self, parser, builder, name, attrs=None, parent=None, - previous=None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected. - self.parserClass = parser.__class__ - self.name = name - if attrs == None: - attrs = {} - else: - attrs = dict(attrs) - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - - # Set up any substitutions, such as the charset in a META tag. - self.contains_substitutions = builder.set_up_substitutions(self) - - self.can_be_empty_element = builder.can_be_empty_element(name) - - @property - def is_empty_element(self): - """Is this tag an empty-element tag? (aka a self-closing tag) - - A tag that has contents is never an empty-element tag. - - A tag that has no contents may or may not be an empty-element - tag. It depends on the builder used to create the tag. If the - builder has a designated list of empty-element tags, then only - a tag whose name shows up in that list is considered an - empty-element tag. - - If the builder has no designated list of empty-element tags, - then any tag with no contents is an empty-element tag. - """ - return len(self.contents) == 0 and self.can_be_empty_element - isSelfClosing = is_empty_element # BS3 - - - @property - def string(self): - """Convenience property to get the single string within this tag. - - :Return: If this tag has a single string child, return value - is that string. If this tag has no children, or more than one - child, return value is None. If this tag has one child tag, - return value is the 'string' attribute of the child tag, - recursively. - """ - if len(self.contents) != 1: - return None - child = self.contents[0] - if isinstance(child, NavigableString): - return child - return child.string - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self.attrs.get(key, default) - - def has_key(self, key): - return self.attrs.has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self.attrs[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self.attrs[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - if self.attrs.has_key(key): - del self.attrs[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - find_all() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.find_all, args, kwargs) - - def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - XXX: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.encode(encoding) - - def __unicode__(self): - return self.decode() - - def __str__(self): - return self.encode() - - def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, substitute_html_entities=False): - return self.decode(indent_level, encoding, - substitute_html_entities).encode(encoding) - - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): - """Returns a Unicode representation of this tag and its contents. - - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - """ - attrs = [] - if self.attrs: - for key, val in sorted(self.attrs.items()): - if val is None: - decoded = key - else: - if not isinstance(val, basestring): - val = str(val) - if (self.contains_substitutions - and eventual_encoding is not None - and '%SOUP-ENCODING%' in val): - val = self.substituteEncoding(val, eventual_encoding) - - decoded = (key + '=' - + EntitySubstitution.substitute_xml(val, True)) - attrs.append(decoded) - close = '' - closeTag = '' - if self.is_empty_element: - close = ' /' - else: - closeTag = '</%s>' % self.name - - pretty_print = (indent_level is not None) - if pretty_print: - space = (' ' * (indent_level-1)) - indent_contents = indent_level + 1 - else: - space = '' - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, substitute_html_entities) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if pretty_print: - s.append(space) - s.append('<%s%s%s>' % (self.name, attributeString, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if pretty_print and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def decompose(self): - """Recursively destroys the contents of this tree.""" - contents = [i for i in self.contents] - for i in contents: - if isinstance(i, Tag): - i.decompose() - else: - i.extract() - self.extract() - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.encode(encoding, True) - - def decode_contents(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): - """Renders the contents of this tag as a Unicode string. - - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - """ - pretty_print = (indent_level is not None) - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(substitute_html_entities) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - substitute_html_entities)) - if text and indent_level: - text = text.strip() - if text: - if pretty_print: - s.append(" " * (indent_level-1)) - s.append(text) - if pretty_print: - s.append("\n") - return ''.join(s) - - #Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.find_all(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def find_all(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursive_children - if not recursive: - generator = self.children - return self._find_all(name, attrs, text, limit, generator, **kwargs) - findAll = find_all # BS3 - findChildren = find_all # BS2 - - #Generator methods - @property - def children(self): - for i in range(0, len(self.contents)): - yield self.contents[i] - raise StopIteration - - @property - def recursive_children(self): - if not len(self.contents): - raise StopIteration - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next - - # Old names for backwards compatibility - def childGenerator(self): - return self.children - - def recursiveChildGenerator(self): - return self.recursive_children - - -# Next, a couple classes to represent queries and their results. -class SoupStrainer(object): - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = attrs - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k,v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if isList(markup) and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: - result = markup != None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup is not None and not isinstance(markup, basestring): - markup = unicode(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif (isList(matchAgainst) - and (markup is not None - or not isinstance(matchAgainst, basestring))): - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source): - list.__init__([]) - self.source = source |