diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-18 16:49:58 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-18 16:49:58 -0500 |
commit | f5f0e1ea521f813964b4f9cbea53fcdfc3942f56 (patch) | |
tree | 360f985212b9f16d2b466b8ad0f93e4dc0632b28 | |
parent | 79e7e06050d9bbf4c6e6b01db1e16c79087c0555 (diff) |
Added Python docstrings to all public methods in element.py.
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | bs4/element.py | 671 |
2 files changed, 570 insertions, 106 deletions
@@ -3,12 +3,17 @@ * Fixed a deprecation warning on Python 3.7. Patch by Colin Watson. [bug=1847592] +* Added Python docstrings to most public methods. + * The html.parser tree builder now correctly handles DOCTYPEs that are not uppercase. [bug=1848401] * Added a Chinese translation by Deron Wang and a Brazilian Portuguese translation by Cezar Peixeiro to the repository. +* PageElement.select() now returns a ResultSet rather than a regular list, + making it consistent with methods like find_all(). + = 4.8.1 (20191006) * When the html.parser or html5lib parsers are in use, Beautiful Soup diff --git a/bs4/element.py b/bs4/element.py index 8684870..bba3f7e 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -44,7 +44,10 @@ def _alias(attr): class NamespacedAttribute(unicode): - + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ + def __new__(cls, prefix, name=None, namespace=None): if not name: # This is the default namespace. Its name "has no value" @@ -79,6 +82,9 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + """When an HTML document is being encoded to a given encoding, the + value of a meta tag's 'charset' is the name of the encoding. + """ return encoding @@ -110,13 +116,31 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" + """Contains the navigational information for some part of the page: + that is, its current location in the parse tree. + + NavigableString, Tag, etc. are all subclasses of PageElement. + """ def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and - other elements.""" + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately before + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. + """ self.parent = parent self.previous_element = previous_element @@ -140,7 +164,11 @@ class PageElement(object): self.previous_sibling.next_sibling = self def format_string(self, s, formatter): - """Format the given string using the given formatter.""" + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ if formatter is None: return s if not isinstance(formatter, Formatter): @@ -153,9 +181,10 @@ class PageElement(object): if necessary. :param formatter: Can be a Formatter object (used as-is), a - function (used as the entity substitution hook for an - XMLFormatter or HTMLFormatter), or a string (used to look up - an XMLFormatter or HTMLFormatter in the appropriate registry. + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look + up an XMLFormatter or HTMLFormatter in the appropriate + registry. """ if isinstance(formatter, Formatter): return formatter @@ -194,6 +223,12 @@ class PageElement(object): previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): + """Replace this PageElement with another one, keeping the rest of the + tree the same. + + :param replace_with: A PageElement. + :return: `self`, no longer part of the tree. + """ if self.parent is None: raise ValueError( "Cannot replace one element with another when the " @@ -210,6 +245,10 @@ class PageElement(object): replaceWith = replace_with # BS3 def unwrap(self): + """Replace this PageElement with its contents. + + :return: `self`, no longer part of the tree. + """ my_parent = self.parent if self.parent is None: raise ValueError( @@ -224,12 +263,21 @@ class PageElement(object): replaceWithChildren = unwrap # BS3 def wrap(self, wrap_inside): + """Wrap this PageElement inside another one. + + :param wrap_inside: A PageElement. + :return: `wrap_inside`, occupying the position in the tree that used + to be occupied by `self`, and with `self` inside it. + """ me = self.replace_with(wrap_inside) wrap_inside.append(me) return wrap_inside def extract(self): - """Destructively rips this element out of the tree.""" + """Destructively rips this element out of the tree. + + :return: `self`, no longer part of the tree. + """ if self.parent is not None: del self.parent.contents[self.parent.index(self)] @@ -258,7 +306,12 @@ class PageElement(object): return self def _last_descendant(self, is_initialized=True, accept_self=True): - "Finds the last element beneath this object to be parsed." + """Finds the last element beneath this object to be parsed. + + :param is_initialized: Has `setup` been called on this PageElement + yet? + :param accept_self: Is `self` an acceptable answer to the question? + """ if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: @@ -272,6 +325,14 @@ class PageElement(object): _lastRecursiveChild = _last_descendant def insert(self, position, new_child): + """Insert a new PageElement in the list of this PageElement's children. + + This works the same way as `list.insert`. + + :param position: The numeric position that should be occupied + in `self.children` by the new PageElement. + :param new_child: A PageElement. + """ if new_child is None: raise ValueError("Cannot insert None into a tag.") if new_child is self: @@ -346,19 +407,27 @@ class PageElement(object): self.contents.insert(position, new_child) def append(self, tag): - """Appends the given tag to the contents of this tag.""" + """Appends the given PageElement to the contents of this one. + + :param tag: A PageElement. + """ self.insert(len(self.contents), tag) def extend(self, tags): - """Appends the given tags to the contents of this tag.""" + """Appends the given PageElements to this one's contents. + + :param tags: A list of PageElements. + """ for tag in tags: self.append(tag) def insert_before(self, *args): """Makes the given element(s) the immediate predecessor of this one. - The elements will have the same parent, and the given elements + All the elements will have the same parent, and the given elements will be immediately before this one. + + :param args: One or more PageElements. """ parent = self.parent if parent is None: @@ -379,6 +448,8 @@ class PageElement(object): The elements will have the same parent, and the given elements will be immediately after this one. + + :param args: One or more PageElements. """ # Do all error checking before modifying the tree. parent = self.parent @@ -399,70 +470,165 @@ class PageElement(object): offset += 1 def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + + """ return self._find_one(self.find_all_next, name, attrs, text, **kwargs) findNext = find_next # BS3 def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" + """Find all PageElements that match the given criteria and appear + later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet containing PageElements. + """ return self._find_all(name, attrs, text, limit, self.next_elements, **kwargs) findAllNext = find_all_next # BS3 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + """ return self._find_one(self.find_next_siblings, name, attrs, text, **kwargs) findNextSibling = find_next_sibling # BS3 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" + """Find all siblings of this PageElement that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + """ return self._find_all(name, attrs, text, limit, self.next_siblings, **kwargs) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" + """Look backwards in the document from this PageElement and find the + first PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + """ return self._find_one( self.find_all_previous, name, attrs, text, **kwargs) findPrevious = find_previous # BS3 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" + """Look backwards in the document from this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + """ return self._find_all(name, attrs, text, limit, self.previous_elements, **kwargs) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" + """Returns the closest sibling to this PageElement that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + """ return self._find_one(self.find_previous_siblings, name, attrs, text, **kwargs) findPreviousSibling = find_previous_sibling # BS3 def find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + """ return self._find_all(name, attrs, text, limit, self.previous_siblings, **kwargs) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 def find_parent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + """ # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None @@ -473,9 +639,18 @@ class PageElement(object): findParent = find_parent # BS3 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" + """Find all parents of this PageElement that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + """ return self._find_all(name, attrs, None, limit, self.parents, **kwargs) findParents = find_parents # BS3 @@ -483,10 +658,18 @@ class PageElement(object): @property def next(self): + """The PageElement, if any, that was parsed just after this one. + + :return: A PageElement. + """ return self.next_element @property def previous(self): + """The PageElement, if any, that was parsed just before this one. + + :return: A PageElement. + """ return self.previous_element #These methods do the real heavy lifting. @@ -554,6 +737,10 @@ class PageElement(object): #NavigableStrings and Tags. @property def next_elements(self): + """All PageElements that were parsed after this one. + + :yield: A sequence of PageElements. + """ i = self.next_element while i is not None: yield i @@ -561,6 +748,11 @@ class PageElement(object): @property def next_siblings(self): + """All PageElements that are siblings of this one but were parsed + later. + + :yield: A sequence of PageElements. + """ i = self.next_sibling while i is not None: yield i @@ -568,6 +760,10 @@ class PageElement(object): @property def previous_elements(self): + """All PageElements that were parsed before this one. + + :yield: A sequence of PageElements. + """ i = self.previous_element while i is not None: yield i @@ -575,6 +771,11 @@ class PageElement(object): @property def previous_siblings(self): + """All PageElements that are siblings of this one but were parsed + earlier. + + :yield: A sequence of PageElements. + """ i = self.previous_sibling while i is not None: yield i @@ -582,6 +783,10 @@ class PageElement(object): @property def parents(self): + """All PageElements that are parents of this PageElement. + + :yield: A sequence of PageElements. + """ i = self.parent while i is not None: yield i @@ -606,6 +811,11 @@ class PageElement(object): class NavigableString(unicode, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a NavigableString for the string "penguin". + """ PREFIX = '' SUFFIX = '' @@ -651,35 +861,58 @@ class NavigableString(unicode, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): - """Run the string through the provided formatter.""" + """Run the string through the provided formatter. + + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @property def name(self): + """Since a NavigableString is not a Tag, it has no .name. + + This property is implemented so that code like this doesn't crash + when run on a mixture of Tag and NavigableString objects: + [x.name for x in tag.children] + """ return None @name.setter def name(self, name): + """Prevent NavigableString.name from ever being set.""" raise AttributeError("A NavigableString cannot be given a name.") + class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. - The string will be passed into the formatter (to trigger side effects), - but the return value will be ignored. + This is an abstract class used for special kinds of strings such + as comments (the Comment class) and CDATA blocks (the CData + class). """ - + + PREFIX = '' + SUFFIX = '' + def output_ready(self, formatter=None): - """CData strings are passed into the formatter, purely - for any side effects. The return value is ignored. + """Make this string ready for output by adding any subclass-specific + prefix or suffix. + + :param formatter: A Formatter object, or a string naming one + of the standard formatters. The string will be passed into the + Formatter, but only to trigger any side effects: the return + value is ignored. + + :return: The string, with any subclass-specific prefix and + suffix added on. """ if formatter is not None: ignore = self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): - + """A CDATA block.""" PREFIX = u'<![CDATA[' SUFFIX = u']]>' @@ -695,20 +928,32 @@ class XMLProcessingInstruction(ProcessingInstruction): SUFFIX = u'?>' class Comment(PreformattedString): - + """An HTML or XML comment.""" PREFIX = u'<!--' SUFFIX = u'-->' class Declaration(PreformattedString): + """An XML declaration.""" PREFIX = u'<?' SUFFIX = u'?>' class Doctype(PreformattedString): - + """A document type declaration.""" @classmethod def for_name_and_ids(cls, name, pub_id, system_id): + """Generate an appropriate document type declaration for a given + public ID and system ID. + + :param name: The name of the document's root element, e.g. 'html'. + :param pub_id: The Formal Public Identifier for this document type, + e.g. '-//W3C//DTD XHTML 1.1//EN' + :param system_id: The system identifier for this document type, + e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' + + :return: A Doctype. + """ value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id @@ -724,8 +969,12 @@ class Doctype(PreformattedString): class Tag(PageElement): + """Represents an HTML or XML tag that is part of a parse tree, along + with its attributes and contents. - """Represents a found HTML tag with its attributes and contents.""" + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a Tag object representing the <b> tag. + """ def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, @@ -733,8 +982,31 @@ class Tag(PageElement): can_be_empty_element=None, cdata_list_attributes=None, preserve_whitespace_tags=None ): - "Basic constructor." - + """Basic constructor. + + :param parser: A BeautifulSoup object. + :param builder: A TreeBuilder. + :param name: The name of the tag. + :param namespace: The URI of this Tag's XML namespace, if any. + :param prefix: The prefix for this Tag's XML namespace, if any. + :param attrs: A dictionary of this Tag's attribute values. + :param parent: The PageElement to use as this Tag's parent. + :param previous: The PageElement that was parsed immediately before + this tag. + :param is_xml: If True, this is an XML tag. Otherwise, this is an + HTML tag. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within `sourceline` where this + tag was found. + :param can_be_empty_element: If True, this tag should be + represented as <tag/>. If False, this tag should be represented + as <tag></tag>. + :param cdata_list_attributes: A list of attributes whose values should + be treated as CDATA if they ever show up on this tag. + :param preserve_whitespace_tags: A list of tag names whose contents + should have their whitespace preserved. + """ if parser is None: self.parser_class = None else: @@ -840,13 +1112,17 @@ class Tag(PageElement): @property def string(self): - """Convenience property to get the single string within this tag. + """Convenience property to get the single string within this + PageElement. + + TODO It might make sense to have NavigableString.string return + itself. - :Return: If this tag has a single string child, return value - is that string. If this tag has no children, or more than one - child, return value is None. If this tag has one child tag, + :return: If this element has a single string child, return + value is that string. If this element has one child tag, return value is the 'string' attribute of the child tag, - recursively. + recursively. If this element is itself a string, has no + children, or has more than one child, return value is None. """ if len(self.contents) != 1: return None @@ -857,14 +1133,23 @@ class Tag(PageElement): @string.setter def string(self, string): + """Replace this PageElement's contents with `string`.""" self.clear() self.append(string.__class__(string)) def _all_strings(self, strip=False, types=(NavigableString, CData)): """Yield all strings of certain classes, possibly stripping them. - By default, yields only NavigableString and CData objects. So - no comments, processing instructions, etc. + :param strip: If True, all strings will be stripped before being + yielded. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + etc. + + :yield: A sequence of strings. """ for descendant in self.descendants: if ( @@ -882,13 +1167,29 @@ class Tag(PageElement): @property def stripped_strings(self): + """Yield all strings in the document, stripping them first. + + :yield: A sequence of stripped strings. + """ for string in self._all_strings(True): yield string def get_text(self, separator=u"", strip=False, types=(NavigableString, CData)): - """ - Get all child strings, concatenated using the given separator. + """Get all child strings, concatenated using the given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + etc. + + :return: A string. """ return separator.join([s for s in self._all_strings( strip, types=types)]) @@ -896,7 +1197,11 @@ class Tag(PageElement): text = property(get_text) def decompose(self): - """Recursively destroys the contents of this tree.""" + """Recursively destroys this PageElement and its children. + + This element will be removed from the tree and wiped out; so + will everything beneath it. + """ self.extract() i = self while i is not None: @@ -906,8 +1211,11 @@ class Tag(PageElement): i = next def clear(self, decompose=False): - """ - Extract all children. If decompose is True, decompose instead. + """Wipe out all children of this PageElement by calling extract() + on them. + + :param decompose: If this is True, decompose() (a more + destructive method) will be called instead of extract(). """ if decompose: for element in self.contents[:]: @@ -920,7 +1228,8 @@ class Tag(PageElement): element.extract() def smooth(self): - """Smooth out this element's children by consolidating consecutive strings. + """Smooth out this element's children by consolidating consecutive + strings. This makes pretty-printed output look more natural following a lot of operations that modified the tree. @@ -957,9 +1266,12 @@ class Tag(PageElement): a.replace_with(n) def index(self, element): - """ - Find the index of a child by identity, not value. Avoids issues with - tag.contents.index(element) getting the index of equal elements. + """Find the index of a child by identity, not value. + + Avoids issues with tag.contents.index(element) getting the + index of equal elements. + + :param element: Look for this PageElement in `self.contents`. """ for i, child in enumerate(self.contents): if child is element: @@ -973,29 +1285,37 @@ class Tag(PageElement): return self.attrs.get(key, default) def get_attribute_list(self, key, default=None): - """The same as get(), but always returns a list.""" + """The same as get(), but always returns a list. + + :param key: The attribute to look for. + :param default: Use this value if the attribute is not present + on this PageElement. + :return: A list of values, probably containing only a single + value. + """ value = self.get(key, default) if not isinstance(value, list): value = [value] return value def has_attr(self, key): + """Does this PageElement have an attribute with the given name?""" return key in self.attrs def __hash__(self): return str(self).__hash__() def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, + """tag[key] returns the value of the 'key' attribute for the Tag, and throws an exception if it's not there.""" return self.attrs[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + "Iterating over a Tag iterates over its contents." return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + "The length of a Tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): @@ -1015,12 +1335,13 @@ class Tag(PageElement): self.attrs.pop(key, None) def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its + """Calling a Tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return self.find_all(*args, **kwargs) def __getattr__(self, tag): + """Calling tag.subtag is the same as calling tag.find(name="subtag")""" #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.endswith('Tag'): # BS3: soup.aTag -> "soup.find("a") @@ -1038,8 +1359,8 @@ class Tag(PageElement): "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag.""" + """Returns true iff this Tag has the same name, the same attributes, + and the same contents (recursively) as `other`.""" if self is other: return True if (not hasattr(other, 'name') or @@ -1055,12 +1376,17 @@ class Tag(PageElement): return True def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, + """Returns true iff this Tag is not identical to `other`, as defined in __eq__.""" return not self == other def __repr__(self, encoding="unicode-escape"): - """Renders this tag as a string.""" + """Renders this PageElement as a string. + + :param encoding: The encoding to use (Python 2 only). + :return: Under Python 2, a bytestring; under Python 3, + a Unicode string. + """ if PY3K: # "The return value must be a string object", i.e. Unicode return self.decode() @@ -1071,9 +1397,15 @@ class Tag(PageElement): return self.encode(encoding) def __unicode__(self): + """Renders this PageElement as a Unicode string.""" return self.decode() def __str__(self): + """Renders this PageElement as a generic string. + + :return: Under Python 2, a UTF-8 bytestring; under Python 3, + a Unicode string. + """ if PY3K: return self.decode() else: @@ -1085,6 +1417,22 @@ class Tag(PageElement): def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): + """Render a bytestring representation of this PageElement and its + contents. + + :param encoding: The destination encoding. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :param errors: An error handling strategy such as + 'xmlcharrefreplace'. This value is passed along into + encode() and its value should be one of the constants + defined by Python. + :return: A bytestring. + + """ # Turn the data structure into Unicode, then encode the # Unicode. u = self.decode(indent_level, encoding, formatter) @@ -1093,14 +1441,20 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Returns a Unicode representation of this tag and its contents. + """Render a Unicode representation of this PageElement and its + contents. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. """ # First off, turn a non-Formatter `formatter` into a Formatter @@ -1186,7 +1540,11 @@ class Tag(PageElement): return s def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" + """Should this tag be pretty-printed? + + Most of them should, but some (such as <pre> in HTML + documents) should not. + """ return ( indent_level is not None and ( @@ -1196,6 +1554,15 @@ class Tag(PageElement): ) def prettify(self, encoding=None, formatter="minimal"): + """Pretty-print this PageElement as a string. + + :param encoding: The eventual encoding of the string. If this is None, + a Unicode string will be returned. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :return: A Unicode string (if encoding==None) or a bytestring + (otherwise). + """ if encoding is None: return self.decode(True, formatter=formatter) else: @@ -1207,7 +1574,8 @@ class Tag(PageElement): """Renders the contents of this tag as a Unicode string. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The tag is destined to be encoded into this encoding. decode_contents() is _not_ @@ -1249,23 +1617,26 @@ class Tag(PageElement): def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Renders the contents of this tag as a bytestring. + """Renders the contents of this PageElement as a bytestring. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The bytestring will be in this encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. - """ + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + :return: A bytestring. + """ contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) # Old method for BS3 compatibility def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): + """Deprecated method for BS3 compatibility.""" if not prettyPrint: indentLevel = None return self.encode_contents( @@ -1275,8 +1646,21 @@ class Tag(PageElement): def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" + """Look in the children of this PageElement and find the first + PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + """ r = None l = self.find_all(name, attrs, recursive, text, 1, **kwargs) if l: @@ -1286,16 +1670,21 @@ class Tag(PageElement): def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - + """Look in the children of this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find_all() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + """ generator = self.descendants if not recursive: generator = self.children @@ -1306,11 +1695,20 @@ class Tag(PageElement): #Generator methods @property def children(self): + """Iterate over all direct children of this PageElement. + + :yield: A sequence of PageElements. + """ # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property def descendants(self): + """Iterate over all children of this PageElement in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ if not len(self.contents): return stopNode = self._last_descendant().next_element @@ -1321,7 +1719,20 @@ class Tag(PageElement): # CSS selector code def select_one(self, selector, namespaces=None, **kwargs): - """Perform a CSS selection operation on the current element.""" + """Perform a CSS selection operation on the current element. + + :param selector: A CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A PageElement. + """ value = self.select(selector, namespaces, 1, **kwargs) if value: return value[0] @@ -1335,14 +1746,16 @@ class Tag(PageElement): :param selector: A string containing a CSS selector. :param namespaces: A dictionary mapping namespace prefixes - used in the CSS selector to namespace URIs. By default, - Beautiful Soup will use the prefixes it encountered while - parsing the document. + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. :param limit: After finding this number of results, stop looking. - :param kwargs: Any extra arguments you'd like to pass in to - soupsieve.select(). + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A ResultSet of PageElements. """ if namespaces is None: namespaces = self._namespaces @@ -1354,19 +1767,26 @@ class Tag(PageElement): "Cannot execute CSS selectors because the soupsieve package is not installed." ) - return soupsieve.select(selector, self, namespaces, limit, **kwargs) + results = soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # We do this because it's more consistent and because + # ResultSet.__getattr__ has a helpful error message. + return ResultSet(None, results) # Old names for backwards compatibility def childGenerator(self): + """Deprecated generator.""" return self.children def recursiveChildGenerator(self): + """Deprecated generator.""" return self.descendants def has_key(self, key): - """This was kind of misleading because has_key() (attributes) - was different from __in__ (contents). has_key() is gone in - Python 3, anyway.""" + """Deprecated method. This was kind of misleading because has_key() + (attributes) was different from __in__ (contents). has_key() + is gone in Python 3, anyway. + """ warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( key)) return self.has_attr(key) @@ -1377,6 +1797,17 @@ class SoupStrainer(object): text).""" def __init__(self, name=None, attrs={}, text=None, **kwargs): + """Constructor. + + The SoupStrainer constructor takes the same arguments passed + into the find_* methods. See the online documentation for + detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + """ self.name = self._normalize_search_value(name) if not isinstance(attrs, dict): # Treat a non-dict value for attrs as a search for the 'class' @@ -1434,12 +1865,25 @@ class SoupStrainer(object): return unicode(str(value)) def __str__(self): + """A string representation of this SoupStrainer.""" if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def search_tag(self, markup_name=None, markup_attrs={}): + """Check whether a Tag with the given name and attributes would + match this SoupStrainer. + + Used prospectively to decide whether to even bother creating a Tag + object. + + :param markup_name: A tag name as found in some markup. + :param markup_attrs: A dictionary of attributes as found in some markup. + + :return: True if the prospective tag would match this SoupStrainer; + False otherwise. + """ found = None markup = None if isinstance(markup_name, Tag): @@ -1478,9 +1922,18 @@ class SoupStrainer(object): if found and self.text and not self._matches(found.string, self.text): found = None return found + + # For BS3 compatibility. searchTag = search_tag def search(self, markup): + """Find all items in `markup` that match this SoupStrainer. + + Used by the core _find_all() method, which is ultimately + called by all find_* methods. + + :param markup: A PageElement or a list of them. + """ # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that @@ -1593,10 +2046,16 @@ class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source, result=()): + """Constructor. + + :param source: A SoupStrainer. + :param result: A list of PageElements. + """ super(ResultSet, self).__init__(result) self.source = source def __getattr__(self, key): + """Raise a helpful exception.""" raise AttributeError( - "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key + "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key ) |