summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG5
-rw-r--r--bs4/element.py671
2 files changed, 570 insertions, 106 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 921f639..a691583 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,12 +3,17 @@
* Fixed a deprecation warning on Python 3.7. Patch by Colin
Watson. [bug=1847592]
+* Added Python docstrings to most public methods.
+
* The html.parser tree builder now correctly handles DOCTYPEs that are
not uppercase. [bug=1848401]
* Added a Chinese translation by Deron Wang and a Brazilian Portuguese
translation by Cezar Peixeiro to the repository.
+* PageElement.select() now returns a ResultSet rather than a regular list,
+ making it consistent with methods like find_all().
+
= 4.8.1 (20191006)
* When the html.parser or html5lib parsers are in use, Beautiful Soup
diff --git a/bs4/element.py b/bs4/element.py
index 8684870..bba3f7e 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -44,7 +44,10 @@ def _alias(attr):
class NamespacedAttribute(unicode):
-
+ """A namespaced string (e.g. 'xml:lang') that remembers the namespace
+ ('xml') and the name ('lang') that were used to create it.
+ """
+
def __new__(cls, prefix, name=None, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
@@ -79,6 +82,9 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ """When an HTML document is being encoded to a given encoding, the
+ value of a meta tag's 'charset' is the name of the encoding.
+ """
return encoding
@@ -110,13 +116,31 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
+ """Contains the navigational information for some part of the page:
+ that is, its current location in the parse tree.
+
+ NavigableString, Tag, etc. are all subclasses of PageElement.
+ """
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
- other elements."""
+ other elements.
+
+ :param parent: The parent of this element.
+
+ :param previous_element: The element parsed immediately before
+ this one.
+
+ :param next_element: The element parsed immediately after
+ this one.
+
+ :param previous_sibling: The most recently encountered element
+ on the same level of the parse tree as this one.
+
+ :param next_sibling: The next element to be encountered
+ on the same level of the parse tree as this one.
+ """
self.parent = parent
self.previous_element = previous_element
@@ -140,7 +164,11 @@ class PageElement(object):
self.previous_sibling.next_sibling = self
def format_string(self, s, formatter):
- """Format the given string using the given formatter."""
+ """Format the given string using the given formatter.
+
+ :param s: A string.
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
if formatter is None:
return s
if not isinstance(formatter, Formatter):
@@ -153,9 +181,10 @@ class PageElement(object):
if necessary.
:param formatter: Can be a Formatter object (used as-is), a
- function (used as the entity substitution hook for an
- XMLFormatter or HTMLFormatter), or a string (used to look up
- an XMLFormatter or HTMLFormatter in the appropriate registry.
+ function (used as the entity substitution hook for an
+ XMLFormatter or HTMLFormatter), or a string (used to look
+ up an XMLFormatter or HTMLFormatter in the appropriate
+ registry).
"""
if isinstance(formatter, Formatter):
return formatter
@@ -194,6 +223,12 @@ class PageElement(object):
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
+ """Replace this PageElement with another one, keeping the rest of the
+ tree the same.
+
+ :param replace_with: A PageElement.
+ :return: `self`, no longer part of the tree.
+ """
if self.parent is None:
raise ValueError(
"Cannot replace one element with another when the "
@@ -210,6 +245,10 @@ class PageElement(object):
replaceWith = replace_with # BS3
def unwrap(self):
+ """Replace this PageElement with its contents.
+
+ :return: `self`, no longer part of the tree.
+ """
my_parent = self.parent
if self.parent is None:
raise ValueError(
@@ -224,12 +263,21 @@ class PageElement(object):
replaceWithChildren = unwrap # BS3
def wrap(self, wrap_inside):
+ """Wrap this PageElement inside another one.
+
+ :param wrap_inside: A PageElement.
+ :return: `wrap_inside`, occupying the position in the tree that used
+ to be occupied by `self`, and with `self` inside it.
+ """
me = self.replace_with(wrap_inside)
wrap_inside.append(me)
return wrap_inside
def extract(self):
- """Destructively rips this element out of the tree."""
+ """Destructively rips this element out of the tree.
+
+ :return: `self`, no longer part of the tree.
+ """
if self.parent is not None:
del self.parent.contents[self.parent.index(self)]
@@ -258,7 +306,12 @@ class PageElement(object):
return self
def _last_descendant(self, is_initialized=True, accept_self=True):
- "Finds the last element beneath this object to be parsed."
+ """Finds the last element beneath this object to be parsed.
+
+ :param is_initialized: Has `setup` been called on this PageElement
+ yet?
+ :param accept_self: Is `self` an acceptable answer to the question?
+ """
if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element
else:
@@ -272,6 +325,14 @@ class PageElement(object):
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
+ """Insert a new PageElement in the list of this PageElement's children.
+
+ This works the same way as `list.insert`.
+
+ :param position: The numeric position that should be occupied
+ in `self.contents` by the new PageElement.
+ :param new_child: A PageElement.
+ """
if new_child is None:
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
@@ -346,19 +407,27 @@ class PageElement(object):
self.contents.insert(position, new_child)
def append(self, tag):
- """Appends the given tag to the contents of this tag."""
+ """Appends the given PageElement to the contents of this one.
+
+ :param tag: A PageElement.
+ """
self.insert(len(self.contents), tag)
def extend(self, tags):
- """Appends the given tags to the contents of this tag."""
+ """Appends the given PageElements to this one's contents.
+
+ :param tags: A list of PageElements.
+ """
for tag in tags:
self.append(tag)
def insert_before(self, *args):
"""Makes the given element(s) the immediate predecessor of this one.
- The elements will have the same parent, and the given elements
+ All the elements will have the same parent, and the given elements
will be immediately before this one.
+
+ :param args: One or more PageElements.
"""
parent = self.parent
if parent is None:
@@ -379,6 +448,8 @@ class PageElement(object):
The elements will have the same parent, and the given elements
will be immediately after this one.
+
+ :param args: One or more PageElements.
"""
# Do all error checking before modifying the tree.
parent = self.parent
@@ -399,70 +470,165 @@ class PageElement(object):
offset += 1
def find_next(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears after this Tag in the document."""
+ """Find the first PageElement that matches the given criteria and
+ appears later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+
+ """
return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
findNext = find_next # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- after this Tag in the document."""
+ """Find all PageElements that match the given criteria and appear
+ later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet containing PageElements.
+ """
return self._find_all(name, attrs, text, limit, self.next_elements,
**kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears after this Tag in the document."""
+ """Find the closest sibling to this PageElement that matches the
+ given criteria and appears later in the document.
+
+ All find_* methods take a common set of arguments. See the
+ online documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ """
return self._find_one(self.find_next_siblings, name, attrs, text,
**kwargs)
findNextSibling = find_next_sibling # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear after this Tag in the document."""
+ """Find all siblings of this PageElement that match the given criteria
+ and appear later in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ """
return self._find_all(name, attrs, text, limit,
self.next_siblings, **kwargs)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears before this Tag in the document."""
+ """Look backwards in the document from this PageElement and find the
+ first PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ """
return self._find_one(
self.find_all_previous, name, attrs, text, **kwargs)
findPrevious = find_previous # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- before this Tag in the document."""
+ """Look backwards in the document from this PageElement and find all
+ PageElements that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ """
return self._find_all(name, attrs, text, limit, self.previous_elements,
**kwargs)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears before this Tag in the document."""
+ """Returns the closest sibling to this PageElement that matches the
+ given criteria and appears earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ """
return self._find_one(self.find_previous_siblings, name, attrs, text,
**kwargs)
findPreviousSibling = find_previous_sibling # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
limit=None, **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear before this Tag in the document."""
+ """Returns all siblings to this PageElement that match the
+ given criteria and appear earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ """
return self._find_all(name, attrs, text, limit,
self.previous_siblings, **kwargs)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
- """Returns the closest parent of this Tag that matches the given
- criteria."""
+ """Find the closest parent of this PageElement that matches the given
+ criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A PageElement.
+ """
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
@@ -473,9 +639,18 @@ class PageElement(object):
findParent = find_parent # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
- """Returns the parents of this Tag that match the given
- criteria."""
+ """Find all parents of this PageElement that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A ResultSet of PageElements.
+ """
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
findParents = find_parents # BS3
@@ -483,10 +658,18 @@ class PageElement(object):
@property
def next(self):
+ """The PageElement, if any, that was parsed just after this one.
+
+ :return: A PageElement.
+ """
return self.next_element
@property
def previous(self):
+ """The PageElement, if any, that was parsed just before this one.
+
+ :return: A PageElement.
+ """
return self.previous_element
#These methods do the real heavy lifting.
@@ -554,6 +737,10 @@ class PageElement(object):
#NavigableStrings and Tags.
@property
def next_elements(self):
+ """All PageElements that were parsed after this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_element
while i is not None:
yield i
@@ -561,6 +748,11 @@ class PageElement(object):
@property
def next_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ later.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_sibling
while i is not None:
yield i
@@ -568,6 +760,10 @@ class PageElement(object):
@property
def previous_elements(self):
+ """All PageElements that were parsed before this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_element
while i is not None:
yield i
@@ -575,6 +771,11 @@ class PageElement(object):
@property
def previous_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ earlier.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_sibling
while i is not None:
yield i
@@ -582,6 +783,10 @@ class PageElement(object):
@property
def parents(self):
+ """All PageElements that are parents of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
i = self.parent
while i is not None:
yield i
@@ -606,6 +811,11 @@ class PageElement(object):
class NavigableString(unicode, PageElement):
+ """A Python Unicode string that is part of a parse tree.
+
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a NavigableString for the string "penguin".
+ """
PREFIX = ''
SUFFIX = ''
@@ -651,35 +861,58 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
- """Run the string through the provided formatter."""
+ """Run the string through the provided formatter.
+
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@property
def name(self):
+ """Since a NavigableString is not a Tag, it has no .name.
+
+ This property is implemented so that code like this doesn't crash
+ when run on a mixture of Tag and NavigableString objects:
+ [x.name for x in tag.children]
+ """
return None
@name.setter
def name(self, name):
+ """Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")
+
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
- The string will be passed into the formatter (to trigger side effects),
- but the return value will be ignored.
+ This is an abstract class used for special kinds of strings such
+ as comments (the Comment class) and CDATA blocks (the CData
+ class).
"""
-
+
+ PREFIX = ''
+ SUFFIX = ''
+
def output_ready(self, formatter=None):
- """CData strings are passed into the formatter, purely
- for any side effects. The return value is ignored.
+ """Make this string ready for output by adding any subclass-specific
+ prefix or suffix.
+
+ :param formatter: A Formatter object, or a string naming one
+ of the standard formatters. The string will be passed into the
+ Formatter, but only to trigger any side effects: the return
+ value is ignored.
+
+ :return: The string, with any subclass-specific prefix and
+ suffix added on.
"""
if formatter is not None:
ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
-
+ """A CDATA block."""
PREFIX = u'<![CDATA['
SUFFIX = u']]>'
@@ -695,20 +928,32 @@ class XMLProcessingInstruction(ProcessingInstruction):
SUFFIX = u'?>'
class Comment(PreformattedString):
-
+ """An HTML or XML comment."""
PREFIX = u'<!--'
SUFFIX = u'-->'
class Declaration(PreformattedString):
+ """An XML declaration."""
PREFIX = u'<?'
SUFFIX = u'?>'
class Doctype(PreformattedString):
-
+ """A document type declaration."""
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
+ """Generate an appropriate document type declaration for a given
+ public ID and system ID.
+
+ :param name: The name of the document's root element, e.g. 'html'.
+ :param pub_id: The Formal Public Identifier for this document type,
+ e.g. '-//W3C//DTD XHTML 1.1//EN'
+ :param system_id: The system identifier for this document type,
+ e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+
+ :return: A Doctype.
+ """
value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
@@ -724,8 +969,12 @@ class Doctype(PreformattedString):
class Tag(PageElement):
+ """Represents an HTML or XML tag that is part of a parse tree, along
+ with its attributes and contents.
- """Represents a found HTML tag with its attributes and contents."""
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a Tag object representing the <b> tag.
+ """
def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None,
@@ -733,8 +982,31 @@ class Tag(PageElement):
can_be_empty_element=None, cdata_list_attributes=None,
preserve_whitespace_tags=None
):
- "Basic constructor."
-
+ """Basic constructor.
+
+ :param parser: A BeautifulSoup object.
+ :param builder: A TreeBuilder.
+ :param name: The name of the tag.
+ :param namespace: The URI of this Tag's XML namespace, if any.
+ :param prefix: The prefix for this Tag's XML namespace, if any.
+ :param attrs: A dictionary of this Tag's attribute values.
+ :param parent: The PageElement to use as this Tag's parent.
+ :param previous: The PageElement that was parsed immediately before
+ this tag.
+ :param is_xml: If True, this is an XML tag. Otherwise, this is an
+ HTML tag.
+ :param sourceline: The line number where this tag was found in its
+ source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was found.
+ :param can_be_empty_element: If True, this tag should be
+ represented as <tag/>. If False, this tag should be represented
+ as <tag></tag>.
+ :param cdata_list_attributes: A list of attributes whose values should
+ be treated as CDATA if they ever show up on this tag.
+ :param preserve_whitespace_tags: A list of tag names whose contents
+ should have their whitespace preserved.
+ """
if parser is None:
self.parser_class = None
else:
@@ -840,13 +1112,17 @@ class Tag(PageElement):
@property
def string(self):
- """Convenience property to get the single string within this tag.
+ """Convenience property to get the single string within this
+ PageElement.
+
+ TODO It might make sense to have NavigableString.string return
+ itself.
- :Return: If this tag has a single string child, return value
- is that string. If this tag has no children, or more than one
- child, return value is None. If this tag has one child tag,
+ :return: If this element has a single string child, return
+ value is that string. If this element has one child tag,
return value is the 'string' attribute of the child tag,
- recursively.
+ recursively. If this element is itself a string, has no
+ children, or has more than one child, return value is None.
"""
if len(self.contents) != 1:
return None
@@ -857,14 +1133,23 @@ class Tag(PageElement):
@string.setter
def string(self, string):
+ """Replace this PageElement's contents with `string`."""
self.clear()
self.append(string.__class__(string))
def _all_strings(self, strip=False, types=(NavigableString, CData)):
"""Yield all strings of certain classes, possibly stripping them.
- By default, yields only NavigableString and CData objects. So
- no comments, processing instructions, etc.
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :types: A tuple of NavigableString subclasses. Any strings of
+ a subclass not found in this list will be ignored. By
+ default, this means only NavigableString and CData objects
+ will be considered. So no comments, processing instructions,
+ etc.
+
+ :yield: A sequence of strings.
"""
for descendant in self.descendants:
if (
@@ -882,13 +1167,29 @@ class Tag(PageElement):
@property
def stripped_strings(self):
+ """Yield all strings in the document, stripping them first.
+
+ :yield: A sequence of stripped strings.
+ """
for string in self._all_strings(True):
yield string
def get_text(self, separator=u"", strip=False,
types=(NavigableString, CData)):
- """
- Get all child strings, concatenated using the given separator.
+ """Get all child strings, concatenated using the given separator.
+
+ :param separator: Strings will be concatenated using this separator.
+
+ :param strip: If True, strings will be stripped before being
+ concatenated.
+
+ :types: A tuple of NavigableString subclasses. Any strings of
+ a subclass not found in this list will be ignored. By
+ default, this means only NavigableString and CData objects
+ will be considered. So no comments, processing instructions,
+ etc.
+
+ :return: A string.
"""
return separator.join([s for s in self._all_strings(
strip, types=types)])
@@ -896,7 +1197,11 @@ class Tag(PageElement):
text = property(get_text)
def decompose(self):
- """Recursively destroys the contents of this tree."""
+ """Recursively destroys this PageElement and its children.
+
+ This element will be removed from the tree and wiped out; so
+ will everything beneath it.
+ """
self.extract()
i = self
while i is not None:
@@ -906,8 +1211,11 @@ class Tag(PageElement):
i = next
def clear(self, decompose=False):
- """
- Extract all children. If decompose is True, decompose instead.
+ """Wipe out all children of this PageElement by calling extract()
+ on them.
+
+ :param decompose: If this is True, decompose() (a more
+ destructive method) will be called instead of extract().
"""
if decompose:
for element in self.contents[:]:
@@ -920,7 +1228,8 @@ class Tag(PageElement):
element.extract()
def smooth(self):
- """Smooth out this element's children by consolidating consecutive strings.
+ """Smooth out this element's children by consolidating consecutive
+ strings.
This makes pretty-printed output look more natural following a
lot of operations that modified the tree.
@@ -957,9 +1266,12 @@ class Tag(PageElement):
a.replace_with(n)
def index(self, element):
- """
- Find the index of a child by identity, not value. Avoids issues with
- tag.contents.index(element) getting the index of equal elements.
+ """Find the index of a child by identity, not value.
+
+ Avoids issues with tag.contents.index(element) getting the
+ index of equal elements.
+
+ :param element: Look for this PageElement in `self.contents`.
"""
for i, child in enumerate(self.contents):
if child is element:
@@ -973,29 +1285,37 @@ class Tag(PageElement):
return self.attrs.get(key, default)
def get_attribute_list(self, key, default=None):
- """The same as get(), but always returns a list."""
+ """The same as get(), but always returns a list.
+
+ :param key: The attribute to look for.
+ :param default: Use this value if the attribute is not present
+ on this PageElement.
+ :return: A list of values, probably containing only a single
+ value.
+ """
value = self.get(key, default)
if not isinstance(value, list):
value = [value]
return value
def has_attr(self, key):
+ """Does this PageElement have an attribute with the given name?"""
return key in self.attrs
def __hash__(self):
return str(self).__hash__()
def __getitem__(self, key):
- """tag[key] returns the value of the 'key' attribute for the tag,
+ """tag[key] returns the value of the 'key' attribute for the Tag,
and throws an exception if it's not there."""
return self.attrs[key]
def __iter__(self):
- "Iterating over a tag iterates over its contents."
+ "Iterating over a Tag iterates over its contents."
return iter(self.contents)
def __len__(self):
- "The length of a tag is the length of its list of contents."
+ "The length of a Tag is the length of its list of contents."
return len(self.contents)
def __contains__(self, x):
@@ -1015,12 +1335,13 @@ class Tag(PageElement):
self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
- """Calling a tag like a function is the same as calling its
+ """Calling a Tag like a function is the same as calling its
find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
return self.find_all(*args, **kwargs)
def __getattr__(self, tag):
+ """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
#print "Getattr %s.%s" % (self.__class__, tag)
if len(tag) > 3 and tag.endswith('Tag'):
# BS3: soup.aTag -> "soup.find("a")
@@ -1038,8 +1359,8 @@ class Tag(PageElement):
"'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
- """Returns true iff this tag has the same name, the same attributes,
- and the same contents (recursively) as the given tag."""
+ """Returns true iff this Tag has the same name, the same attributes,
+ and the same contents (recursively) as `other`."""
if self is other:
return True
if (not hasattr(other, 'name') or
@@ -1055,12 +1376,17 @@ class Tag(PageElement):
return True
def __ne__(self, other):
- """Returns true iff this tag is not identical to the other tag,
+ """Returns true iff this Tag is not identical to `other`,
as defined in __eq__."""
return not self == other
def __repr__(self, encoding="unicode-escape"):
- """Renders this tag as a string."""
+ """Renders this PageElement as a string.
+
+ :param encoding: The encoding to use (Python 2 only).
+ :return: Under Python 2, a bytestring; under Python 3,
+ a Unicode string.
+ """
if PY3K:
# "The return value must be a string object", i.e. Unicode
return self.decode()
@@ -1071,9 +1397,15 @@ class Tag(PageElement):
return self.encode(encoding)
def __unicode__(self):
+ """Renders this PageElement as a Unicode string."""
return self.decode()
def __str__(self):
+ """Renders this PageElement as a generic string.
+
+ :return: Under Python 2, a UTF-8 bytestring; under Python 3,
+ a Unicode string.
+ """
if PY3K:
return self.decode()
else:
@@ -1085,6 +1417,22 @@ class Tag(PageElement):
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
indent_level=None, formatter="minimal",
errors="xmlcharrefreplace"):
+ """Render a bytestring representation of this PageElement and its
+ contents.
+
+ :param encoding: The destination encoding.
+ :param indent_level: Each line of the rendering will be
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :param errors: An error handling strategy such as
+ 'xmlcharrefreplace'. This value is passed along into
+ encode() and its value should be one of the constants
+ defined by Python.
+ :return: A bytestring.
+
+ """
# Turn the data structure into Unicode, then encode the
# Unicode.
u = self.decode(indent_level, encoding, formatter)
@@ -1093,14 +1441,20 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Returns a Unicode representation of this tag and its contents.
+ """Render a Unicode representation of this PageElement and its
+ contents.
+ :param indent_level: Each line of the rendering will be
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a <META> tag that mentions the document's
- encoding.
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a <META> tag that mentions the document's
+ encoding.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
"""
# First off, turn a non-Formatter `formatter` into a Formatter
@@ -1186,7 +1540,11 @@ class Tag(PageElement):
return s
def _should_pretty_print(self, indent_level):
- """Should this tag be pretty-printed?"""
+ """Should this tag be pretty-printed?
+
+ Most of them should, but some (such as <pre> in HTML
+ documents) should not.
+ """
return (
indent_level is not None
and (
@@ -1196,6 +1554,15 @@ class Tag(PageElement):
)
def prettify(self, encoding=None, formatter="minimal"):
+ """Pretty-print this PageElement as a string.
+
+ :param encoding: The eventual encoding of the string. If this is None,
+ a Unicode string will be returned.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :return: A Unicode string (if encoding==None) or a bytestring
+ (otherwise).
+ """
if encoding is None:
return self.decode(True, formatter=formatter)
else:
@@ -1207,7 +1574,8 @@ class Tag(PageElement):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. decode_contents() is _not_
@@ -1249,23 +1617,26 @@ class Tag(PageElement):
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Renders the contents of this tag as a bytestring.
+ """Renders the contents of this PageElement as a bytestring.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The bytestring will be in this encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
- """
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :return: A bytestring.
+ """
contents = self.decode_contents(indent_level, encoding, formatter)
return contents.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
+ """Deprecated method for BS3 compatibility."""
if not prettyPrint:
indentLevel = None
return self.encode_contents(
@@ -1275,8 +1646,21 @@ class Tag(PageElement):
def find(self, name=None, attrs={}, recursive=True, text=None,
**kwargs):
- """Return only the first child of this Tag matching the given
- criteria."""
+ """Look in the children of this PageElement and find the first
+ PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param text: A filter for a NavigableString with specific text.
+ :param kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ """
r = None
l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
if l:
@@ -1286,16 +1670,21 @@ class Tag(PageElement):
def find_all(self, name=None, attrs={}, recursive=True, text=None,
limit=None, **kwargs):
- """Extracts a list of Tag objects that match the given
- criteria. You can specify the name of the Tag and any
- attributes you want the Tag to have.
-
- The value of a key-value pair in the 'attrs' map can be a
- string, a list of strings, a regular expression object, or a
- callable that takes a string and returns whether or not the
- string matches for some custom definition of 'matches'. The
- same is true of the tag name."""
-
+ """Look in the children of this PageElement and find all
+ PageElements that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find_all() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :param kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ """
generator = self.descendants
if not recursive:
generator = self.children
@@ -1306,11 +1695,20 @@ class Tag(PageElement):
#Generator methods
@property
def children(self):
+ """Iterate over all direct children of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def descendants(self):
+ """Iterate over all descendants of this PageElement in document
+ order (depth-first).
+
+ :yield: A sequence of PageElements.
+ """
if not len(self.contents):
return
stopNode = self._last_descendant().next_element
@@ -1321,7 +1719,20 @@ class Tag(PageElement):
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
- """Perform a CSS selection operation on the current element."""
+ """Perform a CSS selection operation on the current element.
+
+ :param selector: A CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A PageElement.
+ """
value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
@@ -1335,14 +1746,16 @@ class Tag(PageElement):
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
- used in the CSS selector to namespace URIs. By default,
- Beautiful Soup will use the prefixes it encountered while
- parsing the document.
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
:param limit: After finding this number of results, stop looking.
- :param kwargs: Any extra arguments you'd like to pass in to
- soupsieve.select().
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A ResultSet of PageElements.
"""
if namespaces is None:
namespaces = self._namespaces
@@ -1354,19 +1767,26 @@ class Tag(PageElement):
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
- return soupsieve.select(selector, self, namespaces, limit, **kwargs)
+ results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
+
+ # We do this because it's more consistent and because
+ # ResultSet.__getattr__ has a helpful error message.
+ return ResultSet(None, results)
# Old names for backwards compatibility
def childGenerator(self):
+ """Deprecated generator."""
return self.children
def recursiveChildGenerator(self):
+ """Deprecated generator."""
return self.descendants
def has_key(self, key):
- """This was kind of misleading because has_key() (attributes)
- was different from __in__ (contents). has_key() is gone in
- Python 3, anyway."""
+ """Deprecated method. This was kind of misleading because has_key()
+ (attributes) was different from __contains__ (contents). has_key()
+ is gone in Python 3, anyway.
+ """
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
key))
return self.has_attr(key)
@@ -1377,6 +1797,17 @@ class SoupStrainer(object):
text)."""
def __init__(self, name=None, attrs={}, text=None, **kwargs):
+ """Constructor.
+
+ The SoupStrainer constructor takes the same arguments passed
+ into the find_* methods. See the online documentation for
+ detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param kwargs: A dictionary of filters on attribute values.
+ """
self.name = self._normalize_search_value(name)
if not isinstance(attrs, dict):
# Treat a non-dict value for attrs as a search for the 'class'
@@ -1434,12 +1865,25 @@ class SoupStrainer(object):
return unicode(str(value))
def __str__(self):
+ """A string representation of this SoupStrainer."""
if self.text:
return self.text
else:
return "%s|%s" % (self.name, self.attrs)
def search_tag(self, markup_name=None, markup_attrs={}):
+ """Check whether a Tag with the given name and attributes would
+ match this SoupStrainer.
+
+ Used prospectively to decide whether to even bother creating a Tag
+ object.
+
+ :param markup_name: A tag name as found in some markup, or a Tag.
+ :param markup_attrs: A dictionary of attributes as found in some markup.
+
+ :return: The match (a truthy value) if the prospective tag would
+ match this SoupStrainer; None otherwise.
+ """
found = None
markup = None
if isinstance(markup_name, Tag):
@@ -1478,9 +1922,18 @@ class SoupStrainer(object):
if found and self.text and not self._matches(found.string, self.text):
found = None
return found
+
+ # For BS3 compatibility.
searchTag = search_tag
def search(self, markup):
+ """Find all items in `markup` that match this SoupStrainer.
+
+ Used by the core _find_all() method, which is ultimately
+ called by all find_* methods.
+
+ :param markup: A PageElement or a list of them.
+ """
# print 'looking for %s in %s' % (self, markup)
found = None
# If given a list of items, scan it for a text element that
@@ -1593,10 +2046,16 @@ class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
def __init__(self, source, result=()):
+ """Constructor.
+
+ :param source: The SoupStrainer that produced the results, or None.
+ :param result: A list of PageElements.
+ """
super(ResultSet, self).__init__(result)
self.source = source
def __getattr__(self, key):
+ """Raise a helpful exception."""
raise AttributeError(
- "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+ "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
)