summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-12-20 15:07:27 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-12-20 15:07:27 -0500
commit: ee3f30b0e778747fb701856a33713369f778834f (patch)
tree: 1b7aeedc410a2bbd4262dfd984d70c640d5cec38
parent: f5f0e1ea521f813964b4f9cbea53fcdfc3942f56 (diff)
Added docstrings to all methods in __init__.py
-rw-r--r-- bs4/__init__.py 178
1 files changed, 120 insertions, 58 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 1ea7b97..0f3a0e3 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -52,18 +52,26 @@ from .element import (
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
- """
- This class defines the basic interface called by the tree builders.
+ """A data structure representing a parsed HTML or XML document.
+
+ Most of the methods you'll call on a BeautifulSoup object are inherited from
+ PageElement or Tag.
+
+ Internally, this class defines the basic interface called by the
+ tree builders when converting an HTML/XML document into a data
+ structure. The interface abstracts away the differences between
+ parsers. To write a new tree builder, you'll need to understand
+ these methods as a whole.
- These methods will be called by the parser:
- reset()
- feed(markup)
+ These methods will be called by the BeautifulSoup constructor:
+ * reset()
+ * feed(markup)
The tree builder may call these methods from its feed() implementation:
- handle_starttag(name, attrs) # See note about return value
- handle_endtag(name)
- handle_data(data) # Appends to the current data node
- endData(containerClass) # Ends the current data node
+ * handle_starttag(name, attrs) # See note about return value
+ * handle_endtag(name)
+ * handle_data(data) # Appends to the current data node
+ * endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@@ -73,13 +81,19 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
+
+ # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+ # a Tag with a .name. This name makes it clear the BeautifulSoup
+ # object isn't a real markup tag.
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
- ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+
+ # A bytestring containing all ASCII whitespace characters, used in
+ # endData() to detect data chunks that seem 'empty'.
+ ASCII_SPACES = b'\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
@@ -89,50 +103,49 @@ class BeautifulSoup(Tag):
"""Constructor.
:param markup: A string or a file-like object representing
- markup to be parsed.
+ markup to be parsed.
:param features: Desirable features of the parser to be used. This
- may be the name of a specific parser ("lxml", "lxml-xml",
- "html.parser", or "html5lib") or it may be the type of markup
- to be used ("html", "html5", "xml"). It's recommended that you
- name a specific parser, so that Beautiful Soup gives you the
- same results across platforms and virtual environments.
+ may be the name of a specific parser ("lxml", "lxml-xml",
+ "html.parser", or "html5lib") or it may be the type of markup
+ to be used ("html", "html5", "xml"). It's recommended that you
+ name a specific parser, so that Beautiful Soup gives you the
+ same results across platforms and virtual environments.
:param builder: A TreeBuilder subclass to instantiate (or
- instance to use) instead of looking one up based on
- `features`. You only need to use this if you've implemented a
- custom TreeBuilder.
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
- matching the SoupStrainer will be considered. This is useful
- when parsing part of a document that would otherwise be too
- large to fit into memory.
+ matching the SoupStrainer will be considered. This is useful
+ when parsing part of a document that would otherwise be too
+ large to fit into memory.
:param from_encoding: A string indicating the encoding of the
- document to be parsed. Pass this in if Beautiful Soup is
- guessing wrongly about the document's encoding.
+ document to be parsed. Pass this in if Beautiful Soup is
+ guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
- encodings known to be wrong. Pass this in if you don't know
- the document's encoding but you know Beautiful Soup's guess is
- wrong.
+ encodings known to be wrong. Pass this in if you don't know
+ the document's encoding but you know Beautiful Soup's guess is
+ wrong.
:param element_classes: A dictionary mapping BeautifulSoup
- classes like Tag and NavigableString to other classes you'd
- like to be instantiated instead as the parse tree is
- built. This is useful for using subclasses to modify the
- default behavior of Tag or NavigableString.
+ classes like Tag and NavigableString, to other classes you'd
+ like to be instantiated instead as the parse tree is
+ built. This is useful for subclassing Tag or NavigableString
+ to modify default behavior.
:param kwargs: For backwards compatibility purposes, the
- constructor accepts certain keyword arguments used in
- Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4; they will result in a warning and then be ignored.
-
- Apart from this, any keyword arguments passed into the BeautifulSoup
- constructor are propagated to the TreeBuilder constructor. This
- makes it possible to configure a TreeBuilder beyond saying
- which one to use.
-
+ constructor accepts certain keyword arguments used in
+ Beautiful Soup 3. None of these arguments do anything in
+ Beautiful Soup 4; they will result in a warning and then be ignored.
+
+ Apart from this, any keyword arguments passed into the BeautifulSoup
+ constructor are propagated to the TreeBuilder constructor. This
+ makes it possible to configure a TreeBuilder by passing in arguments,
+ not just by saying which one to use.
"""
if 'convertEntities' in kwargs:
@@ -329,6 +342,9 @@ class BeautifulSoup(Tag):
self.builder.soup = None
def __copy__(self):
+ """A copy of a BeautifulSoup object is created by converting the
+ document to a string and parsing it again.
+ """
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
@@ -349,9 +365,10 @@ class BeautifulSoup(Tag):
@staticmethod
def _check_markup_is_url(markup):
- """
- Check if markup looks like it's actually a url and raise a warning
- if so. Markup can be unicode or str (py2) / bytes (py3).
+ """Error-handling method to raise a warning if incoming markup looks
+ like a URL.
+
+ :param markup: A string.
"""
if isinstance(markup, bytes):
space = b' '
@@ -376,6 +393,9 @@ class BeautifulSoup(Tag):
)
def _feed(self):
+ """Internal method that parses previously set markup, creating a large
+ number of Tag and NavigableString objects.
+ """
# Convert the document to Unicode.
self.builder.reset()
@@ -386,6 +406,9 @@ class BeautifulSoup(Tag):
self.popTag()
def reset(self):
+ """Reset this object to a state as though it had never parsed any
+ markup.
+ """
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
@@ -397,7 +420,7 @@ class BeautifulSoup(Tag):
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
sourceline=None, sourcepos=None, **kwattrs):
- """Create a new tag associated with this soup."""
+ """Create a new Tag associated with this BeautifulSoup object."""
kwattrs.update(attrs)
return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
@@ -405,19 +428,28 @@ class BeautifulSoup(Tag):
)
def new_string(self, s, subclass=None):
- """Create a new NavigableString associated with this soup."""
+ """Create a new NavigableString associated with this BeautifulSoup
+ object.
+ """
subclass = subclass or self.element_classes.get(
NavigableString, NavigableString
)
return subclass(s)
def insert_before(self, successor):
+ """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+ it because there is nothing before or after it in the parse tree.
+ """
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
+ """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+ it because there is nothing before or after it in the parse tree.
+ """
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
+ """Internal method called by _popToTag when a tag is closed."""
tag = self.tagStack.pop()
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
@@ -427,6 +459,7 @@ class BeautifulSoup(Tag):
return self.currentTag
def pushTag(self, tag):
+ """Internal method called by handle_starttag when a tag is opened."""
#print "Push", tag.name
if self.currentTag is not None:
self.currentTag.contents.append(tag)
@@ -436,7 +469,9 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=None):
-
+ """Method called by the TreeBuilder when the end of a data segment
+ occurs.
+ """
# Default container is NavigableString.
containerClass = containerClass or NavigableString
@@ -476,7 +511,7 @@ class BeautifulSoup(Tag):
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, most_recent_element=None):
- """Add an object to the parse tree."""
+ """Method called by the TreeBuilder to integrate an object into the parse tree."""
if parent is None:
parent = self.currentTag
if most_recent_element is not None:
@@ -545,9 +580,14 @@ class BeautifulSoup(Tag):
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
- stack up to but *not* including the most recent instqance of
- the given tag."""
+ instance of the given tag.
+
+ :param name: Pop up to the most recent tag with this name.
+ :param nsprefix: The namespace prefix that goes with `name`.
+ :param inclusivePop: If this is false, pops the tag stack up
+ to but *not* including the most recent instance of the
+ given tag.
+ """
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
# The BeautifulSoup object itself can never be popped.
@@ -568,14 +608,21 @@ class BeautifulSoup(Tag):
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
sourcepos=None):
- """Push a start tag on to the stack.
+ """Called by the tree builder when a new tag is encountered.
- If this method returns None, the tag was rejected by the
+ :param name: Name of the tag.
+ :param nsprefix: Namespace prefix for the tag.
+ :param attrs: A dictionary of attribute values.
+ :param sourceline: The line number where this tag was found in its
+ source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was found.
+
+ If this method returns None, the tag was rejected by an active
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
-
# print "Start tag %s: %s" % (name, attrs)
self.endData()
@@ -598,19 +645,30 @@ class BeautifulSoup(Tag):
return tag
def handle_endtag(self, name, nsprefix=None):
+ """Called by the tree builder when an ending tag is encountered.
+
+ :param name: Name of the tag.
+ :param nsprefix: Namespace prefix for the tag.
+ """
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
+ """Called by the tree builder when a chunk of textual data is encountered."""
self.current_data.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Returns a string or Unicode representation of this document.
- To get Unicode, pass None for encoding."""
+ """Returns a string or Unicode representation of the parse tree
+ as an HTML or XML document.
+ :param pretty_print: If this is True, indentation will be used to
+ make the document more readable.
+ :param eventual_encoding: The encoding of the final document.
+ If this is None, the document will be a Unicode string.
+ """
if self.is_xml:
# Print the XML declaration
encoding_part = ''
@@ -626,7 +684,7 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
@@ -642,13 +700,17 @@ class BeautifulStoneSoup(BeautifulSoup):
class StopParsing(Exception):
+ """Exception raised by a TreeBuilder if it's unable to continue parsing."""
pass
class FeatureNotFound(ValueError):
+ """Exception raised by the BeautifulSoup constructor if no parser with the
+ requested features is found.
+ """
pass
-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)