summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2019-12-24 10:41:57 -0500
committerLeonard Richardson <leonardr@segfault.org>2019-12-24 10:41:57 -0500
commitbef726b23d0770860cd347b03009ffb027159572 (patch)
tree325b698568a6fcb63018753db4830a579254f6ca
parent5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
Added docstrings for some but not all tree builders.
-rw-r--r--CHANGELOG3
-rw-r--r--bs4/builder/__init__.py143
-rw-r--r--bs4/builder/_htmlparser.py111
-rw-r--r--bs4/builder/_lxml.py46
4 files changed, 255 insertions, 48 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d24559c..2758e62 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
= 4.8.2 (20191224)
-* Added Python docstrings to most public methods.
+* Added Python docstrings to all public methods of the most commonly
+ used classes.
* Added a Chinese translation by Deron Wang and a Brazilian Portuguese
translation by Cezar Peixeiro to the repository.
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 7efbf89..e8d78f9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -27,18 +27,33 @@ HTML_5 = 'html5'
class TreeBuilderRegistry(object):
-
+ """A way of looking up TreeBuilder subclasses by their name or by desired
+ features.
+ """
+
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
+ """Register a treebuilder based on its advertised features.
+
+ :param treebuilder_class: A subclass of TreeBuilder. Its .features
+ attribute should list its features.
+ """
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
+ """Look up a TreeBuilder subclass with the desired features.
+
+ :param features: A list of features to look for. If none are
+ provided, the most recently registered TreeBuilder subclass
+ will be used.
+ :return: A TreeBuilder subclass, or None if there's no
+ registered subclass with all the requested features.
+ """
if len(self.builders) == 0:
# There are no builders at all.
return None
@@ -81,7 +96,7 @@ class TreeBuilderRegistry(object):
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
+ """Turn a textual document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
@@ -109,26 +124,27 @@ class TreeBuilder(object):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
- TreeBuilder will not turn any values for attributes like
- 'class' into lists. Setting this do a dictionary will
- customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
- for an example.
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
- Internally, these are called "CDATA list attributes", but that
- probably doesn't make sense to an end-user, so the argument name
- is `multi_valued_attributes`.
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
- the way <pre> tags are treated in HTML. Tags in this list
- will have
+ the way <pre> tags are treated in HTML. Tags in this list
+ are immune from pretty-printing; their contents will always be
+ output as-is.
:param store_line_numbers: If the parser keeps track of the
- line numbers and positions of the original markup, that
- information will, by default, be stored in each corresponding
- `Tag` object. You can turn this off by passing
- store_line_numbers=False. If the parser you're using doesn't
- keep track of this information, then setting store_line_numbers=True
- will do nothing.
+ line numbers and positions of the original markup, that
+ information will, by default, be stored in each corresponding
+ `Tag` object. You can turn this off by passing
+ store_line_numbers=False. If the parser you're using doesn't
+ keep track of this information, then setting store_line_numbers=True
+ will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
@@ -144,10 +160,17 @@ class TreeBuilder(object):
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
+
+ :param soup: A BeautifulSoup object.
"""
self.soup = soup
def reset(self):
+ """Do any work necessary to reset the underlying parser
+ for a new document.
+
+ By default, this does nothing.
+ """
pass
def can_be_empty_element(self, tag_name):
@@ -159,23 +182,56 @@ class TreeBuilder(object):
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
- will be presented as "<p></p>", not "<p />".
+ will be presented as "<p></p>", not "<p/>" or "<p>".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+ empty-element tag if and only if it has no children.
+ "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
be left alone.
+
+ :param tag_name: The name of a markup tag.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in self.soup.
+
+ This method is not implemented in TreeBuilder; it must be
+ implemented in subclasses.
+
+ :return: None.
+ """
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
+
+ By default, the only strategy is to parse the markup
+ as-is. See `LXMLTreeBuilderForXML` and
+ `HTMLParserTreeBuilder` for implementations that take into
+ account the quirks of particular parsers.
+ """
yield markup, None, None, False
def test_fragment_to_document(self, fragment):
@@ -188,16 +244,36 @@ class TreeBuilder(object):
results against other HTML fragments.
This method should not be used outside of tests.
+
+ :param fragment: A string -- fragment of HTML.
+ :return: A string -- a full HTML document.
"""
return fragment
def set_up_substitutions(self, tag):
+ """Set up any substitutions that will need to be performed on
+ a `Tag` when it's output as a string.
+
+ By default, this does nothing. See `HTMLTreeBuilder` for a
+ case where this is used.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
- """Replaces class="foo bar" with class=["foo", "bar"]
+ """When an attribute value is associated with a tag that can
+ have multiple values for that attribute, convert the string
+ value to a list of strings.
+
+ Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+ NOTE: This method modifies its input in place.
- Modifies its input in place.
+ :param tag_name: The name of a tag.
+ :param attrs: A dictionary containing the tag's attributes.
+ Any appropriate attribute values will be modified in place.
"""
if not attrs:
return attrs
@@ -225,7 +301,11 @@ class TreeBuilder(object):
return attrs
class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
+ """A Beautiful Soup treebuilder that listens for SAX events.
+
+ This is not currently used for anything, but it demonstrates
+ how a simple TreeBuilder would work.
+ """
def feed(self, markup):
raise NotImplementedError()
@@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder):
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
+ """Replace the declared encoding in a <meta> tag with a placeholder,
+ to be substituted when the tag is output to a string.
+
+ An HTML document may come in to Beautiful Soup as one
+ encoding, but exit in a different encoding, and the <meta> tag
+ needs to be changed to reflect this.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
@@ -363,6 +453,9 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
+ """An Exception to be raised when the underlying parser simply
+ refuses to parse the given markup.
+ """
def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
@@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4531407..2bb764f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -53,7 +53,11 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
-
+ """A subclass of the Python standard library's HTMLParser class, which
+ listens for HTMLParser events and translates them into calls
+ to Beautiful Soup's tree construction API.
+ """
+
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
@@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element = []
def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although this
- requirement doesn't appear to be documented.
+ """In Python 3, HTMLParser subclasses must implement error(), although
+ this requirement doesn't appear to be documented.
- In Python 2, HTMLParser implements error() as raising an exception.
+ In Python 2, HTMLParser implements error() by raising an exception,
+ which we don't want to do.
- In any event, this method is called only on very strange markup and our best strategy
- is to pretend it didn't happen and keep going.
+ In any event, this method is called only on very strange
+ markup and our best strategy is to pretend it didn't happen
+ and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
- # This is only called when the markup looks like
- # <tag/>.
+ """Handle an incoming empty-element tag.
+ This is only called when the markup looks like <tag/>.
+
+ :param name: Name of the tag.
+ :param attrs: Dictionary of the tag's attributes.
+ """
# is_startend() tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag and we want to call
@@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_endtag(name)
def handle_starttag(self, name, attrs, handle_empty_element=True):
+ """Handle an opening tag, e.g. '<tag>'
+
+ :param name: Name of the tag.
+ :param attrs: A list of (name, value) pairs giving the tag's attributes.
+ :param handle_empty_element: True if this tag is known to be
+ an empty-element tag (i.e. there is not expected to be any
+ closing tag).
+ """
# XXX namespace
attr_dict = {}
for key, value in attrs:
@@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element.append(name)
def handle_endtag(self, name, check_already_closed=True):
+ """Handle a closing tag, e.g. '</tag>'
+
+ :param name: A tag name.
+ :param check_already_closed: True if this tag is expected to
+ be the closing portion of an empty-element tag,
+ e.g. '<tag></tag>'.
+ """
#print "END", name
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
@@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.handle_endtag(name)
def handle_data(self, data):
+ """Handle some textual data that shows up between tags."""
self.soup.handle_data(data)
def handle_charref(self, name):
+ """Handle a numeric character reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Character number, possibly in hexadecimal.
+ """
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
@@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_entityref(self, name):
+ """Handle a named entity reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Name of the entity reference.
+ """
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
@@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_comment(self, data):
+ """Handle an HTML comment.
+
+ :param data: The text of the comment.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
+ """Handle a DOCTYPE declaration.
+
+ :param data: The text of the declaration.
+ """
self.soup.endData()
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
+ """Handle a declaration of unknown type -- probably a CDATA block.
+
+ :param data: The text of the declaration.
+ """
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
@@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData(cls)
def handle_pi(self, data):
+ """Handle a processing instruction.
+
+ :param data: The text of the instruction.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+ """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
+ found in the Python standard library.
+ """
is_xml = False
picklable = True
NAME = HTMLPARSER
@@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ """Constructor.
+
+ :param parser_args: Positional arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param parser_kwargs: Keyword arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param kwargs: Keyword arguments for the superclass constructor.
+ """
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
@@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
+
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
if isinstance(markup, unicode):
+ # Parse Unicode as-is.
yield (markup, None, None, False)
return
+ # Ask UnicodeDammit to sniff the most likely encoding.
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
@@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in self.soup.
+ """
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index c83dd2d..1b44d75 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
+ # See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
+
+ :param soup: A `BeautifulSoup`.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
@@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document.
This might be useful later on when creating CSS selectors.
+
+ :param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in mapping.items():
if key and key not in self.soup._namespaces:
@@ -84,14 +89,23 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value
def default_parser(self, encoding):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
+ """Find the default parser for the given encoding.
+
+ :param encoding: A string.
+ :return: Either a parser object or a class, which
+ will be instantiated with default arguments.
+ """
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
+ """Instantiate an appropriate parser for the given encoding.
+
+ :param encoding: A string.
+ :return: A parser object such as an `etree.XMLParser`.
+ """
# Use the default parser.
parser = self.default_parser(encoding)
@@ -124,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
- """
- :yield: A series of 4-tuples.
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ lxml really wants to get a bytestring and convert it to
+ Unicode itself. So instead of using UnicodeDammit to convert
+ the bytestring to Unicode using different encodings, this
+ implementation uses EncodingDetector to iterate over the
+ encodings, and tell lxml to try to parse the document as each
+ one in turn.
+
+ :param markup: Some markup -- hopefully a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
- Each 4-tuple represents a strategy for parsing the document.
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction