summaryrefslogtreecommitdiff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2019-12-24 10:41:57 -0500
committerLeonard Richardson <leonardr@segfault.org>2019-12-24 10:41:57 -0500
commitbef726b23d0770860cd347b03009ffb027159572 (patch)
tree325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/__init__.py
parent5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
Added docstrings for some but not all tree buidlers.
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r--bs4/builder/__init__.py143
1 files changed, 118 insertions, 25 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 7efbf89..e8d78f9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -27,18 +27,33 @@ HTML_5 = 'html5'
class TreeBuilderRegistry(object):
-
+ """A way of looking up TreeBuilder subclasses by their name or by desired
+ features.
+ """
+
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
+ """Register a treebuilder based on its advertised features.
+
+ :param treebuilder_class: A subclass of Treebuilder. its .features
+ attribute should list its features.
+ """
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
+ """Look up a TreeBuilder subclass with the desired features.
+
+ :param features: A list of features to look for. If none are
+ provided, the most recently registered TreeBuilder subclass
+ will be used.
+ :return: A TreeBuilder subclass, or None if there's no
+ registered subclass with all the requested features.
+ """
if len(self.builders) == 0:
# There are no builders at all.
return None
@@ -81,7 +96,7 @@ class TreeBuilderRegistry(object):
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
+ """Turn a textual document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
@@ -109,26 +124,27 @@ class TreeBuilder(object):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
- TreeBuilder will not turn any values for attributes like
- 'class' into lists. Setting this do a dictionary will
- customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
- for an example.
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this do a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
- Internally, these are called "CDATA list attributes", but that
- probably doesn't make sense to an end-user, so the argument name
- is `multi_valued_attributes`.
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
- the way <pre> tags are treated in HTML. Tags in this list
- will have
+ the way <pre> tags are treated in HTML. Tags in this list
+ are immune from pretty-printing; their contents will always be
+ output as-is.
:param store_line_numbers: If the parser keeps track of the
- line numbers and positions of the original markup, that
- information will, by default, be stored in each corresponding
- `Tag` object. You can turn this off by passing
- store_line_numbers=False. If the parser you're using doesn't
- keep track of this information, then setting store_line_numbers=True
- will do nothing.
+ line numbers and positions of the original markup, that
+ information will, by default, be stored in each corresponding
+ `Tag` object. You can turn this off by passing
+ store_line_numbers=False. If the parser you're using doesn't
+ keep track of this information, then setting store_line_numbers=True
+ will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
@@ -144,10 +160,17 @@ class TreeBuilder(object):
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
+
+ :param soup: A BeautifulSoup object.
"""
self.soup = soup
def reset(self):
+ """Do any work necessary to reset the underlying parser
+ for a new document.
+
+ By default, this does nothing.
+ """
pass
def can_be_empty_element(self, tag_name):
@@ -159,23 +182,56 @@ class TreeBuilder(object):
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
- will be presented as "<p></p>", not "<p />".
+ will be presented as "<p></p>", not "<p/>" or "<p>".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+ empty-element tag if and only if it has no children.
+ "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
be left alone.
+
+ :param tag_name: The name of a markup tag.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in self.soup.
+
+ This method is not implemented in TreeBuilder; it must be
+ implemented in subclasses.
+
+ :return: None.
+ """
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
+
+ By default, the only strategy is to parse the markup
+ as-is. See `LXMLTreeBuilderForXML` and
+ `HTMLParserTreeBuilder` for implementations that take into
+ account the quirks of particular parsers.
+ """
yield markup, None, None, False
def test_fragment_to_document(self, fragment):
@@ -188,16 +244,36 @@ class TreeBuilder(object):
results against other HTML fragments.
This method should not be used outside of tests.
+
+ :param fragment: A string -- fragment of HTML.
+ :return: A string -- a full HTML document.
"""
return fragment
def set_up_substitutions(self, tag):
+ """Set up any substitutions that will need to be performed on
+ a `Tag` when it's output as a string.
+
+ By default, this does nothing. See `HTMLTreeBuilder` for a
+ case where this is used.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
- """Replaces class="foo bar" with class=["foo", "bar"]
+ """When an attribute value is associated with a tag that can
+ have multiple values for that attribute, convert the string
+ value to a list of strings.
+
+ Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+ NOTE: This method modifies its input in place.
- Modifies its input in place.
+ :param tag_name: The name of a tag.
+ :param attrs: A dictionary containing the tag's attributes.
+ Any appropriate attribute values will be modified in place.
"""
if not attrs:
return attrs
@@ -225,7 +301,11 @@ class TreeBuilder(object):
return attrs
class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
+ """A Beautiful Soup treebuilder that listens for SAX events.
+
+ This is not currently used for anything, but it demonstrates
+ how a simple TreeBuilder would work.
+ """
def feed(self, markup):
raise NotImplementedError()
@@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder):
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
+ """Replace the declared encoding in a <meta> tag with a placeholder,
+ to be substituted when the tag is output to a string.
+
+ An HTML document may come in to Beautiful Soup as one
+ encoding, but exit in a different encoding, and the <meta> tag
+ needs to be changed to reflect this.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
@@ -363,6 +453,9 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
+ """An Exception to be raised when the underlying parser simply
+ refuses to parse the given markup.
+ """
def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
@@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try: