Added docstrings for some but not all tree buidlers.

author: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
commit: bef726b23d0770860cd347b03009ffb027159572 (patch)
tree: 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/__init__.py
parent: 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
1 files changed, 118 insertions, 25 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 7efbf89..e8d78f9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -27,18 +27,33 @@ HTML_5 = 'html5'
 
 
 class TreeBuilderRegistry(object):
-
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+    
     def __init__(self):
         self.builders_for_feature = defaultdict(list)
         self.builders = []
 
     def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of Treebuilder. its .features
+           attribute should list its features.
+        """
         for feature in treebuilder_class.features:
             self.builders_for_feature[feature].insert(0, treebuilder_class)
         self.builders.insert(0, treebuilder_class)
 
     def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
         if len(self.builders) == 0:
             # There are no builders at all.
             return None
@@ -81,7 +96,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()
 
 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""
 
     NAME = "[Unknown tree builder]"
     ALTERNATE_NAMES = []
@@ -109,26 +124,27 @@ class TreeBuilder(object):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
-        TreeBuilder will not turn any values for attributes like
-        'class' into lists. Setting this do a dictionary will
-        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
-        for an example.
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this do a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.
 
-        Internally, these are called "CDATA list attributes", but that
-        probably doesn't make sense to an end-user, so the argument name
-        is `multi_valued_attributes`.
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.
 
         :param preserve_whitespace_tags: A list of tags to treat
-        the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+         the way <pre> tags are treated in HTML. Tags in this list
+         are immune from pretty-printing; their contents will always be
+         output as-is.
 
         :param store_line_numbers: If the parser keeps track of the
-        line numbers and positions of the original markup, that
-        information will, by default, be stored in each corresponding
-        `Tag` object. You can turn this off by passing
-        store_line_numbers=False. If the parser you're using doesn't 
-        keep track of this information, then setting store_line_numbers=True
-        will do nothing.
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't 
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
         """
         self.soup = None
         if multi_valued_attributes is self.USE_DEFAULT:
@@ -144,10 +160,17 @@ class TreeBuilder(object):
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
         """
         self.soup = soup
         
     def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
         pass
 
     def can_be_empty_element(self, tag_name):
@@ -159,23 +182,56 @@ class TreeBuilder(object):
         For instance: an HTMLBuilder does not consider a <p> tag to be
         an empty-element tag (it's not in
         HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".
 
         The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
         be left alone.
+
+        :param tag_name: The name of a markup tag.
         """
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
     
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
         yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
@@ -188,16 +244,36 @@ class TreeBuilder(object):
         results against other HTML fragments.
 
         This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
         """
         return fragment
 
     def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on 
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         return False
 
     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.
+
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
 
-        Modifies its input in place.
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
         """
         if not attrs:
             return attrs
@@ -225,7 +301,11 @@ class TreeBuilder(object):
         return attrs
 
 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """
 
     def feed(self, markup):
         raise NotImplementedError()
@@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder):
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
     
     def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
@@ -363,6 +453,9 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
     def __init__(self, message_or_exception):
         """Explain why the parser rejected the given markup, either
         with a textual explanation or another exception.
@@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:
author	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
commit	bef726b23d0770860cd347b03009ffb027159572 (patch)
tree	325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/__init__.py
parent	5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)