Added docstrings for some but not all tree builders.

author: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
commit: bef726b23d0770860cd347b03009ffb027159572 (patch)
tree: 325b698568a6fcb63018753db4830a579254f6ca
parent: 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
4 files changed, 255 insertions, 48 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d24559c..2758e62 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 = 4.8.2 (20191224)
 
-* Added Python docstrings to most public methods.
+* Added Python docstrings to all public methods of the most commonly
+  used classes.
 
 * Added a Chinese translation by Deron Wang and a Brazilian Portuguese
   translation by Cezar Peixeiro to the repository.
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 7efbf89..e8d78f9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -27,18 +27,33 @@ HTML_5 = 'html5'
 
 
 class TreeBuilderRegistry(object):
-
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+    
     def __init__(self):
         self.builders_for_feature = defaultdict(list)
         self.builders = []
 
     def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of TreeBuilder. Its .features
+           attribute should list its features.
+        """
         for feature in treebuilder_class.features:
             self.builders_for_feature[feature].insert(0, treebuilder_class)
         self.builders.insert(0, treebuilder_class)
 
     def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
         if len(self.builders) == 0:
             # There are no builders at all.
             return None
@@ -81,7 +96,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()
 
 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""
 
     NAME = "[Unknown tree builder]"
     ALTERNATE_NAMES = []
@@ -109,26 +124,27 @@ class TreeBuilder(object):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
-        TreeBuilder will not turn any values for attributes like
-        'class' into lists. Setting this do a dictionary will
-        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
-        for an example.
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this to a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.
 
-        Internally, these are called "CDATA list attributes", but that
-        probably doesn't make sense to an end-user, so the argument name
-        is `multi_valued_attributes`.
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.
 
         :param preserve_whitespace_tags: A list of tags to treat
-        the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+         the way <pre> tags are treated in HTML. Tags in this list
+         are immune from pretty-printing; their contents will always be
+         output as-is.
 
         :param store_line_numbers: If the parser keeps track of the
-        line numbers and positions of the original markup, that
-        information will, by default, be stored in each corresponding
-        `Tag` object. You can turn this off by passing
-        store_line_numbers=False. If the parser you're using doesn't 
-        keep track of this information, then setting store_line_numbers=True
-        will do nothing.
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't 
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
         """
         self.soup = None
         if multi_valued_attributes is self.USE_DEFAULT:
@@ -144,10 +160,17 @@ class TreeBuilder(object):
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
         """
         self.soup = soup
         
     def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
         pass
 
     def can_be_empty_element(self, tag_name):
@@ -159,23 +182,56 @@ class TreeBuilder(object):
         For instance: an HTMLBuilder does not consider a <p> tag to be
         an empty-element tag (it's not in
         HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".
 
         The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
         be left alone.
+
+        :param tag_name: The name of a markup tag.
         """
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
     
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
         yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
@@ -188,16 +244,36 @@ class TreeBuilder(object):
         results against other HTML fragments.
 
         This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
         """
         return fragment
 
     def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on 
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         return False
 
     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.
+
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
 
-        Modifies its input in place.
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
         """
         if not attrs:
             return attrs
@@ -225,7 +301,11 @@ class TreeBuilder(object):
         return attrs
 
 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """
 
     def feed(self, markup):
         raise NotImplementedError()
@@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder):
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
     
     def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
@@ -363,6 +453,9 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
     def __init__(self, message_or_exception):
         """Explain why the parser rejected the given markup, either
         with a textual explanation or another exception.
@@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4531407..2bb764f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -53,7 +53,11 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
-
+    """A subclass of the Python standard library's HTMLParser class, which
+    listens for HTMLParser events and translates them into calls
+    to Beautiful Soup's tree construction API.
+    """
+    
     def __init__(self, *args, **kwargs):
         HTMLParser.__init__(self, *args, **kwargs)
 
@@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.already_closed_empty_element = []
 
     def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although this
-        requirement doesn't appear to be documented.
+        """In Python 3, HTMLParser subclasses must implement error(), although
+        this requirement doesn't appear to be documented.
 
-        In Python 2, HTMLParser implements error() as raising an exception.
+        In Python 2, HTMLParser implements error() by raising an exception,
+        which we don't want to do.
 
-        In any event, this method is called only on very strange markup and our best strategy
-        is to pretend it didn't happen and keep going.
+        In any event, this method is called only on very strange
+        markup and our best strategy is to pretend it didn't happen
+        and keep going.
         """
         warnings.warn(msg)
         
     def handle_startendtag(self, name, attrs):
-        # This is only called when the markup looks like
-        # <tag/>.
+        """Handle an incoming empty-element tag.
 
+        This is only called when the markup looks like <tag/>.
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        """
         # is_startend() tells handle_starttag not to close the tag
         # just because its name matches a known empty-element tag. We
         # know that this is an empty-element tag and we want to call
@@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_endtag(name)
         
     def handle_starttag(self, name, attrs, handle_empty_element=True):
+        """Handle an opening tag, e.g. '<tag>'
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        :param handle_empty_element: True if this tag is known to be
+            an empty-element tag (i.e. there is not expected to be any
+            closing tag).
+        """
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.append(name)
             
     def handle_endtag(self, name, check_already_closed=True):
+        """Handle a closing tag, e.g. '</tag>'
+        
+        :param name: A tag name.
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
         #print "END", name
         if check_already_closed and name in self.already_closed_empty_element:
             # This is a redundant end tag for an empty-element tag.
@@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.soup.handle_endtag(name)
 
     def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
 
     def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
         # XXX workaround for a bug in HTMLParser. Remove this once
         # it's fixed in all supported versions.
         # http://bugs.python.org/issue13633
@@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_data(data)
 
     def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
         if character is not None:
             data = character
@@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_data(data)
 
     def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(Comment)
 
     def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
         self.soup.endData()
         data = data[len("DOCTYPE "):]
         self.soup.handle_data(data)
         self.soup.endData(Doctype)
 
     def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
         if data.upper().startswith('CDATA['):
             cls = CData
             data = data[len('CDATA['):]
@@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.soup.endData(cls)
 
     def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
 
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
+    found in the Python standard library.
+    """
     is_xml = False
     picklable = True
     NAME = HTMLPARSER
@@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     TRACKS_LINE_NUMBERS = True
     
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        """Constructor.
+
+        :param parser_args: Positional arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param parser_kwargs: Keyword arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param kwargs: Keyword arguments for the superclass constructor.
+        """
         super(HTMLParserTreeBuilder, self).__init__(**kwargs)
         parser_args = parser_args or []
         parser_kwargs = parser_kwargs or {}
@@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
-        """
-        :return: A 4-tuple (markup, original encoding, encoding
-        declared within markup, whether any characters had to be
-        replaced with REPLACEMENT CHARACTER).
+
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
         """
         if isinstance(markup, unicode):
+            # Parse Unicode as-is.
             yield (markup, None, None, False)
             return
 
+        # Ask UnicodeDammit to sniff the most likely encoding.
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                                exclude_encodings=exclude_encodings)
@@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
                dammit.contains_replacement_characters)
 
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+        """
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index c83dd2d..1b44d75 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     # But instead we build an XMLParser or HTMLParser object to serve
     # as the target of parse messages, and those messages don't include
     # line numbers.
+    # See: https://bugs.launchpad.net/lxml/+bug/1846906
     
     def initialize_soup(self, soup):
         """Let the BeautifulSoup object know about the standard namespace
         mapping.
+
+        :param soup: A `BeautifulSoup`.
         """
         super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
         self._register_namespaces(self.DEFAULT_NSMAPS)
@@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         while parsing the document.
 
         This might be useful later on when creating CSS selectors.
+
+        :param mapping: A dictionary mapping namespace prefixes to URIs.
         """
         for key, value in mapping.items():
             if key and key not in self.soup._namespaces:
@@ -84,14 +89,23 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 self.soup._namespaces[key] = value
 
     def default_parser(self, encoding):
-        # This can either return a parser object or a class, which
-        # will be instantiated with default arguments.
+        """Find the default parser for the given encoding.
+
+        :param encoding: A string.
+        :return: Either a parser object or a class, which
+          will be instantiated with default arguments.
+        """
         if self._default_parser is not None:
             return self._default_parser
         return etree.XMLParser(
             target=self, strip_cdata=False, recover=True, encoding=encoding)
 
     def parser_for(self, encoding):
+        """Instantiate an appropriate parser for the given encoding.
+
+        :param encoding: A string.
+        :return: A parser object such as an `etree.XMLParser`.
+        """
         # Use the default parser.
         parser = self.default_parser(encoding)
 
@@ -124,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def prepare_markup(self, markup, user_specified_encoding=None,
                        exclude_encodings=None,
                        document_declared_encoding=None):
-        """
-        :yield: A series of 4-tuples.
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        lxml really wants to get a bytestring and convert it to
+        Unicode itself. So instead of using UnicodeDammit to convert
+        the bytestring to Unicode using different encodings, this
+        implementation uses EncodingDetector to iterate over the
+        encodings, and tell lxml to try to parse the document as each
+        one in turn.
+
+        :param markup: Some markup -- hopefully a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
          (markup, encoding, declared encoding,
           has undergone character replacement)
 
-        Each 4-tuple represents a strategy for parsing the document.
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
         """
-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
         is_html = not self.is_xml
         if is_html:
             self.processing_instruction_class = ProcessingInstruction
author	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
commit	bef726b23d0770860cd347b03009ffb027159572 (patch)
tree	325b698568a6fcb63018753db4830a579254f6ca
parent	5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)