Added docstrings for some but not all tree builders.

author: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
commit: bef726b23d0770860cd347b03009ffb027159572 (patch)
tree: 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/_htmlparser.py
parent: 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
1 files changed, 98 insertions, 13 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4531407..2bb764f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -53,7 +53,11 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
-
+    """A subclass of the Python standard library's HTMLParser class, which
+    listens for HTMLParser events and translates them into calls
+    to Beautiful Soup's tree construction API.
+    """
+    
     def __init__(self, *args, **kwargs):
         HTMLParser.__init__(self, *args, **kwargs)
 
@@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.already_closed_empty_element = []
 
     def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although this
-        requirement doesn't appear to be documented.
+        """In Python 3, HTMLParser subclasses must implement error(), although
+        this requirement doesn't appear to be documented.
 
-        In Python 2, HTMLParser implements error() as raising an exception.
+        In Python 2, HTMLParser implements error() by raising an exception,
+        which we don't want to do.
 
-        In any event, this method is called only on very strange markup and our best strategy
-        is to pretend it didn't happen and keep going.
+        In any event, this method is called only on very strange
+        markup and our best strategy is to pretend it didn't happen
+        and keep going.
         """
         warnings.warn(msg)
         
     def handle_startendtag(self, name, attrs):
-        # This is only called when the markup looks like
-        # <tag/>.
+        """Handle an incoming empty-element tag.
 
+        This is only called when the markup looks like <tag/>.
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        """
         # is_startend() tells handle_starttag not to close the tag
         # just because its name matches a known empty-element tag. We
         # know that this is an empty-element tag and we want to call
@@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_endtag(name)
         
     def handle_starttag(self, name, attrs, handle_empty_element=True):
+        """Handle an opening tag, e.g. '<tag>'
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        :param handle_empty_element: True if this tag is known to be
+            an empty-element tag (i.e. there is not expected to be any
+            closing tag).
+        """
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.append(name)
             
     def handle_endtag(self, name, check_already_closed=True):
+        """Handle a closing tag, e.g. '</tag>'
+        
+        :param name: A tag name.
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
         #print "END", name
         if check_already_closed and name in self.already_closed_empty_element:
             # This is a redundant end tag for an empty-element tag.
@@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.soup.handle_endtag(name)
 
     def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
 
     def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
         # XXX workaround for a bug in HTMLParser. Remove this once
         # it's fixed in all supported versions.
         # http://bugs.python.org/issue13633
@@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_data(data)
 
     def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
         if character is not None:
             data = character
@@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_data(data)
 
     def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(Comment)
 
     def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
         self.soup.endData()
         data = data[len("DOCTYPE "):]
         self.soup.handle_data(data)
         self.soup.endData(Doctype)
 
     def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
         if data.upper().startswith('CDATA['):
             cls = CData
             data = data[len('CDATA['):]
@@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.soup.endData(cls)
 
     def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
 
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
+    found in the Python standard library.
+    """
     is_xml = False
     picklable = True
     NAME = HTMLPARSER
@@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     TRACKS_LINE_NUMBERS = True
     
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        """Constructor.
+
+        :param parser_args: Positional arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param parser_kwargs: Keyword arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param kwargs: Keyword arguments for the superclass constructor.
+        """
         super(HTMLParserTreeBuilder, self).__init__(**kwargs)
         parser_args = parser_args or []
         parser_kwargs = parser_kwargs or {}
@@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
-        """
-        :return: A 4-tuple (markup, original encoding, encoding
-        declared within markup, whether any characters had to be
-        replaced with REPLACEMENT CHARACTER).
+
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
         """
         if isinstance(markup, unicode):
+            # Parse Unicode as-is.
             yield (markup, None, None, False)
             return
 
+        # Ask UnicodeDammit to sniff the most likely encoding.
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                                exclude_encodings=exclude_encodings)
@@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
                dammit.contains_replacement_characters)
 
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+        """
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
author	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2019-12-24 10:41:57 -0500
commit	bef726b23d0770860cd347b03009ffb027159572 (patch)
tree	325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/_htmlparser.py
parent	5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)