diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-20 15:07:27 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-20 15:07:27 -0500 |
commit | ee3f30b0e778747fb701856a33713369f778834f (patch) | |
tree | 1b7aeedc410a2bbd4262dfd984d70c640d5cec38 | |
parent | f5f0e1ea521f813964b4f9cbea53fcdfc3942f56 (diff) |
Added docstrings to all methods in __init__.py
-rw-r--r-- | bs4/__init__.py | 178 |
1 files changed, 120 insertions, 58 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 1ea7b97..0f3a0e3 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -52,18 +52,26 @@ from .element import ( 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): - """ - This class defines the basic interface called by the tree builders. + """A data structure representing a parsed HTML or XML document. + + Most of the methods you'll call on a BeautifulSoup object are inherited from + PageElement or Tag. + + Internally, this class defines the basic interface called by the + tree builders when converting an HTML/XML document into a data + structure. The interface abstracts away the differences between + parsers. To write a new tree builder, you'll need to understand + these methods as a whole. - These methods will be called by the parser: - reset() - feed(markup) + These methods will be called by the BeautifulSoup constructor: + * reset() + * feed(markup) The tree builder may call these methods from its feed() implementation: - handle_starttag(name, attrs) # See note about return value - handle_endtag(name) - handle_data(data) # Appends to the current data node - endData(containerClass) # Ends the current data node + * handle_starttag(name, attrs) # See note about return value + * handle_endtag(name) + * handle_data(data) # Appends to the current data node + * endData(containerClass) # Ends the current data node No matter how complicated the underlying parser is, you should be able to build a tree using 'start tag' events, 'end tag' events, @@ -73,13 +81,19 @@ class BeautifulSoup(Tag): like HTML's <br> tag), call handle_starttag and then handle_endtag. """ + + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. 
This name makes it clear the BeautifulSoup + # object isn't a real markup tag. ROOT_TAG_NAME = u'[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. DEFAULT_BUILDER_FEATURES = ['html', 'fast'] - - ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + # A bytestring containing all ASCII whitespace characters, used in + # endData() to detect data chunks that seem 'empty'. + ASCII_SPACES = b'\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" @@ -89,50 +103,49 @@ class BeautifulSoup(Tag): """Constructor. :param markup: A string or a file-like object representing - markup to be parsed. + markup to be parsed. :param features: Desirable features of the parser to be used. This - may be the name of a specific parser ("lxml", "lxml-xml", - "html.parser", or "html5lib") or it may be the type of markup - to be used ("html", "html5", "xml"). It's recommended that you - name a specific parser, so that Beautiful Soup gives you the - same results across platforms and virtual environments. + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. :param builder: A TreeBuilder subclass to instantiate (or - instance to use) instead of looking one up based on - `features`. 
You only need to use this if you've implemented a - custom TreeBuilder. + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. :param parse_only: A SoupStrainer. Only parts of the document - matching the SoupStrainer will be considered. This is useful - when parsing part of a document that would otherwise be too - large to fit into memory. + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. :param from_encoding: A string indicating the encoding of the - document to be parsed. Pass this in if Beautiful Soup is - guessing wrongly about the document's encoding. + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. :param exclude_encodings: A list of strings indicating - encodings known to be wrong. Pass this in if you don't know - the document's encoding but you know Beautiful Soup's guess is - wrong. + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. :param element_classes: A dictionary mapping BeautifulSoup - classes like Tag and NavigableString to other classes you'd - like to be instantiated instead as the parse tree is - built. This is useful for using subclasses to modify the - default behavior of Tag or NavigableString. + classes like Tag and NavigableString, to other classes you'd + like to be instantiated instead as the parse tree is + built. This is useful for subclassing Tag or NavigableString + to modify default behavior. :param kwargs: For backwards compatibility purposes, the - constructor accepts certain keyword arguments used in - Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4; they will result in a warning and then be ignored. 
- - Apart from this, any keyword arguments passed into the BeautifulSoup - constructor are propagated to the TreeBuilder constructor. This - makes it possible to configure a TreeBuilder beyond saying - which one to use. - + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder by passing in arguments, + not just by saying which one to use. """ if 'convertEntities' in kwargs: @@ -329,6 +342,9 @@ class BeautifulSoup(Tag): self.builder.soup = None def __copy__(self): + """A copy of a BeautifulSoup object is created by converting the + document to a string and parsing it again. + """ copy = type(self)( self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' ) @@ -349,9 +365,10 @@ class BeautifulSoup(Tag): @staticmethod def _check_markup_is_url(markup): - """ - Check if markup looks like it's actually a url and raise a warning - if so. Markup can be unicode or str (py2) / bytes (py3). + """Error-handling method to raise a warning if incoming markup looks + like a URL. + + :param markup: A string. """ if isinstance(markup, bytes): space = b' ' @@ -376,6 +393,9 @@ class BeautifulSoup(Tag): ) def _feed(self): + """Internal method that parses previously set markup, creating a large + number of Tag and NavigableString objects. + """ # Convert the document to Unicode. self.builder.reset() @@ -386,6 +406,9 @@ class BeautifulSoup(Tag): self.popTag() def reset(self): + """Reset this object to a state as though it had never parsed any + markup. 
+ """ Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) self.hidden = 1 self.builder.reset() @@ -397,7 +420,7 @@ class BeautifulSoup(Tag): def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs): - """Create a new tag associated with this soup.""" + """Create a new Tag associated with this BeautifulSoup object.""" kwattrs.update(attrs) return self.element_classes.get(Tag, Tag)( None, self.builder, name, namespace, nsprefix, kwattrs, @@ -405,19 +428,28 @@ class BeautifulSoup(Tag): ) def new_string(self, s, subclass=None): - """Create a new NavigableString associated with this soup.""" + """Create a new NavigableString associated with this BeautifulSoup + object. + """ subclass = subclass or self.element_classes.get( NavigableString, NavigableString ) return subclass(s) def insert_before(self, successor): + """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement + it because there is nothing before or after it in the parse tree. + """ raise NotImplementedError("BeautifulSoup objects don't support insert_before().") def insert_after(self, successor): + """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement + it because there is nothing before or after it in the parse tree. 
+ """ raise NotImplementedError("BeautifulSoup objects don't support insert_after().") def popTag(self): + """Internal method called by _popToTag when a tag is closed.""" tag = self.tagStack.pop() if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: self.preserve_whitespace_tag_stack.pop() @@ -427,6 +459,7 @@ class BeautifulSoup(Tag): return self.currentTag def pushTag(self, tag): + """Internal method called by handle_starttag when a tag is opened.""" #print "Push", tag.name if self.currentTag is not None: self.currentTag.contents.append(tag) @@ -436,7 +469,9 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack.append(tag) def endData(self, containerClass=None): - + """Method called by the TreeBuilder when the end of a data segment + occurs. + """ # Default container is NavigableString. containerClass = containerClass or NavigableString @@ -476,7 +511,7 @@ class BeautifulSoup(Tag): self.object_was_parsed(o) def object_was_parsed(self, o, parent=None, most_recent_element=None): - """Add an object to the parse tree.""" + """Method called by the TreeBuilder to integrate an object into the parse tree.""" if parent is None: parent = self.currentTag if most_recent_element is not None: @@ -545,9 +580,14 @@ class BeautifulSoup(Tag): def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" + instance of the given tag. + + :param name: Pop up to the most recent tag with this name. + :param nsprefix: The namespace prefix that goes with `name`. + :param inclusivePop: If this is false, pops the tag stack up + to but *not* including the most recent instance of the + given tag. + """ #print "Popping to %s" % name if name == self.ROOT_TAG_NAME: # The BeautifulSoup object itself can never be popped. 
@@ -568,14 +608,21 @@ class BeautifulSoup(Tag): def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, sourcepos=None): - """Push a start tag on to the stack. + """Called by the tree builder when a new tag is encountered. - If this method returns None, the tag was rejected by the + :param name: Name of the tag. + :param nsprefix: Namespace prefix for the tag. + :param attrs: A dictionary of attribute values. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within `sourceline` where this + tag was found. + + If this method returns None, the tag was rejected by an active SoupStrainer. You should proceed as if the tag had not occurred in the document. For instance, if this was a self-closing tag, don't call handle_endtag. """ - # print "Start tag %s: %s" % (name, attrs) self.endData() @@ -598,19 +645,30 @@ class BeautifulSoup(Tag): return tag def handle_endtag(self, name, nsprefix=None): + """Called by the tree builder when an ending tag is encountered. + + :param name: Name of the tag. + :param nsprefix: Namespace prefix for the tag. + """ #print "End tag: " + name self.endData() self._popToTag(name, nsprefix) def handle_data(self, data): + """Called by the tree builder when a chunk of textual data is encountered.""" self.current_data.append(data) def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Returns a string or Unicode representation of this document. - To get Unicode, pass None for encoding.""" + """Returns a string or Unicode representation of the parse tree + as an HTML or XML document. + :param pretty_print: If this is True, indentation will be used to + make the document more readable. + :param eventual_encoding: The encoding of the final document. + If this is None, the document will be a Unicode string. 
+ """ if self.is_xml: # Print the XML declaration encoding_part = '' @@ -626,7 +684,7 @@ class BeautifulSoup(Tag): return prefix + super(BeautifulSoup, self).decode( indent_level, eventual_encoding, formatter) -# Alias to make it easier to type import: 'from bs4 import _soup' +# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' _s = BeautifulSoup _soup = BeautifulSoup @@ -642,13 +700,17 @@ class BeautifulStoneSoup(BeautifulSoup): class StopParsing(Exception): + """Exception raised by a TreeBuilder if it's unable to continue parsing.""" pass class FeatureNotFound(ValueError): + """Exception raised by the BeautifulSoup constructor if no parser with the + requested features is found. + """ pass -#By default, act as an HTML pretty-printer. +#If this file is run as a script, act as an HTML pretty-printer. if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) |