diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 09:59:42 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 09:59:42 -0500 |
commit | 5952879a2458fdeb74673d3ccd61fd312c7d66df (patch) | |
tree | abf4c1b06725b2a755b8121ff9fefc1b35619fce | |
parent | b6c5db5c45cd38cd4df50f415d42f518fc821ca2 (diff) |
Added docstrings to diagnose.py.
-rw-r--r-- | bs4/diagnose.py | 26 | ||||
-rw-r--r-- | bs4/element.py | 17 | ||||
-rw-r--r-- | bs4/formatter.py | 2 |
3 files changed, 35 insertions, 10 deletions
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index f9835c3..c58d610 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -20,7 +20,11 @@ import sys
 import cProfile
 
 def diagnose(data):
-    """Diagnostic suite for isolating common problems."""
+    """Diagnostic suite for isolating common problems.
+
+    :param data: A string containing markup that needs to be explained.
+    :return: None; diagnostics are printed to standard output.
+    """
     print "Diagnostic running on Beautiful Soup %s" % __version__
     print "Python version %s" % sys.version
 
@@ -90,14 +94,25 @@ def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.
 
     This lets you see how lxml parses a document when no Beautiful
-    Soup code is running.
+    Soup code is running. You can use this to determine whether
+    an lxml-specific problem is in Beautiful Soup's lxml tree builders
+    or in lxml itself.
+
+    :param data: Some markup.
+    :param html: If True, markup will be parsed with lxml's HTML parser.
+       if False, lxml's XML parser will be used.
     """
     from lxml import etree
     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
         print("%s, %4s, %s" % (event, element.tag, element.text))
 
 class AnnouncingParser(HTMLParser):
-    """Announces HTMLParser parse events, without doing anything else."""
+    """Subclass of HTMLParser that announces parse events, without doing
+    anything else.
+
+    You can use this to get a picture of how html.parser sees a given
+    document. The easiest way to do this is to call `htmlparser_trace`.
+    """
 
     def _p(self, s):
         print(s)
@@ -134,6 +149,8 @@ def htmlparser_trace(data):
 
     This lets you see how HTMLParser parses a document when no
     Beautiful Soup code is running.
+
+    :param data: Some markup.
     """
     parser = AnnouncingParser()
     parser.feed(data)
@@ -207,7 +224,7 @@ def benchmark_parsers(num_elements=100000):
     print "Raw html5lib parsed the markup in %.2fs." % (b-a)
 
 def profile(num_elements=100000, parser="lxml"):
-
+    """Use Python's profiler on a randomly generated document."""
     filehandle = tempfile.NamedTemporaryFile()
     filename = filehandle.name
 
@@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
     stats.sort_stats("cumulative")
     stats.print_stats('_html5lib|bs4', 50)
 
+# If this file is run as a script, standard input is diagnosed.
 if __name__ == '__main__':
     diagnose(sys.stdin.read())
diff --git a/bs4/element.py b/bs4/element.py
index bba3f7e..1d6ce3c 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1784,8 +1784,9 @@ class Tag(PageElement):
 
     def has_key(self, key):
         """Deprecated method. This was kind of misleading because has_key()
-        (attributes) was different from __in__ (contents). has_key()
-        is gone in Python 3, anyway.
+        (attributes) was different from __in__ (contents).
+
+        has_key() is gone in Python 3, anyway.
         """
         warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
                 key))
@@ -1794,7 +1795,13 @@ class Tag(PageElement):
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer(object):
     """Encapsulates a number of ways of matching a markup element (tag or
-    text)."""
+    string).
+
+    This is primarily used to underpin the find_* methods, but you can
+    create one yourself and pass it in as `parse_only` to the
+    `BeautifulSoup` constructor, to parse a subset of a large
+    document.
+    """
 
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
         """Constructor.
@@ -1865,7 +1872,7 @@ class SoupStrainer(object):
             return unicode(str(value))
 
     def __str__(self):
-        """A string representation of this SoupStrainer."""
+        """A human-readable representation of this SoupStrainer."""
         if self.text:
             return self.text
         else:
@@ -2055,7 +2062,7 @@ class ResultSet(list):
         self.source = source
 
     def __getattr__(self, key):
-        """Raise a helpful exception."""
+        """Raise a helpful exception to explain a common code fix."""
         raise AttributeError(
             "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
         )
diff --git a/bs4/formatter.py b/bs4/formatter.py
index c907ea8..09d15e7 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -7,7 +7,7 @@ class Formatter(EntitySubstitution):
     HTML4, HTML5, and XML. Others are configurable by the user.
 
     Formatters are passed in as the `formatter` argument to methods
-    like `Element.encode`. Most people won't need to think about
+    like `PageElement.encode`. Most people won't need to think about
     formatters, and most people who need to think about them can pass
     in one of these predefined strings as `formatter` rather than
     making a new Formatter object: