diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
commit | bef726b23d0770860cd347b03009ffb027159572 (patch) | |
tree | 325b698568a6fcb63018753db4830a579254f6ca | |
parent | 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff) |
Added docstrings for some but not all tree builders.
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 143 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 111 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 46 |
4 files changed, 255 insertions, 48 deletions
@@ -1,6 +1,7 @@ = 4.8.2 (20191224) -* Added Python docstrings to most public methods. +* Added Python docstrings to all public methods of the most commonly + used classes. * Added a Chinese translation by Deron Wang and a Brazilian Portuguese translation by Cezar Peixeiro to the repository. diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 7efbf89..e8d78f9 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -27,18 +27,33 @@ HTML_5 = 'html5' class TreeBuilderRegistry(object): - + """A way of looking up TreeBuilder subclasses by their name or by desired + features. + """ + def __init__(self): self.builders_for_feature = defaultdict(list) self.builders = [] def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features.""" + """Register a treebuilder based on its advertised features. + + :param treebuilder_class: A subclass of Treebuilder. its .features + attribute should list its features. + """ for feature in treebuilder_class.features: self.builders_for_feature[feature].insert(0, treebuilder_class) self.builders.insert(0, treebuilder_class) def lookup(self, *features): + """Look up a TreeBuilder subclass with the desired features. + + :param features: A list of features to look for. If none are + provided, the most recently registered TreeBuilder subclass + will be used. + :return: A TreeBuilder subclass, or None if there's no + registered subclass with all the requested features. + """ if len(self.builders) == 0: # There are no builders at all. return None @@ -81,7 +96,7 @@ class TreeBuilderRegistry(object): builder_registry = TreeBuilderRegistry() class TreeBuilder(object): - """Turn a document into a Beautiful Soup object tree.""" + """Turn a textual document into a Beautiful Soup object tree.""" NAME = "[Unknown tree builder]" ALTERNATE_NAMES = [] @@ -109,26 +124,27 @@ class TreeBuilder(object): """Constructor. 
:param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this do a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this do a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. :param preserve_whitespace_tags: A list of tags to treat - the way <pre> tags are treated in HTML. Tags in this list - will have + the way <pre> tags are treated in HTML. Tags in this list + are immune from pretty-printing; their contents will always be + output as-is. :param store_line_numbers: If the parser keeps track of the - line numbers and positions of the original markup, that - information will, by default, be stored in each corresponding - `Tag` object. You can turn this off by passing - store_line_numbers=False. If the parser you're using doesn't - keep track of this information, then setting store_line_numbers=True - will do nothing. + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. 
""" self.soup = None if multi_valued_attributes is self.USE_DEFAULT: @@ -144,10 +160,17 @@ class TreeBuilder(object): def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. """ self.soup = soup def reset(self): + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. + """ pass def can_be_empty_element(self, tag_name): @@ -159,23 +182,56 @@ class TreeBuilder(object): For instance: an HTMLBuilder does not consider a <p> tag to be an empty-element tag (it's not in HTMLBuilder.empty_element_tags). This means an empty <p> tag - will be presented as "<p></p>", not "<p />". + will be presented as "<p></p>", not "<p/>" or "<p>". The default implementation has no opinion about which tags are empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no contents. - "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will be left alone. + + :param tag_name: The name of a markup tag. """ if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + + This method is not implemented in TreeBuilder; it must be + implemented in subclasses. + + :return: None. + """ raise NotImplementedError() def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. 
+ :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + """ yield markup, None, None, False def test_fragment_to_document(self, fragment): @@ -188,16 +244,36 @@ class TreeBuilder(object): results against other HTML fragments. This method should not be used outside of tests. + + :param fragment: A string -- fragment of HTML. + :return: A string -- a full HTML document. """ return fragment def set_up_substitutions(self, tag): + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ return False def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """Replaces class="foo bar" with class=["foo", "bar"] + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. + + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. - Modifies its input in place. + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. 
""" if not attrs: return attrs @@ -225,7 +301,11 @@ class TreeBuilder(object): return attrs class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events.""" + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, but it demonstrates + how a simple TreeBuilder would work. + """ def feed(self, markup): raise NotImplementedError() @@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder): DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) def set_up_substitutions(self, tag): + """Replace the declared encoding in a <meta> tag with a placeholder, + to be substituted when the tag is output to a string. + + An HTML document may come in to Beautiful Soup as one + encoding, but exit in a different encoding, and the <meta> tag + needs to be changed to reflect this. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ # We are only interested in <meta> tags if tag.name != 'meta': return False @@ -363,6 +453,9 @@ def register_treebuilders_from(module): this_module.builder_registry.register(obj) class ParserRejectedMarkup(Exception): + """An Exception to be raised when the underlying parser simply + refuses to parse the given markup. + """ def __init__(self, message_or_exception): """Explain why the parser rejected the given markup, either with a textual explanation or another exception. @@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception): # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only -# want to use HTMLParser as a last result. +# want to use HTMLParser as a last resort. from . 
import _htmlparser register_treebuilders_from(_htmlparser) try: diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 4531407..2bb764f 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -53,7 +53,11 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + """ + def __init__(self, *args, **kwargs): HTMLParser.__init__(self, *args, **kwargs) @@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser): self.already_closed_empty_element = [] def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although this - requirement doesn't appear to be documented. + """In Python 3, HTMLParser subclasses must implement error(), although + this requirement doesn't appear to be documented. - In Python 2, HTMLParser implements error() as raising an exception. + In Python 2, HTMLParser implements error() by raising an exception, + which we don't want to do. - In any event, this method is called only on very strange markup and our best strategy - is to pretend it didn't happen and keep going. + In any event, this method is called only on very strange + markup and our best strategy is to pretend it didn't happen + and keep going. """ warnings.warn(msg) def handle_startendtag(self, name, attrs): - # This is only called when the markup looks like - # <tag/>. + """Handle an incoming empty-element tag. + This is only called when the markup looks like <tag/>. + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + """ # is_startend() tells handle_starttag not to close the tag # just because its name matches a known empty-element tag. 
We # know that this is an empty-element tag and we want to call @@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_endtag(name) def handle_starttag(self, name, attrs, handle_empty_element=True): + """Handle an opening tag, e.g. '<tag>' + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ # XXX namespace attr_dict = {} for key, value in attrs: @@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser): self.already_closed_empty_element.append(name) def handle_endtag(self, name, check_already_closed=True): + """Handle a closing tag, e.g. '</tag>' + + :param name: A tag name. + :param check_already_closed: True if this tag is expected to + be the closing portion of an empty-element tag, + e.g. '<tag></tag>'. + """ #print "END", name if check_already_closed and name in self.already_closed_empty_element: # This is a redundant end tag for an empty-element tag. @@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.handle_endtag(name) def handle_data(self, data): + """Handle some textual data that shows up between tags.""" self.soup.handle_data(data) def handle_charref(self, name): + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Character number, possibly in hexadecimal. + """ # XXX workaround for a bug in HTMLParser. Remove this once # it's fixed in all supported versions. # http://bugs.python.org/issue13633 @@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_data(data) def handle_entityref(self, name): + """Handle a named entity reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Name of the entity reference. 
+ """ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) if character is not None: data = character @@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_data(data) def handle_comment(self, data): + """Handle an HTML comment. + + :param data: The text of the comment. + """ self.soup.endData() self.soup.handle_data(data) self.soup.endData(Comment) def handle_decl(self, data): + """Handle a DOCTYPE declaration. + + :param data: The text of the declaration. + """ self.soup.endData() data = data[len("DOCTYPE "):] self.soup.handle_data(data) self.soup.endData(Doctype) def unknown_decl(self, data): + """Handle a declaration of unknown type -- probably a CDATA block. + + :param data: The text of the declaration. + """ if data.upper().startswith('CDATA['): cls = CData data = data[len('CDATA['):] @@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData(cls) def handle_pi(self, data): + """Handle a processing instruction. + + :param data: The text of the instruction. + """ self.soup.endData() self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) class HTMLParserTreeBuilder(HTMLTreeBuilder): - + """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, + found in the Python standard library. + """ is_xml = False picklable = True NAME = HTMLPARSER @@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): TRACKS_LINE_NUMBERS = True def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + """Constructor. + + :param parser_args: Positional arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param parser_kwargs: Keyword arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param kwargs: Keyword arguments for the superclass constructor. 
+ """ super(HTMLParserTreeBuilder, self).__init__(**kwargs) parser_args = parser_args or [] parser_kwargs = parser_kwargs or {} @@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). + + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. """ if isinstance(markup, unicode): + # Parse Unicode as-is. yield (markup, None, None, False) return + # Ask UnicodeDammit to sniff the most likely encoding. try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) @@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): dammit.contains_replacement_characters) def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. 
+ """ args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index c83dd2d..1b44d75 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder): # But instead we build an XMLParser or HTMLParser object to serve # as the target of parse messages, and those messages don't include # line numbers. + # See: https://bugs.launchpad.net/lxml/+bug/1846906 def initialize_soup(self, soup): """Let the BeautifulSoup object know about the standard namespace mapping. + + :param soup: A `BeautifulSoup`. """ super(LXMLTreeBuilderForXML, self).initialize_soup(soup) self._register_namespaces(self.DEFAULT_NSMAPS) @@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): while parsing the document. This might be useful later on when creating CSS selectors. + + :param mapping: A dictionary mapping namespace prefixes to URIs. """ for key, value in mapping.items(): if key and key not in self.soup._namespaces: @@ -84,14 +89,23 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup._namespaces[key] = value def default_parser(self, encoding): - # This can either return a parser object or a class, which - # will be instantiated with default arguments. + """Find the default parser for the given encoding. + + :param encoding: A string. + :return: Either a parser object or a class, which + will be instantiated with default arguments. + """ if self._default_parser is not None: return self._default_parser return etree.XMLParser( target=self, strip_cdata=False, recover=True, encoding=encoding) def parser_for(self, encoding): + """Instantiate an appropriate parser for the given encoding. + + :param encoding: A string. + :return: A parser object such as an `etree.XMLParser`. + """ # Use the default parser. 
parser = self.default_parser(encoding) @@ -124,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): - """ - :yield: A series of 4-tuples. + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + lxml really wants to get a bytestring and convert it to + Unicode itself. So instead of using UnicodeDammit to convert + the bytestring to Unicode using different encodings, this + implementation uses EncodingDetector to iterate over the + encodings, and tell lxml to try to parse the document as each + one in turn. + + :param markup: Some markup -- hopefully a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: (markup, encoding, declared encoding, has undergone character replacement) - Each 4-tuple represents a strategy for parsing the document. + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. """ - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. is_html = not self.is_xml if is_html: self.processing_instruction_class = ProcessingInstruction |