diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
commit | bef726b23d0770860cd347b03009ffb027159572 (patch) | |
tree | 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/__init__.py | |
parent | 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff) |
Added docstrings for some but not all tree buidlers.
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 143 |
1 files changed, 118 insertions, 25 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 7efbf89..e8d78f9 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -27,18 +27,33 @@ HTML_5 = 'html5' class TreeBuilderRegistry(object): - + """A way of looking up TreeBuilder subclasses by their name or by desired + features. + """ + def __init__(self): self.builders_for_feature = defaultdict(list) self.builders = [] def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features.""" + """Register a treebuilder based on its advertised features. + + :param treebuilder_class: A subclass of Treebuilder. its .features + attribute should list its features. + """ for feature in treebuilder_class.features: self.builders_for_feature[feature].insert(0, treebuilder_class) self.builders.insert(0, treebuilder_class) def lookup(self, *features): + """Look up a TreeBuilder subclass with the desired features. + + :param features: A list of features to look for. If none are + provided, the most recently registered TreeBuilder subclass + will be used. + :return: A TreeBuilder subclass, or None if there's no + registered subclass with all the requested features. + """ if len(self.builders) == 0: # There are no builders at all. return None @@ -81,7 +96,7 @@ class TreeBuilderRegistry(object): builder_registry = TreeBuilderRegistry() class TreeBuilder(object): - """Turn a document into a Beautiful Soup object tree.""" + """Turn a textual document into a Beautiful Soup object tree.""" NAME = "[Unknown tree builder]" ALTERNATE_NAMES = [] @@ -109,26 +124,27 @@ class TreeBuilder(object): """Constructor. :param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this do a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this do a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. :param preserve_whitespace_tags: A list of tags to treat - the way <pre> tags are treated in HTML. Tags in this list - will have + the way <pre> tags are treated in HTML. Tags in this list + are immune from pretty-printing; their contents will always be + output as-is. :param store_line_numbers: If the parser keeps track of the - line numbers and positions of the original markup, that - information will, by default, be stored in each corresponding - `Tag` object. You can turn this off by passing - store_line_numbers=False. If the parser you're using doesn't - keep track of this information, then setting store_line_numbers=True - will do nothing. + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. """ self.soup = None if multi_valued_attributes is self.USE_DEFAULT: @@ -144,10 +160,17 @@ class TreeBuilder(object): def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. """ self.soup = soup def reset(self): + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. + """ pass def can_be_empty_element(self, tag_name): @@ -159,23 +182,56 @@ class TreeBuilder(object): For instance: an HTMLBuilder does not consider a <p> tag to be an empty-element tag (it's not in HTMLBuilder.empty_element_tags). This means an empty <p> tag - will be presented as "<p></p>", not "<p />". + will be presented as "<p></p>", not "<p/>" or "<p>". The default implementation has no opinion about which tags are empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no contents. - "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will be left alone. + + :param tag_name: The name of a markup tag. """ if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + + This method is not implemented in TreeBuilder; it must be + implemented in subclasses. + + :return: None. + """ raise NotImplementedError() def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + """ yield markup, None, None, False def test_fragment_to_document(self, fragment): @@ -188,16 +244,36 @@ class TreeBuilder(object): results against other HTML fragments. This method should not be used outside of tests. + + :param fragment: A string -- fragment of HTML. + :return: A string -- a full HTML document. """ return fragment def set_up_substitutions(self, tag): + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ return False def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """Replaces class="foo bar" with class=["foo", "bar"] + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. + + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. - Modifies its input in place. + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. """ if not attrs: return attrs @@ -225,7 +301,11 @@ class TreeBuilder(object): return attrs class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events.""" + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, but it demonstrates + how a simple TreeBuilder would work. + """ def feed(self, markup): raise NotImplementedError() @@ -317,6 +397,16 @@ class HTMLTreeBuilder(TreeBuilder): DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) def set_up_substitutions(self, tag): + """Replace the declared encoding in a <meta> tag with a placeholder, + to be substituted when the tag is output to a string. + + An HTML document may come in to Beautiful Soup as one + encoding, but exit in a different encoding, and the <meta> tag + needs to be changed to reflect this. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ # We are only interested in <meta> tags if tag.name != 'meta': return False @@ -363,6 +453,9 @@ def register_treebuilders_from(module): this_module.builder_registry.register(obj) class ParserRejectedMarkup(Exception): + """An Exception to be raised when the underlying parser simply + refuses to parse the given markup. + """ def __init__(self, message_or_exception): """Explain why the parser rejected the given markup, either with a textual explanation or another exception. @@ -375,7 +468,7 @@ class ParserRejectedMarkup(Exception): # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only -# want to use HTMLParser as a last result. +# want to use HTMLParser as a last resort. from . import _htmlparser register_treebuilders_from(_htmlparser) try: |