diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
commit | bef726b23d0770860cd347b03009ffb027159572 (patch) | |
tree | 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/_htmlparser.py | |
parent | 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff) |
Added docstrings for some but not all tree builders.
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 111 |
1 files changed, 98 insertions, 13 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 4531407..2bb764f 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -53,7 +53,11 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + """ + def __init__(self, *args, **kwargs): HTMLParser.__init__(self, *args, **kwargs) @@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser): self.already_closed_empty_element = [] def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although this - requirement doesn't appear to be documented. + """In Python 3, HTMLParser subclasses must implement error(), although + this requirement doesn't appear to be documented. - In Python 2, HTMLParser implements error() as raising an exception. + In Python 2, HTMLParser implements error() by raising an exception, + which we don't want to do. - In any event, this method is called only on very strange markup and our best strategy - is to pretend it didn't happen and keep going. + In any event, this method is called only on very strange + markup and our best strategy is to pretend it didn't happen + and keep going. """ warnings.warn(msg) def handle_startendtag(self, name, attrs): - # This is only called when the markup looks like - # <tag/>. + """Handle an incoming empty-element tag. + This is only called when the markup looks like <tag/>. + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + """ # is_startend() tells handle_starttag not to close the tag # just because its name matches a known empty-element tag. 
We # know that this is an empty-element tag and we want to call @@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_endtag(name) def handle_starttag(self, name, attrs, handle_empty_element=True): + """Handle an opening tag, e.g. '<tag>' + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ # XXX namespace attr_dict = {} for key, value in attrs: @@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser): self.already_closed_empty_element.append(name) def handle_endtag(self, name, check_already_closed=True): + """Handle a closing tag, e.g. '</tag>' + + :param name: A tag name. + :param check_already_closed: True if this tag is expected to + be the closing portion of an empty-element tag, + e.g. '<tag></tag>'. + """ #print "END", name if check_already_closed and name in self.already_closed_empty_element: # This is a redundant end tag for an empty-element tag. @@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.handle_endtag(name) def handle_data(self, data): + """Handle some textual data that shows up between tags.""" self.soup.handle_data(data) def handle_charref(self, name): + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Character number, possibly in hexadecimal. + """ # XXX workaround for a bug in HTMLParser. Remove this once # it's fixed in all supported versions. # http://bugs.python.org/issue13633 @@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_data(data) def handle_entityref(self, name): + """Handle a named entity reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Name of the entity reference. 
+ """ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) if character is not None: data = character @@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser): self.handle_data(data) def handle_comment(self, data): + """Handle an HTML comment. + + :param data: The text of the comment. + """ self.soup.endData() self.soup.handle_data(data) self.soup.endData(Comment) def handle_decl(self, data): + """Handle a DOCTYPE declaration. + + :param data: The text of the declaration. + """ self.soup.endData() data = data[len("DOCTYPE "):] self.soup.handle_data(data) self.soup.endData(Doctype) def unknown_decl(self, data): + """Handle a declaration of unknown type -- probably a CDATA block. + + :param data: The text of the declaration. + """ if data.upper().startswith('CDATA['): cls = CData data = data[len('CDATA['):] @@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData(cls) def handle_pi(self, data): + """Handle a processing instruction. + + :param data: The text of the instruction. + """ self.soup.endData() self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) class HTMLParserTreeBuilder(HTMLTreeBuilder): - + """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, + found in the Python standard library. + """ is_xml = False picklable = True NAME = HTMLPARSER @@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): TRACKS_LINE_NUMBERS = True def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + """Constructor. + + :param parser_args: Positional arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param parser_kwargs: Keyword arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param kwargs: Keyword arguments for the superclass constructor. 
+ """ super(HTMLParserTreeBuilder, self).__init__(**kwargs) parser_args = parser_args or [] parser_kwargs = parser_kwargs or {} @@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). + + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. """ if isinstance(markup, unicode): + # Parse Unicode as-is. yield (markup, None, None, False) return + # Ask UnicodeDammit to sniff the most likely encoding. try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) @@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): dammit.contains_replacement_characters) def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + """ args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup |