diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-03-21 11:27:08 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-03-21 11:27:08 -0400 |
commit | 3be39f46ec502fe20d5a95ed3292d0dccd3b1aec (patch) | |
tree | 01a8f5556c32dadc4e47f639d0a4dac078adeab2 /bs4/element.py | |
parent | d923a1cc966a4faa966f5d6f0a1fe09bd482949a (diff) |
Reimplemented the pretty-print algorithm to remove recursive function calls.
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 240 |
1 files changed, 154 insertions, 86 deletions
diff --git a/bs4/element.py b/bs4/element.py index 1dd5984..bcfad08 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1642,108 +1642,176 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. + def str_from_sax(self): + data = [] + for e, content in self.saxlike(): + if e == 'string': + data.append(content) + elif e in ( 'openclose', 'open'): + data.append("<%s>" % content.name) + elif e == 'close': + data.append("</%s>" % content.name) + return "".join(data) - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. + @property + def self_and_descendants(self): + if not self.hidden: + yield self + for i in self.descendants: + yield i + + def saxlike(self): + """Yield a sequence of SAX-like events that can be used to + reconstruct this parse tree. + + This lets us recreate the nested structure of the document + without using recursive method calls. """ + tag_stack = [] + + for c in self.self_and_descendants: + + # If the parent of the element we're about to yield is not + # the tag currently on the stack, it means that the tag on + # the stack closed before this element appeared. + while tag_stack and c.parent != tag_stack[-1]: + now_closed_tag = tag_stack.pop() + yield "close", now_closed_tag + + if isinstance(c, Tag): + if c.is_empty_element: + yield "openclose", c + else: + yield "open", c + tag_stack.append(c) + continue + else: + yield "string", c + + while tag_stack: + now_closed_tag = tag_stack.pop() + prettyprint_suppressed_by = None + yield "close", now_closed_tag + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and # over again. if not isinstance(formatter, Formatter): formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key + + if indent_level is True: + indent_level = 0 + + string_literal_mode = False + for event, element in self.saxlike(): + if event in ('open', 'openclose'): + piece = element._format_tag( + eventual_encoding, formatter, opening=True) + elif event == 'close': + piece = element._format_tag( + eventual_encoding, formatter, opening=False) + if indent_level is not None: + indent_level -= 1 + string_literal_mode = False else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) - - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' + piece = element.output_ready(formatter) + + if isinstance(element, Tag) and not element._should_pretty_print(): + if event in ('open', 'openclose'): + # After processing this event we will be in string + # literal mode. + string_literal_mode = True + indent_before = True + indent_after = False + else: + # After processing this event we will no longer be + # in string literal mode. + string_literal_mode = False + indent_before = False + indent_after = True + elif string_literal_mode: + indent_before = indent_after = False + else: + indent_before = indent_after = True + if indent_level is not None: + if (indent_before or indent_after): + if isinstance(element, NavigableString): + piece = piece.strip() + piece = self._indent( + piece, element, indent_level, formatter, + indent_before, indent_after + ) + if event == 'open': + indent_level += 1 + pieces.append(piece) + return "".join(pieces) + + def _indent(self, s, e, indent_level, formatter, indent_before, indent_after): + space_before = '' + if indent_before: + space_before = (formatter.indent * indent_level) + + space_after = '' + if indent_after: + space_after = "\n" + + return space_before + s + space_after + + def _format_tag(self, eventual_encoding, formatter, opening): + # A tag starts with the < character. + + # Then the / character, if this is a closing tag. + closing_slash = '' + if not opening: + closing_slash = '/' + + # Then an optional namespace prefix. prefix = '' if self.prefix: prefix = self.prefix + ":" - if self.is_empty_element: - close = formatter.void_element_close_prefix or '' - else: - closeTag = '</%s%s>' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (formatter.indent * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter - ) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' + # Then a list of attribute values, if this is an opening tag. + attribute_string = '' + if opening: + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) if attrs: attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s - - def _should_pretty_print(self, indent_level): + + # Then an optional closing slash (for a void element in an + # XML document). + void_element_closing_slash = '' + if self.is_empty_element: + void_element_closing_slash = formatter.void_element_close_prefix or '' + + # Put it all together. + return f'<{closing_slash}{prefix}{self.name}{attribute_string}{void_element_closing_slash}>' + + def _should_pretty_print(self, indent_level=1): """Should this tag be pretty-printed? Most of them should, but some (such as <pre> in HTML |