Reimplemented the pretty-print algorithm to remove recursive function calls.

author: Leonard Richardson <leonardr@segfault.org> 2023-03-21 11:27:08 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2023-03-21 11:27:08 -0400
commit: 3be39f46ec502fe20d5a95ed3292d0dccd3b1aec (patch)
tree: 01a8f5556c32dadc4e47f639d0a4dac078adeab2
parent: d923a1cc966a4faa966f5d6f0a1fe09bd482949a (diff)
4 files changed, 168 insertions, 100 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 1dd5984..bcfad08 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1642,108 +1642,176 @@ class Tag(PageElement):
         u = self.decode(indent_level, encoding, formatter)
         return u.encode(encoding, errors)
 
-    def decode(self, indent_level=None,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
-        """Render a Unicode representation of this PageElement and its
-        contents.
+    def str_from_sax(self):
+        data = []
+        for e, content in self.saxlike():
+            if e == 'string':
+                data.append(content)
+            elif e in ( 'openclose', 'open'):
+                data.append("<%s>" % content.name)
+            elif e == 'close':
+                data.append("</%s>" % content.name)
+        return "".join(data)
 
-        :param indent_level: Each line of the rendering will be
-             indented this many spaces. Used internally in
-             recursive calls while pretty-printing.
-        :param eventual_encoding: The tag is destined to be
-            encoded into this encoding. This method is _not_
-            responsible for performing that encoding. This information
-            is passed in so that it can be substituted in if the
-            document contains a <META> tag that mentions the document's
-            encoding.
-        :param formatter: A Formatter object, or a string naming one of
-            the standard formatters.
+    @property
+    def self_and_descendants(self):
+        if not self.hidden:
+            yield self
+        for i in self.descendants:
+            yield i
+
+    def saxlike(self):
+        """Yield a sequence of SAX-like events that can be used to
+        reconstruct this parse tree.
+
+        This lets us recreate the nested structure of the document
+        without using recursive method calls.
         """
+        tag_stack = []
+
+        for c in self.self_and_descendants:
+
+            # If the parent of the element we're about to yield is not
+            # the tag on top of the stack, that tag (and possibly its
+            # ancestors) closed before this element appeared.
+            while tag_stack and c.parent != tag_stack[-1]:
+                now_closed_tag = tag_stack.pop()
+                yield "close", now_closed_tag
+
+            if isinstance(c, Tag):
+                if c.is_empty_element:
+                    yield "openclose", c
+                else:
+                    yield "open", c
+                    tag_stack.append(c)
+                    continue
+            else:
+                yield "string", c
+
+        while tag_stack:
+            now_closed_tag = tag_stack.pop()
+            prettyprint_suppressed_by = None
+            yield "close", now_closed_tag
 
+    def decode(self, indent_level=None,
+                     eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+                     formatter="minimal"):
+        pieces = []
         # First off, turn a non-Formatter `formatter` into a Formatter
         # object. This will stop the lookup from happening over and
         # over again.
         if not isinstance(formatter, Formatter):
             formatter = self.formatter_for_name(formatter)
-        attributes = formatter.attributes(self)
-        attrs = []
-        for key, val in attributes:
-            if val is None:
-                decoded = key
+
+        if indent_level is True:
+            indent_level = 0
+
+        string_literal_mode = False
+        for event, element in self.saxlike():
+            if event in ('open', 'openclose'):
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=True)
+            elif event == 'close':
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=False)
+                if indent_level is not None:
+                    indent_level -= 1
+                string_literal_mode = False
             else:
-                if isinstance(val, list) or isinstance(val, tuple):
-                    val = ' '.join(val)
-                elif not isinstance(val, str):
-                    val = str(val)
-                elif (
-                        isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None
-                ):
-                    val = val.encode(eventual_encoding)
-
-                text = formatter.attribute_value(val)
-                decoded = (
-                    str(key) + '='
-                    + formatter.quoted_attribute_value(text))
-            attrs.append(decoded)
-        close = ''
-        closeTag = ''
+                piece = element.output_ready(formatter)
+
+            if isinstance(element, Tag) and not element._should_pretty_print():
+                if event in ('open', 'openclose'):
+                    # After processing this event we will be in string
+                    # literal mode.
+                    string_literal_mode = True
+                    indent_before = True
+                    indent_after = False
+                else:
+                    # After processing this event we will no longer be
+                    # in string literal mode.
+                    string_literal_mode = False
+                    indent_before = False
+                    indent_after = True
+            elif string_literal_mode:
+                indent_before = indent_after = False
+            else:
+                indent_before = indent_after = True
 
+            if indent_level is not None:
+                if (indent_before or indent_after):
+                    if isinstance(element, NavigableString):
+                        piece = piece.strip()
+                    piece = self._indent(
+                        piece, element, indent_level, formatter,
+                        indent_before, indent_after
+                    )
+                if event == 'open':
+                    indent_level += 1
+            pieces.append(piece)
+        return "".join(pieces)
+
+    def _indent(self, s, e, indent_level, formatter, indent_before, indent_after):
+        space_before = ''
+        if indent_before:
+            space_before = (formatter.indent * indent_level)
+
+        space_after = ''
+        if indent_after:
+            space_after = "\n"
+
+        return space_before + s + space_after
+
+    def _format_tag(self, eventual_encoding, formatter, opening):
+        # A tag starts with the < character.
+
+        # Then the / character, if this is a closing tag.
+        closing_slash = ''
+        if not opening:
+            closing_slash = '/'
+
+        # Then an optional namespace prefix.
         prefix = ''
         if self.prefix:
             prefix = self.prefix + ":"
 
-        if self.is_empty_element:
-            close = formatter.void_element_close_prefix or ''
-        else:
-            closeTag = '</%s%s>' % (prefix, self.name)
-
-        pretty_print = self._should_pretty_print(indent_level)
-        space = ''
-        indent_space = ''
-        if indent_level is not None:
-            indent_space = (formatter.indent * (indent_level - 1))
-        if pretty_print:
-            space = indent_space
-            indent_contents = indent_level + 1
-        else:
-            indent_contents = None
-        contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter
-        )
-
-        if self.hidden:
-            # This is the 'document root' object.
-            s = contents
-        else:
-            s = []
-            attribute_string = ''
+        # Then a list of attribute values, if this is an opening tag.
+        attribute_string = ''
+        if opening:
+            attributes = formatter.attributes(self)
+            attrs = []
+            for key, val in attributes:
+                if val is None:
+                    decoded = key
+                else:
+                    if isinstance(val, list) or isinstance(val, tuple):
+                        val = ' '.join(val)
+                    elif not isinstance(val, str):
+                        val = str(val)
+                    elif (
+                            isinstance(val, AttributeValueWithCharsetSubstitution)
+                            and eventual_encoding is not None
+                    ):
+                        val = val.encode(eventual_encoding)
+
+                    text = formatter.attribute_value(val)
+                    decoded = (
+                        str(key) + '='
+                        + formatter.quoted_attribute_value(text))
+                attrs.append(decoded)
             if attrs:
                 attribute_string = ' ' + ' '.join(attrs)
-            if indent_level is not None:
-                # Even if this particular tag is not pretty-printed,
-                # we should indent up to the start of the tag.
-                s.append(indent_space)
-            s.append('<%s%s%s%s>' % (
-                    prefix, self.name, attribute_string, close))
-            if pretty_print:
-                s.append("\n")
-            s.append(contents)
-            if pretty_print and contents and contents[-1] != "\n":
-                s.append("\n")
-            if pretty_print and closeTag:
-                s.append(space)
-            s.append(closeTag)
-            if indent_level is not None and closeTag and self.next_sibling:
-                # Even if this particular tag is not pretty-printed,
-                # we're now done with the tag, and we should add a
-                # newline if appropriate.
-                s.append("\n")
-            s = ''.join(s)
-        return s
-
-    def _should_pretty_print(self, indent_level):
+
+        # Then an optional closing slash (for a void element in an
+        # XML document).
+        void_element_closing_slash = ''
+        if self.is_empty_element:
+            void_element_closing_slash = formatter.void_element_close_prefix or ''
+
+        # Put it all together.
+        return f'<{closing_slash}{prefix}{self.name}{attribute_string}{void_element_closing_slash}>'
+
+    def _should_pretty_print(self, indent_level=1):
         """Should this tag be pretty-printed?
 
         Most of them should, but some (such as <pre> in HTML
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index d8b3b9b..dbb1593 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -551,8 +551,8 @@ Hello, world!
         """Whitespace must be preserved in <pre> and <textarea> tags,
         even if that would mean not prettifying the markup.
         """
-        pre_markup = "<pre>   </pre>"
-        textarea_markup = "<textarea> woo\nwoo  </textarea>"
+        pre_markup = "<pre>a   z</pre>\n"
+        textarea_markup = "<textarea> woo\nwoo  </textarea>\n"
         self.assert_soup(pre_markup)
         self.assert_soup(textarea_markup)
 
@@ -563,7 +563,7 @@ Hello, world!
         assert soup.textarea.prettify() == textarea_markup
 
         soup = self.soup("<textarea></textarea>")
-        assert soup.textarea.prettify() == "<textarea></textarea>"
+        assert soup.textarea.prettify() == "<textarea></textarea>\n"
 
     def test_nested_inline_elements(self):
         """Inline elements can be nested indefinitely."""
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
index 84d4e3b..528b16d 100644
--- a/bs4/tests/test_formatter.py
+++ b/bs4/tests/test_formatter.py
@@ -80,20 +80,20 @@ class TestFormatter(SoupTest):
     @pytest.mark.parametrize(
         "indent,expect",
         [
-            (None, '<a>\n<b>\ntext\n</b>\n</a>'),
-            (-1, '<a>\n<b>\ntext\n</b>\n</a>'),
-            (0, '<a>\n<b>\ntext\n</b>\n</a>'),
-            ("", '<a>\n<b>\ntext\n</b>\n</a>'),
+            (None, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+            (-1, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+            (0, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+            ("", '<a>\n<b>\ntext\n</b>\n</a>\n'),
 
-            (1, '<a>\n <b>\n  text\n </b>\n</a>'),
-            (2, '<a>\n  <b>\n    text\n  </b>\n</a>'),
+            (1, '<a>\n <b>\n  text\n </b>\n</a>\n'),
+            (2, '<a>\n  <b>\n    text\n  </b>\n</a>\n'),
 
-            ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
-            ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
+            ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'),
+            ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'),
 
             # Some invalid inputs -- the default behavior is used.
-            (object(), '<a>\n <b>\n  text\n </b>\n</a>'),
-            (b'bytes', '<a>\n <b>\n  text\n </b>\n</a>'),
+            (object(), '<a>\n <b>\n  text\n </b>\n</a>\n'),
+            (b'bytes', '<a>\n <b>\n  text\n </b>\n</a>\n'),
         ]
     )
     def test_indent(self, indent, expect):
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index a94280f..f8eb9bb 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -156,7 +156,7 @@ class TestFormatters(SoupTest):
         soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>")
         # Everything outside the <pre> tag is reformatted, but everything
         # inside is left alone.
-        assert '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
+        assert '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
 
     def test_prettify_accepts_formatter_function(self):
         soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
author: Leonard Richardson <leonardr@segfault.org> 2023-03-21 11:27:08 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2023-03-21 11:27:08 -0400
commit: 3be39f46ec502fe20d5a95ed3292d0dccd3b1aec (patch)
tree: 01a8f5556c32dadc4e47f639d0a4dac078adeab2
parent: d923a1cc966a4faa966f5d6f0a1fe09bd482949a (diff)