It's now possible to customize the way output is indented by providing a value for the 'indent' argument to the Formatter constructor. The 'indent' argument works very similarly to the argument of the same name in the Python standard library's json.dump() method. [bug=1955497]
author: Leonard Richardson <leonardr@segfault.org> 2021-12-21 12:57:04 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-12-21 12:57:04 -0500
commit: 3ac8524a1263f170ae0a9096d255d3e28aa76340 (patch)
tree: e6aab155135f553f3043a425dcf8e61884091919
parent: 792a9e485e1b110534345a4f96fd65099879421e (diff)
5 files changed, 88 insertions, 9 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d235cf4..8ac1f2a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,12 @@ Python 2 was revision 605.
   to make it possible to treat ruby text specially in get_text() calls.
   [bug=1941980]
 
+* It's now possible to customize the way output is indented by
+  providing a value for the 'indent' argument to the Formatter
+  constructor. The 'indent' argument works very similarly to the
+  argument of the same name in the Python standard library's
+  json.dump() method. [bug=1955497]
+
 * If the charset-normalizer Python module
   (https://pypi.org/project/charset-normalizer/) is installed, Beautiful
   Soup will use it to detect the character sets of incoming documents.
diff --git a/bs4/element.py b/bs4/element.py
index c6cb2eb..86123f8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1614,8 +1614,10 @@ class Tag(PageElement):
 
         :param encoding: The destination encoding.
         :param indent_level: Each line of the rendering will be
-            indented this many spaces. Used internally in
-            recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
         :param formatter: A Formatter object, or a string naming one of
             the standard formatters.
         :param errors: An error handling strategy such as
@@ -1691,7 +1693,7 @@ class Tag(PageElement):
         space = ''
         indent_space = ''
         if indent_level is not None:
-            indent_space = (' ' * (indent_level - 1))
+            indent_space = (formatter.indent * (indent_level - 1))
         if pretty_print:
             space = indent_space
             indent_contents = indent_level + 1
@@ -1766,8 +1768,10 @@ class Tag(PageElement):
         """Renders the contents of this tag as a Unicode string.
 
         :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
 
         :param eventual_encoding: The tag is destined to be
            encoded into this encoding. decode_contents() is _not_
@@ -1778,6 +1782,7 @@ class Tag(PageElement):
 
         :param formatter: A Formatter object, or a string naming one of
             the standard Formatters.
+
         """
         # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
@@ -1800,7 +1805,7 @@ class Tag(PageElement):
                 text = text.strip()
             if text:
                 if pretty_print and not preserve_whitespace:
-                    s.append(" " * (indent_level - 1))
+                    s.append(formatter.indent * (indent_level - 1))
                 s.append(text)
                 if pretty_print and not preserve_whitespace:
                     s.append("\n")
@@ -1812,8 +1817,10 @@ class Tag(PageElement):
         """Renders the contents of this PageElement as a bytestring.
 
         :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
 
         :param eventual_encoding: The bytestring will be in this encoding.
 
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 3bd9f85..65e57b5 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -49,7 +49,7 @@ class Formatter(EntitySubstitution):
     def __init__(
             self, language=None, entity_substitution=None,
             void_element_close_prefix='/', cdata_containing_tags=None,
-            empty_attributes_are_booleans=False,
+            empty_attributes_are_booleans=False, indent=1,
     ):
         """Constructor.
 
@@ -69,6 +69,15 @@ class Formatter(EntitySubstitution):
         :param empty_attributes_are_booleans: Render attributes whose value
             is the empty string as HTML-style boolean attributes.
             (Attributes whose value is None are always rendered this way.)
+
+        :param indent: If indent is a non-negative integer or string,
+            then the contents of elements will be indented
+            appropriately when pretty-printing. An indent of 0, a
+            negative number, or "" will only insert newlines. Using a
+            positive integer indent indents that many spaces per
+            level. If indent is a string (such as "\t"), that string
+            is used to indent each level. The default behavior is to
+            indent one space per level.
         """
         self.language = language
         self.entity_substitution = entity_substitution
@@ -77,6 +86,17 @@ class Formatter(EntitySubstitution):
             language, cdata_containing_tags, 'cdata_containing_tags'
         )
         self.empty_attributes_are_booleans=empty_attributes_are_booleans
+        if indent is None:
+            indent = 0
+        if isinstance(indent, int):
+            if indent < 0:
+                indent = 0
+            indent = ' ' * indent
+        elif isinstance(indent, str):
+            indent = indent
+        else:
+            indent = ' '
+        self.indent = indent
         
     def substitute(self, ns):
         """Process a string that needs to undergo entity substitution.
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
index 12327ef..84d4e3b 100644
--- a/bs4/tests/test_formatter.py
+++ b/bs4/tests/test_formatter.py
@@ -1,3 +1,5 @@
+import pytest
+
 from bs4.element import Tag
 from bs4.formatter import (
     Formatter,
@@ -24,6 +26,8 @@ class TestFormatter(SoupTest):
         # normally happen.
         tag.attrs = None
         assert [] == formatter.attributes(tag)
+
+        assert ' ' == formatter.indent
         
     def test_sort_attributes(self):
         # Test the ability to override Formatter.attributes() to,
@@ -73,3 +77,37 @@ class TestFormatter(SoupTest):
                 assert b'<option selected=""></option>' == soup.option.encode(formatter='html')
                 assert b'<option selected></option>' == soup.option.encode(formatter='html5')
 
+    @pytest.mark.parametrize(
+        "indent,expect",
+        [
+            (None, '<a>\n<b>\ntext\n</b>\n</a>'),
+            (-1, '<a>\n<b>\ntext\n</b>\n</a>'),
+            (0, '<a>\n<b>\ntext\n</b>\n</a>'),
+            ("", '<a>\n<b>\ntext\n</b>\n</a>'),
+
+            (1, '<a>\n <b>\n  text\n </b>\n</a>'),
+            (2, '<a>\n  <b>\n    text\n  </b>\n</a>'),
+
+            ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
+            ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
+
+            # Some invalid inputs -- the default behavior is used.
+            (object(), '<a>\n <b>\n  text\n </b>\n</a>'),
+            (b'bytes', '<a>\n <b>\n  text\n </b>\n</a>'),
+        ]
+    )
+    def test_indent(self, indent, expect):
+        # Pretty-print a tree with a Formatter set to
+        # indent in a certain way and verify the results.
+        soup = self.soup("<a><b>text</b></a>")
+        formatter = Formatter(indent=indent)
+        assert soup.prettify(formatter=formatter) == expect
+
+        # Pretty-printing only happens with prettify(), not
+        # encode().
+        assert soup.encode(formatter=formatter) != expect
+        
+    def test_default_indent_value(self):
+        formatter = Formatter()
+        assert formatter.indent == ' '
+
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 66bd03e..67251e8 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2331,6 +2331,14 @@ attribute value::
  #  A LINK
  # </a>
 
+Here's a formatter that increases the indentation when pretty-printing::
+
+ formatter = HTMLFormatter(indent=8)
+ print(link_soup.a.prettify(formatter=formatter))
+ # <a href="http://example.com/?foo=val1&bar=val2">
+ #         A link
+ # </a>
+ 
 Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
 more control over the output. For example, Beautiful Soup sorts the
 attributes in every tag by default::
author	Leonard Richardson <leonardr@segfault.org>	2021-12-21 12:57:04 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-12-21 12:57:04 -0500
commit	3ac8524a1263f170ae0a9096d255d3e28aa76340 (patch)
tree	e6aab155135f553f3043a425dcf8e61884091919
parent	792a9e485e1b110534345a4f96fd65099879421e (diff)