summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2021-12-21 12:57:04 -0500
committerLeonard Richardson <leonardr@segfault.org>2021-12-21 12:57:04 -0500
commit3ac8524a1263f170ae0a9096d255d3e28aa76340 (patch)
treee6aab155135f553f3043a425dcf8e61884091919
parent792a9e485e1b110534345a4f96fd65099879421e (diff)
It's now possible to customize the way output is indented by
providing a value for the 'indent' argument to the Formatter constructor. The 'indent' argument works very similarly to the argument of the same name in the Python standard library's json.dump() method. [bug=1955497]
-rw-r--r--CHANGELOG6
-rw-r--r--bs4/element.py23
-rw-r--r--bs4/formatter.py22
-rw-r--r--bs4/tests/test_formatter.py38
-rw-r--r--doc/source/index.rst8
5 files changed, 88 insertions, 9 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d235cf4..8ac1f2a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,12 @@ Python 2 was revision 605.
to make it possible to treat ruby text specially in get_text() calls.
[bug=1941980]
+* It's now possible to customize the way output is indented by
+ providing a value for the 'indent' argument to the Formatter
+ constructor. The 'indent' argument works very similarly to the
+ argument of the same name in the Python standard library's
+ json.dump() method. [bug=1955497]
+
* If the charset-normalizer Python module
(https://pypi.org/project/charset-normalizer/) is installed, Beautiful
Soup will use it to detect the character sets of incoming documents.
diff --git a/bs4/element.py b/bs4/element.py
index c6cb2eb..86123f8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1614,8 +1614,10 @@ class Tag(PageElement):
:param encoding: The destination encoding.
:param indent_level: Each line of the rendering will be
- indented this many spaces. Used internally in
- recursive calls while pretty-printing.
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
:param errors: An error handling strategy such as
@@ -1691,7 +1693,7 @@ class Tag(PageElement):
space = ''
indent_space = ''
if indent_level is not None:
- indent_space = (' ' * (indent_level - 1))
+ indent_space = (formatter.indent * (indent_level - 1))
if pretty_print:
space = indent_space
indent_contents = indent_level + 1
@@ -1766,8 +1768,10 @@ class Tag(PageElement):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
- indented this many spaces. Used internally in
- recursive calls while pretty-printing.
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. decode_contents() is _not_
@@ -1778,6 +1782,7 @@ class Tag(PageElement):
:param formatter: A Formatter object, or a string naming one of
the standard Formatters.
+
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
@@ -1800,7 +1805,7 @@ class Tag(PageElement):
text = text.strip()
if text:
if pretty_print and not preserve_whitespace:
- s.append(" " * (indent_level - 1))
+ s.append(formatter.indent * (indent_level - 1))
s.append(text)
if pretty_print and not preserve_whitespace:
s.append("\n")
@@ -1812,8 +1817,10 @@ class Tag(PageElement):
"""Renders the contents of this PageElement as a bytestring.
:param indent_level: Each line of the rendering will be
- indented this many spaces. Used internally in
- recursive calls while pretty-printing.
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
:param eventual_encoding: The bytestring will be in this encoding.
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 3bd9f85..65e57b5 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -49,7 +49,7 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
- empty_attributes_are_booleans=False,
+ empty_attributes_are_booleans=False, indent=1,
):
"""Constructor.
@@ -69,6 +69,15 @@ class Formatter(EntitySubstitution):
:param empty_attributes_are_booleans: Render attributes whose value
is the empty string as HTML-style boolean attributes.
(Attributes whose value is None are always rendered this way.)
+
+ :param indent: If indent is a non-negative integer or string,
+ then the contents of elements will be indented
+ appropriately when pretty-printing. An indent level of 0,
+ negative, or "" will only insert newlines. Using a
+ positive integer indent indents that many spaces per
+ level. If indent is a string (such as "\t"), that string
+ is used to indent each level. The default behavior is to
+ indent one space per level.
"""
self.language = language
self.entity_substitution = entity_substitution
@@ -77,6 +86,17 @@ class Formatter(EntitySubstitution):
language, cdata_containing_tags, 'cdata_containing_tags'
)
self.empty_attributes_are_booleans=empty_attributes_are_booleans
+ if indent is None:
+ indent = 0
+ if isinstance(indent, int):
+ if indent < 0:
+ indent = 0
+ indent = ' ' * indent
+ elif isinstance(indent, str):
+ indent = indent
+ else:
+ indent = ' '
+ self.indent = indent
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
index 12327ef..84d4e3b 100644
--- a/bs4/tests/test_formatter.py
+++ b/bs4/tests/test_formatter.py
@@ -1,3 +1,5 @@
+import pytest
+
from bs4.element import Tag
from bs4.formatter import (
Formatter,
@@ -24,6 +26,8 @@ class TestFormatter(SoupTest):
# normally happen.
tag.attrs = None
assert [] == formatter.attributes(tag)
+
+ assert ' ' == formatter.indent
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
@@ -73,3 +77,37 @@ class TestFormatter(SoupTest):
assert b'<option selected=""></option>' == soup.option.encode(formatter='html')
assert b'<option selected></option>' == soup.option.encode(formatter='html5')
+ @pytest.mark.parametrize(
+ "indent,expect",
+ [
+ (None, '<a>\n<b>\ntext\n</b>\n</a>'),
+ (-1, '<a>\n<b>\ntext\n</b>\n</a>'),
+ (0, '<a>\n<b>\ntext\n</b>\n</a>'),
+ ("", '<a>\n<b>\ntext\n</b>\n</a>'),
+
+ (1, '<a>\n <b>\n  text\n </b>\n</a>'),
+ (2, '<a>\n  <b>\n    text\n  </b>\n</a>'),
+
+ ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
+ ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
+
+ # Some invalid inputs -- the default behavior is used.
+ (object(), '<a>\n <b>\n  text\n </b>\n</a>'),
+ (b'bytes', '<a>\n <b>\n  text\n </b>\n</a>'),
+ ]
+ )
+ def test_indent(self, indent, expect):
+ # Pretty-print a tree with a Formatter set to
+ # indent in a certain way and verify the results.
+ soup = self.soup("<a><b>text</b></a>")
+ formatter = Formatter(indent=indent)
+ assert soup.prettify(formatter=formatter) == expect
+
+ # Pretty-printing only happens with prettify(), not
+ # encode().
+ assert soup.encode(formatter=formatter) != expect
+
+ def test_default_indent_value(self):
+ formatter = Formatter()
+ assert formatter.indent == ' '
+
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 66bd03e..67251e8 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2331,6 +2331,14 @@ attribute value::
# A LINK
# </a>
+Here's a formatter that increases the indentation when pretty-printing::
+
+ formatter = HTMLFormatter(indent=8)
+ print(link_soup.a.prettify(formatter=formatter))
+ # <a href="http://example.com/?foo=val1&bar=val2">
+ #         A link
+ # </a>
+
Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
more control over the output. For example, Beautiful Soup sorts the
attributes in every tag by default::