diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-12-21 12:57:04 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-12-21 12:57:04 -0500 |
commit | 3ac8524a1263f170ae0a9096d255d3e28aa76340 (patch) | |
tree | e6aab155135f553f3043a425dcf8e61884091919 | |
parent | 792a9e485e1b110534345a4f96fd65099879421e (diff) |
It's now possible to customize the way output is indented by
providing a value for the 'indent' argument to the Formatter
constructor. The 'indent' argument works very similarly to the
argument of the same name in the Python standard library's
json.dump() method. [bug=1955497]
-rw-r--r-- | CHANGELOG | 6 | ||||
-rw-r--r-- | bs4/element.py | 23 | ||||
-rw-r--r-- | bs4/formatter.py | 22 | ||||
-rw-r--r-- | bs4/tests/test_formatter.py | 38 | ||||
-rw-r--r-- | doc/source/index.rst | 8 |
5 files changed, 88 insertions, 9 deletions
@@ -11,6 +11,12 @@ Python 2 was revision 605. to make it possible to treat ruby text specially in get_text() calls. [bug=1941980] +* It's now possible to customize the way output is indented by + providing a value for the 'indent' argument to the Formatter + constructor. The 'indent' argument works very similarly to the + argument of the same name in the Python standard library's + json.dump() method. [bug=1955497] + * If the charset-normalizer Python module (https://pypi.org/project/charset-normalizer/) is installed, Beautiful Soup will use it to detect the character sets of incoming documents. diff --git a/bs4/element.py b/bs4/element.py index c6cb2eb..86123f8 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1614,8 +1614,10 @@ class Tag(PageElement): :param encoding: The destination encoding. :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. :param formatter: A Formatter object, or a string naming one of the standard formatters. :param errors: An error handling strategy such as @@ -1691,7 +1693,7 @@ class Tag(PageElement): space = '' indent_space = '' if indent_level is not None: - indent_space = (' ' * (indent_level - 1)) + indent_space = (formatter.indent * (indent_level - 1)) if pretty_print: space = indent_space indent_contents = indent_level + 1 @@ -1766,8 +1768,10 @@ class Tag(PageElement): """Renders the contents of this tag as a Unicode string. :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. 
:param eventual_encoding: The tag is destined to be encoded into this encoding. decode_contents() is _not_ @@ -1778,6 +1782,7 @@ class Tag(PageElement): :param formatter: A Formatter object, or a string naming one of the standard Formatters. + """ # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. @@ -1800,7 +1805,7 @@ class Tag(PageElement): text = text.strip() if text: if pretty_print and not preserve_whitespace: - s.append(" " * (indent_level - 1)) + s.append(formatter.indent * (indent_level - 1)) s.append(text) if pretty_print and not preserve_whitespace: s.append("\n") @@ -1812,8 +1817,10 @@ class Tag(PageElement): """Renders the contents of this PageElement as a bytestring. :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. :param eventual_encoding: The bytestring will be in this encoding. diff --git a/bs4/formatter.py b/bs4/formatter.py index 3bd9f85..65e57b5 100644 --- a/bs4/formatter.py +++ b/bs4/formatter.py @@ -49,7 +49,7 @@ class Formatter(EntitySubstitution): def __init__( self, language=None, entity_substitution=None, void_element_close_prefix='/', cdata_containing_tags=None, - empty_attributes_are_booleans=False, + empty_attributes_are_booleans=False, indent=1, ): """Constructor. @@ -69,6 +69,15 @@ class Formatter(EntitySubstitution): :param blank_attributes_are_booleans: Render attributes whose value is the empty string as HTML-style boolean attributes. (Attributes whose value is None are always rendered this way.) + + :param indent: If indent is a non-negative integer or string, + then the contents of elements will be indented + appropriately when pretty-printing. 
An indent level of 0, + negative, or "" will only insert newlines. Using a + positive integer indent indents that many spaces per + level. If indent is a string (such as "\t"), that string + is used to indent each level. The default behavior is to + indent one space per level. """ self.language = language self.entity_substitution = entity_substitution @@ -77,6 +86,17 @@ class Formatter(EntitySubstitution): language, cdata_containing_tags, 'cdata_containing_tags' ) self.empty_attributes_are_booleans=empty_attributes_are_booleans + if indent is None: + indent = 0 + if isinstance(indent, int): + if indent < 0: + indent = 0 + indent = ' ' * indent + elif isinstance(indent, str): + indent = indent + else: + indent = ' ' + self.indent = indent def substitute(self, ns): """Process a string that needs to undergo entity substitution. diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py index 12327ef..84d4e3b 100644 --- a/bs4/tests/test_formatter.py +++ b/bs4/tests/test_formatter.py @@ -1,3 +1,5 @@ +import pytest + from bs4.element import Tag from bs4.formatter import ( Formatter, @@ -24,6 +26,8 @@ class TestFormatter(SoupTest): # normally happen. 
tag.attrs = None assert [] == formatter.attributes(tag) + + assert ' ' == formatter.indent def test_sort_attributes(self): # Test the ability to override Formatter.attributes() to, @@ -73,3 +77,37 @@ class TestFormatter(SoupTest): assert b'<option selected=""></option>' == soup.option.encode(formatter='html') assert b'<option selected></option>' == soup.option.encode(formatter='html5') + @pytest.mark.parametrize( + "indent,expect", + [ + (None, '<a>\n<b>\ntext\n</b>\n</a>'), + (-1, '<a>\n<b>\ntext\n</b>\n</a>'), + (0, '<a>\n<b>\ntext\n</b>\n</a>'), + ("", '<a>\n<b>\ntext\n</b>\n</a>'), + + (1, '<a>\n <b>\n text\n </b>\n</a>'), + (2, '<a>\n <b>\n text\n </b>\n</a>'), + + ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'), + ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'), + + # Some invalid inputs -- the default behavior is used. + (object(), '<a>\n <b>\n text\n </b>\n</a>'), + (b'bytes', '<a>\n <b>\n text\n </b>\n</a>'), + ] + ) + def test_indent(self, indent, expect): + # Pretty-print a tree with a Formatter set to + # indent in a certain way and verify the results. + soup = self.soup("<a><b>text</b></a>") + formatter = Formatter(indent=indent) + assert soup.prettify(formatter=formatter) == expect + + # Pretty-printing only happens with prettify(), not + # encode(). + assert soup.encode(formatter=formatter) != expect + + def test_default_indent_value(self): + formatter = Formatter() + assert formatter.indent == ' ' + diff --git a/doc/source/index.rst b/doc/source/index.rst index 66bd03e..67251e8 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2331,6 +2331,14 @@ attribute value:: # A LINK # </a> +Here's a formatter that increases the indentation when pretty-printing:: + + formatter = HTMLFormatter(indent=8) + print(link_soup.a.prettify(formatter=formatter)) + # <a href="http://example.com/?foo=val1&bar=val2"> + # A link + # </a> + Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even more control over the output. 
For example, Beautiful Soup sorts the attributes in every tag by default:: |