diff options
-rw-r--r-- | bs4/element.py | 89 | ||||
-rw-r--r-- | bs4/tests/test_pageelement.py | 24 |
2 files changed, 94 insertions, 19 deletions
diff --git a/bs4/element.py b/bs4/element.py index aaa00fb..80ebbef 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1656,38 +1656,89 @@ class Tag(PageElement): if indent_level is True: indent_level = 0 - string_literal_mode = False + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + for event, element in self._event_stream(iterator): if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): piece = element._format_tag( - eventual_encoding, formatter, opening=True) + eventual_encoding, formatter, opening=True + ) elif event is Tag.END_ELEMENT_EVENT: piece = element._format_tag( - eventual_encoding, formatter, opening=False) + eventual_encoding, formatter, opening=False + ) if indent_level is not None: indent_level -= 1 - string_literal_mode = False else: piece = element.output_ready(formatter) - if isinstance(element, Tag) and not element._should_pretty_print(): - if event is Tag.START_ELEMENT_EVENT: - # After processing this event we will be in string - # literal mode. - string_literal_mode = True - indent_before = True - indent_after = False - else: - # After processing this event we will no longer be - # in string literal mode. - string_literal_mode = False - indent_before = False - indent_after = True - elif string_literal_mode: + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like <pre> and + # <script>, can't be prettified, since adding whitespace would + # change the meaning of the content. + + # When we encounter one of those Tags we need to enter + # what I'm calling "string literal mode". We will stay + # inside string literal mode until that particular Tag is + # closed. + # + # By definition, string literal mode is on when the + # string_literal_tag is set to a Tag. + # + # For each event we process, there are four possibilities: + # + # 1. We are entering string literal mode (e.g. by + # encountering a <pre> tag). In this case we want + # whitespace before the tag but not after. + # + # 2. We are exiting string literal mode (by closing the + # tag that originally put us into string literal + # mode). In this case we want whitespace after the tag + # but not before. + # + # 3. We are in string literal mode and will be staying + # there. We will not be adding whitespace before or + # after this element. + # + # 4. We are outside string literal mode and will be + # staying there. We will be putting whitespace before + # and after this element. + + # The default behavior is to add whitespace before and + # after an element when string literal mode is off, and to + # leave things as they are when string literal mode is on. + if string_literal_tag: indent_before = indent_after = False else: indent_before = indent_after = True + # The only time the behavior is more complex than that is + # when we encounter an opening or closing tag that might + # put us into or out of string literal mode. + if isinstance(element, Tag) and not element._should_pretty_print(): + if event is Tag.END_ELEMENT_EVENT and element is string_literal_tag: + # We are about to exit string literal mode. Add + # whitespace after this tag but not before. + indent_before = False + indent_after = True + string_literal_tag = None + elif event is Tag.START_ELEMENT_EVENT: + if not string_literal_tag: + # We are about to enter string literal mode. + # Add whitespace before this tag but not after. + indent_before = True + indent_after = False + string_literal_tag = element + + # Now we know whether to add whitespace before and/or + # after this element. if indent_level is not None: if (indent_before or indent_after): if isinstance(element, NavigableString): @@ -1763,7 +1814,7 @@ class Tag(PageElement): (a newline) after the string. """ space_before = '' - if indent_before: + if indent_before and indent_level: space_before = (formatter.indent * indent_level) space_after = '' diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index f8eb9bb..a0476e4 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -158,6 +158,30 @@ class TestFormatters(SoupTest): # inside is left alone. assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify() + def test_prettify_handles_nested_string_literal_tags(self): + # Most of this markup is inside a <pre> tag, so prettify() + # only does three things to it: + # 1. Add a newline and a space between the <div> and the <pre> + # 2. Add a newline after the </pre> + # 3. Add a newline at the end. + # + # The contents of the <pre> tag are left completely alone. In + # particular, we don't start adding whitespace again once we + # encounter the first </pre> tag, because we know it's not + # the one that put us into string literal mode. + markup = """<div><pre><code>some +<script><pre>code</pre></script> for you +</code></pre></div>""" + + expect = """<div> + <pre><code>some +<script><pre>code</pre></script> for you +</code></pre> +</div> +""" + soup = self.soup(markup) + assert expect == soup.div.prettify() + def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) |