summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bs4/element.py 89
-rw-r--r-- bs4/tests/test_pageelement.py 24
2 files changed, 94 insertions, 19 deletions
diff --git a/bs4/element.py b/bs4/element.py
index aaa00fb..80ebbef 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1656,38 +1656,89 @@ class Tag(PageElement):
if indent_level is True:
indent_level = 0
- string_literal_mode = False
+ # The currently active tag that put us into string literal
+ # mode. Until this element is closed, children will be treated
+ # as string literals and not pretty-printed. String literal
+ # mode is turned on immediately after this tag begins, and
+ # turned off immediately before it's closed. This means there
+ # will be whitespace before and after the tag itself.
+ string_literal_tag = None
+
for event, element in self._event_stream(iterator):
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
piece = element._format_tag(
- eventual_encoding, formatter, opening=True)
+ eventual_encoding, formatter, opening=True
+ )
elif event is Tag.END_ELEMENT_EVENT:
piece = element._format_tag(
- eventual_encoding, formatter, opening=False)
+ eventual_encoding, formatter, opening=False
+ )
if indent_level is not None:
indent_level -= 1
- string_literal_mode = False
else:
piece = element.output_ready(formatter)
- if isinstance(element, Tag) and not element._should_pretty_print():
- if event is Tag.START_ELEMENT_EVENT:
- # After processing this event we will be in string
- # literal mode.
- string_literal_mode = True
- indent_before = True
- indent_after = False
- else:
- # After processing this event we will no longer be
- # in string literal mode.
- string_literal_mode = False
- indent_before = False
- indent_after = True
- elif string_literal_mode:
+ # Now we need to apply the 'prettiness' -- extra
+ # whitespace before and/or after this tag. This can get
+ # complicated because certain tags, like <pre> and
+ # <script>, can't be prettified, since adding whitespace would
+ # change the meaning of the content.
+
+ # When we encounter one of those Tags we need to enter
+ # what I'm calling "string literal mode". We will stay
+ # inside string literal mode until that particular Tag is
+ # closed.
+ #
+ # By definition, string literal mode is on when the
+ # string_literal_tag is set to a Tag.
+ #
+ # For each event we process, there are four possibilities:
+ #
+ # 1. We are entering string literal mode (e.g. by
+ # encountering a <pre> tag). In this case we want
+ # whitespace before the tag but not after.
+ #
+ # 2. We are exiting string literal mode (by closing the
+ # tag that originally put us into string literal
+ # mode). In this case we want whitespace after the tag
+ # but not before.
+ #
+ # 3. We are in string literal mode and will be staying
+ # there. We will not be adding whitespace before or
+ # after this element.
+ #
+ # 4. We are outside string literal mode and will be
+ # staying there. We will be putting whitespace before
+ # and after this element.
+
+ # The default behavior is to add whitespace before and
+ # after an element when string literal mode is off, and to
+ # leave things as they are when string literal mode is on.
+ if string_literal_tag:
indent_before = indent_after = False
else:
indent_before = indent_after = True
+ # The only time the behavior is more complex than that is
+ # when we encounter an opening or closing tag that might
+ # put us into or out of string literal mode.
+ if isinstance(element, Tag) and not element._should_pretty_print():
+ if event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
+ # We are about to exit string literal mode. Add
+ # whitespace after this tag but not before.
+ indent_before = False
+ indent_after = True
+ string_literal_tag = None
+ elif event is Tag.START_ELEMENT_EVENT:
+ if not string_literal_tag:
+ # We are about to enter string literal mode.
+ # Add whitespace before this tag but not after.
+ indent_before = True
+ indent_after = False
+ string_literal_tag = element
+
+ # Now we know whether to add whitespace before and/or
+ # after this element.
if indent_level is not None:
if (indent_before or indent_after):
if isinstance(element, NavigableString):
@@ -1763,7 +1814,7 @@ class Tag(PageElement):
(a newline) after the string.
"""
space_before = ''
- if indent_before:
+ if indent_before and indent_level:
space_before = (formatter.indent * indent_level)
space_after = ''
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index f8eb9bb..a0476e4 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -158,6 +158,30 @@ class TestFormatters(SoupTest):
# inside is left alone.
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
+ def test_prettify_handles_nested_string_literal_tags(self):
+ # Most of this markup is inside a <pre> tag, so prettify()
+ # only does three things to it:
+ # 1. Add a newline and a space between the <div> and the <pre>
+ # 2. Add a newline after the </pre>
+ # 3. Add a newline at the end.
+ #
+ # The contents of the <pre> tag are left completely alone. In
+ # particular, we don't start adding whitespace again once we
+ # encounter the first </pre> tag, because we know it's not
+ # the one that put us into string literal mode.
+ markup = """<div><pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre></div>"""
+
+ expect = """<div>
+ <pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre>
+</div>
+"""
+ soup = self.soup(markup)
+ assert expect == soup.div.prettify()
+
def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())