From 3be39f46ec502fe20d5a95ed3292d0dccd3b1aec Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 21 Mar 2023 11:27:08 -0400 Subject: Reimplemented the pretty-print algorithm to remove recursive function calls. --- bs4/element.py | 240 +++++++++++++++++++++++++++--------------- bs4/tests/__init__.py | 6 +- bs4/tests/test_formatter.py | 20 ++-- bs4/tests/test_pageelement.py | 2 +- 4 files changed, 168 insertions(+), 100 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 1dd5984..bcfad08 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1642,108 +1642,176 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. + def str_from_sax(self): + data = [] + for e, content in self.saxlike(): + if e == 'string': + data.append(content) + elif e in ( 'openclose', 'open'): + data.append("<%s>" % content.name) + elif e == 'close': + data.append("" % content.name) + return "".join(data) - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. + @property + def self_and_descendants(self): + if not self.hidden: + yield self + for i in self.descendants: + yield i + + def saxlike(self): + """Yield a sequence of SAX-like events that can be used to + reconstruct this parse tree. + + This lets us recreate the nested structure of the document + without using recursive method calls. """ + tag_stack = [] + + for c in self.self_and_descendants: + + # If the parent of the element we're about to yield is not + # the tag currently on the stack, it means that the tag on + # the stack closed before this element appeared. + while tag_stack and c.parent != tag_stack[-1]: + now_closed_tag = tag_stack.pop() + yield "close", now_closed_tag + + if isinstance(c, Tag): + if c.is_empty_element: + yield "openclose", c + else: + yield "open", c + tag_stack.append(c) + continue + else: + yield "string", c + + while tag_stack: + now_closed_tag = tag_stack.pop() + prettyprint_suppressed_by = None + yield "close", now_closed_tag + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and # over again. if not isinstance(formatter, Formatter): formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key + + if indent_level is True: + indent_level = 0 + + string_literal_mode = False + for event, element in self.saxlike(): + if event in ('open', 'openclose'): + piece = element._format_tag( + eventual_encoding, formatter, opening=True) + elif event == 'close': + piece = element._format_tag( + eventual_encoding, formatter, opening=False) + if indent_level is not None: + indent_level -= 1 + string_literal_mode = False else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) - - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' + piece = element.output_ready(formatter) + + if isinstance(element, Tag) and not element._should_pretty_print(): + if event in ('open', 'openclose'): + # After processing this event we will be in string + # literal mode. + string_literal_mode = True + indent_before = True + indent_after = False + else: + # After processing this event we will no longer be + # in string literal mode. + string_literal_mode = False + indent_before = False + indent_after = True + elif string_literal_mode: + indent_before = indent_after = False + else: + indent_before = indent_after = True + if indent_level is not None: + if (indent_before or indent_after): + if isinstance(element, NavigableString): + piece = piece.strip() + piece = self._indent( + piece, element, indent_level, formatter, + indent_before, indent_after + ) + if event == 'open': + indent_level += 1 + pieces.append(piece) + return "".join(pieces) + + def _indent(self, s, e, indent_level, formatter, indent_before, indent_after): + space_before = '' + if indent_before: + space_before = (formatter.indent * indent_level) + + space_after = '' + if indent_after: + space_after = "\n" + + return space_before + s + space_after + + def _format_tag(self, eventual_encoding, formatter, opening): + # A tag starts with the < character. + + # Then the / character, if this is a closing tag. + closing_slash = '' + if not opening: + closing_slash = '/' + + # Then an optional namespace prefix. prefix = '' if self.prefix: prefix = self.prefix + ":" - if self.is_empty_element: - close = formatter.void_element_close_prefix or '' - else: - closeTag = '' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (formatter.indent * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter - ) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' + # Then a list of attribute values, if this is an opening tag. + attribute_string = '' + if opening: + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) if attrs: attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s - - def _should_pretty_print(self, indent_level): + + # Then an optional closing slash (for a void element in an + # XML document). + void_element_closing_slash = '' + if self.is_empty_element: + void_element_closing_slash = formatter.void_element_close_prefix or '' + + # Put it all together. + return f'<{closing_slash}{prefix}{self.name}{attribute_string}{void_element_closing_slash}>' + + def _should_pretty_print(self, indent_level=1): """Should this tag be pretty-printed? Most of them should, but some (such as
 in HTML
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index d8b3b9b..dbb1593 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -551,8 +551,8 @@ Hello, world!
         """Whitespace must be preserved in 
 and "
+        pre_markup = "
a   z
\n" + textarea_markup = "\n" self.assert_soup(pre_markup) self.assert_soup(textarea_markup) @@ -563,7 +563,7 @@ Hello, world! assert soup.textarea.prettify() == textarea_markup soup = self.soup("") - assert soup.textarea.prettify() == "" + assert soup.textarea.prettify() == "\n" def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py index 84d4e3b..528b16d 100644 --- a/bs4/tests/test_formatter.py +++ b/bs4/tests/test_formatter.py @@ -80,20 +80,20 @@ class TestFormatter(SoupTest): @pytest.mark.parametrize( "indent,expect", [ - (None, '\n\ntext\n\n'), - (-1, '\n\ntext\n\n'), - (0, '\n\ntext\n\n'), - ("", '\n\ntext\n\n'), + (None, '\n\ntext\n\n\n'), + (-1, '\n\ntext\n\n\n'), + (0, '\n\ntext\n\n\n'), + ("", '\n\ntext\n\n\n'), - (1, '\n \n text\n \n'), - (2, '\n \n text\n \n'), + (1, '\n \n text\n \n\n'), + (2, '\n \n text\n \n\n'), - ("\t", '\n\t\n\t\ttext\n\t\n'), - ('abc', '\nabc\nabcabctext\nabc\n'), + ("\t", '\n\t\n\t\ttext\n\t\n\n'), + ('abc', '\nabc\nabcabctext\nabc\n\n'), # Some invalid inputs -- the default behavior is used. - (object(), '\n \n text\n \n'), - (b'bytes', '\n \n text\n \n'), + (object(), '\n \n text\n \n\n'), + (b'bytes', '\n \n text\n \n\n'), ] ) def test_indent(self, indent, expect): diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index a94280f..f8eb9bb 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -156,7 +156,7 @@ class TestFormatters(SoupTest): soup = self.soup("
foo
  \tbar\n  \n  
baz
") # Everything outside the
 tag is reformatted, but everything
         # inside is left alone.
-        assert '
\n foo\n
  \tbar\n  \n  
\n baz\n \n
' == soup.div.prettify() + assert '
\n foo\n
  \tbar\n  \n  
\n baz\n \n
\n' == soup.div.prettify() def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("foo", 'html.parser') -- cgit v1.2.3 From 74e53f0c5997c8b28f449dcc038f928b50b1314e Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 21 Mar 2023 11:38:27 -0400 Subject: Removed old implementation code. --- bs4/element.py | 61 ++++++++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index bcfad08..8d6251c 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1660,23 +1660,31 @@ class Tag(PageElement): for i in self.descendants: yield i - def saxlike(self): - """Yield a sequence of SAX-like events that can be used to - reconstruct this parse tree. + CLOSE_EVENT = object() + + def saxlike(self, iterator=None): + """Yield a sequence of SAX-like events that can be used to reconstruct + the DOM for this element. - This lets us recreate the nested structure of the document - without using recursive method calls. + This lets us recreate the nested structure of this element + (e.g. when formatting as a string) without using recursive + method calls. + + :param iterator: An alternate iterator to use when traversing + the tree. """ tag_stack = [] - for c in self.self_and_descendants: + iterator = iterator or self.self_and_descendants + + for c in iterator: # If the parent of the element we're about to yield is not # the tag currently on the stack, it means that the tag on # the stack closed before this element appeared. while tag_stack and c.parent != tag_stack[-1]: now_closed_tag = tag_stack.pop() - yield "close", now_closed_tag + yield self.CLOSE_EVENT, now_closed_tag if isinstance(c, Tag): if c.is_empty_element: @@ -1691,11 +1699,12 @@ class Tag(PageElement): while tag_stack: now_closed_tag = tag_stack.pop() prettyprint_suppressed_by = None - yield "close", now_closed_tag + yield self.CLOSE_EVENT, now_closed_tag def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal", + iterator=None): pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and @@ -1707,11 +1716,11 @@ class Tag(PageElement): indent_level = 0 string_literal_mode = False - for event, element in self.saxlike(): + for event, element in self.saxlike(iterator): if event in ('open', 'openclose'): piece = element._format_tag( eventual_encoding, formatter, opening=True) - elif event == 'close': + elif event == self.CLOSE_EVENT: piece = element._format_tag( eventual_encoding, formatter, opening=False) if indent_level is not None: @@ -1862,32 +1871,8 @@ class Tag(PageElement): the standard Formatters. """ - # First off, turn a string formatter into a Formatter object. This - # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - - pretty_print = (indent_level is not None) - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(formatter) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - formatter)) - preserve_whitespace = ( - self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags - ) - if text and indent_level and not preserve_whitespace: - text = text.strip() - if text: - if pretty_print and not preserve_whitespace: - s.append(formatter.indent * (indent_level - 1)) - s.append(text) - if pretty_print and not preserve_whitespace: - s.append("\n") - return ''.join(s) + return self.decode(indent_level, eventual_encoding, formatter, + iterator=self.descendants) def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, -- cgit v1.2.3 From 484a003315c822eb0a52b8e498a682da2b93bfd4 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 21 Mar 2023 12:33:55 -0400 Subject: Reorganize code and rename saxlike, since this isn' --- bs4/element.py | 148 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 80 insertions(+), 68 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 8d6251c..7bbb7fc 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1642,65 +1642,6 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def str_from_sax(self): - data = [] - for e, content in self.saxlike(): - if e == 'string': - data.append(content) - elif e in ( 'openclose', 'open'): - data.append("<%s>" % content.name) - elif e == 'close': - data.append("" % content.name) - return "".join(data) - - @property - def self_and_descendants(self): - if not self.hidden: - yield self - for i in self.descendants: - yield i - - CLOSE_EVENT = object() - - def saxlike(self, iterator=None): - """Yield a sequence of SAX-like events that can be used to reconstruct - the DOM for this element. - - This lets us recreate the nested structure of this element - (e.g. when formatting as a string) without using recursive - method calls. - - :param iterator: An alternate iterator to use when traversing - the tree. - """ - tag_stack = [] - - iterator = iterator or self.self_and_descendants - - for c in iterator: - - # If the parent of the element we're about to yield is not - # the tag currently on the stack, it means that the tag on - # the stack closed before this element appeared. - while tag_stack and c.parent != tag_stack[-1]: - now_closed_tag = tag_stack.pop() - yield self.CLOSE_EVENT, now_closed_tag - - if isinstance(c, Tag): - if c.is_empty_element: - yield "openclose", c - else: - yield "open", c - tag_stack.append(c) - continue - else: - yield "string", c - - while tag_stack: - now_closed_tag = tag_stack.pop() - prettyprint_suppressed_by = None - yield self.CLOSE_EVENT, now_closed_tag - def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal", @@ -1716,11 +1657,11 @@ class Tag(PageElement): indent_level = 0 string_literal_mode = False - for event, element in self.saxlike(iterator): - if event in ('open', 'openclose'): + for event, element in self._event_stream(iterator): + if event in (self.START_ELEMENT_EVENT, self.EMPTY_ELEMENT_EVENT): piece = element._format_tag( eventual_encoding, formatter, opening=True) - elif event == self.CLOSE_EVENT: + elif event is self.END_ELEMENT_EVENT: piece = element._format_tag( eventual_encoding, formatter, opening=False) if indent_level is not None: @@ -1730,7 +1671,7 @@ class Tag(PageElement): piece = element.output_ready(formatter) if isinstance(element, Tag) and not element._should_pretty_print(): - if event in ('open', 'openclose'): + if event is self.START_ELEMENT_EVENT: # After processing this event we will be in string # literal mode. string_literal_mode = True @@ -1751,16 +1692,75 @@ class Tag(PageElement): if (indent_before or indent_after): if isinstance(element, NavigableString): piece = piece.strip() - piece = self._indent( - piece, element, indent_level, formatter, + piece = self._indent_string( + piece, indent_level, formatter, indent_before, indent_after ) - if event == 'open': + if event == self.START_ELEMENT_EVENT: indent_level += 1 pieces.append(piece) return "".join(pieces) - def _indent(self, s, e, indent_level, formatter, indent_before, indent_after): + # Names for the different events yielded by _event_stream + START_ELEMENT_EVENT = object() + END_ELEMENT_EVENT = object() + VOID_ELEMENT_EVENT = object() + STRING_ELEMENT_EVENT = object() + + def _event_stream(self, iterator=None): + """Yield a sequence of events that can be used to reconstruct the DOM + for this element. + + This lets us recreate the nested structure of this element + (e.g. when formatting it as a string) without using recursive + method calls. + + This is similar in concept to the SAX API, but it's a simpler + interface designed for internal use. The events are different + from SAX and the arguments associated with the events are Tags + and other Beautiful Soup objects. + + :param iterator: An alternate iterator to use when traversing + the tree. + """ + tag_stack = [] + + iterator = iterator or self.self_and_descendants + + for c in iterator: + # If the parent of the element we're about to yield is not + # the tag currently on the stack, it means that the tag on + # the stack closed before this element appeared. + while tag_stack and c.parent != tag_stack[-1]: + now_closed_tag = tag_stack.pop() + yield self.END_ELEMENT_EVENT, now_closed_tag + + if isinstance(c, Tag): + if c.is_empty_element: + yield self.EMPTY_ELEMENT_EVENT, c + else: + yield self.START_ELEMENT_EVENT, c + tag_stack.append(c) + continue + else: + yield self.STRING_ELEMENT_EVENT, c + + while tag_stack: + now_closed_tag = tag_stack.pop() + yield self.END_ELEMENT_EVENT, now_closed_tag + + def _indent_string(self, s, indent_level, formatter, + indent_before, indent_after): + """Add indentation whitespace before and/or after a string. + + :param s: The string to amend with whitespace. + :param indent_level: The indentation level; affects how much + whitespace goes before the string. + :param indent_before: Whether or not to add whitespace + before the string. + :param indent_after: Whether or not to add whitespace + (a newline) after the string. + """ space_before = '' if indent_before: space_before = (formatter.indent * indent_level) @@ -1772,7 +1772,7 @@ class Tag(PageElement): return space_before + s + space_after def _format_tag(self, eventual_encoding, formatter, opening): - # A tag starts with the < character. + # A tag starts with the < character (see below). # Then the / character, if this is a closing tag. closing_slash = '' @@ -1969,6 +1969,18 @@ class Tag(PageElement): # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. + @property + def self_and_descendants(self): + """Iterate over this PageElement and its children in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ + if not self.hidden: + yield self + for i in self.descendants: + yield i + @property def descendants(self): """Iterate over all children of this PageElement in a -- cgit v1.2.3 From c91087b78b3584b1e696056bc2ad14e34ebd689e Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 23 Mar 2023 15:25:32 -0400 Subject: Bump version number preemptively. --- bs4/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index 9a76a15..01fca6d 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.12.0" +__version__ = "4.12.1" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" -- cgit v1.2.3 From a342497cb81f01384d61e467daf91540369d4fc3 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 23 Mar 2023 16:59:27 -0400 Subject: Found and removed accidental calls to find(), greatly improving performance. --- bs4/__init__.py | 1 + bs4/element.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index 01fca6d..5e1bebe 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -469,6 +469,7 @@ class BeautifulSoup(Tag): self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] self.string_container_stack = [] + self._most_recent_element = None self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, diff --git a/bs4/element.py b/bs4/element.py index 7bbb7fc..4f1372a 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1658,10 +1658,10 @@ class Tag(PageElement): string_literal_mode = False for event, element in self._event_stream(iterator): - if event in (self.START_ELEMENT_EVENT, self.EMPTY_ELEMENT_EVENT): + if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): piece = element._format_tag( eventual_encoding, formatter, opening=True) - elif event is self.END_ELEMENT_EVENT: + elif event is Tag.END_ELEMENT_EVENT: piece = element._format_tag( eventual_encoding, formatter, opening=False) if indent_level is not None: @@ -1671,7 +1671,7 @@ class Tag(PageElement): piece = element.output_ready(formatter) if isinstance(element, Tag) and not element._should_pretty_print(): - if event is self.START_ELEMENT_EVENT: + if event is Tag.START_ELEMENT_EVENT: # After processing this event we will be in string # literal mode. string_literal_mode = True @@ -1696,7 +1696,7 @@ class Tag(PageElement): piece, indent_level, formatter, indent_before, indent_after ) - if event == self.START_ELEMENT_EVENT: + if event == Tag.START_ELEMENT_EVENT: indent_level += 1 pieces.append(piece) return "".join(pieces) @@ -1704,7 +1704,7 @@ class Tag(PageElement): # Names for the different events yielded by _event_stream START_ELEMENT_EVENT = object() END_ELEMENT_EVENT = object() - VOID_ELEMENT_EVENT = object() + EMPTY_ELEMENT_EVENT = object() STRING_ELEMENT_EVENT = object() def _event_stream(self, iterator=None): @@ -1733,21 +1733,21 @@ class Tag(PageElement): # the stack closed before this element appeared. while tag_stack and c.parent != tag_stack[-1]: now_closed_tag = tag_stack.pop() - yield self.END_ELEMENT_EVENT, now_closed_tag + yield Tag.END_ELEMENT_EVENT, now_closed_tag if isinstance(c, Tag): if c.is_empty_element: - yield self.EMPTY_ELEMENT_EVENT, c + yield Tag.EMPTY_ELEMENT_EVENT, c else: - yield self.START_ELEMENT_EVENT, c + yield Tag.START_ELEMENT_EVENT, c tag_stack.append(c) continue else: - yield self.STRING_ELEMENT_EVENT, c + yield Tag.STRING_ELEMENT_EVENT, c while tag_stack: now_closed_tag = tag_stack.pop() - yield self.END_ELEMENT_EVENT, now_closed_tag + yield Tag.END_ELEMENT_EVENT, now_closed_tag def _indent_string(self, s, indent_level, formatter, indent_before, indent_after): -- cgit v1.2.3 From 5003f474b26505c9bfc6c44d1f78af76ff8f2634 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 24 Mar 2023 09:44:12 -0400 Subject: Using a format string is very slightly slower than just adding all the bits of the string together. --- bs4/element.py | 2 +- bs4/formatter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 4f1372a..630fd96 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1818,7 +1818,7 @@ class Tag(PageElement): void_element_closing_slash = formatter.void_element_close_prefix or '' # Put it all together. - return f'<{closing_slash}{prefix}{self.name}{attribute_string}{void_element_closing_slash}>' + return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' def _should_pretty_print(self, indent_level=1): """Should this tag be pretty-printed? diff --git a/bs4/formatter.py b/bs4/formatter.py index 83cc1c5..c821318 100644 --- a/bs4/formatter.py +++ b/bs4/formatter.py @@ -97,7 +97,7 @@ class Formatter(EntitySubstitution): else: indent = ' ' self.indent = indent - + def substitute(self, ns): """Process a string that needs to undergo entity substitution. This may be a string encountered in an attribute value or as -- cgit v1.2.3 From 0917e9326f7b7f5e7d7c43180049ddc5eced8a9c Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 24 Mar 2023 10:00:58 -0400 Subject: Don't indent an empty string. 1084 of 1474 test documents now give identical results between versions. --- bs4/element.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 630fd96..aaa00fb 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1692,10 +1692,11 @@ class Tag(PageElement): if (indent_before or indent_after): if isinstance(element, NavigableString): piece = piece.strip() - piece = self._indent_string( - piece, indent_level, formatter, - indent_before, indent_after - ) + if piece: + piece = self._indent_string( + piece, indent_level, formatter, + indent_before, indent_after + ) if event == Tag.START_ELEMENT_EVENT: indent_level += 1 pieces.append(piece) -- cgit v1.2.3 From c3a7983ee092cb0b185c323e974404dd623878a9 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 24 Mar 2023 14:14:46 -0400 Subject: Keep track of the specific tag that put us into string literal mode, and only exit when that particular tag is closed. --- bs4/element.py | 89 ++++++++++++++++++++++++++++++++++--------- bs4/tests/test_pageelement.py | 24 ++++++++++++ 2 files changed, 94 insertions(+), 19 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index aaa00fb..80ebbef 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1656,38 +1656,89 @@ class Tag(PageElement): if indent_level is True: indent_level = 0 - string_literal_mode = False + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + for event, element in self._event_stream(iterator): if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): piece = element._format_tag( - eventual_encoding, formatter, opening=True) + eventual_encoding, formatter, opening=True + ) elif event is Tag.END_ELEMENT_EVENT: piece = element._format_tag( - eventual_encoding, formatter, opening=False) + eventual_encoding, formatter, opening=False + ) if indent_level is not None: indent_level -= 1 - string_literal_mode = False else: piece = element.output_ready(formatter) - if isinstance(element, Tag) and not element._should_pretty_print(): - if event is Tag.START_ELEMENT_EVENT: - # After processing this event we will be in string - # literal mode. - string_literal_mode = True - indent_before = True - indent_after = False - else: - # After processing this event we will no longer be - # in string literal mode. - string_literal_mode = False - indent_before = False - indent_after = True - elif string_literal_mode: + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like
 and
+            #  for you 
+
""" + + expect = """
+
some
+ for you 
+
+
+""" + soup = self.soup(markup) + assert expect == soup.div.prettify() + def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("foo", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) -- cgit v1.2.3 From 2236d4acae21d9c5595924902134e5072648c29c Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 24 Mar 2023 14:22:11 -0400 Subject: Simplified the rules for going in and out of string_literal_tag, so less documentation in comments is necessary. --- bs4/element.py | 59 ++++++++++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index 80ebbef..daffec3 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1684,33 +1684,6 @@ class Tag(PageElement): #