diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 3 | ||||
-rw-r--r-- | bs4/element.py | 320 | ||||
-rw-r--r-- | bs4/formatter.py | 2 | ||||
-rw-r--r-- | bs4/tests/__init__.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_formatter.py | 20 | ||||
-rw-r--r-- | bs4/tests/test_pageelement.py | 37 |
6 files changed, 259 insertions, 129 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 9a76a15..5e1bebe 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.12.0" +__version__ = "4.12.1" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -469,6 +469,7 @@ class BeautifulSoup(Tag): self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] self.string_container_stack = [] + self._most_recent_element = None self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, diff --git a/bs4/element.py b/bs4/element.py index 1dd5984..daffec3 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1644,106 +1644,212 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - """ - + formatter="minimal", + iterator=None): + pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and # over again. if not isinstance(formatter, Formatter): formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key + + if indent_level is True: + indent_level = 0 + + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + + for event, element in self._event_stream(iterator): + if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): + piece = element._format_tag( + eventual_encoding, formatter, opening=True + ) + elif event is Tag.END_ELEMENT_EVENT: + piece = element._format_tag( + eventual_encoding, formatter, opening=False + ) + if indent_level is not None: + indent_level -= 1 + else: + piece = element.output_ready(formatter) + + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like <pre> and + # <script>, can't be prettified, since adding whitespace would + # change the meaning of the content. + + # The default behavior is to add whitespace before and + # after an element when string literal mode is off, and to + # leave things as they are when string literal mode is on. + if string_literal_tag: + indent_before = indent_after = False else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) - - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' + indent_before = indent_after = True + + # The only time the behavior is more complex than that is + # when we encounter an opening or closing tag that might + # put us into or out of string literal mode. + if (event is Tag.START_ELEMENT_EVENT + and not string_literal_tag + and not element._should_pretty_print()): + # We are about to enter string literal mode. Add + # whitespace before this tag, but not after. We + # will stay in string literal mode until this tag + # is closed. + indent_before = True + indent_after = False + string_literal_tag = element + elif (event is Tag.END_ELEMENT_EVENT + and element is string_literal_tag): + # We are about to exit string literal mode by closing + # the tag that sent us into that mode. Add whitespace + # after this tag, but not before. + indent_before = False + indent_after = True + string_literal_tag = None + + # Now we know whether to add whitespace before and/or + # after this element. + if indent_level is not None: + if (indent_before or indent_after): + if isinstance(element, NavigableString): + piece = piece.strip() + if piece: + piece = self._indent_string( + piece, indent_level, formatter, + indent_before, indent_after + ) + if event == Tag.START_ELEMENT_EVENT: + indent_level += 1 + pieces.append(piece) + return "".join(pieces) + + # Names for the different events yielded by _event_stream + START_ELEMENT_EVENT = object() + END_ELEMENT_EVENT = object() + EMPTY_ELEMENT_EVENT = object() + STRING_ELEMENT_EVENT = object() + + def _event_stream(self, iterator=None): + """Yield a sequence of events that can be used to reconstruct the DOM + for this element. + + This lets us recreate the nested structure of this element + (e.g. when formatting it as a string) without using recursive + method calls. + + This is similar in concept to the SAX API, but it's a simpler + interface designed for internal use. The events are different + from SAX and the arguments associated with the events are Tags + and other Beautiful Soup objects. + + :param iterator: An alternate iterator to use when traversing + the tree. + """ + tag_stack = [] + + iterator = iterator or self.self_and_descendants + + for c in iterator: + # If the parent of the element we're about to yield is not + # the tag currently on the stack, it means that the tag on + # the stack closed before this element appeared. + while tag_stack and c.parent != tag_stack[-1]: + now_closed_tag = tag_stack.pop() + yield Tag.END_ELEMENT_EVENT, now_closed_tag + + if isinstance(c, Tag): + if c.is_empty_element: + yield Tag.EMPTY_ELEMENT_EVENT, c + else: + yield Tag.START_ELEMENT_EVENT, c + tag_stack.append(c) + continue + else: + yield Tag.STRING_ELEMENT_EVENT, c + + while tag_stack: + now_closed_tag = tag_stack.pop() + yield Tag.END_ELEMENT_EVENT, now_closed_tag + + def _indent_string(self, s, indent_level, formatter, + indent_before, indent_after): + """Add indentation whitespace before and/or after a string. + + :param s: The string to amend with whitespace. + :param indent_level: The indentation level; affects how much + whitespace goes before the string. + :param indent_before: Whether or not to add whitespace + before the string. + :param indent_after: Whether or not to add whitespace + (a newline) after the string. + """ + space_before = '' + if indent_before and indent_level: + space_before = (formatter.indent * indent_level) + + space_after = '' + if indent_after: + space_after = "\n" + + return space_before + s + space_after + + def _format_tag(self, eventual_encoding, formatter, opening): + # A tag starts with the < character (see below). + + # Then the / character, if this is a closing tag. + closing_slash = '' + if not opening: + closing_slash = '/' + # Then an optional namespace prefix. prefix = '' if self.prefix: prefix = self.prefix + ":" - if self.is_empty_element: - close = formatter.void_element_close_prefix or '' - else: - closeTag = '</%s%s>' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (formatter.indent * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter - ) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' + # Then a list of attribute values, if this is an opening tag. + attribute_string = '' + if opening: + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) if attrs: attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s - - def _should_pretty_print(self, indent_level): + + # Then an optional closing slash (for a void element in an + # XML document). + void_element_closing_slash = '' + if self.is_empty_element: + void_element_closing_slash = formatter.void_element_close_prefix or '' + + # Put it all together. + return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' + + def _should_pretty_print(self, indent_level=1): """Should this tag be pretty-printed? Most of them should, but some (such as <pre> in HTML @@ -1794,32 +1900,8 @@ class Tag(PageElement): the standard Formatters. """ - # First off, turn a string formatter into a Formatter object. This - # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - - pretty_print = (indent_level is not None) - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(formatter) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - formatter)) - preserve_whitespace = ( - self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags - ) - if text and indent_level and not preserve_whitespace: - text = text.strip() - if text: - if pretty_print and not preserve_whitespace: - s.append(formatter.indent * (indent_level - 1)) - s.append(text) - if pretty_print and not preserve_whitespace: - s.append("\n") - return ''.join(s) + return self.decode(indent_level, eventual_encoding, formatter, + iterator=self.descendants) def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, @@ -1917,6 +1999,18 @@ class Tag(PageElement): return iter(self.contents) # XXX This seems to be untested. @property + def self_and_descendants(self): + """Iterate over this PageElement and its children in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ + if not self.hidden: + yield self + for i in self.descendants: + yield i + + @property def descendants(self): """Iterate over all children of this PageElement in a breadth-first sequence. diff --git a/bs4/formatter.py b/bs4/formatter.py index 83cc1c5..c821318 100644 --- a/bs4/formatter.py +++ b/bs4/formatter.py @@ -97,7 +97,7 @@ class Formatter(EntitySubstitution): else: indent = ' ' self.indent = indent - + def substitute(self, ns): """Process a string that needs to undergo entity substitution. This may be a string encountered in an attribute value or as diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py index d8b3b9b..dbb1593 100644 --- a/bs4/tests/__init__.py +++ b/bs4/tests/__init__.py @@ -551,8 +551,8 @@ Hello, world! """Whitespace must be preserved in <pre> and <textarea> tags, even if that would mean not prettifying the markup. """ - pre_markup = "<pre> </pre>" - textarea_markup = "<textarea> woo\nwoo </textarea>" + pre_markup = "<pre>a z</pre>\n" + textarea_markup = "<textarea> woo\nwoo </textarea>\n" self.assert_soup(pre_markup) self.assert_soup(textarea_markup) @@ -563,7 +563,7 @@ Hello, world! assert soup.textarea.prettify() == textarea_markup soup = self.soup("<textarea></textarea>") - assert soup.textarea.prettify() == "<textarea></textarea>" + assert soup.textarea.prettify() == "<textarea></textarea>\n" def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py index 84d4e3b..528b16d 100644 --- a/bs4/tests/test_formatter.py +++ b/bs4/tests/test_formatter.py @@ -80,20 +80,20 @@ class TestFormatter(SoupTest): @pytest.mark.parametrize( "indent,expect", [ - (None, '<a>\n<b>\ntext\n</b>\n</a>'), - (-1, '<a>\n<b>\ntext\n</b>\n</a>'), - (0, '<a>\n<b>\ntext\n</b>\n</a>'), - ("", '<a>\n<b>\ntext\n</b>\n</a>'), + (None, '<a>\n<b>\ntext\n</b>\n</a>\n'), + (-1, '<a>\n<b>\ntext\n</b>\n</a>\n'), + (0, '<a>\n<b>\ntext\n</b>\n</a>\n'), + ("", '<a>\n<b>\ntext\n</b>\n</a>\n'), - (1, '<a>\n <b>\n text\n </b>\n</a>'), - (2, '<a>\n <b>\n text\n </b>\n</a>'), + (1, '<a>\n <b>\n text\n </b>\n</a>\n'), + (2, '<a>\n <b>\n text\n </b>\n</a>\n'), - ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'), - ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'), + ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'), + ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'), # Some invalid inputs -- the default behavior is used. - (object(), '<a>\n <b>\n text\n </b>\n</a>'), - (b'bytes', '<a>\n <b>\n text\n </b>\n</a>'), + (object(), '<a>\n <b>\n text\n </b>\n</a>\n'), + (b'bytes', '<a>\n <b>\n text\n </b>\n</a>\n'), ] ) def test_indent(self, indent, expect): diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index a94280f..d98c577 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -2,6 +2,7 @@ import copy import pickle import pytest +import sys from bs4 import BeautifulSoup from bs4.element import ( @@ -49,6 +50,16 @@ class TestEncoding(SoupTest): encoding="utf8" ) + def test_encode_deeply_nested_document(self): + # This test verifies that encoding a string doesn't involve + # any recursive function calls. If it did, this test would + # overflow the Python interpreter stack. + limit = sys.getrecursionlimit() + 1 + markup = "<span>" * limit + soup = self.soup(markup) + encoded = soup.encode() + assert limit == encoded.count(b"<span>") + def test_deprecated_renderContents(self): html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) @@ -156,7 +167,31 @@ class TestFormatters(SoupTest): soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>") # Everything outside the <pre> tag is reformatted, but everything # inside is left alone. - assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify() + assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify() + + def test_prettify_handles_nested_string_literal_tags(self): + # Most of this markup is inside a <pre> tag, so prettify() + # only does three things to it: + # 1. Add a newline and a space between the <div> and the <pre> + # 2. Add a newline after the </pre> + # 3. Add a newline at the end. + # + # The contents of the <pre> tag are left completely alone. In + # particular, we don't start adding whitespace again once we + # encounter the first </pre> tag, because we know it's not + # the one that put us into string literal mode. + markup = """<div><pre><code>some +<script><pre>code</pre></script> for you +</code></pre></div>""" + + expect = """<div> + <pre><code>some +<script><pre>code</pre></script> for you +</code></pre> +</div> +""" + soup = self.soup(markup) + assert expect == soup.div.prettify() def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') |