summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/__init__.py 3
-rw-r--r-- bs4/element.py 320
-rw-r--r-- bs4/formatter.py 2
-rw-r--r-- bs4/tests/__init__.py 6
-rw-r--r-- bs4/tests/test_formatter.py 20
-rw-r--r-- bs4/tests/test_pageelement.py 37
6 files changed, 259 insertions, 129 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9a76a15..5e1bebe 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.12.0"
+__version__ = "4.12.1"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@@ -469,6 +469,7 @@ class BeautifulSoup(Tag):
self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
+ self._most_recent_element = None
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
diff --git a/bs4/element.py b/bs4/element.py
index 1dd5984..daffec3 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1644,106 +1644,212 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
- """Render a Unicode representation of this PageElement and its
- contents.
-
- :param indent_level: Each line of the rendering will be
- indented this many spaces. Used internally in
- recursive calls while pretty-printing.
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a <META> tag that mentions the document's
- encoding.
- :param formatter: A Formatter object, or a string naming one of
- the standard formatters.
- """
-
+ formatter="minimal",
+ iterator=None):
+ pieces = []
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
- attributes = formatter.attributes(self)
- attrs = []
- for key, val in attributes:
- if val is None:
- decoded = key
+
+ if indent_level is True:
+ indent_level = 0
+
+ # The currently active tag that put us into string literal
+ # mode. Until this element is closed, children will be treated
+ # as string literals and not pretty-printed. String literal
+ # mode is turned on immediately after this tag begins, and
+ # turned off immediately before it's closed. This means there
+ # will be whitespace before and after the tag itself.
+ string_literal_tag = None
+
+ for event, element in self._event_stream(iterator):
+ if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=True
+ )
+ elif event is Tag.END_ELEMENT_EVENT:
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=False
+ )
+ if indent_level is not None:
+ indent_level -= 1
+ else:
+ piece = element.output_ready(formatter)
+
+ # Now we need to apply the 'prettiness' -- extra
+ # whitespace before and/or after this tag. This can get
+ # complicated because certain tags, like <pre> and
+ # <script>, can't be prettified, since adding whitespace would
+ # change the meaning of the content.
+
+ # The default behavior is to add whitespace before and
+ # after an element when string literal mode is off, and to
+ # leave things as they are when string literal mode is on.
+ if string_literal_tag:
+ indent_before = indent_after = False
else:
- if isinstance(val, list) or isinstance(val, tuple):
- val = ' '.join(val)
- elif not isinstance(val, str):
- val = str(val)
- elif (
- isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None
- ):
- val = val.encode(eventual_encoding)
-
- text = formatter.attribute_value(val)
- decoded = (
- str(key) + '='
- + formatter.quoted_attribute_value(text))
- attrs.append(decoded)
- close = ''
- closeTag = ''
+ indent_before = indent_after = True
+
+ # The only time the behavior is more complex than that is
+ # when we encounter an opening or closing tag that might
+ # put us into or out of string literal mode.
+ if (event is Tag.START_ELEMENT_EVENT
+ and not string_literal_tag
+ and not element._should_pretty_print()):
+ # We are about to enter string literal mode. Add
+ # whitespace before this tag, but not after. We
+ # will stay in string literal mode until this tag
+ # is closed.
+ indent_before = True
+ indent_after = False
+ string_literal_tag = element
+ elif (event is Tag.END_ELEMENT_EVENT
+ and element is string_literal_tag):
+ # We are about to exit string literal mode by closing
+ # the tag that sent us into that mode. Add whitespace
+ # after this tag, but not before.
+ indent_before = False
+ indent_after = True
+ string_literal_tag = None
+
+ # Now we know whether to add whitespace before and/or
+ # after this element.
+ if indent_level is not None:
+ if (indent_before or indent_after):
+ if isinstance(element, NavigableString):
+ piece = piece.strip()
+ if piece:
+ piece = self._indent_string(
+ piece, indent_level, formatter,
+ indent_before, indent_after
+ )
+ if event == Tag.START_ELEMENT_EVENT:
+ indent_level += 1
+ pieces.append(piece)
+ return "".join(pieces)
+
+ # Names for the different events yielded by _event_stream
+ START_ELEMENT_EVENT = object()
+ END_ELEMENT_EVENT = object()
+ EMPTY_ELEMENT_EVENT = object()
+ STRING_ELEMENT_EVENT = object()
+
+ def _event_stream(self, iterator=None):
+ """Yield a sequence of events that can be used to reconstruct the DOM
+ for this element.
+
+ This lets us recreate the nested structure of this element
+ (e.g. when formatting it as a string) without using recursive
+ method calls.
+
+ This is similar in concept to the SAX API, but it's a simpler
+ interface designed for internal use. The events are different
+ from SAX and the arguments associated with the events are Tags
+ and other Beautiful Soup objects.
+
+ :param iterator: An alternate iterator to use when traversing
+ the tree.
+ """
+ tag_stack = []
+
+ iterator = iterator or self.self_and_descendants
+
+ for c in iterator:
+ # If the parent of the element we're about to yield is not
+ # the tag currently on the stack, it means that the tag on
+ # the stack closed before this element appeared.
+ while tag_stack and c.parent != tag_stack[-1]:
+ now_closed_tag = tag_stack.pop()
+ yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+ if isinstance(c, Tag):
+ if c.is_empty_element:
+ yield Tag.EMPTY_ELEMENT_EVENT, c
+ else:
+ yield Tag.START_ELEMENT_EVENT, c
+ tag_stack.append(c)
+ continue
+ else:
+ yield Tag.STRING_ELEMENT_EVENT, c
+
+ while tag_stack:
+ now_closed_tag = tag_stack.pop()
+ yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+ def _indent_string(self, s, indent_level, formatter,
+ indent_before, indent_after):
+ """Add indentation whitespace before and/or after a string.
+
+ :param s: The string to amend with whitespace.
+ :param indent_level: The indentation level; affects how much
+ whitespace goes before the string.
+ :param indent_before: Whether or not to add whitespace
+ before the string.
+ :param indent_after: Whether or not to add whitespace
+ (a newline) after the string.
+ """
+ space_before = ''
+ if indent_before and indent_level:
+ space_before = (formatter.indent * indent_level)
+
+ space_after = ''
+ if indent_after:
+ space_after = "\n"
+
+ return space_before + s + space_after
+
+ def _format_tag(self, eventual_encoding, formatter, opening):
+ # A tag starts with the < character (see below).
+
+ # Then the / character, if this is a closing tag.
+ closing_slash = ''
+ if not opening:
+ closing_slash = '/'
+ # Then an optional namespace prefix.
prefix = ''
if self.prefix:
prefix = self.prefix + ":"
- if self.is_empty_element:
- close = formatter.void_element_close_prefix or ''
- else:
- closeTag = '</%s%s>' % (prefix, self.name)
-
- pretty_print = self._should_pretty_print(indent_level)
- space = ''
- indent_space = ''
- if indent_level is not None:
- indent_space = (formatter.indent * (indent_level - 1))
- if pretty_print:
- space = indent_space
- indent_contents = indent_level + 1
- else:
- indent_contents = None
- contents = self.decode_contents(
- indent_contents, eventual_encoding, formatter
- )
-
- if self.hidden:
- # This is the 'document root' object.
- s = contents
- else:
- s = []
- attribute_string = ''
+ # Then a list of attribute values, if this is an opening tag.
+ attribute_string = ''
+ if opening:
+ attributes = formatter.attributes(self)
+ attrs = []
+ for key, val in attributes:
+ if val is None:
+ decoded = key
+ else:
+ if isinstance(val, list) or isinstance(val, tuple):
+ val = ' '.join(val)
+ elif not isinstance(val, str):
+ val = str(val)
+ elif (
+ isinstance(val, AttributeValueWithCharsetSubstitution)
+ and eventual_encoding is not None
+ ):
+ val = val.encode(eventual_encoding)
+
+ text = formatter.attribute_value(val)
+ decoded = (
+ str(key) + '='
+ + formatter.quoted_attribute_value(text))
+ attrs.append(decoded)
if attrs:
attribute_string = ' ' + ' '.join(attrs)
- if indent_level is not None:
- # Even if this particular tag is not pretty-printed,
- # we should indent up to the start of the tag.
- s.append(indent_space)
- s.append('<%s%s%s%s>' % (
- prefix, self.name, attribute_string, close))
- if pretty_print:
- s.append("\n")
- s.append(contents)
- if pretty_print and contents and contents[-1] != "\n":
- s.append("\n")
- if pretty_print and closeTag:
- s.append(space)
- s.append(closeTag)
- if indent_level is not None and closeTag and self.next_sibling:
- # Even if this particular tag is not pretty-printed,
- # we're now done with the tag, and we should add a
- # newline if appropriate.
- s.append("\n")
- s = ''.join(s)
- return s
-
- def _should_pretty_print(self, indent_level):
+
+ # Then an optional closing slash (for a void element in an
+ # XML document).
+ void_element_closing_slash = ''
+ if self.is_empty_element:
+ void_element_closing_slash = formatter.void_element_close_prefix or ''
+
+ # Put it all together.
+ return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
+
+ def _should_pretty_print(self, indent_level=1):
"""Should this tag be pretty-printed?
Most of them should, but some (such as <pre> in HTML
@@ -1794,32 +1900,8 @@ class Tag(PageElement):
the standard Formatters.
"""
- # First off, turn a string formatter into a Formatter object. This
- # will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter):
- formatter = self.formatter_for_name(formatter)
-
- pretty_print = (indent_level is not None)
- s = []
- for c in self:
- text = None
- if isinstance(c, NavigableString):
- text = c.output_ready(formatter)
- elif isinstance(c, Tag):
- s.append(c.decode(indent_level, eventual_encoding,
- formatter))
- preserve_whitespace = (
- self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
- )
- if text and indent_level and not preserve_whitespace:
- text = text.strip()
- if text:
- if pretty_print and not preserve_whitespace:
- s.append(formatter.indent * (indent_level - 1))
- s.append(text)
- if pretty_print and not preserve_whitespace:
- s.append("\n")
- return ''.join(s)
+ return self.decode(indent_level, eventual_encoding, formatter,
+ iterator=self.descendants)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
@@ -1917,6 +1999,18 @@ class Tag(PageElement):
return iter(self.contents) # XXX This seems to be untested.
@property
+ def self_and_descendants(self):
+ """Iterate over this PageElement and all of its descendants
+ in document order (depth-first).
+
+ :yield: A sequence of PageElements.
+ """
+ if not self.hidden:
+ yield self
+ for i in self.descendants:
+ yield i
+
+ @property
def descendants(self):
"""Iterate over all children of this PageElement in a
breadth-first sequence.
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 83cc1c5..c821318 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -97,7 +97,7 @@ class Formatter(EntitySubstitution):
else:
indent = ' '
self.indent = indent
-
+
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index d8b3b9b..dbb1593 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -551,8 +551,8 @@ Hello, world!
"""Whitespace must be preserved in <pre> and <textarea> tags,
even if that would mean not prettifying the markup.
"""
- pre_markup = "<pre> </pre>"
- textarea_markup = "<textarea> woo\nwoo </textarea>"
+ pre_markup = "<pre>a z</pre>\n"
+ textarea_markup = "<textarea> woo\nwoo </textarea>\n"
self.assert_soup(pre_markup)
self.assert_soup(textarea_markup)
@@ -563,7 +563,7 @@ Hello, world!
assert soup.textarea.prettify() == textarea_markup
soup = self.soup("<textarea></textarea>")
- assert soup.textarea.prettify() == "<textarea></textarea>"
+ assert soup.textarea.prettify() == "<textarea></textarea>\n"
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
index 84d4e3b..528b16d 100644
--- a/bs4/tests/test_formatter.py
+++ b/bs4/tests/test_formatter.py
@@ -80,20 +80,20 @@ class TestFormatter(SoupTest):
@pytest.mark.parametrize(
"indent,expect",
[
- (None, '<a>\n<b>\ntext\n</b>\n</a>'),
- (-1, '<a>\n<b>\ntext\n</b>\n</a>'),
- (0, '<a>\n<b>\ntext\n</b>\n</a>'),
- ("", '<a>\n<b>\ntext\n</b>\n</a>'),
+ (None, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+ (-1, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+ (0, '<a>\n<b>\ntext\n</b>\n</a>\n'),
+ ("", '<a>\n<b>\ntext\n</b>\n</a>\n'),
- (1, '<a>\n <b>\n text\n </b>\n</a>'),
- (2, '<a>\n <b>\n text\n </b>\n</a>'),
+ (1, '<a>\n <b>\n text\n </b>\n</a>\n'),
+ (2, '<a>\n <b>\n text\n </b>\n</a>\n'),
- ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
- ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
+ ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'),
+ ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'),
# Some invalid inputs -- the default behavior is used.
- (object(), '<a>\n <b>\n text\n </b>\n</a>'),
- (b'bytes', '<a>\n <b>\n text\n </b>\n</a>'),
+ (object(), '<a>\n <b>\n text\n </b>\n</a>\n'),
+ (b'bytes', '<a>\n <b>\n text\n </b>\n</a>\n'),
]
)
def test_indent(self, indent, expect):
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index a94280f..d98c577 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -2,6 +2,7 @@
import copy
import pickle
import pytest
+import sys
from bs4 import BeautifulSoup
from bs4.element import (
@@ -49,6 +50,16 @@ class TestEncoding(SoupTest):
encoding="utf8"
)
+ def test_encode_deeply_nested_document(self):
+ # This test verifies that encoding a deeply nested document
+ # doesn't involve any recursive function calls. If it did, this
+ # test would overflow the Python interpreter stack.
+ limit = sys.getrecursionlimit() + 1
+ markup = "<span>" * limit
+ soup = self.soup(markup)
+ encoded = soup.encode()
+ assert limit == encoded.count(b"<span>")
+
def test_deprecated_renderContents(self):
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
@@ -156,7 +167,31 @@ class TestFormatters(SoupTest):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
- assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
+ assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
+
+ def test_prettify_handles_nested_string_literal_tags(self):
+ # Most of this markup is inside a <pre> tag, so prettify()
+ # only does three things to it:
+ # 1. Add a newline and a space between the <div> and the <pre>
+ # 2. Add a newline after the </pre>
+ # 3. Add a newline at the end.
+ #
+ # The contents of the <pre> tag are left completely alone. In
+ # particular, we don't start adding whitespace again once we
+ # encounter the first </pre> tag, because we know it's not
+ # the one that put us into string literal mode.
+ markup = """<div><pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre></div>"""
+
+ expect = """<div>
+ <pre><code>some
+<script><pre>code</pre></script> for you
+</code></pre>
+</div>
+"""
+ soup = self.soup(markup)
+ assert expect == soup.div.prettify()
def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')