summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bs4/element.py 43
-rw-r--r-- bs4/tests/test_tree.py 42
-rw-r--r-- doc/source/index.rst 2
3 files changed, 84 insertions, 3 deletions
diff --git a/bs4/element.py b/bs4/element.py
index c4b5bc7..73e3867 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -191,7 +191,7 @@ class PageElement(object):
def replace_with(self, replace_with):
if self.parent is None:
raise ValueError(
- "Cannot replace one element with another when the"
+ "Cannot replace one element with another when the "
"element to be replaced is not part of a tree.")
if replace_with is self:
return
@@ -899,6 +899,43 @@ class Tag(PageElement):
for element in self.contents[:]:
element.extract()
+    def smooth(self):
+        """Consolidate runs of adjacent strings among this tag's children.
+
+        Child tags are smoothed recursively. Two neighbouring children are
+        merged only when both are NavigableString objects and neither one
+        is a PreformattedString. This makes pretty-printed output look
+        more natural after many operations that modified the tree.
+        """
+        # Record the index of the left-hand member of every adjacent pair
+        # that must be merged, rather than copying self.contents: in most
+        # cases very few strings are affected.
+        merge_points = []
+        for position, current in enumerate(self.contents):
+            if isinstance(current, Tag):
+                current.smooth()  # Descend into child tags first.
+            if position + 1 == len(self.contents):
+                # Final child: nothing follows it, so there is no pair
+                # to examine here.
+                continue
+            following = self.contents[position + 1]
+            both_strings = (isinstance(current, NavigableString)
+                            and isinstance(following, NavigableString))
+            any_preformatted = (isinstance(current, PreformattedString)
+                                or isinstance(following, PreformattedString))
+            if both_strings and not any_preformatted:
+                merge_points.append(position)
+
+        # Walk the recorded indexes back to front, so that removing items
+        # from .contents does not shift the indexes still to be handled.
+        for position in reversed(merge_points):
+            left = self.contents[position]
+            right = self.contents[position + 1]
+            # Take the right-hand string out of the tree, then swap the
+            # left-hand one for a single string holding the combined text.
+            right.extract()
+            left.replace_with(NavigableString(left + right))
+
def index(self, element):
"""
Find the index of a child by identity, not value. Avoids issues with
@@ -1173,7 +1210,9 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
- preserve_whitespace = self.name in self.preserve_whitespace_tags
+ preserve_whitespace = (
+ self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
+ )
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6510f85..e655dcc 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
self.assertEqual([], soup.find_all(id=1, text="bar"))
+class TestSmooth(TreeTest):
+    """Test Tag.smooth: adjacent plain strings merge, comments do not."""
+
+    def test_smooth(self):
+        soup = self.soup("<div>a</div>")
+        div = soup.div
+        div.append("b")
+        div.append("c")
+        div.append(Comment("Comment 1"))
+        div.append(Comment("Comment 2"))
+        div.append("d")
+        builder = self.default_builder()
+        span = Tag(soup, builder, 'span')
+        span.append('1')
+        span.append('2')
+        div.append(span)
+
+        # At this point the tree has a bunch of adjacent
+        # NavigableStrings. This is normal, but it has no meaning in
+        # terms of HTML, so we may want to smooth things out for
+        # output.
+
+        # Since the <span> tag has two children, its .string is None.
+        self.assertIsNone(div.span.string)
+
+        self.assertEqual(7, len(div.contents))
+        div.smooth()
+        self.assertEqual(5, len(div.contents))
+
+        # The three strings at the beginning of div.contents have been
+        # merged into one string.
+        self.assertEqual('abc', div.contents[0])
+
+        # The call is recursive -- the <span> tag was also smoothed.
+        self.assertEqual('12', div.span.string)
+
+        # The two comments have _not_ been merged, with each other or
+        # with the 'd' after them, even though comments are strings.
+        # Merging comments would change the meaning of the HTML.
+        self.assertEqual('Comment 1', div.contents[1])
+        self.assertEqual('Comment 2', div.contents[2])
+        self.assertEqual('d', div.contents[3])
class TestIndex(TreeTest):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 0c09964..4bca0ae 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2292,7 +2292,7 @@ Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
more control over the output. For example, Beautiful Soup sorts the
attributes in every tag by default::
- attr_soup = BeautifulSoup('<p z="1" m="2" a="3"></p>')
+ attr_soup = BeautifulSoup(b'<p z="1" m="2" a="3"></p>')
print(attr_soup.p.encode())
# <p a="3" m="2" z="1"></p>