diff options
-rw-r--r-- | bs4/element.py | 43 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 42 | ||||
-rw-r--r-- | doc/source/index.rst | 2 |
3 files changed, 84 insertions, 3 deletions
diff --git a/bs4/element.py b/bs4/element.py index c4b5bc7..73e3867 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -191,7 +191,7 @@ class PageElement(object): def replace_with(self, replace_with): if self.parent is None: raise ValueError( - "Cannot replace one element with another when the" + "Cannot replace one element with another when the " "element to be replaced is not part of a tree.") if replace_with is self: return @@ -899,6 +899,43 @@ class Tag(PageElement): for element in self.contents[:]: element.extract() + def smooth(self): + """Smooth out this element's children by consolidating consecutive strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + def index(self, element): """ Find the index of a child by identity, not value. 
Avoids issues with @@ -1173,7 +1210,9 @@ class Tag(PageElement): elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) - preserve_whitespace = self.name in self.preserve_whitespace_tags + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) if text and indent_level and not preserve_whitespace: text = text.strip() if text: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6510f85..e655dcc 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest): self.assertEqual([], soup.find_all(id=1, text="bar")) +class TestSmooth(TreeTest): + """Test Tag.smooth.""" + + def test_smooth(self): + soup = self.soup("<div>a</div>") + div = soup.div + div.append("b") + div.append("c") + div.append(Comment("Comment 1")) + div.append(Comment("Comment 2")) + div.append("d") + builder = self.default_builder() + span = Tag(soup, builder, 'span') + span.append('1') + span.append('2') + div.append(span) + + # At this point the tree has a bunch of adjacent + # NavigableStrings. This is normal, but it has no meaning in + # terms of HTML, so we may want to smooth things out for + # output. + + # Since the <span> tag has two children, its .string is None. + self.assertEquals(None, div.span.string) + + self.assertEqual(7, len(div.contents)) + div.smooth() + self.assertEqual(5, len(div.contents)) + + # The three strings at the beginning of div.contents have been + # merged into one string. + # + self.assertEqual('abc', div.contents[0]) + + # The call is recursive -- the <span> tag was also smoothed. + self.assertEqual('12', div.span.string) + + # The two comments have _not_ been merged, even though + # comments are strings. Merging comments would change the + # meaning of the HTML. 
+ self.assertEqual('Comment 1', div.contents[1]) + self.assertEqual('Comment 2', div.contents[2]) class TestIndex(TreeTest): diff --git a/doc/source/index.rst b/doc/source/index.rst index 0c09964..4bca0ae 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2292,7 +2292,7 @@ Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even more control over the output. For example, Beautiful Soup sorts the attributes in every tag by default:: - attr_soup = BeautifulSoup('<p z="1" m="2" a="3"></p>') + attr_soup = BeautifulSoup(b'<p z="1" m="2" a="3"></p>') print(attr_soup.p.encode()) # <p a="3" m="2" z="1"></p> |