diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:33:51 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:33:51 -0400 |
commit | 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (patch) | |
tree | 5b0aad64b7fe429318b3d2cd3539db0221605526 | |
parent | 457fa9096e5cee673063b41d58da9f2442814f0f (diff) |
The prettify() method now leaves the contents of <pre> tags
alone. [bug=1095654]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/element.py | 35 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 10 |
3 files changed, 38 insertions, 10 deletions
@@ -15,6 +15,9 @@ or from bs4 import _soup +* The prettify() method now leaves the contents of <pre> tags + alone. [bug=1095654] + * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] diff --git a/bs4/element.py b/bs4/element.py index 5ccb019..398eb05 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -94,6 +94,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): cdata_containing_tags = set(["script", "style"]) + preformatted_tags = set(["pre"]) + @classmethod def _substitute_if_appropriate(cls, ns, f): if (isinstance(ns, NavigableString) @@ -1047,6 +1049,13 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None and + (self.name not in HTMLAwareEntitySubstitution.preformatted_tags + or self._is_xml)) + def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1097,12 +1106,15 @@ class Tag(PageElement): else: closeTag = '</%s%s>' % (prefix, self.name) - pretty_print = (indent_level is not None) + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) if pretty_print: - space = (' ' * (indent_level - 1)) + space = indent_space indent_contents = indent_level + 1 else: - space = '' indent_contents = None contents = self.decode_contents( indent_contents, eventual_encoding, formatter) @@ -1115,8 +1127,10 @@ class Tag(PageElement): attribute_string = '' if attrs: attribute_string = ' ' + ' '.join(attrs) - if pretty_print: - s.append(space) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. + s.append(indent_space) s.append('<%s%s%s%s>' % ( prefix, self.name, attribute_string, close)) if pretty_print: @@ -1127,7 +1141,10 @@ class Tag(PageElement): if pretty_print and closeTag: s.append(space) s.append(closeTag) - if pretty_print and closeTag and self.next_sibling: + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. s.append("\n") s = ''.join(s) return s @@ -1164,13 +1181,13 @@ class Tag(PageElement): elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) - if text and indent_level: + if text and indent_level and not self.name == 'pre': text = text.strip() if text: - if pretty_print: + if pretty_print and not self.name == 'pre': s.append(" " * (indent_level - 1)) s.append(text) - if pretty_print: + if pretty_print and not self.name == 'pre': s.append("\n") return ''.join(s) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 5e4a9dd..503af63 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1050,7 +1050,7 @@ class TestTreeModification(SoupTest): # clear using decompose() em = a.em a.clear(decompose=True) - self.assertFalse(hasattr(em, "contents")) + self.assertEqual(0, len(em.contents)) def test_string_set(self): """Tag.string = 'string'""" @@ -1356,6 +1356,14 @@ class TestSubstitutions(SoupTest): encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) + def test_prettify_leaves_preformatted_text_alone(self): + soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") + # Everything outside the <pre> tag is reformatted, but everything + # inside is left alone. + self.assertEqual( + u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', + soup.div.prettify()) + def test_prettify_accepts_formatter(self): soup = BeautifulSoup("<html><body>foo</body></html>") pretty = soup.prettify(formatter = lambda x: x.upper()) |