summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:33:51 -0400
committer Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:33:51 -0400
commit 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (patch)
tree 5b0aad64b7fe429318b3d2cd3539db0221605526
parent 457fa9096e5cee673063b41d58da9f2442814f0f (diff)
The prettify() method now leaves the contents of <pre> tags
alone. [bug=1095654]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/element.py 35
-rw-r--r-- bs4/tests/test_tree.py 10
3 files changed, 38 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index f714bfe..03418ab 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -15,6 +15,9 @@
or
from bs4 import _soup
+* The prettify() method now leaves the contents of <pre> tags
+ alone. [bug=1095654]
+
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
diff --git a/bs4/element.py b/bs4/element.py
index 5ccb019..398eb05 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -94,6 +94,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
cdata_containing_tags = set(["script", "style"])
+ preformatted_tags = set(["pre"])
+
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
@@ -1047,6 +1049,13 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
+ def _should_pretty_print(self, indent_level):
+ """Should this tag be pretty-printed?"""
+ return (
+ indent_level is not None and
+ (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
+ or self._is_xml))
+
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -1097,12 +1106,15 @@ class Tag(PageElement):
else:
closeTag = '</%s%s>' % (prefix, self.name)
- pretty_print = (indent_level is not None)
+ pretty_print = self._should_pretty_print(indent_level)
+ space = ''
+ indent_space = ''
+ if indent_level is not None:
+ indent_space = (' ' * (indent_level - 1))
if pretty_print:
- space = (' ' * (indent_level - 1))
+ space = indent_space
indent_contents = indent_level + 1
else:
- space = ''
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter)
@@ -1115,8 +1127,10 @@ class Tag(PageElement):
attribute_string = ''
if attrs:
attribute_string = ' ' + ' '.join(attrs)
- if pretty_print:
- s.append(space)
+ if indent_level is not None:
+ # Even if this particular tag is not pretty-printed,
+ # we should indent up to the start of the tag.
+ s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
@@ -1127,7 +1141,10 @@ class Tag(PageElement):
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
- if pretty_print and closeTag and self.next_sibling:
+ if indent_level is not None and closeTag and self.next_sibling:
+ # Even if this particular tag is not pretty-printed,
+ # we're now done with the tag, and we should add a
+ # newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
@@ -1164,13 +1181,13 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
- if text and indent_level:
+ if text and indent_level and not self.name == 'pre':
text = text.strip()
if text:
- if pretty_print:
+ if pretty_print and not self.name == 'pre':
s.append(" " * (indent_level - 1))
s.append(text)
- if pretty_print:
+ if pretty_print and not self.name == 'pre':
s.append("\n")
return ''.join(s)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 5e4a9dd..503af63 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1050,7 +1050,7 @@ class TestTreeModification(SoupTest):
# clear using decompose()
em = a.em
a.clear(decompose=True)
- self.assertFalse(hasattr(em, "contents"))
+ self.assertEqual(0, len(em.contents))
def test_string_set(self):
"""Tag.string = 'string'"""
@@ -1356,6 +1356,14 @@ class TestSubstitutions(SoupTest):
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+ def test_prettify_leaves_preformatted_text_alone(self):
+ soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
+ # Everything outside the <pre> tag is reformatted, but everything
+ # inside is left alone.
+ self.assertEqual(
+ u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
+ soup.div.prettify())
+
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper())