summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/element.py 8
-rw-r--r-- bs4/tests/test_tree.py 24
-rw-r--r-- doc/source/index.rst 16
4 files changed, 49 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 12922c9..772b9fa 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,6 +7,9 @@
definitions ending with two question marks instead of
one. [bug=984258]
+* Upon document generation, the formatter's return value is no longer
+ applied to CData objects; their text is output as-is. [bug=988905]
+
* The test suite now passes when lxml is not installed, whether or not
html5lib is installed. [bug=987004]
diff --git a/bs4/element.py b/bs4/element.py
index 282193e..c1ad992 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -677,6 +677,12 @@ class CData(NavigableString):
PREFIX = u'<![CDATA['
SUFFIX = u']]>'
+ def output_ready(self, formatter="minimal"):
+ """CData strings are passed into the formatter.
+ But the return value is ignored."""
+ self.format_string(self, formatter)
+ return self.PREFIX + self + self.SUFFIX
+
class ProcessingInstruction(NavigableString):
@@ -791,7 +797,7 @@ class Tag(PageElement):
@string.setter
def string(self, string):
self.clear()
- self.append(unicode(string))
+ self.append(string.__class__(string))
def _all_strings(self, strip=False):
"""Yield all child strings, possibly stripping them."""
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 5acaeea..2dea886 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -995,6 +995,12 @@ class TestTreeModification(SoupTest):
soup.b.string = soup.c.string
self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
+ def test_set_string_preserves_class_of_string(self):
+ soup = self.soup("<a></a>")
+ cdata = CData("foo")
+ soup.a.string = cdata
+ self.assertTrue(isinstance(soup.a.string, CData))
+
class TestElementObjects(SoupTest):
"""Test various features of element objects."""
@@ -1346,6 +1352,24 @@ class TestNavigableStringSubclasses(SoupTest):
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
+ def test_cdata_is_never_formatted(self):
+ """Text inside a CData object is passed into the formatter.
+
+ But the return value is ignored.
+ """
+
+ self.count = 0
+ def increment(*args):
+ self.count += 1
+ return "BITTER FAILURE"
+
+ soup = self.soup("")
+ cdata = CData("<><><>")
+ soup.insert(1, cdata)
+ self.assertEqual(
+ b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
+ self.assertEqual(1, self.count)
+
def test_doctype_ends_in_newline(self):
# Unlike other NavigableString subclasses, a DOCTYPE always ends
# in a newline.
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5b65354..17d2211 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2048,7 +2048,6 @@ to Beautiful Soup generating invalid HTML/XML, as in these examples::
print(link_soup.a.encode(formatter=None))
# <a href="http://example.com/?foo=val1&bar=val2">A link</a>
-
Finally, if you pass in a function for ``formatter``, Beautiful Soup
will call that function once for every string and attribute value in
the document. You can do whatever you want in this function. Here's a
@@ -2096,6 +2095,21 @@ whenever possible, but `also` converts all strings to uppercase::
# </body>
# </html>
+One last caveat: if you create a ``CData`` object, the text inside
+that object is always presented `exactly as it appears, with no
+formatting`. Beautiful Soup will call the formatter method, just in
+case you've written a custom method that counts all the strings in the
+document or something, but it will ignore the return value::
+
+ from bs4.element import CData
+ soup = BeautifulSoup("<a></a>")
+ soup.a.string = CData("one < three")
+ print(soup.a.prettify(formatter="xml"))
+ # <a>
+ # <![CDATA[one < three]]>
+ # </a>
+
+
``get_text()``
--------------