diff options
-rw-r--r-- | NEWS.txt | 3 |
-rw-r--r-- | bs4/element.py | 8 |
-rw-r--r-- | bs4/tests/test_tree.py | 24 |
-rw-r--r-- | doc/source/index.rst | 16 |
4 files changed, 49 insertions, 2 deletions
@@ -7,6 +7,9 @@ definitions ending with two question marks instead of one. [bug=984258] +* Upon document generation, CData objects are no longer run through + the formatter. [bug=988905] + * The test suite now passes when lxml is not installed, whether or not html5lib is installed. [bug=987004] diff --git a/bs4/element.py b/bs4/element.py index 282193e..c1ad992 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -677,6 +677,12 @@ class CData(NavigableString): PREFIX = u'<![CDATA[' SUFFIX = u']]>' + def output_ready(self, formatter="minimal"): + """CData strings are passed into the formatter. + But the return value is ignored.""" + self.format_string(self, formatter) + return self.PREFIX + self + self.SUFFIX + class ProcessingInstruction(NavigableString): @@ -791,7 +797,7 @@ class Tag(PageElement): @string.setter def string(self, string): self.clear() - self.append(unicode(string)) + self.append(string.__class__(string)) def _all_strings(self, strip=False): """Yield all child strings, possibly stripping them.""" diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 5acaeea..2dea886 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -995,6 +995,12 @@ class TestTreeModification(SoupTest): soup.b.string = soup.c.string self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") + def test_set_string_preserves_class_of_string(self): + soup = self.soup("<a></a>") + cdata = CData("foo") + soup.a.string = cdata + self.assertTrue(isinstance(soup.a.string, CData)) + class TestElementObjects(SoupTest): """Test various features of element objects.""" @@ -1346,6 +1352,24 @@ class TestNavigableStringSubclasses(SoupTest): self.assertEqual(soup.find(text="foo"), "foo") self.assertEqual(soup.contents[0], "foo") + def test_cdata_is_never_formatted(self): + """Text inside a CData object is passed into the formatter. + + But the return value is ignored. 
+ """ + + self.count = 0 + def increment(*args): + self.count += 1 + return "BITTER FAILURE" + + soup = self.soup("") + cdata = CData("<><><>") + soup.insert(1, cdata) + self.assertEqual( + b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) + self.assertEqual(1, self.count) + def test_doctype_ends_in_newline(self): # Unlike other NavigableString subclasses, a DOCTYPE always ends # in a newline. diff --git a/doc/source/index.rst b/doc/source/index.rst index 5b65354..17d2211 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2048,7 +2048,6 @@ to Beautiful Soup generating invalid HTML/XML, as in these examples:: print(link_soup.a.encode(formatter=None)) # <a href="http://example.com/?foo=val1&bar=val2">A link</a> - Finally, if you pass in a function for ``formatter``, Beautiful Soup will call that function once for every string and attribute value in the document. You can do whatever you want in this function. Here's a @@ -2096,6 +2095,21 @@ whenever possible, but `also` converts all strings to uppercase:: # </body> # </html> +One last caveat: if you create a ``CData`` object, the text inside +that object is always presented `exactly as it appears, with no +formatting`. Beautiful Soup will call the formatter method, just in +case you've written a custom method that counts all the strings in the +document or something, but it will ignore the return value. + + from bs4.element import CData + soup = BeautifulSoup("<a></a>") + soup.a.string = CData("one < three") + print(soup.a.prettify(formatter="xml")) + # <a> + # <![CDATA[one < three]]> + # </a> + + ``get_text()`` -------------- |