diff options
-rw-r--r-- | CHANGELOG | 19 | ||||
-rw-r--r-- | tests/test_lxml.py | 11 | ||||
-rw-r--r-- | tests/test_tree.py | 16 |
3 files changed, 41 insertions, 5 deletions
@@ -74,6 +74,25 @@ Unicode character. There are no longer any smartQuotesTo or convert_entities arguments. (Unicode Dammit still has smart_quotes_to, but the default is now to turn smart quotes into Unicode.) +== CDATA sections are normal text, if they're understood at all. == + +Currently, both HTML parsers ignore CDATA sections in markup: + + <p><![CDATA[foo]]></p> => <p></p> + +A future version of html5lib will turn CDATA sections into text nodes, +but only within tags like <svg> and <math>: + + <svg><![CDATA[foo]]></svg> => <svg>foo</svg> + +The default XML parser (which uses lxml behind the scenes) turns CDATA +sections into ordinary text elements: + + <p><![CDATA[foo]]></p> => <p>foo</p> + +In theory it's possible to preserve the CDATA sections when using the +XML parser, but I don't see how to get it to work in practice. + = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted diff --git a/tests/test_lxml.py b/tests/test_lxml.py index c178457..88c866d 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -518,6 +518,17 @@ class TestLXMLXMLBuilder(SoupTest): def default_builder(self): return LXMLTreeBuilderForXML() + def test_cdata_becomes_text(self): + # LXML sends CData sections as 'data' events, so we can't + # create special CData objects for them. We have to use + # NavigableString. I would like to fix this, but it's not a + # very high priority. 
+ markup = "<foo><![CDATA[iamcdata]]></foo>" + soup = self.soup(markup) + cdata = soup.foo.contents[0] + self.assertEquals(cdata.__class__.__name__, 'NavigableString') + + def test_can_handle_invalid_xml(self): self.assertSoupEquals("<a><b>", "<a><b /></a>") diff --git a/tests/test_tree.py b/tests/test_tree.py index 40643dc..6f00716 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,7 +13,7 @@ import copy import cPickle as pickle import re from beautifulsoup import BeautifulSoup -from beautifulsoup.element import SoupStrainer, Tag +from beautifulsoup.element import CData, SoupStrainer, Tag from beautifulsoup.testing import SoupTest class TreeTest(SoupTest): @@ -883,9 +883,15 @@ class TestEncoding(SoupTest): soup.b.encode("utf-8"), html.encode("utf-8")) -class TestEmptyElementTags(SoupTest): +class TestNavigableStringSubclasses(SoupTest): - @property - def default_builder(self): - return LXMLTreeBuilderForXML() + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEquals(str(soup), "<![CDATA[foo]]>") + self.assertEquals(soup.find(text="foo"), "foo") + self.assertEquals(soup.contents[0], "foo") |