From f2532b1d63bd4a4d2be6ad9a4dce5eea03f43e7a Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 20 Feb 2011 10:39:30 -0500 Subject: Since we can't parse in CData objects ATM, added a test for CData objects created manually, to keep the bits from rotting. --- CHANGELOG | 19 +++++++++++++++++++ tests/test_lxml.py | 11 +++++++++++ tests/test_tree.py | 16 +++++++++++----- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 96a9ed4..3fb4f36 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -74,6 +74,25 @@ Unicode character. There are no longer any smartQuotesTo or convert_entities arguments. (Unicode Dammit still has smart_quotes_to, but the default is now to turn smart quotes into Unicode.) +== CDATA sections are normal text, if they're understood at all. == + +Currently, both HTML parsers ignore CDATA sections in markup: + +

 <p><![CDATA[foo]]></p> => <p></p>

+ +A future version of html5lib will turn CDATA sections into text nodes, +but only within tags like <svg> and <math>: + + <svg><![CDATA[foo]]></svg> => <p>foo</p>
+ +The default XML parser (which uses lxml behind the scenes) turns CDATA +sections into ordinary text elements: + +

 <p><![CDATA[foo]]></p> => <p>foo</p>

+ +In theory it's possible to preserve the CDATA sections when using the +XML parser, but I don't see how to get it to work in practice. + = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted diff --git a/tests/test_lxml.py b/tests/test_lxml.py index c178457..88c866d 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -518,6 +518,17 @@ class TestLXMLXMLBuilder(SoupTest): def default_builder(self): return LXMLTreeBuilderForXML() + def test_cdata_becomes_text(self): + # LXML sends CData sections as 'data' events, so we can't + # create special CData objects for them. We have to use + # NavigableString. I would like to fix this, but it's not a + # very high priority. + markup = "" + soup = self.soup(markup) + cdata = soup.foo.contents[0] + self.assertEquals(cdata.__class__.__name__, 'NavigableString') + + def test_can_handle_invalid_xml(self): self.assertSoupEquals("", "") diff --git a/tests/test_tree.py b/tests/test_tree.py index 40643dc..6f00716 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,7 +13,7 @@ import copy import cPickle as pickle import re from beautifulsoup import BeautifulSoup -from beautifulsoup.element import SoupStrainer, Tag +from beautifulsoup.element import CData, SoupStrainer, Tag from beautifulsoup.testing import SoupTest class TreeTest(SoupTest): @@ -883,9 +883,15 @@ class TestEncoding(SoupTest): soup.b.encode("utf-8"), html.encode("utf-8")) -class TestEmptyElementTags(SoupTest): +class TestNavigableStringSubclasses(SoupTest): - @property - def default_builder(self): - return LXMLTreeBuilderForXML() + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEquals(str(soup), "") + self.assertEquals(soup.find(text="foo"), "foo") + self.assertEquals(soup.contents[0], "foo") -- cgit v1.2.3