I couldn't get the XML parser to parse CDATA as CData objects, but at least I documented the current behavior.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 10:39:56 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 10:39:56 -0500
commit: 232311a2f682e59078012e5b05e382982862f627 (patch)
tree: 60bd21949b54bdb5588ecce31a3bb89e40617692
parent: ae349fd47c627f8166526fed8906811707d2f4b2 (diff)
parent: f2532b1d63bd4a4d2be6ad9a4dce5eea03f43e7a (diff)
5 files changed, 53 insertions, 17 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 96a9ed4..3fb4f36 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -74,6 +74,25 @@ Unicode character. There are no longer any smartQuotesTo or
 convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
 but the default is now to turn smart quotes into Unicode.)
 
+== CDATA sections are normal text, if they're understood at all. ==
+
+Currently, both HTML parsers ignore CDATA sections in markup:
+
+ <p><![CDATA[foo]]></p> => <p></p>
+
+A future version of html5lib will turn CDATA sections into text nodes,
+but only within tags like <svg> and <math>:
+
+ <svg><![CDATA[foo]]></svg> => <svg>foo</svg>
+
+The default XML parser (which uses lxml behind the scenes) turns CDATA
+sections into ordinary text nodes:
+
+ <p><![CDATA[foo]]></p> => <p>foo</p>
+
+In theory it's possible to preserve the CDATA sections when using the
+XML parser, but I don't see how to get it to work in practice.
+
 = 3.1.0 =
 
 A hybrid version that supports 2.4 and can be automatically converted
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index deaa613..9ffa9ef 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -28,12 +28,12 @@ class TreeBuilder(Entities):
         The final markup may or may not actually present this tag as
         self-closing.
 
-        For instance: an HTML builder does not consider a <p> tag to
-        be an empty-element tag (it's not in empty_element_tags). This
-        means an empty <p> tag will be presented as "<p></p>", not
-        "<p />".
+        For instance: an HTMLBuilder does not consider a <p> tag to be
+        an empty-element tag (it's not in
+        HTMLBuilder.empty_element_tags). This means an empty <p> tag
+        will be presented as "<p></p>", not "<p />".
 
-        The default builder has no opinion about which tags are
+        The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
         empty-element tag if and only if it has no contents.
         "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index e431a62..9f4c0bd 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -11,7 +11,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def default_parser(self):
         # This can either return a parser object or a class, which
         # will be instantiated with default arguments.
-        return etree.XMLParser
+        return etree.XMLParser(target=self, strip_cdata=False, recover=True)
 
     def __init__(self, parser=None, empty_element_tags=None):
         if empty_element_tags is not None:
@@ -71,10 +71,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.soup.handle_data(content)
         self.soup.endData(Comment)
 
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
-
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
 
@@ -82,5 +78,6 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
     def default_parser(self):
         return etree.HTMLParser
 
-    def end(self, name):
-        self.soup.handle_endtag(name)
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<html><body>%s</body></html>' % fragment
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 7d916da..88c866d 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -518,6 +518,20 @@ class TestLXMLXMLBuilder(SoupTest):
     def default_builder(self):
         return LXMLTreeBuilderForXML()
 
+    def test_cdata_becomes_text(self):
+        # LXML sends CData sections as 'data' events, so we can't
+        # create special CData objects for them. We have to use
+        # NavigableString. I would like to fix this, but it's not a
+        # very high priority.
+        markup = "<foo><![CDATA[iamcdata]]></foo>"
+        soup = self.soup(markup)
+        cdata = soup.foo.contents[0]
+        self.assertEquals(cdata.__class__.__name__, 'NavigableString')
+
+
+    def test_can_handle_invalid_xml(self):
+        self.assertSoupEquals("<a><b>", "<a><b /></a>")
+
     def test_empty_element_tag(self):
         soup = self.soup("<p><iamselfclosing /></p>")
         self.assertTrue(soup.iamselfclosing.is_empty_element)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 40643dc..6f00716 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -13,7 +13,7 @@ import copy
 import cPickle as pickle
 import re
 from beautifulsoup import BeautifulSoup
-from beautifulsoup.element import SoupStrainer, Tag
+from beautifulsoup.element import CData, SoupStrainer, Tag
 from beautifulsoup.testing import SoupTest
 
 class TreeTest(SoupTest):
@@ -883,9 +883,15 @@ class TestEncoding(SoupTest):
             soup.b.encode("utf-8"), html.encode("utf-8"))
 
 
-class TestEmptyElementTags(SoupTest):
+class TestNavigableStringSubclasses(SoupTest):
 
-    @property
-    def default_builder(self):
-        return LXMLTreeBuilderForXML()
 
+    def test_cdata(self):
+        # None of the current builders turn CDATA sections into CData
+        # objects, but you can create them manually.
+        soup = self.soup("")
+        cdata = CData("foo")
+        soup.insert(1, cdata)
+        self.assertEquals(str(soup), "<![CDATA[foo]]>")
+        self.assertEquals(soup.find(text="foo"), "foo")
+        self.assertEquals(soup.contents[0], "foo")
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 10:39:56 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 10:39:56 -0500
commit	232311a2f682e59078012e5b05e382982862f627 (patch)
tree	60bd21949b54bdb5588ecce31a3bb89e40617692
parent	ae349fd47c627f8166526fed8906811707d2f4b2 (diff)
parent	f2532b1d63bd4a4d2be6ad9a4dce5eea03f43e7a (diff)