4 files changed, 48 insertions, 2 deletions
diff --git a/TODO b/TODO
index 9792743..ea32bbb 100644
--- a/TODO
+++ b/TODO
@@ -7,6 +7,23 @@ Bare ampersands should be converted to HTML entities upon output.
 It should also be possible to convert certain Unicode characters to
 HTML entities upon output.
 
+XML handling:
+
+The lxml etree XMLParser has a strip_cdata argument that, when set to
+False, should allow Beautiful Soup to preserve CDATA sections instead
+of treating them as text. (This argument is also present for
+HTMLParser, but does nothing.)
+
+Later:
+
+Currently, html5lib converts CDATA sections into comments. An
+as-yet-unreleased version of html5lib changes the parser's handling of
+CDATA sections to allow CDATA sections in tags like <svg> and
+<math>. The HTML5TreeBuilder will need to be updated to create CData
+objects instead of Comment objects in this situation.
+
+
+
 ---
 
 Here are some unit tests that fail with HTMLParser.
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 8336ab4..4e83bba 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -5,6 +5,9 @@ from beautifulsoup.builder import HTMLTreeBuilder
 class LXMLTreeBuilder(HTMLTreeBuilder):
 
     def __init__(self, parser_class=etree.HTMLParser):
+        # etree.HTMLParser's constructor has an argument strip_cdata,
+        # but it does nothing. CDATA sections will become text when
+        # passed through etree.HTMLParser.
         self.parser = parser_class(target=self)
         self.soup = None
 
@@ -21,6 +24,9 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
     def end(self, name):
         self.soup.handle_endtag(name)
 
+    def pi(self, target, data):
+        pass
+
     def data(self, content):
         self.soup.handle_data(content)
 
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index dada900..2d16bbb 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -1,4 +1,5 @@
 from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder
+from beautifulsoup.element import Comment
 from test_lxml import (
     TestLXMLBuilder,
     TestLXMLBuilderInvalidMarkup,
@@ -43,8 +44,19 @@ class TestHTML5Builder(TestLXMLBuilder):
         self.assertSoupEquals("<p>   </p>")
         self.assertSoupEquals("<b>   </b>")
 
-    def test_cdata(self):
-        print self.soup("<div><![CDATA[foo]]></div>")
+    def test_cdata_where_its_ok(self):
+        # In html5lib 0.9.0, all CDATA sections are converted into
+        # comments.  In a later version (unreleased as of this
+        # writing), CDATA sections in tags like <svg> and <math> will
+        # be preserved. BUT, I'm not sure how Beautiful Soup needs to
+        # adjust to transform this preservation into the construction
+        # of a BS CData object.
+        markup = "<svg><![CDATA[foobar]]>"
+
+        # Eventually we should be able to do a find(text="foobar") and
+        # get a CData object.
+        self.assertSoupEquals(markup, "<svg><!--[CDATA[foobar]]--></svg>")
+
 
 class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
     """See `BuilderInvalidMarkupSmokeTest`."""
@@ -76,6 +88,13 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
         self.assertSoupEquals(markup, "<p>onetwo</p>")
 
+    def test_cdata_where_it_doesnt_belong(self):
+        # Random CDATA sections are converted into comments.
+        markup = "<div><![CDATA[foo]]>"
+        soup = self.soup(markup)
+        data = soup.find(text="[CDATA[foo]]")
+        self.assertEquals(data.__class__, Comment)
+
     def test_foo(self):
         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
         soup = self.soup(isolatin)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 9a65f6a..207d141 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -273,3 +273,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
         self.assertSoupEquals(markup)
 
+    def test_cdata_where_it_doesnt_belong(self):
+        # lxml's HTML parser drops CDATA sections from the output entirely.
+        markup = "<div><![CDATA[foo]]>"
+        self.assertSoupEquals(markup, "<div></div>")