From 4191d5ff45015c6fac1db0bbdd7b3fcaff234424 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 13 Feb 2011 18:04:03 -0500 Subject: Clarified lxml's behavior w/r/t CDATA sections. --- beautifulsoup/builder/lxml_builder.py | 4 ++-- tests/test_lxml.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 4e83bba..86ac183 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -6,8 +6,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder): def __init__(self, parser_class=etree.HTMLParser): # etree.HTMLParser's constructor has an argument strip_cdata, - # but it does nothing. CDATA sections will become text when - # passed through etree.HTMLParser. + # but it does nothing. CDATA sections are always stripped when + # passed through HTMLParser. self.parser = parser_class(target=self) self.soup = None diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 207d141..8f36b41 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -196,16 +196,17 @@ class TestLXMLBuilder(SoupTest): soup = self.soup("  ") self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + def test_cdata_where_its_ok(self): + # lxml strips CDATA sections, no matter where they occur. + markup = "foobar" + self.assertSoupEquals(markup, "") + # Tests below this line need work. #def test_doctype(self): # xml = 'foo

' # self.assertSoupEquals(xml) - - #def test_cdata(self): - # print self.soup("
") - def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") -- cgit v1.2.3