summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- TODO 17
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 6
-rw-r--r-- tests/test_html5lib.py 23
-rw-r--r-- tests/test_lxml.py 4
4 files changed, 48 insertions, 2 deletions
diff --git a/TODO b/TODO
index 9792743..ea32bbb 100644
--- a/TODO
+++ b/TODO
@@ -7,6 +7,23 @@ Bare ampersands should be converted to HTML entities upon output.
It should also be possible to convert certain Unicode characters to
HTML entities upon output.
+XML handling:
+
+The elementtree XMLParser has a strip_cdata argument that, when set to
+False, should allow Beautiful Soup to preserve CDATA sections instead
+of treating them as text. (This argument is also present for
+HTMLParser, but does nothing.)
+
+Later:
+
+Currently, html5lib converts CDATA sections into comments. An
+as-yet-unreleased version of html5lib changes the parser's handling of
+CDATA sections to allow CDATA sections in tags like <svg> and
+<math>. The HTML5TreeBuilder will need to be updated to create CData
+objects instead of Comment objects in this situation.
+
+
+
---
Here are some unit tests that fail with HTMLParser.
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 8336ab4..4e83bba 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -5,6 +5,9 @@ from beautifulsoup.builder import HTMLTreeBuilder
class LXMLTreeBuilder(HTMLTreeBuilder):
def __init__(self, parser_class=etree.HTMLParser):
+ # etree.HTMLParser's constructor has an argument strip_cdata,
+ # but it does nothing. CDATA sections will become text when
+ # passed through etree.HTMLParser.
self.parser = parser_class(target=self)
self.soup = None
@@ -21,6 +24,9 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
def end(self, name):
self.soup.handle_endtag(name)
+ def pi(self, target, data):
+ pass
+
def data(self, content):
self.soup.handle_data(content)
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index dada900..2d16bbb 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -1,4 +1,5 @@
from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder
+from beautifulsoup.element import Comment
from test_lxml import (
TestLXMLBuilder,
TestLXMLBuilderInvalidMarkup,
@@ -43,8 +44,19 @@ class TestHTML5Builder(TestLXMLBuilder):
self.assertSoupEquals("<p> </p>")
self.assertSoupEquals("<b> </b>")
- def test_cdata(self):
- print self.soup("<div><![CDATA[foo]]></div>")
+ def test_cdata_where_its_ok(self):
+ # In html5lib 0.9.0, all CDATA sections are converted into
+ # comments. In a later version (unreleased as of this
+ # writing), CDATA sections in tags like <svg> and <math> will
+ # be preserved. BUT, I'm not sure how Beautiful Soup needs to
+ # adjust to transform this preservation into the construction
+ # of a BS CData object.
+ markup = "<svg><![CDATA[foobar]]>"
+
+ # Eventually we should be able to do a find(text="foobar") and
+ # get a CData object.
+ self.assertSoupEquals(markup, "<svg><!--[CDATA[foobar]]--></svg>")
+
class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
"""See `BuilderInvalidMarkupSmokeTest`."""
@@ -76,6 +88,13 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
markup = "<p>one<!DOCTYPE foobar>two</p>"
self.assertSoupEquals(markup, "<p>onetwo</p>")
+ def test_cdata_where_it_doesnt_belong(self):
+ # Random CDATA sections are converted into comments.
+ markup = "<div><![CDATA[foo]]>"
+ soup = self.soup(markup)
+ data = soup.find(text="[CDATA[foo]]")
+ self.assertEquals(data.__class__, Comment)
+
def test_foo(self):
isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
soup = self.soup(isolatin)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 9a65f6a..207d141 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -273,3 +273,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
markup = "<p>one<!DOCTYPE foobar>two</p>"
self.assertSoupEquals(markup)
+ def test_cdata_where_it_doesnt_belong(self):
+ #CDATA sections are ignored.
+ markup = "<div><![CDATA[foo]]>"
+ self.assertSoupEquals(markup, "<div></div>")