diff options
-rw-r--r-- | TODO | 17 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 6 | ||||
-rw-r--r-- | tests/test_html5lib.py | 23 | ||||
-rw-r--r-- | tests/test_lxml.py | 4 |
4 files changed, 48 insertions, 2 deletions
@@ -7,6 +7,23 @@ Bare ampersands should be converted to HTML entities upon output. It should also be possible to convert certain Unicode characters to HTML entities upon output. +XML handling: + +The elementtree XMLParser has a strip_cdata argument that, when set to +False, should allow Beautiful Soup to preserve CDATA sections instead +of treating them as text. (This argument is also present for +HTMLParser, but does nothing.) + +Later: + +Currently, html5lib converts CDATA sections into comments. An +as-yet-unreleased version of html5lib changes the parser's handling of +CDATA sections to allow CDATA sections in tags like <svg> and +<math>. The HTML5TreeBuilder will need to be updated to create CData +objects instead of Comment objects in this situation. + + + --- Here are some unit tests that fail with HTMLParser. diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 8336ab4..4e83bba 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -5,6 +5,9 @@ from beautifulsoup.builder import HTMLTreeBuilder class LXMLTreeBuilder(HTMLTreeBuilder): def __init__(self, parser_class=etree.HTMLParser): + # etree.HTMLParser's constructor has an argument strip_cdata, + # but it does nothing. CDATA sections will become text when + # passed through etree.HTMLParser. 
self.parser = parser_class(target=self) self.soup = None @@ -21,6 +24,9 @@ class LXMLTreeBuilder(HTMLTreeBuilder): def end(self, name): self.soup.handle_endtag(name) + def pi(self, target, data): + pass + def data(self, content): self.soup.handle_data(content) diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index dada900..2d16bbb 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -1,4 +1,5 @@ from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder +from beautifulsoup.element import Comment from test_lxml import ( TestLXMLBuilder, TestLXMLBuilderInvalidMarkup, @@ -43,8 +44,19 @@ class TestHTML5Builder(TestLXMLBuilder): self.assertSoupEquals("<p> </p>") self.assertSoupEquals("<b> </b>") - def test_cdata(self): - print self.soup("<div><![CDATA[foo]]></div>") + def test_cdata_where_its_ok(self): + # In html5lib 0.9.0, all CDATA sections are converted into + # comments. In a later version (unreleased as of this + # writing), CDATA sections in tags like <svg> and <math> will + # be preserved. BUT, I'm not sure how Beautiful Soup needs to + # adjust to transform this preservation into the construction + # of a BS CData object. + markup = "<svg><![CDATA[foobar]]>" + + # Eventually we should be able to do a find(text="foobar") and + # get a CData object. + self.assertSoupEquals(markup, "<svg><!--[CDATA[foobar]]--></svg>") + class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @@ -76,6 +88,13 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup, "<p>onetwo</p>") + def test_cdata_where_it_doesnt_belong(self): + # Random CDATA sections are converted into comments. 
+ markup = "<div><![CDATA[foo]]>" + soup = self.soup(markup) + data = soup.find(text="[CDATA[foo]]") + self.assertEquals(data.__class__, Comment) + def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" soup = self.soup(isolatin) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 9a65f6a..207d141 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -273,3 +273,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup) + def test_cdata_where_it_doesnt_belong(self): + #CDATA sections are ignored. + markup = "<div><![CDATA[foo]]>" + self.assertSoupEquals(markup, "<div></div>") |