summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 19
-rw-r--r-- beautifulsoup/builder/__init__.py 10
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 11
-rw-r--r-- tests/test_lxml.py 14
-rw-r--r-- tests/test_tree.py 16
5 files changed, 53 insertions, 17 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 96a9ed4..3fb4f36 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -74,6 +74,25 @@ Unicode character. There are no longer any smartQuotesTo or
convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
but the default is now to turn smart quotes into Unicode.)
+== CDATA sections are normal text, if they're understood at all. ==
+
+Currently, both HTML parsers ignore CDATA sections in markup:
+
+ <p><![CDATA[foo]]></p> => <p></p>
+
+A future version of html5lib will turn CDATA sections into text nodes,
+but only within tags like <svg> and <math>:
+
+ <svg><![CDATA[foo]]></svg> => <svg>foo</svg>
+
+The default XML parser (which uses lxml behind the scenes) turns CDATA
+sections into ordinary text nodes:
+
+ <p><![CDATA[foo]]></p> => <p>foo</p>
+
+In theory it's possible to preserve the CDATA sections when using the
+XML parser, but I don't see how to get it to work in practice.
+
= 3.1.0 =
A hybrid version that supports 2.4 and can be automatically converted
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index deaa613..9ffa9ef 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -28,12 +28,12 @@ class TreeBuilder(Entities):
The final markup may or may not actually present this tag as
self-closing.
- For instance: an HTML builder does not consider a <p> tag to
- be an empty-element tag (it's not in empty_element_tags). This
- means an empty <p> tag will be presented as "<p></p>", not
- "<p />".
+ For instance: an HTMLBuilder does not consider a <p> tag to be
+ an empty-element tag (it's not in
+ HTMLBuilder.empty_element_tags). This means an empty <p> tag
+ will be presented as "<p></p>", not "<p />".
- The default builder has no opinion about which tags are
+ The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index e431a62..9f4c0bd 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -11,7 +11,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
- return etree.XMLParser
+ return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
@@ -71,10 +71,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup.handle_data(content)
self.soup.endData(Comment)
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
-
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -82,5 +78,6 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
def default_parser(self):
return etree.HTMLParser
- def end(self, name):
- self.soup.handle_endtag(name)
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><body>%s</body></html>' % fragment
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 7d916da..88c866d 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -518,6 +518,20 @@ class TestLXMLXMLBuilder(SoupTest):
def default_builder(self):
return LXMLTreeBuilderForXML()
+ def test_cdata_becomes_text(self):
+ # LXML sends CData sections as 'data' events, so we can't
+ # create special CData objects for them. We have to use
+ # NavigableString. I would like to fix this, but it's not a
+ # very high priority.
+ markup = "<foo><![CDATA[iamcdata]]></foo>"
+ soup = self.soup(markup)
+ cdata = soup.foo.contents[0]
+ self.assertEquals(cdata.__class__.__name__, 'NavigableString')
+
+
+ def test_can_handle_invalid_xml(self):
+ self.assertSoupEquals("<a><b>", "<a><b /></a>")
+
def test_empty_element_tag(self):
soup = self.soup("<p><iamselfclosing /></p>")
self.assertTrue(soup.iamselfclosing.is_empty_element)
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 40643dc..6f00716 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -13,7 +13,7 @@ import copy
import cPickle as pickle
import re
from beautifulsoup import BeautifulSoup
-from beautifulsoup.element import SoupStrainer, Tag
+from beautifulsoup.element import CData, SoupStrainer, Tag
from beautifulsoup.testing import SoupTest
class TreeTest(SoupTest):
@@ -883,9 +883,15 @@ class TestEncoding(SoupTest):
soup.b.encode("utf-8"), html.encode("utf-8"))
-class TestEmptyElementTags(SoupTest):
+class TestNavigableStringSubclasses(SoupTest):
- @property
- def default_builder(self):
- return LXMLTreeBuilderForXML()
+ def test_cdata(self):
+ # None of the current builders turn CDATA sections into CData
+ # objects, but you can create them manually.
+ soup = self.soup("")
+ cdata = CData("foo")
+ soup.insert(1, cdata)
+ self.assertEquals(str(soup), "<![CDATA[foo]]>")
+ self.assertEquals(soup.find(text="foo"), "foo")
+ self.assertEquals(soup.contents[0], "foo")