diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 10:39:56 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 10:39:56 -0500 |
commit | 232311a2f682e59078012e5b05e382982862f627 (patch) | |
tree | 60bd21949b54bdb5588ecce31a3bb89e40617692 | |
parent | ae349fd47c627f8166526fed8906811707d2f4b2 (diff) | |
parent | f2532b1d63bd4a4d2be6ad9a4dce5eea03f43e7a (diff) |
I couldn't get the XML parser to parse CDATA as CData objects, but at least I documented the current behavior.
-rw-r--r-- | CHANGELOG | 19 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 10 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 11 | ||||
-rw-r--r-- | tests/test_lxml.py | 14 | ||||
-rw-r--r-- | tests/test_tree.py | 16 |
5 files changed, 53 insertions, 17 deletions
@@ -74,6 +74,25 @@ Unicode character. There are no longer any smartQuotesTo or convert_entities arguments. (Unicode Dammit still has smart_quotes_to, but the default is now to turn smart quotes into Unicode.) +== CDATA sections are normal text, if they're understood at all. == + +Currently, both HTML parsers ignore CDATA sections in markup: + + <p><![CDATA[foo]]></p> => <p></p> + +A future version of html5lib will turn CDATA sections into text nodes, +but only within tags like <svg> and <math>: + + <svg><![CDATA[foo]]></svg> => <svg>foo</svg> + +The default XML parser (which uses lxml behind the scenes) turns CDATA +sections into ordinary text elements: + + <p><![CDATA[foo]]></p> => <p>foo</p> + +In theory it's possible to preserve the CDATA sections when using the +XML parser, but I don't see how to get it to work in practice. + = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index deaa613..9ffa9ef 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -28,12 +28,12 @@ class TreeBuilder(Entities): The final markup may or may not actually present this tag as self-closing. - For instance: an HTML builder does not consider a <p> tag to - be an empty-element tag (it's not in empty_element_tags). This - means an empty <p> tag will be presented as "<p></p>", not - "<p />". + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p />". - The default builder has no opinion about which tags are + The default implementation has no opinion about which tags are empty-element tags, so a tag will be presented as an empty-element tag if and only if it has no contents. 
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index e431a62..9f4c0bd 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -11,7 +11,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def default_parser(self): # This can either return a parser object or a class, which # will be instantiated with default arguments. - return etree.XMLParser + return etree.XMLParser(target=self, strip_cdata=False, recover=True) def __init__(self, parser=None, empty_element_tags=None): if empty_element_tags is not None: @@ -71,10 +71,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.handle_data(content) self.soup.endData(Comment) - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment - class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -82,5 +78,6 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): def default_parser(self): return etree.HTMLParser - def end(self, name): - self.soup.handle_endtag(name) + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><body>%s</body></html>' % fragment diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 7d916da..88c866d 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -518,6 +518,20 @@ class TestLXMLXMLBuilder(SoupTest): def default_builder(self): return LXMLTreeBuilderForXML() + def test_cdata_becomes_text(self): + # LXML sends CData sections as 'data' events, so we can't + # create special CData objects for them. We have to use + # NavigableString. I would like to fix this, but it's not a + # very high priority. 
+ markup = "<foo><![CDATA[iamcdata]]></foo>" + soup = self.soup(markup) + cdata = soup.foo.contents[0] + self.assertEquals(cdata.__class__.__name__, 'NavigableString') + + + def test_can_handle_invalid_xml(self): + self.assertSoupEquals("<a><b>", "<a><b /></a>") + def test_empty_element_tag(self): soup = self.soup("<p><iamselfclosing /></p>") self.assertTrue(soup.iamselfclosing.is_empty_element) diff --git a/tests/test_tree.py b/tests/test_tree.py index 40643dc..6f00716 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,7 +13,7 @@ import copy import cPickle as pickle import re from beautifulsoup import BeautifulSoup -from beautifulsoup.element import SoupStrainer, Tag +from beautifulsoup.element import CData, SoupStrainer, Tag from beautifulsoup.testing import SoupTest class TreeTest(SoupTest): @@ -883,9 +883,15 @@ class TestEncoding(SoupTest): soup.b.encode("utf-8"), html.encode("utf-8")) -class TestEmptyElementTags(SoupTest): +class TestNavigableStringSubclasses(SoupTest): - @property - def default_builder(self): - return LXMLTreeBuilderForXML() + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEquals(str(soup), "<![CDATA[foo]]>") + self.assertEquals(soup.find(text="foo"), "foo") + self.assertEquals(soup.contents[0], "foo") |