From b3ba2b97b2d4c4e5559baadc96f1844753b38df4 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 20 Feb 2011 10:04:35 -0500 Subject: Made the XML treebuilder able to handle basic invalid XML. --- beautifulsoup/builder/__init__.py | 10 +++++----- beautifulsoup/builder/lxml_builder.py | 11 ++++------- tests/test_lxml.py | 3 +++ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index deaa613..9ffa9ef 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -28,12 +28,12 @@ class TreeBuilder(Entities): The final markup may or may not actually present this tag as self-closing. - For instance: an HTML builder does not consider a

<p> tag to - be an empty-element tag (it's not in empty_element_tags). This - means an empty <p> tag will be presented as "<p></p>", not - "<p />". + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p />

". - The default builder has no opinion about which tags are + The default implementation has no opinion about which tags are empty-element tags, so a tag will be presented as an empty-element tag if and only if it has no contents. "" will become "", and "bar" will diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index e431a62..9f4c0bd 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -11,7 +11,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def default_parser(self): # This can either return a parser object or a class, which # will be instantiated with default arguments. - return etree.XMLParser + return etree.XMLParser(target=self, strip_cdata=False, recover=True) def __init__(self, parser=None, empty_element_tags=None): if empty_element_tags is not None: @@ -71,10 +71,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.handle_data(content) self.soup.endData(Comment) - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'%s' % fragment - class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -82,5 +78,6 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): def default_parser(self): return etree.HTMLParser - def end(self, name): - self.soup.handle_endtag(name) + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 7d916da..c178457 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -518,6 +518,9 @@ class TestLXMLXMLBuilder(SoupTest): def default_builder(self): return LXMLTreeBuilderForXML() + def test_can_handle_invalid_xml(self): + self.assertSoupEquals("", "") + def test_empty_element_tag(self): soup = self.soup("

") self.assertTrue(soup.iamselfclosing.is_empty_element) -- cgit v1.2.3