diff options
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 37 | ||||
-rw-r--r-- | tests/test_lxml.py | 21 |
2 files changed, 51 insertions, 7 deletions
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index afdf760..72e5913 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -2,15 +2,28 @@ from lxml import etree from beautifulsoup.element import Comment, Doctype from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder from beautifulsoup.dammit import UnicodeDammit +import types class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser - def __init__(self, parser_class=None): - # strip_cdata only has an effect on XMLParser. HTMLParser's - # constructor accepts strip_cdata but ignores it. - parser_class = parser_class or self.DEFAULT_PARSER_CLASS - self.parser = parser_class(target=self, strip_cdata=False) + preserve_whitespace_tags = set() + self_closing_tags = set() + + @property + def default_parser(self): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + return etree.XMLParser + + def __init__(self, parser=None): + if parser is None: + # Use the default parser. + parser = self.default_parser + if callable(parser): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False) + self.parser = parser self.soup = None def prepare_markup(self, markup, user_specified_encoding=None, @@ -38,6 +51,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.handle_starttag(name, attrs) def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + if len(completed_tag.contents) == 0: + completed_tag.isSelfClosing = True + self.soup.handle_endtag(name) def pi(self, target, data): @@ -62,6 +80,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): return u'<html><body>%s</body></html>' % fragment -class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder): +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + @property + def default_parser(self): + return etree.HTMLParser - DEFAULT_PARSER_CLASS = etree.HTMLParser + def end(self, name): + self.soup.handle_endtag(name) diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 8670806..58d16ff 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -483,3 +483,24 @@ class TestLXMLBuilderEncodingConversion(SoupTest): self.assertEquals( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) + + +from beautifulsoup.builder.lxml_builder import LXMLTreeBuilderForXML +class TestLXMLXMLBuilder(SoupTest): + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() + + def test_self_closing_tag(self): + soup = self.soup("<p><iamselfclosing /></p>") + self.assertTrue(soup.iamselfclosing.isSelfClosing) + + def test_self_empty_tag_treated_as_self_closing(self): + soup = self.soup("<p><iamclosed></iamclosed></p>") + self.assertFalse(soup.iamclosed.isSelfClosing) + + def test_self_nonempty_tag_is_not_self_closing(self): + soup = self.soup("<p><ihavecontents>contents</ihavecontents></p>") + self.assertFalse(soup.ihavecontents.isSelfClosing) + |