summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- beautifulsoup/builder/lxml_builder.py | 37
-rw-r--r-- tests/test_lxml.py | 21
2 files changed, 51 insertions, 7 deletions
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index afdf760..72e5913 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -2,15 +2,28 @@ from lxml import etree
from beautifulsoup.element import Comment, Doctype
from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
from beautifulsoup.dammit import UnicodeDammit
+import types
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
- def __init__(self, parser_class=None):
- # strip_cdata only has an effect on XMLParser. HTMLParser's
- # constructor accepts strip_cdata but ignores it.
- parser_class = parser_class or self.DEFAULT_PARSER_CLASS
- self.parser = parser_class(target=self, strip_cdata=False)
+ preserve_whitespace_tags = set()
+ self_closing_tags = set()
+
+ @property
+ def default_parser(self):
+ # This can either return a parser object or a class, which
+ # will be instantiated with default arguments.
+ return etree.XMLParser
+
+ def __init__(self, parser=None):
+ if parser is None:
+ # Use the default parser.
+ parser = self.default_parser
+ if callable(parser):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, strip_cdata=False)
+ self.parser = parser
self.soup = None
def prepare_markup(self, markup, user_specified_encoding=None,
@@ -38,6 +51,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup.handle_starttag(name, attrs)
def end(self, name):
+ self.soup.endData()
+ completed_tag = self.soup.tagStack[-1]
+ if len(completed_tag.contents) == 0:
+ completed_tag.isSelfClosing = True
+
self.soup.handle_endtag(name)
def pi(self, target, data):
@@ -62,6 +80,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return u'<html><body>%s</body></html>' % fragment
-class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder):
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+
+ @property
+ def default_parser(self):
+ return etree.HTMLParser
- DEFAULT_PARSER_CLASS = etree.HTMLParser
+ def end(self, name):
+ self.soup.handle_endtag(name)
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 8670806..58d16ff 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -483,3 +483,24 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
self.assertEquals(
soup.encode('utf-8'),
self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))
+
+
+from beautifulsoup.builder.lxml_builder import LXMLTreeBuilderForXML
+class TestLXMLXMLBuilder(SoupTest):
+
+ @property
+ def default_builder(self):
+ return LXMLTreeBuilderForXML()
+
+ def test_self_closing_tag(self):
+ soup = self.soup("<p><iamselfclosing /></p>")
+ self.assertTrue(soup.iamselfclosing.isSelfClosing)
+
+ def test_self_empty_tag_treated_as_self_closing(self):
+ soup = self.soup("<p><iamclosed></iamclosed></p>")
+ self.assertTrue(soup.iamclosed.isSelfClosing)  # end() marks any tag with no contents as self-closing
+
+ def test_self_nonempty_tag_is_not_self_closing(self):
+ soup = self.soup("<p><ihavecontents>contents</ihavecontents></p>")
+ self.assertFalse(soup.ihavecontents.isSelfClosing)
+