Greatly improved the handling of empty-element tags.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 09:54:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 09:54:42 -0500
commit: ae349fd47c627f8166526fed8906811707d2f4b2 (patch)
tree: 116edd8c1d9a7cf6348f784162fd2291608833c2
parent: 158e76fd3e1005f6f5f932414cb741083d114cb6 (diff)
parent: 9f437ea591aeaf16d593350baf081315e56a8b73 (diff)
8 files changed, 152 insertions, 29 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 5d13a6d..96a9ed4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -17,6 +17,10 @@ work. Here are the renames:
  * findPreviousSibling -> find_previous_sibling
  * findPreviousSiblings -> find_previous_siblings
 
+Some attributes have also been renamed:
+
+ * Tag.isSelfClosing -> Tag.is_empty_element
+
 == Generators are now properties ==
 
 The generators have been given more sensible (and PEP 8-compliant)
@@ -51,6 +55,18 @@ and nothing else, then A.string is the same as B.string. So:
 
 The value of a.string used to be None, and now it's "foo".
 
+== Empty-element tags ==
+
+Beautiful Soup's handling of empty-element tags (aka self-closing
+tags) has been improved, especially when parsing XML. Previously you
+had to explicitly specify a list of empty-element tags when parsing
+XML. You can still do that, but if you don't, Beautiful Soup now
+considers any empty tag to be an empty-element tag.
+
+Whether a tag is an empty-element tag is now decided each time the tag
+is examined or rendered, not once at parse time. If you add a child to
+an empty-element tag, it stops being an empty-element tag.
+
 == Entities are always converted to Unicode ==
 
 An HTML or XML entity is always converted into the corresponding
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 922005c..ca32589 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -106,11 +106,9 @@ class BeautifulSoup(Tag):
     able to build a tree using 'start tag' events, 'end tag' events,
     'data' events, and "done with data" events.
 
-    If you encounter a self-closing tag, call handle_starttag and then
-    handle_endtag, but note that the tag will not be displayed as a
-    self-closing tag unless you also have your builder's
-    isSelfClosingTag() implementation return True when passed the tag
-    name.
+    If you encounter an empty-element tag (aka a self-closing tag,
+    like HTML's <br> tag), call handle_starttag and then
+    handle_endtag.
     """
     ROOT_TAG_NAME = u'[document]'
 
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5c275d7..deaa613 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -12,16 +12,37 @@ class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
     assume_html = False
+    preserve_whitespace_tags = set()
+    empty_element_tags = None # None means no designated list: any tag
+                              # with no contents is an empty-element tag.
 
     def __init__(self):
         self.soup = None
 
-    def isSelfClosingTag(self, name):
-        return name in self.self_closing_tags
-
     def reset(self):
         pass
 
+    def can_be_empty_element(self, tag_name):
+        """Might a tag with this name be an empty-element tag?
+
+        The final markup may or may not actually present this tag as
+        self-closing.
+
+        For instance: an HTML builder does not consider a <p> tag to
+        be an empty-element tag (it's not in empty_element_tags). This
+        means an empty <p> tag will be presented as "<p></p>", not
+        "<p />".
+
+        The default builder has no opinion about which tags are
+        empty-element tags, so a tag will be presented as an
+        empty-element tag if and only if it has no contents.
+        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        be left alone.
+        """
+        if self.empty_element_tags is None:
+            return True
+        return tag_name in self.empty_element_tags
+
     def feed(self, markup):
         raise NotImplementedError()
 
@@ -95,14 +116,14 @@ class SAXTreeBuilder(TreeBuilder):
 class HTMLTreeBuilder(TreeBuilder):
     """This TreeBuilder knows facts about HTML.
 
-    Such as which tags are self-closing tags.
+    Such as which tags are empty-element tags.
     """
 
     assume_html = True
 
     preserve_whitespace_tags = set(['pre', 'textarea'])
-    self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                            'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+                              'spacer', 'link', 'frame', 'base'])
 
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 72e5913..e431a62 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -7,16 +7,15 @@ import types
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
-    preserve_whitespace_tags = set()
-    self_closing_tags = set()
-
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
         # will be instantiated with default arguments.
         return etree.XMLParser
 
-    def __init__(self, parser=None):
+    def __init__(self, parser=None, empty_element_tags=None):
+        if empty_element_tags is not None:
+            self.empty_element_tags = set(empty_element_tags)
         if parser is None:
             # Use the default parser.
             parser = self.default_parser
@@ -53,9 +52,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def end(self, name):
         self.soup.endData()
         completed_tag = self.soup.tagStack[-1]
-        if len(completed_tag.contents) == 0:
-            completed_tag.isSelfClosing = True
-
         self.soup.handle_endtag(name)
 
     def pi(self, target, data):
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 5793d59..a70813d 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -429,7 +429,6 @@ class Tag(PageElement, Entities):
         # chunks be garbage-collected.
         self.parserClass = parser.__class__
         self.name = name
-        self.isSelfClosing = builder.isSelfClosingTag(name)
         if attrs == None:
             attrs = []
         if isinstance(attrs, types.DictType):
@@ -447,6 +446,26 @@ class Tag(PageElement, Entities):
         # Set up any substitutions, such as the charset in a META tag.
         self.contains_substitutions = builder.set_up_substitutions(self)
 
+        self.can_be_empty_element = builder.can_be_empty_element(name)
+
+    @property
+    def is_empty_element(self):
+        """Is this tag an empty-element tag? (aka a self-closing tag)
+
+        A tag that has contents is never an empty-element tag.
+
+        A tag that has no contents may or may not be an empty-element
+        tag. It depends on the builder used to create the tag. If the
+        builder has a designated list of empty-element tags, then only
+        a tag whose name shows up in that list is considered an
+        empty-element tag.
+
+        If the builder has no designated list of empty-element tags,
+        then any tag with no contents is an empty-element tag.
+        """
+        return len(self.contents) == 0 and self.can_be_empty_element
+    isSelfClosing = is_empty_element # BS3
+
 
     @property
     def string(self):
@@ -624,7 +643,7 @@ class Tag(PageElement, Entities):
                 attrs.append(decoded)
         close = ''
         closeTag = ''
-        if self.isSelfClosing:
+        if self.is_empty_element:
             close = ' /'
         else:
             closeTag = '</%s>' % self.name
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 336f9a5..021c603 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -91,6 +91,9 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
             ('<table><tbody><tr></tr></tbody></table>'
              '<table><tbody><tr id="nested"></tr></tbody></table>'))
 
+    def test_empty_element_tag_with_contents(self):
+        self.assertSoupEquals("<br>foo</br>", "<br />foo<br />")
+
     def test_doctype_in_body(self):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
         self.assertSoupEquals(markup, "<p>onetwo</p>")
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 58d16ff..7d916da 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -33,14 +33,19 @@ class TestLXMLBuilder(SoupTest):
             "<a><B><Cd><EFG></efg></CD></b></A>",
             "<a><b><cd><efg></efg></cd></b></a>")
 
-    def test_self_closing(self):
-        # HTML's self-closing tags are recognized as such.
+    def test_empty_element(self):
+        # HTML's empty-element tags are recognized as such.
         self.assertSoupEquals(
             "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>")
 
         self.assertSoupEquals(
             "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
 
+    def test_empty_tag_thats_not_an_empty_element_tag(self):
+        # A tag that is empty but not an HTML empty-element tag
+        # is not presented as an empty-element tag.
+        self.assertSoupEquals("<p>", "<p></p>")
+
     def test_comment(self):
         # Comments are represented as Comment objects.
         markup = "<p>foo<!--foobar-->baz</p>"
@@ -303,6 +308,18 @@ class TestLXMLBuilder(SoupTest):
         str = soup.p.string
         #self.assertEquals(str.encode("utf-8"), expected)
 
+    def test_br_tag_is_empty_element(self):
+        """A <br> tag is designated as an empty-element tag."""
+        soup = self.soup("<br></br>")
+        self.assertTrue(soup.br.is_empty_element)
+        self.assertEquals(str(soup.br), "<br />")
+
+    def test_p_tag_is_not_empty_element(self):
+        """A <p> tag is not designated as an empty-element tag."""
+        soup = self.soup("<p />")
+        self.assertFalse(soup.p.is_empty_element)
+        self.assertEquals(str(soup.p), "<p></p>")
+
 
 class TestLXMLBuilderInvalidMarkup(SoupTest):
     """Tests of invalid markup for the LXML tree builder.
@@ -351,6 +368,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
             '<table><tr><table><tr id="nested">',
             '<table><tr><table><tr id="nested"></tr></table></tr></table>')
 
+    def test_empty_element_tag_with_contents(self):
+        self.assertSoupEquals("<br>foo</br>", "<br />foo")
+
     def test_doctype_in_body(self):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
         self.assertSoupEquals(markup)
@@ -487,20 +507,53 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
 
 from beautifulsoup.builder.lxml_builder import LXMLTreeBuilderForXML
 class TestLXMLXMLBuilder(SoupTest):
+    """Test XML-specific parsing behavior.
+
+    Most of the tests use HTML as an example, since Beautiful Soup is
+    mainly an HTML parser. This test suite is a base for XML-specific
+    tree builders.
+    """
 
     @property
     def default_builder(self):
         return LXMLTreeBuilderForXML()
 
-    def test_self_closing_tag(self):
+    def test_empty_element_tag(self):
         soup = self.soup("<p><iamselfclosing /></p>")
-        self.assertTrue(soup.iamselfclosing.isSelfClosing)
+        self.assertTrue(soup.iamselfclosing.is_empty_element)
 
-    def test_self_empty_tag_treated_as_self_closing(self):
+    def test_self_empty_tag_treated_as_empty_element(self):
         soup = self.soup("<p><iamclosed></iamclosed></p>")
-        self.assertFalse(soup.iamclosed.isSelfClosing)
+        self.assertTrue(soup.iamclosed.is_empty_element)
 
-    def test_self_nonempty_tag_is_not_self_closing(self):
+    def test_self_nonempty_tag_is_not_empty_element(self):
         soup = self.soup("<p><ihavecontents>contents</ihavecontents></p>")
-        self.assertFalse(soup.ihavecontents.isSelfClosing)
-
+        self.assertFalse(soup.ihavecontents.is_empty_element)
+
+    def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self):
+        soup = self.soup("<bar />")
+        self.assertTrue(soup.bar.is_empty_element)
+        soup.bar.insert(1, "Contents")
+        self.assertFalse(soup.bar.is_empty_element)
+        self.assertEquals(str(soup), "<bar>Contents</bar>")
+
+    def test_designated_empty_element_tag_has_no_closing_tag(self):
+        builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
+        soup = BeautifulSoup(builder=builder, markup="<bar></bar>")
+        self.assertTrue(soup.bar.is_empty_element)
+        self.assertEquals(str(soup), "<bar />")
+
+    def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self):
+        builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
+
+        soup = BeautifulSoup(builder=builder, markup="<foo />")
+        self.assertFalse(soup.foo.is_empty_element)
+        self.assertEquals(str(soup), "<foo></foo>")
+
+    def test_designated_empty_element_tag_does_not_change_parser_behavior(self):
+        # The designated list of empty-element tags only affects how
+        # empty tags are presented. It does not affect how tags are
+        # parsed--that's the parser's job.
+        builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
+        soup = BeautifulSoup(builder=builder, markup="<bar>contents</bar>")
+        self.assertEquals(str(soup), "<bar>contents</bar>")
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 233cb3c..40643dc 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -620,6 +620,15 @@ class TestTreeModification(SoupTest):
         self.assertEqual(the.next, c_tag)
         self.assertEqual(c_tag.previous, the)
 
+    def test_insert_works_on_empty_element_tag(self):
+        # This is a little strange, since most HTML parsers don't allow
+        # markup like this to come through. But in general, we don't
+        # know what the parser would or wouldn't have allowed, so
+        # I'm letting this succeed for now.
+        soup = self.soup("<br />")
+        soup.br.insert(1, "Contents")
+        self.assertEquals(str(soup.br), "<br>Contents</br>")
+
     def test_replace_with(self):
         soup = self.soup(
                 "<p>There's <b>no</b> business like <b>show</b> business</p>")
@@ -872,3 +881,11 @@ class TestEncoding(SoupTest):
         soup = self.soup(html)
         self.assertEquals(
             soup.b.encode("utf-8"), html.encode("utf-8"))
+
+
+class TestEmptyElementTags(SoupTest):
+
+    @property
+    def default_builder(self):
+        return LXMLTreeBuilderForXML()
+
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 09:54:42 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-20 09:54:42 -0500
commit	ae349fd47c627f8166526fed8906811707d2f4b2 (patch)
tree	116edd8c1d9a7cf6348f784162fd2291608833c2
parent	158e76fd3e1005f6f5f932414cb741083d114cb6 (diff)
parent	9f437ea591aeaf16d593350baf081315e56a8b73 (diff)