summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- beautifulsoup/__init__.py 8
-rw-r--r-- beautifulsoup/builder/__init__.py 31
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 6
-rw-r--r-- beautifulsoup/element.py 23
-rw-r--r-- tests/test_lxml.py 13
5 files changed, 56 insertions, 25 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 922005c..ca32589 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -106,11 +106,9 @@ class BeautifulSoup(Tag):
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
- If you encounter a self-closing tag, call handle_starttag and then
- handle_endtag, but note that the tag will not be displayed as a
- self-closing tag unless you also have your builder's
- isSelfClosingTag() implementation return True when passed the tag
- name.
+ If you encounter an empty-element tag (aka a self-closing tag,
+ like HTML's <br> tag), call handle_starttag and then
+ handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5c275d7..37e9c8a 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -12,16 +12,37 @@ class TreeBuilder(Entities):
"""Turn a document into a Beautiful Soup object tree."""
assume_html = False
+ preserve_whitespace_tags = set()
+ empty_element_tags = None # None means this builder designates no list of
+ # empty-element tags: a tag is then considered an empty-element tag
+ # if and only if it has no contents.
def __init__(self):
self.soup = None
- def isSelfClosingTag(self, name):
- return name in self.self_closing_tags
-
def reset(self):
pass
+ def can_be_empty_element(self, tag_name):
+ """Might a tag with this name be an empty-element tag?
+
+ The final markup may or may not actually present this tag as
+ self-closing.
+
+ For instance: an HTML builder does not consider a <p> tag to
+ be an empty-element tag (it's not in empty_element_tags). This
+ means an empty <p> tag will be presented as "<p></p>", not
+ "<p />".
+
+ The default builder has no opinion about which tags are
+ empty-element tags, so a tag will be presented as an
+ empty-element tag if and only if it has no contents.
+ "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+ be left alone.
+ """
+ if self.empty_element_tags is None:
+ return True
+ return tag_name in self.empty_element_tags
+
def feed(self, markup):
raise NotImplementedError()
@@ -101,8 +122,8 @@ class HTMLTreeBuilder(TreeBuilder):
assume_html = True
preserve_whitespace_tags = set(['pre', 'textarea'])
- self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
# Used by set_up_substitutions to detect the charset in a META tag
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 5e053b1..0cc9e51 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -7,9 +7,6 @@ import types
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
- preserve_whitespace_tags = set()
- self_closing_tags = set()
-
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -55,9 +52,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
- if len(completed_tag.contents) == 0:
- completed_tag.isSelfClosing = True
-
self.soup.handle_endtag(name)
def pi(self, target, data):
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 5793d59..c0c28d4 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -429,7 +429,6 @@ class Tag(PageElement, Entities):
# chunks be garbage-collected.
self.parserClass = parser.__class__
self.name = name
- self.isSelfClosing = builder.isSelfClosingTag(name)
if attrs == None:
attrs = []
if isinstance(attrs, types.DictType):
@@ -447,6 +446,26 @@ class Tag(PageElement, Entities):
# Set up any substitutions, such as the charset in a META tag.
self.contains_substitutions = builder.set_up_substitutions(self)
+ self.can_be_empty_element = (builder.can_be_empty_element(name))
+
+ @property
+ def is_empty_element(self):
+ """Is this tag an empty-element tag? (aka a self-closing tag)
+
+ A tag that has contents is never an empty-element tag.
+
+ A tag that has no contents may or may not be an empty-element
+ tag. It depends on the builder used to create the tag. If the
+ builder has a designated list of empty-element tags, then only
+ a tag whose name shows up in that list is considered an
+ empty-element tag.
+
+ If the builder has no designated list of empty-element tags,
+ then any tag with no contents is an empty-element tag.
+ """
+ return len(self.contents) == 0 and self.can_be_empty_element
+ isSelfClosing = is_empty_element # BS3 name, kept as an alias
+
@property
def string(self):
@@ -624,7 +643,7 @@ class Tag(PageElement, Entities):
attrs.append(decoded)
close = ''
closeTag = ''
- if self.isSelfClosing:
+ if self.is_empty_element:
close = ' /'
else:
closeTag = '</%s>' % self.name
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index dd1c363..602fe05 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -492,15 +492,14 @@ class TestLXMLXMLBuilder(SoupTest):
def default_builder(self):
return LXMLTreeBuilderForXML()
- def test_self_closing_tag(self):
+ def test_empty_element_tag(self):
soup = self.soup("<p><iamselfclosing /></p>")
- self.assertTrue(soup.iamselfclosing.isSelfClosing)
+ self.assertTrue(soup.iamselfclosing.is_empty_element)
- def test_self_empty_tag_treated_as_self_closing(self):
+ def test_self_empty_tag_treated_as_empty_element(self):
soup = self.soup("<p><iamclosed></iamclosed></p>")
- self.assertTrue(soup.iamclosed.isSelfClosing)
+ self.assertTrue(soup.iamclosed.is_empty_element)
- def test_self_nonempty_tag_is_not_self_closing(self):
+ def test_self_nonempty_tag_is_not_empty_element(self):
soup = self.soup("<p><ihavecontents>contents</ihavecontents></p>")
- self.assertFalse(soup.ihavecontents.isSelfClosing)
-
+ self.assertFalse(soup.ihavecontents.is_empty_element)