diff options
-rw-r--r-- | CHANGELOG | 16 |
-rw-r--r-- | beautifulsoup/__init__.py | 8 |
-rw-r--r-- | beautifulsoup/builder/__init__.py | 33 |
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 10 |
-rw-r--r-- | beautifulsoup/element.py | 23 |
-rw-r--r-- | tests/test_html5lib.py | 3 |
-rw-r--r-- | tests/test_lxml.py | 71 |
-rw-r--r-- | tests/test_tree.py | 17 |
8 files changed, 152 insertions, 29 deletions
@@ -17,6 +17,10 @@ work. Here are the renames: * findPreviousSibling -> find_previous_sibling * findPreviousSiblings -> find_previous_siblings +Some attributes have also been renamed: + + * Tag.isSelfClosing -> Tag.is_empty_element + == Generators are now properties == The generators have been given more sensible (and PEP 8-compliant) @@ -51,6 +55,18 @@ and nothing else, then A.string is the same as B.string. So: The value of a.string used to be None, and now it's "foo". +== Empty-element tags == + +Beautiful Soup's handling of empty-element tags (aka self-closing +tags) has been improved, especially when parsing XML. Previously you +had to explicitly specify a list of empty-element tags when parsing +XML. You can still do that, but if you don't, Beautiful Soup now +considers any empty tag to be an empty-element tag. + +The determination of empty-element-ness is now made at runtime rather +than parse time. If you add a child to an empty-element tag, it stops +being an empty-element tag. + == Entities are always converted to Unicode == An HTML or XML entity is always converted into the corresponding diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 922005c..ca32589 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -106,11 +106,9 @@ class BeautifulSoup(Tag): able to build a tree using 'start tag' events, 'end tag' events, 'data' events, and "done with data" events. - If you encounter a self-closing tag, call handle_starttag and then - handle_endtag, but note that the tag will not be displayed as a - self-closing tag unless you also have your builder's - isSelfClosingTag() implementation return True when passed the tag - name. + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's <br> tag), call handle_starttag and then + handle_endtag. 
""" ROOT_TAG_NAME = u'[document]' diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 5c275d7..deaa613 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -12,16 +12,37 @@ class TreeBuilder(Entities): """Turn a document into a Beautiful Soup object tree.""" assume_html = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. def __init__(self): self.soup = None - def isSelfClosingTag(self, name): - return name in self.self_closing_tags - def reset(self): pass + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTML builder does not consider a <p> tag to + be an empty-element tag (it's not in empty_element_tags). This + means an empty <p> tag will be presented as "<p></p>", not + "<p />". + + The default builder has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + def feed(self, markup): raise NotImplementedError() @@ -95,14 +116,14 @@ class SAXTreeBuilder(TreeBuilder): class HTMLTreeBuilder(TreeBuilder): """This TreeBuilder knows facts about HTML. - Such as which tags are self-closing tags. + Such as which tags are empty-element tags. 
""" assume_html = True preserve_whitespace_tags = set(['pre', 'textarea']) - self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) # Used by set_up_substitutions to detect the charset in a META tag CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 72e5913..e431a62 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -7,16 +7,15 @@ import types class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser - preserve_whitespace_tags = set() - self_closing_tags = set() - @property def default_parser(self): # This can either return a parser object or a class, which # will be instantiated with default arguments. return etree.XMLParser - def __init__(self, parser=None): + def __init__(self, parser=None, empty_element_tags=None): + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) if parser is None: # Use the default parser. parser = self.default_parser @@ -53,9 +52,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] - if len(completed_tag.contents) == 0: - completed_tag.isSelfClosing = True - self.soup.handle_endtag(name) def pi(self, target, data): diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 5793d59..a70813d 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -429,7 +429,6 @@ class Tag(PageElement, Entities): # chunks be garbage-collected. 
self.parserClass = parser.__class__ self.name = name - self.isSelfClosing = builder.isSelfClosingTag(name) if attrs == None: attrs = [] if isinstance(attrs, types.DictType): @@ -447,6 +446,26 @@ class Tag(PageElement, Entities): # Set up any substitutions, such as the charset in a META tag. self.contains_substitutions = builder.set_up_substitutions(self) + self.can_be_empty_element = builder.can_be_empty_element(name) + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + @property def string(self): @@ -624,7 +643,7 @@ class Tag(PageElement, Entities): attrs.append(decoded) close = '' closeTag = '' - if self.isSelfClosing: + if self.is_empty_element: close = ' /' else: closeTag = '</%s>' % self.name diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 336f9a5..021c603 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -91,6 +91,9 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): ('<table><tbody><tr></tr></tbody></table>' '<table><tbody><tr id="nested"></tr></tbody></table>')) + def test_empty_element_tag_with_contents(self): + self.assertSoupEquals("<br>foo</br>", "<br />foo<br />") + def test_doctype_in_body(self): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup, "<p>onetwo</p>") diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 58d16ff..7d916da 100644 --- 
a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -33,14 +33,19 @@ class TestLXMLBuilder(SoupTest): "<a><B><Cd><EFG></efg></CD></b></A>", "<a><b><cd><efg></efg></cd></b></a>") - def test_self_closing(self): - # HTML's self-closing tags are recognized as such. + def test_empty_element(self): + # HTML's empty-element tags are recognized as such. self.assertSoupEquals( "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>") self.assertSoupEquals( "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + def test_empty_tag_thats_not_an_empty_element_tag(self): + # A tag that is empty but not an HTML empty-element tag + # is not presented as an empty-element tag. + self.assertSoupEquals("<p>", "<p></p>") + def test_comment(self): # Comments are represented as Comment objects. markup = "<p>foo<!--foobar-->baz</p>" @@ -303,6 +308,18 @@ class TestLXMLBuilder(SoupTest): str = soup.p.string #self.assertEquals(str.encode("utf-8"), expected) + def test_br_tag_is_empty_element(self): + """A <br> tag is designated as an empty-element tag.""" + soup = self.soup("<br></br>") + self.assertTrue(soup.br.is_empty_element) + self.assertEquals(str(soup.br), "<br />") + + def test_p_tag_is_not_empty_element(self): + """A <p> tag is not designated as an empty-element tag.""" + soup = self.soup("<p />") + self.assertFalse(soup.p.is_empty_element) + self.assertEquals(str(soup.p), "<p></p>") + class TestLXMLBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the LXML tree builder. 
@@ -351,6 +368,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '<table><tr><table><tr id="nested">', '<table><tr><table><tr id="nested"></tr></table></tr></table>') + def test_empty_element_tag_with_contents(self): + self.assertSoupEquals("<br>foo</br>", "<br />foo") + def test_doctype_in_body(self): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup) @@ -487,20 +507,53 @@ class TestLXMLBuilderEncodingConversion(SoupTest): from beautifulsoup.builder.lxml_builder import LXMLTreeBuilderForXML class TestLXMLXMLBuilder(SoupTest): + """Test XML-specific parsing behavior. + + Most of the tests use HTML as an example, since Beautiful Soup is + mainly an HTML parser. This test suite is a base for XML-specific + tree builders. + """ @property def default_builder(self): return LXMLTreeBuilderForXML() - def test_self_closing_tag(self): + def test_empty_element_tag(self): soup = self.soup("<p><iamselfclosing /></p>") - self.assertTrue(soup.iamselfclosing.isSelfClosing) + self.assertTrue(soup.iamselfclosing.is_empty_element) - def test_self_empty_tag_treated_as_self_closing(self): + def test_self_empty_tag_treated_as_empty_element(self): soup = self.soup("<p><iamclosed></iamclosed></p>") - self.assertFalse(soup.iamclosed.isSelfClosing) + self.assertTrue(soup.iamclosed.is_empty_element) - def test_self_nonempty_tag_is_not_self_closing(self): + def test_self_nonempty_tag_is_not_empty_element(self): soup = self.soup("<p><ihavecontents>contents</ihavecontents></p>") - self.assertFalse(soup.ihavecontents.isSelfClosing) - + self.assertFalse(soup.ihavecontents.is_empty_element) + + def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self): + soup = self.soup("<bar />") + self.assertTrue(soup.bar.is_empty_element) + soup.bar.insert(1, "Contents") + self.assertFalse(soup.bar.is_empty_element) + self.assertEquals(str(soup), "<bar>Contents</bar>") + + def test_designated_empty_element_tag_has_no_closing_tag(self): + builder = 
LXMLTreeBuilderForXML(empty_element_tags=['bar']) + soup = BeautifulSoup(builder=builder, markup="<bar></bar>") + self.assertTrue(soup.bar.is_empty_element) + self.assertEquals(str(soup), "<bar />") + + def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self): + builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) + + soup = BeautifulSoup(builder=builder, markup="<foo />") + self.assertFalse(soup.foo.is_empty_element) + self.assertEquals(str(soup), "<foo></foo>") + + def test_designated_empty_element_tag_does_not_change_parser_behavior(self): + # The designated list of empty-element tags only affects how + # empty tags are presented. It does not affect how tags are + # parsed--that's the parser's job. + builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) + soup = BeautifulSoup(builder=builder, markup="<bar>contents</bar>") + self.assertEquals(str(soup), "<bar>contents</bar>") diff --git a/tests/test_tree.py b/tests/test_tree.py index 233cb3c..40643dc 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -620,6 +620,15 @@ class TestTreeModification(SoupTest): self.assertEqual(the.next, c_tag) self.assertEqual(c_tag.previous, the) + def test_insert_works_on_empty_element_tag(self): + # This is a little strange, since most HTML parsers don't allow + # markup like this to come through. But in general, we don't + # know what the parser would or wouldn't have allowed, so + # I'm letting this succeed for now. + soup = self.soup("<br />") + soup.br.insert(1, "Contents") + self.assertEquals(str(soup.br), "<br>Contents</br>") + def test_replace_with(self): soup = self.soup( "<p>There's <b>no</b> business like <b>show</b> business</p>") @@ -872,3 +881,11 @@ class TestEncoding(SoupTest): soup = self.soup(html) self.assertEquals( soup.b.encode("utf-8"), html.encode("utf-8")) + + +class TestEmptyElementTags(SoupTest): + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() + |