Added a somewhat hacky way to interpret the restriction class='foo bar'. Stopped generating a space before the slash that closes an empty-element tag.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-15 17:03:37 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-15 17:03:37 -0500
commit: 87747b712cfe63d173332f06ee1ba2bf9adf9ce5 (patch)
tree: 1829d574032666de65230c2c9df6f1aa522c1b05
parent: ac197c5ad0ffe0795436cb54e0766640d12c6a31 (diff)
6 files changed, 63 insertions, 32 deletions
diff --git a/NEWS.txt b/NEWS.txt
index ea49586..d77b95b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,11 @@
 * The value of multi-valued attributes like "class" are always turned
   into a list, even if there's only one value.
 
+* Stopped generating a space before the slash that closes an
+  empty-element tag. This may come back if I add a special XHTML mode
+  (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty
+  useless.
+
 * Passing text along with tag-specific arguments to a find* method:
 
    find("a", text="Click here")
@@ -15,6 +20,10 @@
   partially disconnected tree. Generally cleaned up the html5lib tree
   builder.
 
+* If you restrict a multi-valued attribute like "class" to a string
+  that contains spaces, Beautiful Soup will only consider it a match
+  if the tag's values are exactly the values in that string, in order.
+
 = 4.0.0b5 (20120209) =
 
 * Rationalized Beautiful Soup's treatment of CSS class. A tag
diff --git a/bs4/element.py b/bs4/element.py
index cf1ed32..a0f64ba 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -782,7 +782,7 @@ class Tag(PageElement):
         close = ''
         closeTag = ''
         if self.is_empty_element:
-            close = ' /'
+            close = '/'
         else:
             closeTag = '</%s>' % self.name
 
@@ -1013,11 +1013,22 @@ class SoupStrainer(object):
         result = False
 
         if isinstance(markup, list) or isinstance(markup, tuple):
-            # This should only happen when searching the 'class'
-            # attribute of a tag with multiple CSS classes.
-            for item in markup:
-                if self._matches(item, match_against):
-                    result = True
+            # This should only happen when searching, e.g. the 'class'
+            # attribute.
+            if (isinstance(match_against, basestring)
+                and ' ' in match_against):
+                # A bit of a special case. If they try to match "foo
+                # bar" on a multivalue attribute's value, only accept
+                # the literal value "foo bar"
+                #
+                # XXX This is going to be pretty slow because we keep
+                # splitting match_against. But it shouldn't come up
+                # too often.
+                result = (whitespace_re.split(match_against) == markup)
+            else:
+                for item in markup:
+                    if self._matches(item, match_against):
+                        result = True
         elif match_against is True:
             result = markup is not None
         elif isinstance(match_against, collections.Callable):
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index dcbd204..d972b2d 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -137,7 +137,7 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
             "foo<table><tbody><tr><td></td><td>bar</td></tr></tbody></table>")
 
     def test_empty_element_tag_with_contents(self):
-        self.assertSoupEquals("<br>foo</br>", "<br />foo<br />")
+        self.assertSoupEquals("<br>foo</br>", "<br/>foo<br/>")
 
     def test_doctype_in_body(self):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 9ba7df7..ea94363 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -42,7 +42,7 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder):
             "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
 
         self.assertSoupEquals(
-            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+            "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>")
 
     def test_hex_entities_in_text(self):
         # XXX This tests a workaround for a bug in HTMLParser.
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 359f619..3603528 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -38,10 +38,10 @@ class TestLXMLBuilder(SoupTest):
     def test_empty_element(self):
         # HTML's empty-element tags are recognized as such.
         self.assertSoupEquals(
-            "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>")
+            "<p>A <meta> tag</p>", "<p>A <meta/> tag</p>")
 
         self.assertSoupEquals(
-            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+            "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>")
 
     def test_empty_tag_thats_not_an_empty_element_tag(self):
         # A tag that is empty but not an HTML empty-element tag
@@ -218,7 +218,7 @@ class TestLXMLBuilder(SoupTest):
         # easy-to-understand document.
 
         # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
 
         # That's because we're going to encode it into ISO-Latin-1, and use
         # that to test.
@@ -263,12 +263,12 @@ class TestLXMLBuilder(SoupTest):
         # Here's the <meta> tag saying that a document is
         # encoded in Shift-JIS.
         meta_tag = ('<meta content="text/html; charset=x-sjis" '
-                    'http-equiv="Content-type" />')
+                    'http-equiv="Content-type"/>')
 
         # Here's a document incorporating that meta tag.
         shift_jis_html = (
             '<html><head>\n%s\n'
-            '<meta http-equiv="Content-language" content="ja" />'
+            '<meta http-equiv="Content-language" content="ja"/>'
             '</head><body>Shift-JIS markup goes here.') % meta_tag
         soup = self.soup(shift_jis_html)
 
@@ -293,17 +293,17 @@ class TestLXMLBuilder(SoupTest):
         """A <br> tag is designated as an empty-element tag."""
         soup = self.soup("<br></br>")
         self.assertTrue(soup.br.is_empty_element)
-        self.assertEqual(str(soup.br), "<br />")
+        self.assertEqual(str(soup.br), "<br/>")
 
     def test_p_tag_is_not_empty_element(self):
         """A <p> tag is not designated as an empty-element tag."""
-        soup = self.soup("<p />")
+        soup = self.soup("<p/>")
         self.assertFalse(soup.p.is_empty_element)
         self.assertEqual(str(soup.p), "<p></p>")
 
     def test_soupstrainer(self):
         strainer = SoupStrainer("b")
-        soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
+        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                          parse_only=strainer)
         self.assertEqual(soup.decode(), "<b>bold</b>")
 
@@ -370,7 +370,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
         # the 'close' tag is ignored.
         self.assertSoupEquals(
             "<item><link>http://foo.com/</link></item>",
-            "<item><link />http://foo.com/</item>")
+            "<item><link/>http://foo.com/</item>")
 
     def test_boolean_attribute_with_no_value_gets_empty_value(self):
         soup = self.soup("<table><td nowrap>foo</td></table>")
@@ -391,7 +391,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
         self.assertEqual(markup.p.contents, ["this is the definition:"])
 
     def test_empty_element_tag_with_contents(self):
-        self.assertSoupEquals("<br>foo</br>", "<br />foo")
+        self.assertSoupEquals("<br>foo</br>", "<br/>foo")
 
     def test_doctype_in_body(self):
         markup = "<p>one<!DOCTYPE foobar>two</p>"
@@ -556,8 +556,8 @@ class TestLXMLXMLBuilder(SoupTest):
         # Mixed-case tags are *not* folded to lowercase, but the
         # end tag is always the same case as the start tag.
         self.assertSoupEquals(
-            "<a><B><Cd><EFG /></CD></b></A>",
-            "<a><B><Cd><EFG /></Cd></B></a>")
+            "<a><B><Cd><EFG/></CD></b></A>",
+            "<a><B><Cd><EFG/></Cd></B></a>")
 
 
     def test_cdata_becomes_text(self):
@@ -572,10 +572,10 @@ class TestLXMLXMLBuilder(SoupTest):
 
 
     def test_can_handle_invalid_xml(self):
-        self.assertSoupEquals("<a><b>", "<a><b /></a>")
+        self.assertSoupEquals("<a><b>", "<a><b/></a>")
 
     def test_empty_element_tag(self):
-        soup = self.soup("<p><iamselfclosing /></p>")
+        soup = self.soup("<p><iamselfclosing/></p>")
         self.assertTrue(soup.iamselfclosing.is_empty_element)
 
     def test_self_empty_tag_treated_as_empty_element(self):
@@ -587,7 +587,7 @@ class TestLXMLXMLBuilder(SoupTest):
         self.assertFalse(soup.ihavecontents.is_empty_element)
 
     def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self):
-        soup = self.soup("<bar />")
+        soup = self.soup("<bar/>")
         self.assertTrue(soup.bar.is_empty_element)
         soup.bar.insert(1, "Contents")
         self.assertFalse(soup.bar.is_empty_element)
@@ -597,12 +597,12 @@ class TestLXMLXMLBuilder(SoupTest):
         builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
         soup = BeautifulSoup(builder=builder, markup="<bar></bar>")
         self.assertTrue(soup.bar.is_empty_element)
-        self.assertEqual(str(soup), self.document_for("<bar />"))
+        self.assertEqual(str(soup), self.document_for("<bar/>"))
 
     def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self):
         builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
 
-        soup = BeautifulSoup(builder=builder, markup="<foo />")
+        soup = BeautifulSoup(builder=builder, markup="<foo/>")
         self.assertFalse(soup.foo.is_empty_element)
         self.assertEqual(str(soup), self.document_for("<foo></foo>"))
 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 3684777..9e57d54 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -220,6 +220,17 @@ class TestFindAllByAttribute(TreeTest):
         self.assertSelects(
             soup.find_all("a", small_attribute_value), ["Found it"])
 
+    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
+        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
+        a, a2 = soup.find_all("a")
+        self.assertEqual([a, a2], soup.find_all("a", "foo"))
+        self.assertEqual([a], soup.find_all("a", "bar"))
+
+        # If you specify the attribute as a string that contains a
+        # space, only that specific value will be found.
+        self.assertEqual([a], soup.find_all("a", "foo bar"))
+        self.assertEqual([], soup.find_all("a", "bar foo"))
+
     def test_find_all_by_attribute_soupstrainer(self):
         tree = self.soup("""
                          <a id="first">Match.</a>
@@ -595,8 +606,8 @@ class TestTagCreation(SoupTest):
 
         # Both the <br> and <p> tag are empty-element, just because
         # they have no contents.
-        self.assertEqual(b"<br />", xml_br.encode())
-        self.assertEqual(b"<p />", xml_p.encode())
+        self.assertEqual(b"<br/>", xml_br.encode())
+        self.assertEqual(b"<p/>", xml_p.encode())
 
         html_soup = BeautifulSoup("", "html")
         html_br = html_soup.new_tag("br")
@@ -604,7 +615,7 @@ class TestTagCreation(SoupTest):
 
         # The HTML builder users HTML's rules about which tags are
         # empty-element tags, and the new tags reflect these rules.
-        self.assertEqual(b"<br />", html_br.encode())
+        self.assertEqual(b"<br/>", html_br.encode())
         self.assertEqual(b"<p></p>", html_p.encode())
 
     def test_new_string_creates_navigablestring(self):
@@ -775,7 +786,7 @@ class TestTreeModification(SoupTest):
         # markup like this to come through. But in general, we don't
         # know what the parser would or wouldn't have allowed, so
         # I'm letting this succeed for now.
-        soup = self.soup("<br />")
+        soup = self.soup("<br/>")
         soup.br.insert(1, "Contents")
         self.assertEqual(str(soup.br), "<br>Contents</br>")
 
@@ -1071,7 +1082,7 @@ class TestCDAtaListAttributes(SoupTest):
         # We saw in another test that accept-charset is a cdata-list
         # attribute for the <form> tag. But it's not a cdata-list
         # attribute for any other tag.
-        self.assertEquals('ISO-8859-1 UTF-8', soup.a['accept-charset'])
+        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
 
 
 class TestPersistence(SoupTest):
@@ -1183,7 +1194,7 @@ class TestSubstitutions(SoupTest):
         # Here's the <meta> tag saying that a document is
         # encoded in Shift-JIS.
         meta_tag = ('<meta content="text/html; charset=x-sjis" '
-                    'http-equiv="Content-type" />')
+                    'http-equiv="Content-type"/>')
         soup = self.soup(meta_tag)
 
         # Parse the document, and the charset is replaced with a
@@ -1207,7 +1218,7 @@ class TestSubstitutions(SoupTest):
 
     def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
         markup = ('<head><meta content="text/html; charset=x-sjis" '
-                    'http-equiv="Content-type" /></head><pre>foo</pre>')
+                    'http-equiv="Content-type"/></head><pre>foo</pre>')
 
         # Beautiful Soup used to try to rewrite the meta tag even if the
         # meta tag got filtered out by the strainer. This test makes
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-15 17:03:37 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-15 17:03:37 -0500
commit	87747b712cfe63d173332f06ee1ba2bf9adf9ce5 (patch)
tree	1829d574032666de65230c2c9df6f1aa522c1b05
parent	ac197c5ad0ffe0795436cb54e0766640d12c6a31 (diff)