diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 17:03:37 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 17:03:37 -0500 |
commit | 87747b712cfe63d173332f06ee1ba2bf9adf9ce5 (patch) | |
tree | 1829d574032666de65230c2c9df6f1aa522c1b05 | |
parent | ac197c5ad0ffe0795436cb54e0766640d12c6a31 (diff) |
Added a kind of hacky way to interpret the restriction class='foo bar'. Stopped generating a space before the slash that closes an empty-element tag.
-rw-r--r-- | NEWS.txt | 9 | ||||
-rw-r--r-- | bs4/element.py | 23 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 34 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 25 |
6 files changed, 63 insertions, 32 deletions
@@ -3,6 +3,11 @@ * The value of multi-valued attributes like "class" are always turned into a list, even if there's only one value. +* Stopped generating a space before the slash that closes an + empty-element tag. This may come back if I add a special XHTML mode + (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty + useless. + * Passing text along with tag-specific arguments to a find* method: find("a", text="Click here") @@ -15,6 +20,10 @@ partially disconnected tree. Generally cleaned up the html5lib tree builder. +* If you restrict a multi-valued attribute like "class" to a string + that contains spaces, Beautiful Soup will only consider it a match + if the values correspond to that specific string. + = 4.0.0b5 (20120209) = * Rationalized Beautiful Soup's treatment of CSS class. A tag diff --git a/bs4/element.py b/bs4/element.py index cf1ed32..a0f64ba 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -782,7 +782,7 @@ class Tag(PageElement): close = '' closeTag = '' if self.is_empty_element: - close = ' /' + close = '/' else: closeTag = '</%s>' % self.name @@ -1013,11 +1013,22 @@ class SoupStrainer(object): result = False if isinstance(markup, list) or isinstance(markup, tuple): - # This should only happen when searching the 'class' - # attribute of a tag with multiple CSS classes. - for item in markup: - if self._matches(item, match_against): - result = True + # This should only happen when searching, e.g. the 'class' + # attribute. + if (isinstance(match_against, basestring) + and ' ' in match_against): + # A bit of a special case. If they try to match "foo + # bar" on a multivalue attribute's value, only accept + # the literal value "foo bar" + # + # XXX This is going to be pretty slow because we keep + # splitting match_against. But it shouldn't come up + # too often. 
+ result = (whitespace_re.split(match_against) == markup) + else: + for item in markup: + if self._matches(item, match_against): + result = True elif match_against is True: result = markup is not None elif isinstance(match_against, collections.Callable): diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index dcbd204..d972b2d 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -137,7 +137,7 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): "foo<table><tbody><tr><td></td><td>bar</td></tr></tbody></table>") def test_empty_element_tag_with_contents(self): - self.assertSoupEquals("<br>foo</br>", "<br />foo<br />") + self.assertSoupEquals("<br>foo</br>", "<br/>foo<br/>") def test_doctype_in_body(self): markup = "<p>one<!DOCTYPE foobar>two</p>" diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index 9ba7df7..ea94363 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -42,7 +42,7 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder): "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>") self.assertSoupEquals( - "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>") def test_hex_entities_in_text(self): # XXX This tests a workaround for a bug in HTMLParser. diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 359f619..3603528 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -38,10 +38,10 @@ class TestLXMLBuilder(SoupTest): def test_empty_element(self): # HTML's empty-element tags are recognized as such. 
self.assertSoupEquals( - "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>") + "<p>A <meta> tag</p>", "<p>A <meta/> tag</p>") self.assertSoupEquals( - "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>") def test_empty_tag_thats_not_an_empty_element_tag(self): # A tag that is empty but not an HTML empty-element tag @@ -218,7 +218,7 @@ class TestLXMLBuilder(SoupTest): # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' + unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. @@ -263,12 +263,12 @@ class TestLXMLBuilder(SoupTest): # Here's the <meta> tag saying that a document is # encoded in Shift-JIS. meta_tag = ('<meta content="text/html; charset=x-sjis" ' - 'http-equiv="Content-type" />') + 'http-equiv="Content-type"/>') # Here's a document incorporating that meta tag. 
shift_jis_html = ( '<html><head>\n%s\n' - '<meta http-equiv="Content-language" content="ja" />' + '<meta http-equiv="Content-language" content="ja"/>' '</head><body>Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) @@ -293,17 +293,17 @@ class TestLXMLBuilder(SoupTest): """A <br> tag is designated as an empty-element tag.""" soup = self.soup("<br></br>") self.assertTrue(soup.br.is_empty_element) - self.assertEqual(str(soup.br), "<br />") + self.assertEqual(str(soup.br), "<br/>") def test_p_tag_is_not_empty_element(self): """A <p> tag is not designated as an empty-element tag.""" - soup = self.soup("<p />") + soup = self.soup("<p/>") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "<p></p>") def test_soupstrainer(self): strainer = SoupStrainer("b") - soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>", + soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>", parse_only=strainer) self.assertEqual(soup.decode(), "<b>bold</b>") @@ -370,7 +370,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): # the 'close' tag is ignored. self.assertSoupEquals( "<item><link>http://foo.com/</link></item>", - "<item><link />http://foo.com/</item>") + "<item><link/>http://foo.com/</item>") def test_boolean_attribute_with_no_value_gets_empty_value(self): soup = self.soup("<table><td nowrap>foo</td></table>") @@ -391,7 +391,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): self.assertEqual(markup.p.contents, ["this is the definition:"]) def test_empty_element_tag_with_contents(self): - self.assertSoupEquals("<br>foo</br>", "<br />foo") + self.assertSoupEquals("<br>foo</br>", "<br/>foo") def test_doctype_in_body(self): markup = "<p>one<!DOCTYPE foobar>two</p>" @@ -556,8 +556,8 @@ class TestLXMLXMLBuilder(SoupTest): # Mixed-case tags are *not* folded to lowercase, but the # end tag is always the same case as the start tag. 
self.assertSoupEquals( - "<a><B><Cd><EFG /></CD></b></A>", - "<a><B><Cd><EFG /></Cd></B></a>") + "<a><B><Cd><EFG/></CD></b></A>", + "<a><B><Cd><EFG/></Cd></B></a>") def test_cdata_becomes_text(self): @@ -572,10 +572,10 @@ class TestLXMLXMLBuilder(SoupTest): def test_can_handle_invalid_xml(self): - self.assertSoupEquals("<a><b>", "<a><b /></a>") + self.assertSoupEquals("<a><b>", "<a><b/></a>") def test_empty_element_tag(self): - soup = self.soup("<p><iamselfclosing /></p>") + soup = self.soup("<p><iamselfclosing/></p>") self.assertTrue(soup.iamselfclosing.is_empty_element) def test_self_empty_tag_treated_as_empty_element(self): @@ -587,7 +587,7 @@ class TestLXMLXMLBuilder(SoupTest): self.assertFalse(soup.ihavecontents.is_empty_element) def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self): - soup = self.soup("<bar />") + soup = self.soup("<bar/>") self.assertTrue(soup.bar.is_empty_element) soup.bar.insert(1, "Contents") self.assertFalse(soup.bar.is_empty_element) @@ -597,12 +597,12 @@ class TestLXMLXMLBuilder(SoupTest): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="<bar></bar>") self.assertTrue(soup.bar.is_empty_element) - self.assertEqual(str(soup), self.document_for("<bar />")) + self.assertEqual(str(soup), self.document_for("<bar/>")) def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) - soup = BeautifulSoup(builder=builder, markup="<foo />") + soup = BeautifulSoup(builder=builder, markup="<foo/>") self.assertFalse(soup.foo.is_empty_element) self.assertEqual(str(soup), self.document_for("<foo></foo>")) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 3684777..9e57d54 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -220,6 +220,17 @@ class TestFindAllByAttribute(TreeTest): self.assertSelects( soup.find_all("a", small_attribute_value), ["Found it"]) + def 
test_find_all_with_string_for_attrs_finds_multiple_classes(self): + soup = self.soup('<a class="foo bar"></a><a class="foo"></a>') + a, a2 = soup.find_all("a") + self.assertEqual([a, a2], soup.find_all("a", "foo")) + self.assertEqual([a], soup.find_all("a", "bar")) + + # If you specify the attribute as a string that contains a + # space, only that specific value will be found. + self.assertEqual([a], soup.find_all("a", "foo bar")) + self.assertEqual([], soup.find_all("a", "bar foo")) + def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" <a id="first">Match.</a> @@ -595,8 +606,8 @@ class TestTagCreation(SoupTest): # Both the <br> and <p> tag are empty-element, just because # they have no contents. - self.assertEqual(b"<br />", xml_br.encode()) - self.assertEqual(b"<p />", xml_p.encode()) + self.assertEqual(b"<br/>", xml_br.encode()) + self.assertEqual(b"<p/>", xml_p.encode()) html_soup = BeautifulSoup("", "html") html_br = html_soup.new_tag("br") @@ -604,7 +615,7 @@ class TestTagCreation(SoupTest): # The HTML builder users HTML's rules about which tags are # empty-element tags, and the new tags reflect these rules. - self.assertEqual(b"<br />", html_br.encode()) + self.assertEqual(b"<br/>", html_br.encode()) self.assertEqual(b"<p></p>", html_p.encode()) def test_new_string_creates_navigablestring(self): @@ -775,7 +786,7 @@ class TestTreeModification(SoupTest): # markup like this to come through. But in general, we don't # know what the parser would or wouldn't have allowed, so # I'm letting this succeed for now. - soup = self.soup("<br />") + soup = self.soup("<br/>") soup.br.insert(1, "Contents") self.assertEqual(str(soup.br), "<br>Contents</br>") @@ -1071,7 +1082,7 @@ class TestCDAtaListAttributes(SoupTest): # We saw in another test that accept-charset is a cdata-list # attribute for the <form> tag. But it's not a cdata-list # attribute for any other tag. 
- self.assertEquals('ISO-8859-1 UTF-8', soup.a['accept-charset']) + self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) class TestPersistence(SoupTest): @@ -1183,7 +1194,7 @@ class TestSubstitutions(SoupTest): # Here's the <meta> tag saying that a document is # encoded in Shift-JIS. meta_tag = ('<meta content="text/html; charset=x-sjis" ' - 'http-equiv="Content-type" />') + 'http-equiv="Content-type"/>') soup = self.soup(meta_tag) # Parse the document, and the charset is replaced with a @@ -1207,7 +1218,7 @@ class TestSubstitutions(SoupTest): def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): markup = ('<head><meta content="text/html; charset=x-sjis" ' - 'http-equiv="Content-type" /></head><pre>foo</pre>') + 'http-equiv="Content-type"/></head><pre>foo</pre>') # Beautiful Soup used to try to rewrite the meta tag even if the # meta tag got filtered out by the strainer. This test makes |