summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 9
-rw-r--r-- bs4/element.py 23
-rw-r--r-- bs4/tests/test_html5lib.py 2
-rw-r--r-- bs4/tests/test_htmlparser.py 2
-rw-r--r-- bs4/tests/test_lxml.py 34
-rw-r--r-- bs4/tests/test_tree.py 25
6 files changed, 63 insertions, 32 deletions
diff --git a/NEWS.txt b/NEWS.txt
index ea49586..d77b95b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,11 @@
* The values of multi-valued attributes like "class" are always turned
into a list, even if there's only one value.
+* Stopped generating a space before the slash that closes an
+ empty-element tag. This may come back if I add a special XHTML mode
+ (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty
+ useless.
+
* Passing text along with tag-specific arguments to a find* method:
find("a", text="Click here")
@@ -15,6 +20,10 @@
partially disconnected tree. Generally cleaned up the html5lib tree
builder.
+* If you restrict a multi-valued attribute like "class" to a string
+ that contains spaces, Beautiful Soup will only consider it a match
+ if the values correspond to that specific string.
+
= 4.0.0b5 (20120209) =
* Rationalized Beautiful Soup's treatment of CSS class. A tag
diff --git a/bs4/element.py b/bs4/element.py
index cf1ed32..a0f64ba 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -782,7 +782,7 @@ class Tag(PageElement):
close = ''
closeTag = ''
if self.is_empty_element:
- close = ' /'
+ close = '/'
else:
closeTag = '</%s>' % self.name
@@ -1013,11 +1013,22 @@ class SoupStrainer(object):
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
- # This should only happen when searching the 'class'
- # attribute of a tag with multiple CSS classes.
- for item in markup:
- if self._matches(item, match_against):
- result = True
+ # This should only happen when searching, e.g. the 'class'
+ # attribute.
+ if (isinstance(match_against, basestring)
+ and ' ' in match_against):
+ # A bit of a special case. If they try to match "foo
+ # bar" on a multivalue attribute's value, only accept
+ # the literal value "foo bar"
+ #
+ # XXX This is going to be pretty slow because we keep
+ # splitting match_against. But it shouldn't come up
+ # too often.
+ result = (whitespace_re.split(match_against) == markup)
+ else:
+ for item in markup:
+ if self._matches(item, match_against):
+ result = True
elif match_against is True:
result = markup is not None
elif isinstance(match_against, collections.Callable):
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index dcbd204..d972b2d 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -137,7 +137,7 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
"foo<table><tbody><tr><td></td><td>bar</td></tr></tbody></table>")
def test_empty_element_tag_with_contents(self):
- self.assertSoupEquals("<br>foo</br>", "<br />foo<br />")
+ self.assertSoupEquals("<br>foo</br>", "<br/>foo<br/>")
def test_doctype_in_body(self):
markup = "<p>one<!DOCTYPE foobar>two</p>"
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 9ba7df7..ea94363 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -42,7 +42,7 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder):
"<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
self.assertSoupEquals(
- "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+ "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>")
def test_hex_entities_in_text(self):
# XXX This tests a workaround for a bug in HTMLParser.
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 359f619..3603528 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -38,10 +38,10 @@ class TestLXMLBuilder(SoupTest):
def test_empty_element(self):
# HTML's empty-element tags are recognized as such.
self.assertSoupEquals(
- "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>")
+ "<p>A <meta> tag</p>", "<p>A <meta/> tag</p>")
self.assertSoupEquals(
- "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+ "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>")
def test_empty_tag_thats_not_an_empty_element_tag(self):
# A tag that is empty but not an HTML empty-element tag
@@ -218,7 +218,7 @@ class TestLXMLBuilder(SoupTest):
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
- unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+ unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@@ -263,12 +263,12 @@ class TestLXMLBuilder(SoupTest):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
- 'http-equiv="Content-type" />')
+ 'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
- '<meta http-equiv="Content-language" content="ja" />'
+ '<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
@@ -293,17 +293,17 @@ class TestLXMLBuilder(SoupTest):
"""A <br> tag is designated as an empty-element tag."""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
- self.assertEqual(str(soup.br), "<br />")
+ self.assertEqual(str(soup.br), "<br/>")
def test_p_tag_is_not_empty_element(self):
"""A <p> tag is not designated as an empty-element tag."""
- soup = self.soup("<p />")
+ soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_soupstrainer(self):
strainer = SoupStrainer("b")
- soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
+ soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
@@ -370,7 +370,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
# the 'close' tag is ignored.
self.assertSoupEquals(
"<item><link>http://foo.com/</link></item>",
- "<item><link />http://foo.com/</item>")
+ "<item><link/>http://foo.com/</item>")
def test_boolean_attribute_with_no_value_gets_empty_value(self):
soup = self.soup("<table><td nowrap>foo</td></table>")
@@ -391,7 +391,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
self.assertEqual(markup.p.contents, ["this is the definition:"])
def test_empty_element_tag_with_contents(self):
- self.assertSoupEquals("<br>foo</br>", "<br />foo")
+ self.assertSoupEquals("<br>foo</br>", "<br/>foo")
def test_doctype_in_body(self):
markup = "<p>one<!DOCTYPE foobar>two</p>"
@@ -556,8 +556,8 @@ class TestLXMLXMLBuilder(SoupTest):
# Mixed-case tags are *not* folded to lowercase, but the
# end tag is always the same case as the start tag.
self.assertSoupEquals(
- "<a><B><Cd><EFG /></CD></b></A>",
- "<a><B><Cd><EFG /></Cd></B></a>")
+ "<a><B><Cd><EFG/></CD></b></A>",
+ "<a><B><Cd><EFG/></Cd></B></a>")
def test_cdata_becomes_text(self):
@@ -572,10 +572,10 @@ class TestLXMLXMLBuilder(SoupTest):
def test_can_handle_invalid_xml(self):
- self.assertSoupEquals("<a><b>", "<a><b /></a>")
+ self.assertSoupEquals("<a><b>", "<a><b/></a>")
def test_empty_element_tag(self):
- soup = self.soup("<p><iamselfclosing /></p>")
+ soup = self.soup("<p><iamselfclosing/></p>")
self.assertTrue(soup.iamselfclosing.is_empty_element)
def test_self_empty_tag_treated_as_empty_element(self):
@@ -587,7 +587,7 @@ class TestLXMLXMLBuilder(SoupTest):
self.assertFalse(soup.ihavecontents.is_empty_element)
def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self):
- soup = self.soup("<bar />")
+ soup = self.soup("<bar/>")
self.assertTrue(soup.bar.is_empty_element)
soup.bar.insert(1, "Contents")
self.assertFalse(soup.bar.is_empty_element)
@@ -597,12 +597,12 @@ class TestLXMLXMLBuilder(SoupTest):
builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
soup = BeautifulSoup(builder=builder, markup="<bar></bar>")
self.assertTrue(soup.bar.is_empty_element)
- self.assertEqual(str(soup), self.document_for("<bar />"))
+ self.assertEqual(str(soup), self.document_for("<bar/>"))
def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self):
builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
- soup = BeautifulSoup(builder=builder, markup="<foo />")
+ soup = BeautifulSoup(builder=builder, markup="<foo/>")
self.assertFalse(soup.foo.is_empty_element)
self.assertEqual(str(soup), self.document_for("<foo></foo>"))
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 3684777..9e57d54 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -220,6 +220,17 @@ class TestFindAllByAttribute(TreeTest):
self.assertSelects(
soup.find_all("a", small_attribute_value), ["Found it"])
+ def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
+ soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
+ a, a2 = soup.find_all("a")
+ self.assertEqual([a, a2], soup.find_all("a", "foo"))
+ self.assertEqual([a], soup.find_all("a", "bar"))
+
+ # If you specify the attribute as a string that contains a
+ # space, only that specific value will be found.
+ self.assertEqual([a], soup.find_all("a", "foo bar"))
+ self.assertEqual([], soup.find_all("a", "bar foo"))
+
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
<a id="first">Match.</a>
@@ -595,8 +606,8 @@ class TestTagCreation(SoupTest):
# Both the <br> and <p> tag are empty-element, just because
# they have no contents.
- self.assertEqual(b"<br />", xml_br.encode())
- self.assertEqual(b"<p />", xml_p.encode())
+ self.assertEqual(b"<br/>", xml_br.encode())
+ self.assertEqual(b"<p/>", xml_p.encode())
html_soup = BeautifulSoup("", "html")
html_br = html_soup.new_tag("br")
@@ -604,7 +615,7 @@ class TestTagCreation(SoupTest):
# The HTML builder uses HTML's rules about which tags are
# empty-element tags, and the new tags reflect these rules.
- self.assertEqual(b"<br />", html_br.encode())
+ self.assertEqual(b"<br/>", html_br.encode())
self.assertEqual(b"<p></p>", html_p.encode())
def test_new_string_creates_navigablestring(self):
@@ -775,7 +786,7 @@ class TestTreeModification(SoupTest):
# markup like this to come through. But in general, we don't
# know what the parser would or wouldn't have allowed, so
# I'm letting this succeed for now.
- soup = self.soup("<br />")
+ soup = self.soup("<br/>")
soup.br.insert(1, "Contents")
self.assertEqual(str(soup.br), "<br>Contents</br>")
@@ -1071,7 +1082,7 @@ class TestCDAtaListAttributes(SoupTest):
# We saw in another test that accept-charset is a cdata-list
# attribute for the <form> tag. But it's not a cdata-list
# attribute for any other tag.
- self.assertEquals('ISO-8859-1 UTF-8', soup.a['accept-charset'])
+ self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
class TestPersistence(SoupTest):
@@ -1183,7 +1194,7 @@ class TestSubstitutions(SoupTest):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
- 'http-equiv="Content-type" />')
+ 'http-equiv="Content-type"/>')
soup = self.soup(meta_tag)
# Parse the document, and the charset is replaced with a
@@ -1207,7 +1218,7 @@ class TestSubstitutions(SoupTest):
def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
markup = ('<head><meta content="text/html; charset=x-sjis" '
- 'http-equiv="Content-type" /></head><pre>foo</pre>')
+ 'http-equiv="Content-type"/></head><pre>foo</pre>')
# Beautiful Soup used to try to rewrite the meta tag even if the
# meta tag got filtered out by the strainer. This test makes