summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2012-02-08 18:47:23 -0500
committerLeonard Richardson <leonard.richardson@canonical.com>2012-02-08 18:47:23 -0500
commit33f0db7b3c4eef632700418068769b9cb762f708 (patch)
tree879f715a4ddc59f8826d790b55ccf74a31b797a5
parent91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff)
Rationalized the treatment of multi-valued HTML attributes such as 'class'
-rw-r--r--NEWS.txt16
-rw-r--r--bs4/builder/__init__.py15
-rw-r--r--bs4/doc/source/index.rst79
-rw-r--r--bs4/element.py35
-rw-r--r--bs4/tests/test_tree.py55
5 files changed, 186 insertions, 14 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 8515b80..31cfdbe 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,19 @@
+= 4.0.0b5 =
+
+* Rationalized Beautiful Soup's treatment of CSS class. A tag
+ belonging to multiple CSS classes is treated as having a list of
+ values for the 'class' attribute. Searching for a CSS class will
+ match *any* of the CSS classes.
+
+ This actually affects all attributes that the HTML standard defines
+ as taking multiple values (class, rel, rev, archive, accept-charset,
+ and headers), but 'class' is by far the most common.
+
+* If you pass anything other than a dictionary as the second argument
+ to one of the find* methods, it'll assume you want to use that
+ object to search against a tag's CSS classes. Previously this only
+ worked if you passed in a string.
+
= 4.0.0b4 (20120208) =
* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index a17dce6..2728606 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,6 +82,11 @@ class TreeBuilder(object):
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
+ # A value for these attributes is a space- or comma-separated list
+ # of CDATA, rather than a single CDATA.
+ cdata_list_attributes = None
+
+
def __init__(self):
self.soup = None
@@ -189,6 +194,16 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ # The HTML standard defines these attributes as containing a
+ # space-separated list of values, not a single value. That is,
+ # class="foo bar" means that the 'class' attribute has two values,
+ # 'foo' and 'bar', not the single value 'foo bar'. When we
+ # encounter one of these attributes, we will parse its value into
+ # a list of values if possible. Upon output, the list will be
+ # converted back into a string.
+ cdata_list_attributes = set(
+ ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+
# Used by set_up_substitutions to detect the charset in a META tag
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index d05acb5..abea5c6 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -298,6 +298,39 @@ done by treating the tag as a dictionary::
tag
# <blockquote>Extremely bold</blockquote>
+.. _multivalue:
+
+Multi-valued attributes
+&&&&&&&&&&&&&&&&&&&&&&&
+
+HTML defines a few attributes that can have multiple values. The most
+common is ``class`` (a tag can have more than one CSS class), but
+there are a few others: ``rel``, ``rev``, ``archive``,
+``accept-charset``, and ``headers``. If one of these attributes has
+more than one value, Beautiful Soup will turn its values into a list::
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.p['class']
+ # ["body", "strikeout"]
+
+If an attribute `looks` like it has more than one value, but it's not
+one of the special attributes listed above, Beautiful Soup will leave
+the attribute alone::
+
+ id_soup = BeautifulSoup('<p id="my id"></p>')
+ id_soup.p['id']
+ # 'my id'
+
+When you turn a tag back into a string, multiple attribute values are
+consolidated::
+
+ rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
+ rel_soup.a['rel']
+ # 'index'
+ rel_soup.a['rel'] = ['index', 'contents']
+ print(rel_soup.p)
+ # <p>Back to the <a rel="index contents">homepage</a></p>
+
``NavigableString``
-------------------
@@ -1084,11 +1117,11 @@ keyword argument::
.. _attrs:
-``attrs`` and searching by CSS class
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Searching by CSS class
+^^^^^^^^^^^^^^^^^^^^^^
Instead of using keyword arguments, you can filter tags based on their
-attributes passing a dictionary in for ``attrs``. These two lines of
+attributes by passing a dictionary in for ``attrs``. These two lines of
code are equivalent::
soup.find_all(href=re.compile("elsie"), id='link1')
@@ -1107,14 +1140,46 @@ You can use ``attrs`` to search by CSS class::
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
But that's a lot of code for such a common operation. Instead, you can
-pass a string for `attrs` instead of a dictionary. The string will be
-used to restrict the CSS class::
+pass a string for ``attrs`` instead of a dictionary. The string will be
+used to restrict the CSS class::
soup.find_all("a", "sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+You can also pass in a regular expression, a function or
+True. Anything you pass in for ``attrs`` that's not a dictionary will
+be used to search against the CSS class::
+
+ soup.find_all(attrs=re.compile("itl"))
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ def has_six_characters(css_class):
+ return css_class is not None and len(css_class) == 6
+
+ soup.find_all(attrs=has_six_characters)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+:ref:`Remember <multivalue>` that a single tag can have multiple
+values for its "class" attribute. When you search for a tag that
+matches a certain CSS class, you're matching against `any` of its CSS
+classes::
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.find_all("p", "strikeout")
+ # [<p class="body strikeout"></p>]
+
+ css_soup.find_all("p", "body")
+ # [<p class="body strikeout"></p>]
+
+Searching for the string value of the ``class`` attribute won't work::
+
+ css_soup.find_all("p", "body strikeout")
+ # []
+
.. _text:
The ``text`` argument
@@ -2418,6 +2483,10 @@ Miscellaneous
contains a single tag B and nothing else, then A.string is the same as
B.string. (Previously, it was None.)
+`Multi-valued attributes`_ like ``class`` are parsed into lists if
+they have more than one value. This may affect the way you search by
+CSS class.
+
The ``BeautifulSoup`` constructor no longer recognizes the
`markupMassage` argument. It's now the parser's responsibility to
handle markup correctly.
diff --git a/bs4/element.py b/bs4/element.py
index 257cdbb..ff7c972 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -7,11 +7,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-
-def _match_css_class(str):
- """Build a RE to match the given CSS class."""
- return re.compile(r"(^|.*\s)%s($|\s)" % str)
-
+whitespace_re = re.compile("\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -524,6 +520,16 @@ class Tag(PageElement):
attrs = {}
else:
attrs = dict(attrs)
+ if builder.cdata_list_attributes:
+ for cdata_list_attr in builder.cdata_list_attributes:
+ if cdata_list_attr in attrs:
+ # Basically, we have a "class" attribute whose
+ # value is a whitespace-separated list of CSS
+ # classes. Split it into a list.
+ value = attrs[cdata_list_attr]
+ values = whitespace_re.split(value)
+ if len(values) > 1:
+ attrs[cdata_list_attr] = values
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
@@ -755,7 +761,9 @@ class Tag(PageElement):
if val is None:
decoded = key
else:
- if not isinstance(val, basestring):
+ if isinstance(val, list) or isinstance(val, tuple):
+ val = ' '.join(val)
+ elif not isinstance(val, basestring):
val = str(val)
if (self.contains_substitutions
and eventual_encoding is not None
@@ -907,8 +915,10 @@ class SoupStrainer(object):
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
- if isinstance(attrs, basestring):
- kwargs['class'] = _match_css_class(attrs)
+ if not isinstance(attrs, dict):
+ # Treat a non-dict value for attrs as a search for the 'class'
+ # attribute.
+ kwargs['class'] = attrs
attrs = None
if kwargs:
if attrs:
@@ -993,7 +1003,14 @@ class SoupStrainer(object):
def _matches(self, markup, match_against):
#print "Matching %s against %s" % (markup, match_against)
result = False
- if match_against is True:
+
+ if isinstance(markup, list) or isinstance(markup, tuple):
+ # This should only happen when searching a multi-valued
+ # attribute (such as 'class') of a tag that has several values.
+ for item in markup:
+ if self._matches(item, match_against):
+ result = True
+ elif match_against is True:
result = markup is not None
elif isinstance(match_against, collections.Callable):
result = match_against(markup)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2c47aa5..6ff87fc 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -188,6 +188,38 @@ class TestFindAllByAttribute(TreeTest):
self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
+ def test_find_by_class_when_multiple_classes_present(self):
+ tree = self.soup("<gar class='foo bar'>Found it</gar>")
+
+ attrs = { 'class' : re.compile("o") }
+ f = tree.find_all("gar", attrs=attrs)
+ self.assertSelects(f, ["Found it"])
+
+ f = tree.find_all("gar", re.compile("a"))
+ self.assertSelects(f, ["Found it"])
+
+ # Since the class is not the string "foo bar", but the two
+ # strings "foo" and "bar", this will not find anything.
+ attrs = { 'class' : re.compile("o b") }
+ f = tree.find_all("gar", attrs=attrs)
+ self.assertSelects(f, [])
+
+ def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
+ soup = self.soup("<a class='bar'>Found it</a>")
+
+ self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
+
+ def big_attribute_value(value):
+ return len(value) > 3
+
+ self.assertSelects(soup.find_all("a", big_attribute_value), [])
+
+ def small_attribute_value(value):
+ return len(value) <= 3
+
+ self.assertSelects(
+ soup.find_all("a", small_attribute_value), ["Found it"])
+
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
<a id="first">Match.</a>
@@ -956,6 +988,29 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
+class TestCDAtaListAttributes(SoupTest):
+
+ """Testing cdata-list attributes like 'class'.
+ """
+ def test_single_value_stays_string(self):
+ soup = self.soup("<a class='foo'>")
+ self.assertEqual("foo",soup.a['class'])
+
+ def test_multiple_values_becomes_list(self):
+ soup = self.soup("<a class='foo bar'>")
+ self.assertEqual(["foo", "bar"], soup.a['class'])
+
+ def test_multiple_values_separated_by_weird_whitespace(self):
+ soup = self.soup("<a class='foo\tbar\nbaz'>")
+ self.assertEqual(["foo", "bar", "baz"],soup.a['class'])
+
+ def test_attributes_joined_into_string_on_output(self):
+ soup = self.soup("<a class='foo\tbar'>")
+ self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
+
+ def test_accept_charset(self):
+ soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
+ self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
class TestPersistence(SoupTest):
"Testing features like pickle and deepcopy."