Rationalized the treatment of multi-valued HTML attributes such as 'class'

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
commit: 33f0db7b3c4eef632700418068769b9cb762f708 (patch)
tree: 879f715a4ddc59f8826d790b55ccf74a31b797a5
parent: 91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff)
5 files changed, 186 insertions, 14 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 8515b80..31cfdbe 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,19 @@
+= 4.0.0b5 =
+
+* Rationalized Beautiful Soup's treatment of CSS classes. A tag
+  belonging to multiple CSS classes is treated as having a list of
+  values for the 'class' attribute. Searching for a CSS class will
+  match a tag if *any* of its CSS classes matches.
+
+  This actually affects all attributes that the HTML standard defines
+  as taking multiple values (class, rel, rev, archive, accept-charset,
+  and headers), but 'class' is by far the most common.
+
+* If you pass anything other than a dictionary as the second argument
+  to one of the find* methods, Beautiful Soup will assume you want to
+  use that object to search against a tag's CSS classes. Previously
+  this only worked if you passed in a string.
+
 = 4.0.0b4 (20120208) =
 
 * Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index a17dce6..2728606 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,6 +82,11 @@ class TreeBuilder(object):
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
 
+    # A value for these attributes is a space- or comma-separated list
+    # of CDATA, rather than a single CDATA.
+    cdata_list_attributes = None
+
+
     def __init__(self):
         self.soup = None
 
@@ -189,6 +194,16 @@ class HTMLTreeBuilder(TreeBuilder):
     empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                               'spacer', 'link', 'frame', 'base'])
 
+    # The HTML standard defines these attributes as containing a
+    # space-separated list of values, not a single value. That is,
+    # class="foo bar" means that the 'class' attribute has two values,
+    # 'foo' and 'bar', not the single value 'foo bar'.  When we
+    # encounter one of these attributes, we will parse its value into
+    # a list of values if possible. Upon output, the list will be
+    # converted back into a string.
+    cdata_list_attributes = set(
+        ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
 
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index d05acb5..abea5c6 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -298,6 +298,39 @@ done by treating the tag as a dictionary::
  tag
  # <blockquote>Extremely bold</blockquote>
 
+.. _multivalue:
+
+Multi-valued attributes
+&&&&&&&&&&&&&&&&&&&&&&&
+
+HTML defines a few attributes that can have multiple values. The most
+common is ``class`` (a tag can have more than one CSS class), but
+there are a few others: ``rel``, ``rev``, ``archive``,
+``accept-charset``, and ``headers``. If one of these attributes has
+more than one value, Beautiful Soup will turn its values into a list::
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.p['class']
+ # ["body", "strikeout"]
+
+If an attribute `looks` like it has more than one value, but it's not
+one of the special attributes listed above, Beautiful Soup will leave
+the attribute alone::
+
+ id_soup = BeautifulSoup('<p id="my id"></p>')
+ id_soup.p['id']
+ # 'my id'
+
+When you turn a tag back into a string, multiple attribute values are
+consolidated::
+
+ rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
+ rel_soup.a['rel']
+ # 'index'
+ rel_soup.a['rel'] = ['index', 'contents']
+ print(rel_soup.p)
+ # <p>Back to the <a rel="index contents">homepage</a></p>
+
 ``NavigableString``
 -------------------
 
@@ -1084,11 +1117,11 @@ keyword argument::
 
 .. _attrs:
 
-``attrs`` and searching by CSS class
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Searching by CSS class
+^^^^^^^^^^^^^^^^^^^^^^
 
 Instead of using keyword arguments, you can filter tags based on their
-attributes passing a dictionary in for ``attrs``. These two lines of
+attributes by passing a dictionary in for ``attrs``. These two lines of
 code are equivalent::
 
  soup.find_all(href=re.compile("elsie"), id='link1')
@@ -1107,14 +1140,46 @@ You can use ``attrs`` to search by CSS class::
  #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
 
 But that's a lot of code for such a common operation. Instead, you can
-pass a string for `attrs` instead of a dictionary. The string will be
-used to restrict the CSS class::
+pass a string for ``attrs`` instead of a dictionary. The string will be
+used to restrict the CSS class::
 
  soup.find_all("a", "sister")
  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
  #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
  #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
 
+You can also pass in a regular expression, a function or
+True. Anything you pass in for ``attrs`` that's not a dictionary will
+be used to search against the CSS class::
+
+ soup.find_all(attrs=re.compile("itl"))
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ def has_six_characters(css_class):
+     return css_class is not None and len(css_class) == 6
+
+ soup.find_all(attrs=has_six_characters)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+:ref:`Remember <multivalue>` that a single tag can have multiple
+values for its "class" attribute. When you search for a tag that
+matches a certain CSS class, you're matching against `any` of its CSS
+classes::
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.find_all("p", "strikeout")
+ # [<p class="body strikeout"></p>]
+
+ css_soup.find_all("p", "body")
+ # [<p class="body strikeout"></p>]
+
+Searching for the string value of the ``class`` attribute won't work::
+
+ css_soup.find_all("p", "body strikeout")
+ # []
+
 .. _text:
 
 The ``text`` argument
@@ -2418,6 +2483,10 @@ Miscellaneous
 contains a single tag B and nothing else, then A.string is the same as
 B.string. (Previously, it was None.)
 
+`Multi-valued attributes`_ like ``class`` are parsed into lists if
+they have more than one value. This may affect the way you search by
+CSS class.
+
 The ``BeautifulSoup`` constructor no longer recognizes the
 `markupMassage` argument. It's now the parser's responsibility to
 handle markup correctly.
diff --git a/bs4/element.py b/bs4/element.py
index 257cdbb..ff7c972 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -7,11 +7,7 @@ from bs4.dammit import EntitySubstitution
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
 
-
-def _match_css_class(str):
-    """Build a RE to match the given CSS class."""
-    return re.compile(r"(^|.*\s)%s($|\s)" % str)
-
+whitespace_re = re.compile("\s+")
 
 def _alias(attr):
     """Alias one attribute name to another for backward compatibility"""
@@ -524,6 +520,16 @@ class Tag(PageElement):
             attrs = {}
         else:
             attrs = dict(attrs)
+            if builder.cdata_list_attributes:
+                for cdata_list_attr in builder.cdata_list_attributes:
+                    if cdata_list_attr in attrs:
+                        # Basically, we have a "class" attribute whose
+                        # value is a whitespace-separated list of CSS
+                        # classes. Split it into a list.
+                        value = attrs[cdata_list_attr]
+                        values = whitespace_re.split(value)
+                        if len(values) > 1:
+                            attrs[cdata_list_attr] = values
         self.attrs = attrs
         self.contents = []
         self.setup(parent, previous)
@@ -755,7 +761,9 @@ class Tag(PageElement):
                 if val is None:
                     decoded = key
                 else:
-                    if not isinstance(val, basestring):
+                    if isinstance(val, list) or isinstance(val, tuple):
+                        val = ' '.join(val)
+                    elif not isinstance(val, basestring):
                         val = str(val)
                     if (self.contains_substitutions
                         and eventual_encoding is not None
@@ -907,8 +915,10 @@ class SoupStrainer(object):
 
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
         self.name = name
-        if isinstance(attrs, basestring):
-            kwargs['class'] = _match_css_class(attrs)
+        if not isinstance(attrs, dict):
+            # Treat a non-dict value for attrs as a search for the 'class'
+            # attribute.
+            kwargs['class'] = attrs
             attrs = None
         if kwargs:
             if attrs:
@@ -993,7 +1003,14 @@ class SoupStrainer(object):
     def _matches(self, markup, match_against):
         #print "Matching %s against %s" % (markup, match_against)
         result = False
-        if match_against is True:
+
+        if isinstance(markup, list) or isinstance(markup, tuple):
+            # This should only happen when searching the 'class'
+            # attribute of a tag with multiple CSS classes.
+            for item in markup:
+                if self._matches(item, match_against):
+                    result = True
+        elif match_against is True:
             result = markup is not None
         elif isinstance(match_against, collections.Callable):
             result = match_against(markup)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2c47aa5..6ff87fc 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -188,6 +188,38 @@ class TestFindAllByAttribute(TreeTest):
         self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
         self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
 
+    def test_find_by_class_when_multiple_classes_present(self):
+        tree = self.soup("<gar class='foo bar'>Found it</gar>")
+
+        attrs = { 'class' : re.compile("o") }
+        f = tree.find_all("gar", attrs=attrs)
+        self.assertSelects(f, ["Found it"])
+
+        f = tree.find_all("gar", re.compile("a"))
+        self.assertSelects(f, ["Found it"])
+
+        # Since the class is not the string "foo bar", but the two
+        # strings "foo" and "bar", this will not find anything.
+        attrs = { 'class' : re.compile("o b") }
+        f = tree.find_all("gar", attrs=attrs)
+        self.assertSelects(f, [])
+
+    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
+        soup = self.soup("<a class='bar'>Found it</a>")
+
+        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
+
+        def big_attribute_value(value):
+            return len(value) > 3
+
+        self.assertSelects(soup.find_all("a", big_attribute_value), [])
+
+        def small_attribute_value(value):
+            return len(value) <= 3
+
+        self.assertSelects(
+            soup.find_all("a", small_attribute_value), ["Found it"])
+
     def test_find_all_by_attribute_soupstrainer(self):
         tree = self.soup("""
                          <a id="first">Match.</a>
@@ -956,6 +988,29 @@ class TestElementObjects(SoupTest):
         self.assertEqual(soup.a.get_text(","), "a,r, , t ")
         self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
 
+class TestCDAtaListAttributes(SoupTest):
+
+    """Testing cdata-list attributes like 'class'.
+    """
+    def test_single_value_stays_string(self):
+        soup = self.soup("<a class='foo'>")
+        self.assertEqual("foo",soup.a['class'])
+
+    def test_multiple_values_becomes_list(self):
+        soup = self.soup("<a class='foo bar'>")
+        self.assertEqual(["foo", "bar"], soup.a['class'])
+
+    def test_multiple_values_separated_by_weird_whitespace(self):
+        soup = self.soup("<a class='foo\tbar\nbaz'>")
+        self.assertEqual(["foo", "bar", "baz"],soup.a['class'])
+
+    def test_attributes_joined_into_string_on_output(self):
+        soup = self.soup("<a class='foo\tbar'>")
+        self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
+
+    def test_accept_charset(self):
+        soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
+        self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
 
 class TestPersistence(SoupTest):
     "Testing features like pickle and deepcopy."
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-08 18:47:23 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-08 18:47:23 -0500
commit	33f0db7b3c4eef632700418068769b9cb762f708 (patch)
tree	879f715a4ddc59f8826d790b55ccf74a31b797a5
parent	91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff)