Some cdata-list attributes are only cdata lists for certain tags.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-15 14:27:21 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-15 14:27:21 -0500
commit: a06152365c336f41bdb5fb9513b9316740c1564a (patch)
tree: 72f7f6993624453d8f4a8ccc21ef88ef93cee59a
parent: be0c08585f54ec709740ff4352006bf3e605b8f2 (diff)
4 files changed, 33 insertions, 10 deletions
diff --git a/TODO.txt b/TODO.txt
index 06dd6bc..2f03dd2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -6,10 +6,6 @@ Bugs
 * html5lib doesn't support SoupStrainers, which is OK, but there
   should be a warning about it.
 
-* HTML5 defines more multivalue attributes: iframe.sandbox,
-  output.for, icon.sizes, *.accesskey, *.dropzone, td.headers,
-  th.headers, a.rel, area.rel, link.rel, *.class, form.accept-charset
-
 Big features
 ------------
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 067623e..4e31572 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,9 +82,9 @@ class TreeBuilder(object):
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
 
-    # A value for these attributes is a space- or comma-separated list
-    # of CDATA, rather than a single CDATA.
-    cdata_list_attributes = None
+    # A value for these tag/attribute combinations is a space- or
+    # comma-separated list of CDATA, rather than a single CDATA.
+    cdata_list_attributes = {}
 
 
     def __init__(self):
@@ -201,8 +201,22 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
-    cdata_list_attributes = set(
-        ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+    cdata_list_attributes = {
+        # Keys are lowercase tag names; "*" applies to every tag.
+        "*" : ['class', 'accesskey', 'dropzone'],
+        "a" : ['rel', 'rev'],
+        "link" :  ['rel', 'rev'],
+        "td" : ["headers"],
+        "th" : ["headers"],
+        "form" : ["accept-charset"],
+        "object" : ["archive"],
+
+        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+        "area" : ["rel"],
+        "icon" : ["sizes"],
+        "iframe" : ["sandbox"],
+        "output" : ["for"],
+        }
 
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/bs4/element.py b/bs4/element.py
index 5e15252..c6a7823 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1,4 +1,5 @@
 import collections
+import itertools
 import re
 import sys
 import warnings
@@ -524,7 +525,10 @@ class Tag(PageElement):
         else:
             attrs = dict(attrs)
             if builder.cdata_list_attributes:
-                for cdata_list_attr in builder.cdata_list_attributes:
+                universal = builder.cdata_list_attributes.get('*', [])
+                tag_specific = builder.cdata_list_attributes.get(
+                    self.name.lower(), [])
+                for cdata_list_attr in itertools.chain(universal, tag_specific):
                     if cdata_list_attr in attrs:
                         # Basically, we have a "class" attribute whose
                         # value is a whitespace-separated list of CSS
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2e74c00..8e61429 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1065,6 +1065,15 @@ class TestCDAtaListAttributes(SoupTest):
         soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
         self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
 
+    def test_cdata_attribute_applying_only_to_one_tag(self):
+        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
+        soup = self.soup(data)
+        # Another test shows that accept-charset is a cdata-list
+        # attribute for the <form> tag. It is not one for any other
+        # tag, so on <a> the value must stay a single string.
+        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
+
+
 class TestPersistence(SoupTest):
     "Testing features like pickle and deepcopy."
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-15 14:27:21 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-02-15 14:27:21 -0500
commit	a06152365c336f41bdb5fb9513b9316740c1564a (patch)
tree	72f7f6993624453d8f4a8ccc21ef88ef93cee59a
parent	be0c08585f54ec709740ff4352006bf3e605b8f2 (diff)