summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- TODO.txt 4
-rw-r--r-- bs4/builder/__init__.py 24
-rw-r--r-- bs4/element.py 6
-rw-r--r-- bs4/tests/test_tree.py 9
4 files changed, 33 insertions, 10 deletions
diff --git a/TODO.txt b/TODO.txt
index 06dd6bc..2f03dd2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -6,10 +6,6 @@ Bugs
* html5lib doesn't support SoupStrainers, which is OK, but there
should be a warning about it.
-* HTML5 defines more multivalue attributes: iframe.sandbox,
- output.for, icon.sizes, *.accesskey, *.dropzone, td.headers,
- th.headers, a.rel, area.rel, link.rel, *.class, form.accept-charset
-
Big features
------------
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 067623e..4e31572 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,9 +82,9 @@ class TreeBuilder(object):
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
- # A value for these attributes is a space- or comma-separated list
- # of CDATA, rather than a single CDATA.
- cdata_list_attributes = None
+ # A value for these tag/attribute combinations is a space- or
+ # comma-separated list of CDATA, rather than a single CDATA.
+ cdata_list_attributes = {}
def __init__(self):
@@ -201,8 +201,22 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
-    cdata_list_attributes = set(
-        ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+    cdata_list_attributes = {
+        # "*" lists attributes that are multi-valued on every tag.
+        "*" : ['class', 'accesskey', 'dropzone'],
+        "a" : ['rel', 'rev'],
+        "link" : ['rel', 'rev'],
+        "td" : ["headers"],
+        "th" : ["headers"],
+        "form" : ["accept-charset"],
+        "object" : ["archive"],
+
+        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+        "area" : ["rel"],
+        "icon" : ["sizes"],
+        "iframe" : ["sandbox"],
+        "output" : ["for"],
+    }
# Used by set_up_substitutions to detect the charset in a META tag
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/bs4/element.py b/bs4/element.py
index 5e15252..c6a7823 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1,4 +1,5 @@
import collections
+import itertools
import re
import sys
import warnings
@@ -524,7 +525,10 @@ class Tag(PageElement):
else:
attrs = dict(attrs)
if builder.cdata_list_attributes:
- for cdata_list_attr in builder.cdata_list_attributes:
+ universal = builder.cdata_list_attributes.get('*', [])
+ tag_specific = builder.cdata_list_attributes.get(
+ self.name.lower(), [])
+ for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2e74c00..8e61429 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1065,6 +1065,15 @@ class TestCDAtaListAttributes(SoupTest):
soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
+    def test_cdata_attribute_applying_only_to_one_tag(self):
+        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
+        soup = self.soup(data)
+        # We saw in another test that accept-charset is a cdata-list
+        # attribute for the <form> tag. But it's not a cdata-list
+        # attribute for any other tag.
+        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
+
+
class TestPersistence(SoupTest):
"Testing features like pickle and deepcopy."