diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 14:27:21 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 14:27:21 -0500 |
commit | a06152365c336f41bdb5fb9513b9316740c1564a (patch) | |
tree | 72f7f6993624453d8f4a8ccc21ef88ef93cee59a | |
parent | be0c08585f54ec709740ff4352006bf3e605b8f2 (diff) |
Some cdata-list attributes are only cdata lists for certain tags.
-rw-r--r-- | TODO.txt | 4 | ---- |
-rw-r--r-- | bs4/builder/__init__.py | 24 | +++++++++++++++++++----- |
-rw-r--r-- | bs4/element.py | 6 | +++++- |
-rw-r--r-- | bs4/tests/test_tree.py | 9 | +++++++++ |
4 files changed, 33 insertions, 10 deletions
@@ -6,10 +6,6 @@ Bugs * html5lib doesn't support SoupStrainers, which is OK, but there should be a warning about it. -* HTML5 defines more multivalue attributes: iframe.sandbox, - output.for, icon.sizes, *.accesskey, *.dropzone, td.headers, - th.headers, a.rel, area.rel, link.rel, *.class, form.accept-charset - Big features ------------ diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 067623e..4e31572 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -82,9 +82,9 @@ class TreeBuilder(object): empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - # A value for these attributes is a space- or comma-separated list - # of CDATA, rather than a single CDATA. - cdata_list_attributes = None + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + cdata_list_attributes = {} def __init__(self): @@ -201,8 +201,22 @@ class HTMLTreeBuilder(TreeBuilder): # encounter one of these attributes, we will parse its value into # a list of values if possible. Upon output, the list will be # converted back into a string. - cdata_list_attributes = set( - ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers']) + cdata_list_attributes = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. 
+ "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } # Used by set_up_substitutions to detect the charset in a META tag CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) diff --git a/bs4/element.py b/bs4/element.py index 5e15252..c6a7823 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,4 +1,5 @@ import collections +import itertools import re import sys import warnings @@ -524,7 +525,10 @@ class Tag(PageElement): else: attrs = dict(attrs) if builder.cdata_list_attributes: - for cdata_list_attr in builder.cdata_list_attributes: + universal = builder.cdata_list_attributes.get('*', []) + tag_specific = builder.cdata_list_attributes.get( + self.name.lower(), []) + for cdata_list_attr in itertools.chain(universal, tag_specific): if cdata_list_attr in attrs: # Basically, we have a "class" attribute whose # value is a whitespace-separated list of CSS diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 2e74c00..8e61429 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1065,6 +1065,15 @@ class TestCDAtaListAttributes(SoupTest): soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) + def test_cdata_attribute_applying_only_to_one_tag(self): + data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' + soup = self.soup(data) + # We saw in another test that accept-charset is a cdata-list + # attribute for the <form> tag. But it's not a cdata-list + # attribute for any other tag. + self.assertEquals('ISO-8859-1 UTF-8', soup.a['accept-charset']) + + class TestPersistence(SoupTest): "Testing features like pickle and deepcopy." |