summary | refs | log | tree | commit | diff
path: root/bs4/builder/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
commit 33f0db7b3c4eef632700418068769b9cb762f708 (patch)
tree 879f715a4ddc59f8826d790b55ccf74a31b797a5 /bs4/builder/__init__.py
parent 91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff)
Rationalized the treatment of multi-valued HTML attributes such as 'class'
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- bs4/builder/__init__.py 15
1 file changed, 15 insertions, 0 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index a17dce6..2728606 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,6 +82,11 @@ class TreeBuilder(object):
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
+ # The value of each of these attributes is a space- or comma-separated
+ # list of CDATA values, rather than a single CDATA value.
+ cdata_list_attributes = None
+
+
def __init__(self):
self.soup = None
@@ -189,6 +194,16 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ # The HTML standard defines these attributes as containing a
+ # space-separated list of values, not a single value. That is,
+ # class="foo bar" means that the 'class' attribute has two values,
+ # 'foo' and 'bar', not the single value 'foo bar'. When we
+ # encounter one of these attributes, we will parse its value into
+ # a list of values if possible. Upon output, the list will be
+ # converted back into a string.
+ cdata_list_attributes = set(
+ ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+
# Used by set_up_substitutions to detect the charset in a META tag
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)