diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-08 18:47:23 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-08 18:47:23 -0500 |
commit | 33f0db7b3c4eef632700418068769b9cb762f708 (patch) | |
tree | 879f715a4ddc59f8826d790b55ccf74a31b797a5 /bs4/builder/__init__.py | |
parent | 91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff) |
Rationalized the treatment of multi-valued HTML attributes such as 'class'
Diffstat (limited to 'bs4/builder/__init__.py')
-rw-r--r-- | bs4/builder/__init__.py | 15 |
1 file changed, 15 insertions, 0 deletions
```diff
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index a17dce6..2728606 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -82,6 +82,11 @@ class TreeBuilder(object):
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
 
+    # A value for these attributes is a space- or comma-separated list
+    # of CDATA, rather than a single CDATA.
+    cdata_list_attributes = None
+
+
     def __init__(self):
         self.soup = None
 
@@ -189,6 +194,16 @@ class HTMLTreeBuilder(TreeBuilder):
     empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                               'spacer', 'link', 'frame', 'base'])
 
+    # The HTML standard defines these attributes as containing a
+    # space-separated list of values, not a single value. That is,
+    # class="foo bar" means that the 'class' attribute has two values,
+    # 'foo' and 'bar', not the single value 'foo bar'. When we
+    # encounter one of these attributes, we will parse its value into
+    # a list of values if possible. Upon output, the list will be
+    # converted back into a string.
+    cdata_list_attributes = set(
+        ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers'])
+
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
 
```