summary | refs | log | tree | commit | diff
path: root/bs4/element.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-08 18:47:23 -0500
commit: 33f0db7b3c4eef632700418068769b9cb762f708 (patch)
tree: 879f715a4ddc59f8826d790b55ccf74a31b797a5 /bs4/element.py
parent: 91f0756b0cdf07a118092e17b69168c86f40a4e0 (diff)
Rationalized the treatment of multi-valued HTML attributes such as 'class'
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- bs4/element.py 35
1 files changed, 26 insertions, 9 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 257cdbb..ff7c972 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -7,11 +7,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-
-def _match_css_class(str):
- """Build a RE to match the given CSS class."""
- return re.compile(r"(^|.*\s)%s($|\s)" % str)
-
+whitespace_re = re.compile("\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -524,6 +520,16 @@ class Tag(PageElement):
attrs = {}
else:
attrs = dict(attrs)
+ if builder.cdata_list_attributes:
+ for cdata_list_attr in builder.cdata_list_attributes:
+ if cdata_list_attr in attrs:
+ # Basically, we have a "class" attribute whose
+ # value is a whitespace-separated list of CSS
+ # classes. Split it into a list.
+ value = attrs[cdata_list_attr]
+ values = whitespace_re.split(value)
+ if len(values) > 1:
+ attrs[cdata_list_attr] = values
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
@@ -755,7 +761,9 @@ class Tag(PageElement):
if val is None:
decoded = key
else:
- if not isinstance(val, basestring):
+ if isinstance(val, list) or isinstance(val, tuple):
+ val = ' '.join(val)
+ elif not isinstance(val, basestring):
val = str(val)
if (self.contains_substitutions
and eventual_encoding is not None
@@ -907,8 +915,10 @@ class SoupStrainer(object):
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
- if isinstance(attrs, basestring):
- kwargs['class'] = _match_css_class(attrs)
+ if not isinstance(attrs, dict):
+ # Treat a non-dict value for attrs as a search for the 'class'
+ # attribute.
+ kwargs['class'] = attrs
attrs = None
if kwargs:
if attrs:
@@ -993,7 +1003,14 @@ class SoupStrainer(object):
def _matches(self, markup, match_against):
#print "Matching %s against %s" % (markup, match_against)
result = False
- if match_against is True:
+
+ if isinstance(markup, list) or isinstance(markup, tuple):
+ # This should only happen when searching the 'class'
+ # attribute of a tag with multiple CSS classes.
+ for item in markup:
+ if self._matches(item, match_against):
+ result = True
+ elif match_against is True:
result = markup is not None
elif isinstance(match_against, collections.Callable):
result = match_against(markup)