diff options
-rw-r--r--  NEWS.txt                  | 16
-rw-r--r--  bs4/builder/__init__.py   | 15
-rw-r--r--  bs4/doc/source/index.rst  | 79
-rw-r--r--  bs4/element.py            | 35
-rw-r--r--  bs4/tests/test_tree.py    | 55
5 files changed, 186 insertions, 14 deletions
@@ -1,3 +1,19 @@ += 4.0.0b5 = + +* Rationalized Beautiful Soup's treatment of CSS class. A tag + belonging to multiple CSS classes is treated as having a list of + values for the 'class' attribute. Searching for a CSS class will + match *any* of the CSS classes. + + This actually affects all attributes that the HTML standard defines + as taking multiple values (class, rel, rev, archive, accept-charset, + and headers), but 'class' is by far the most common. + +* If you pass anything other than a dictionary as the second argument + to one of the find* methods, it'll assume you want to use that + object to search against a tag's CSS classes. Previously this only + worked if you passed in a string. + = 4.0.0b4 (20120208) = * Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index a17dce6..2728606 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -82,6 +82,11 @@ class TreeBuilder(object): empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. + # A value for these attributes is a space- or comma-separated list + # of CDATA, rather than a single CDATA. + cdata_list_attributes = None + + def __init__(self): self.soup = None @@ -189,6 +194,16 @@ class HTMLTreeBuilder(TreeBuilder): empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. 
+ cdata_list_attributes = set( + ['class', 'rel', 'rev', 'archive', 'accept-charset', 'headers']) + # Used by set_up_substitutions to detect the charset in a META tag CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst index d05acb5..abea5c6 100644 --- a/bs4/doc/source/index.rst +++ b/bs4/doc/source/index.rst @@ -298,6 +298,39 @@ done by treating the tag as a dictionary:: tag # <blockquote>Extremely bold</blockquote> +.. _multivalue: + +Multi-valued attributes +&&&&&&&&&&&&&&&&&&&&&&& + +HTML defines a few attributes that can have multiple values. The most +common is ``class`` (a tag can have more than one CSS class), but +there are a few others: ``rel``, ``rev``, ``archive``, +``accept-charset``, and ``headers``. If one of these attributes has +more than one value, Beautiful Soup will turn its values into a list:: + + css_soup = BeautifulSoup('<p class="body strikeout"></p>') + css_soup.p['class'] + # ["body", "strikeout"] + +If an attribute `looks` like it has more than one value, but it's not +one of the special attributes listed above, Beautiful Soup will leave +the attribute alone:: + + id_soup = BeautifulSoup('<p id="my id"></p>') + id_soup.p['id'] + # 'my id' + +When you turn a tag back into a string, multiple attribute values are +consolidated:: + + rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>') + rel_soup.a['rel'] + # 'index' + rel_soup.a['rel'] = ['index', 'contents'] + print(rel_soup.p) + # <p>Back to the <a rel="index contents">homepage</a></p> + ``NavigableString`` ------------------- @@ -1084,11 +1117,11 @@ keyword argument:: .. _attrs: -``attrs`` and searching by CSS class -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Searching by CSS class +^^^^^^^^^^^^^^^^^^^^^^ Instead of using keyword arguments, you can filter tags based on their -attributes passing a dictionary in for ``attrs``. These two lines of +attributes by passing a dictionary in for ``attrs``. 
These two lines of code are equivalent:: soup.find_all(href=re.compile("elsie"), id='link1') @@ -1107,14 +1140,46 @@ You can use ``attrs`` to search by CSS class:: # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] But that's a lot of code for such a common operation. Instead, you can -pass a string for `attrs` instead of a dictionary. The string will be -used to restrict the CSS class:: +pass a string `attrs` instead of a dictionary. The string will be used +to restrict the CSS class:: soup.find_all("a", "sister") # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] +You can also pass in a regular expression, a function or +True. Anything you pass in for ``attrs`` that's not a dictionary will +be used to search against the CSS class:: + + soup.find_all(attrs=re.compile("itl")) + # [<p class="title"><b>The Dormouse's story</b></p>] + + def has_six_characters(css_class): + return css_class is not None and len(css_class) == 6 + + soup.find_all(attrs=has_six_characters) + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +:ref:`Remember <multivalue>` that a single tag can have multiple +values for its "class" attribute. When you search for a tag that +matches a certain CSS class, you're matching against `any` of its CSS +classes:: + + css_soup = BeautifulSoup('<p class="body strikeout"></p>') + css_soup.find_all("p", "strikeout") + # [<p class="body strikeout"></p>] + + css_soup.find_all("p", "body") + # [<p class="body strikeout"></p>] + +Searching for the string value of the ``class`` attribute won't work:: + + css_soup.find_all("p", "body strikeout") + # [] + .. 
_text: The ``text`` argument @@ -2418,6 +2483,10 @@ Miscellaneous contains a single tag B and nothing else, then A.string is the same as B.string. (Previously, it was None.) +`Multi-valued attributes`_ like ``class`` are parsed into lists if +they have more than one value. This may affect the way you search by +CSS class. + The ``BeautifulSoup`` constructor no longer recognizes the `markupMassage` argument. It's now the parser's responsibility to handle markup correctly. diff --git a/bs4/element.py b/bs4/element.py index 257cdbb..ff7c972 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -7,11 +7,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) - -def _match_css_class(str): - """Build a RE to match the given CSS class.""" - return re.compile(r"(^|.*\s)%s($|\s)" % str) - +whitespace_re = re.compile("\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -524,6 +520,16 @@ class Tag(PageElement): attrs = {} else: attrs = dict(attrs) + if builder.cdata_list_attributes: + for cdata_list_attr in builder.cdata_list_attributes: + if cdata_list_attr in attrs: + # Basically, we have a "class" attribute whose + # value is a whitespace-separated list of CSS + # classes. Split it into a list. 
+ value = attrs[cdata_list_attr] + values = whitespace_re.split(value) + if len(values) > 1: + attrs[cdata_list_attr] = values self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -755,7 +761,9 @@ class Tag(PageElement): if val is None: decoded = key else: - if not isinstance(val, basestring): + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): val = str(val) if (self.contains_substitutions and eventual_encoding is not None @@ -907,8 +915,10 @@ class SoupStrainer(object): def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs attrs = None if kwargs: if attrs: @@ -993,7 +1003,14 @@ class SoupStrainer(object): def _matches(self, markup, match_against): #print "Matching %s against %s" % (markup, match_against) result = False - if match_against is True: + + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching the 'class' + # attribute of a tag with multiple CSS classes. 
+ for item in markup: + if self._matches(item, match_against): + result = True + elif match_against is True: result = markup is not None elif isinstance(match_against, collections.Callable): result = match_against(markup) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 2c47aa5..6ff87fc 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -188,6 +188,38 @@ class TestFindAllByAttribute(TreeTest): self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) + def test_find_by_class_when_multiple_classes_present(self): + tree = self.soup("<gar class='foo bar'>Found it</gar>") + + attrs = { 'class' : re.compile("o") } + f = tree.find_all("gar", attrs=attrs) + self.assertSelects(f, ["Found it"]) + + f = tree.find_all("gar", re.compile("a")) + self.assertSelects(f, ["Found it"]) + + # Since the class is not the string "foo bar", but the two + # strings "foo" and "bar", this will not find anything. + attrs = { 'class' : re.compile("o b") } + f = tree.find_all("gar", attrs=attrs) + self.assertSelects(f, []) + + def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): + soup = self.soup("<a class='bar'>Found it</a>") + + self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) + + def big_attribute_value(value): + return len(value) > 3 + + self.assertSelects(soup.find_all("a", big_attribute_value), []) + + def small_attribute_value(value): + return len(value) <= 3 + + self.assertSelects( + soup.find_all("a", small_attribute_value), ["Found it"]) + def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" <a id="first">Match.</a> @@ -956,6 +988,29 @@ class TestElementObjects(SoupTest): self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") +class TestCDAtaListAttributes(SoupTest): + + """Testing cdata-list attributes like 'class'. 
+ """ + def test_single_value_stays_string(self): + soup = self.soup("<a class='foo'>") + self.assertEqual("foo",soup.a['class']) + + def test_multiple_values_becomes_list(self): + soup = self.soup("<a class='foo bar'>") + self.assertEqual(["foo", "bar"], soup.a['class']) + + def test_multiple_values_separated_by_weird_whitespace(self): + soup = self.soup("<a class='foo\tbar\nbaz'>") + self.assertEqual(["foo", "bar", "baz"],soup.a['class']) + + def test_attributes_joined_into_string_on_output(self): + soup = self.soup("<a class='foo\tbar'>") + self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) + + def test_accept_charset(self): + soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') + self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) class TestPersistence(SoupTest): "Testing features like pickle and deepcopy." |