diff options
-rw-r--r-- | NEWS.txt | 8 | ||||
-rw-r--r-- | bs4/element.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 21 | ||||
-rw-r--r-- | doc/source/index.rst | 84 |
4 files changed, 74 insertions, 45 deletions
@@ -1,10 +1,14 @@ = 4.1.2 (Unreleased) = +* As per PEP-8, allow searching by CSS class using the 'class_' + keyword argument. [bug=1037624] + * Use namespace prefixes for namespaced attribute names, instead of the fully-qualified names given by the lxml parser. [bug=1037597] -* When sniffing encodings, if the cchardet library is installed, use - it instead of chardet. It's much faster. [bug=1020748] +* When sniffing encodings, if the cchardet library is installed, + Beautiful Soup uses cchardet instead of chardet. cchardet is much + faster. [bug=1020748] * Use logging.warning() instead of warning.warn() to notify the user that characters were replaced with REPLACEMENT diff --git a/bs4/element.py b/bs4/element.py index 4a4d3ed..2e3be46 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1170,6 +1170,12 @@ class SoupStrainer(object): kwargs['class'] = attrs attrs = None + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + if kwargs: if attrs: attrs = attrs.copy() diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index cc573ed..9397f24 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -228,18 +228,24 @@ class TestFindAllByAttribute(TreeTest): self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) - # Passing class='class2' would cause a syntax error. self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): - # Passing in a string to 'attrs' will search the CSS class. tree = self.soup(""" <a class="1">Class 1.</a> <a class="2">Class 2.</a> <b class="1">Class 1.</b> <c class="3 4">Class 3 and 4.</c> """) + + # Passing in the class_ keyword argument will search against + # the 'class' attribute. 
+ self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) + self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) + + # Passing in a string to 'attrs' will also search the CSS class. self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) @@ -248,17 +254,15 @@ class TestFindAllByAttribute(TreeTest): def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("<gar class='foo bar'>Found it</gar>") - attrs = { 'class' : re.compile("o") } - f = tree.find_all("gar", attrs=attrs) + f = tree.find_all("gar", class_=re.compile("o")) self.assertSelects(f, ["Found it"]) - f = tree.find_all("gar", re.compile("a")) + f = tree.find_all("gar", class_=re.compile("a")) self.assertSelects(f, ["Found it"]) # Since the class is not the string "foo bar", but the two # strings "foo" and "bar", this will not find anything. - attrs = { 'class' : re.compile("o b") } - f = tree.find_all("gar", attrs=attrs) + f = tree.find_all("gar", class_=re.compile("o b")) self.assertSelects(f, []) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): @@ -283,8 +287,9 @@ class TestFindAllByAttribute(TreeTest): self.assertEqual([a, a2], soup.find_all("a", "foo")) self.assertEqual([a], soup.find_all("a", "bar")) - # If you specify the attribute as a string that contains a + # If you specify the class as a string that contains a # space, only that specific value will be found. 
+ self.assertEqual([a], soup.find_all("a", class_="foo bar")) self.assertEqual([a], soup.find_all("a", "foo bar")) self.assertEqual([], soup.find_all("a", "bar foo")) diff --git a/doc/source/index.rst b/doc/source/index.rst index 3c8cc76..e51ec84 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -20,9 +20,11 @@ The examples in this documentation should work the same way in Python 2.7 and Python 3.2. You might be looking for the documentation for `Beautiful Soup 3 -<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If -you want to learn about the differences between Beautiful Soup 3 and -Beautiful Soup 4, see `Porting code to BS4`_. +<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. +If so, you should know that Beautiful Soup 3 is no longer being +developed, and that Beautiful Soup 4 is recommended for all new +projects. If you want to learn about the differences between Beautiful +Soup 3 and Beautiful Soup 4, see `Porting code to BS4`_. Getting help ------------ @@ -1217,45 +1219,27 @@ keyword argument:: Searching by CSS class ^^^^^^^^^^^^^^^^^^^^^^ -Instead of using keyword arguments, you can filter tags based on their -attributes by passing a dictionary in for ``attrs``. These two lines of -code are equivalent:: - - soup.find_all(href=re.compile("elsie"), id='link1') - soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) - -The ``attrs`` argument would be a pretty obscure feature were it not for -one thing: CSS. It's very useful to search for a tag that has a -certain CSS class, but the name of the CSS attribute, "class", is also a -Python reserved word. - -You can use ``attrs`` to search by CSS class:: +It's very useful to search for a tag that has a certain CSS class, but +the name of the CSS attribute, "class", is a reserved word in +Python. Using ``class`` as a keyword argument will give you a syntax +error. 
As of Beautiful Soup 4.1.2, you can search by CSS class using +the keyword argument ``class_``:: - soup.find_all("a", { "class" : "sister" }) + soup.find_all("a", class_="sister") # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] -But that's a lot of code for such a common operation. Instead, you can -pass a string `attrs` instead of a dictionary. The string will be used -to restrict the CSS class:: +As with any keyword argument, you can pass ``class_`` a string, a regular +expression, a function, or ``True``:: - soup.find_all("a", "sister") - # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, - # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, - # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] - -You can also pass in a regular expression, a function or -True. 
Anything you pass in for ``attrs`` that's not a dictionary will -be used to search against the CSS class:: - - soup.find_all(attrs=re.compile("itl")) + soup.find_all(class_=re.compile("itl")) # [<p class="title"><b>The Dormouse's story</b></p>] def has_six_characters(css_class): return css_class is not None and len(css_class) == 6 - soup.find_all(attrs=has_six_characters) + soup.find_all(class_=has_six_characters) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] @@ -1266,17 +1250,47 @@ matches a certain CSS class, you're matching against `any` of its CSS classes:: css_soup = BeautifulSoup('<p class="body strikeout"></p>') - css_soup.find_all("p", "strikeout") + css_soup.find_all("p", class_="strikeout") # [<p class="body strikeout"></p>] - css_soup.find_all("p", "body") + css_soup.find_all("p", class_="body") # [<p class="body strikeout"></p>] -Searching for the string value of the ``class`` attribute won't work:: +You can also search for the exact string value of the ``class`` attribute:: + + css_soup.find_all("p", class_="body strikeout") + # [<p class="body strikeout"></p>] - css_soup.find_all("p", "body strikeout") +But searching for variants of the string value won't work:: + + css_soup.find_all("p", class_="strikeout body") # [] +There's a shortcut for ``class_`` present in all versions of Beautiful +Soup. 
The second argument to any ``find()``-type method is called +``attrs``, and passing in a string for ``attrs`` will search for that +string as a CSS class:: + + soup.find_all("a", "sister") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +You can also pass in a regular expression, a function or +True—anything except a dictionary. Whatever you pass in will be +used to search against the CSS class, the same as if you'd passed it +in for the ``class_`` keyword argument. + +By passing in a dictionary to ``attrs``, you can search many HTML +attributes at once, not just the CSS class. These two lines of code +are equivalent:: + + soup.find_all(href=re.compile("elsie"), id='link1') + soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) + +This isn't a very useful feature, since it's usually easier +to use the keyword arguments. + .. _text: The ``text`` argument |