summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/element.py 45
-rw-r--r-- bs4/tests/test_tree.py 62
-rw-r--r-- doc/source/index.rst 36
4 files changed, 110 insertions, 39 deletions
diff --git a/NEWS.txt b/NEWS.txt
index d9b421e..cf76b84 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,9 @@
+= 4.0.0b11 () =
+
+* Brought BS up to date with the latest release of soupselect, adding
+ CSS selector support for direct descendant matches and multiple CSS
+ class matches.
+
= 4.0.0b10 (20120302) =
* Added support for simple CSS selectors, taken from the soupselect project.
diff --git a/bs4/element.py b/bs4/element.py
index 2851a75..d2fa19f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -483,7 +483,11 @@ class PageElement(object):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
- for token in tokens:
+ for index, token in enumerate(tokens):
+ if tokens[index - 1] == '>':
+ # The preceding '>' token already consumed this token as the child
+ # tag name, so skip it. NOTE(review): at index 0 this reads tokens[-1].
+ continue
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
@@ -493,9 +497,11 @@ class PageElement(object):
checker = self._attribute_checker(operator, attribute, value)
found = []
for context in current_context:
- found.extend([el for el in context.find_all(tag) if checker(el)])
+ found.extend(
+ [el for el in context.find_all(tag) if checker(el)])
current_context = found
continue
+
if '#' in token:
# ID selector
tag, id = token.split('#', 1)
@@ -506,21 +512,25 @@ class PageElement(object):
return [] # No match
current_context = [el]
continue
+
if '.' in token:
# Class selector
- tag, klass = token.split('.', 1)
- if not tag:
- tag = True
+ tag_name, klass = token.split('.', 1)
+ if not tag_name:
+ tag_name = True
+ classes = set(klass.split('.'))
found = []
+ def classes_match(tag):
+ if tag_name is not True and tag.name != tag_name:
+ return False
+ if not tag.has_attr('class'):
+ return False
+ return classes.issubset(tag['class'])
for context in current_context:
- found.extend(
- context.find_all(
- tag,
- {'class': lambda attr: attr and klass in attr.split()}
- )
- )
+ found.extend(context.find_all(classes_match))
current_context = found
continue
+
if token == '*':
# Star selector
found = []
@@ -528,6 +538,19 @@ class PageElement(object):
found.extend(context.findAll(True))
current_context = found
continue
+
+ if token == '>':
+ # Child selector
+ tag = tokens[index + 1]
+ if not tag:
+ tag = True
+
+ found = []
+ for context in current_context:
+ found.extend(context.find_all(tag, recursive=False))
+ current_context = found
+ continue
+
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6d22448..e9a5763 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1308,37 +1308,43 @@ class TestNavigableStringSubclasses(SoupTest):
class TestSoupSelector(TreeTest):
- HTML = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
+ HTML = """
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
- <title>The title</title>
- <link rel="stylesheet" href="blah.css" type="text/css" id="l1">
+<title>The title</title>
+<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<div id="main">
- <div id="inner">
- <h1 id="header1">An H1</h1>
- <p>Some text</p>
- <p class="onep" id="p1">Some more text</p>
- <h2 id="header2">An H2</h2>
- <p class="class1 class2 class3" id="pmulti">Another</p>
- <a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
- <h2 id="header3">Another H2</h2>
- <a id="me" href="http://simonwillison.net/" rel="me">me</a>
- </div>
- <p lang="en" id="lang-en">English</p>
- <p lang="en-gb" id="lang-en-gb">English UK</p>
- <p lang="en-us" id="lang-en-us">English US</p>
- <p lang="fr" id="lang-fr">French</p>
+<div id="inner">
+<h1 id="header1">An H1</h1>
+<p>Some text</p>
+<p class="onep" id="p1">Some more text</p>
+<h2 id="header2">An H2</h2>
+<p class="class1 class2 class3" id="pmulti">Another</p>
+<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
+<h2 id="header3">Another H2</h2>
+<a id="me" href="http://simonwillison.net/" rel="me">me</a>
+<span class="s1">
+<a href="#" id="s1a1">span1a1</a>
+<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
+<span class="span2">
+<a href="#" id="s2a1">span2a1</a>
+</span>
+<span class="span3"></span>
+</span>
+</div>
+<p lang="en" id="lang-en">English</p>
+<p lang="en-gb" id="lang-en-gb">English UK</p>
+<p lang="en-us" id="lang-en-us">English US</p>
+<p lang="fr" id="lang-fr">French</p>
</div>
<div id="footer">
</div>
-
-</body>
-</html>
"""
def setUp(self):
@@ -1428,6 +1434,16 @@ class TestSoupSelector(TreeTest):
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
self.assertSelects(selector, ['pmulti'])
+ def test_multi_class_selection(self):
+ for selector in ('.class1.class3', '.class3.class2',
+ '.class1.class2.class3'):
+ self.assertSelects(selector, ['pmulti'])
+
+ def test_child_selector(self):
+ self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
+ self.assertSelects('.s1 > a span', ['s1a2s1'])
+
+
def test_attribute_equals(self):
self.assertSelectMultiple(
('p[class="onep"]', ['p1']),
@@ -1481,7 +1497,7 @@ class TestSoupSelector(TreeTest):
('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']),
- ('[id$="1"]', ['l1', 'p1', 'header1']),
+ ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
('div[id$="1"]', []),
('[id$="noending"]', []),
)
@@ -1504,7 +1520,7 @@ class TestSoupSelector(TreeTest):
('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']),
- ('[id*="1"]', ['l1', 'p1', 'header1']),
+ ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
('div[id*="1"]', []),
('[id*="noending"]', []),
# New for this test
diff --git a/doc/source/index.rst b/doc/source/index.rst
index a9d404a..37d5f07 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1538,15 +1538,27 @@ You can find tags::
Find tags beneath other tags::
- soup.select("p a")
- # [<a class="sister" href="http://example.com/elsie"
- id="link1">Elsie</a>, <a class="sister"
- href="http://example.com/lacie" id="link2">Lacie</a>, <a
- class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+ soup.select("body a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("html head title")
# [<title>The Dormouse's story</title>]
+Find tags `directly` beneath other tags::
+
+ soup.select("head > title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.select("p > a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.select("body > a")
+ # []
+
Find tags by CSS class::
soup.select(".sister")
@@ -1590,6 +1602,20 @@ Find tags by attribute value::
soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+Match language codes::
+
+ multilingual_markup = """
+ <p lang="en">Hello</p>
+ <p lang="en-us">Howdy, y'all</p>
+ <p lang="en-gb">Pip-pip, old fruit</p>
+ <p lang="fr">Bonjour mes amis</p>
+ """
+ multilingual_soup = BeautifulSoup(multilingual_markup)
+ multilingual_soup.select('p[lang|=en]')
+ # [<p lang="en">Hello</p>,
+ # <p lang="en-us">Howdy, y'all</p>,
+ # <p lang="en-gb">Pip-pip, old fruit</p>]
+
This is a convenience for users who know the CSS selector syntax. You
can do all this stuff with the Beautiful Soup API. And if CSS
selectors are all you need, you might as well use lxml directly,