summary refs log tree commit diff
path: root/bs4
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-03-02 10:29:08 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-03-02 10:29:08 -0500
commit 74ca8e3f33d44475401be0bc418da83264f91207 (patch)
tree 329891346e0a4a9fb032666b7b36d42c44d1857f /bs4
parent e3671b76b089f015ded142966aae0e8cdb572aa6 (diff)
Brought the soupselect port up to date.
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/element.py 45
-rw-r--r-- bs4/tests/test_tree.py 62
2 files changed, 73 insertions, 34 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 2851a75..d2fa19f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -483,7 +483,11 @@ class PageElement(object):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
- for token in tokens:
+ for index, token in enumerate(tokens):
+ if tokens[index - 1] == '>':
+ # already found direct descendants in last step. skip this
+ # step.
+ continue
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
@@ -493,9 +497,11 @@ class PageElement(object):
checker = self._attribute_checker(operator, attribute, value)
found = []
for context in current_context:
- found.extend([el for el in context.find_all(tag) if checker(el)])
+ found.extend(
+ [el for el in context.find_all(tag) if checker(el)])
current_context = found
continue
+
if '#' in token:
# ID selector
tag, id = token.split('#', 1)
@@ -506,21 +512,25 @@ class PageElement(object):
return [] # No match
current_context = [el]
continue
+
if '.' in token:
# Class selector
- tag, klass = token.split('.', 1)
- if not tag:
- tag = True
+ tag_name, klass = token.split('.', 1)
+ if not tag_name:
+ tag_name = True
+ classes = set(klass.split('.'))
found = []
+ def classes_match(tag):
+ if tag_name is not True and tag.name != tag_name:
+ return False
+ if not tag.has_attr('class'):
+ return False
+ return classes.issubset(tag['class'])
for context in current_context:
- found.extend(
- context.find_all(
- tag,
- {'class': lambda attr: attr and klass in attr.split()}
- )
- )
+ found.extend(context.find_all(classes_match))
current_context = found
continue
+
if token == '*':
# Star selector
found = []
@@ -528,6 +538,19 @@ class PageElement(object):
found.extend(context.findAll(True))
current_context = found
continue
+
+ if token == '>':
+ # Child selector
+ tag = tokens[index + 1]
+ if not tag:
+ tag = True
+
+ found = []
+ for context in current_context:
+ found.extend(context.find_all(tag, recursive=False))
+ current_context = found
+ continue
+
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6d22448..e9a5763 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1308,37 +1308,43 @@ class TestNavigableStringSubclasses(SoupTest):
class TestSoupSelector(TreeTest):
- HTML = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
+ HTML = """
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
- <title>The title</title>
- <link rel="stylesheet" href="blah.css" type="text/css" id="l1">
+<title>The title</title>
+<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<div id="main">
- <div id="inner">
- <h1 id="header1">An H1</h1>
- <p>Some text</p>
- <p class="onep" id="p1">Some more text</p>
- <h2 id="header2">An H2</h2>
- <p class="class1 class2 class3" id="pmulti">Another</p>
- <a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
- <h2 id="header3">Another H2</h2>
- <a id="me" href="http://simonwillison.net/" rel="me">me</a>
- </div>
- <p lang="en" id="lang-en">English</p>
- <p lang="en-gb" id="lang-en-gb">English UK</p>
- <p lang="en-us" id="lang-en-us">English US</p>
- <p lang="fr" id="lang-fr">French</p>
+<div id="inner">
+<h1 id="header1">An H1</h1>
+<p>Some text</p>
+<p class="onep" id="p1">Some more text</p>
+<h2 id="header2">An H2</h2>
+<p class="class1 class2 class3" id="pmulti">Another</p>
+<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
+<h2 id="header3">Another H2</h2>
+<a id="me" href="http://simonwillison.net/" rel="me">me</a>
+<span class="s1">
+<a href="#" id="s1a1">span1a1</a>
+<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
+<span class="span2">
+<a href="#" id="s2a1">span2a1</a>
+</span>
+<span class="span3"></span>
+</span>
+</div>
+<p lang="en" id="lang-en">English</p>
+<p lang="en-gb" id="lang-en-gb">English UK</p>
+<p lang="en-us" id="lang-en-us">English US</p>
+<p lang="fr" id="lang-fr">French</p>
</div>
<div id="footer">
</div>
-
-</body>
-</html>
"""
def setUp(self):
@@ -1428,6 +1434,16 @@ class TestSoupSelector(TreeTest):
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
self.assertSelects(selector, ['pmulti'])
+ def test_multi_class_selection(self):
+ for selector in ('.class1.class3', '.class3.class2',
+ '.class1.class2.class3'):
+ self.assertSelects(selector, ['pmulti'])
+
+ def test_child_selector(self):
+ self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
+ self.assertSelects('.s1 > a span', ['s1a2s1'])
+
+
def test_attribute_equals(self):
self.assertSelectMultiple(
('p[class="onep"]', ['p1']),
@@ -1481,7 +1497,7 @@ class TestSoupSelector(TreeTest):
('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']),
- ('[id$="1"]', ['l1', 'p1', 'header1']),
+ ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
('div[id$="1"]', []),
('[id$="noending"]', []),
)
@@ -1504,7 +1520,7 @@ class TestSoupSelector(TreeTest):
('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']),
- ('[id*="1"]', ['l1', 'p1', 'header1']),
+ ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
('div[id*="1"]', []),
('[id*="noending"]', []),
# New for this test