diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-03-02 10:29:08 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-03-02 10:29:08 -0500 |
commit | 74ca8e3f33d44475401be0bc418da83264f91207 (patch) | |
tree | 329891346e0a4a9fb032666b7b36d42c44d1857f /bs4 | |
parent | e3671b76b089f015ded142966aae0e8cdb572aa6 (diff) |
Brought the soupselect port up to date.
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/element.py | 45 |
-rw-r--r-- | bs4/tests/test_tree.py | 62 |
2 files changed, 73 insertions, 34 deletions
diff --git a/bs4/element.py b/bs4/element.py index 2851a75..d2fa19f 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -483,7 +483,11 @@ class PageElement(object): """Perform a CSS selection operation on the current element.""" tokens = selector.split() current_context = [self] - for token in tokens: + for index, token in enumerate(tokens): + if tokens[index - 1] == '>': + # already found direct descendants in last step. skip this + # step. + continue m = self.attribselect_re.match(token) if m is not None: # Attribute selector @@ -493,9 +497,11 @@ class PageElement(object): checker = self._attribute_checker(operator, attribute, value) found = [] for context in current_context: - found.extend([el for el in context.find_all(tag) if checker(el)]) + found.extend( + [el for el in context.find_all(tag) if checker(el)]) current_context = found continue + if '#' in token: # ID selector tag, id = token.split('#', 1) @@ -506,21 +512,25 @@ class PageElement(object): return [] # No match current_context = [el] continue + if '.' 
in token: # Class selector - tag, klass = token.split('.', 1) - if not tag: - tag = True + tag_name, klass = token.split('.', 1) + if not tag_name: + tag_name = True + classes = set(klass.split('.')) found = [] + def classes_match(tag): + if tag_name is not True and tag.name != tag_name: + return False + if not tag.has_attr('class'): + return False + return classes.issubset(tag['class']) for context in current_context: - found.extend( - context.find_all( - tag, - {'class': lambda attr: attr and klass in attr.split()} - ) - ) + found.extend(context.find_all(classes_match)) current_context = found continue + if token == '*': # Star selector found = [] @@ -528,6 +538,19 @@ class PageElement(object): found.extend(context.findAll(True)) current_context = found continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + current_context = found + continue + # Here we should just have a regular tag if not self.tag_name_re.match(token): return [] diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6d22448..e9a5763 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1308,37 +1308,43 @@ class TestNavigableStringSubclasses(SoupTest): class TestSoupSelector(TreeTest): - HTML = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html4/strict.dtd"> + HTML = """ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> <html> <head> - <title>The title</title> - <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +<title>The title</title> +<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> </head> <body> <div id="main"> - <div id="inner"> - <h1 id="header1">An H1</h1> - <p>Some text</p> - <p class="onep" id="p1">Some more text</p> - <h2 id="header2">An H2</h2> - <p class="class1 class2 class3" id="pmulti">Another</p> - <a 
href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> - <h2 id="header3">Another H2</h2> - <a id="me" href="http://simonwillison.net/" rel="me">me</a> - </div> - <p lang="en" id="lang-en">English</p> - <p lang="en-gb" id="lang-en-gb">English UK</p> - <p lang="en-us" id="lang-en-us">English US</p> - <p lang="fr" id="lang-fr">French</p> +<div id="inner"> +<h1 id="header1">An H1</h1> +<p>Some text</p> +<p class="onep" id="p1">Some more text</p> +<h2 id="header2">An H2</h2> +<p class="class1 class2 class3" id="pmulti">Another</p> +<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> +<h2 id="header3">Another H2</h2> +<a id="me" href="http://simonwillison.net/" rel="me">me</a> +<span class="s1"> +<a href="#" id="s1a1">span1a1</a> +<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> +<span class="span2"> +<a href="#" id="s2a1">span2a1</a> +</span> +<span class="span3"></span> +</span> +</div> +<p lang="en" id="lang-en">English</p> +<p lang="en-gb" id="lang-en-gb">English UK</p> +<p lang="en-us" id="lang-en-us">English US</p> +<p lang="fr" id="lang-fr">French</p> </div> <div id="footer"> </div> - -</body> -</html> """ def setUp(self): @@ -1428,6 +1434,16 @@ class TestSoupSelector(TreeTest): '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): self.assertSelects(selector, ['pmulti']) + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assertSelects(selector, ['pmulti']) + + def test_child_selector(self): + self.assertSelects('.s1 > a', ['s1a1', 's1a2']) + self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_attribute_equals(self): self.assertSelectMultiple( ('p[class="onep"]', ['p1']), @@ -1481,7 +1497,7 @@ class TestSoupSelector(TreeTest): ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1']), + ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), 
('div[id$="1"]', []), ('[id$="noending"]', []), ) @@ -1504,7 +1520,7 @@ class TestSoupSelector(TreeTest): ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1']), + ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), ('div[id*="1"]', []), ('[id*="noending"]', []), # New for this test |