diff options
-rw-r--r-- | NEWS.txt | 6 |
-rw-r--r-- | bs4/element.py | 45 |
-rw-r--r-- | bs4/tests/test_tree.py | 62 |
-rw-r--r-- | doc/source/index.rst | 36 |
4 files changed, 110 insertions, 39 deletions
@@ -1,3 +1,9 @@ += 4.0.0b11 () = + +* Brought BS up to date with the latest release of soupselect, adding + CSS selector support for direct descendant matches and multiple CSS + class matches. + = 4.0.0b10 (20120302) = * Added support for simple CSS selectors, taken from the soupselect project. diff --git a/bs4/element.py b/bs4/element.py index 2851a75..d2fa19f 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -483,7 +483,11 @@ class PageElement(object): """Perform a CSS selection operation on the current element.""" tokens = selector.split() current_context = [self] - for token in tokens: + for index, token in enumerate(tokens): + if tokens[index - 1] == '>': + # already found direct descendants in last step. skip this + # step. + continue m = self.attribselect_re.match(token) if m is not None: # Attribute selector @@ -493,9 +497,11 @@ class PageElement(object): checker = self._attribute_checker(operator, attribute, value) found = [] for context in current_context: - found.extend([el for el in context.find_all(tag) if checker(el)]) + found.extend( + [el for el in context.find_all(tag) if checker(el)]) current_context = found continue + if '#' in token: # ID selector tag, id = token.split('#', 1) @@ -506,21 +512,25 @@ class PageElement(object): return [] # No match current_context = [el] continue + if '.' 
in token: # Class selector - tag, klass = token.split('.', 1) - if not tag: - tag = True + tag_name, klass = token.split('.', 1) + if not tag_name: + tag_name = True + classes = set(klass.split('.')) found = [] + def classes_match(tag): + if tag_name is not True and tag.name != tag_name: + return False + if not tag.has_attr('class'): + return False + return classes.issubset(tag['class']) for context in current_context: - found.extend( - context.find_all( - tag, - {'class': lambda attr: attr and klass in attr.split()} - ) - ) + found.extend(context.find_all(classes_match)) current_context = found continue + if token == '*': # Star selector found = [] @@ -528,6 +538,19 @@ class PageElement(object): found.extend(context.findAll(True)) current_context = found continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + current_context = found + continue + # Here we should just have a regular tag if not self.tag_name_re.match(token): return [] diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6d22448..e9a5763 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1308,37 +1308,43 @@ class TestNavigableStringSubclasses(SoupTest): class TestSoupSelector(TreeTest): - HTML = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html4/strict.dtd"> + HTML = """ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> <html> <head> - <title>The title</title> - <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +<title>The title</title> +<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> </head> <body> <div id="main"> - <div id="inner"> - <h1 id="header1">An H1</h1> - <p>Some text</p> - <p class="onep" id="p1">Some more text</p> - <h2 id="header2">An H2</h2> - <p class="class1 class2 class3" id="pmulti">Another</p> - <a 
href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> - <h2 id="header3">Another H2</h2> - <a id="me" href="http://simonwillison.net/" rel="me">me</a> - </div> - <p lang="en" id="lang-en">English</p> - <p lang="en-gb" id="lang-en-gb">English UK</p> - <p lang="en-us" id="lang-en-us">English US</p> - <p lang="fr" id="lang-fr">French</p> +<div id="inner"> +<h1 id="header1">An H1</h1> +<p>Some text</p> +<p class="onep" id="p1">Some more text</p> +<h2 id="header2">An H2</h2> +<p class="class1 class2 class3" id="pmulti">Another</p> +<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> +<h2 id="header3">Another H2</h2> +<a id="me" href="http://simonwillison.net/" rel="me">me</a> +<span class="s1"> +<a href="#" id="s1a1">span1a1</a> +<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> +<span class="span2"> +<a href="#" id="s2a1">span2a1</a> +</span> +<span class="span3"></span> +</span> +</div> +<p lang="en" id="lang-en">English</p> +<p lang="en-gb" id="lang-en-gb">English UK</p> +<p lang="en-us" id="lang-en-us">English US</p> +<p lang="fr" id="lang-fr">French</p> </div> <div id="footer"> </div> - -</body> -</html> """ def setUp(self): @@ -1428,6 +1434,16 @@ class TestSoupSelector(TreeTest): '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): self.assertSelects(selector, ['pmulti']) + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assertSelects(selector, ['pmulti']) + + def test_child_selector(self): + self.assertSelects('.s1 > a', ['s1a1', 's1a2']) + self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_attribute_equals(self): self.assertSelectMultiple( ('p[class="onep"]', ['p1']), @@ -1481,7 +1497,7 @@ class TestSoupSelector(TreeTest): ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1']), + ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), 
('div[id$="1"]', []), ('[id$="noending"]', []), ) @@ -1504,7 +1520,7 @@ class TestSoupSelector(TreeTest): ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1']), + ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), ('div[id*="1"]', []), ('[id*="noending"]', []), # New for this test diff --git a/doc/source/index.rst b/doc/source/index.rst index a9d404a..37d5f07 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1538,15 +1538,27 @@ You can find tags:: Find tags beneath other tags:: - soup.select("p a") - # [<a class="sister" href="http://example.com/elsie" - id="link1">Elsie</a>, <a class="sister" - href="http://example.com/lacie" id="link2">Lacie</a>, <a - class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + soup.select("body a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] soup.select("html head title") # [<title>The Dormouse's story</title>] +Find tags `directly` beneath other tags:: + + soup.select("head > title") + # [<title>The Dormouse's story</title>] + + soup.select("p > a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("body > a") + # [] + Find tags by CSS class:: soup.select(".sister") @@ -1590,6 +1602,20 @@ Find tags by attribute value:: soup.select('a[href*=".com/el"]') # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] +Match language codes:: + + multilingual_markup = """ + <p lang="en">Hello</p> + <p lang="en-us">Howdy, y'all</p> + <p lang="en-gb">Pip-pip, old fruit</p> + <p lang="fr">Bonjour mes amis</p> + """ + 
multilingual_soup = BeautifulSoup(multilingual_markup) + multilingual_soup.select('p[lang|=en]') + # [<p lang="en">Hello</p>, + # <p lang="en-us">Howdy, y'all</p>, + # <p lang="en-gb">Pip-pip, old fruit</p>] + This is a convenience for users who know the CSS selector syntax. You can do all this stuff with the Beautiful Soup API. And if CSS selectors are all you need, you might as well use lxml directly, |