From 74ca8e3f33d44475401be0bc418da83264f91207 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 2 Mar 2012 10:29:08 -0500 Subject: Brought the soupselect port up to date. --- NEWS.txt | 6 +++++ bs4/element.py | 45 +++++++++++++++++++++++++++--------- bs4/tests/test_tree.py | 62 +++++++++++++++++++++++++++++++------------------- doc/source/index.rst | 36 +++++++++++++++++++++++++---- 4 files changed, 110 insertions(+), 39 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index d9b421e..cf76b84 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,3 +1,9 @@ += 4.0.0b11 () = + +* Brought BS up to date with the latest release of soupselect, adding + CSS selector support for direct descendant matches and multiple CSS + class matches. + = 4.0.0b10 (20120302) = * Added support for simple CSS selectors, taken from the soupselect project. diff --git a/bs4/element.py b/bs4/element.py index 2851a75..d2fa19f 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -483,7 +483,11 @@ class PageElement(object): """Perform a CSS selection operation on the current element.""" tokens = selector.split() current_context = [self] - for token in tokens: + for index, token in enumerate(tokens): + if tokens[index - 1] == '>': + # already found direct descendants in last step. skip this + # step. + continue m = self.attribselect_re.match(token) if m is not None: # Attribute selector @@ -493,9 +497,11 @@ class PageElement(object): checker = self._attribute_checker(operator, attribute, value) found = [] for context in current_context: - found.extend([el for el in context.find_all(tag) if checker(el)]) + found.extend( + [el for el in context.find_all(tag) if checker(el)]) current_context = found continue + if '#' in token: # ID selector tag, id = token.split('#', 1) @@ -506,21 +512,25 @@ class PageElement(object): return [] # No match current_context = [el] continue + if '.' in token: # Class selector - tag, klass = token.split('.', 1) - if not tag: - tag = True + tag_name, klass = token.split('.', 1) + if not tag_name: + tag_name = True + classes = set(klass.split('.')) found = [] + def classes_match(tag): + if tag_name is not True and tag.name != tag_name: + return False + if not tag.has_attr('class'): + return False + return classes.issubset(tag['class']) for context in current_context: - found.extend( - context.find_all( - tag, - {'class': lambda attr: attr and klass in attr.split()} - ) - ) + found.extend(context.find_all(classes_match)) current_context = found continue + if token == '*': # Star selector found = [] @@ -528,6 +538,19 @@ class PageElement(object): found.extend(context.findAll(True)) current_context = found continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + current_context = found + continue + # Here we should just have a regular tag if not self.tag_name_re.match(token): return [] diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6d22448..e9a5763 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1308,37 +1308,43 @@ class TestNavigableStringSubclasses(SoupTest): class TestSoupSelector(TreeTest): - HTML = """ + HTML = """ + - The title - +The title +
-
-

An H1

-

Some text

-

Some more text

-

An H2

-

Another

- Bob -

Another H2

- me -
-

English

-

English UK

-

English US

-

French

+
+

An H1

+

Some text

+

Some more text

+

An H2

+

Another

+Bob +

Another H2

+me + +span1a1 +span1a2 test + +span2a1 + + + +
+

English

+

English UK

+

English US

+

French

- - - """ def setUp(self): @@ -1428,6 +1434,16 @@ class TestSoupSelector(TreeTest): '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): self.assertSelects(selector, ['pmulti']) + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assertSelects(selector, ['pmulti']) + + def test_child_selector(self): + self.assertSelects('.s1 > a', ['s1a1', 's1a2']) + self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_attribute_equals(self): self.assertSelectMultiple( ('p[class="onep"]', ['p1']), @@ -1481,7 +1497,7 @@ class TestSoupSelector(TreeTest): ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1']), + ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), ('div[id$="1"]', []), ('[id$="noending"]', []), ) @@ -1504,7 +1520,7 @@ class TestSoupSelector(TreeTest): ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1']), + ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), ('div[id*="1"]', []), ('[id*="noending"]', []), # New for this test diff --git a/doc/source/index.rst b/doc/source/index.rst index a9d404a..37d5f07 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1538,15 +1538,27 @@ You can find tags:: Find tags beneath other tags:: - soup.select("p a") - # [Elsie, Lacie, Tillie] + soup.select("body a") + # [Elsie, + # Lacie, + # Tillie] soup.select("html head title") # [The Dormouse's story] +Find tags `directly` beneath other tags:: + + soup.select("head > title") + # [The Dormouse's story] + + soup.select("p > a") + # [Elsie, + # Lacie, + # Tillie] + + soup.select("body > a") + # [] + Find tags by CSS class:: soup.select(".sister") @@ -1590,6 +1602,20 @@ Find tags by attribute value:: soup.select('a[href*=".com/el"]') # [Elsie] +Match language codes:: + + multilingual_markup = """ +

Hello

+

Howdy, y'all

+

Pip-pip, old fruit

+

Bonjour mes amis

+ """ + multilingual_soup = BeautifulSoup(multilingual_markup) + multilingual_soup.select('p[lang|=en]') + # [

Hello

, + #

Howdy, y'all

, + #

Pip-pip, old fruit

] + This is a convenience for users who know the CSS selector syntax. You can do all this stuff with the Beautiful Soup API. And if CSS selectors are all you need, you might as well use lxml directly, -- cgit v1.2.3