summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2014-12-10 19:01:46 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2014-12-10 19:01:46 -0500
commit: 5048db4ab06f6065d8d364500b38631456b58690 (patch)
tree: b2d13695ebfa4806389170f36b47749b32a04933
parent: 8b1dd38e165d211d904d7143ea5042f26353bdb5 (diff)
The select() method now supports selector grouping. Patch by
Francisco Canas [bug=1191917]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/element.py 350
-rw-r--r-- bs4/tests/test_tree.py 46
-rw-r--r-- doc/source/index.rst 13
4 files changed, 241 insertions, 171 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 2f0121e..9b4ac1e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,9 @@
* Fixed yet another problem that caused the html5lib tree builder to
create a disconnected parse tree. [bug=1237763]
+* The select() method now supports selector grouping. Patch by
+ Francisco Canas [bug=1191917]
+
* Restored the helpful syntax error that happens when you try to
import the Python 2 edition of Beautiful Soup under Python
3. [bug=1213387]
diff --git a/bs4/element.py b/bs4/element.py
index da9afdf..197722d 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1203,192 +1203,206 @@ class Tag(PageElement):
_select_debug = False
def select(self, selector, _candidate_generator=None):
"""Perform a CSS selection operation on the current element."""
- tokens = selector.split()
+
+ # Remove whitespace directly after the grouping operator ','
+ # then split into tokens.
+ tokens = re.sub(',[\s]*',',', selector).split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
+
if self._select_debug:
print 'Running CSS selector "%s"' % selector
- for index, token in enumerate(tokens):
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
+
+ for index, token_group in enumerate(tokens):
+ new_context = []
+ new_context_ids = set([])
+
+ # Grouping selectors, ie: p,a
+ grouped_tokens = token_group.split(',')
+ if '' in grouped_tokens:
+ raise ValueError('Invalid group selection syntax: %s' % token_group)
+
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token:
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is not None:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- if self.count > self.destination:
- # Stop the generator that's sending us
- # these things.
- raise StopIteration()
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
+ for token in grouped_tokens:
+ if self._select_debug:
+ print ' Considering token "%s"' % token
+ recursive_candidate_generator = None
+ tag_name = None
+
+ # Each operation corresponds to a checker function, a rule
+ # for determining whether a candidate matches the
+ # selector. Candidates are generated by the active
+ # iterator.
+ checker = None
+
+ m = self.attribselect_re.match(token)
+ if m is not None:
+ # Attribute selector
+ tag_name, attribute, operator, value = m.groups()
+ checker = self._attribute_checker(operator, attribute, value)
+
+ elif '#' in token:
+ # ID selector
+ tag_name, tag_id = token.split('#', 1)
+ def id_matches(tag):
+ return tag.get('id', None) == tag_id
+ checker = id_matches
+
+ elif '.' in token:
+ # Class selector
+ tag_name, klass = token.split('.', 1)
+ classes = set(klass.split('.'))
+ def classes_match(candidate):
+ return classes.issubset(candidate.get('class', []))
+ checker = classes_match
+
+ elif ':' in token:
+ # Pseudo-class
+ tag_name, pseudo = token.split(':', 1)
+ if tag_name == '':
+ raise ValueError(
+ "A pseudo-class must be prefixed with a tag name.")
+ pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+ found = []
+ if pseudo_attributes is not None:
+ pseudo_type, pseudo_value = pseudo_attributes.groups()
+ if pseudo_type == 'nth-of-type':
+ try:
+ pseudo_value = int(pseudo_value)
+ except:
+ raise NotImplementedError(
+ 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
+ if pseudo_value < 1:
+ raise ValueError(
+ 'nth-of-type pseudo-class value must be at least 1.')
+ class Counter(object):
+ def __init__(self, destination):
+ self.count = 0
+ self.destination = destination
+
+ def nth_child_of_type(self, tag):
+ self.count += 1
+ if self.count == self.destination:
+ return True
+ if self.count > self.destination:
+ # Stop the generator that's sending us
+ # these things.
+ raise StopIteration()
+ return False
+ checker = Counter(pseudo_value).nth_child_of_type
+ else:
+ raise NotImplementedError(
+ 'Only the following pseudo-classes are implemented: nth-of-type.')
+
+ elif token == '*':
+ # Star selector -- matches everything
+ pass
+ elif token == '>':
+ # Run the next token as a CSS selector against the
+ # direct children of each tag in the current context.
+ recursive_candidate_generator = lambda tag: tag.children
+ elif token == '~':
+ # Run the next token as a CSS selector against the
+ # siblings of each tag in the current context.
+ recursive_candidate_generator = lambda tag: tag.next_siblings
+ elif token == '+':
+ # For each tag in the current context, run the next
+ # token as a CSS selector against the tag's next
+ # sibling that's a tag.
+ def next_tag_sibling(tag):
+ yield tag.find_next_sibling(True)
+ recursive_candidate_generator = next_tag_sibling
+
+ elif self.tag_name_re.match(token):
+ # Just a tag name.
+ tag_name = token
+ else:
+ raise ValueError(
+ 'Unsupported or invalid CSS selector: "%s"' % token)
+ if recursive_candidate_generator:
+ # This happens when the selector looks like "> foo".
+ #
+ # The generator calls select() recursively on every
+ # member of the current context, passing in a different
+ # candidate generator and a different selector.
+ #
+ # In the case of "> foo", the candidate generator is
+ # one that yields a tag's direct children (">"), and
+ # the selector is "foo".
+ next_token = tokens[index+1]
+ def recursive_select(tag):
+ if self._select_debug:
+ print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
+ print '-' * 40
+ for i in tag.select(next_token, recursive_candidate_generator):
+ if self._select_debug:
+ print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+ yield i
if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
+ print '-' * 40
+ _use_candidate_generator = recursive_select
+ elif _candidate_generator is None:
+ # By default, a tag's candidates are all of its
+ # children. If tag_name is defined, only yield tags
+ # with that name.
if self._select_debug:
- print '-' * 40
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
+ if tag_name:
+ check = "[any]"
+ else:
+ check = tag_name
+ print ' Default candidate generator, tag name="%s"' % check
+ if self._select_debug:
+ # This is redundant with later code, but it stops
+ # a bunch of bogus tags from cluttering up the
+ # debug log.
+ def default_candidate_generator(tag):
+ for child in tag.descendants:
+ if not isinstance(child, Tag):
+ continue
+ if tag_name and not child.name == tag_name:
+ continue
+ yield child
+ _use_candidate_generator = default_candidate_generator
else:
- check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
+ _use_candidate_generator = lambda tag: tag.descendants
else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
+ _use_candidate_generator = _candidate_generator
+
+ for tag in current_context:
+ if self._select_debug:
+ print " Running candidate generator on %s %s" % (
+ tag.name, repr(tag.attrs))
+ for candidate in _use_candidate_generator(tag):
+ if not isinstance(candidate, Tag):
+ continue
+ if tag_name and candidate.name != tag_name:
+ continue
+ if checker is not None:
+ try:
+ result = checker(candidate)
+ except StopIteration:
+ # The checker has decided we should no longer
+ # run the generator.
+ break
+ if checker is None or result:
+ if self._select_debug:
+ print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+ if id(candidate) not in new_context_ids:
+ # If a tag matches a selector more than once,
+ # don't include it in the context more than once.
+ new_context.append(candidate)
+ new_context_ids.add(id(candidate))
+ elif self._select_debug:
+ print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
- new_context = []
- new_context_ids = set([])
- for tag in current_context:
- if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- # The checker has decided we should no longer
- # run the generator.
- break
- if checker is None or result:
- if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
- if id(candidate) not in new_context_ids:
- # If a tag matches a selector more than once,
- # don't include it in the context more than once.
- new_context.append(candidate)
- new_context_ids.add(id(candidate))
- elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index de9543d..8f629d9 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1554,6 +1554,14 @@ class TestSoupSelector(TreeTest):
<span class="span3"></span>
</span>
</div>
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
@@ -1827,3 +1835,41 @@ class TestSoupSelector(TreeTest):
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
+
+ # Test the selector grouping operator (the comma)
+ def test_multiple_select(self):
+ self.assertSelects('x, y',['xid','yid'])
+
+ def test_multiple_select_with_no_space(self):
+ self.assertSelects('x,y',['xid','yid'])
+
+ def test_multiple_select_with_more_space(self):
+ self.assertSelects('x, y',['xid', 'yid'])
+
+ def test_multiple_select_sibling(self):
+ self.assertSelects('x, y ~ p[lang=fr]',['lang-fr'])
+
+ def test_multiple_select(self):
+ self.assertSelects('x, y > z', ['zida', 'zidb', 'zidab', 'zidac'])
+
+ def test_multiple_select_direct_descendant(self):
+ self.assertSelects('div > x, y, z', ['xid', 'yid'])
+
+ def test_multiple_select_indirect_descendant(self):
+ self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
+
+ def test_invalid_multiple_select(self):
+ self.assertRaises(ValueError, self.soup.select, ',x, y')
+ self.assertRaises(ValueError, self.soup.select, 'x,,y')
+
+ def test_multiple_select(self):
+ self.assertSelects('p[lang=en], p[lang=en-gb]',['lang-en','lang-en-gb'])
+
+ def test_multiple_select_ids(self):
+ self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['zida', 'zidb','zidab'])
+
+ def test_multiple_select_nested(self):
+ self.assertSelects('body > div > x, y > z', ['zida', 'zidb', 'zidab', 'zidac'])
+
+
+
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5d067ea..4f3bd0b 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -552,7 +552,7 @@ Here's the "Three sisters" HTML document again::
"""
from bs4 import BeautifulSoup
- soup = BeautifulSoup(html_doc)
+ soup = BeautifulSoup(html_doc, 'html.parser')
I'll use this as an example to show you how to move from one part of
a document to another.
@@ -1009,7 +1009,7 @@ Once again, I'll be using the "three sisters" document as an example::
"""
from bs4 import BeautifulSoup
- soup = BeautifulSoup(html_doc)
+ soup = BeautifulSoup(html_doc, 'html.parser')
By passing in a filter to an argument like ``find_all()``, you can
zoom in on the parts of the document you're interested in.
@@ -1692,6 +1692,12 @@ Find tags by ID::
soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+Find tags that match any selector from a list of selectors::
+
+ soup.select("#link1,#link2")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
Test for the existence of an attribute::
soup.select('a[href]')
@@ -1729,7 +1735,8 @@ Match language codes::
# <p lang="en-us">Howdy, y'all</p>,
# <p lang="en-gb">Pip-pip, old fruit</p>]
-This is a convenience for users who know the CSS selector syntax. You
+
+This is all a convenience for users who know the CSS selector syntax. You
can do all this stuff with the Beautiful Soup API. And if CSS
selectors are all you need, you might as well use lxml directly: it's
a lot faster, and it supports more CSS selectors. But this lets you