summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bs4/element.py 145
1 files changed, 94 insertions, 51 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 9f3adfb..982a5b0 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -567,6 +567,14 @@ class PageElement(object):
value =" ".join(value)
return value
+ def _tag_name_matches_and(self, function, tag_name):
+ if not tag_name:
+ return function
+ else:
+ def _match(tag):
+ return tag.name == tag_name and function(tag)
+ return _match
+
def _attribute_checker(self, operator, attribute, value=''):
"""Create a function that performs a CSS selector operation.
@@ -608,53 +616,54 @@ class PageElement(object):
else:
return lambda el: el.has_attr(attribute)
- def select(self, selector, recursive=True):
+ _selectors_that_consume_an_extra_token = ['>', '+', '~']
+
+ debug = True
+
+ def select(self, selector, _candidate_generator=None):
"""Perform a CSS selection operation on the current element."""
+ debug = True
tokens = selector.split()
current_context = [self]
+
+ if tokens[-1] in self._selectors_that_consume_an_extra_token:
+ raise ValueError(
+ 'Final selector "%s" is missing an argument.' % tokens[-1])
+ if self.debug:
+ print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens):
- if tokens[index - 1] == '>':
- # already found direct descendants in last step. skip this
- # step.
+ if self.debug:
+ print ' Considering token "%s"' % token
+ recursive_candidate_generator = None
+ tag_name = None
+ if tokens[index-1] in self._selectors_that_consume_an_extra_token:
+ # This token was consumed by the previous selector. Skip it.
+ if self.debug:
+ print ' Token was consumed by the previous selector.'
continue
-
- # Each operation corresponds to a candidate generator (a
- # rule for finding tags that might match) and a checker function (a
- # rule for determining whether a tag does match.
- production_rule = None
+ # Each operation corresponds to a checker function, a rule
+ # for determining whether a candidate matches the
+ # selector. Candidates are generated by the active
+ # iterator.
checker = None
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
- if not tag_name:
- tag_name = True
- production_rule = lambda tag: tag.find_all(
- tag_name, recursive=recursive)
checker = self._attribute_checker(operator, attribute, value)
elif '#' in token:
# ID selector
tag_name, id = token.split('#', 1)
- if tag_name == "":
- tag_name = True
- def find_by_id(tag):
- found = tag.find(tag_name, id=id, recursive=recursive)
- if found is None:
- return []
- return [found]
- production_rule = find_by_id
- checker = lambda x: True
+ def id_matches(tag):
+ return tag.get('id', None) == id
+ checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
- if tag_name == '':
- tag_name = True
classes = set(klass.split('.'))
- production_rule = lambda tag: tag.find_all(
- tag_name, recursive=recursive)
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
@@ -678,49 +687,83 @@ class PageElement(object):
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudoselector value must be at least 1.')
+ count = 0
def nth_child_of_type(tag):
- children = tag.find_all(
- tag_name, limit=pseudo_value, recursive=recursive)
- if len(children) < pseudo_value:
- return []
- return [children[pseudo_value-1]]
- production_rule = nth_child_of_type
- checker = lambda x: True
+ count += 1
+ if count == pseudo_value:
+ return True
+ return False
+ checker = nth_child_of_type
else:
raise NotImplementedError(
'The following pseudoselectors are implemented: nth-of-type.')
elif token == '*':
- # Star selector
- production_rule = lambda tag: tag.find_all(True, recursive=recursive)
- checker = lambda x: True
+ # Star selector -- matches everything
+ checker = True
elif token == '>':
- # Child selector
- # TODO If this is the last token, there's a problem.
- next_selector = tokens[index + 1]
- production_rule = lambda tag: tag.select(
- next_selector, recursive=False)
- checker = lambda candidate: True
+ # Run the next token as a CSS selector against the
+ # direct children of each tag in the current context.
+ recursive_candidate_generator = tag.children
elif self.tag_name_re.match(token):
+ # Just a tag name.
tag_name = token
- production_rule = lambda tag: tag.find_all(tag_name, recursive=recursive)
- checker = lambda candidate: True
+ checker = True
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
- # We now have a production rule and a checker. Find
- # candidates by applying the production rule to every
- # member of the current context. Check each candidate
- # against the checker. The new context is the set of
- # candidates that pass the checker.
+ if recursive_candidate_generator:
+ # This happens when the selector looks like "> foo".
+ #
+ # The generator calls select() recursively on every
+ # member of the current context, passing in a different
+ # candidate generator and a different selector.
+ #
+ # In the case of "> foo", the candidate generator is
+ # one that yields a tag's direct children (">"), and
+ # the selector is "foo".
+ next_token = tokens[index+1]
+ def recursive_select(tag):
+ tag.select(next_token, recursive_candidate_generator)
+ _candidate_generator = recursive_select
+ checker = True
+ elif _candidate_generator is None:
+ # By default, a tag's candidates are all of its
+ # descendants. If tag_name is defined, only yield tags
+ # with that name.
+ if self.debug:
+ if tag_name:
+ check = "[any]"
+ else:
+ check = tag_name
+ print ' Default candidate generator, tag name="%s"' % check
+ def default_candidate_generator(tag):
+ for child in tag.descendants:
+ if not isinstance(child, Tag):
+ continue
+ if tag_name and not child.name == tag_name:
+ continue
+ yield child
+ _candidate_generator = default_candidate_generator
+
new_context = []
for tag in current_context:
- for candidate in production_rule(tag):
- if checker(candidate):
+ if self.debug:
+ print " Running candidate generator on %s %s" % (
+ tag.name, repr(tag.attrs))
+ for candidate in _candidate_generator(tag):
+ if not isinstance(candidate, Tag):
+ continue
+ if checker is True or checker(candidate):
+ if self.debug:
+ print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
new_context.append(candidate)
+ elif self.debug:
+ print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+
current_context = new_context
return current_context