diff options
-rw-r--r-- | bs4/element.py | 145 |
1 files changed, 94 insertions, 51 deletions
diff --git a/bs4/element.py b/bs4/element.py index 9f3adfb..982a5b0 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -567,6 +567,14 @@ class PageElement(object): value =" ".join(value) return value + def _tag_name_matches_and(self, function, tag_name): + if not tag_name: + return function + else: + def _match(tag): + return tag.name == tag_name and function(tag) + return _match + def _attribute_checker(self, operator, attribute, value=''): """Create a function that performs a CSS selector operation. @@ -608,53 +616,54 @@ class PageElement(object): else: return lambda el: el.has_attr(attribute) - def select(self, selector, recursive=True): + _selectors_that_consume_an_extra_token = ['>', '+', '~'] + + debug = True + + def select(self, selector, _candidate_generator=None): """Perform a CSS selection operation on the current element.""" + debug = True tokens = selector.split() current_context = [self] + + if tokens[-1] in self._selectors_that_consume_an_extra_token: + raise ValueError( + 'Final selector "%s" is missing an argument.' % tokens[-1]) + if self.debug: + print 'Running CSS selector "%s"' % selector for index, token in enumerate(tokens): - if tokens[index - 1] == '>': - # already found direct descendants in last step. skip this - # step. + if self.debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None + if tokens[index-1] in self._selectors_that_consume_an_extra_token: + # This token was consumed by the previous selector. Skip it. + if self.debug: + print ' Token was consumed by the previous selector.' continue - - # Each operation corresponds to a candidate generator (a - # rule for finding tags that might match) and a checker function (a - # rule for determining whether a tag does match. - production_rule = None + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. checker = None m = self.attribselect_re.match(token) if m is not None: # Attribute selector tag_name, attribute, operator, value = m.groups() - if not tag_name: - tag_name = True - production_rule = lambda tag: tag.find_all( - tag_name, recursive=recursive) checker = self._attribute_checker(operator, attribute, value) elif '#' in token: # ID selector tag_name, id = token.split('#', 1) - if tag_name == "": - tag_name = True - def find_by_id(tag): - found = tag.find(tag_name, id=id, recursive=recursive) - if found is None: - return [] - return [found] - production_rule = find_by_id - checker = lambda x: True + def id_matches(tag): + return tag.get('id', None) == id + checker = id_matches elif '.' in token: # Class selector tag_name, klass = token.split('.', 1) - if tag_name == '': - tag_name = True classes = set(klass.split('.')) - production_rule = lambda tag: tag.find_all( - tag_name, recursive=recursive) def classes_match(candidate): return classes.issubset(candidate.get('class', [])) checker = classes_match @@ -678,49 +687,83 @@ class PageElement(object): if pseudo_value < 1: raise ValueError( 'nth-of-type pseudoselector value must be at least 1.') + count = 0 def nth_child_of_type(tag): - children = tag.find_all( - tag_name, limit=pseudo_value, recursive=recursive) - if len(children) < pseudo_value: - return [] - return [children[pseudo_value-1]] - production_rule = nth_child_of_type - checker = lambda x: True + count += 1 + if count == pseudo_value: + return True + return False + checker = nth_child_of_type else: raise NotImplementedError( 'The following pseudoselectors are implemented: nth-of-type.') elif token == '*': - # Star selector - production_rule = lambda tag: tag.find_all(True, recursive=recursive) - checker = lambda x: True + # Star selector -- matches everything + checker = True elif token == '>': - # Child selector - # TODO If this is the last token, there's a problem. - next_selector = tokens[index + 1] - production_rule = lambda tag: tag.select( - next_selector, recursive=False) - checker = lambda candidate: True + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = tag.children elif self.tag_name_re.match(token): + # Just a tag name. tag_name = token - production_rule = lambda tag: tag.find_all(tag_name, recursive=recursive) - checker = lambda candidate: True + checker = True else: raise ValueError( 'Unsupported or invalid CSS selector: "%s"' % token) - # We now have a production rule and a checker. Find - # candidates by applying the production rule to every - # member of the current context. Check each candidate - # against the checker. The new context is the set of - # candidates that pass the checker. + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". + next_token = tokens[index+1] + def recursive_select(tag): + tag.select(next_token, recursive_candidate_generator) + _candidate_generator = recursive_select + checker = True + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. + if self.debug: + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _candidate_generator = default_candidate_generator + new_context = [] for tag in current_context: - for candidate in production_rule(tag): - if checker(candidate): + if self.debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if checker is True or checker(candidate): + if self.debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) new_context.append(candidate) + elif self.debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + current_context = new_context return current_context |