diff options
-rw-r--r-- | NEWS.txt | 24 | ||||
-rw-r--r-- | bs4/element.py | 382 |
2 files changed, 205 insertions, 201 deletions
@@ -1,5 +1,17 @@ = 4.2.0 (Unreleased) = +* The Tag.select() method now supports most features of + CSS selectors. + + - Added support for the adjacent sibling combinator (+) and the + general sibling combinator (~). Tests by "liquider". [bug=1082144] + + - Added limited support for the "nth-of-type" pseudo-class. Code + by Sven Slootweg. [bug=1109952] + + - Refactored the code so that the combinators (+, ~, and >) can + select child or sibling tags by attributes other than name. + * In an HTML document, the contents of a <script> or <style> tag will no longer undergo entity substitution by default. XML documents work the same way they did before. [bug=1085953] @@ -21,27 +33,21 @@ * Added the 'diagnose' submodule, which includes several useful functions for reporting problems and doing tech support. - * diagnose(data) tries the given markup on every installed parser, + - diagnose(data) tries the given markup on every installed parser, reporting exceptions and displaying successes. If a parser is not installed, diagnose() mentions this fact. - * lxml_trace(data, html=True) runs the given markup through lxml's + - lxml_trace(data, html=True) runs the given markup through lxml's XML parser or HTML parser, and prints out the parser events as they happen. This helps you quickly determine whether a given problem occurs in lxml code or Beautiful Soup code. - * htmlparser_trace(data) is the same thing, but for Python's + - htmlparser_trace(data) is the same thing, but for Python's built-in HTMLParser class. * The prettify() method now leaves the contents of <pre> tags alone. [bug=1095654] -* Added support for the "nth-of-type" CSS selector. Code by Sven - Slootweg. [bug=1109952] - -* The CSS selector ">" can now find a tag by means other than the - tag name. Code by Sven Slootweg. [bug=1109952] - * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] diff --git a/bs4/element.py b/bs4/element.py index 120471e..0be349d 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -616,198 +616,6 @@ class PageElement(object): else: return lambda el: el.has_attr(attribute) - _selectors_that_consume_an_extra_token = ['>', '+', '~'] - - _select_debug = False - - def select(self, selector, _candidate_generator=None): - """Perform a CSS selection operation on the current element.""" - tokens = selector.split() - current_context = [self] - - if tokens[-1] in self._selectors_that_consume_an_extra_token: - raise ValueError( - 'Final selector "%s" is missing an argument.' % tokens[-1]) - if self._select_debug: - print 'Running CSS selector "%s"' % selector - for index, token in enumerate(tokens): - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None - if tokens[index-1] in self._selectors_that_consume_an_extra_token: - # This token was consumed by the previous selector. Skip it. - if self._select_debug: - print ' Token was consumed by the previous selector.' - continue - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token: - # Pseudoselector - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudoselector must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is not None: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are supported for the nth-of-type pseudoselector for now.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudoselector value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - if self.count > self.destination: - raise StopIteration() - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'The following pseudoselectors are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - new_context = [] - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - new_context.append(candidate) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - - current_context = new_context - - if self._select_debug: - print "Final verdict:" - for i in current_context: - print " %s %s" % (i.name, i.attrs) - return current_context - # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -1376,6 +1184,196 @@ class Tag(PageElement): yield current current = current.next_element + _selectors_that_consume_an_extra_token = ['>', '+', '~'] + _select_debug = False + def select(self, selector, _candidate_generator=None): + """Perform a CSS selection operation on the current element.""" + tokens = selector.split() + current_context = [self] + + if tokens[-1] in self._selectors_that_consume_an_extra_token: + raise ValueError( + 'Final selector "%s" is missing an argument.' % tokens[-1]) + if self._select_debug: + print 'Running CSS selector "%s"' % selector + for index, token in enumerate(tokens): + if self._select_debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None + if tokens[index-1] in self._selectors_that_consume_an_extra_token: + # This token was consumed by the previous selector. Skip it. + if self._select_debug: + print ' Token was consumed by the previous selector.' + continue + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. + checker = None + + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag_name, attribute, operator, value = m.groups() + checker = self._attribute_checker(operator, attribute, value) + + elif '#' in token: + # ID selector + tag_name, id = token.split('#', 1) + def id_matches(tag): + return tag.get('id', None) == id + checker = id_matches + + elif '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + classes = set(klass.split('.')) + def classes_match(candidate): + return classes.issubset(candidate.get('class', [])) + checker = classes_match + + elif ':' in token: + # Pseudo-class + tag_name, pseudo = token.split(':', 1) + if tag_name == '': + raise ValueError( + "A pseudo-class must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] + if pseudo_attributes is not None: + pseudo_type, pseudo_value = pseudo_attributes.groups() + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: + raise NotImplementedError( + 'Only numeric values are currently supported for the nth-of-type pseudo-class.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudo-class value must be at least 1.') + class Counter(object): + def __init__(self, destination): + self.count = 0 + self.destination = destination + + def nth_child_of_type(self, tag): + self.count += 1 + if self.count == self.destination: + return True + if self.count > self.destination: + raise StopIteration() + return False + checker = Counter(pseudo_value).nth_child_of_type + else: + raise NotImplementedError( + 'Only the following pseudo-classes are implemented: nth-of-type.') + + elif token == '*': + # Star selector -- matches everything + pass + elif token == '>': + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.children + elif token == '~': + # Run the next token as a CSS selector against the + # siblings of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.next_siblings + elif token == '+': + # For each tag in the current context, run the next + # token as a CSS selector against the tag's next + # sibling that's a tag. + def next_tag_sibling(tag): + yield tag.find_next_sibling(True) + recursive_candidate_generator = next_tag_sibling + + elif self.tag_name_re.match(token): + # Just a tag name. + tag_name = token + else: + raise ValueError( + 'Unsupported or invalid CSS selector: "%s"' % token) + + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". + next_token = tokens[index+1] + def recursive_select(tag): + if self._select_debug: + print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) + print '-' * 40 + for i in tag.select(next_token, recursive_candidate_generator): + if self._select_debug: + print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + yield i + if self._select_debug: + print '-' * 40 + _use_candidate_generator = recursive_select + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. + if self._select_debug: + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + if self._select_debug: + # This is redundant with later code, but it stops + # a bunch of bogus tags from cluttering up the + # debug log. + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _use_candidate_generator = default_candidate_generator + else: + _use_candidate_generator = lambda tag: tag.descendants + else: + _use_candidate_generator = _candidate_generator + + new_context = [] + for tag in current_context: + if self._select_debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _use_candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if tag_name and candidate.name != tag_name: + continue + if checker is not None: + try: + result = checker(candidate) + except StopIteration: + break + if checker is None or result: + if self._select_debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + new_context.append(candidate) + elif self._select_debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + + current_context = new_context + + if self._select_debug: + print "Final verdict:" + for i in current_context: + print " %s %s" % (i.name, i.attrs) + return current_context + # Old names for backwards compatibility def childGenerator(self): return self.children |