summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt | 24
-rw-r--r-- bs4/element.py | 382
2 files changed, 205 insertions, 201 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7abc700..419843b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,17 @@
= 4.2.0 (Unreleased) =
+* The Tag.select() method now supports most features of
+ CSS selectors.
+
+ - Added support for the adjacent sibling combinator (+) and the
+ general sibling combinator (~). Tests by "liquider". [bug=1082144]
+
+ - Added limited support for the "nth-of-type" pseudo-class. Code
+ by Sven Slootweg. [bug=1109952]
+
+ - Refactored the code so that the combinators (+, ~, and >) can
+ select child or sibling tags by attributes other than name.
+
* In an HTML document, the contents of a <script> or <style> tag will
no longer undergo entity substitution by default. XML documents work
the same way they did before. [bug=1085953]
@@ -21,27 +33,21 @@
* Added the 'diagnose' submodule, which includes several useful
functions for reporting problems and doing tech support.
- * diagnose(data) tries the given markup on every installed parser,
+ - diagnose(data) tries the given markup on every installed parser,
reporting exceptions and displaying successes. If a parser is not
installed, diagnose() mentions this fact.
- * lxml_trace(data, html=True) runs the given markup through lxml's
+ - lxml_trace(data, html=True) runs the given markup through lxml's
XML parser or HTML parser, and prints out the parser events as
they happen. This helps you quickly determine whether a given
problem occurs in lxml code or Beautiful Soup code.
- * htmlparser_trace(data) is the same thing, but for Python's
+ - htmlparser_trace(data) is the same thing, but for Python's
built-in HTMLParser class.
* The prettify() method now leaves the contents of <pre> tags
alone. [bug=1095654]
-* Added support for the "nth-of-type" CSS selector. Code by Sven
- Slootweg. [bug=1109952]
-
-* The CSS selector ">" can now find a tag by means other than the
- tag name. Code by Sven Slootweg. [bug=1109952]
-
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
diff --git a/bs4/element.py b/bs4/element.py
index 120471e..0be349d 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -616,198 +616,6 @@ class PageElement(object):
else:
return lambda el: el.has_attr(attribute)
- _selectors_that_consume_an_extra_token = ['>', '+', '~']
-
- _select_debug = False
-
- def select(self, selector, _candidate_generator=None):
- """Perform a CSS selection operation on the current element."""
- tokens = selector.split()
- current_context = [self]
-
- if tokens[-1] in self._selectors_that_consume_an_extra_token:
- raise ValueError(
- 'Final selector "%s" is missing an argument.' % tokens[-1])
- if self._select_debug:
- print 'Running CSS selector "%s"' % selector
- for index, token in enumerate(tokens):
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
- if tokens[index-1] in self._selectors_that_consume_an_extra_token:
- # This token was consumed by the previous selector. Skip it.
- if self._select_debug:
- print ' Token was consumed by the previous selector.'
- continue
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token:
- # Pseudoselector
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudoselector must be prefixed with a tag name.")
- pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is not None:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are supported for the nth-of-type pseudoselector for now.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudoselector value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- if self.count > self.destination:
- raise StopIteration()
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'The following pseudoselectors are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
-
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
- if self._select_debug:
- print '-' * 40
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
- else:
- check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
- else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
-
- new_context = []
- for tag in current_context:
- if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- break
- if checker is None or result:
- if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
- new_context.append(candidate)
- elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
-
- current_context = new_context
-
- if self._select_debug:
- print "Final verdict:"
- for i in current_context:
- print " %s %s" % (i.name, i.attrs)
- return current_context
-
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -1376,6 +1184,196 @@ class Tag(PageElement):
yield current
current = current.next_element
+    # Combinator tokens. Each one applies to the token that follows it,
+    # so select() skips that following token in its main loop.
+    _selectors_that_consume_an_extra_token = ['>', '+', '~']
+
+    # When true, select() prints a trace of every step it takes.
+    _select_debug = False
+
+    def select(self, selector, _candidate_generator=None):
+        """Perform a CSS selection operation on the current element.
+
+        The selector string is split on whitespace and its tokens are
+        applied from left to right: the tags matched by one token
+        become the context that the next token is run against. The
+        combinators '>', '+' and '~' must therefore be separated from
+        their operands by whitespace.
+
+        `_candidate_generator` is for internal use. When a combinator
+        calls select() recursively it passes in a function that takes
+        a tag and yields the candidates to test (direct children, next
+        siblings, ...). When it is None, a tag's descendants are used.
+
+        Returns a list of the Tag objects that matched.
+
+        Raises ValueError if the selector ends with a combinator, if a
+        pseudo-class has no tag name in front of it, if an nth-of-type
+        value is less than 1, or if a token is not recognized. Raises
+        NotImplementedError for a pseudo-class other than nth-of-type
+        and for a non-numeric nth-of-type value.
+        """
+        tokens = selector.split()
+        current_context = [self]
+
+        if tokens[-1] in self._selectors_that_consume_an_extra_token:
+            raise ValueError(
+                'Final selector "%s" is missing an argument.' % tokens[-1])
+        if self._select_debug:
+            print 'Running CSS selector "%s"' % selector
+        for index, token in enumerate(tokens):
+            if self._select_debug:
+                print ' Considering token "%s"' % token
+            recursive_candidate_generator = None
+            tag_name = None
+            # For index 0 this looks at tokens[-1], which the check
+            # above has already shown is not a combinator.
+            if tokens[index-1] in self._selectors_that_consume_an_extra_token:
+                # This token was consumed by the previous selector. Skip it.
+                if self._select_debug:
+                    print ' Token was consumed by the previous selector.'
+                continue
+            # Each operation corresponds to a checker function, a rule
+            # for determining whether a candidate matches the
+            # selector. Candidates are generated by the active
+            # iterator.
+            checker = None
+
+            m = self.attribselect_re.match(token)
+            if m is not None:
+                # Attribute selector
+                tag_name, attribute, operator, value = m.groups()
+                checker = self._attribute_checker(operator, attribute, value)
+
+            elif '#' in token:
+                # ID selector
+                tag_name, tag_id = token.split('#', 1)
+                def id_matches(tag):
+                    return tag.get('id', None) == tag_id
+                checker = id_matches
+
+            elif '.' in token:
+                # Class selector. "a.b.c" requires every listed class.
+                tag_name, klass = token.split('.', 1)
+                classes = set(klass.split('.'))
+                def classes_match(candidate):
+                    return classes.issubset(candidate.get('class', []))
+                checker = classes_match
+
+            elif ':' in token:
+                # Pseudo-class
+                tag_name, pseudo = token.split(':', 1)
+                if tag_name == '':
+                    raise ValueError(
+                        "A pseudo-class must be prefixed with a tag name.")
+                pseudo_attributes = re.match(
+                    r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                if pseudo_attributes is not None:
+                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                    if pseudo_type == 'nth-of-type':
+                        try:
+                            pseudo_value = int(pseudo_value)
+                        except ValueError:
+                            raise NotImplementedError(
+                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
+                        if pseudo_value < 1:
+                            raise ValueError(
+                                'nth-of-type pseudo-class value must be at least 1.')
+                        class Counter(object):
+                            def __init__(self, destination):
+                                self.count = 0
+                                self.destination = destination
+
+                            def nth_child_of_type(self, tag):
+                                self.count += 1
+                                if self.count == self.destination:
+                                    return True
+                                if self.count > self.destination:
+                                    # Past the target; tells the
+                                    # caller to stop scanning.
+                                    raise StopIteration()
+                                return False
+                        # One Counter is created per token, so its
+                        # count carries over between the tags in
+                        # current_context.
+                        checker = Counter(pseudo_value).nth_child_of_type
+                    # NOTE(review): this else is attached to the
+                    # pseudo_type test; the page this diff was taken
+                    # from had lost its indentation -- confirm.
+                    else:
+                        raise NotImplementedError(
+                            'Only the following pseudo-classes are implemented: nth-of-type.')
+
+            elif token == '*':
+                # Star selector -- matches everything
+                pass
+            elif token == '>':
+                # Run the next token as a CSS selector against the
+                # direct children of each tag in the current context.
+                recursive_candidate_generator = lambda tag: tag.children
+            elif token == '~':
+                # Run the next token as a CSS selector against the
+                # siblings of each tag in the current context.
+                recursive_candidate_generator = lambda tag: tag.next_siblings
+            elif token == '+':
+                # For each tag in the current context, run the next
+                # token as a CSS selector against the tag's next
+                # sibling that's a tag.
+                def next_tag_sibling(tag):
+                    yield tag.find_next_sibling(True)
+                recursive_candidate_generator = next_tag_sibling
+
+            elif self.tag_name_re.match(token):
+                # Just a tag name.
+                tag_name = token
+            else:
+                raise ValueError(
+                    'Unsupported or invalid CSS selector: "%s"' % token)
+
+            if recursive_candidate_generator:
+                # This happens when the selector looks like "> foo".
+                #
+                # The generator calls select() recursively on every
+                # member of the current context, passing in a different
+                # candidate generator and a different selector.
+                #
+                # In the case of "> foo", the candidate generator is
+                # one that yields a tag's direct children (">"), and
+                # the selector is "foo".
+                next_token = tokens[index+1]
+                def recursive_select(tag):
+                    if self._select_debug:
+                        print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
+                        print '-' * 40
+                    for i in tag.select(next_token, recursive_candidate_generator):
+                        if self._select_debug:
+                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+                        yield i
+                    if self._select_debug:
+                        print '-' * 40
+                _use_candidate_generator = recursive_select
+            elif _candidate_generator is None:
+                # By default, a tag's candidates are all of its
+                # descendants. If tag_name is defined, only yield tags
+                # with that name.
+                if self._select_debug:
+                    if tag_name:
+                        check = tag_name
+                    else:
+                        check = "[any]"
+                    print ' Default candidate generator, tag name="%s"' % check
+                if self._select_debug:
+                    # This is redundant with later code, but it stops
+                    # a bunch of bogus tags from cluttering up the
+                    # debug log.
+                    def default_candidate_generator(tag):
+                        for child in tag.descendants:
+                            if not isinstance(child, Tag):
+                                continue
+                            if tag_name and not child.name == tag_name:
+                                continue
+                            yield child
+                    _use_candidate_generator = default_candidate_generator
+                else:
+                    _use_candidate_generator = lambda tag: tag.descendants
+            else:
+                _use_candidate_generator = _candidate_generator
+
+            new_context = []
+            for tag in current_context:
+                if self._select_debug:
+                    print " Running candidate generator on %s %s" % (
+                        tag.name, repr(tag.attrs))
+                for candidate in _use_candidate_generator(tag):
+                    if not isinstance(candidate, Tag):
+                        continue
+                    if tag_name and candidate.name != tag_name:
+                        continue
+                    if checker is not None:
+                        try:
+                            result = checker(candidate)
+                        except StopIteration:
+                            # The checker has seen all it needs;
+                            # stop scanning this tag's candidates.
+                            break
+                    if checker is None or result:
+                        if self._select_debug:
+                            print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                        new_context.append(candidate)
+                    elif self._select_debug:
+                        print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+
+            # The matches for this token are the starting point for
+            # the next one.
+            current_context = new_context
+
+        if self._select_debug:
+            print "Final verdict:"
+            for i in current_context:
+                print " %s %s" % (i.name, i.attrs)
+        return current_context
+
# Old names for backwards compatibility
def childGenerator(self):
return self.children