diff options
| author | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:58:27 -0400 |
|---|---|---|
| committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:58:27 -0400 |
| commit | 1cd5ad49b15d17fac017543876ec5d0a67b57b69 (patch) | |
| tree | 2b56628a4a16c53d654df0d7478ebcbc50ee5db4 /bs4/element.py | |
| parent | 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (diff) | |
Added support for the "nth-of-type" CSS selector. The CSS selector ">" can now find a tag by means other than the tag name. Code by Sven Slootweg.
Diffstat (limited to 'bs4/element.py')
| -rw-r--r-- | bs4/element.py | 46 |
1 files changed, 39 insertions, 7 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 398eb05..67f2a79 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -608,7 +608,7 @@ class PageElement(object):
         else:
             return lambda el: el.has_attr(attribute)
 
-    def select(self, selector):
+    def select(self, selector, recursive=True):
         """Perform a CSS selection operation on the current element."""
         tokens = selector.split()
         current_context = [self]
@@ -627,7 +627,9 @@ class PageElement(object):
                 found = []
                 for context in current_context:
                     found.extend(
-                        [el for el in context.find_all(tag) if checker(el)])
+                        [el for el in
+                         context.find_all(tag, recursive=recursive)
+                         if checker(el)])
                 current_context = found
                 continue
 
@@ -656,15 +658,45 @@ class PageElement(object):
                         return False
                     return classes.issubset(tag['class'])
                 for context in current_context:
-                    found.extend(context.find_all(classes_match))
+                    found.extend(context.find_all(classes_match, recursive=recursive))
                 current_context = found
                 continue
 
+            if ':' in token:
+                # Pseudoselector
+                tag_name, pseudo = token.split(':', 1)
+                if not tag_name:
+                    raise ValueError(
+                        "A pseudoselector must be prefixed with a tag name.")
+                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                found = []
+                if pseudo_attributes is not None:
+                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                    if pseudo_type == 'nth-of-type':
+                        try:
+                            pseudo_value = int(pseudo_value)
+                        except:
+                            raise NotImplementedError(
+                                'Only numeric values are supported for the nth-of-type pseudoselector for now.')
+                        if pseudo_value < 1:
+                            raise ValueError(
+                                'nth-of-type pseudoselector value must be at least 1.')
+                        pseudo_value = pseudo_value - 1
+                        for context in current_context:
+                            all_nodes = context.find_all(tag_name, recursive=recursive)
+                            if pseudo_value < len(all_nodes):
+                                found.extend([all_nodes[pseudo_value]])
+                        current_context = found
+                        continue
+                    else:
+                        raise NotImplementedError(
+                            'Only the nth-of-type pseudoselector is supported for now.')
+
             if token == '*':
                 # Star selector
                 found = []
                 for context in current_context:
-                    found.extend(context.findAll(True))
+                    found.extend(context.find_all(True, recursive=recursive))
                 current_context = found
                 continue
 
@@ -676,16 +708,16 @@ class PageElement(object):
 
                 found = []
                 for context in current_context:
-                    found.extend(context.find_all(tag, recursive=False))
+                    found.extend(context.select(tag, recursive=False))
                 current_context = found
                 continue
-            
+
             # Here we should just have a regular tag
             if not self.tag_name_re.match(token):
                 return []
             found = []
             for context in current_context:
-                found.extend(context.findAll(token))
+                found.extend(context.find_all(token, recursive=recursive))
             current_context = found
         return current_context
 
