diff options
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/bs4/element.py b/bs4/element.py index 650bacf..c660359 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -411,6 +411,114 @@ class PageElement(object): yield i i = i.parent + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + + r'=?"?(?P<value>[^\]"]*)"?\]$' + ) + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. + """ + if operator == '=': + # string representation of attribute is equal to value + return lambda el: str(el.get(attribute)) == value + elif operator == '~': + # string representation of attribute includes value as one + # of a set of space separated tokens + return lambda el: value in str(el.get(attribute, '')).split() + elif operator == '^': + # string representation of attribute starts with value + return lambda el: str(el.get(attribute, '')).startswith(value) + elif operator == '$': + # string represenation of attribute ends with value + return lambda el: str(el.get(attribute, '')).endswith(value) + elif operator == '*': + # string representation of attribute contains value + return lambda el: value in str(el.get(attribute, '')) + elif operator == '|': + # string representation of attribute is either exactly + # value or starts with value- + return lambda el: ( + str(el.get(attribute, '')) == value + or str(el.get(attribute, '')).startswith('%s-' % value)) + else: + return lambda el: el.has_key(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + if selector == 'p[class~="class1"]': + import pdb; pdb.set_trace() + tokens = selector.split() + current_context = [self] + for token in tokens: + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend([el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + if '.' in token: + # Class selector + tag, klass = token.split('.', 1) + if not tag: + tag = True + found = [] + for context in current_context: + found.extend( + context.find_all( + tag, + {'class': lambda attr: attr and klass in attr.split()} + ) + ) + current_context = found + continue + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): |