summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2013-05-08 09:21:56 -0400
committerLeonard Richardson <leonardr@segfault.org>2013-05-08 09:21:56 -0400
commite2df0c682a1348a0170ac5efa1775cf38a49de6d (patch)
tree6a1da4e46a0bb1e38ac9459d717d5935a4ba6972
parent39efcb4b7ab30145b3733ba820f3c0df0da35ace (diff)
Initial refactoring.
-rw-r--r--bs4/element.py130
1 files changed, 62 insertions, 68 deletions
diff --git a/bs4/element.py b/bs4/element.py
index f4d5c40..7d84083 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -617,55 +617,42 @@ class PageElement(object):
# already found direct descendants in last step. skip this
# step.
continue
+
+ # Each operation corresponds to a candidate generator (a
+ # rule for finding tags that might match) and a checker function (a
+ # rule for determining whether a tag does match.
+ production_rule = None
+ checker = None
+
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
- tag, attribute, operator, value = m.groups()
- if not tag:
- tag = True
+ tag_name, attribute, operator, value = m.groups()
+ if not tag_name:
+ tag_name = True
+ production_rule = lambda tag: tag.find_all(tag_name)
checker = self._attribute_checker(operator, attribute, value)
- found = []
- for context in current_context:
- found.extend(
- [el for el in
- context.find_all(tag, recursive=recursive)
- if checker(el)])
- current_context = found
- continue
- if '#' in token:
+ elif '#' in token:
# ID selector
- tag, id = token.split('#', 1)
- if tag == "":
- tag = True
- if len(current_context) == 0:
- # No match.
- return []
- el = current_context[0].find(tag, {'id': id})
- if el is None:
- return [] # No match
- current_context = [el]
- continue
+ tag_name, id = token.split('#', 1)
+ if tag_name == "":
+ tag_name = True
+ production_rule = lambda tag: tag.find(tag_name, id=id)
+ checker = None
- if '.' in token:
+ elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
- if not tag_name:
+ if tag_name == '':
tag_name = True
classes = set(klass.split('.'))
- found = []
- def classes_match(tag):
- if tag_name is not True and tag.name != tag_name:
- return False
- if not tag.has_attr('class'):
- return False
- return classes.issubset(tag['class'])
- for context in current_context:
- found.extend(context.find_all(classes_match, recursive=recursive))
- current_context = found
- continue
+ production_rule = lambda tag: tag.find_all(tag_name)
+ def classes_match(candidate):
+ return classes.issubset(tag.getattr('class', []))
+ checker = classes_match
- if ':' in token:
+ elif ':' in token:
# Pseudoselector
tag_name, pseudo = token.split(':', 1)
if not tag_name:
@@ -684,44 +671,51 @@ class PageElement(object):
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudoselector value must be at least 1.')
- pseudo_value = pseudo_value - 1
- for context in current_context:
- all_nodes = context.find_all(tag_name, recursive=recursive)
- if pseudo_value < len(all_nodes):
- found.extend([all_nodes[pseudo_value]])
- current_context = found
- continue
+ count = 0
+ def checker(tag):
+ if tag.name == tag_name:
+ count += 1
+ if pseudo_value == count:
+ return True
+ return False
+ production_rule = lambda tag: tag.find_all(tag_name, limit=pseudo_value)
+ checker = checker
else:
raise NotImplementedError(
- 'Only the nth-of-type pseudoselector is supported for now.')
+ 'The following pseudoselectors are implemented: nth-of-type.')
- if token == '*':
+ elif token == '*':
# Star selector
- found = []
- for context in current_context:
- found.extend(context.find_all(True, recursive=recursive))
- current_context = found
- continue
+ production_rule = lambda tag: tag.find_all()
+ checker = lambda x: True
- if token == '>':
+ elif token == '>':
# Child selector
- tag = tokens[index + 1]
- if not tag:
- tag = True
+ # TODO If this is the last token, there's a problem.
+ next_selector = tokens[index + 1]
+ production_rule = lambda tag: tag.select(next_selector)
+ checker = lambda candidate: True
+
+ elif self.tag_name_re.match(token):
+ tag_name = token
+ production_rule = lambda tag: tag.find_all(tag_name)
+ checker = lambda candidate: True
+ else:
+ raise ValueError(
+ 'Unsupported or invalid CSS selector: "%s"' % token)
+
+ # We now have a production rule and a checker. Apply the
+ # production rule to every member of the new context to
+ # find candidates. Check each candidate against the
+ # checker. The new context is the set of candidates that
+ # pass the checker.
+ new_context = []
+ for tag in current_context:
+ for candidate in production_rule(tag):
+ if checker(candidate):
+ new_context.append(candidate)
+ current_context = new_context
- found = []
- for context in current_context:
- found.extend(context.select(tag, recursive=False))
- current_context = found
- continue
-
- # Here we should just have a regular tag
- if not self.tag_name_re.match(token):
- return []
- found = []
- for context in current_context:
- found.extend(context.find_all(token, recursive=recursive))
- current_context = found
return current_context
# Old non-property versions of the generators, for backwards