diff options
-rw-r--r-- | bs4/element.py | 108 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 228 |
2 files changed, 336 insertions, 0 deletions
diff --git a/bs4/element.py b/bs4/element.py index 650bacf..c660359 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -411,6 +411,114 @@ class PageElement(object): yield i i = i.parent + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + + r'=?"?(?P<value>[^\]"]*)"?\]$' + ) + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. + """ + if operator == '=': + # string representation of attribute is equal to value + return lambda el: str(el.get(attribute)) == value + elif operator == '~': + # string representation of attribute includes value as one + # of a set of space separated tokens + return lambda el: value in str(el.get(attribute, '')).split() + elif operator == '^': + # string representation of attribute starts with value + return lambda el: str(el.get(attribute, '')).startswith(value) + elif operator == '$': + # string represenation of attribute ends with value + return lambda el: str(el.get(attribute, '')).endswith(value) + elif operator == '*': + # string representation of attribute contains value + return lambda el: value in str(el.get(attribute, '')) + elif operator == '|': + # string representation of attribute is either exactly + # value or starts with value- + return lambda el: ( + str(el.get(attribute, '')) == value + or str(el.get(attribute, '')).startswith('%s-' % value)) + else: + return lambda el: el.has_key(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + if selector == 'p[class~="class1"]': + import pdb; pdb.set_trace() + tokens = selector.split() + current_context = [self] + for token in tokens: + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend([el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + if '.' in token: + # Class selector + tag, klass = token.split('.', 1) + if not tag: + tag = True + found = [] + for context in current_context: + found.extend( + context.find_all( + tag, + {'class': lambda attr: attr and klass in attr.split()} + ) + ) + current_context = found + continue + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 4dda90e..39f78f7 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1300,3 +1300,231 @@ class TestNavigableStringSubclasses(SoupTest): soup.insert(1, doctype) self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + +class TestSoupSelector(SoupTest): + + HTML = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> + <title>The title</title> + <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +</head> +<body> + +<div id="main"> + <div id="inner"> + <h1 id="header1">An H1</h1> + <p>Some text</p> + <p class="onep" id="p1">Some more text</p> + <h2 id="header2">An H2</h2> + <p class="class1 class2 class3" id="pmulti">Another</p> + <a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> + <h2 id="header3">Another H2</h2> + <a id="me" href="http://simonwillison.net/" rel="me">me</a> + </div> + <p lang="en" id="lang-en">English</p> + <p lang="en-gb" id="lang-en-gb">English UK</p> + <p lang="en-us" id="lang-en-us">English US</p> + <p lang="fr" id="lang-fr">French</p> +</div> + +<div id="footer"> +</div> + +</body> +</html> +""" + + def setUp(self): + self.soup = BeautifulSoup(self.HTML) + + def assertSelects(self, selector, expected_ids): + el_ids = [el['id'] for el in self.soup.select(selector)] + el_ids.sort() + expected_ids.sort() + self.assertEqual(expected_ids, el_ids, + "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + ) + + assertSelect = assertSelects + + def assertSelectMultiple(self, *tests): + for selector, expected_ids in tests: + self.assertSelect(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') + self.assertEqual(els[0].contents, [u'The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') + self.assertEqual(len(els), 3) + for div in els: + self.assertEqual(div.name, 'div') + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assertSelects('div div', ['inner']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assertSelects(selector, ['main', 'inner', 'footer']) + + def test_tag_no_match(self): + self.assertEqual(len(self.soup.select('del')), 0) + + def test_invalid_tag(self): + self.assertEqual(len(self.soup.select('tag%t')), 0) + + def test_header_tags(self): + self.assertSelectMultiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'p') + self.assertEqual(els[0]['class'], ['onep']) + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + self.assertEqual(len(els), 0) + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assertSelects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + self.assertEqual(len(els), 0) + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + self.assertEqual(len(els), 3) + for el in els: + self.assertEqual(el.name, 'p') + self.assertEqual(els[1]['class'], ['onep']) + self.assert_(not els[0].has_key('class')) + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + self.assertEqual(len(self.soup.select(selector)), 0) + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assertSelects(selector, ['pmulti']) + + def test_attribute_equals(self): + self.assertSelectMultiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + ('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assertSelectMultiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assertSelectMultiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', ['me', 'main']), + ('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ) + + def test_attribute_endswith(self): + self.assertSelectMultiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['l1', 'p1', 'header1']), + ('div[id$="1"]', []), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assertSelectMultiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('a[href*="http://"]', ['bob', 'me']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['l1', 'p1', 'header1']), + ('div[id*="1"]', []), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ) + + def test_attribute_exact_or_hypen(self): + self.assertSelectMultiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assertSelectMultiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ) |