From d91ac0c2a204fa79b4796cf079929fbbc6d214c8 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 08:29:35 -0500 Subject: Initial port of code and tests. --- bs4/element.py | 108 +++++++++++++++++++++++ bs4/tests/test_tree.py | 228 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 336 insertions(+) diff --git a/bs4/element.py b/bs4/element.py index 650bacf..c660359 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -411,6 +411,114 @@ class PageElement(object): yield i i = i.parent + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'=?"?(?P[^\]"]*)"?\]$' + ) + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. + """ + if operator == '=': + # string representation of attribute is equal to value + return lambda el: str(el.get(attribute)) == value + elif operator == '~': + # string representation of attribute includes value as one + # of a set of space separated tokens + return lambda el: value in str(el.get(attribute, '')).split() + elif operator == '^': + # string representation of attribute starts with value + return lambda el: str(el.get(attribute, '')).startswith(value) + elif operator == '$': + # string represenation of attribute ends with value + return lambda el: str(el.get(attribute, '')).endswith(value) + elif operator == '*': + # string representation of attribute contains value + return lambda el: value in str(el.get(attribute, '')) + elif operator == '|': + # string representation of attribute is either exactly + # value or starts with value- + return lambda el: ( + str(el.get(attribute, '')) == value + or str(el.get(attribute, '')).startswith('%s-' % value)) + else: + return lambda el: el.has_key(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + if selector == 'p[class~="class1"]': + import pdb; pdb.set_trace() + tokens = selector.split() + current_context = [self] + for token in tokens: + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend([el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + if '.' in token: + # Class selector + tag, klass = token.split('.', 1) + if not tag: + tag = True + found = [] + for context in current_context: + found.extend( + context.find_all( + tag, + {'class': lambda attr: attr and klass in attr.split()} + ) + ) + current_context = found + continue + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 4dda90e..39f78f7 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1300,3 +1300,231 @@ class TestNavigableStringSubclasses(SoupTest): soup.insert(1, doctype) self.assertEqual(soup.encode(), b"\n") + +class TestSoupSelector(SoupTest): + + HTML = """ + + + The title + + + + +
+
+

An H1

+

Some text

+

Some more text

+

An H2

+

Another

+ Bob +

Another H2

+ me +
+

English

+

English UK

+

English US

+

French

+
+ + + + + +""" + + def setUp(self): + self.soup = BeautifulSoup(self.HTML) + + def assertSelects(self, selector, expected_ids): + el_ids = [el['id'] for el in self.soup.select(selector)] + el_ids.sort() + expected_ids.sort() + self.assertEqual(expected_ids, el_ids, + "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + ) + + assertSelect = assertSelects + + def assertSelectMultiple(self, *tests): + for selector, expected_ids in tests: + self.assertSelect(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') + self.assertEqual(els[0].contents, [u'The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') + self.assertEqual(len(els), 3) + for div in els: + self.assertEqual(div.name, 'div') + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assertSelects('div div', ['inner']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assertSelects(selector, ['main', 'inner', 'footer']) + + def test_tag_no_match(self): + self.assertEqual(len(self.soup.select('del')), 0) + + def test_invalid_tag(self): + self.assertEqual(len(self.soup.select('tag%t')), 0) + + def test_header_tags(self): + self.assertSelectMultiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'p') + self.assertEqual(els[0]['class'], ['onep']) + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + self.assertEqual(len(els), 0) + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assertSelects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + self.assertEqual(len(els), 0) + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + self.assertEqual(len(els), 3) + for el in els: + self.assertEqual(el.name, 'p') + self.assertEqual(els[1]['class'], ['onep']) + self.assert_(not els[0].has_key('class')) + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + self.assertEqual(len(self.soup.select(selector)), 0) + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assertSelects(selector, ['pmulti']) + + def test_attribute_equals(self): + self.assertSelectMultiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + ('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assertSelectMultiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assertSelectMultiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', ['me', 'main']), + ('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ) + + def test_attribute_endswith(self): + self.assertSelectMultiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['l1', 'p1', 'header1']), + ('div[id$="1"]', []), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assertSelectMultiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('a[href*="http://"]', ['bob', 'me']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['l1', 'p1', 'header1']), + ('div[id*="1"]', []), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ) + + def test_attribute_exact_or_hypen(self): + self.assertSelectMultiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assertSelectMultiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ) -- cgit v1.2.3 From 8bbc84dfc6324a32066b87cf3a78ce0eb719e289 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 08:49:27 -0500 Subject: Got tests to pass on Python 2 and Python 3. --- bs4/element.py | 40 +++++++++++++++++++++++++++------------- bs4/tests/test_tree.py | 2 +- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index c660359..3baafe3 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -427,6 +427,17 @@ class PageElement(object): r'=?"?(?P[^\]"]*)"?\]$' ) + def _attr_value_as_string(self, value, default=None): + """Force an attribute value into a string representation. + + A multi-valued attribute will be converted into a + space-separated stirng. + """ + value = self.get(value, default) + if isinstance(value, list) or isinstance(value, tuple): + value =" ".join(value) + return value + def _attribute_checker(self, operator, attribute, value=''): """Create a function that performs a CSS selector operation. @@ -436,33 +447,36 @@ class PageElement(object): """ if operator == '=': # string representation of attribute is equal to value - return lambda el: str(el.get(attribute)) == value + return lambda el: el._attr_value_as_string(attribute) == value elif operator == '~': - # string representation of attribute includes value as one - # of a set of space separated tokens - return lambda el: value in str(el.get(attribute, '')).split() + def _includes_value(element): + attribute_value = element.get(attribute, []) + if not isinstance(attribute_value, list): + attribute_value = attribute_value.split() + return value in attribute_value + return _includes_value elif operator == '^': # string representation of attribute starts with value - return lambda el: str(el.get(attribute, '')).startswith(value) + return lambda el: el._attr_value_as_string(attribute, '').startswith(value) elif operator == '$': # string represenation of attribute ends with value - return lambda el: str(el.get(attribute, '')).endswith(value) + return lambda el: el._attr_value_as_string(attribute, '').endswith(value) elif operator == '*': # string representation of attribute contains value - return lambda el: value in str(el.get(attribute, '')) + return lambda el: value in el._attr_value_as_string(attribute, '') elif operator == '|': # string representation of attribute is either exactly # value or starts with value- - return lambda el: ( - str(el.get(attribute, '')) == value - or str(el.get(attribute, '')).startswith('%s-' % value)) + def _is_or_starts_with_dash(element): + attribute_value = element._attr_value_as_string(attribute, '') + return (attribute_value == value or attribute_value.startswith( + value + '-')) + return _is_or_starts_with_dash else: - return lambda el: el.has_key(attribute) + return lambda el: el.has_attr(attribute) def select(self, selector): """Perform a CSS selection operation on the current element.""" - if selector == 'p[class~="class1"]': - import pdb; pdb.set_trace() tokens = selector.split() current_context = [self] for token in tokens: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 39f78f7..bfc4218 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1412,7 +1412,7 @@ class TestSoupSelector(SoupTest): for el in els: self.assertEqual(el.name, 'p') self.assertEqual(els[1]['class'], ['onep']) - self.assert_(not els[0].has_key('class')) + self.assertFalse(els[0].has_key('class')) def test_a_bunch_of_emptys(self): for selector in ('div#main del', 'div#main div.oops', 'div div#main'): -- cgit v1.2.3 From 4a5136d31bf07a7b28b58343f0c32e41d895e110 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 09:06:25 -0500 Subject: Test that CSS selectors work within the tree as well as at the top level. --- AUTHORS.txt | 10 +++++++--- NEWS.txt | 4 ++++ bs4/element.py | 20 ++++++++++++-------- bs4/tests/test_tree.py | 11 ++++++++++- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/AUTHORS.txt b/AUTHORS.txt index e093cd6..2ac8fcc 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -11,12 +11,16 @@ of UnicodeDammit. Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful Soup 4 working under Python 3. +Simon Willison wrote soupselect, which was used to make Beautiful Soup +support CSS selectors. + Sam Ruby helped with a lot of edge cases. Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his work in solving the nestable tags conundrum. -The following people have contributed patches to Beautiful Soup: +An incomplete list of people have contributed patches to Beautiful +Soup: Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris @@ -26,8 +30,8 @@ The following people have contributed patches to Beautiful Soup: Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn Webster, Paul Wright, Danny Yoo -The following people made suggestions or found bugs or found ways to -break Beautiful Soup: +An incomplete list of people who made suggestions or found bugs or +found ways to break Beautiful Soup: Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, diff --git a/NEWS.txt b/NEWS.txt index 3079aa1..fff19ad 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,3 +1,7 @@ += 4.0.0b10 () = + +* Added support for CSS selectors, taken from the soupselect project. + = 4.0.0b9 (20110228) = * Fixed the string representation of DOCTYPEs that have both a public diff --git a/bs4/element.py b/bs4/element.py index 3baafe3..584e171 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -446,9 +446,11 @@ class PageElement(object): combination. """ if operator == '=': - # string representation of attribute is equal to value + # string representation of `attribute` is equal to `value` return lambda el: el._attr_value_as_string(attribute) == value elif operator == '~': + # space-separated list representation of `attribute` + # contains `value` def _includes_value(element): attribute_value = element.get(attribute, []) if not isinstance(attribute_value, list): @@ -456,17 +458,19 @@ class PageElement(object): return value in attribute_value return _includes_value elif operator == '^': - # string representation of attribute starts with value - return lambda el: el._attr_value_as_string(attribute, '').startswith(value) + # string representation of `attribute` starts with `value` + return lambda el: el._attr_value_as_string( + attribute, '').startswith(value) elif operator == '$': - # string represenation of attribute ends with value - return lambda el: el._attr_value_as_string(attribute, '').endswith(value) + # string represenation of `attribute` ends with `value` + return lambda el: el._attr_value_as_string( + attribute, '').endswith(value) elif operator == '*': - # string representation of attribute contains value + # string representation of `attribute` contains `value` return lambda el: value in el._attr_value_as_string(attribute, '') elif operator == '|': - # string representation of attribute is either exactly - # value or starts with value- + # string representation of `attribute` is either exactly + # `value` or starts with `value` and then a dash. def _is_or_starts_with_dash(element): attribute_value = element._attr_value_as_string(attribute, '') return (attribute_value == value or attribute_value.startswith( diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index bfc4218..3f32736 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1301,7 +1301,7 @@ class TestNavigableStringSubclasses(SoupTest): self.assertEqual(soup.encode(), b"\n") -class TestSoupSelector(SoupTest): +class TestSoupSelector(TreeTest): HTML = """ @@ -1528,3 +1528,12 @@ class TestSoupSelector(SoupTest): ('[blah]', []), ('p[blah]', []), ) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. + inner = self.soup.find("div", id="main") + selected = inner.select("div") + # The
tag was selected. The