diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:58:27 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-07 10:58:27 -0400 |
commit | 1cd5ad49b15d17fac017543876ec5d0a67b57b69 (patch) | |
tree | 2b56628a4a16c53d654df0d7478ebcbc50ee5db4 | |
parent | 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (diff) |
Added support for the "nth-of-type" CSS selector. The CSS selector ">" can now find a tag by means other than the tag name. Code by Sven Slootweg.
-rw-r--r-- | NEWS.txt | 11 | ||||
-rw-r--r-- | bs4/element.py | 46 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 30 | ||||
-rw-r--r-- | doc/source/index.rst | 9 |
4 files changed, 88 insertions, 8 deletions
@@ -9,15 +9,24 @@ processing commands. [bug=1050164] * The BeautifulSoup class is now aliased to "_s" and "_soup", making - it quicker to type an import statement in an interactive session: + it quicker to type the import statement in an interactive session: from bs4 import _s or from bs4 import _soup + This may change in the future, so don't use this in code that goes + into a file. + * The prettify() method now leaves the contents of <pre> tags alone. [bug=1095654] +* Added support for the "nth-of-type" CSS selector. Code by Sven + Slootweg. [bug=1109952] + +* The CSS selector ">" can now find a tag by means other than the + tag name. Code by Sven Slootweg. [bug=1109952] + * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] diff --git a/bs4/element.py b/bs4/element.py index 398eb05..67f2a79 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -608,7 +608,7 @@ class PageElement(object): else: return lambda el: el.has_attr(attribute) - def select(self, selector): + def select(self, selector, recursive=True): """Perform a CSS selection operation on the current element.""" tokens = selector.split() current_context = [self] @@ -627,7 +627,9 @@ class PageElement(object): found = [] for context in current_context: found.extend( - [el for el in context.find_all(tag) if checker(el)]) + [el for el in + context.find_all(tag, recursive=recursive) + if checker(el)]) current_context = found continue @@ -656,15 +658,45 @@ class PageElement(object): return False return classes.issubset(tag['class']) for context in current_context: - found.extend(context.find_all(classes_match)) + found.extend(context.find_all(classes_match, recursive=recursive)) current_context = found continue + if ':' in token: + # Pseudoselector + tag_name, pseudo = token.split(':', 1) + if not tag_name: + raise ValueError( + "A pseudoselector must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] 
+ if pseudo_attributes is not None: + pseudo_type, pseudo_value = pseudo_attributes.groups() + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: + raise NotImplementedError( + 'Only numeric values are supported for the nth-of-type pseudoselector for now.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudoselector value must be at least 1.') + pseudo_value = pseudo_value - 1 + for context in current_context: + all_nodes = context.find_all(tag_name, recursive=recursive) + if pseudo_value < len(all_nodes): + found.extend([all_nodes[pseudo_value]]) + current_context = found + continue + else: + raise NotImplementedError( + 'Only the nth-of-type pseudoselector is supported for now.') + if token == '*': # Star selector found = [] for context in current_context: - found.extend(context.findAll(True)) + found.extend(context.find_all(True, recursive=recursive)) current_context = found continue @@ -676,16 +708,16 @@ class PageElement(object): found = [] for context in current_context: - found.extend(context.find_all(tag, recursive=False)) + found.extend(context.select(tag, recursive=False)) current_context = found continue - + # Here we should just have a regular tag if not self.tag_name_re.match(token): return [] found = [] for context in current_context: - found.extend(context.findAll(token)) + found.extend(context.find_all(token, recursive=recursive)) current_context = found return current_context diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 503af63..5f9e24b 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1637,6 +1637,9 @@ class TestSoupSelector(TreeTest): def test_child_selector(self): self.assertSelects('.s1 > a', ['s1a1', 's1a2']) self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_child_selector_id(self): + self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) def test_attribute_equals(self): self.assertSelectMultiple( @@ -1744,6 +1747,33 @@ class 
TestSoupSelector(TreeTest): ('p[blah]', []), ) + def test_nth_of_type(self): + # Try to select first paragraph + els = self.soup.select('div#inner p:nth-of-type(1)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, u'Some text') + + # Try to select third paragraph + els = self.soup.select('div#inner p:nth-of-type(3)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, u'Another') + + # Try to select (non-existent!) fourth paragraph + els = self.soup.select('div#inner p:nth-of-type(4)') + self.assertEqual(len(els), 0) + + # Pass in an invalid value. + self.assertRaises( + ValueError, self.soup.select, 'div p:nth-of-type(0)') + + def test_nth_of_type_direct_descendant(self): + els = self.soup.select('div#inner > p:nth-of-type(1)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, u'Some text') + + def test_id_child_selector_nth_of_type(self): + self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) + def test_select_on_element(self): # Other tests operate on the tree; this operates on an element # within the tree. diff --git a/doc/source/index.rst b/doc/source/index.rst index bfaa4d5..03d4824 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1613,6 +1613,9 @@ You can find tags:: soup.select("title") # [<title>The Dormouse's story</title>] + soup.select("p:nth-of-type(3)") + # [<p class="story">...</p>] + Find tags beneath other tags:: soup.select("body a") @@ -1633,6 +1636,12 @@ Find tags `directly` beneath other tags:: # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + soup.select("p > a:nth-of-type(2)") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + + soup.select("p > #link1") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + soup.select("body > a") # [] |