summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:58:27 -0400
committer Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:58:27 -0400
commit 1cd5ad49b15d17fac017543876ec5d0a67b57b69 (patch)
tree 2b56628a4a16c53d654df0d7478ebcbc50ee5db4
parent 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (diff)
Added support for the "nth-of-type" CSS selector. The CSS selector ">" can now find a tag by means other than the tag name. Code by Sven Slootweg.
-rw-r--r-- NEWS.txt 11
-rw-r--r-- bs4/element.py 46
-rw-r--r-- bs4/tests/test_tree.py 30
-rw-r--r-- doc/source/index.rst 9
4 files changed, 88 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 03418ab..dbc9cae 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -9,15 +9,24 @@
processing commands. [bug=1050164]
* The BeautifulSoup class is now aliased to "_s" and "_soup", making
- it quicker to type an import statement in an interactive session:
+ it quicker to type the import statement in an interactive session:
from bs4 import _s
or
from bs4 import _soup
+ This may change in the future, so don't use this in code that goes
+ into a file.
+
* The prettify() method now leaves the contents of <pre> tags
alone. [bug=1095654]
+* Added support for the "nth-of-type" CSS selector. Code by Sven
+ Slootweg. [bug=1109952]
+
+* The CSS selector ">" can now find a tag by means other than the
+ tag name. Code by Sven Slootweg. [bug=1109952]
+
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
diff --git a/bs4/element.py b/bs4/element.py
index 398eb05..67f2a79 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -608,7 +608,7 @@ class PageElement(object):
else:
return lambda el: el.has_attr(attribute)
- def select(self, selector):
+ def select(self, selector, recursive=True):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
@@ -627,7 +627,9 @@ class PageElement(object):
found = []
for context in current_context:
found.extend(
- [el for el in context.find_all(tag) if checker(el)])
+ [el for el in
+ context.find_all(tag, recursive=recursive)
+ if checker(el)])
current_context = found
continue
@@ -656,15 +658,45 @@ class PageElement(object):
return False
return classes.issubset(tag['class'])
for context in current_context:
- found.extend(context.find_all(classes_match))
+ found.extend(context.find_all(classes_match, recursive=recursive))
current_context = found
continue
+ if ':' in token:
+ # Pseudoselector
+ tag_name, pseudo = token.split(':', 1)
+ if not tag_name:
+ raise ValueError(
+ "A pseudoselector must be prefixed with a tag name.")
+ pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+ found = []
+ if pseudo_attributes is not None:
+ pseudo_type, pseudo_value = pseudo_attributes.groups()
+ if pseudo_type == 'nth-of-type':
+ try:
+ pseudo_value = int(pseudo_value)
+ except:
+ raise NotImplementedError(
+ 'Only numeric values are supported for the nth-of-type pseudoselector for now.')
+ if pseudo_value < 1:
+ raise ValueError(
+ 'nth-of-type pseudoselector value must be at least 1.')
+ pseudo_value = pseudo_value - 1
+ for context in current_context:
+ all_nodes = context.find_all(tag_name, recursive=recursive)
+ if pseudo_value < len(all_nodes):
+ found.extend([all_nodes[pseudo_value]])
+ current_context = found
+ continue
+ else:
+ raise NotImplementedError(
+ 'Only the nth-of-type pseudoselector is supported for now.')
+
if token == '*':
# Star selector
found = []
for context in current_context:
- found.extend(context.findAll(True))
+ found.extend(context.find_all(True, recursive=recursive))
current_context = found
continue
@@ -676,16 +708,16 @@ class PageElement(object):
found = []
for context in current_context:
- found.extend(context.find_all(tag, recursive=False))
+ found.extend(context.select(tag, recursive=False))
current_context = found
continue
-
+
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
found = []
for context in current_context:
- found.extend(context.findAll(token))
+ found.extend(context.find_all(token, recursive=recursive))
current_context = found
return current_context
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 503af63..5f9e24b 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1637,6 +1637,9 @@ class TestSoupSelector(TreeTest):
def test_child_selector(self):
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
self.assertSelects('.s1 > a span', ['s1a2s1'])
+
+ def test_child_selector_id(self):
+ self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self):
self.assertSelectMultiple(
@@ -1744,6 +1747,33 @@ class TestSoupSelector(TreeTest):
('p[blah]', []),
)
+ def test_nth_of_type(self):
+ # Try to select first paragraph
+ els = self.soup.select('div#inner p:nth-of-type(1)')
+ self.assertEqual(len(els), 1)
+ self.assertEqual(els[0].string, u'Some text')
+
+ # Try to select third paragraph
+ els = self.soup.select('div#inner p:nth-of-type(3)')
+ self.assertEqual(len(els), 1)
+ self.assertEqual(els[0].string, u'Another')
+
+ # Try to select (non-existent!) fourth paragraph
+ els = self.soup.select('div#inner p:nth-of-type(4)')
+ self.assertEqual(len(els), 0)
+
+ # Pass in an invalid value.
+ self.assertRaises(
+ ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+ def test_nth_of_type_direct_descendant(self):
+ els = self.soup.select('div#inner > p:nth-of-type(1)')
+ self.assertEqual(len(els), 1)
+ self.assertEqual(els[0].string, u'Some text')
+
+ def test_id_child_selector_nth_of_type(self):
+ self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
diff --git a/doc/source/index.rst b/doc/source/index.rst
index bfaa4d5..03d4824 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1613,6 +1613,9 @@ You can find tags::
soup.select("title")
# [<title>The Dormouse's story</title>]
+ soup.select("p:nth-of-type(3)")
+ # [<p class="story">...</p>]
+
Find tags beneath other tags::
soup.select("body a")
@@ -1633,6 +1636,12 @@ Find tags `directly` beneath other tags::
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+ soup.select("p > a:nth-of-type(2)")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ soup.select("p > #link1")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
soup.select("body > a")
# []