Added support for the "nth-of-type" CSS selector. The CSS selector ">" can now find a tag by means other than the tag name. Code by Sven Slootweg.

author: Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:58:27 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-05-07 10:58:27 -0400
commit: 1cd5ad49b15d17fac017543876ec5d0a67b57b69 (patch)
tree: 2b56628a4a16c53d654df0d7478ebcbc50ee5db4
parent: 431e078fbdb54adeb3875cb8c5cc75d6722de2bd (diff)
4 files changed, 88 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 03418ab..dbc9cae 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -9,15 +9,24 @@
   processing commands. [bug=1050164]
 
 * The BeautifulSoup class is now aliased to "_s" and "_soup", making
-  it quicker to type an import statement in an interactive session:
+  it quicker to type the import statement in an interactive session:
 
   from bs4 import _s
    or
   from bs4 import _soup
 
+  This may change in the future, so don't use this in code that goes
+  into a file.
+
 * The prettify() method now leaves the contents of <pre> tags
   alone. [bug=1095654]
 
+* Added support for the "nth-of-type" CSS selector. Code by Sven
+  Slootweg. [bug=1109952]
+
+* The CSS selector ">" can now find a tag by means other than the
+  tag name. Code by Sven Slootweg. [bug=1109952]
+
 * Fix a bug in the html5lib treebuilder which sometimes created
   disconnected trees. [bug=1039527]
 
diff --git a/bs4/element.py b/bs4/element.py
index 398eb05..67f2a79 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -608,7 +608,7 @@ class PageElement(object):
         else:
             return lambda el: el.has_attr(attribute)
 
-    def select(self, selector):
+    def select(self, selector, recursive=True):
         """Perform a CSS selection operation on the current element."""
         tokens = selector.split()
         current_context = [self]
@@ -627,7 +627,9 @@ class PageElement(object):
                 found = []
                 for context in current_context:
                     found.extend(
-                        [el for el in context.find_all(tag) if checker(el)])
+                        [el for el in
+                         context.find_all(tag, recursive=recursive)
+                         if checker(el)])
                 current_context = found
                 continue
 
@@ -656,15 +658,45 @@ class PageElement(object):
                         return False
                     return classes.issubset(tag['class'])
                 for context in current_context:
-                    found.extend(context.find_all(classes_match))
+                    found.extend(context.find_all(classes_match, recursive=recursive))
                 current_context = found
                 continue
 
+            if ':' in token:
+                # Pseudoselector
+                tag_name, pseudo = token.split(':', 1)
+                if not tag_name:
+                    raise ValueError(
+                        "A pseudoselector must be prefixed with a tag name.")
+                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                found = []
+                if pseudo_attributes is not None:
+                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                    if pseudo_type == 'nth-of-type':
+                        try:
+                            pseudo_value = int(pseudo_value)
+                        except:
+                            raise NotImplementedError(
+                                'Only numeric values are supported for the nth-of-type pseudoselector for now.')
+                        if pseudo_value < 1:
+                            raise ValueError(
+                                'nth-of-type pseudoselector value must be at least 1.')
+                        pseudo_value = pseudo_value - 1
+                        for context in current_context:
+                            all_nodes = context.find_all(tag_name, recursive=recursive)
+                            if pseudo_value < len(all_nodes):
+                                found.extend([all_nodes[pseudo_value]])
+                        current_context = found
+                        continue
+                    else:
+                        raise NotImplementedError(
+                            'Only the nth-of-type pseudoselector is supported for now.')
+
             if token == '*':
                 # Star selector
                 found = []
                 for context in current_context:
-                    found.extend(context.findAll(True))
+                    found.extend(context.find_all(True, recursive=recursive))
                 current_context = found
                 continue
 
@@ -676,16 +708,16 @@ class PageElement(object):
 
                 found = []
                 for context in current_context:
-                    found.extend(context.find_all(tag, recursive=False))
+                    found.extend(context.select(tag, recursive=False))
                 current_context = found
                 continue
-
+            
             # Here we should just have a regular tag
             if not self.tag_name_re.match(token):
                 return []
             found = []
             for context in current_context:
-                found.extend(context.findAll(token))
+                found.extend(context.find_all(token, recursive=recursive))
             current_context = found
         return current_context
 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 503af63..5f9e24b 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1637,6 +1637,9 @@ class TestSoupSelector(TreeTest):
     def test_child_selector(self):
         self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
         self.assertSelects('.s1 > a span', ['s1a2s1'])
+        
+    def test_child_selector_id(self):
+        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
 
     def test_attribute_equals(self):
         self.assertSelectMultiple(
@@ -1744,6 +1747,33 @@ class TestSoupSelector(TreeTest):
             ('p[blah]', []),
         )
 
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Another')
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        self.assertEqual(len(els), 0)
+
+        # Pass in an invalid value.
+        self.assertRaises(
+            ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
     def test_select_on_element(self):
         # Other tests operate on the tree; this operates on an element
         # within the tree.
diff --git a/doc/source/index.rst b/doc/source/index.rst
index bfaa4d5..03d4824 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1613,6 +1613,9 @@ You can find tags::
  soup.select("title")
  # [<title>The Dormouse's story</title>]
 
+ soup.select("p:nth-of-type(3)")
+ # [<p class="story">...</p>]
+
 Find tags beneath other tags::
 
  soup.select("body a")
@@ -1633,6 +1636,12 @@ Find tags `directly` beneath other tags::
  #  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
  #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
 
+ soup.select("p > a:nth-of-type(2)")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ soup.select("p > #link1")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
  soup.select("body > a")
  # []
author	Leonard Richardson <leonardr@segfault.org>	2013-05-07 10:58:27 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2013-05-07 10:58:27 -0400
commit	1cd5ad49b15d17fac017543876ec5d0a67b57b69 (patch)
tree	2b56628a4a16c53d654df0d7478ebcbc50ee5db4
parent	431e078fbdb54adeb3875cb8c5cc75d6722de2bd (diff)