diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 14:07:04 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-15 14:07:04 -0500 |
commit | be0c08585f54ec709740ff4352006bf3e605b8f2 (patch) | |
tree | 342c8a482bef4490a8f0fbb528611888bcf76721 | |
parent | 0f6d3cfbef6fc0b90f0e9fbe58408e00c2383070 (diff) |
Better defined behavior when the user wants to search for a combination of text and tag-specific arguments. [bug=695312]
-rw-r--r-- | NEWS.txt | 8 | ||||
-rw-r--r-- | bs4/doc/source/index.rst | 21 | ||||
-rw-r--r-- | bs4/element.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 21 |
4 files changed, 51 insertions, 5 deletions
@@ -1,5 +1,13 @@ = 4.0.0b6 () = +* Passing text along with tag-specific arguments to a find* method: + + find("a", text="Click here") + + will find tags that contain the given text as their + .string. Previously, the tag-specific arguments were ignored and + only strings were searched. + * Fixed a bug that caused the html5lib tree builder to build a partially disconnected tree. Generally cleaned up the html5lib tree builder. diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst index 1ad6449..8b7f1e4 100644 --- a/bs4/doc/source/index.rst +++ b/bs4/doc/source/index.rst @@ -1206,6 +1206,14 @@ Here are some examples:: soup.find_all(text=is_the_only_string_within_a_tag) # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] +Although ``text`` is for finding strings, you can combine it with +arguments for finding tags: Beautiful Soup will find all tags whose +``.string`` matches your value for ``text``. This code finds the <a> +tags whose ``.string`` is "Elsie":: + + soup.find_all("a", text="Elsie") + # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>] + .. _limit: The ``limit`` argument @@ -2495,9 +2503,16 @@ Miscellaneous contains a single tag B and nothing else, then A.string is the same as B.string. (Previously, it was None.) -`Multi-valued attributes`_ like ``class`` are parsed into lists if -they have more than one value. This may affect the way you search by -CSS class. +`Multi-valued attributes`_ like ``class`` are presented as lists. This +may affect the way you search by CSS class. + +If you pass one of the ``find*`` methods both :ref:`text <text>` `and` +a tag-specific argument like :ref:`name <name>`, Beautiful Soup will +search for tags that match your tag-specific criteria and whose +:ref:`Tag.string <.string>` matches your value for :ref:`text +<text>`. It will `not` find the strings themselves. Previously, +Beautiful Soup ignored the tag-specific arguments and looked for +strings. 
The ``BeautifulSoup`` constructor no longer recognizes the `markupMassage` argument. It's now the parser's responsibility to diff --git a/bs4/element.py b/bs4/element.py index 474364b..5e15252 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -974,6 +974,8 @@ class SoupStrainer(object): found = markup else: found = markup_name + if found and self.text and self.text != found.string: + found = None return found searchTag = search_tag @@ -991,12 +993,12 @@ class SoupStrainer(object): # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): - if not self.text: + if not self.text or self.name or self.attrs: found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ isinstance(markup, basestring): - if self._matches(markup, self.text): + if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: raise Exception( diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 4da6fd9..2e74c00 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -277,6 +277,27 @@ class TestFindAllByAttribute(TreeTest): self.assertSelects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) + def test_find_by_name_and_containing_string(self): + soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>") + a = soup.a + + self.assertEqual([a], soup.find_all("a", text="foo")) + self.assertEqual([], soup.find_all("a", text="bar")) + self.assertEqual([], soup.find_all("a", text="bar")) + + def test_find_by_name_and_containing_string_when_string_is_buried(self): + soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>") + self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) + + def test_find_by_attribute_and_containing_string(self): + soup = self.soup('<b id="1">foo</b><a id="2">foo</a>') + a = soup.a + + self.assertEqual([a], soup.find_all(id=2, text="foo")) + self.assertEqual([], 
soup.find_all(id=1, text="bar")) + + + class TestIndex(TreeTest): """Test Tag.index""" |