summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2013-08-12 12:02:18 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-08-12 12:02:18 -0400
commit: c22a10a31aa0bc6671ce22509dade90496736b32 (patch)
tree: 8916b29ca7e2bcc6fd7315068444cb88147db7be
parent: b7fae1bd115492eb489359715ed74a742e664f46 (diff)
All find_all calls should now return a ResultSet object. Patch by
Aaron DeVore. [bug=1194034]
-rw-r--r-- NEWS.txt 20
-rw-r--r-- bs4/element.py 24
-rw-r--r-- bs4/tests/test_tree.py 13
3 files changed, 37 insertions, 20 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 248befb..fdea5be 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,14 +1,15 @@
= 4.3.0 (Unreleased) =
* Instead of converting incoming data to Unicode and feeding it to the
- lxml tree builder, Beautiful Soup now makes successive guesses at
- the encoding of the incoming data, and tells lxml to parse the data
- as that encoding. Giving lxml more control over the parsing process
- improves performance and avoids a number of bugs and issues with the
- lxml parser which had previously required elaborate workarounds:
+ lxml tree builder in chunks, Beautiful Soup now makes successive
+ guesses at the encoding of the incoming data, and tells lxml to
+ parse the data as that encoding. Giving lxml more control over the
+ parsing process improves performance and avoids a number of bugs and
+ issues with the lxml parser which had previously required elaborate
+ workarounds:
- - An issue in which lxml refuses to parse Unicode strings.
- [bug=1180527]
+ - An issue in which lxml refuses to parse Unicode strings on some
+ systems. [bug=1180527]
- A returning bug that truncated documents longer than a (very
small) size. [bug=963880]
@@ -26,12 +27,15 @@
undocumented features have also been removed.
* Beautiful Soup will issue a warning if instead of markup you pass it
- a URL or the name of a file on disk (a common beginner mistake).
+ a URL or the name of a file on disk (a common beginner's mistake).
* A number of optimizations improve the performance of the lxml tree
builder by about 33%, the html.parser tree builder by about 20%, and
the html5lib tree builder by about 15%.
+* All find_all calls should now return a ResultSet object. Patch by
+ Aaron DeVore. [bug=1194034]
+
= 4.2.1 (20130531) =
* The default XML formatter will now replace ampersands even if they
diff --git a/bs4/element.py b/bs4/element.py
index f248895..2484853 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -477,20 +477,20 @@ class PageElement(object):
if isinstance(name, SoupStrainer):
strainer = name
- elif text is None and not limit and not attrs and not kwargs:
+ else:
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
+
+ if text is None and not limit and not attrs and not kwargs:
# Optimization to find all tags.
if name is True or name is None:
- return [element for element in generator
- if isinstance(element, Tag)]
+ result = (element for element in generator
+ if isinstance(element, Tag))
+ ResultSet(strainer, result)
# Optimization to find all tags with a given name.
elif isinstance(name, basestring):
- return [element for element in generator
- if isinstance(element, Tag) and element.name == name]
- else:
- strainer = SoupStrainer(name, attrs, text, **kwargs)
- else:
- # Build a SoupStrainer
- strainer = SoupStrainer(name, attrs, text, **kwargs)
+ result = (element for element in generator
+ if isinstance(element, Tag)
+ and element.name == name)
results = ResultSet(strainer)
while True:
try:
@@ -1602,6 +1602,6 @@ class SoupStrainer(object):
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
- def __init__(self, source):
- list.__init__([])
+ def __init__(self, source, result=()):
+ super(list, self).__init__(result)
self.source = source
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 0acc092..1c2c93b 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -115,6 +115,19 @@ class TestFindAll(TreeTest):
# recursion.
self.assertEqual([], soup.find_all(l))
+ def test_find_all_resultset(self):
+ """All find_all calls return a ResultSet"""
+ soup = self.soup("<a></a>")
+ result = soup.find_all("a")
+ self.assertTrue(hasattr(result, "source"))
+
+ result = soup.find_all(True)
+ self.assertTrue(hasattr(result, "source"))
+
+ result = soup.find_all(text="foo")
+ self.assertTrue(hasattr(result, "source"))
+
+
class TestFindAllBasicNamespaces(TreeTest):
def test_find_by_namespaced_name(self):