summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/element.py 124
-rw-r--r-- bs4/tests/test_tree.py 10
3 files changed, 91 insertions, 49 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 6a21d45..2682720 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -17,7 +17,11 @@
into a tag it's already inside, and replacing one of a tag's
children with another. [bug=997529]
-* Fixed the inability to search for non-ASCII attribute values. [bug=1003974]
+* Fixed the inability to search for non-ASCII attribute
+ values. [bug=1003974]
+
+ This caused a major refactoring of the search code. All the tests
+ pass, but it's possible that some searches will behave differently.
= 4.0.5 (20120427) =
diff --git a/bs4/element.py b/bs4/element.py
index 6fb89ea..91a4007 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1161,20 +1161,55 @@ class SoupStrainer(object):
text)."""
def __init__(self, name=None, attrs={}, text=None, **kwargs):
- self.name = name
+ self.name = self._normalize_search_value(name)
if not isinstance(attrs, dict):
# Treat a non-dict value for attrs as a search for the 'class'
# attribute.
kwargs['class'] = attrs
attrs = None
+
if kwargs:
if attrs:
attrs = attrs.copy()
attrs.update(kwargs)
else:
attrs = kwargs
- self.attrs = attrs
- self.text = text
+ normalized_attrs = {}
+ for key, value in attrs.items():
+ normalized_attrs[key] = self._normalize_search_value(value)
+
+ self.attrs = normalized_attrs
+ self.text = self._normalize_search_value(text)
+
+ def _normalize_search_value(self, value):
+ # Leave it alone if it's a Unicode string, a callable, a
+ # regular expression, a boolean, or None.
+ if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+ or isinstance(value, bool) or value is None):
+ return value
+
+ # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
+ if isinstance(value, bytes):
+ return value.decode("utf8")
+
+ # If it's listlike, convert it into a list of strings.
+ if hasattr(value, '__iter__'):
+ new_value = []
+ for v in value:
+ if (hasattr(v, '__iter__') and not isinstance(v, bytes)
+ and not isinstance(v, unicode)):
+ # This is almost certainly the user's mistake. In the
+ # interests of avoiding infinite loops, we'll let
+ # it through as-is rather than doing a recursive call.
+ new_value.append(v)
+ else:
+ new_value.append(self._normalize_search_value(v))
+ return new_value
+
+ # Otherwise, convert it into a Unicode string.
+ # The unicode(str()) thing is so this will do the same thing on Python 2
+ # and Python 3.
+ return unicode(str(value))
def __str__(self):
if self.text:
@@ -1250,13 +1285,12 @@ class SoupStrainer(object):
return found
def _matches(self, markup, match_against):
- #print "Matching %s against %s" % (markup, match_against)
+ # print u"Matching %s against %s" % (markup, match_against)
result = False
-
if isinstance(markup, list) or isinstance(markup, tuple):
- # This should only happen when searching, e.g. the 'class'
- # attribute.
- if (isinstance(match_against, basestring)
+ # This should only happen when searching a multi-valued attribute
+ # like 'class'.
+ if (isinstance(match_against, unicode)
and ' ' in match_against):
# A bit of a special case. If they try to match "foo
# bar" on a multivalue attribute's value, only accept
@@ -1265,50 +1299,44 @@ class SoupStrainer(object):
# XXX This is going to be pretty slow because we keep
# splitting match_against. But it shouldn't come up
# too often.
- result = (whitespace_re.split(match_against) == markup)
+ return (whitespace_re.split(match_against) == markup)
else:
for item in markup:
if self._matches(item, match_against):
- result = True
- elif match_against is True:
- result = markup is not None
- elif isinstance(match_against, collections.Callable):
- result = match_against(markup)
- else:
- #Custom match methods take the tag as an argument, but all
- #other ways of matching match the tag name as a string.
- if isinstance(markup, Tag):
- markup = markup.name
- if markup is not None and not isinstance(markup, basestring):
- markup = unicode(markup)
- #Now we know that chunk is either a string, or None.
- if hasattr(match_against, 'match'):
- # It's a regexp object.
- result = markup and match_against.search(markup)
- elif (hasattr(match_against, '__iter__')
- and markup is not None
- and not isinstance(match_against, bytes)
- and not isinstance(match_against, unicode)):
- result = markup in match_against
- elif hasattr(match_against, 'items'):
- if markup is None:
- result = len(match_against.items()) == 0
- else:
- result = match_against in markup
- elif match_against is not None:
- if isinstance(match_against, unicode):
- # Unicode is fine.
- pass
- elif isinstance(match_against, bytes):
- # A bytestring should be converted into Unicode.
- match_against = match_against.decode("utf8")
- else:
- # Anything else should be converted into a string, then to Unicode.
- match_against = str(match_against)
+ return True
+ return False
+
+ if match_against is True:
+ # True matches any non-None value.
+ return markup is not None
+
+ if isinstance(match_against, collections.Callable):
+ return match_against(markup)
+
+ # Custom callables take the tag as an argument, but all
+ # other ways of matching match the tag name as a string.
+ if isinstance(markup, Tag):
+ markup = markup.name
+
+ # Ensure that `markup` is either a Unicode string, or None.
+ markup = self._normalize_search_value(markup)
+
+ if markup is None:
+ # None matches None, False, an empty string, an empty list, and so on.
+ return not match_against
+
+ if isinstance(match_against, unicode):
+ # Exact string match
+ return markup == match_against
+
+ if hasattr(match_against, 'match'):
+ # Regexp match
+ return match_against.search(markup)
- if not result:
- result = match_against == markup
- return result
+ if hasattr(match_against, '__iter__'):
+ # The markup must be an exact match against something
+ # in the iterable.
+ return markup in match_against
class ResultSet(list):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 1bb479e..cc573ed 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -104,6 +104,16 @@ class TestFindAll(TreeTest):
self.assertSelects(soup('a', limit=1), ["1"])
self.assertSelects(soup.b(id="foo"), ["3"])
+ def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
+ soup = self.soup("<a></a>")
+ # Create a self-referential list.
+ l = []
+ l.append(l)
+
+ # Without special code in _normalize_search_value, this would cause infinite
+ # recursion.
+ self.assertEqual([], soup.find_all(l))
+
class TestFindAllBasicNamespaces(TreeTest):
def test_find_by_namespaced_name(self):