summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 2
-rw-r--r-- bs4/element.py 15
-rw-r--r-- bs4/tests/test_tree.py 10
-rw-r--r-- doc/source/index.rst 3
4 files changed, 26 insertions, 4 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 16f4598..6a21d45 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -17,6 +17,8 @@
into a tag it's already inside, and replacing one of a tag's
children with another. [bug=997529]
+* Fixed the inability to search for non-ASCII attribute values. [bug=1003974]
+
= 4.0.5 (20120427) =
* Added a new method, wrap(), which wraps an element in a tag.
diff --git a/bs4/element.py b/bs4/element.py
index 99a3540..6fb89ea 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1287,15 +1287,24 @@ class SoupStrainer(object):
result = markup and match_against.search(markup)
elif (hasattr(match_against, '__iter__')
and markup is not None
- and not isinstance(match_against, basestring)):
+ and not isinstance(match_against, bytes)
+ and not isinstance(match_against, unicode)):
result = markup in match_against
elif hasattr(match_against, 'items'):
if markup is None:
result = len(match_against.items()) == 0
else:
result = match_against in markup
- elif match_against and isinstance(markup, basestring):
- match_against = markup.__class__(match_against)
+ elif match_against is not None:
+ if isinstance(match_against, unicode):
+ # Unicode is fine.
+ pass
+ elif isinstance(match_against, bytes):
+ # A bytestring should be converted into Unicode.
+ match_against = match_against.decode("utf8")
+ else:
+ # Anything else should be converted into a string, then to Unicode.
+ match_against = str(match_against)
if not result:
result = match_against == markup
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 1e24c29..1bb479e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -192,6 +192,14 @@ class TestFindAllByAttribute(TreeTest):
self.assertSelects(tree.find_all(id='first'),
["Matching a.", "Matching b."])
+ def test_find_all_by_utf8_attribute_value(self):
+ peace = u"םולש".encode("utf8")
+ data = u'<a title="םולש"></a>'.encode("utf8")
+ soup = self.soup(data)
+ self.assertEqual([soup.a], soup.find_all(title=peace))
+ self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
+ self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
+
def test_find_all_by_attribute_dict(self):
# You can pass in a dictionary as the argument 'attrs'. This
# lets you search for attributes like 'name' (a fixed argument
@@ -825,7 +833,7 @@ class TestTreeModification(SoupTest):
data = "<a><b></b></a>"
soup = self.soup(data)
soup.a.append(soup.b)
- self.assertEquals(data, soup.decode())
+ self.assertEqual(data, soup.decode())
def test_move_tag_to_beginning_of_parent(self):
data = "<a><b></b><c></c><d></d></a>"
diff --git a/doc/source/index.rst b/doc/source/index.rst
index e2d81aa..3a2069d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1017,6 +1017,9 @@ code finds all the <b> tags in the document::
soup.find_all('b')
# [<b>The Dormouse's story</b>]
+If you pass in a byte string, Beautiful Soup will assume the string is
+encoded as UTF-8. You can avoid this by passing in a Unicode string instead.
+
.. _a regular expression:
A regular expression