diff options
-rw-r--r-- | NEWS.txt | 2 | ++
-rw-r--r-- | bs4/element.py | 15 | ++++++++++++---
-rw-r--r-- | bs4/tests/test_tree.py | 10 | +++++++++-
-rw-r--r-- | doc/source/index.rst | 3 | +++
4 files changed, 26 insertions, 4 deletions
@@ -17,6 +17,8 @@ into a tag it's already inside, and replacing one of a tag's children with another. [bug=997529] +* Fixed the inability to search for non-ASCII attribute values. [bug=1003974] + = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. diff --git a/bs4/element.py b/bs4/element.py index 99a3540..6fb89ea 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1287,15 +1287,24 @@ class SoupStrainer(object): result = markup and match_against.search(markup) elif (hasattr(match_against, '__iter__') and markup is not None - and not isinstance(match_against, basestring)): + and not isinstance(match_against, bytes) + and not isinstance(match_against, unicode)): result = markup in match_against elif hasattr(match_against, 'items'): if markup is None: result = len(match_against.items()) == 0 else: result = match_against in markup - elif match_against and isinstance(markup, basestring): - match_against = markup.__class__(match_against) + elif match_against is not None: + if isinstance(match_against, unicode): + # Unicode is fine. + pass + elif isinstance(match_against, bytes): + # A bytestring should be converted into Unicode. + match_against = match_against.decode("utf8") + else: + # Anything else should be converted into a string, then to Unicode. 
+ match_against = str(match_against) if not result: result = match_against == markup diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 1e24c29..1bb479e 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -192,6 +192,14 @@ class TestFindAllByAttribute(TreeTest): self.assertSelects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) + def test_find_all_by_utf8_attribute_value(self): + peace = u"םולש".encode("utf8") + data = u'<a title="םולש"></a>'.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) + self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) + def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument @@ -825,7 +833,7 @@ class TestTreeModification(SoupTest): data = "<a><b></b></a>" soup = self.soup(data) soup.a.append(soup.b) - self.assertEquals(data, soup.decode()) + self.assertEqual(data, soup.decode()) def test_move_tag_to_beginning_of_parent(self): data = "<a><b></b><c></c><d></d></a>" diff --git a/doc/source/index.rst b/doc/source/index.rst index e2d81aa..3a2069d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1017,6 +1017,9 @@ code finds all the <b> tags in the document:: soup.find_all('b') # [<b>The Dormouse's story</b>] +If you pass in a byte string, Beautiful Soup will assume the string is +encoded as UTF-8. You can avoid this by passing in a Unicode string instead. + .. _a regular expression: A regular expression |