# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ from pdb import set_trace import copy import pickle import pytest import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( PY3K, CData, Comment, Declaration, Doctype, Formatter, NavigableString, Script, SoupStrainer, Stylesheet, Tag, TemplateString, ) from bs4.testing import ( SoupTest, skipIf, ) from soupsieve import SelectorSyntaxError XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assert_selects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ assert [tag.string for tag in tags] == should_match def assert_selects_ids(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ assert [tag['id'] for tag in tags] == should_match class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") assert soup.find("b").string == "2" def test_unicode_text_find(self): soup = self.soup('

Räksmörgås

') assert soup.find(string='Räksmörgås') == 'Räksmörgås' def test_unicode_attribute_find(self): soup = self.soup('

here it is

') str(soup) assert "here it is" == soup.find(id='Räksmörgås').text def test_find_everything(self): """Test an optimization that finds all tags.""" soup = self.soup("foobar") assert 2 == len(soup.find_all()) def test_find_everything_with_name(self): """Test an optimization that finds all tags with a given name.""" soup = self.soup("foobarbaz") assert 2 == len(soup.find_all('a')) class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. assert soup.find_all(string="bar") == ["bar"] assert soup.find_all(text="bar") == ["bar"] # Match any of a number of strings. assert soup.find_all(text=["Foo", "bar"]) == ["Foo", "bar"] # Match a regular expression. assert soup.find_all(text=re.compile('.*')) == ["Foo", "bar", '\xbb'] # Match anything. assert soup.find_all(text=True) == ["Foo", "bar", '\xbb'] def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("12345") self.assert_selects(soup.find_all('a', limit=3), ["1", "2", "3"]) self.assert_selects(soup.find_all('a', limit=1), ["1"]) self.assert_selects( soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assert_selects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assert_selects(soup('a', limit=1), ["1"]) self.assert_selects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): soup = self.soup("") # Create a self-referential list. l = [] l.append(l) # Without special code in _normalize_search_value, this would cause infinite # recursion. assert [] == soup.find_all(l) def test_find_all_resultset(self): """All find_all calls return a ResultSet""" soup = self.soup("") result = soup.find_all("a") assert hasattr(result, "source") result = soup.find_all(True) assert hasattr(result, "source") result = soup.find_all(text="foo") assert hasattr(result, "source") class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('4') assert "4" == soup.find("mathml:msqrt").string assert "a" == soup.find(attrs= { "svg:fill" : "red" }).name class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" def setup_method(self): self.tree = self.soup("""First tag. Second tag. Third Nested tag. tag.""") def test_find_all_by_tag_name(self): # Find all the tags. self.assert_selects( self.tree.find_all('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_name_and_text(self): self.assert_selects( self.tree.find_all('a', text='First tag.'), ['First tag.']) self.assert_selects( self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) self.assert_selects( self.tree.find_all('a', text=re.compile("tag")), ['First tag.', 'Nested tag.']) def test_find_all_on_non_root_element(self): # You can call find_all on any node, not just the root. self.assert_selects(self.tree.c.find_all('a'), ['Nested tag.']) def test_calling_element_invokes_find_all(self): self.assert_selects(self.tree('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_strainer(self): self.assert_selects( self.tree.find_all(SoupStrainer('a')), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_names(self): self.assert_selects( self.tree.find_all(['a', 'b']), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_dict(self): self.assert_selects( self.tree.find_all({'a' : True, 'b' : True}), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_re(self): self.assert_selects( self.tree.find_all(re.compile('^[ab]$')), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_with_tags_matching_method(self): # You can define an oracle method that determines whether # a tag matches the search. def id_matches_name(tag): return tag.name == tag.get('id') tree = self.soup("""Match 1. Does not match. Match 2.""") self.assert_selects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) def test_find_with_multi_valued_attribute(self): soup = self.soup( "
1
2
3
" ) r1 = soup.find('div', 'a d'); r2 = soup.find('div', re.compile(r'a d')); r3, r4 = soup.find_all('div', ['a b', 'a d']); assert '3' == r1.string assert '3' == r2.string assert '1' == r3.string assert '3' == r4.string class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by # attribute. tree = self.soup(""" Matching a. Non-matching Matching b.a. """) self.assert_selects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): peace = "םולש".encode("utf8") data = ''.encode("utf8") soup = self.soup(data) assert [soup.a] == soup.find_all(title=peace) assert [soup.a] == soup.find_all(title=peace.decode("utf8")) assert [soup.a], soup.find_all(title=[peace, "something else"]) def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument # to find_all) and 'class' (a reserved word in Python.) tree = self.soup(""" Name match. Class match. Non-match. A tag called 'name1'. """) # This doesn't do what you want. self.assert_selects(tree.find_all(name='name1'), ["A tag called 'name1'."]) # This does what you want. self.assert_selects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) self.assert_selects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): tree = self.soup(""" Class 1. Class 2. Class 1. Class 3 and 4. """) # Passing in the class_ keyword argument will search against # the 'class' attribute. self.assert_selects(tree.find_all('a', class_='1'), ['Class 1.']) self.assert_selects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) self.assert_selects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) # Passing in a string to 'attrs' will also search the CSS class. self.assert_selects(tree.find_all('a', '1'), ['Class 1.']) self.assert_selects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assert_selects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assert_selects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("Found it") f = tree.find_all("gar", class_=re.compile("o")) self.assert_selects(f, ["Found it"]) f = tree.find_all("gar", class_=re.compile("a")) self.assert_selects(f, ["Found it"]) # If the search fails to match the individual strings "foo" and "bar", # it will be tried against the combined string "foo bar". f = tree.find_all("gar", class_=re.compile("o b")) self.assert_selects(f, ["Found it"]) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("Found it") self.assert_selects(soup.find_all("a", re.compile("ba")), ["Found it"]) def big_attribute_value(value): return len(value) > 3 self.assert_selects(soup.find_all("a", big_attribute_value), []) def small_attribute_value(value): return len(value) <= 3 self.assert_selects( soup.find_all("a", small_attribute_value), ["Found it"]) def test_find_all_with_string_for_attrs_finds_multiple_classes(self): soup = self.soup('') a, a2 = soup.find_all("a") assert [a, a2], soup.find_all("a", "foo") assert [a], soup.find_all("a", "bar") # If you specify the class as a string that contains a # space, only that specific value will be found. assert [a] == soup.find_all("a", class_="foo bar") assert [a] == soup.find_all("a", "foo bar") assert [] == soup.find_all("a", "bar foo") def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" Match. Non-match.""") strainer = SoupStrainer(attrs={'id' : 'first'}) self.assert_selects(tree.find_all(strainer), ['Match.']) def test_find_all_with_missing_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assert_selects(tree.find_all('a', id=None), ["No ID present."]) def test_find_all_with_defined_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that have that attribute set to any value. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assert_selects( tree.find_all(id=True), ["ID present.", "ID is empty."]) def test_find_all_with_numeric_attribute(self): # If you search for a number, it's treated as a string. tree = self.soup("""Unquoted attribute. Quoted attribute.""") expected = ["Unquoted attribute.", "Quoted attribute."] self.assert_selects(tree.find_all(id=1), expected) self.assert_selects(tree.find_all(id="1"), expected) def test_find_all_with_list_attribute_values(self): # You can pass a list of attribute values instead of just one, # and you'll get tags that match any of the values. tree = self.soup("""1 2 3 No ID.""") self.assert_selects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) def test_find_all_with_regular_expression_attribute_value(self): # You can pass a regular expression as an attribute value, and # you'll get tags whose values for that attribute match the # regular expression. tree = self.soup("""One a. Two as. Mixed as and bs. One b. No ID.""") self.assert_selects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) def test_find_by_name_and_containing_string(self): soup = self.soup("foobarfoo") a = soup.a assert [a] == soup.find_all("a", text="foo") assert [] == soup.find_all("a", text="bar") def test_find_by_name_and_containing_string_when_string_is_buried(self): soup = self.soup("foofoo") assert soup.find_all("a") == soup.find_all("a", text="foo") def test_find_by_attribute_and_containing_string(self): soup = self.soup('foofoo') a = soup.a assert [a] == soup.find_all(id=2, text="foo") assert [] == soup.find_all(id=1, text="bar") class TestSmooth(TreeTest): """Test Tag.smooth.""" def test_smooth(self): soup = self.soup("
a
") div = soup.div div.append("b") div.append("c") div.append(Comment("Comment 1")) div.append(Comment("Comment 2")) div.append("d") builder = self.default_builder() span = Tag(soup, builder, 'span') span.append('1') span.append('2') div.append(span) # At this point the tree has a bunch of adjacent # NavigableStrings. This is normal, but it has no meaning in # terms of HTML, so we may want to smooth things out for # output. # Since the tag has two children, its .string is None. assert None == div.span.string assert 7 == len(div.contents) div.smooth() assert 5 == len(div.contents) # The three strings at the beginning of div.contents have been # merged into on string. # assert 'abc' == div.contents[0] # The call is recursive -- the tag was also smoothed. assert '12' == div.span.string # The two comments have _not_ been merged, even though # comments are strings. Merging comments would change the # meaning of the HTML. assert 'Comment 1' == div.contents[1] assert 'Comment 2' == div.contents[2] class TestIndex(TreeTest): """Test Tag.index""" def test_index(self): tree = self.soup("""
Identical Not identical Identical Identical with child Also not identical Identical with child
""") div = tree.div for i, element in enumerate(div.contents): assert i == div.index(element) with pytest.raises(ValueError): tree.index(1) class TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" def setup_method(self): self.tree = self.soup('''