diff options
Diffstat (limited to 'bs4/tests')
-rw-r--r-- | bs4/tests/test_html5lib.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 35 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 39 |
3 files changed, 78 insertions, 2 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 6446f84..7b0a6d4 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -182,3 +182,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): soup = self.soup(markup, store_line_numbers=False) self.assertEqual("sourceline", soup.p.sourceline.name) self.assertEqual("sourcepos", soup.p.sourcepos.name) + + def test_special_string_containers(self): + # The html5lib tree builder doesn't support this standard feature, + # because there's no way of knowing, when a string is created, + # where in the tree it will eventually end up. + pass diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index dc88662..8d0583c 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -73,6 +73,7 @@ class TestConstructor(SoupTest): self.store_line_numbers = False self.cdata_list_attributes = [] self.preserve_whitespace_tags = [] + self.string_containers = {} def initialize_soup(self, soup): pass def feed(self, markup): @@ -186,7 +187,41 @@ class TestConstructor(SoupTest): isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.recursiveChildGenerator() ) + + def test_alternate_string_containers(self): + # Test the ability to customize the string containers for + # different types of tags. + class PString(NavigableString): + pass + + class BString(NavigableString): + pass + + soup = self.soup( + "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text", + string_containers = { + 'b': BString, + 'p': PString, + } + ) + + # The string before the <p> tag is a regular NavigableString. + assert isinstance(soup.div.contents[0], NavigableString) + # The string inside the <p> tag, but not inside the <i> tag, + # is a PString. + assert isinstance(soup.p.contents[0], PString) + + # Every string inside the <b> tag is a BString, even the one that + # was also inside an <i> tag. + for s in soup.b.strings: + assert isinstance(s, BString) + + # Now that parsing was complete, the string_container_stack + # (where this information was kept) has been cleared out. + self.assertEqual([], soup.string_container_stack) + + class TestWarnings(SoupTest): def _no_parser_specified(self, s, is_there=True): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 80aaaff..7ecab9e 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -27,8 +27,11 @@ from bs4.element import ( Doctype, Formatter, NavigableString, + Script, SoupStrainer, + Stylesheet, Tag, + TemplateString, ) from bs4.testing import ( SoupTest, @@ -1408,7 +1411,7 @@ class TestElementObjects(SoupTest): self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") - def test_get_text_ignores_comments(self): + def test_get_text_ignores_special_string_containers(self): soup = self.soup("foo<!--IGNORE-->bar") self.assertEqual(soup.get_text(), "foobar") @@ -1417,10 +1420,17 @@ class TestElementObjects(SoupTest): self.assertEqual( soup.get_text(types=None), "fooIGNOREbar") - def test_all_strings_ignores_comments(self): + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + self.assertEqual(soup.get_text(), "foobar") + + def test_all_strings_ignores_special_string_containers(self): soup = self.soup("foo<!--IGNORE-->bar") self.assertEqual(['foo', 'bar'], list(soup.strings)) + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + self.assertEqual(['foo', 'bar'], list(soup.strings)) + + class TestCDAtaListAttributes(SoupTest): """Testing cdata-list attributes like 'class'. @@ -1874,6 +1884,31 @@ class TestNavigableStringSubclasses(SoupTest): d = Declaration("foo") self.assertEqual("<?foo?>", d.output_ready()) + def test_default_string_containers(self): + # In some cases, we use different NavigableString subclasses for + # the same text in different tags. + soup = self.soup( + "<div>text</div><script>text</script><style>text</style>" + ) + self.assertEqual( + [NavigableString, Script, Stylesheet], + [x.__class__ for x in soup.find_all(text=True)] + ) + + # The TemplateString is a little unusual because it's generally found + # _inside_ children of a <template> element, not a direct child of the + # <template> element. + soup = self.soup( + "<template>Some text<p>In a tag</p></template>Some text outside" + ) + assert all(isinstance(x, TemplateString) for x in soup.template.strings) + + # Once the <template> tag closed, we went back to using + # NavigableString. + outside = soup.template.next_sibling + assert isinstance(outside, NavigableString) + assert not isinstance(outside, TemplateString) + class TestSoupSelector(TreeTest): HTML = """ |