summary refs log tree commit diff
path: root/bs4/tests
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/tests')
-rw-r--r-- bs4/tests/test_html5lib.py 6
-rw-r--r-- bs4/tests/test_soup.py 35
-rw-r--r-- bs4/tests/test_tree.py 39
3 files changed, 78 insertions, 2 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 6446f84..7b0a6d4 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -182,3 +182,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+ def test_special_string_containers(self):
+ # The html5lib tree builder doesn't support this standard feature,
+ # because there's no way of knowing, when a string is created,
+ # where in the tree it will eventually end up.
+ pass
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index dc88662..8d0583c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -73,6 +73,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
+ self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@@ -186,7 +187,41 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
+
+ def test_alternate_string_containers(self):
+ # Test the ability to customize the string containers for
+ # different types of tags.
+ class PString(NavigableString):
+ pass
+
+ class BString(NavigableString):
+ pass
+
+ soup = self.soup(
+ "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+ string_containers = {
+ 'b': BString,
+ 'p': PString,
+ }
+ )
+
+ # The string before the <p> tag is a regular NavigableString.
+ assert isinstance(soup.div.contents[0], NavigableString)
+ # The string inside the <p> tag, but not inside the <b> tag,
+ # is a PString.
+ assert isinstance(soup.p.contents[0], PString)
+
+ # Every string inside the <b> tag is a BString, even the one that
+ # was also inside an <i> tag.
+ for s in soup.b.strings:
+ assert isinstance(s, BString)
+
+ # Now that parsing is complete, the string_container_stack
+ # (where this information was kept) has been cleared out.
+ self.assertEqual([], soup.string_container_stack)
+
+
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 80aaaff..7ecab9e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -27,8 +27,11 @@ from bs4.element import (
Doctype,
Formatter,
NavigableString,
+ Script,
SoupStrainer,
+ Stylesheet,
Tag,
+ TemplateString,
)
from bs4.testing import (
SoupTest,
@@ -1408,7 +1411,7 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
- def test_get_text_ignores_comments(self):
+ def test_get_text_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
@@ -1417,10 +1420,17 @@ class TestElementObjects(SoupTest):
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
- def test_all_strings_ignores_comments(self):
+ soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(soup.get_text(), "foobar")
+
+ def test_all_strings_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
+ soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
@@ -1874,6 +1884,31 @@ class TestNavigableStringSubclasses(SoupTest):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
+ def test_default_string_containers(self):
+ # In some cases, we use different NavigableString subclasses for
+ # the same text in different tags.
+ soup = self.soup(
+ "<div>text</div><script>text</script><style>text</style>"
+ )
+ self.assertEqual(
+ [NavigableString, Script, Stylesheet],
+ [x.__class__ for x in soup.find_all(text=True)]
+ )
+
+ # The TemplateString is a little unusual because it's generally found
+ # _inside_ children of a <template> element, not a direct child of the
+ # <template> element.
+ soup = self.soup(
+ "<template>Some text<p>In a tag</p></template>Some text outside"
+ )
+ assert all(isinstance(x, TemplateString) for x in soup.template.strings)
+
+ # Once the <template> tag closed, we went back to using
+ # NavigableString.
+ outside = soup.template.next_sibling
+ assert isinstance(outside, NavigableString)
+ assert not isinstance(outside, TemplateString)
+
class TestSoupSelector(TreeTest):
HTML = """