The behavior of methods like .get_text() and .strings now differs depending on the type of tag.

The change is visible with HTML tags like <script>, <style>, and <template>. Starting in 4.9.0, methods like get_text() returned no results on such tags, because the contents of those tags are not considered 'text' within the document as a whole.

But a user who calls script.get_text() is working from a different definition of 'text' than a user who calls div.get_text()--otherwise there would be no need to call script.get_text() at all. In 4.10.0, the contents of (e.g.) a <script> tag are considered 'text' during a get_text() call on the tag itself, but not considered 'text' during a get_text() call on the tag's parent.

Because of this change, calling get_text() on each child of a tag may now return a different result than calling get_text() on the tag itself. That's because different tags now have different understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
author: Leonard Richardson <leonardr@segfault.org> 2021-02-13 16:43:34 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-02-13 16:43:34 -0500
commit: c876fbf402f15d924b7c0d9a9be5ba80769444a3 (patch)
tree: d2589d7db86200d17cb05e949f7fe09a439e53b2
parent: 185ec704743ffa0dfd95b7a29e2f5d38a25433b5 (diff)
3 files changed, 101 insertions, 17 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 288a276..9cddc55 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,24 @@
-= 4.9.4 (unreleased)
-
+= 4.10.0 (unreleased)
+
+* The behavior of methods like .get_text() and .strings now differs
+  depending on the type of tag. The change is visible with HTML tags
+  like <script>, <style>, and <template>. Starting in 4.9.0, methods
+  like get_text() returned no results on such tags, because the
+  contents of those tags are not considered 'text' within the document
+  as a whole.
+
+  But a user who calls script.get_text() is working from a different
+  definition of 'text' than a user who calls div.get_text()--otherwise
+  there would be no need to call script.get_text() at all. In 4.10.0,
+  the contents of (e.g.) a <script> tag are considered 'text' during a
+  get_text() call on the tag itself, but not considered 'text' during
+  a get_text() call on the tag's parent.
+
+  Because of this change, calling get_text() on each child of a tag
+  may now return a different result than calling get_text() on the tag
+  itself. That's because different tags now have different
+  understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
+	
 * Corrected output when the namespace prefix associated with a
   namespaced attribute is the empty string, as opposed to
   None. [bug=1915583]
diff --git a/bs4/element.py b/bs4/element.py
index 4d9c150..e994678 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1069,7 +1069,8 @@ class Tag(PageElement):
                  prefix=None, attrs=None, parent=None, previous=None,
                  is_xml=None, sourceline=None, sourcepos=None,
                  can_be_empty_element=None, cdata_list_attributes=None,
-                 preserve_whitespace_tags=None
+                 preserve_whitespace_tags=None,
+                 interesting_string_types=None,
     ):
         """Basic constructor.
 
@@ -1095,6 +1096,13 @@ class Tag(PageElement):
             be treated as CDATA if they ever show up on this tag.
         :param preserve_whitespace_tags: A list of tag names whose contents
             should have their whitespace preserved.
+        :param interesting_string_types: This is a NavigableString
+            subclass or a tuple of them. When iterating over this
+            Tag's strings in methods like Tag.strings or Tag.get_text,
+            these are the types of strings that are interesting enough
+            to be considered. The default is to consider
+            NavigableString and CData the only interesting string
+            subtypes.
         """
         if parser is None:
             self.parser_class = None
@@ -1140,6 +1148,7 @@ class Tag(PageElement):
             self.can_be_empty_element = can_be_empty_element
             self.cdata_list_attributes = cdata_list_attributes
             self.preserve_whitespace_tags = preserve_whitespace_tags
+            self.interesting_string_types = interesting_string_types
         else:
             # Set up any substitutions for this tag, such as the charset in a META tag.
             builder.set_up_substitutions(self)
@@ -1160,6 +1169,13 @@ class Tag(PageElement):
             # Keep track of the names that might cause this tag to be treated as a
             # whitespace-preserved tag.
             self.preserve_whitespace_tags = builder.preserve_whitespace_tags
+
+            if self.name in builder.string_containers:
+                # This sort of tag keeps most of its strings in a special
+                # container subclass; treat that subclass as this tag's text.
+                self.interesting_string_types = builder.string_containers[self.name]
+            else:
+                self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
             
     parserClass = _alias("parser_class")  # BS3
 
@@ -1226,25 +1242,38 @@ class Tag(PageElement):
         self.clear()
         self.append(string.__class__(string))
 
-    def _all_strings(self, strip=False, types=(NavigableString, CData)):
+    DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
+    default = object()
+    def _all_strings(self, strip=False, types=default):
         """Yield all strings of certain classes, possibly stripping them.
 
         :param strip: If True, all strings will be stripped before being
             yielded.
 
-        :types: A tuple of NavigableString subclasses. Any strings of
+        :param types: A NavigableString subclass or a tuple of them. Strings of
             a subclass not found in this list will be ignored. By
-            default, this means only NavigableString and CData objects
-            will be considered. So no comments, processing instructions,
-            etc.
+            default, the subclasses considered are the ones found in
+            self.interesting_string_types. If that's not specified,
+            only NavigableString and CData objects will be
+            considered. That means no comments, processing
+            instructions, etc.
 
         :yield: A sequence of strings.
+
         """
+        if types is self.default:
+            types = self.interesting_string_types
+
         for descendant in self.descendants:
-            if (
-                (types is None and not isinstance(descendant, NavigableString))
-                or
-                (types is not None and type(descendant) not in types)):
+            if (types is None and not isinstance(descendant, NavigableString)):
+                continue
+            descendant_type = type(descendant)
+            if isinstance(types, type):
+                if descendant_type is not types:
+                    # We're not interested in strings of this type.
+                    continue
+            elif types is not None and descendant_type not in types:
+                # We're not interested in strings of this type.
                 continue
             if strip:
                 descendant = descendant.strip()
@@ -1264,7 +1293,7 @@ class Tag(PageElement):
             yield string
 
     def get_text(self, separator=u"", strip=False,
-                 types=(NavigableString, CData)):
+                 types=default):
         """Get all child strings, concatenated using the given separator.
 
         :param separator: Strings will be concatenated using this separator.
@@ -1272,11 +1301,13 @@ class Tag(PageElement):
         :param strip: If True, strings will be stripped before being
             concatenated.
 
-        :types: A tuple of NavigableString subclasses. Any strings of
+        :param types: A NavigableString subclass or a tuple of them. Strings of
             a subclass not found in this list will be ignored. By
-            default, this means only NavigableString and CData objects
-            will be considered. So no comments, processing instructions,
-            stylesheets, etc.
+            default, the subclasses considered are the ones found in
+            self.interesting_string_types. If that's not specified,
+            only NavigableString and CData objects will be
+            considered. That means no comments, processing
+            instructions, etc.
 
         :return: A string.
         """
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 1bd1577..9267a8f 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1440,6 +1440,40 @@ class TestElementObjects(SoupTest):
         soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
         self.assertEqual(['foo', 'bar'], list(soup.strings))
 
+    def test_string_methods_inside_special_string_container_tags(self):
+        # Strings inside tags like <script> are generally ignored by
+        # methods like get_text, because they're not what humans
+        # consider 'text'. But if you call get_text on the <script>
+        # tag itself, those strings _are_ considered to be 'text',
+        # because there's nothing else you might be looking for.
+        
+        style = self.soup("<div>a<style>Some CSS</style></div>")
+        template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
+        script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
+        
+        self.assertEqual(style.div.get_text(), "a")
+        self.assertEqual(list(style.div.strings), ["a"])
+        self.assertEqual(style.div.style.get_text(), "Some CSS")
+        self.assertEqual(list(style.div.style.strings),
+                         ['Some CSS'])
+        
+        # The comment is not picked up here. That's because it was
+        # parsed into a Comment object, which is not considered
+        # interesting by template.strings.
+        self.assertEqual(template.div.get_text(), "a")
+        self.assertEqual(list(template.div.strings), ["a"])
+        self.assertEqual(template.div.template.get_text(), "Templated text.")
+        self.assertEqual(list(template.div.template.strings),
+                         ["Templated ", "text", "."])
+
+        # The comment is included here, because it didn't get parsed
+        # into a Comment object--it's part of the Script string.
+        self.assertEqual(script.div.get_text(), "a")
+        self.assertEqual(list(script.div.strings), ["a"])
+        self.assertEqual(script.div.script.get_text(),
+                         "<!--a comment-->Some text")
+        self.assertEqual(list(script.div.script.strings),
+                         ['<!--a comment-->Some text'])
 
 class TestCDAtaListAttributes(SoupTest):
author	Leonard Richardson <leonardr@segfault.org>	2021-02-13 16:43:34 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-02-13 16:43:34 -0500
commit	c876fbf402f15d924b7c0d9a9be5ba80769444a3 (patch)
tree	d2589d7db86200d17cb05e949f7fe09a439e53b2
parent	185ec704743ffa0dfd95b7a29e2f5d38a25433b5 (diff)