diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-02-13 16:43:34 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-02-13 16:43:34 -0500 |
commit | c876fbf402f15d924b7c0d9a9be5ba80769444a3 (patch) | |
tree | d2589d7db86200d17cb05e949f7fe09a439e53b2 | |
parent | 185ec704743ffa0dfd95b7a29e2f5d38a25433b5 (diff) |
The behavior of methods like .get_text() and .strings now differs
depending on the type of tag. The change is visible with HTML tags
like <script>, <style>, and <template>. Starting in 4.9.0, methods
like get_text() returned no results on such tags, because the
contents of those tags are not considered 'text' within the document
as a whole.
But a user who calls script.get_text() is working from a different
definition of 'text' than a user who calls div.get_text()--otherwise
there would be no need to call script.get_text() at all. In 4.10.0,
the contents of (e.g.) a <script> tag are considered 'text' during a
get_text() call on the tag itself, but not considered 'text' during
a get_text() call on the tag's parent.
Because of this change, calling get_text() on each child of a tag
may now return a different result than calling get_text() on the tag
itself. That's because different tags now have different
understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
-rw-r--r-- | CHANGELOG | 23 | ||||
-rw-r--r-- | bs4/element.py | 61 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 34 |
3 files changed, 101 insertions, 17 deletions
@@ -1,5 +1,24 @@ -= 4.9.4 (unreleased) - += 4.10.0 (unreleased) + +* The behavior of methods like .get_text() and .strings now differs + depending on the type of tag. The change is visible with HTML tags + like <script>, <style>, and <template>. Starting in 4.9.0, methods + like get_text() returned no results on such tags, because the + contents of those tags are not considered 'text' within the document + as a whole. + + But a user who calls script.get_text() is working from a different + definition of 'text' than a user who calls div.get_text()--otherwise + there would be no need to call script.get_text() at all. In 4.10.0, + the contents of (e.g.) a <script> tag are considered 'text' during a + get_text() call on the tag itself, but not considered 'text' during + a get_text() call on the tag's parent. + + Because of this change, calling get_text() on each child of a tag + may now return a different result than calling get_text() on the tag + itself. That's because different tags now have different + understandings of what counts as 'text'. [bug=1906226] [bug=1868861] + * Corrected output when the namespace prefix associated with a namespaced attribute is the empty string, as opposed to None. [bug=1915583] diff --git a/bs4/element.py b/bs4/element.py index 4d9c150..e994678 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1069,7 +1069,8 @@ class Tag(PageElement): prefix=None, attrs=None, parent=None, previous=None, is_xml=None, sourceline=None, sourcepos=None, can_be_empty_element=None, cdata_list_attributes=None, - preserve_whitespace_tags=None + preserve_whitespace_tags=None, + interesting_string_types=None, ): """Basic constructor. @@ -1095,6 +1096,13 @@ class Tag(PageElement): be treated as CDATA if they ever show up on this tag. :param preserve_whitespace_tags: A list of tag names whose contents should have their whitespace preserved. + :param interesting_string_types: This is a NavigableString + subclass or a tuple of them. When iterating over this + Tag's strings in methods like Tag.strings or Tag.get_text, + these are the types of strings that are interesting enough + to be considered. The default is to consider + NavigableString and CData the only interesting string + subtypes. """ if parser is None: self.parser_class = None @@ -1140,6 +1148,7 @@ class Tag(PageElement): self.can_be_empty_element = can_be_empty_element self.cdata_list_attributes = cdata_list_attributes self.preserve_whitespace_tags = preserve_whitespace_tags + self.interesting_string_types = interesting_string_types else: # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) @@ -1160,6 +1169,13 @@ class Tag(PageElement): # Keep track of the names that might cause this tag to be treated as a # whitespace-preserved tag. self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + if self.name in builder.string_containers: + # This sort of tag uses a special string container + # subclass for most of its strings. When we ask the + self.interesting_string_types = builder.string_containers[self.name] + else: + self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES parserClass = _alias("parser_class") # BS3 @@ -1226,25 +1242,38 @@ class Tag(PageElement): self.clear() self.append(string.__class__(string)) - def _all_strings(self, strip=False, types=(NavigableString, CData)): + DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) + default = object() + def _all_strings(self, strip=False, types=default): """Yield all strings of certain classes, possibly stripping them. :param strip: If True, all strings will be stripped before being yielded. - :types: A tuple of NavigableString subclasses. Any strings of + :param types: A tuple of NavigableString subclasses. Any strings of a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - etc. + default, the subclasses considered are the ones found in + self.interesting_string_types. If that's not specified, + only NavigableString and CData objects will be + considered. That means no comments, processing + instructions, etc. :yield: A sequence of strings. + """ + if types is self.default: + types = self.interesting_string_types + for descendant in self.descendants: - if ( - (types is None and not isinstance(descendant, NavigableString)) - or - (types is not None and type(descendant) not in types)): + if (types is None and not isinstance(descendant, NavigableString)): + continue + descendant_type = type(descendant) + if isinstance(types, type): + if descendant_type is not types: + # We're not interested in strings of this type. + continue + elif types is not None and descendant_type not in types: + # We're not interested in strings of this type. continue if strip: descendant = descendant.strip() @@ -1264,7 +1293,7 @@ class Tag(PageElement): yield string def get_text(self, separator=u"", strip=False, - types=(NavigableString, CData)): + types=default): """Get all child strings, concatenated using the given separator. :param separator: Strings will be concatenated using this separator. @@ -1272,11 +1301,13 @@ class Tag(PageElement): :param strip: If True, strings will be stripped before being concatenated. - :types: A tuple of NavigableString subclasses. Any strings of + :param types: A tuple of NavigableString subclasses. Any strings of a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - stylesheets, etc. + default, the subclasses considered are the ones found in + self.interesting_string_types. If that's not specified, + only NavigableString and CData objects will be + considered. That means no comments, processing + instructions, etc. :return: A string. """ diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 1bd1577..9267a8f 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1440,6 +1440,40 @@ class TestElementObjects(SoupTest): soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") self.assertEqual(['foo', 'bar'], list(soup.strings)) + def test_string_methods_inside_special_string_container_tags(self): + # Strings inside tags like <script> are generally ignored by + # methods like get_text, because they're not what humans + # consider 'text'. But if you call get_text on the <script> + # tag itself, those strings _are_ considered to be 'text', + # because there's nothing else you might be looking for. + + style = self.soup("<div>a<style>Some CSS</style></div>") + template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>") + script = self.soup("<div>a<script><!--a comment-->Some text</script></div>") + + self.assertEqual(style.div.get_text(), "a") + self.assertEqual(list(style.div.strings), ["a"]) + self.assertEqual(style.div.style.get_text(), "Some CSS") + self.assertEqual(list(style.div.style.strings), + ['Some CSS']) + + # The comment is not picked up here. That's because it was + # parsed into a Comment object, which is not considered + # interesting by template.strings. + self.assertEqual(template.div.get_text(), "a") + self.assertEqual(list(template.div.strings), ["a"]) + self.assertEqual(template.div.template.get_text(), "Templated text.") + self.assertEqual(list(template.div.template.strings), + ["Templated ", "text", "."]) + + # The comment is included here, because it didn't get parsed + # into a Comment object--it's part of the Script string. + self.assertEqual(script.div.get_text(), "a") + self.assertEqual(list(script.div.strings), ["a"]) + self.assertEqual(script.div.script.get_text(), + "<!--a comment-->Some text") + self.assertEqual(list(script.div.script.strings), + ['<!--a comment-->Some text']) class TestCDAtaListAttributes(SoupTest): |