diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-02-14 16:53:14 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-02-14 16:53:14 -0500 |
commit | 34e0ce8a9dd43ada1c55b50a156fbce63b1e2ebb (patch) | |
tree | fdeb487c1f52e32c6eb4761cd2a530a24c10b8b0 | |
parent | 7201eecc09b51df5a0fb704670aa66bcc9d8e635 (diff) |
NavigableString and its subclasses now implement the get_text()
method, as well as the properties .strings and
.stripped_strings. These methods will either return the string
itself, or nothing, so the only reason to use this is when iterating
over a list of mixed Tag and NavigableString objects. [bug=1904309]
-rw-r--r-- | CHANGELOG | 10 |
-rw-r--r-- | bs4/element.py | 122 |
-rw-r--r-- | bs4/tests/test_navigablestring.py | 123 |
-rw-r--r-- | bs4/tests/test_tree.py | 78 |
-rw-r--r-- | doc/source/index.rst | 11 |
5 files changed, 227 insertions, 117 deletions
@@ -1,5 +1,9 @@ = 4.10.0 (unreleased) +Beautiful Soup's official support for Python 2 ended on December 31st, +2020. This release supports both Python 2 and Python 3, but there's no +guarantee that this will hold for the next release. + * The behavior of methods like .get_text() and .strings now differs depending on the type of tag. The change is visible with HTML tags like <script>, <style>, and <template>. Starting in 4.9.0, methods @@ -19,6 +23,12 @@ itself. That's because different tags now have different understandings of what counts as 'text'. [bug=1906226] [bug=1868861] +* NavigableString and its subclasses now implement the get_text() + method, as well as the properties .strings and + .stripped_strings. These methods will either return the string + itself, or nothing, so the only reason to use this is when iterating + over a list of mixed Tag and NavigableString objects. [bug=1904309] + * The 'html5' formatter now treats attributes whose values are the empty string as HTML boolean attributes. Previously (and in other formatters), an attribute value must be set as None to be treated as diff --git a/bs4/element.py b/bs4/element.py index e994678..3428e21 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -255,6 +255,47 @@ class PageElement(object): nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 + default = object() + def _all_strings(self, strip=False, types=default): + """Yield all strings of certain classes, possibly stripping them. + + This is implemented differently in Tag and NavigableString. + """ + raise NotImplementedError() + + @property + def stripped_strings(self): + """Yield all strings in this PageElement, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False, + types=default): + """Get all child strings of this PageElement, concatenated using the + given separator. 
+ + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + def replace_with(self, replace_with): """Replace this PageElement with another one, keeping the rest of the tree the same. @@ -945,7 +986,49 @@ class NavigableString(unicode, PageElement): """Prevent NavigableString.name from ever being set.""" raise AttributeError("A NavigableString cannot be given a name.") - + def _all_strings(self, strip=False, types=PageElement.default): + """Yield all strings of certain classes, possibly stripping them. + + This makes it easy for NavigableString to implement methods + like get_text() as conveniences, creating a consistent + text-extraction API across all PageElements. + + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. If this + NavigableString isn't one of those subclasses, the + sequence will be empty. By default, the subclasses + considered are NavigableString and CData objects. That + means no comments, processing instructions, etc. + + :yield: A sequence that either contains this string, or is empty. + + """ + if types is self.default: + # This is kept in Tag because it's full of subclasses of + # this class, which aren't defined until later in the file. + types = Tag.DEFAULT_INTERESTING_STRING_TYPES + + # Do nothing if the caller is looking for specific types of + # string, and we're of a different type. 
+ my_type = type(self) + if types is not None: + if isinstance(types, type): + # Looking for a single type. + if my_type is not types: + return + elif my_type not in types: + # Looking for one of a list of types. + return + + value = self + if strip: + value = value.strip() + if len(value) > 0: + yield value + strings = property(_all_strings) + class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. @@ -1243,8 +1326,7 @@ class Tag(PageElement): self.append(string.__class__(string)) DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) - default = object() - def _all_strings(self, strip=False, types=default): + def _all_strings(self, strip=False, types=PageElement.default): """Yield all strings of certain classes, possibly stripping them. :param strip: If True, all strings will be stripped before being @@ -1280,42 +1362,8 @@ class Tag(PageElement): if len(descendant) == 0: continue yield descendant - strings = property(_all_strings) - @property - def stripped_strings(self): - """Yield all strings in the document, stripping them first. - - :yield: A sequence of stripped strings. - """ - for string in self._all_strings(True): - yield string - - def get_text(self, separator=u"", strip=False, - types=default): - """Get all child strings, concatenated using the given separator. - - :param separator: Strings will be concatenated using this separator. - - :param strip: If True, strings will be stripped before being - concatenated. - - :param types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, the subclasses considered are the ones found in - self.interesting_string_types. If that's not specified, - only NavigableString and CData objects will be - considered. That means no comments, processing - instructions, etc. - - :return: A string. 
- """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) - def decompose(self): """Recursively destroys this PageElement and its children. diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py new file mode 100644 index 0000000..8b903ea --- /dev/null +++ b/bs4/tests/test_navigablestring.py @@ -0,0 +1,123 @@ +from bs4.testing import SoupTest +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + NavigableString, + Script, + Stylesheet, + TemplateString, +) + +class TestNavigableString(SoupTest): + + def test_text_acquisition_methods(self): + # These methods are intended for use against Tag, but they + # work on NavigableString as well, + eq_ = self.assertEquals + + s = NavigableString("fee ") + cdata = CData("fie ") + comment = Comment("foe ") + + eq_("fee ", s.get_text()) + eq_("fee", s.get_text(strip=True)) + eq_(["fee "], list(s.strings)) + eq_(["fee"], list(s.stripped_strings)) + eq_(["fee "], list(s._all_strings())) + + eq_("fie ", cdata.get_text()) + eq_("fie", cdata.get_text(strip=True)) + eq_(["fie "], list(cdata.strings)) + eq_(["fie"], list(cdata.stripped_strings)) + eq_(["fie "], list(cdata._all_strings())) + + # Since a Comment isn't normally considered 'text', + # these methods generally do nothing. + eq_("", comment.get_text()) + eq_([], list(comment.strings)) + eq_([], list(comment.stripped_strings)) + eq_([], list(comment._all_strings())) + + # Unless you specifically say that comments are okay. + eq_("foe", comment.get_text(strip=True, types=Comment)) + eq_("foe ", comment.get_text(types=(Comment, NavigableString))) + +class TestNavigableStringSubclasses(SoupTest): + + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. 
+ soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEqual(str(soup), "<![CDATA[foo]]>") + self.assertEqual(soup.find(text="foo"), "foo") + self.assertEqual(soup.contents[0], "foo") + + def test_cdata_is_never_formatted(self): + """Text inside a CData object is passed into the formatter. + + But the return value is ignored. + """ + + self.count = 0 + def increment(*args): + self.count += 1 + return "BITTER FAILURE" + + soup = self.soup("") + cdata = CData("<><><>") + soup.insert(1, cdata) + self.assertEqual( + b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) + self.assertEqual(1, self.count) + + def test_doctype_ends_in_newline(self): + # Unlike other NavigableString subclasses, a DOCTYPE always ends + # in a newline. + doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + + def test_declaration(self): + d = Declaration("foo") + self.assertEqual("<?foo?>", d.output_ready()) + + def test_default_string_containers(self): + # In some cases, we use different NavigableString subclasses for + # the same text in different tags. + soup = self.soup( + "<div>text</div><script>text</script><style>text</style>" + ) + self.assertEqual( + [NavigableString, Script, Stylesheet], + [x.__class__ for x in soup.find_all(text=True)] + ) + + # The TemplateString is a little unusual because it's generally found + # _inside_ children of a <template> element, not a direct child of the + # <template> element. + soup = self.soup( + "<template>Some text<p>In a tag</p></template>Some text outside" + ) + assert all( + isinstance(x, TemplateString) + for x in soup.template._all_strings(types=None) + ) + + # Once the <template> tag closed, we went back to using + # NavigableString. 
+ outside = soup.template.next_sibling + assert isinstance(outside, NavigableString) + assert not isinstance(outside, TemplateString) + + # The TemplateString is also unusual because it can contain + # NavigableString subclasses of _other_ types, such as + # Comment. + markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>" + soup = self.soup(markup) + self.assertEqual(markup, soup.template.encode("utf8")) + diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index d1ca5ea..875befe 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1846,84 +1846,6 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) - -class TestNavigableStringSubclasses(SoupTest): - - def test_cdata(self): - # None of the current builders turn CDATA sections into CData - # objects, but you can create them manually. - soup = self.soup("") - cdata = CData("foo") - soup.insert(1, cdata) - self.assertEqual(str(soup), "<![CDATA[foo]]>") - self.assertEqual(soup.find(text="foo"), "foo") - self.assertEqual(soup.contents[0], "foo") - - def test_cdata_is_never_formatted(self): - """Text inside a CData object is passed into the formatter. - - But the return value is ignored. - """ - - self.count = 0 - def increment(*args): - self.count += 1 - return "BITTER FAILURE" - - soup = self.soup("") - cdata = CData("<><><>") - soup.insert(1, cdata) - self.assertEqual( - b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) - self.assertEqual(1, self.count) - - def test_doctype_ends_in_newline(self): - # Unlike other NavigableString subclasses, a DOCTYPE always ends - # in a newline. 
- doctype = Doctype("foo") - soup = self.soup("") - soup.insert(1, doctype) - self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") - - def test_declaration(self): - d = Declaration("foo") - self.assertEqual("<?foo?>", d.output_ready()) - - def test_default_string_containers(self): - # In some cases, we use different NavigableString subclasses for - # the same text in different tags. - soup = self.soup( - "<div>text</div><script>text</script><style>text</style>" - ) - self.assertEqual( - [NavigableString, Script, Stylesheet], - [x.__class__ for x in soup.find_all(text=True)] - ) - - # The TemplateString is a little unusual because it's generally found - # _inside_ children of a <template> element, not a direct child of the - # <template> element. - soup = self.soup( - "<template>Some text<p>In a tag</p></template>Some text outside" - ) - assert all( - isinstance(x, TemplateString) - for x in soup.template._all_strings(types=None) - ) - - # Once the <template> tag closed, we went back to using - # NavigableString. - outside = soup.template.next_sibling - assert isinstance(outside, NavigableString) - assert not isinstance(outside, TemplateString) - - # The TemplateString is also unusual because it can contain - # NavigableString subclasses of _other_ types, such as - # Comment. 
- markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>" - soup = self.soup(markup) - self.assertEqual(markup, soup.template.encode("utf8")) - class TestSoupSelector(TreeTest): diff --git a/doc/source/index.rst b/doc/source/index.rst index 2b5843d..63e74e2 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2312,7 +2312,7 @@ omit the closing slash in HTML void tags like "br":: # b'<br>' In addition, any attributes whose values are the empty string -will become HTML-style boolean attributes: +will become HTML-style boolean attributes:: option = BeautifulSoup('<option selected=""></option>').option print(option.encode(formatter="html")) @@ -2321,6 +2321,8 @@ will become HTML-style boolean attributes: print(option.encode(formatter="html5")) # b'<option selected></option>' +*(This behavior is new as of Beautiful Soup 4.10.0.)* + If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: @@ -2429,9 +2431,14 @@ generator instead, and process the text yourself:: *As of Beautiful Soup version 4.9.0, when lxml or html.parser are in use, the contents of <script>, <style>, and <template> -tags are not considered to be 'text', since those tags are not part of +tags are generally not considered to be 'text', since those tags are not part of the human-visible content of the page.* +*As of Beautiful Soup version 4.10.0, you can call get_text(), +.strings, or .stripped_strings on a NavigableString object. It will +either return the object itself, or nothing, so the only reason to do +this is when you're iterating over a mixed list.* + Specifying the parser to use ============================ |