summary | refs | log | tree | commit | diff
path: root/bs4/element.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org>, 2021-02-14 16:53:14 -0500
committer: Leonard Richardson <leonardr@segfault.org>, 2021-02-14 16:53:14 -0500
commit: 34e0ce8a9dd43ada1c55b50a156fbce63b1e2ebb (patch)
tree: fdeb487c1f52e32c6eb4761cd2a530a24c10b8b0 /bs4/element.py
parent: 7201eecc09b51df5a0fb704670aa66bcc9d8e635 (diff)
NavigableString and its subclasses now implement the get_text()
method, as well as the properties .strings and .stripped_strings. These methods will either return the string itself, or nothing, so the only reason to use this is when iterating over a list of mixed Tag and NavigableString objects. [bug=1904309]
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- bs4/element.py 122
1 files changed, 85 insertions, 37 deletions
diff --git a/bs4/element.py b/bs4/element.py
index e994678..3428e21 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -255,6 +255,47 @@ class PageElement(object):
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
+ default = object()
+ def _all_strings(self, strip=False, types=default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This is implemented differently in Tag and NavigableString.
+ """
+ raise NotImplementedError()
+
+ @property
+ def stripped_strings(self):
+ """Yield all strings in this PageElement, stripping them first.
+
+ :yield: A sequence of stripped strings.
+ """
+ for string in self._all_strings(True):
+ yield string
+
+ def get_text(self, separator=u"", strip=False,
+ types=default):
+ """Get all child strings of this PageElement, concatenated using the
+ given separator.
+
+ :param separator: Strings will be concatenated using this separator.
+
+ :param strip: If True, strings will be stripped before being
+ concatenated.
+
+ :param types: A tuple of NavigableString subclasses. Any
+ strings of a subclass not found in this list will be
+ ignored. Although there are exceptions, the default
+ behavior in most cases is to consider only NavigableString
+ and CData objects. That means no comments, processing
+ instructions, etc.
+
+ :return: A string.
+ """
+ return separator.join([s for s in self._all_strings(
+ strip, types=types)])
+ getText = get_text
+ text = property(get_text)
+
def replace_with(self, replace_with):
"""Replace this PageElement with another one, keeping the rest of the
tree the same.
@@ -945,7 +986,49 @@ class NavigableString(unicode, PageElement):
"""Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")
-
+ def _all_strings(self, strip=False, types=PageElement.default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This makes it easy for NavigableString to implement methods
+ like get_text() as conveniences, creating a consistent
+ text-extraction API across all PageElements.
+
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. If this
+ NavigableString isn't one of those subclasses, the
+ sequence will be empty. By default, the subclasses
+ considered are NavigableString and CData objects. That
+ means no comments, processing instructions, etc.
+
+ :yield: A sequence that either contains this string, or is empty.
+
+ """
+ if types is self.default:
+ # This is kept in Tag because it's full of subclasses of
+ # this class, which aren't defined until later in the file.
+ types = Tag.DEFAULT_INTERESTING_STRING_TYPES
+
+ # Do nothing if the caller is looking for specific types of
+ # string, and we're of a different type.
+ my_type = type(self)
+ if types is not None:
+ if isinstance(types, type):
+ # Looking for a single type.
+ if my_type is not types:
+ return
+ elif my_type not in types:
+ # Looking for one of a list of types.
+ return
+
+ value = self
+ if strip:
+ value = value.strip()
+ if len(value) > 0:
+ yield value
+ strings = property(_all_strings)
+
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
@@ -1243,8 +1326,7 @@ class Tag(PageElement):
self.append(string.__class__(string))
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
- default = object()
- def _all_strings(self, strip=False, types=default):
+ def _all_strings(self, strip=False, types=PageElement.default):
"""Yield all strings of certain classes, possibly stripping them.
:param strip: If True, all strings will be stripped before being
@@ -1280,42 +1362,8 @@ class Tag(PageElement):
if len(descendant) == 0:
continue
yield descendant
-
strings = property(_all_strings)
- @property
- def stripped_strings(self):
- """Yield all strings in the document, stripping them first.
-
- :yield: A sequence of stripped strings.
- """
- for string in self._all_strings(True):
- yield string
-
- def get_text(self, separator=u"", strip=False,
- types=default):
- """Get all child strings, concatenated using the given separator.
-
- :param separator: Strings will be concatenated using this separator.
-
- :param strip: If True, strings will be stripped before being
- concatenated.
-
- :param types: A tuple of NavigableString subclasses. Any strings of
- a subclass not found in this list will be ignored. By
- default, the subclasses considered are the ones found in
- self.interesting_string_types. If that's not specified,
- only NavigableString and CData objects will be
- considered. That means no comments, processing
- instructions, etc.
-
- :return: A string.
- """
- return separator.join([s for s in self._all_strings(
- strip, types=types)])
- getText = get_text
- text = property(get_text)
-
def decompose(self):
"""Recursively destroys this PageElement and its children.