summary refs log tree commit diff
path: root/bs4/element.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2013-05-06 21:25:16 -0400
committer Leonard Richardson <leonardr@segfault.org> 2013-05-06 21:25:16 -0400
commit c4ce22b415ab81ba0e3fb4a3fb28f4ce68dccbde (patch)
tree f021dbfe567fea9cbaba09821e9311e71668d5f1 /bs4/element.py
parent 3ad71fceec0002be87306174f0b2464cc2342a7e (diff)
Methods like get_text() and properties like .strings now only give
you strings that are visible in the document -- no comments or processing instructions. [bug=1050164]
Diffstat (limited to 'bs4/element.py')
-rw-r--r--bs4/element.py20
1 files changed, 15 insertions, 5 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 77c8da0..d58da92 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -873,16 +873,24 @@ class Tag(PageElement):
self.clear()
self.append(string.__class__(string))
- def _all_strings(self, strip=False):
- """Yield all child strings, possibly stripping them."""
+ def _all_strings(self, strip=False, types=(NavigableString, CData)):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ By default, yields only NavigableString and CData objects. So
+ no comments, processing instructions, etc.
+ """
for descendant in self.descendants:
- if not isinstance(descendant, NavigableString):
+ if (
+ (types is None and not isinstance(descendant, NavigableString))
+ or
+ (types is not None and type(descendant) not in types)):
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
continue
yield descendant
+
strings = property(_all_strings)
@property
@@ -890,11 +898,13 @@ class Tag(PageElement):
for string in self._all_strings(True):
yield string
- def get_text(self, separator=u"", strip=False):
+ def get_text(self, separator=u"", strip=False,
+ types=(NavigableString, CData)):
"""
Get all child strings, concatenated using the given separator.
"""
- return separator.join([s for s in self._all_strings(strip)])
+ return separator.join([s for s in self._all_strings(
+ strip, types=types)])
getText = get_text
text = property(get_text)