diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 21:25:16 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 21:25:16 -0400 |
commit | c4ce22b415ab81ba0e3fb4a3fb28f4ce68dccbde (patch) | |
tree | f021dbfe567fea9cbaba09821e9311e71668d5f1 /bs4/element.py | |
parent | 3ad71fceec0002be87306174f0b2464cc2342a7e (diff) |
Methods like get_text() and properties like .strings now only give
you strings that are visible in the document--no comments or
processing instructions. [bug=1050164]
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 20 |
1 files changed, 15 insertions, 5 deletions
```diff
diff --git a/bs4/element.py b/bs4/element.py
index 77c8da0..d58da92 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -873,16 +873,24 @@ class Tag(PageElement):
         self.clear()
         self.append(string.__class__(string))
 
-    def _all_strings(self, strip=False):
-        """Yield all child strings, possibly stripping them."""
+    def _all_strings(self, strip=False, types=(NavigableString, CData)):
+        """Yield all strings of certain classes, possibly stripping them.
+
+        By default, yields only NavigableString and CData objects. So
+        no comments, processing instructions, etc.
+        """
         for descendant in self.descendants:
-            if not isinstance(descendant, NavigableString):
+            if (
+                (types is None and not isinstance(descendant, NavigableString))
+                or
+                (types is not None and type(descendant) not in types)):
                 continue
             if strip:
                 descendant = descendant.strip()
                 if len(descendant) == 0:
                     continue
             yield descendant
+
     strings = property(_all_strings)
 
     @property
@@ -890,11 +898,13 @@ class Tag(PageElement):
             for string in self._all_strings(True):
                 yield string
 
-    def get_text(self, separator=u"", strip=False):
+    def get_text(self, separator=u"", strip=False,
+                 types=(NavigableString, CData)):
         """
         Get all child strings, concatenated using the given separator.
         """
-        return separator.join([s for s in self._all_strings(strip)])
+        return separator.join([s for s in self._all_strings(
+                    strip, types=types)])
 
     getText = get_text
     text = property(get_text)
```