summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/element.py 122
-rw-r--r-- bs4/tests/test_navigablestring.py 123
-rw-r--r-- bs4/tests/test_tree.py 78
3 files changed, 208 insertions, 115 deletions
diff --git a/bs4/element.py b/bs4/element.py
index e994678..3428e21 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -255,6 +255,47 @@ class PageElement(object):
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
+ default = object()
+ def _all_strings(self, strip=False, types=default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This is implemented differently in Tag and NavigableString.
+ """
+ raise NotImplementedError()
+
+ @property
+ def stripped_strings(self):
+ """Yield all strings in this PageElement, stripping them first.
+
+ :yield: A sequence of stripped strings.
+ """
+ for string in self._all_strings(True):
+ yield string
+
+ def get_text(self, separator=u"", strip=False,
+ types=default):
+ """Get all child strings of this PageElement, concatenated using the
+ given separator.
+
+ :param separator: Strings will be concatenated using this separator.
+
+ :param strip: If True, strings will be stripped before being
+ concatenated.
+
+ :param types: A tuple of NavigableString subclasses. Any
+ strings of a subclass not found in this list will be
+ ignored. Although there are exceptions, the default
+ behavior in most cases is to consider only NavigableString
+ and CData objects. That means no comments, processing
+ instructions, etc.
+
+ :return: A string.
+ """
+ return separator.join([s for s in self._all_strings(
+ strip, types=types)])
+ getText = get_text
+ text = property(get_text)
+
def replace_with(self, replace_with):
"""Replace this PageElement with another one, keeping the rest of the
tree the same.
@@ -945,7 +986,49 @@ class NavigableString(unicode, PageElement):
"""Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")
-
+ def _all_strings(self, strip=False, types=PageElement.default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This makes it easy for NavigableString to implement methods
+ like get_text() as conveniences, creating a consistent
+ text-extraction API across all PageElements.
+
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. If this
+ NavigableString isn't one of those subclasses, the
+ sequence will be empty. By default, the subclasses
+ considered are NavigableString and CData objects. That
+ means no comments, processing instructions, etc.
+
+ :yield: A sequence that either contains this string, or is empty.
+
+ """
+ if types is self.default:
+ # This is kept in Tag because it's full of subclasses of
+ # this class, which aren't defined until later in the file.
+ types = Tag.DEFAULT_INTERESTING_STRING_TYPES
+
+ # Do nothing if the caller is looking for specific types of
+ # string, and we're of a different type.
+ my_type = type(self)
+ if types is not None:
+ if isinstance(types, type):
+ # Looking for a single type.
+ if my_type is not types:
+ return
+ elif my_type not in types:
+ # Looking for one of a list of types.
+ return
+
+ value = self
+ if strip:
+ value = value.strip()
+ if len(value) > 0:
+ yield value
+ strings = property(_all_strings)
+
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
@@ -1243,8 +1326,7 @@ class Tag(PageElement):
self.append(string.__class__(string))
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
- default = object()
- def _all_strings(self, strip=False, types=default):
+ def _all_strings(self, strip=False, types=PageElement.default):
"""Yield all strings of certain classes, possibly stripping them.
:param strip: If True, all strings will be stripped before being
@@ -1280,42 +1362,8 @@ class Tag(PageElement):
if len(descendant) == 0:
continue
yield descendant
-
strings = property(_all_strings)
- @property
- def stripped_strings(self):
- """Yield all strings in the document, stripping them first.
-
- :yield: A sequence of stripped strings.
- """
- for string in self._all_strings(True):
- yield string
-
- def get_text(self, separator=u"", strip=False,
- types=default):
- """Get all child strings, concatenated using the given separator.
-
- :param separator: Strings will be concatenated using this separator.
-
- :param strip: If True, strings will be stripped before being
- concatenated.
-
- :param types: A tuple of NavigableString subclasses. Any strings of
- a subclass not found in this list will be ignored. By
- default, the subclasses considered are the ones found in
- self.interesting_string_types. If that's not specified,
- only NavigableString and CData objects will be
- considered. That means no comments, processing
- instructions, etc.
-
- :return: A string.
- """
- return separator.join([s for s in self._all_strings(
- strip, types=types)])
- getText = get_text
- text = property(get_text)
-
def decompose(self):
"""Recursively destroys this PageElement and its children.
diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py
new file mode 100644
index 0000000..8b903ea
--- /dev/null
+++ b/bs4/tests/test_navigablestring.py
@@ -0,0 +1,123 @@
+from bs4.testing import SoupTest
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ NavigableString,
+ Script,
+ Stylesheet,
+ TemplateString,
+)
+
+class TestNavigableString(SoupTest):
+
+ def test_text_acquisition_methods(self):
+ # These methods are intended for use against Tag, but they
+ # work on NavigableString as well,
+ eq_ = self.assertEquals
+
+ s = NavigableString("fee ")
+ cdata = CData("fie ")
+ comment = Comment("foe ")
+
+ eq_("fee ", s.get_text())
+ eq_("fee", s.get_text(strip=True))
+ eq_(["fee "], list(s.strings))
+ eq_(["fee"], list(s.stripped_strings))
+ eq_(["fee "], list(s._all_strings()))
+
+ eq_("fie ", cdata.get_text())
+ eq_("fie", cdata.get_text(strip=True))
+ eq_(["fie "], list(cdata.strings))
+ eq_(["fie"], list(cdata.stripped_strings))
+ eq_(["fie "], list(cdata._all_strings()))
+
+ # Since a Comment isn't normally considered 'text',
+ # these methods generally do nothing.
+ eq_("", comment.get_text())
+ eq_([], list(comment.strings))
+ eq_([], list(comment.stripped_strings))
+ eq_([], list(comment._all_strings()))
+
+ # Unless you specifically say that comments are okay.
+ eq_("foe", comment.get_text(strip=True, types=Comment))
+ eq_("foe ", comment.get_text(types=(Comment, NavigableString)))
+
+class TestNavigableStringSubclasses(SoupTest):
+
+ def test_cdata(self):
+ # None of the current builders turn CDATA sections into CData
+ # objects, but you can create them manually.
+ soup = self.soup("")
+ cdata = CData("foo")
+ soup.insert(1, cdata)
+ self.assertEqual(str(soup), "<![CDATA[foo]]>")
+ self.assertEqual(soup.find(text="foo"), "foo")
+ self.assertEqual(soup.contents[0], "foo")
+
+ def test_cdata_is_never_formatted(self):
+ """Text inside a CData object is passed into the formatter.
+
+ But the return value is ignored.
+ """
+
+ self.count = 0
+ def increment(*args):
+ self.count += 1
+ return "BITTER FAILURE"
+
+ soup = self.soup("")
+ cdata = CData("<><><>")
+ soup.insert(1, cdata)
+ self.assertEqual(
+ b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
+ self.assertEqual(1, self.count)
+
+ def test_doctype_ends_in_newline(self):
+ # Unlike other NavigableString subclasses, a DOCTYPE always ends
+ # in a newline.
+ doctype = Doctype("foo")
+ soup = self.soup("")
+ soup.insert(1, doctype)
+ self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
+
+ def test_declaration(self):
+ d = Declaration("foo")
+ self.assertEqual("<?foo?>", d.output_ready())
+
+ def test_default_string_containers(self):
+ # In some cases, we use different NavigableString subclasses for
+ # the same text in different tags.
+ soup = self.soup(
+ "<div>text</div><script>text</script><style>text</style>"
+ )
+ self.assertEqual(
+ [NavigableString, Script, Stylesheet],
+ [x.__class__ for x in soup.find_all(text=True)]
+ )
+
+ # The TemplateString is a little unusual because it's generally found
+ # _inside_ children of a <template> element, not a direct child of the
+ # <template> element.
+ soup = self.soup(
+ "<template>Some text<p>In a tag</p></template>Some text outside"
+ )
+ assert all(
+ isinstance(x, TemplateString)
+ for x in soup.template._all_strings(types=None)
+ )
+
+ # Once the <template> tag closed, we went back to using
+ # NavigableString.
+ outside = soup.template.next_sibling
+ assert isinstance(outside, NavigableString)
+ assert not isinstance(outside, TemplateString)
+
+ # The TemplateString is also unusual because it can contain
+ # NavigableString subclasses of _other_ types, such as
+ # Comment.
+ markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>"
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.template.encode("utf8"))
+
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index d1ca5ea..875befe 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1846,84 +1846,6 @@ class TestEncoding(SoupTest):
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
-
-class TestNavigableStringSubclasses(SoupTest):
-
- def test_cdata(self):
- # None of the current builders turn CDATA sections into CData
- # objects, but you can create them manually.
- soup = self.soup("")
- cdata = CData("foo")
- soup.insert(1, cdata)
- self.assertEqual(str(soup), "<![CDATA[foo]]>")
- self.assertEqual(soup.find(text="foo"), "foo")
- self.assertEqual(soup.contents[0], "foo")
-
- def test_cdata_is_never_formatted(self):
- """Text inside a CData object is passed into the formatter.
-
- But the return value is ignored.
- """
-
- self.count = 0
- def increment(*args):
- self.count += 1
- return "BITTER FAILURE"
-
- soup = self.soup("")
- cdata = CData("<><><>")
- soup.insert(1, cdata)
- self.assertEqual(
- b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
- self.assertEqual(1, self.count)
-
- def test_doctype_ends_in_newline(self):
- # Unlike other NavigableString subclasses, a DOCTYPE always ends
- # in a newline.
- doctype = Doctype("foo")
- soup = self.soup("")
- soup.insert(1, doctype)
- self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
-
- def test_declaration(self):
- d = Declaration("foo")
- self.assertEqual("<?foo?>", d.output_ready())
-
- def test_default_string_containers(self):
- # In some cases, we use different NavigableString subclasses for
- # the same text in different tags.
- soup = self.soup(
- "<div>text</div><script>text</script><style>text</style>"
- )
- self.assertEqual(
- [NavigableString, Script, Stylesheet],
- [x.__class__ for x in soup.find_all(text=True)]
- )
-
- # The TemplateString is a little unusual because it's generally found
- # _inside_ children of a <template> element, not a direct child of the
- # <template> element.
- soup = self.soup(
- "<template>Some text<p>In a tag</p></template>Some text outside"
- )
- assert all(
- isinstance(x, TemplateString)
- for x in soup.template._all_strings(types=None)
- )
-
- # Once the <template> tag closed, we went back to using
- # NavigableString.
- outside = soup.template.next_sibling
- assert isinstance(outside, NavigableString)
- assert not isinstance(outside, TemplateString)
-
- # The TemplateString is also unusual because it can contain
- # NavigableString subclasses of _other_ types, such as
- # Comment.
- markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>"
- soup = self.soup(markup)
- self.assertEqual(markup, soup.template.encode("utf8"))
-
class TestSoupSelector(TreeTest):