diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/element.py | 122 | ||||
-rw-r--r-- | bs4/tests/test_navigablestring.py | 123 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 78 |
3 files changed, 208 insertions, 115 deletions
diff --git a/bs4/element.py b/bs4/element.py index e994678..3428e21 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -255,6 +255,47 @@ class PageElement(object): nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 + default = object() + def _all_strings(self, strip=False, types=default): + """Yield all strings of certain classes, possibly stripping them. + + This is implemented differently in Tag and NavigableString. + """ + raise NotImplementedError() + + @property + def stripped_strings(self): + """Yield all strings in this PageElement, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False, + types=default): + """Get all child strings of this PageElement, concatenated using the + given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + def replace_with(self, replace_with): """Replace this PageElement with another one, keeping the rest of the tree the same. @@ -945,7 +986,49 @@ class NavigableString(unicode, PageElement): """Prevent NavigableString.name from ever being set.""" raise AttributeError("A NavigableString cannot be given a name.") - + def _all_strings(self, strip=False, types=PageElement.default): + """Yield all strings of certain classes, possibly stripping them. + + This makes it easy for NavigableString to implement methods + like get_text() as conveniences, creating a consistent + text-extraction API across all PageElements. + + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. If this + NavigableString isn't one of those subclasses, the + sequence will be empty. By default, the subclasses + considered are NavigableString and CData objects. That + means no comments, processing instructions, etc. + + :yield: A sequence that either contains this string, or is empty. + + """ + if types is self.default: + # This is kept in Tag because it's full of subclasses of + # this class, which aren't defined until later in the file. + types = Tag.DEFAULT_INTERESTING_STRING_TYPES + + # Do nothing if the caller is looking for specific types of + # string, and we're of a different type. + my_type = type(self) + if types is not None: + if isinstance(types, type): + # Looking for a single type. + if my_type is not types: + return + elif my_type not in types: + # Looking for one of a list of types. + return + + value = self + if strip: + value = value.strip() + if len(value) > 0: + yield value + strings = property(_all_strings) + class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. @@ -1243,8 +1326,7 @@ class Tag(PageElement): self.append(string.__class__(string)) DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) - default = object() - def _all_strings(self, strip=False, types=default): + def _all_strings(self, strip=False, types=PageElement.default): """Yield all strings of certain classes, possibly stripping them. :param strip: If True, all strings will be stripped before being @@ -1280,42 +1362,8 @@ class Tag(PageElement): if len(descendant) == 0: continue yield descendant - strings = property(_all_strings) - @property - def stripped_strings(self): - """Yield all strings in the document, stripping them first. - - :yield: A sequence of stripped strings. - """ - for string in self._all_strings(True): - yield string - - def get_text(self, separator=u"", strip=False, - types=default): - """Get all child strings, concatenated using the given separator. - - :param separator: Strings will be concatenated using this separator. - - :param strip: If True, strings will be stripped before being - concatenated. - - :param types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, the subclasses considered are the ones found in - self.interesting_string_types. If that's not specified, - only NavigableString and CData objects will be - considered. That means no comments, processing - instructions, etc. - - :return: A string. - """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) - def decompose(self): """Recursively destroys this PageElement and its children. diff --git a/bs4/tests/test_navigablestring.py b/bs4/tests/test_navigablestring.py new file mode 100644 index 0000000..8b903ea --- /dev/null +++ b/bs4/tests/test_navigablestring.py @@ -0,0 +1,123 @@ +from bs4.testing import SoupTest +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + NavigableString, + Script, + Stylesheet, + TemplateString, +) + +class TestNavigableString(SoupTest): + + def test_text_acquisition_methods(self): + # These methods are intended for use against Tag, but they + # work on NavigableString as well, + eq_ = self.assertEquals + + s = NavigableString("fee ") + cdata = CData("fie ") + comment = Comment("foe ") + + eq_("fee ", s.get_text()) + eq_("fee", s.get_text(strip=True)) + eq_(["fee "], list(s.strings)) + eq_(["fee"], list(s.stripped_strings)) + eq_(["fee "], list(s._all_strings())) + + eq_("fie ", cdata.get_text()) + eq_("fie", cdata.get_text(strip=True)) + eq_(["fie "], list(cdata.strings)) + eq_(["fie"], list(cdata.stripped_strings)) + eq_(["fie "], list(cdata._all_strings())) + + # Since a Comment isn't normally considered 'text', + # these methods generally do nothing. + eq_("", comment.get_text()) + eq_([], list(comment.strings)) + eq_([], list(comment.stripped_strings)) + eq_([], list(comment._all_strings())) + + # Unless you specifically say that comments are okay. + eq_("foe", comment.get_text(strip=True, types=Comment)) + eq_("foe ", comment.get_text(types=(Comment, NavigableString))) + +class TestNavigableStringSubclasses(SoupTest): + + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEqual(str(soup), "<![CDATA[foo]]>") + self.assertEqual(soup.find(text="foo"), "foo") + self.assertEqual(soup.contents[0], "foo") + + def test_cdata_is_never_formatted(self): + """Text inside a CData object is passed into the formatter. + + But the return value is ignored. + """ + + self.count = 0 + def increment(*args): + self.count += 1 + return "BITTER FAILURE" + + soup = self.soup("") + cdata = CData("<><><>") + soup.insert(1, cdata) + self.assertEqual( + b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) + self.assertEqual(1, self.count) + + def test_doctype_ends_in_newline(self): + # Unlike other NavigableString subclasses, a DOCTYPE always ends + # in a newline. + doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + + def test_declaration(self): + d = Declaration("foo") + self.assertEqual("<?foo?>", d.output_ready()) + + def test_default_string_containers(self): + # In some cases, we use different NavigableString subclasses for + # the same text in different tags. + soup = self.soup( + "<div>text</div><script>text</script><style>text</style>" + ) + self.assertEqual( + [NavigableString, Script, Stylesheet], + [x.__class__ for x in soup.find_all(text=True)] + ) + + # The TemplateString is a little unusual because it's generally found + # _inside_ children of a <template> element, not a direct child of the + # <template> element. + soup = self.soup( + "<template>Some text<p>In a tag</p></template>Some text outside" + ) + assert all( + isinstance(x, TemplateString) + for x in soup.template._all_strings(types=None) + ) + + # Once the <template> tag closed, we went back to using + # NavigableString. + outside = soup.template.next_sibling + assert isinstance(outside, NavigableString) + assert not isinstance(outside, TemplateString) + + # The TemplateString is also unusual because it can contain + # NavigableString subclasses of _other_ types, such as + # Comment. + markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>" + soup = self.soup(markup) + self.assertEqual(markup, soup.template.encode("utf8")) + diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index d1ca5ea..875befe 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1846,84 +1846,6 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) - -class TestNavigableStringSubclasses(SoupTest): - - def test_cdata(self): - # None of the current builders turn CDATA sections into CData - # objects, but you can create them manually. - soup = self.soup("") - cdata = CData("foo") - soup.insert(1, cdata) - self.assertEqual(str(soup), "<![CDATA[foo]]>") - self.assertEqual(soup.find(text="foo"), "foo") - self.assertEqual(soup.contents[0], "foo") - - def test_cdata_is_never_formatted(self): - """Text inside a CData object is passed into the formatter. - - But the return value is ignored. - """ - - self.count = 0 - def increment(*args): - self.count += 1 - return "BITTER FAILURE" - - soup = self.soup("") - cdata = CData("<><><>") - soup.insert(1, cdata) - self.assertEqual( - b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) - self.assertEqual(1, self.count) - - def test_doctype_ends_in_newline(self): - # Unlike other NavigableString subclasses, a DOCTYPE always ends - # in a newline. - doctype = Doctype("foo") - soup = self.soup("") - soup.insert(1, doctype) - self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") - - def test_declaration(self): - d = Declaration("foo") - self.assertEqual("<?foo?>", d.output_ready()) - - def test_default_string_containers(self): - # In some cases, we use different NavigableString subclasses for - # the same text in different tags. - soup = self.soup( - "<div>text</div><script>text</script><style>text</style>" - ) - self.assertEqual( - [NavigableString, Script, Stylesheet], - [x.__class__ for x in soup.find_all(text=True)] - ) - - # The TemplateString is a little unusual because it's generally found - # _inside_ children of a <template> element, not a direct child of the - # <template> element. - soup = self.soup( - "<template>Some text<p>In a tag</p></template>Some text outside" - ) - assert all( - isinstance(x, TemplateString) - for x in soup.template._all_strings(types=None) - ) - - # Once the <template> tag closed, we went back to using - # NavigableString. - outside = soup.template.next_sibling - assert isinstance(outside, NavigableString) - assert not isinstance(outside, TemplateString) - - # The TemplateString is also unusual because it can contain - # NavigableString subclasses of _other_ types, such as - # Comment. - markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>" - soup = self.soup(markup) - self.assertEqual(markup, soup.template.encode("utf8")) - class TestSoupSelector(TreeTest): |