# -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" from pdb import set_trace import logging import os import pytest import sys import tempfile from bs4 import ( BeautifulSoup, BeautifulStoneSoup, GuessedAtParserWarning, MarkupResemblesLocatorWarning, dammit, ) from bs4.builder import ( builder_registry, TreeBuilder, ParserRejectedMarkup, ) from bs4.element import ( Comment, SoupStrainer, Tag, NavigableString, ) from . import ( default_builder, SoupTest, skipIf, ) import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True except ImportError as e: LXML_PRESENT = False PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): def test_short_unicode_input(self): data = "
Here is some bolded text", string_containers = { 'b': BString, 'p': PString, } ) # The string before the
tag is a regular NavigableString. assert isinstance(soup.div.contents[0], NavigableString) # The string inside the
tag, but not inside the tag,
# is a PString.
assert isinstance(soup.p.contents[0], PString)
# Every string inside the <b> tag is a BString, even the one that
# was also inside an <i> tag.
for s in soup.b.strings:
assert isinstance(s, BString)
# Now that parsing was complete, the string_container_stack
# (where this information was kept) has been cleared out.
assert [] == soup.string_container_stack
class TestWarnings(SoupTest):
    """Check the warnings recorded while a BeautifulSoup object is built."""

    def _assert_warning(self, warnings, cls):
        """Return the first recorded warning whose message is a `cls`.

        :param warnings: An iterable of recorded warnings, such as the
            list produced by ``warnings.catch_warnings(record=True)``.
            Note that this parameter shadows the ``warnings`` module
            inside this method.
        :param cls: The warning class to look for.
        :raises Exception: If no recorded warning carries a message of
            type `cls`.
        """
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # The format arguments must be a single tuple: `%` binds more
        # tightly than a comma, so an unparenthesized pair would try to
        # fill both placeholders from `cls` alone and die with a
        # TypeError rather than this message.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        """Assert that `w` contains the 'no parser specified' warning.

        Only the first 60 characters of the expected text are compared
        against the start of the recorded message.
        """
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])

    def test_warning_if_no_parser_specified(self):
        """Omitting the parser argument records a GuessedAtParserWarning."""
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        """Asking for just "html" records a GuessedAtParserWarning."""
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        """Asking for "html.parser" records no warning at all."""
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("", "html.parser")
        assert [] == w

    def test_parseOnlyThese_renamed_to_parse_only(self):
        """Passing parseOnlyThese records a warning naming both spellings."""
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        """Passing fromEncoding records a warning naming both spellings."""
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        """An unknown keyword argument raises TypeError."""
        with pytest.raises(TypeError):
            self.soup("", no_such_argument=True)

    def test_disk_file_warning(self):
        """Markup naming an existing file records a 'filename' warning."""
        # The context manager closes the temporary file on exit, even
        # if one of the assertions below fails.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks like a filename" in str(warning.message)

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        assert [] == w

    def test_directory_warning(self):
        """Markup naming an existing directory records a 'directory' warning."""
        # Create the directory before entering the try block so that
        # the cleanup in `finally` never runs with `filename` unbound.
        filename = tempfile.mkdtemp()
        try:
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks like a directory" in str(warning.message)
        finally:
            os.rmdir(filename)

        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        assert [] == w

    def test_url_warning_with_bytes_url(self):
        """A bytestring that is just a URL records a 'URL' warning."""
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks like a URL" in str(warning.message)

    def test_url_warning_with_unicode_url(self):
        """A Unicode string that is just a URL records a 'URL' warning."""
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks like a URL" in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        """A bytestring with a URL plus other text records no 'URL' warning."""
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks like a URL" in str(w.message)
                       for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        """A Unicode string with a URL plus other text records no 'URL' warning."""
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks like a URL" in str(w.message)
                       for w in warning_list)
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "NoYesNoYes tag are empty-element, just because
# they have no contents.
assert b"
and
" == xml_br.encode()
assert b"
" == html_br.encode()
assert b"" == html_p.encode()
class TestNewString(SoupTest):
    """Exercise the BeautifulSoup.new_string() method."""

    def test_new_string_creates_navigablestring(self):
        """Without a class argument the result is a NavigableString."""
        created = self.soup("").new_string("foo")
        assert "foo" == created
        assert isinstance(created, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        """Passing Comment as the second argument yields a Comment."""
        created = self.soup("").new_string("foo", Comment)
        assert "foo" == created
        assert isinstance(created, Comment)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setup_method(self):
self.unicode_data = '