# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

from pdb import set_trace
import logging
import os
import pytest
import sys
import tempfile

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    GuessedAtParserWarning,
    MarkupResemblesLocatorWarning,
)
from bs4.builder import (
    TreeBuilder,
    ParserRejectedMarkup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    Comment,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    Tag,
    NavigableString,
)

import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
)
from bs4.testing import (
    default_builder,
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))


class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor and its keyword arguments."""

    def test_short_unicode_input(self):
        """A short non-ASCII Unicode document survives parsing."""
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_embedded_null(self):
        """A NUL character inside a string is preserved."""
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        assert "foo\0bar" == soup.h1.string

    def test_exclude_encodings(self):
        """An excluded encoding is not chosen as the original encoding."""
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        assert "windows-1252" == soup.original_encoding

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass
            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        assert dict(var="value") == soup.builder.called_with
        assert "prepared markup" == soup.builder.fed

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        assert builder == soup.builder
        assert kwargs == builder.called_with

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this
                # markup, but feed() will reject both of them. `markup` is
                # taken as an explicit parameter so the name is defined
                # when the generator runs.
                yield markup, None, None, False
                yield markup, None, None, False

        with pytest.raises(ParserRejectedMarkup) as exc_info:
            BeautifulSoup('', builder=Mock)
        assert "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help." in str(exc_info.value)

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        assert " an id " == a['id']
        assert ["a", "class"] == a['class']

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        assert " a class " == soup.a['class']

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            assert ["an", "id"] == a['id']
            assert " a class " == a['class']

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack


class TestWarnings(SoupTest):
    """Tests of the warnings issued by the BeautifulSoup constructor."""

    def _assert_warning(self, warnings, cls):
        """Return the first recorded warning whose message is a `cls`.

        :param warnings: A list of recorded warnings, as produced by
            `warnings.catch_warnings(record=True)`.
        :param cls: The warning class to look for.
        :raises Exception: If no recorded warning is an instance of `cls`.
        """
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # Both values must be passed as one tuple; otherwise `%` is
        # applied to `cls` alone and raises TypeError.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        """Assert that `w` contains the 'no parser specified' warning."""
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        assert [] == w

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks like a filename" in str(warning.message)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        assert [] == w

    def test_directory_warning(self):
        # Create the directory before entering the try block, so the
        # cleanup in `finally` never refers to an unbound name.
        filename = tempfile.mkdtemp()
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks like a directory" in str(warning.message)
        finally:
            os.rmdir(filename)

        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        assert [] == w

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks like a URL" in str(warning.message)

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        assert "looks like a URL" in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks like a URL" in str(w.message)
                       for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks like a URL" in str(w.message)
                       for w in warning_list)


class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document with a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"


class TestEntitySubstitution(object):
    """Standalone tests of the EntitySubstitution class."""

    def setup_method(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"

    def test_html5_entity(self):
        # Some HTML5 entities correspond to single- or multi-character
        # Unicode sequences.
        template = '3 %s 4'
        for entity, u in (
            # A few spot checks of our ability to recognize
            # special character sequences and convert them
            # to named entities.
            ('&models;', '\u22a7'),
            ('&Nfr;', '\U0001d511'),
            ('&ngeqq;', '\u2267\u0338'),
            ('&not;', '\xac'),
            ('&Not;', '\u2aec'),

            # We _could_ convert | to &verbarr;, but we don't, because
            # | is an ASCII character.
            ('|', '|'),

            # Similarly for the fj ligature, which we could convert to
            # &fjlig;, but we don't.
            ("fj", "fj"),

            # We do convert _these_ ASCII characters to HTML entities,
            # because that's required to generate valid HTML.
            ('&gt;', '>'),
            ('&lt;', '<'),
            ('&amp;', '&'),
        ):
            raw = template % u
            with_entities = template % entity
            assert self.sub.substitute_html(raw) == with_entities

    def test_html5_entity_with_variation_selector(self):
        # Some HTML5 entities correspond either to a single-character
        # Unicode sequence _or_ to the same character plus U+FE00,
        # VARIATION SELECTOR 1. We can handle this.
        data = "fjords \u2294 penguins"
        markup = "fjords &sqcup; penguins"
        assert self.sub.substitute_html(data) == markup

        data = "fjords \u2294\ufe00 penguins"
        markup = "fjords &sqcups; penguins"
        assert self.sub.substitute_html(data) == markup

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, False) == s

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        assert self.sub.substitute_xml("Welcome", True) == '"Welcome"'
        assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"'

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'"

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(s, True) == '"Welcome to &quot;Bob\'s Bar&quot;"'

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(quoted) == quoted

    def test_xml_quoting_handles_angle_brackets(self):
        assert self.sub.substitute_xml("foo<bar>") == "foo&lt;bar&gt;"

    def test_xml_quoting_handles_ampersands(self):
        assert self.sub.substitute_xml("AT&T") == "AT&amp;T"

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        assert self.sub.substitute_xml("&Aacute;T&T") == "&amp;Aacute;T&amp;T"

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        assert self.sub.substitute_xml_containing_entities("&Aacute;T&T") == "&Aacute;T&amp;T"

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        assert self.sub.substitute_html(text) == text


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setup_method(self):
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert self.utf8_data == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            assert isinstance(unicode_output, str)
            assert unicode_output == self.document_for(ascii_markup.decode())
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            # Always restore the global state changed above, even if
            # an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        assert soup_from_unicode.foo.string == 'Sacr\xe9 bleu!'
        assert soup_from_unicode.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        assert soup_from_utf8.decode() == self.unicode_data
        assert soup_from_utf8.foo.string == 'Sacr\xe9 bleu!'

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode('utf-8') == self.utf8_data

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")


class TestNamedspacedAttribute(SoupTest):
    """Tests of how NamespacedAttribute compares to plain strings."""

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        assert a == "xmlns"

        a = NamespacedAttribute("xmlns", "")
        assert a == "xmlns"

        a = NamespacedAttribute("xmlns")
        assert a == "xmlns"

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        assert a == "tag"

        a = NamespacedAttribute("", "tag")
        assert a == "tag"

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        assert "a:b" == a

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        assert a == b

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        assert a == c

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        assert a != d

        e = NamespacedAttribute("z", "b", "c")
        assert a != e


class TestAttributeValueWithCharsetSubstitution(object):
    """Tests of attribute values that substitute a new charset on encode."""

    # This test used to share a name with the one below, so it was
    # shadowed and never collected.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        assert "euc-jp" == value
        assert "euc-jp" == value.original_value
        assert "utf8" == value.encode("utf8")

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        assert "text/html; charset=euc-jp" == value
        assert "text/html; charset=euc-jp" == value.original_value
        assert "text/html; charset=utf8" == value.encode("utf8")