diff options
-rw-r--r-- | bs4/element.py | 32 | ||||
-rw-r--r-- | bs4/tests/__init__.py | 18 | ||||
-rw-r--r-- | bs4/tests/test_dammit.py | 101 | ||||
-rw-r--r-- | bs4/tests/test_element.py | 74 | ||||
-rw-r--r-- | bs4/tests/test_pageelement.py | 751 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 215 | ||||
-rw-r--r-- | bs4/tests/test_tag.py | 169 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 988 |
8 files changed, 1181 insertions, 1167 deletions
diff --git a/bs4/element.py b/bs4/element.py index 82a986e..3eed924 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -23,7 +23,6 @@ from bs4.formatter import ( ) DEFAULT_OUTPUT_ENCODING = "utf-8" -PY3K = (sys.version_info[0] > 2) nonwhitespace_re = re.compile(r"\S+") @@ -1558,36 +1557,19 @@ class Tag(PageElement): def __repr__(self, encoding="unicode-escape"): """Renders this PageElement as a string. - :param encoding: The encoding to use (Python 2 only). - :return: Under Python 2, a bytestring; under Python 3, - a Unicode string. + :param encoding: The encoding to use (Python 2 only). + TODO: This is now ignored and a warning should be issued + if a value is provided. + :return: A (Unicode) string. """ - if PY3K: - # "The return value must be a string object", i.e. Unicode - return self.decode() - else: - # "The return value must be a string object", i.e. a bytestring. - # By convention, the return value of __repr__ should also be - # an ASCII string. - return self.encode(encoding) + # "The return value must be a string object", i.e. Unicode + return self.decode() def __unicode__(self): """Renders this PageElement as a Unicode string.""" return self.decode() - def __str__(self): - """Renders this PageElement as a generic string. - - :return: Under Python 2, a UTF-8 bytestring; under Python 3, - a Unicode string. - """ - if PY3K: - return self.decode() - else: - return self.encode() - - if PY3K: - __str__ = __repr__ = __unicode__ + __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py index 97914c0..6b70c34 100644 --- a/bs4/tests/__init__.py +++ b/bs4/tests/__init__.py @@ -230,6 +230,24 @@ class SoupTest(object): # Return the child to the recursive caller return child + def assert_selects(self, tags, should_match): + """Make sure that the given tags have the correct text. 
+ + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + assert [tag.string for tag in tags] == should_match + + def assert_selects_ids(self, tags, should_match): + """Make sure that the given tags have the correct IDs. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + assert [tag['id'] for tag in tags] == should_match + class TreeBuilderSmokeTest(object): # Tests that are common to HTML and XML tree builders. diff --git a/bs4/tests/test_dammit.py b/bs4/tests/test_dammit.py index 348c600..9971234 100644 --- a/bs4/tests/test_dammit.py +++ b/bs4/tests/test_dammit.py @@ -4,6 +4,7 @@ import logging import bs4 from bs4 import BeautifulSoup from bs4.dammit import ( + EntitySubstitution, EncodingDetector, UnicodeDammit, ) @@ -267,4 +268,104 @@ class TestEncodingDetector(object): assert m(xml_bytes, search_entire_document=True) == "iso-8859-1" assert m(b' ' + xml_bytes, search_entire_document=True) == "iso-8859-1" assert m(b'a' + xml_bytes, search_entire_document=True) is None + + +class TestEntitySubstitution(object): + """Standalone tests of the EntitySubstitution class.""" + def setup_method(self): + self.sub = EntitySubstitution + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = "foo\u2200\N{SNOWMAN}\u00f5bar" + assert self.sub.substitute_html(s) == "foo∀\N{SNOWMAN}õbar" + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. + quotes = b"\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + assert self.sub.substitute_html(dammit.markup) == "‘’foo“”" + + def test_html5_entity(self): + # Some HTML5 entities correspond to single- or multi-character + # Unicode sequences. 
+ + for entity, u in ( + # A few spot checks of our ability to recognize + # special character sequences and convert them + # to named entities. + ('⊧', '\u22a7'), + ('𝔑', '\U0001d511'), + ('≧̸', '\u2267\u0338'), + ('¬', '\xac'), + ('⫬', '\u2aec'), + + # We _could_ convert | to &verbarr;, but we don't, because + # | is an ASCII character. + ('|' '|'), + + # Similarly for the fj ligature, which we could convert to + # fj, but we don't. + ("fj", "fj"), + + # We do convert _these_ ASCII characters to HTML entities, + # because that's required to generate valid HTML. + ('>', '>'), + ('<', '<'), + ('&', '&'), + ): + template = '3 %s 4' + raw = template % u + with_entities = template % entity + assert self.sub.substitute_html(raw) == with_entities + def test_html5_entity_with_variation_selector(self): + # Some HTML5 entities correspond either to a single-character + # Unicode sequence _or_ to the same character plus U+FE00, + # VARIATION SELECTOR 1. We can handle this. + data = "fjords \u2294 penguins" + markup = "fjords ⊔ penguins" + assert self.sub.substitute_html(data) == markup + + data = "fjords \u2294\ufe00 penguins" + markup = "fjords ⊔︀ penguins" + assert self.sub.substitute_html(data) == markup + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + assert self.sub.substitute_xml(s, False) == s + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + assert self.sub.substitute_xml("Welcome", True) == '"Welcome"' + assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"' + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'" + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + assert self.sub.substitute_xml(s, True) == '"Welcome to "Bob\'s Bar""' 
+ + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + assert self.sub.substitute_xml(quoted) == quoted + + def test_xml_quoting_handles_angle_brackets(self): + assert self.sub.substitute_xml("foo<bar>") == "foo<bar>" + + def test_xml_quoting_handles_ampersands(self): + assert self.sub.substitute_xml("AT&T") == "AT&T" + + def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): + assert self.sub.substitute_xml("ÁT&T") == "&Aacute;T&T" + + def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): + assert self.sub.substitute_xml_containing_entities("ÁT&T") == "ÁT&T" + + def test_quotes_not_html_substituted(self): + """There's no need to do this except inside attribute values.""" + text = 'Bob\'s "bar"' + assert self.sub.substitute_html(text) == text diff --git a/bs4/tests/test_element.py b/bs4/tests/test_element.py new file mode 100644 index 0000000..6d08ab5 --- /dev/null +++ b/bs4/tests/test_element.py @@ -0,0 +1,74 @@ +"""Tests of classes in element.py. + +The really big classes -- Tag, PageElement, and NavigableString -- +are tested in separate files. +""" + +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + NamespacedAttribute, +) +from . 
import SoupTest + + +class TestNamedspacedAttribute(object): + + def test_name_may_be_none_or_missing(self): + a = NamespacedAttribute("xmlns", None) + assert a == "xmlns" + + a = NamespacedAttribute("xmlns", "") + assert a == "xmlns" + + a = NamespacedAttribute("xmlns") + assert a == "xmlns" + + def test_namespace_may_be_none_or_missing(self): + a = NamespacedAttribute(None, "tag") + assert a == "tag" + + a = NamespacedAttribute("", "tag") + assert a == "tag" + + def test_attribute_is_equivalent_to_colon_separated_string(self): + a = NamespacedAttribute("a", "b") + assert "a:b" == a + + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): + a = NamespacedAttribute("a", "b", "c") + b = NamespacedAttribute("a", "b", "c") + assert a == b + + # The actual namespace is not considered. + c = NamespacedAttribute("a", "b", None) + assert a == c + + # But name and prefix are important. + d = NamespacedAttribute("a", "z", "c") + assert a != d + + e = NamespacedAttribute("z", "b", "c") + assert a != e + + +class TestAttributeValueWithCharsetSubstitution(object): + """Certain attributes are designed to have the charset of the + final document substituted into their value. + """ + + def test_content_meta_attribute_value(self): + # The value of a CharsetMetaAttributeValue is whatever + # encoding the string is in. 
+ value = CharsetMetaAttributeValue("euc-jp") + assert "euc-jp" == value + assert "euc-jp" == value.original_value + assert "utf8" == value.encode("utf8") + assert "ascii" == value.encode("ascii") + + def test_content_meta_attribute_value(self): + value = ContentMetaAttributeValue("text/html; charset=euc-jp") + assert "text/html; charset=euc-jp" == value + assert "text/html; charset=euc-jp" == value.original_value + assert "text/html; charset=utf8" == value.encode("utf8") + assert "text/html; charset=ascii" == value.encode("ascii") diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py new file mode 100644 index 0000000..26783f2 --- /dev/null +++ b/bs4/tests/test_pageelement.py @@ -0,0 +1,751 @@ +"""Tests of the bs4.element.PageElement class""" +import copy +import pickle +import pytest + +from soupsieve import SelectorSyntaxError + +from bs4 import BeautifulSoup +from bs4.element import ( + Comment, + SoupStrainer, +) +from . import SoupTest + + +class TestEncoding(SoupTest): + """Test the ability to encode objects into strings.""" + + def test_unicode_string_can_be_encoded(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8") + + def test_tag_containing_unicode_string_can_be_encoded(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert soup.b.encode("utf-8") == html.encode("utf-8") + + def test_encoding_substitutes_unrecognized_characters_by_default(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert soup.b.encode("ascii") == b"<b>☃</b>" + + def test_encoding_can_be_made_strict(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + with pytest.raises(UnicodeEncodeError): + soup.encode("ascii", errors="strict") + + def test_decode_contents(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert "\N{SNOWMAN}" == soup.b.decode_contents() + + def test_encode_contents(self): + html = "<b>\N{SNOWMAN}</b>" + soup 
= self.soup(html) + assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents( + encoding="utf8" + ) + + def test_deprecated_renderContents(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents() + + def test_repr(self): + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + assert html == repr(soup) + + +class TestFormatters(SoupTest): + """Test the formatting feature, used by methods like decode() and + prettify(), and the formatters themselves. + """ + + def test_default_formatter_is_minimal(self): + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + assert decoded == self.document_for( + "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + ) + + def test_formatter_html(self): + markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + assert decoded == self.document_for( + "<br/><b><<Sacré bleu!>></b>" + ) + + def test_formatter_html5(self): + markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + assert decoded == self.document_for( + "<br><b><<Sacré bleu!>></b>" + ) + + def test_formatter_minimal(self): + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. 
+ assert decoded == self.document_for( + "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + ) + + def test_formatter_null(self): + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + assert decoded == self.document_for( + "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + ) + + def test_formatter_custom(self): + markup = "<b><foo></b><b>bar</b><br/>" + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + assert decoded == self.document_for("<b><FOO></b><b>BAR</b><br/>") + + def test_formatter_is_run_on_attribute_values(self): + markup = '<a href="http://a.com?a=b&c=é">e</a>' + soup = self.soup(markup) + a = soup.a + + expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' + + assert expect_minimal == a.decode() + assert expect_minimal == a.decode(formatter="minimal") + + expect_html = '<a href="http://a.com?a=b&c=é">e</a>' + assert expect_html == a.decode(formatter="html") + + assert markup == a.decode(formatter=None) + expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' + assert expect_upper == a.decode(formatter=lambda x: x.upper()) + + def test_formatter_skips_script_tag_for_html_documents(self): + doc = """ + <script type="text/javascript"> + console.log("< < hey > > "); + </script> +""" + encoded = BeautifulSoup(doc, 'html.parser').encode() + assert b"< < hey > >" in encoded + + def test_formatter_skips_style_tag_for_html_documents(self): + doc = """ + <style type="text/css"> + console.log("< < hey > > "); + </style> +""" + encoded = BeautifulSoup(doc, 'html.parser').encode() + assert b"< < hey > >" in encoded + + def test_prettify_leaves_preformatted_text_alone(self): + soup = self.soup("<div> foo <pre> 
\tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>") + # Everything outside the <pre> tag is reformatted, but everything + # inside is left alone. + assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify() + + def test_prettify_accepts_formatter_function(self): + soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') + pretty = soup.prettify(formatter = lambda x: x.upper()) + assert "FOO" in pretty + + def test_prettify_outputs_unicode_by_default(self): + soup = self.soup("<a></a>") + assert str == type(soup.prettify()) + + def test_prettify_can_encode_data(self): + soup = self.soup("<a></a>") + assert bytes == type(soup.prettify("utf-8")) + + def test_html_entity_substitution_off_by_default(self): + markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + assert encoded == markup.encode('utf-8') + + def test_encoding_substitution(self): + # Here's the <meta> tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('<meta content="text/html; charset=x-sjis" ' + 'http-equiv="Content-type"/>') + soup = self.soup(meta_tag) + + # Parse the document, and the charset apprears unchanged. + assert soup.meta['content'] == 'text/html; charset=x-sjis' + + # Encode the document into some encoding, and the encoding is + # substituted into the meta tag. 
+ utf_8 = soup.encode("utf-8") + assert b"charset=utf-8" in utf_8 + + euc_jp = soup.encode("euc_jp") + assert b"charset=euc_jp" in euc_jp + + shift_jis = soup.encode("shift-jis") + assert b"charset=shift-jis" in shift_jis + + utf_16_u = soup.encode("utf-16").decode("utf-16") + assert "charset=utf-16" in utf_16_u + + def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): + markup = ('<head><meta content="text/html; charset=x-sjis" ' + 'http-equiv="Content-type"/></head><pre>foo</pre>') + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer('pre') + soup = self.soup(markup, parse_only=strainer) + assert soup.contents[0].name == 'pre' + + +class TestCSSSelectors(SoupTest): + """Test basic CSS selector functionality. + + This functionality is implemented in soupsieve, which has a much + more comprehensive test suite, so this is basically an extra check + that soupsieve works as expected. 
+ """ + + HTML = """ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<title>The title</title> +<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +</head> +<body> +<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> +<div id="main" class="fancy"> +<div id="inner"> +<h1 id="header1">An H1</h1> +<p>Some text</p> +<p class="onep" id="p1">Some more text</p> +<h2 id="header2">An H2</h2> +<p class="class1 class2 class3" id="pmulti">Another</p> +<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> +<h2 id="header3">Another H2</h2> +<a id="me" href="http://simonwillison.net/" rel="me">me</a> +<span class="s1"> +<a href="#" id="s1a1">span1a1</a> +<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> +<span class="span2"> +<a href="#" id="s2a1">span2a1</a> +</span> +<span class="span3"></span> +<custom-dashed-tag class="dashed" id="dash2"/> +<div data-tag="dashedvalue" id="data1"/> +</span> +</div> +<x id="xid"> +<z id="zida"/> +<z id="zidab"/> +<z id="zidac"/> +</x> +<y id="yid"> +<z id="zidb"/> +</y> +<p lang="en" id="lang-en">English</p> +<p lang="en-gb" id="lang-en-gb">English UK</p> +<p lang="en-us" id="lang-en-us">English US</p> +<p lang="fr" id="lang-fr">French</p> +</div> + +<div id="footer"> +</div> +""" + + def setup_method(self): + self.soup = BeautifulSoup(self.HTML, 'html.parser') + + def assert_selects(self, selector, expected_ids, **kwargs): + el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] + el_ids.sort() + expected_ids.sort() + assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + + assertSelect = assert_selects + + def assert_select_multiple(self, *tests): + for selector, expected_ids in tests: + self.assert_selects(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + assert len(els) == 1 + assert 
els[0].name == 'title' + assert els[0].contents == ['The title'] + + def test_one_tag_many(self): + els = self.soup.select('div') + assert len(els) == 4 + for div in els: + assert div.name == 'div' + + el = self.soup.select_one('div') + assert 'main' == el['id'] + + def test_select_one_returns_none_if_no_match(self): + match = self.soup.select_one('nonexistenttag') + assert None == match + + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assert_selects('div div', ['inner', 'data1']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assert_selects(selector, ['data1', 'main', 'inner', 'footer']) + + + def test_limit(self): + self.assert_selects('html div', ['main'], limit=1) + self.assert_selects('html body div', ['inner', 'main'], limit=2) + self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'], + limit=10) + + def test_tag_no_match(self): + assert len(self.soup.select('del')) == 0 + + def test_invalid_tag(self): + with pytest.raises(SelectorSyntaxError): + self.soup.select('tag%t') + + def test_select_dashed_tag_ids(self): + self.assert_selects('custom-dashed-tag', ['dash1', 'dash2']) + + def test_select_dashed_by_id(self): + dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') + assert dashed[0].name == 'custom-dashed-tag' + assert dashed[0]['id'] == 'dash2' + + def test_dashed_tag_text(self): + assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.' 
+ + def test_select_dashed_matches_find_all(self): + assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag') + + def test_header_tags(self): + self.assert_select_multiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + assert len(els) == 1 + assert els[0].name == 'p' + assert els[0]['class'] == ['onep'] + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + assert len(els) == 0 + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assert_selects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + assert len(els) == 0 + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + assert len(els) == 3 + for el in els: + assert el.name == 'p' + assert els[1]['class'] == ['onep'] + assert not els[0].has_attr('class') + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + assert len(self.soup.select(selector)) == 0 + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assert_selects(selector, ['pmulti']) + + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assert_selects(selector, ['pmulti']) + + def test_child_selector(self): + self.assert_selects('.s1 > a', ['s1a1', 's1a2']) + self.assert_selects('.s1 > a span', ['s1a2s1']) + + def test_child_selector_id(self): + self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1']) + + def test_attribute_equals(self): + self.assert_select_multiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + 
('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assert_select_multiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assert_select_multiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', ['me', 'main']), + ('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ('div[data-tag^="dashed"]', ['data1']) + ) + + def test_attribute_endswith(self): + self.assert_select_multiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), + ('div[id$="1"]', ['data1']), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assert_select_multiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From 
test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), + ('div[id*="1"]', ['data1']), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ('div[data-tag*="edval"]', ['data1']) + ) + + def test_attribute_exact_or_hypen(self): + self.assert_select_multiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assert_select_multiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ('div[data-tag]', ['data1']) + ) + + def test_quoted_space_in_selector_name(self): + html = """<div style="display: wrong">nope</div> + <div style="display: right">yes</div> + """ + soup = BeautifulSoup(html, 'html.parser') + [chosen] = soup.select('div[style="display: right"]') + assert "yes" == chosen.string + + def test_unsupported_pseudoclass(self): + with pytest.raises(NotImplementedError): + self.soup.select("a:no-such-pseudoclass") + + with pytest.raises(SelectorSyntaxError): + self.soup.select("a:nth-of-type(a)") + + def test_nth_of_type(self): + # Try to select first paragraph + els = self.soup.select('div#inner p:nth-of-type(1)') + assert len(els) == 1 + assert els[0].string == 'Some text' + + # Try to select third paragraph + els = self.soup.select('div#inner p:nth-of-type(3)') + assert len(els) == 1 + assert els[0].string == 'Another' + + # Try to select (non-existent!) 
fourth paragraph + els = self.soup.select('div#inner p:nth-of-type(4)') + assert len(els) == 0 + + # Zero will select no tags. + els = self.soup.select('div p:nth-of-type(0)') + assert len(els) == 0 + + def test_nth_of_type_direct_descendant(self): + els = self.soup.select('div#inner > p:nth-of-type(1)') + assert len(els) == 1 + assert els[0].string == 'Some text' + + def test_id_child_selector_nth_of_type(self): + self.assert_selects('#inner > p:nth-of-type(2)', ['p1']) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. + inner = self.soup.find("div", id="main") + selected = inner.select("div") + # The <div id="inner"> tag was selected. The <div id="footer"> + # tag was not. + self.assert_selects_ids(selected, ['inner', 'data1']) + + def test_overspecified_child_id(self): + self.assert_selects(".fancy #inner", ['inner']) + self.assert_selects(".normal #inner", []) + + def test_adjacent_sibling_selector(self): + self.assert_selects('#p1 + h2', ['header2']) + self.assert_selects('#p1 + h2 + p', ['pmulti']) + self.assert_selects('#p1 + #header2 + .class1', ['pmulti']) + assert [] == self.soup.select('#p1 + p') + + def test_general_sibling_selector(self): + self.assert_selects('#p1 ~ h2', ['header2', 'header3']) + self.assert_selects('#p1 ~ #header2', ['header2']) + self.assert_selects('#p1 ~ h2 + a', ['me']) + self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me']) + assert [] == self.soup.select('#inner ~ h2') + + def test_dangling_combinator(self): + with pytest.raises(SelectorSyntaxError): + self.soup.select('h1 >') + + def test_sibling_combinator_wont_select_same_tag_twice(self): + self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) + + # Test the selector grouping operator (the comma) + def test_multiple_select(self): + self.assert_selects('x, y', ['xid', 'yid']) + + def test_multiple_select_with_no_space(self): + self.assert_selects('x,y', ['xid', 'yid']) + + def 
test_multiple_select_with_more_space(self): + self.assert_selects('x, y', ['xid', 'yid']) + + def test_multiple_select_duplicated(self): + self.assert_selects('x, x', ['xid']) + + def test_multiple_select_sibling(self): + self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) + + def test_multiple_select_tag_and_direct_descendant(self): + self.assert_selects('x, y > z', ['xid', 'zidb']) + + def test_multiple_select_direct_descendant_and_tags(self): + self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) + + def test_multiple_select_indirect_descendant(self): + self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) + + def test_invalid_multiple_select(self): + with pytest.raises(SelectorSyntaxError): + self.soup.select(',x, y') + with pytest.raises(SelectorSyntaxError): + self.soup.select('x,,y') + + def test_multiple_select_attrs(self): + self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) + + def test_multiple_select_ids(self): + self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) + + def test_multiple_select_nested(self): + self.assert_selects('body > div > x, y > z', ['xid', 'zidb']) + + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' + soup = BeautifulSoup(markup, 'html.parser') + selected = soup.select(".c1, .c2") + assert 3 == len(selected) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. + for element in soup.find_all(class_=['c1', 'c2']): + assert element in selected + + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." 
+ + def setup_method(self): + self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" +"http://www.w3.org/TR/REC-html40/transitional.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title>Beautiful Soup: We called him Tortoise because he taught us.</title> +<link rev="made" href="mailto:leonardr@segfault.org"> +<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> +<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> +<meta name="author" content="Leonard Richardson"> +</head> +<body> +<a href="foo">foo</a> +<a href="foo"><b>bar</b></a> +</body> +</html>""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + assert loaded.__class__ == BeautifulSoup + assert loaded.decode() == self.tree.decode() + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + assert copied.decode() == self.tree.decode() + + def test_copy_preserves_encoding(self): + soup = BeautifulSoup(b'<p> </p>', 'html.parser') + encoding = soup.original_encoding + copy = soup.__copy__() + assert "<p> </p>" == str(copy) + assert encoding == copy.original_encoding + + def test_copy_preserves_builder_information(self): + + tag = self.soup('<p></p>').p + + # Simulate a tag obtained from a source file. + tag.sourceline = 10 + tag.sourcepos = 33 + + copied = tag.__copy__() + + # The TreeBuilder object is no longer availble, but information + # obtained from it gets copied over to the new Tag object. 
+ assert tag.sourceline == copied.sourceline + assert tag.sourcepos == copied.sourcepos + assert tag.can_be_empty_element == copied.can_be_empty_element + assert tag.cdata_list_attributes == copied.cdata_list_attributes + assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + assert loaded.decode() == soup.decode() + + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "<b>Foo<a></a></b><b>Bar</b>" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + assert s1 == s2 + assert None == s2.parent + assert None == s2.next_element + assert None != s1.next_sibling + assert None == s2.next_sibling + assert None == s2.previous_element + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "<b><!--Foo--></b>" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + assert s1 == s2 + assert isinstance(s2, Comment) + + def test_copy_entire_soup(self): + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + assert soup == soup_copy + + def test_copy_tag_copies_contents(self): + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + assert str(div) == str(div_copy) + assert div == div_copy + + # But they're not the same object. + assert div is not div_copy + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. 
+ assert None == div_copy.parent + assert None == div_copy.previous_element + assert None == div_copy.find(string='Bar').next_element + assert None != div.find(string='Bar').next_element + diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 0233e38..67845cc 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -13,26 +13,20 @@ from bs4 import ( BeautifulStoneSoup, GuessedAtParserWarning, MarkupResemblesLocatorWarning, + dammit, ) from bs4.builder import ( + builder_registry, TreeBuilder, ParserRejectedMarkup, ) from bs4.element import ( - CharsetMetaAttributeValue, Comment, - ContentMetaAttributeValue, SoupStrainer, - NamespacedAttribute, Tag, NavigableString, - ) - -import bs4.dammit -from bs4.dammit import ( - EntitySubstitution, - UnicodeDammit, ) + from . import ( default_builder, SoupTest, @@ -45,7 +39,7 @@ try: LXML_PRESENT = True except ImportError as e: LXML_PRESENT = False - + PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): @@ -344,106 +338,50 @@ class TestSelectiveParsing(SoupTest): soup = self.soup(markup, parse_only=strainer) assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>" - -class TestEntitySubstitution(object): - """Standalone tests of the EntitySubstitution class.""" - def setup_method(self): - self.sub = EntitySubstitution - - def test_simple_html_substitution(self): - # Unicode characters corresponding to named HTML entites - # are substituted, and no others. - s = "foo\u2200\N{SNOWMAN}\u00f5bar" - assert self.sub.substitute_html(s) == "foo∀\N{SNOWMAN}õbar" - - def test_smart_quote_substitution(self): - # MS smart quotes are a common source of frustration, so we - # give them a special test. - quotes = b"\x91\x92foo\x93\x94" - dammit = UnicodeDammit(quotes) - assert self.sub.substitute_html(dammit.markup) == "‘’foo“”" - - def test_html5_entity(self): - # Some HTML5 entities correspond to single- or multi-character - # Unicode sequences. 
- - for entity, u in ( - # A few spot checks of our ability to recognize - # special character sequences and convert them - # to named entities. - ('⊧', '\u22a7'), - ('𝔑', '\U0001d511'), - ('≧̸', '\u2267\u0338'), - ('¬', '\xac'), - ('⫬', '\u2aec'), - - # We _could_ convert | to &verbarr;, but we don't, because - # | is an ASCII character. - ('|' '|'), - - # Similarly for the fj ligature, which we could convert to - # fj, but we don't. - ("fj", "fj"), - - # We do convert _these_ ASCII characters to HTML entities, - # because that's required to generate valid HTML. - ('>', '>'), - ('<', '<'), - ('&', '&'), - ): - template = '3 %s 4' - raw = template % u - with_entities = template % entity - assert self.sub.substitute_html(raw) == with_entities - - def test_html5_entity_with_variation_selector(self): - # Some HTML5 entities correspond either to a single-character - # Unicode sequence _or_ to the same character plus U+FE00, - # VARIATION SELECTOR 1. We can handle this. - data = "fjords \u2294 penguins" - markup = "fjords ⊔ penguins" - assert self.sub.substitute_html(data) == markup - - data = "fjords \u2294\ufe00 penguins" - markup = "fjords ⊔︀ penguins" - assert self.sub.substitute_html(data) == markup - def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): - s = 'Welcome to "my bar"' - assert self.sub.substitute_xml(s, False) == s - - def test_xml_attribute_quoting_normally_uses_double_quotes(self): - assert self.sub.substitute_xml("Welcome", True) == '"Welcome"' - assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"' - - def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): - s = 'Welcome to "my bar"' - assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'" - - def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): - s = 'Welcome to "Bob\'s Bar"' - assert self.sub.substitute_xml(s, True) == '"Welcome to "Bob\'s Bar""' 
- - def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): - quoted = 'Welcome to "Bob\'s Bar"' - assert self.sub.substitute_xml(quoted) == quoted - - def test_xml_quoting_handles_angle_brackets(self): - assert self.sub.substitute_xml("foo<bar>") == "foo<bar>" - - def test_xml_quoting_handles_ampersands(self): - assert self.sub.substitute_xml("AT&T") == "AT&T" - - def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): - assert self.sub.substitute_xml("ÁT&T") == "&Aacute;T&T" - - def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): - assert self.sub.substitute_xml_containing_entities("ÁT&T") == "ÁT&T" - - def test_quotes_not_html_substituted(self): - """There's no need to do this except inside attribute values.""" - text = 'Bob\'s "bar"' - assert self.sub.substitute_html(text) == text +class TestNewTag(SoupTest): + """Test the BeautifulSoup.new_tag() method.""" + def test_new_tag(self): + soup = self.soup("") + new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"}) + assert isinstance(new_tag, Tag) + assert "foo" == new_tag.name + assert dict(bar="baz", name="a name") == new_tag.attrs + assert None == new_tag.parent + + def test_tag_inherits_self_closing_rules_from_builder(self): + if LXML_PRESENT: + xml_soup = BeautifulSoup("", "lxml-xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the <br> and <p> tag are empty-element, just because + # they have no contents. + assert b"<br/>" == xml_br.encode() + assert b"<p/>" == xml_p.encode() + + html_soup = BeautifulSoup("", "html.parser") + html_br = html_soup.new_tag("br") + html_p = html_soup.new_tag("p") + + # The HTML builder users HTML's rules about which tags are + # empty-element tags, and the new tags reflect these rules. 
+ assert b"<br/>" == html_br.encode() + assert b"<p></p>" == html_p.encode() + +class TestNewString(SoupTest): + """Test the BeautifulSoup.new_string() method.""" + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + assert "foo" == s + assert isinstance(s, NavigableString) + + def test_new_string_can_create_navigablestring_subclass(self): + soup = self.soup("") + s = soup.new_string("foo", Comment) + assert "foo" == s + assert isinstance(s, Comment) class TestEncodingConversion(SoupTest): @@ -459,13 +397,13 @@ class TestEncodingConversion(SoupTest): def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. - chardet = bs4.dammit.chardet_dammit + chardet = dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. - bs4.dammit.chardet_dammit = noop + dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() @@ -474,7 +412,7 @@ class TestEncodingConversion(SoupTest): assert soup_from_ascii.original_encoding.lower() == "utf-8" finally: logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet + dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. 
The original_encoding attribute @@ -504,57 +442,4 @@ class TestEncodingConversion(SoupTest): assert self.soup(markup).div.encode("utf8") == markup.encode("utf8") -class TestNamedspacedAttribute(SoupTest): - - def test_name_may_be_none_or_missing(self): - a = NamespacedAttribute("xmlns", None) - assert a == "xmlns" - - a = NamespacedAttribute("xmlns", "") - assert a == "xmlns" - - a = NamespacedAttribute("xmlns") - assert a == "xmlns" - - def test_namespace_may_be_none_or_missing(self): - a = NamespacedAttribute(None, "tag") - assert a == "tag" - - a = NamespacedAttribute("", "tag") - assert a == "tag" - - def test_attribute_is_equivalent_to_colon_separated_string(self): - a = NamespacedAttribute("a", "b") - assert "a:b" == a - - def test_attributes_are_equivalent_if_prefix_and_name_identical(self): - a = NamespacedAttribute("a", "b", "c") - b = NamespacedAttribute("a", "b", "c") - assert a == b - - # The actual namespace is not considered. - c = NamespacedAttribute("a", "b", None) - assert a == c - - # But name and prefix are important. - d = NamespacedAttribute("a", "z", "c") - assert a != d - - e = NamespacedAttribute("z", "b", "c") - assert a != e - - -class TestAttributeValueWithCharsetSubstitution(object): - - def test_content_meta_attribute_value(self): - value = CharsetMetaAttributeValue("euc-jp") - assert "euc-jp" == value - assert "euc-jp" == value.original_value - assert "utf8" == value.encode("utf8") - - def test_content_meta_attribute_value(self): - value = ContentMetaAttributeValue("text/html; charset=euc-jp") - assert "text/html; charset=euc-jp" == value - assert "text/html; charset=euc-jp" == value.original_value - assert "text/html; charset=utf8" == value.encode("utf8") diff --git a/bs4/tests/test_tag.py b/bs4/tests/test_tag.py new file mode 100644 index 0000000..7a6308a --- /dev/null +++ b/bs4/tests/test_tag.py @@ -0,0 +1,169 @@ +import warnings +from bs4.element import ( + Comment, + NavigableString, +) +from . 
import SoupTest
+
+class TestTag(SoupTest):
+    """Test various methods of Tag which aren't so complicated they
+    need their own classes.
+    """
+
+    def test__should_pretty_print(self):
+        # Test the rules about when a tag should be pretty-printed.
+        tag = self.soup("").new_tag("a_tag")
+
+        # No list of whitespace-preserving tags -> pretty-print
+        tag._preserve_whitespace_tags = None
+        assert True == tag._should_pretty_print(0)
+
+        # List exists but tag is not on the list -> pretty-print
+        tag.preserve_whitespace_tags = ["some_other_tag"]
+        assert True == tag._should_pretty_print(1)
+
+        # Indent level is None -> don't pretty-print
+        assert False == tag._should_pretty_print(None)
+
+        # Tag is on the whitespace-preserving list -> don't pretty-print
+        tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
+        assert False == tag._should_pretty_print(1)
+
+    def test_len(self):
+        """The length of a Tag is its number of children."""
+        soup = self.soup("<top>1<b>2</b>3</top>")
+
+        # The BeautifulSoup object itself contains one element: the
+        # <top> tag.
+        assert len(soup.contents) == 1
+        assert len(soup) == 1
+
+        # The <top> tag contains three elements: the text node "1", the
+        # <b> tag, and the text node "3".
+        assert len(soup.top) == 3
+        assert len(soup.top.contents) == 3
+
+    def test_member_access_invokes_find(self):
+        """Accessing a Python member .foo invokes find('foo')"""
+        soup = self.soup('<b><i></i></b>')
+        assert soup.b == soup.find('b')
+        assert soup.b.i == soup.find('b').find('i')
+        assert soup.a == None
+
+    def test_deprecated_member_access(self):
+        soup = self.soup('<b><i></i></b>')
+        with warnings.catch_warnings(record=True) as w:
+            tag = soup.bTag
+        assert soup.b == tag
+        assert '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")' == str(w[0].message)
+
+    def test_has_attr(self):
+        """has_attr() checks for the presence of an attribute.
+
+        Please note: has_attr() is different from
+        __in__. 
has_attr() checks the tag's attributes and __in__
+        checks the tag's children.
+        """
+        soup = self.soup("<foo attr='bar'>")
+        assert soup.foo.has_attr('attr')
+        assert not soup.foo.has_attr('attr2')
+
+    def test_attributes_come_out_in_alphabetical_order(self):
+        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
+        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
+
+    def test_string(self):
+        # A Tag that contains only a text node makes that node
+        # available as .string.
+        soup = self.soup("<b>foo</b>")
+        assert soup.b.string == 'foo'
+
+    def test_empty_tag_has_no_string(self):
+        # A Tag with no children has no .string.
+        soup = self.soup("<b></b>")
+        assert soup.b.string == None
+
+    def test_tag_with_multiple_children_has_no_string(self):
+        # A Tag with multiple children has no .string.
+        soup = self.soup("<a>foo<b></b><b></b></b>")
+        assert soup.b.string == None
+
+        soup = self.soup("<a>foo<b></b>bar</b>")
+        assert soup.b.string == None
+
+        # Even if all the children are strings, due to trickery,
+        # it won't work--but this would be a good optimization.
+        soup = self.soup("<a>foo</b>")
+        soup.a.insert(1, "bar")
+        assert soup.a.string == None
+
+    def test_tag_with_recursive_string_has_string(self):
+        # A Tag with a single child which has a .string inherits that
+        # .string. 
+ soup = self.soup("<a><b>foo</b></a>") + assert soup.a.string == "foo" + assert soup.string == "foo" + + def test_lack_of_string(self): + """Only a Tag containing a single text node has a .string.""" + soup = self.soup("<b>f<i>e</i>o</b>") + assert soup.b.string is None + + soup = self.soup("<b></b>") + assert soup.b.string is None + + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("<a>a<b>r</b> <r> t </r></a>") + assert soup.a.text == "ar t " + assert soup.a.get_text(strip=True) == "art" + assert soup.a.get_text(",") == "a,r, , t " + assert soup.a.get_text(",", strip=True) == "a,r,t" + + def test_get_text_ignores_special_string_containers(self): + soup = self.soup("foo<!--IGNORE-->bar") + assert soup.get_text() == "foobar" + + assert soup.get_text(types=(NavigableString, Comment)) == "fooIGNOREbar" + assert soup.get_text(types=None) == "fooIGNOREbar" + + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + assert soup.get_text() == "foobar" + + def test_all_strings_ignores_special_string_containers(self): + soup = self.soup("foo<!--IGNORE-->bar") + assert ['foo', 'bar'] == list(soup.strings) + + soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") + assert ['foo', 'bar'] == list(soup.strings) + + def test_string_methods_inside_special_string_container_tags(self): + # Strings inside tags like <script> are generally ignored by + # methods like get_text, because they're not what humans + # consider 'text'. But if you call get_text on the <script> + # tag itself, those strings _are_ considered to be 'text', + # because there's nothing else you might be looking for. 
+ + style = self.soup("<div>a<style>Some CSS</style></div>") + template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>") + script = self.soup("<div>a<script><!--a comment-->Some text</script></div>") + + assert style.div.get_text() == "a" + assert list(style.div.strings) == ["a"] + assert style.div.style.get_text() == "Some CSS" + assert list(style.div.style.strings) == ['Some CSS'] + + # The comment is not picked up here. That's because it was + # parsed into a Comment object, which is not considered + # interesting by template.strings. + assert template.div.get_text() == "a" + assert list(template.div.strings) == ["a"] + assert template.div.template.get_text() == "Templated text." + assert list(template.div.template.strings) == ["Templated ", "text", "."] + + # The comment is included here, because it didn't get parsed + # into a Comment object--it's part of the Script string. + assert script.div.get_text() == "a" + assert list(script.div.strings) == ["a"] + assert script.div.script.get_text() == "<!--a comment-->Some text" + assert list(script.div.script.strings) == ['<!--a comment-->Some text'] diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 776bcca..43fe284 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -10,8 +10,6 @@ methods tested here. """ from pdb import set_trace -import copy -import pickle import pytest import re import warnings @@ -21,7 +19,6 @@ from bs4.builder import ( HTMLParserTreeBuilder, ) from bs4.element import ( - PY3K, CData, Comment, Declaration, @@ -38,33 +35,8 @@ from . import ( SoupTest, skipIf, ) -from soupsieve import SelectorSyntaxError -XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) -LXML_PRESENT = (builder_registry.lookup("lxml") is not None) - -class TreeTest(SoupTest): - - def assert_selects(self, tags, should_match): - """Make sure that the given tags have the correct text. 
- - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - assert [tag.string for tag in tags] == should_match - - def assert_selects_ids(self, tags, should_match): - """Make sure that the given tags have the correct IDs. - - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - assert [tag['id'] for tag in tags] == should_match - - -class TestFind(TreeTest): +class TestFind(SoupTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all @@ -95,7 +67,7 @@ class TestFind(TreeTest): soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>") assert 2 == len(soup.find_all('a')) -class TestFindAll(TreeTest): +class TestFindAll(SoupTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): @@ -151,7 +123,7 @@ class TestFindAll(TreeTest): assert hasattr(result, "source") -class TestFindAllBasicNamespaces(TreeTest): +class TestFindAllBasicNamespaces(SoupTest): def test_find_by_namespaced_name(self): soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') @@ -159,7 +131,7 @@ class TestFindAllBasicNamespaces(TreeTest): assert "a" == soup.find(attrs= { "svg:fill" : "red" }).name -class TestFindAllByName(TreeTest): +class TestFindAllByName(SoupTest): """Test ways of finding tags by tag name.""" def setup_method(self): @@ -237,7 +209,7 @@ class TestFindAllByName(TreeTest): assert '3' == r4.string -class TestFindAllByAttribute(TreeTest): +class TestFindAllByAttribute(SoupTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by @@ -417,7 +389,7 @@ class TestFindAllByAttribute(TreeTest): assert [] == soup.find_all(id=1, text="bar") -class TestSmooth(TreeTest): +class TestSmooth(SoupTest): """Test Tag.smooth.""" def test_smooth(self): @@ -461,7 +433,7 @@ class 
TestSmooth(TreeTest): assert 'Comment 2' == div.contents[2] -class TestIndex(TreeTest): +class TestIndex(SoupTest): """Test Tag.index""" def test_index(self): tree = self.soup("""<div> @@ -480,7 +452,7 @@ class TestIndex(TreeTest): tree.index(1) -class TestParentOperations(TreeTest): +class TestParentOperations(SoupTest): """Test navigation and searching through an element's parents.""" def setup_method(self): @@ -530,7 +502,7 @@ class TestParentOperations(TreeTest): assert parents, ['bottom', 'middle' == 'top'] -class ProximityTest(TreeTest): +class ProximityTest(SoupTest): def setup_method(self): self.tree = self.soup( @@ -593,9 +565,7 @@ class TestPreviousOperations(ProximityTest): def test_previous_of_root_is_none(self): # The document root is outside the next/previous chain. - # XXX This is broken! - #assert self.tree.previous_element == None - pass + assert self.tree.previous_element == None def test_find_all_previous(self): # The <b> tag containing the "Three" node is the predecessor @@ -628,7 +598,7 @@ class TestPreviousOperations(ProximityTest): assert html.name == "html" -class SiblingTest(TreeTest): +class SiblingTest(SoupTest): def setup_method(self): markup = '''<html> @@ -739,72 +709,6 @@ class TestPreviousSibling(SiblingTest): assert start.find_previous_sibling(text="nonesuch") == None -class TestTag(SoupTest): - - # Test various methods of Tag. - - def test__should_pretty_print(self): - # Test the rules about when a tag should be pretty-printed. 
- tag = self.soup("").new_tag("a_tag") - - # No list of whitespace-preserving tags -> pretty-print - tag._preserve_whitespace_tags = None - assert True == tag._should_pretty_print(0) - - # List exists but tag is not on the list -> pretty-print - tag.preserve_whitespace_tags = ["some_other_tag"] - assert True == tag._should_pretty_print(1) - - # Indent level is None -> don't pretty-print - assert False == tag._should_pretty_print(None) - - # Tag is on the whitespace-preserving list -> don't pretty-print - tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"] - assert False == tag._should_pretty_print(1) - - -class TestTagCreation(SoupTest): - """Test the ability to create new tags.""" - def test_new_tag(self): - soup = self.soup("") - new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"}) - assert isinstance(new_tag, Tag) - assert "foo" == new_tag.name - assert dict(bar="baz", name="a name") == new_tag.attrs - assert None == new_tag.parent - - def test_tag_inherits_self_closing_rules_from_builder(self): - if XML_BUILDER_PRESENT: - xml_soup = BeautifulSoup("", "lxml-xml") - xml_br = xml_soup.new_tag("br") - xml_p = xml_soup.new_tag("p") - - # Both the <br> and <p> tag are empty-element, just because - # they have no contents. - assert b"<br/>" == xml_br.encode() - assert b"<p/>" == xml_p.encode() - - html_soup = BeautifulSoup("", "html.parser") - html_br = html_soup.new_tag("br") - html_p = html_soup.new_tag("p") - - # The HTML builder users HTML's rules about which tags are - # empty-element tags, and the new tags reflect these rules. 
- assert b"<br/>" == html_br.encode() - assert b"<p></p>" == html_p.encode() - - def test_new_string_creates_navigablestring(self): - soup = self.soup("") - s = soup.new_string("foo") - assert "foo" == s - assert isinstance(s, NavigableString) - - def test_new_string_can_create_navigablestring_subclass(self): - soup = self.soup("") - s = soup.new_string("foo", Comment) - assert "foo" == s - assert isinstance(s, Comment) - class TestTreeModification(SoupTest): def test_attribute_modification(self): @@ -1364,149 +1268,6 @@ class TestTreeModification(SoupTest): soup.a.string = cdata assert isinstance(soup.a.string, CData) -class TestElementObjects(SoupTest): - """Test various features of element objects.""" - - def test_len(self): - """The length of an element is its number of children.""" - soup = self.soup("<top>1<b>2</b>3</top>") - - # The BeautifulSoup object itself contains one element: the - # <top> tag. - assert len(soup.contents) == 1 - assert len(soup) == 1 - - # The <top> tag contains three elements: the text node "1", the - # <b> tag, and the text node "3". - assert len(soup.top) == 3 - assert len(soup.top.contents) == 3 - - def test_member_access_invokes_find(self): - """Accessing a Python member .foo invokes find('foo')""" - soup = self.soup('<b><i></i></b>') - assert soup.b == soup.find('b') - assert soup.b.i == soup.find('b').find('i') - assert soup.a == None - - def test_deprecated_member_access(self): - soup = self.soup('<b><i></i></b>') - with warnings.catch_warnings(record=True) as w: - tag = soup.bTag - assert soup.b == tag - assert '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")' == str(w[0].message) - - def test_has_attr(self): - """has_attr() checks for the presence of an attribute. - - Please note note: has_attr() is different from - __in__. has_attr() checks the tag's attributes and __in__ - checks the tag's chidlren. 
- """ - soup = self.soup("<foo attr='bar'>") - assert soup.foo.has_attr('attr') - assert not soup.foo.has_attr('attr2') - - - def test_attributes_come_out_in_alphabetical_order(self): - markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' - self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') - - def test_string(self): - # A tag that contains only a text node makes that node - # available as .string. - soup = self.soup("<b>foo</b>") - assert soup.b.string == 'foo' - - def test_empty_tag_has_no_string(self): - # A tag with no children has no .stirng. - soup = self.soup("<b></b>") - assert soup.b.string == None - - def test_tag_with_multiple_children_has_no_string(self): - # A tag with no children has no .string. - soup = self.soup("<a>foo<b></b><b></b></b>") - assert soup.b.string == None - - soup = self.soup("<a>foo<b></b>bar</b>") - assert soup.b.string == None - - # Even if all the children are strings, due to trickery, - # it won't work--but this would be a good optimization. - soup = self.soup("<a>foo</b>") - soup.a.insert(1, "bar") - assert soup.a.string == None - - def test_tag_with_recursive_string_has_string(self): - # A tag with a single child which has a .string inherits that - # .string. 
- soup = self.soup("<a><b>foo</b></a>") - assert soup.a.string == "foo" - assert soup.string == "foo" - - def test_lack_of_string(self): - """Only a tag containing a single text node has a .string.""" - soup = self.soup("<b>f<i>e</i>o</b>") - assert soup.b.string is None - - soup = self.soup("<b></b>") - assert soup.b.string is None - - def test_all_text(self): - """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" - soup = self.soup("<a>a<b>r</b> <r> t </r></a>") - assert soup.a.text == "ar t " - assert soup.a.get_text(strip=True) == "art" - assert soup.a.get_text(",") == "a,r, , t " - assert soup.a.get_text(",", strip=True) == "a,r,t" - - def test_get_text_ignores_special_string_containers(self): - soup = self.soup("foo<!--IGNORE-->bar") - assert soup.get_text() == "foobar" - - assert soup.get_text(types=(NavigableString, Comment)) == "fooIGNOREbar" - assert soup.get_text(types=None) == "fooIGNOREbar" - - soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") - assert soup.get_text() == "foobar" - - def test_all_strings_ignores_special_string_containers(self): - soup = self.soup("foo<!--IGNORE-->bar") - assert ['foo', 'bar'] == list(soup.strings) - - soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar") - assert ['foo', 'bar'] == list(soup.strings) - - def test_string_methods_inside_special_string_container_tags(self): - # Strings inside tags like <script> are generally ignored by - # methods like get_text, because they're not what humans - # consider 'text'. But if you call get_text on the <script> - # tag itself, those strings _are_ considered to be 'text', - # because there's nothing else you might be looking for. 
- - style = self.soup("<div>a<style>Some CSS</style></div>") - template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>") - script = self.soup("<div>a<script><!--a comment-->Some text</script></div>") - - assert style.div.get_text() == "a" - assert list(style.div.strings) == ["a"] - assert style.div.style.get_text() == "Some CSS" - assert list(style.div.style.strings) == ['Some CSS'] - - # The comment is not picked up here. That's because it was - # parsed into a Comment object, which is not considered - # interesting by template.strings. - assert template.div.get_text() == "a" - assert list(template.div.strings) == ["a"] - assert template.div.template.get_text() == "Templated text." - assert list(template.div.template.strings) == ["Templated ", "text", "."] - - # The comment is included here, because it didn't get parsed - # into a Comment object--it's part of the Script string. - assert script.div.get_text() == "a" - assert list(script.div.strings) == ["a"] - assert script.div.script.get_text() == "<!--a comment-->Some text" - assert list(script.div.script.strings) == ['<!--a comment-->Some text'] - class TestCDAtaListAttributes(SoupTest): """Testing cdata-list attributes like 'class'. @@ -1549,730 +1310,3 @@ class TestCDAtaListAttributes(SoupTest): with pytest.raises(AttributeError): string.name = 'foo' -class TestPersistence(SoupTest): - "Testing features like pickle and deepcopy." 
- - def setup_method(self): - self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" -"http://www.w3.org/TR/REC-html40/transitional.dtd"> -<html> -<head> -<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> -<title>Beautiful Soup: We called him Tortoise because he taught us.</title> -<link rev="made" href="mailto:leonardr@segfault.org"> -<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> -<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> -<meta name="author" content="Leonard Richardson"> -</head> -<body> -<a href="foo">foo</a> -<a href="foo"><b>bar</b></a> -</body> -</html>""" - self.tree = self.soup(self.page) - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - dumped = pickle.dumps(self.tree, 2) - loaded = pickle.loads(dumped) - assert loaded.__class__ == BeautifulSoup - assert loaded.decode() == self.tree.decode() - - def test_deepcopy_identity(self): - # Making a deepcopy of a tree yields an identical tree. - copied = copy.deepcopy(self.tree) - assert copied.decode() == self.tree.decode() - - def test_copy_preserves_encoding(self): - soup = BeautifulSoup(b'<p> </p>', 'html.parser') - encoding = soup.original_encoding - copy = soup.__copy__() - assert "<p> </p>" == str(copy) - assert encoding == copy.original_encoding - - def test_copy_preserves_builder_information(self): - - tag = self.soup('<p></p>').p - - # Simulate a tag obtained from a source file. - tag.sourceline = 10 - tag.sourcepos = 33 - - copied = tag.__copy__() - - # The TreeBuilder object is no longer availble, but information - # obtained from it gets copied over to the new Tag object. 
- assert tag.sourceline == copied.sourceline - assert tag.sourcepos == copied.sourcepos - assert tag.can_be_empty_element == copied.can_be_empty_element - assert tag.cdata_list_attributes == copied.cdata_list_attributes - assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags - - def test_unicode_pickle(self): - # A tree containing Unicode characters can be pickled. - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) - loaded = pickle.loads(dumped) - assert loaded.decode() == soup.decode() - - def test_copy_navigablestring_is_not_attached_to_tree(self): - html = "<b>Foo<a></a></b><b>Bar</b>" - soup = self.soup(html) - s1 = soup.find(string="Foo") - s2 = copy.copy(s1) - assert s1 == s2 - assert None == s2.parent - assert None == s2.next_element - assert None != s1.next_sibling - assert None == s2.next_sibling - assert None == s2.previous_element - - def test_copy_navigablestring_subclass_has_same_type(self): - html = "<b><!--Foo--></b>" - soup = self.soup(html) - s1 = soup.string - s2 = copy.copy(s1) - assert s1 == s2 - assert isinstance(s2, Comment) - - def test_copy_entire_soup(self): - html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" - soup = self.soup(html) - soup_copy = copy.copy(soup) - assert soup == soup_copy - - def test_copy_tag_copies_contents(self): - html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" - soup = self.soup(html) - div = soup.div - div_copy = copy.copy(div) - - # The two tags look the same, and evaluate to equal. - assert str(div) == str(div_copy) - assert div == div_copy - - # But they're not the same object. - assert div is not div_copy - - # And they don't have the same relation to the parse tree. The - # copy is not associated with a parse tree at all. 
- assert None == div_copy.parent - assert None == div_copy.previous_element - assert None == div_copy.find(string='Bar').next_element - assert None != div.find(string='Bar').next_element - -class TestSubstitutions(SoupTest): - - def test_default_formatter_is_minimal(self): - markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - assert decoded == self.document_for( - "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - ) - - def test_formatter_html(self): - markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter="html") - assert decoded == self.document_for( - "<br/><b><<Sacré bleu!>></b>" - ) - - def test_formatter_html5(self): - markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter="html5") - assert decoded == self.document_for( - "<br><b><<Sacré bleu!>></b>" - ) - - def test_formatter_minimal(self): - markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - assert decoded == self.document_for( - "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - ) - - def test_formatter_null(self): - markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter=None) - # Neither the angle brackets nor the e-with-acute are converted. - # This is not valid HTML, but it's what the user wanted. 
- assert decoded == self.document_for( - "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - ) - - def test_formatter_custom(self): - markup = "<b><foo></b><b>bar</b><br/>" - soup = self.soup(markup) - decoded = soup.decode(formatter = lambda x: x.upper()) - # Instead of normal entity conversion code, the custom - # callable is called on every string. - assert decoded == self.document_for("<b><FOO></b><b>BAR</b><br/>") - - def test_formatter_is_run_on_attribute_values(self): - markup = '<a href="http://a.com?a=b&c=é">e</a>' - soup = self.soup(markup) - a = soup.a - - expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' - - assert expect_minimal == a.decode() - assert expect_minimal == a.decode(formatter="minimal") - - expect_html = '<a href="http://a.com?a=b&c=é">e</a>' - assert expect_html == a.decode(formatter="html") - - assert markup == a.decode(formatter=None) - expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' - assert expect_upper == a.decode(formatter=lambda x: x.upper()) - - def test_formatter_skips_script_tag_for_html_documents(self): - doc = """ - <script type="text/javascript"> - console.log("< < hey > > "); - </script> -""" - encoded = BeautifulSoup(doc, 'html.parser').encode() - assert b"< < hey > >" in encoded - - def test_formatter_skips_style_tag_for_html_documents(self): - doc = """ - <style type="text/css"> - console.log("< < hey > > "); - </style> -""" - encoded = BeautifulSoup(doc, 'html.parser').encode() - assert b"< < hey > >" in encoded - - def test_prettify_leaves_preformatted_text_alone(self): - soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>") - # Everything outside the <pre> tag is reformatted, but everything - # inside is left alone. 
- assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify() - - def test_prettify_accepts_formatter_function(self): - soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') - pretty = soup.prettify(formatter = lambda x: x.upper()) - assert "FOO" in pretty - - def test_prettify_outputs_unicode_by_default(self): - soup = self.soup("<a></a>") - assert str == type(soup.prettify()) - - def test_prettify_can_encode_data(self): - soup = self.soup("<a></a>") - assert bytes == type(soup.prettify("utf-8")) - - def test_html_entity_substitution_off_by_default(self): - markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" - soup = self.soup(markup) - encoded = soup.b.encode("utf-8") - assert encoded == markup.encode('utf-8') - - def test_encoding_substitution(self): - # Here's the <meta> tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('<meta content="text/html; charset=x-sjis" ' - 'http-equiv="Content-type"/>') - soup = self.soup(meta_tag) - - # Parse the document, and the charset apprears unchanged. - assert soup.meta['content'] == 'text/html; charset=x-sjis' - - # Encode the document into some encoding, and the encoding is - # substituted into the meta tag. - utf_8 = soup.encode("utf-8") - assert b"charset=utf-8" in utf_8 - - euc_jp = soup.encode("euc_jp") - assert b"charset=euc_jp" in euc_jp - - shift_jis = soup.encode("shift-jis") - assert b"charset=shift-jis" in shift_jis - - utf_16_u = soup.encode("utf-16").decode("utf-16") - assert "charset=utf-16" in utf_16_u - - def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): - markup = ('<head><meta content="text/html; charset=x-sjis" ' - 'http-equiv="Content-type"/></head><pre>foo</pre>') - - # Beautiful Soup used to try to rewrite the meta tag even if the - # meta tag got filtered out by the strainer. This test makes - # sure that doesn't happen. 
- strainer = SoupStrainer('pre') - soup = self.soup(markup, parse_only=strainer) - assert soup.contents[0].name == 'pre' - -class TestEncoding(SoupTest): - """Test the ability to encode objects into strings.""" - - def test_unicode_string_can_be_encoded(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8") - - def test_tag_containing_unicode_string_can_be_encoded(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert soup.b.encode("utf-8") == html.encode("utf-8") - - def test_encoding_substitutes_unrecognized_characters_by_default(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert soup.b.encode("ascii") == b"<b>☃</b>" - - def test_encoding_can_be_made_strict(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - with pytest.raises(UnicodeEncodeError): - soup.encode("ascii", errors="strict") - - def test_decode_contents(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert "\N{SNOWMAN}" == soup.b.decode_contents() - - def test_encode_contents(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents( - encoding="utf8" - ) - - def test_deprecated_renderContents(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents() - - def test_repr(self): - html = "<b>\N{SNOWMAN}</b>" - soup = self.soup(html) - if PY3K: - assert html == repr(soup) - else: - assert b'<b>\\u2603</b>' == repr(soup) - - -class TestSoupSelector(TreeTest): - - HTML = """ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" -"http://www.w3.org/TR/html4/strict.dtd"> -<html> -<head> -<title>The title</title> -<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> -</head> -<body> -<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> -<div id="main" class="fancy"> -<div id="inner"> -<h1 
id="header1">An H1</h1> -<p>Some text</p> -<p class="onep" id="p1">Some more text</p> -<h2 id="header2">An H2</h2> -<p class="class1 class2 class3" id="pmulti">Another</p> -<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> -<h2 id="header3">Another H2</h2> -<a id="me" href="http://simonwillison.net/" rel="me">me</a> -<span class="s1"> -<a href="#" id="s1a1">span1a1</a> -<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> -<span class="span2"> -<a href="#" id="s2a1">span2a1</a> -</span> -<span class="span3"></span> -<custom-dashed-tag class="dashed" id="dash2"/> -<div data-tag="dashedvalue" id="data1"/> -</span> -</div> -<x id="xid"> -<z id="zida"/> -<z id="zidab"/> -<z id="zidac"/> -</x> -<y id="yid"> -<z id="zidb"/> -</y> -<p lang="en" id="lang-en">English</p> -<p lang="en-gb" id="lang-en-gb">English UK</p> -<p lang="en-us" id="lang-en-us">English US</p> -<p lang="fr" id="lang-fr">French</p> -</div> - -<div id="footer"> -</div> -""" - - def setup_method(self): - self.soup = BeautifulSoup(self.HTML, 'html.parser') - - def assert_selects(self, selector, expected_ids, **kwargs): - el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] - el_ids.sort() - expected_ids.sort() - assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % ( - selector, ', '.join(expected_ids), ', '.join(el_ids) - ) - - assertSelect = assert_selects - - def assert_select_multiple(self, *tests): - for selector, expected_ids in tests: - self.assert_selects(selector, expected_ids) - - def test_one_tag_one(self): - els = self.soup.select('title') - assert len(els) == 1 - assert els[0].name == 'title' - assert els[0].contents == ['The title'] - - def test_one_tag_many(self): - els = self.soup.select('div') - assert len(els) == 4 - for div in els: - assert div.name == 'div' - - el = self.soup.select_one('div') - assert 'main' == el['id'] - - def test_select_one_returns_none_if_no_match(self): - match = self.soup.select_one('nonexistenttag') - 
assert None == match - - - def test_tag_in_tag_one(self): - els = self.soup.select('div div') - self.assert_selects('div div', ['inner', 'data1']) - - def test_tag_in_tag_many(self): - for selector in ('html div', 'html body div', 'body div'): - self.assert_selects(selector, ['data1', 'main', 'inner', 'footer']) - - - def test_limit(self): - self.assert_selects('html div', ['main'], limit=1) - self.assert_selects('html body div', ['inner', 'main'], limit=2) - self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'], - limit=10) - - def test_tag_no_match(self): - assert len(self.soup.select('del')) == 0 - - def test_invalid_tag(self): - with pytest.raises(SelectorSyntaxError): - self.soup.select('tag%t') - - def test_select_dashed_tag_ids(self): - self.assert_selects('custom-dashed-tag', ['dash1', 'dash2']) - - def test_select_dashed_by_id(self): - dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') - assert dashed[0].name == 'custom-dashed-tag' - assert dashed[0]['id'] == 'dash2' - - def test_dashed_tag_text(self): - assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.' 
- - def test_select_dashed_matches_find_all(self): - assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag') - - def test_header_tags(self): - self.assert_select_multiple( - ('h1', ['header1']), - ('h2', ['header2', 'header3']), - ) - - def test_class_one(self): - for selector in ('.onep', 'p.onep', 'html p.onep'): - els = self.soup.select(selector) - assert len(els) == 1 - assert els[0].name == 'p' - assert els[0]['class'] == ['onep'] - - def test_class_mismatched_tag(self): - els = self.soup.select('div.onep') - assert len(els) == 0 - - def test_one_id(self): - for selector in ('div#inner', '#inner', 'div div#inner'): - self.assert_selects(selector, ['inner']) - - def test_bad_id(self): - els = self.soup.select('#doesnotexist') - assert len(els) == 0 - - def test_items_in_id(self): - els = self.soup.select('div#inner p') - assert len(els) == 3 - for el in els: - assert el.name == 'p' - assert els[1]['class'] == ['onep'] - assert not els[0].has_attr('class') - - def test_a_bunch_of_emptys(self): - for selector in ('div#main del', 'div#main div.oops', 'div div#main'): - assert len(self.soup.select(selector)) == 0 - - def test_multi_class_support(self): - for selector in ('.class1', 'p.class1', '.class2', 'p.class2', - '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): - self.assert_selects(selector, ['pmulti']) - - def test_multi_class_selection(self): - for selector in ('.class1.class3', '.class3.class2', - '.class1.class2.class3'): - self.assert_selects(selector, ['pmulti']) - - def test_child_selector(self): - self.assert_selects('.s1 > a', ['s1a1', 's1a2']) - self.assert_selects('.s1 > a span', ['s1a2s1']) - - def test_child_selector_id(self): - self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1']) - - def test_attribute_equals(self): - self.assert_select_multiple( - ('p[class="onep"]', ['p1']), - ('p[id="p1"]', ['p1']), - ('[class="onep"]', ['p1']), - ('[id="p1"]', ['p1']), - ('link[rel="stylesheet"]', ['l1']), - 
('link[type="text/css"]', ['l1']), - ('link[href="blah.css"]', ['l1']), - ('link[href="no-blah.css"]', []), - ('[rel="stylesheet"]', ['l1']), - ('[type="text/css"]', ['l1']), - ('[href="blah.css"]', ['l1']), - ('[href="no-blah.css"]', []), - ('p[href="no-blah.css"]', []), - ('[href="no-blah.css"]', []), - ) - - def test_attribute_tilde(self): - self.assert_select_multiple( - ('p[class~="class1"]', ['pmulti']), - ('p[class~="class2"]', ['pmulti']), - ('p[class~="class3"]', ['pmulti']), - ('[class~="class1"]', ['pmulti']), - ('[class~="class2"]', ['pmulti']), - ('[class~="class3"]', ['pmulti']), - ('a[rel~="friend"]', ['bob']), - ('a[rel~="met"]', ['bob']), - ('[rel~="friend"]', ['bob']), - ('[rel~="met"]', ['bob']), - ) - - def test_attribute_startswith(self): - self.assert_select_multiple( - ('[rel^="style"]', ['l1']), - ('link[rel^="style"]', ['l1']), - ('notlink[rel^="notstyle"]', []), - ('[rel^="notstyle"]', []), - ('link[rel^="notstyle"]', []), - ('link[href^="bla"]', ['l1']), - ('a[href^="http://"]', ['bob', 'me']), - ('[href^="http://"]', ['bob', 'me']), - ('[id^="p"]', ['pmulti', 'p1']), - ('[id^="m"]', ['me', 'main']), - ('div[id^="m"]', ['main']), - ('a[id^="m"]', ['me']), - ('div[data-tag^="dashed"]', ['data1']) - ) - - def test_attribute_endswith(self): - self.assert_select_multiple( - ('[href$=".css"]', ['l1']), - ('link[href$=".css"]', ['l1']), - ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), - ('div[id$="1"]', ['data1']), - ('[id$="noending"]', []), - ) - - def test_attribute_contains(self): - self.assert_select_multiple( - # From test_attribute_startswith - ('[rel*="style"]', ['l1']), - ('link[rel*="style"]', ['l1']), - ('notlink[rel*="notstyle"]', []), - ('[rel*="notstyle"]', []), - ('link[rel*="notstyle"]', []), - ('link[href*="bla"]', ['l1']), - ('[href*="http://"]', ['bob', 'me']), - ('[id*="p"]', ['pmulti', 'p1']), - ('div[id*="m"]', ['main']), - ('a[id*="m"]', ['me']), - # From 
test_attribute_endswith - ('[href*=".css"]', ['l1']), - ('link[href*=".css"]', ['l1']), - ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), - ('div[id*="1"]', ['data1']), - ('[id*="noending"]', []), - # New for this test - ('[href*="."]', ['bob', 'me', 'l1']), - ('a[href*="."]', ['bob', 'me']), - ('link[href*="."]', ['l1']), - ('div[id*="n"]', ['main', 'inner']), - ('div[id*="nn"]', ['inner']), - ('div[data-tag*="edval"]', ['data1']) - ) - - def test_attribute_exact_or_hypen(self): - self.assert_select_multiple( - ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('p[lang|="fr"]', ['lang-fr']), - ('p[lang|="gb"]', []), - ) - - def test_attribute_exists(self): - self.assert_select_multiple( - ('[rel]', ['l1', 'bob', 'me']), - ('link[rel]', ['l1']), - ('a[rel]', ['bob', 'me']), - ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), - ('p[class]', ['p1', 'pmulti']), - ('[blah]', []), - ('p[blah]', []), - ('div[data-tag]', ['data1']) - ) - - def test_quoted_space_in_selector_name(self): - html = """<div style="display: wrong">nope</div> - <div style="display: right">yes</div> - """ - soup = BeautifulSoup(html, 'html.parser') - [chosen] = soup.select('div[style="display: right"]') - assert "yes" == chosen.string - - def test_unsupported_pseudoclass(self): - with pytest.raises(NotImplementedError): - self.soup.select("a:no-such-pseudoclass") - - with pytest.raises(SelectorSyntaxError): - self.soup.select("a:nth-of-type(a)") - - def test_nth_of_type(self): - # Try to select first paragraph - els = self.soup.select('div#inner p:nth-of-type(1)') - assert len(els) == 1 - assert els[0].string == 'Some text' - - # Try to select third paragraph - els = self.soup.select('div#inner p:nth-of-type(3)') - assert len(els) == 1 - assert els[0].string == 'Another' - - # Try to select (non-existent!) 
fourth paragraph - els = self.soup.select('div#inner p:nth-of-type(4)') - assert len(els) == 0 - - # Zero will select no tags. - els = self.soup.select('div p:nth-of-type(0)') - assert len(els) == 0 - - def test_nth_of_type_direct_descendant(self): - els = self.soup.select('div#inner > p:nth-of-type(1)') - assert len(els) == 1 - assert els[0].string == 'Some text' - - def test_id_child_selector_nth_of_type(self): - self.assert_selects('#inner > p:nth-of-type(2)', ['p1']) - - def test_select_on_element(self): - # Other tests operate on the tree; this operates on an element - # within the tree. - inner = self.soup.find("div", id="main") - selected = inner.select("div") - # The <div id="inner"> tag was selected. The <div id="footer"> - # tag was not. - self.assert_selects_ids(selected, ['inner', 'data1']) - - def test_overspecified_child_id(self): - self.assert_selects(".fancy #inner", ['inner']) - self.assert_selects(".normal #inner", []) - - def test_adjacent_sibling_selector(self): - self.assert_selects('#p1 + h2', ['header2']) - self.assert_selects('#p1 + h2 + p', ['pmulti']) - self.assert_selects('#p1 + #header2 + .class1', ['pmulti']) - assert [] == self.soup.select('#p1 + p') - - def test_general_sibling_selector(self): - self.assert_selects('#p1 ~ h2', ['header2', 'header3']) - self.assert_selects('#p1 ~ #header2', ['header2']) - self.assert_selects('#p1 ~ h2 + a', ['me']) - self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me']) - assert [] == self.soup.select('#inner ~ h2') - - def test_dangling_combinator(self): - with pytest.raises(SelectorSyntaxError): - self.soup.select('h1 >') - - def test_sibling_combinator_wont_select_same_tag_twice(self): - self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) - - # Test the selector grouping operator (the comma) - def test_multiple_select(self): - self.assert_selects('x, y', ['xid', 'yid']) - - def test_multiple_select_with_no_space(self): - self.assert_selects('x,y', ['xid', 'yid']) - - def 
test_multiple_select_with_more_space(self): - self.assert_selects('x, y', ['xid', 'yid']) - - def test_multiple_select_duplicated(self): - self.assert_selects('x, x', ['xid']) - - def test_multiple_select_sibling(self): - self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) - - def test_multiple_select_tag_and_direct_descendant(self): - self.assert_selects('x, y > z', ['xid', 'zidb']) - - def test_multiple_select_direct_descendant_and_tags(self): - self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) - - def test_multiple_select_indirect_descendant(self): - self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) - - def test_invalid_multiple_select(self): - with pytest.raises(SelectorSyntaxError): - self.soup.select(',x, y') - with pytest.raises(SelectorSyntaxError): - self.soup.select('x,,y') - - def test_multiple_select_attrs(self): - self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) - - def test_multiple_select_ids(self): - self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) - - def test_multiple_select_nested(self): - self.assert_selects('body > div > x, y > z', ['xid', 'zidb']) - - def test_select_duplicate_elements(self): - # When markup contains duplicate elements, a multiple select - # will find all of them. - markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' - soup = BeautifulSoup(markup, 'html.parser') - selected = soup.select(".c1, .c2") - assert 3 == len(selected) - - # Verify that find_all finds the same elements, though because - # of an implementation detail it finds them in a different - # order. - for element in soup.find_all(class_=['c1', 'c2']): - assert element in selected |