diff options
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- | bs4/tests/test_soup.py | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py new file mode 100644 index 0000000..3c5d742 --- /dev/null +++ b/bs4/tests/test_soup.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +"""Tests of Beautiful Soup as a whole.""" + +import unittest +from bs4.element import SoupStrainer +from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.testing import SoupTest + + +class TestSelectiveParsing(SoupTest): + + def test_parse_with_soupstrainer(self): + markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" + strainer = SoupStrainer("b") + soup = self.soup(markup, parse_only=strainer) + self.assertEquals(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>") + + +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEquals(self.sub.substitute_html(s), + u"foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. + quotes = b"\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + self.assertEquals(self.sub.substitute_html(dammit.markup), + "‘’foo“”") + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, False), s) + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + self.assertEquals(self.sub.substitute_xml("Welcome", True), + '"Welcome"') + self.assertEquals(self.sub.substitute_xml("Bob's Bar", True), + '"Bob\'s Bar"') + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, True), + "'Welcome to \"my bar\"'") + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + self.assertEquals( + self.sub.substitute_xml(s, True), + '"Welcome to "Bob\'s Bar""') + + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + self.assertEquals(self.sub.substitute_xml(quoted), quoted) + + def test_xml_quoting_handles_angle_brackets(self): + self.assertEquals( + self.sub.substitute_xml("foo<bar>"), + "foo<bar>") + + def test_xml_quoting_handles_ampersands(self): + self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&T") + + def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): + self.assertEquals( + self.sub.substitute_xml("ÁT&T"), + "ÁT&T") + + def test_quotes_not_html_substituted(self): + """There's no need to do this except inside attribute values.""" + text = 'Bob\'s "bar"' + self.assertEquals(self.sub.substitute_html(text), text) + +class TestUnicodeDammit(unittest.TestCase): + """Standalone tests of Unicode, Dammit.""" + + def test_smart_quotes_to_unicode(self): + markup = b"<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup) + self.assertEquals( + dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") + + def test_smart_quotes_to_xml_entities(self): + markup = b"<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEquals( + dammit.unicode_markup, "<foo>‘’“”</foo>") + + def test_smart_quotes_to_html_entities(self): + markup = b"<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smart_quotes_to="html") + self.assertEquals( + dammit.unicode_markup, "<foo>‘’“”</foo>") + + def test_detect_utf8(self): + utf8 = b"\xc3\xa9" + dammit = UnicodeDammit(utf8) + self.assertEquals(dammit.unicode_markup, u'\xe9') + self.assertEquals(dammit.original_encoding, 'utf-8') + + def test_convert_hebrew(self): + hebrew = b"\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEquals(dammit.original_encoding, 'iso-8859-8') + self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + + def test_dont_see_smart_quotes_where_there_are_none(self): + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + dammit = UnicodeDammit(utf_8) + self.assertEquals(dammit.original_encoding, 'utf-8') + self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8) + + def test_ignore_inappropriate_codecs(self): + utf8_data = u"Räksmörgås".encode("utf-8") + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) + self.assertEquals(dammit.original_encoding, 'utf-8') + + def test_ignore_invalid_codecs(self): + utf8_data = u"Räksmörgås".encode("utf-8") + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + dammit = UnicodeDammit(utf8_data, [bad_encoding]) + self.assertEquals(dammit.original_encoding, 'utf-8') |