# -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" import unittest from bs4.element import SoupStrainer from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.testing import SoupTest class TestSelectiveParsing(SoupTest): def test_parse_with_soupstrainer(self): markup = "NoYesNoYes Yes" strainer = SoupStrainer("b") soup = self.soup(markup, parse_only=strainer) self.assertEquals(soup.encode(), "YesYes Yes") class TestEntitySubstitution(unittest.TestCase): """Standalone tests of the EntitySubstitution class.""" def setUp(self): self.sub = EntitySubstitution def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites # are substituted, and no others. s = u"foo\u2200\N{SNOWMAN}\u00f5bar" self.assertEquals(self.sub.substitute_html(s), u"foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we # give them a special test. quotes = "\x91\x92foo\x93\x94" dammit = UnicodeDammit(quotes) self.assertEquals(self.sub.substitute_html(dammit.markup), "‘’foo“”") def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): s = 'Welcome to "my bar"' self.assertEquals(self.sub.substitute_xml(s, False), s) def test_xml_attribute_quoting_normally_uses_double_quotes(self): self.assertEquals(self.sub.substitute_xml("Welcome", True), '"Welcome"') self.assertEquals(self.sub.substitute_xml("Bob's Bar", True), '"Bob\'s Bar"') def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): s = 'Welcome to "my bar"' self.assertEquals(self.sub.substitute_xml(s, True), "'Welcome to \"my bar\"'") def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): s = 'Welcome to "Bob\'s Bar"' self.assertEquals( self.sub.substitute_xml(s, True), '"Welcome to "Bob\'s Bar""') def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): quoted = 'Welcome to "Bob\'s Bar"' self.assertEquals(self.sub.substitute_xml(quoted), quoted) def test_xml_quoting_handles_angle_brackets(self): self.assertEquals( self.sub.substitute_xml("foo"), "foo<bar>") def test_xml_quoting_handles_ampersands(self): self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&T") def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): self.assertEquals( self.sub.substitute_xml("ÁT&T"), "ÁT&T") def test_quotes_not_html_substituted(self): """There's no need to do this except inside attribute values.""" text = 'Bob\'s "bar"' self.assertEquals(self.sub.substitute_html(text), text) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" def test_smart_quotes_to_unicode(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup) self.assertEquals( dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") def test_smart_quotes_to_xml_entities(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEquals( dammit.unicode_markup, "‘’“”") def test_smart_quotes_to_html_entities(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEquals( dammit.unicode_markup, "‘’“”") def test_detect_utf8(self): utf8 = "\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEquals(dammit.unicode_markup, u'\xe9') self.assertEquals(dammit.original_encoding, 'utf-8') def test_convert_hebrew(self): hebrew = "\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEquals(dammit.original_encoding, 'iso-8859-8') self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEquals(dammit.original_encoding, 'utf-8') self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEquals(dammit.original_encoding, 'utf-8') def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEquals(dammit.original_encoding, 'utf-8')