diff options
Diffstat (limited to 'tests/test_soup.py')
-rw-r--r-- | tests/test_soup.py | 130 |
1 files changed, 0 insertions, 130 deletions
diff --git a/tests/test_soup.py b/tests/test_soup.py deleted file mode 100644 index 87d6f3b..0000000 --- a/tests/test_soup.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests of Beautiful Soup as a whole.""" - -import unittest -from bs4.element import SoupStrainer -from bs4.dammit import EntitySubstitution, UnicodeDammit -from bs4.testing import SoupTest - - -class TestSelectiveParsing(SoupTest): - - def test_parse_with_soupstrainer(self): - markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" - strainer = SoupStrainer("b") - soup = self.soup(markup, parse_only=strainer) - self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") - - -class TestEntitySubstitution(unittest.TestCase): - """Standalone tests of the EntitySubstitution class.""" - def setUp(self): - self.sub = EntitySubstitution - - def test_simple_html_substitution(self): - # Unicode characters corresponding to named HTML entites - # are substituted, and no others. - s = u"foo\u2200\N{SNOWMAN}\u00f5bar" - self.assertEquals(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") - - def test_smart_quote_substitution(self): - # MS smart quotes are a common source of frustration, so we - # give them a special test. - quotes = "\x91\x92foo\x93\x94" - dammit = UnicodeDammit(quotes) - self.assertEquals(self.sub.substitute_html(dammit.markup), - "‘’foo“”") - - def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): - s = 'Welcome to "my bar"' - self.assertEquals(self.sub.substitute_xml(s, False), s) - - def test_xml_attribute_quoting_normally_uses_double_quotes(self): - self.assertEquals(self.sub.substitute_xml("Welcome", True), - '"Welcome"') - self.assertEquals(self.sub.substitute_xml("Bob's Bar", True), - '"Bob\'s Bar"') - - def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): - s = 'Welcome to "my bar"' - self.assertEquals(self.sub.substitute_xml(s, True), - "'Welcome to \"my bar\"'") - - def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): - s = 'Welcome to "Bob\'s Bar"' - self.assertEquals( - self.sub.substitute_xml(s, True), - '"Welcome to "Bob\'s Bar""') - - def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): - quoted = 'Welcome to "Bob\'s Bar"' - self.assertEquals(self.sub.substitute_xml(quoted), quoted) - - def test_xml_quoting_handles_angle_brackets(self): - self.assertEquals( - self.sub.substitute_xml("foo<bar>"), - "foo<bar>") - - def test_xml_quoting_handles_ampersands(self): - self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&T") - - def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): - self.assertEquals( - self.sub.substitute_xml("ÁT&T"), - "ÁT&T") - - def test_quotes_not_html_substituted(self): - """There's no need to do this except inside attribute values.""" - text = 'Bob\'s "bar"' - self.assertEquals(self.sub.substitute_html(text), text) - -class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of Unicode, Dammit.""" - - def test_smart_quotes_to_unicode(self): - markup = "<foo>\x91\x92\x93\x94</foo>" - dammit = UnicodeDammit(markup) - self.assertEquals( - dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") - - def test_smart_quotes_to_xml_entities(self): - markup = "<foo>\x91\x92\x93\x94</foo>" - dammit = UnicodeDammit(markup, smart_quotes_to="xml") - self.assertEquals( - dammit.unicode_markup, "<foo>‘’“”</foo>") - - def test_smart_quotes_to_html_entities(self): - markup = "<foo>\x91\x92\x93\x94</foo>" - dammit = UnicodeDammit(markup, smart_quotes_to="html") - self.assertEquals( - dammit.unicode_markup, "<foo>‘’“”</foo>") - - def test_detect_utf8(self): - utf8 = "\xc3\xa9" - dammit = UnicodeDammit(utf8) - self.assertEquals(dammit.unicode_markup, u'\xe9') - self.assertEquals(dammit.original_encoding, 'utf-8') - - def test_convert_hebrew(self): - hebrew = "\xed\xe5\xec\xf9" - dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEquals(dammit.original_encoding, 'iso-8859-8') - self.assertEquals(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') - - def test_dont_see_smart_quotes_where_there_are_none(self): - utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" - dammit = UnicodeDammit(utf_8) - self.assertEquals(dammit.original_encoding, 'utf-8') - self.assertEquals(dammit.unicode_markup.encode("utf-8"), utf_8) - - def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") - dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEquals(dammit.original_encoding, 'utf-8') - - def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") - for bad_encoding in ['.utf8', '...', 'utF---16.!']: - dammit = UnicodeDammit(utf8_data, [bad_encoding]) - self.assertEquals(dammit.original_encoding, 'utf-8') |