# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import unittest
from bs4.element import SoupStrainer
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import SoupTest
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via ``parse_only``."""

    def test_parse_with_soupstrainer(self):
        # Parse with a strainer for "b" and check that the encoded soup
        # holds just the <b> elements together with their contents.
        #
        # NOTE(review): the tags in this fixture are a reconstruction.
        # The text content ("No", "Yes", "Yes Yes") is fixed, but a
        # document without any <b> tags gives the strainer nothing to
        # match, so tag placement was chosen to make the expected output
        # reachable. Confirm against the upstream fixture.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias that newer unittest versions no longer provide.
        self.assertEqual(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The substitution helpers are invoked on the class object
        # itself; no instance is created.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others. U+2200 and U+00F5 have named
        # entities; the snowman does not and must pass through as-is.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test. The raw bytes go through
        # UnicodeDammit first, then through HTML entity substitution.
        quotes = "\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # With the second argument False the value comes back unchanged:
        # no surrounding quotes are added and inner quotes are kept.
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        # Values without double quotes are wrapped in double quotes,
        # whether or not they contain a single quote.
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        # A value containing only double quotes is wrapped in single
        # quotes so that nothing needs escaping.
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both kinds of quote present, the expected result is
        # wrapped in double quotes, so it is the inner double quotes
        # that appear as &quot; while the apostrophe stays literal
        # (note that this differs from what the method name suggests).
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        # Called without the quoting argument, quotes are left alone.
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        # Angle brackets in the input must come back as &lt; / &gt;.
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        # A bare ampersand is turned into &amp;.
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        # The ampersand that opens "&Aacute;" is left alone; only the
        # bare one between the two T's is escaped.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of Unicode, Dammit."""

    def test_smart_quotes_to_unicode(self):
        # By default the four MS smart-quote bytes become the matching
        # Unicode curly-quote characters.
        markup = "\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"\u2018\u2019\u201c\u201d")

    def test_smart_quotes_to_xml_entities(self):
        # With smart_quotes_to="xml" the same bytes are expected as
        # numeric character references.
        # NOTE(review): the hex digits are written in upper case here;
        # confirm this matches the case UnicodeDammit actually emits.
        markup = "\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "&#x2018;&#x2019;&#x201C;&#x201D;")

    def test_smart_quotes_to_html_entities(self):
        # With smart_quotes_to="html" the same bytes are expected as
        # named HTML entities.
        markup = "\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "&lsquo;&rsquo;&ldquo;&rdquo;")

    def test_detect_utf8(self):
        # The two bytes C3 A9 are the UTF-8 encoding of U+00E9.
        utf8 = "\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_convert_hebrew(self):
        # The caller suggests iso-8859-8; that encoding should be the
        # one reported, and the bytes decoded to Hebrew letters.
        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # This byte string must be detected as UTF-8 and must survive a
        # decode/encode round trip unchanged.
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding, 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # The data is UTF-8 even though iso-8859-8 is suggested; the
        # reported encoding is still expected to be UTF-8.
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested encoding names that are not real codecs must not
        # prevent detection of the actual encoding.
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding, 'utf-8')