diff options
-rw-r--r-- | beautifulsoup/testing.py | 20 | ++++++++++++++++++++
-rw-r--r-- | tests/test_tree.py | 47 | +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 67 insertions, 0 deletions
diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py index eea14f0..9d0fa3a 100644 --- a/beautifulsoup/testing.py +++ b/beautifulsoup/testing.py @@ -119,6 +119,26 @@ class BuilderSmokeTest(SoupTest): self.assertSoupEquals('<this is="really messed up & stuff"></this>', '<this is="really messed up & stuff"></this>') + def test_literal_in_textarea(self): + # Anything inside a <textarea> is supposed to be treated as + # the literal value of the field, (XXX citation needed). + # + # But, both lxml and html5lib do their best to parse the + # contents of a <textarea> as HTML. + text = '<textarea>Junk like <b> tags and <&<&</textarea>' + soup = BeautifulSoup(text) + self.assertEquals(len(soup.textarea.contents), 2) + self.assertEquals(soup.textarea.contents[0], u"Junk like ") + self.assertEquals(soup.textarea.contents[1].name, 'b') + self.assertEquals(soup.textarea.b.string, u" tags and ") + + def test_literal_in_script(self): + # The contents of a <script> tag are treated as a literal string, + # even if that string contains HTML. + javascript = 'if (i < 2) { alert("<b>foo</b>"); }' + soup = BeautifulSoup('<script>%s</script>' % javascript) + self.assertEquals(soup.script.string, javascript) + class BuilderInvalidMarkupSmokeTest(SoupTest): """Tests of invalid markup. diff --git a/tests/test_tree.py b/tests/test_tree.py index a3c4b3b..eac4e72 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -9,6 +9,8 @@ same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ +import copy +import cPickle as pickle import re from beautifulsoup import BeautifulSoup from beautifulsoup.element import SoupStrainer, Tag @@ -768,3 +770,48 @@ class TestElementObjects(SoupTest): soup = self.soup("<b></b>") self.assertFalse(soup.b.string) + + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." 
+ + def setUp(self): + super(TestPersistence, self).setUp() + self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" +"http://www.w3.org/TR/REC-html40/transitional.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title>Beautiful Soup: We called him Tortoise because he taught us.</title> +<link rev="made" href="mailto:leonardr@segfault.org"> +<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> +<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> +<meta name="author" content="Leonard Richardson"> +</head> +<body> +<a href="foo">foo</a> +<a href="foo"><b>bar</b></a> +</body> +</html>""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), self.tree.decode()) + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + self.assertEqual(copied.decode(), self.tree.decode()) + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = u"<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) |