diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-30 22:22:28 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-30 22:22:28 -0500 |
commit | 02b149019ac6c7b7791c63b5bbc312a6b1a0636c (patch) | |
tree | fd41c4fa0a2fade6b6c98aca581e5e72a32168fb | |
parent | f79871fb934eeb3ab220bcbf7d471dd9f6feca93 (diff) |
Stop pretending that the 'generic' builder test is different from the lxml test.
-rw-r--r-- | beautifulsoup/testing.py | 139 |
-rw-r--r-- | tests/test_html5lib.py | 13 |
-rw-r--r-- | tests/test_lxml.py | 152 |
3 files changed, 150 insertions, 154 deletions
diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py index 9d0fa3a..74937d9 100644 --- a/beautifulsoup/testing.py +++ b/beautifulsoup/testing.py @@ -32,144 +32,5 @@ class SoupTest(unittest.TestCase): -class BuilderSmokeTest(SoupTest): - """A generic smoke test for tree builders. - - Subclasses of this test ensure that all of Beautiful Soup's tree - builders generate more or less the same trees. It's okay for trees - to differ, especially when given invalid markup--just override the - appropriate test method to demonstrate how one tree builder - differs from others. - """ - - def test_bare_string(self): - # A bare string is turned into some kind of HTML document or - # fragment recognizable as the original string. - # - # In this case, lxml puts a <p> tag around the bare string. - self.assertSoupEquals( - "A bare string", "<p>A bare string</p>") - - def test_mixed_case_tags(self): - # Mixed-case tags are folded to lowercase. - self.assertSoupEquals( - "<a><B><Cd><EFG></efg></CD></b></A>", - "<a><b><cd><efg></efg></cd></b></a>") - - def test_self_closing(self): - # HTML's self-closing tags are recognized as such. - self.assertSoupEquals( - "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>") - - self.assertSoupEquals( - "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") - - def test_comment(self): - # Comments are represented as Comment objects. - markup = "<p>foo<!--foobar-->baz</p>" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEquals(comment.__class__, Comment) - - def test_nested_inline_elements(self): - # Inline tags can be nested indefinitely. 
- b_tag = "<b>Inside a B tag</b>" - self.assertSoupEquals(b_tag) - - nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" - self.assertSoupEquals(nested_b_tag) - - double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" - self.assertSoupEquals(nested_b_tag) - - def test_nested_block_level_elements(self): - soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') - blockquote = soup.blockquote - self.assertEqual(blockquote.p.b.string, 'Foo') - self.assertEqual(blockquote.b.string, 'Foo') - - def test_collapsed_whitespace(self): - """In most tags, whitespace is collapsed.""" - self.assertSoupEquals("<p> </p>", "<p> </p>") - - def test_preserved_whitespace_in_pre_and_textarea(self): - """In <pre> and <textarea> tags, whitespace is preserved.""" - self.assertSoupEquals("<pre> </pre>") - self.assertSoupEquals("<textarea> woo </textarea>") - - def test_single_quote_attribute_values_become_double_quotes(self): - self.assertSoupEquals("<foo attr='bar'></foo>", - '<foo attr="bar"></foo>') - - def test_attribute_values_with_nested_quotes_are_left_alone(self): - text = """<foo attr='bar "brawls" happen'>a</foo>""" - self.assertSoupEquals(text) - - def test_attribute_values_with_double_nested_quotes_get_quoted(self): - text = """<foo attr='bar "brawls" happen'>a</foo>""" - soup = self.soup(text) - soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' - self.assertSoupEquals( - soup.foo.decode(), - """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""") - - def test_ampersand_in_attribute_value_gets_quoted(self): - self.assertSoupEquals('<this is="really messed up & stuff"></this>', - '<this is="really messed up & stuff"></this>') - - def test_literal_in_textarea(self): - # Anything inside a <textarea> is supposed to be treated as - # the literal value of the field, (XXX citation needed). - # - # But, both lxml and html5lib do their best to parse the - # contents of a <textarea> as HTML. 
- text = '<textarea>Junk like <b> tags and <&<&</textarea>' - soup = BeautifulSoup(text) - self.assertEquals(len(soup.textarea.contents), 2) - self.assertEquals(soup.textarea.contents[0], u"Junk like ") - self.assertEquals(soup.textarea.contents[1].name, 'b') - self.assertEquals(soup.textarea.b.string, u" tags and ") - - def test_literal_in_script(self): - # The contents of a <script> tag are treated as a literal string, - # even if that string contains HTML. - javascript = 'if (i < 2) { alert("<b>foo</b>"); }' - soup = BeautifulSoup('<script>%s</script>' % javascript) - self.assertEquals(soup.script.string, javascript) - - -class BuilderInvalidMarkupSmokeTest(SoupTest): - """Tests of invalid markup. - - These are very likely to give different results for different tree - builders. It's not required that a tree builder handle invalid - markup at all. - """ - - def test_unclosed_block_level_elements(self): - # Unclosed block-level elements should be closed. - self.assertSoupEquals( - '<blockquote><p><b>Foo</blockquote><p>Bar', - '<blockquote><p><b>Foo</b></p></blockquote><p>Bar</p>') - - def test_fake_self_closing_tag(self): - # If a self-closing tag presents as a normal tag, the 'open' - # tag is treated as an instance of the self-closing tag and - # the 'close' tag is ignored. 
- self.assertSoupEquals( - "<item><link>http://foo.com/</link></item>", - "<item><link />http://foo.com/</item>") - - def test_boolean_attribute_with_no_value_gets_empty_value(self): - soup = self.soup("<table><td nowrap>foo</td></table>") - self.assertEquals(soup.table.td['nowrap'], '') - - def test_incorrectly_nested_tables(self): - self.assertSoupEquals( - '<table><tr><table><tr id="nested">', - '<table><tr><table><tr id="nested"></tr></table></tr></table>') - diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 417e87b..3a4ee27 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -1,11 +1,10 @@ from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder -from beautifulsoup.testing import ( - BuilderInvalidMarkupSmokeTest, - BuilderSmokeTest, -) +from test_lxml import ( + TestLXMLBuilder, + TestLXMLBuilderInvalidMarkup, + ) - -class TestHTML5Builder(BuilderSmokeTest): +class TestHTML5Builder(TestLXMLBuilder): """See `BuilderSmokeTest`.""" @property @@ -26,7 +25,7 @@ class TestHTML5Builder(BuilderSmokeTest): self.assertSoupEquals("<b> </b>") -class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): +class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @property diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 7fe6870..d16e8d9 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -1,13 +1,118 @@ """Tests to ensure that the lxml tree builder generates good trees.""" +from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder -from beautifulsoup.testing import ( - BuilderInvalidMarkupSmokeTest, - BuilderSmokeTest, -) +from beautifulsoup.element import Comment +from beautifulsoup.testing import SoupTest -class TestLXMLBuilder(BuilderSmokeTest): - """See `BuilderSmokeTest`.""" + +class TestLXMLBuilder(SoupTest): + """A smoke test for the LXML tree builders. 
+ + Subclass this to test some other tree builder. Subclasses of this + test ensure that all of Beautiful Soup's tree builders generate + more or less the same trees. It's okay for trees to differ, + especially when given invalid markup--just override the + appropriate test method to demonstrate how one tree builder + differs from the LXML builder. + """ + + def test_bare_string(self): + # A bare string is turned into some kind of HTML document or + # fragment recognizable as the original string. + # + # In this case, lxml puts a <p> tag around the bare string. + self.assertSoupEquals( + "A bare string", "<p>A bare string</p>") + + def test_mixed_case_tags(self): + # Mixed-case tags are folded to lowercase. + self.assertSoupEquals( + "<a><B><Cd><EFG></efg></CD></b></A>", + "<a><b><cd><efg></efg></cd></b></a>") + + def test_self_closing(self): + # HTML's self-closing tags are recognized as such. + self.assertSoupEquals( + "<p>A <meta> tag</p>", "<p>A <meta /> tag</p>") + + self.assertSoupEquals( + "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "<p>foo<!--foobar-->baz</p>" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEquals(comment.__class__, Comment) + + def test_nested_inline_elements(self): + # Inline tags can be nested indefinitely. 
+ b_tag = "<b>Inside a B tag</b>" + self.assertSoupEquals(b_tag) + + nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_collapsed_whitespace(self): + """In most tags, whitespace is collapsed.""" + self.assertSoupEquals("<p> </p>", "<p> </p>") + + def test_preserved_whitespace_in_pre_and_textarea(self): + """In <pre> and <textarea> tags, whitespace is preserved.""" + self.assertSoupEquals("<pre> </pre>") + self.assertSoupEquals("<textarea> woo </textarea>") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("<foo attr='bar'></foo>", + '<foo attr="bar"></foo>') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """<foo attr='bar "brawls" happen'>a</foo>""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """<foo attr='bar "brawls" happen'>a</foo>""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""") + + def test_ampersand_in_attribute_value_gets_quoted(self): + self.assertSoupEquals('<this is="really messed up & stuff"></this>', + '<this is="really messed up & stuff"></this>') + + def test_literal_in_textarea(self): + # Anything inside a <textarea> is supposed to be treated as + # the literal value of the field, (XXX citation needed). + # + # But, both lxml and html5lib do their best to parse the + # contents of a <textarea> as HTML. 
+ text = '<textarea>Junk like <b> tags and <&<&</textarea>' + soup = BeautifulSoup(text) + self.assertEquals(len(soup.textarea.contents), 2) + self.assertEquals(soup.textarea.contents[0], u"Junk like ") + self.assertEquals(soup.textarea.contents[1].name, 'b') + self.assertEquals(soup.textarea.b.string, u" tags and ") + + def test_literal_in_script(self): + # The contents of a <script> tag are treated as a literal string, + # even if that string contains HTML. + javascript = 'if (i < 2) { alert("<b>foo</b>"); }' + soup = BeautifulSoup('<script>%s</script>' % javascript) + self.assertEquals(soup.script.string, javascript) def test_foo(self): isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" @@ -19,6 +124,37 @@ class TestLXMLBuilder(BuilderSmokeTest): print soup -class TestLXMLBuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): - """See `BuilderInvalidMarkupSmokeTest`.""" +class TestLXMLBuilderInvalidMarkup(SoupTest): + """Tests of invalid markup for the LXML tree builder. + + Subclass this to test other builders. + + These are very likely to give different results for different tree + builders. It's not required that a tree builder handle invalid + markup at all. + """ + + def test_unclosed_block_level_elements(self): + # Unclosed block-level elements should be closed. + self.assertSoupEquals( + '<blockquote><p><b>Foo</blockquote><p>Bar', + '<blockquote><p><b>Foo</b></p></blockquote><p>Bar</p>') + + def test_fake_self_closing_tag(self): + # If a self-closing tag presents as a normal tag, the 'open' + # tag is treated as an instance of the self-closing tag and + # the 'close' tag is ignored. 
+ self.assertSoupEquals( + "<item><link>http://foo.com/</link></item>", + "<item><link />http://foo.com/</item>") + + def test_boolean_attribute_with_no_value_gets_empty_value(self): + soup = self.soup("<table><td nowrap>foo</td></table>") + self.assertEquals(soup.table.td['nowrap'], '') + + def test_incorrectly_nested_tables(self): + self.assertSoupEquals( + '<table><tr><table><tr id="nested">', + '<table><tr><table><tr id="nested"></tr></table></tr></table>') + |