diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 12:32:19 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 12:32:19 -0500 |
commit | 261f981dc4e62a41bb0e85285d1f40927b34e3d3 (patch) | |
tree | 0c58d7bac3f18c43316a0f1e1651c5928e0c1f9f | |
parent | bb9d9c5dc0af0deefc1a77542c007b7040aa55bb (diff) |
Added tests illustrating the different ways lxml and html5lib handle nested tables.
-rw-r--r-- | tests/test_html5lib.py | 26 |
-rw-r--r-- | tests/test_lxml.py | 46 |
2 files changed, 72 insertions, 0 deletions
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3a4ee27..ef38f9f 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -19,6 +19,32 @@ class TestHTML5Builder(TestLXMLBuilder):
         self.assertSoupEquals(
             "A bare string", "A bare string")
 
+    def test_nested_tables(self):
+        # See TestLXMLBuilder for TABLE_MARKUP_1 and
+        # TABLE_MARKUP_2. They're both nested tables where the
+        # top-level <table> and <tr> aren't closed. In TABLE_MARKUP_1
+        # the second table is within a <td> tag. In
+        # TABLE_MARKUP_2, the second table is floating inside a <tr> tag.
+        #
+        # html5lib adds <tbody> tags to each table. It treats
+        # TABLE_MARKUP_1 as a nested table, and TABLE_MARKUP_2 as two
+        # different tables.
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_1,
+            '<table id="1"><tbody>'
+            "<tr><td>Here's another table:"
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            "</td></tr></tbody></table>"
+            )
+
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_2,
+            '<table id="1"><tbody>'
+            "<tr><td>Here's another table:</td></tr>"
+            '</tbody></table>'
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            )
+
     def test_collapsed_whitespace(self):
         """Whitespace is preserved even in tags that don't require it."""
         self.assertSoupEquals("<p> </p>")
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 455c953..76bcd32 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -1,5 +1,7 @@
 """Tests to ensure that the lxml tree builder generates good trees."""
 
+import re
+
 from beautifulsoup import BeautifulSoup
 from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
 from beautifulsoup.element import Comment
@@ -65,6 +67,50 @@ class TestLXMLBuilder(SoupTest):
         self.assertEqual(blockquote.p.b.string, 'Foo')
         self.assertEqual(blockquote.b.string, 'Foo')
 
+    # This is a <table> tag containing another <table> tag in one of its
+    # cells.
+    TABLE_MARKUP_1 = ('<table id="1">'
+                      '<tr>'
+                      "<td>Here's another table:"
+                      '<table id="2">'
+                      '<tr><td>foo</td></tr>'
+                      '</table></td>')
+
+    # This is the same as TABLE_MARKUP_1, but the nested table is
+    # floating freely rather than being inside a <td> cell.
+    TABLE_MARKUP_2 = ('<table id="1">'
+                      '<tr>'
+                      "<td>Here's another table:</td>"
+                      '<table id="2">'
+                      '<tr><td>foo</td></tr>'
+                      '</table></td>')
+
+
+    def test_nested_tables(self):
+        # lxml closes the <tr> and <table> tags that weren't closed by
+        # TABLE_MARKUP. Unlike html5lib, it treats both bits of markup
+        # as nested tables.
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_1,
+            '<table id="1">'
+            '<tr>'
+            "<td>Here's another table:"
+            '<table id="2">'
+            '<tr><td>foo</td></tr>'
+            '</table>'
+            '</td></tr></table>')
+
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_2,
+            '<table id="1">'
+            '<tr>'
+            "<td>Here's another table:</td>"
+            '<table id="2">'
+            '<tr><td>foo</td></tr>'
+            '</table>'
+            '</tr></table>')
+
+
     def test_collapsed_whitespace(self):
         """In most tags, whitespace is collapsed."""
         self.assertSoupEquals("<p> </p>", "<p> </p>")