From 749f01e2b664dcbf4f58dfbdcaa4d314f6e3b9ef Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 10 Feb 2011 09:50:32 -0500
Subject: Added a test to verify that both lxml and html5lib convert entities
 to Unicode characters during parsing.

---
 tests/test_lxml.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'tests/test_lxml.py')
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index d16e8d9..e6e015b 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -114,14 +114,29 @@ class TestLXMLBuilder(SoupTest):
         soup = BeautifulSoup('<script>%s</script>' % javascript)
         self.assertEquals(soup.script.string, javascript)
 
+    def test_entities_converted_on_the_way_in(self):
+        # Both XML and HTML entities are converted to Unicode characters
+        # during parsing.
+        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
+        expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
+        self.assertSoupEquals(text, expected)
+
+    # Tests below this line need work.
+
+    def test_entities_converted_on_the_way_out(self):
+        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
+        expected = u"&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;".encode("utf-8")
+        soup = BeautifulSoup(text)
+        p_string = soup.p.string  # not named "str": that would shadow the built-in
+        #self.assertEquals(p_string.encode("utf-8"), expected)
+
     def test_foo(self):
         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
         soup = self.soup(isolatin)
 
         utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
         utf8 = utf8.replace("\xe9", "\xc3\xa9")
-
-        print soup
+        #print soup
 
 
 class TestLXMLBuilderInvalidMarkup(SoupTest):
-- 
cgit v1.2.3


From bb9d9c5dc0af0deefc1a77542c007b7040aa55bb Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 10 Feb 2011 11:52:30 -0500
Subject: Ported some more tests demonstrating that entities are converted to
 Unicode characters on the way in.

---
 tests/test_lxml.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

(limited to 'tests/test_lxml.py')

diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index e6e015b..455c953 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -114,13 +114,58 @@ class TestLXMLBuilder(SoupTest):
         soup = BeautifulSoup('<script>%s</script>' % javascript)
         self.assertEquals(soup.script.string, javascript)
 
-    def test_entities_converted_on_the_way_in(self):
+    def test_naked_ampersands(self):
+        # Ampersands are left alone.
+        text = "<p>AT&T</p>"
+        soup = self.soup(text)
+        self.assertEquals(soup.p.string, "AT&T")
+
+        # Even if they're in attribute values.
+        invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        soup = self.soup(invalid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    def test_entities_in_strings_converted_during_parsing(self):
         # Both XML and HTML entities are converted to Unicode characters
         # during parsing.
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
         expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
         self.assertSoupEquals(text, expected)
 
+    def test_entities_in_attribute_values_converted_during_parsing(self):
+        text = '<x t="pi&#241ata">'  # NOTE(review): "&#241" has no ';' -- deliberate? confirm
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="pi&#xf1;ata">'
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="sacr&eacute; bleu">'
+        soup = self.soup(text)
+        self.assertEquals(
+            soup.x['t'],
+            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+        # This can cause valid HTML to become invalid.
+        valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
+        soup = self.soup(valid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    def test_smart_quotes_converted_on_the_way_in(self):
+        # Microsoft smart quotes are converted to Unicode characters during
+        # parsing.
+        quote = "<p>\x91Foo\x92</p>"
+        soup = self.soup(quote)
+        self.assertEquals(
+            soup.p.string,
+            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+    def test_non_breaking_spaces_converted_on_the_way_in(self):
+        soup = self.soup("<a>&nbsp;&nbsp;</a>")
+        self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+
     # Tests below this line need work.
 
     def test_entities_converted_on_the_way_out(self):
-- 
cgit v1.2.3


From 261f981dc4e62a41bb0e85285d1f40927b34e3d3 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 10 Feb 2011 12:32:19 -0500
Subject: Added tests illustrating the different ways lxml and html5lib handle
 nested tables.

---
 tests/test_lxml.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'tests/test_lxml.py')

diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 455c953..76bcd32 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -1,5 +1,7 @@
 """Tests to ensure that the lxml tree builder generates good trees."""
 
+import re
+
 from beautifulsoup import BeautifulSoup
 from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
 from beautifulsoup.element import Comment
@@ -65,6 +67,50 @@ class TestLXMLBuilder(SoupTest):
         self.assertEqual(blockquote.p.b.string, 'Foo')
         self.assertEqual(blockquote.b.string, 'Foo')
 
+    # This is a <table> tag containing another <table> tag in one of its
+    # cells.
+    TABLE_MARKUP_1 = ('<table id="1">'
+                     '<tr>'
+                     "<td>Here's another table:"
+                     '<table id="2">'
+                     '<tr><td>foo</td></tr>'
+                     '</table></td>')
+
+    # This is the same as TABLE_MARKUP_1, but the nested table is
+    # floating freely rather than being inside a <td> cell.
+    TABLE_MARKUP_2 = ('<table id="1">'
+                     '<tr>'
+                     "<td>Here's another table:</td>"
+                     '<table id="2">'
+                     '<tr><td>foo</td></tr>'
+                     '</table></td>')
+
+
+    def test_nested_tables(self):
+        # lxml closes the <tr> and <table> tags that weren't closed by
+        # TABLE_MARKUP. Unlike html5lib, it treats both bits of markup
+        # as nested tables.
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_1,
+            '<table id="1">'
+            '<tr>'
+            "<td>Here's another table:"
+            '<table id="2">'
+            '<tr><td>foo</td></tr>'
+            '</table>'
+            '</td></tr></table>')
+
+        self.assertSoupEquals(
+            self.TABLE_MARKUP_2,
+            '<table id="1">'
+            '<tr>'
+            "<td>Here's another table:</td>"
+            '<table id="2">'
+            '<tr><td>foo</td></tr>'
+            '</table>'
+            '</tr></table>')
+
+
     def test_collapsed_whitespace(self):
         """In most tags, whitespace is collapsed."""
         self.assertSoupEquals("<p>   </p>", "<p> </p>")
-- 
cgit v1.2.3


From d444427275459c6be2dc255274831bae26eb5e04 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 10 Feb 2011 13:13:50 -0500
Subject: Added more table tests.

---
 tests/test_lxml.py | 56 ++++++++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

(limited to 'tests/test_lxml.py')

diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 76bcd32..2af952f 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -76,40 +76,24 @@ class TestLXMLBuilder(SoupTest):
                      '<tr><td>foo</td></tr>'
                      '</table></td>')
 
-    # This is the same as TABLE_MARKUP_1, but the nested table is
-    # floating freely rather than being inside a <td> cell.
-    TABLE_MARKUP_2 = ('<table id="1">'
-                     '<tr>'
-                     "<td>Here's another table:</td>"
-                     '<table id="2">'
-                     '<tr><td>foo</td></tr>'
-                     '</table></td>')
+    def test_correctly_nested_tables(self):
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
 
-
-    def test_nested_tables(self):
-        # lxml closes the <tr> and <table> tags that weren't closed by
-        # TABLE_MARKUP. Unlike html5lib, it treats both bits of markup
-        # as nested tables.
         self.assertSoupEquals(
-            self.TABLE_MARKUP_1,
-            '<table id="1">'
-            '<tr>'
-            "<td>Here's another table:"
-            '<table id="2">'
-            '<tr><td>foo</td></tr>'
-            '</table>'
+            markup,
+            '<table id="1"><tr><td>Here\'s another table:'
+            '<table id="2"><tr><td>foo</td></tr></table>'
             '</td></tr></table>')
 
         self.assertSoupEquals(
-            self.TABLE_MARKUP_2,
-            '<table id="1">'
-            '<tr>'
-            "<td>Here's another table:</td>"
-            '<table id="2">'
-            '<tr><td>foo</td></tr>'
-            '</table>'
-            '</tr></table>')
-
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
 
     def test_collapsed_whitespace(self):
         """In most tags, whitespace is collapsed."""
@@ -240,6 +224,20 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
     markup at all.
     """
 
+    def test_table_containing_bare_markup(self):
+        # Markup should be in table cells, not directly in the table.
+        self.assertSoupEquals("<table><div>Foo</div></table>")
+
+    def test_incorrectly_nested_table(self):
+        # The second <table> tag is floating in the <tr> tag
+        # rather than being inside a <td>.
+        bad_markup = ('<table id="1">'
+                      '<tr>'
+                      "<td>Here's another table:</td>"
+                      '<table id="2">'
+                      '<tr><td>foo</td></tr>'
+                      '</table></td>')  # TODO: bad_markup is never asserted on; test passes vacuously
+
     def test_unclosed_block_level_elements(self):
         # Unclosed block-level elements should be closed.
         self.assertSoupEquals(
-- 
cgit v1.2.3


From d89c8878ea86a2575c87e9fad8081cfcd81e0bcd Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 10 Feb 2011 16:41:10 -0500
Subject: Added some elementary doctype handling.

---
 tests/test_lxml.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tests/test_lxml.py')

diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 2af952f..9a65f6a 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -198,6 +198,14 @@ class TestLXMLBuilder(SoupTest):
 
     # Tests below this line need work.
 
+    #def test_doctype(self):
+    #    xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>'
+    #    self.assertSoupEquals(xml)
+
+
+    #def test_cdata(self):
+    #    print self.soup("<div><![CDATA[foo]]></div>")
+
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
         expected = u"&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;".encode("utf-8")
@@ -261,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
             '<table><tr><table><tr id="nested">',
             '<table><tr><table><tr id="nested"></tr></table></tr></table>')
 
+    def test_doctype_in_body(self):
+        markup = "<p>one<!DOCTYPE foobar>two</p>"
+        self.assertSoupEquals(markup)
 
-- 
cgit v1.2.3