diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 11:52:30 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-10 11:52:30 -0500 |
commit | bb9d9c5dc0af0deefc1a77542c007b7040aa55bb (patch) | |
tree | 1873ec97e3684c4676d1c62177b60e42aa4f1f2b | |
parent | 749f01e2b664dcbf4f58dfbdcaa4d314f6e3b9ef (diff) |
Ported some more tests demonstrating that entities are converted to Unicode characters on the way in.
-rw-r--r-- | TODO | 5 | ||||
-rw-r--r-- | tests/test_lxml.py | 47 | ||||
-rw-r--r-- | tests/test_tree.py | 6 |
3 files changed, 54 insertions, 4 deletions
```diff
@@ -2,6 +2,11 @@ html5lib has its own Unicode, Dammit-like system. Converting the input
 to Unicode should be up to the builder. The lxml builder would use
 Unicode, Dammit, and the html5lib builder would be a no-op.
 
+Bare ampersands should be converted to HTML entities upon output.
+
+It should also be possible to convert certain Unicode characters to
+HTML entities upon output.
+
 ---
 
 Here are some unit tests that fail with HTMLParser.
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index e6e015b..455c953 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -114,13 +114,58 @@ class TestLXMLBuilder(SoupTest):
         soup = BeautifulSoup('<script>%s</script>' % javascript)
         self.assertEquals(soup.script.string, javascript)
 
-    def test_entities_converted_on_the_way_in(self):
+    def test_naked_ampersands(self):
+        # Ampersands are left alone.
+        text = "<p>AT&T</p>"
+        soup = self.soup(text)
+        self.assertEquals(soup.p.string, "AT&T")
+
+        # Even if they're in attribute values.
+        invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        soup = self.soup(invalid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    def test_entities_in_strings_converted_during_parsing(self):
         # Both XML and HTML entities are converted to Unicode characters
         # during parsing.
         text = "<p><<sacré bleu!>></p>"
         expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
         self.assertSoupEquals(text, expected)
 
+    def test_entities_in_attribute_values_converted_during_parsing(self):
+        text = '<x t="piñata">'
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="piñata">'
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="sacré bleu">'
+        soup = self.soup(text)
+        self.assertEquals(
+            soup.x['t'],
+            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+        # This can cause valid HTML to become invalid.
+        valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        soup = self.soup(valid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    def test_smart_quotes_converted_on_the_way_in(self):
+        # Microsoft smart quotes are converted to Unicode characters during
+        # parsing.
+        quote = "<p>\x91Foo\x92</p>"
+        soup = self.soup(quote)
+        self.assertEquals(
+            soup.p.string,
+            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+    def test_non_breaking_spaces_converted_on_the_way_in(self):
+        soup = self.soup("<a> </a>")
+        self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+
     # Tests below this line need work.
 
     def test_entities_converted_on_the_way_out(self):
diff --git a/tests/test_tree.py b/tests/test_tree.py
index ed29d76..367489e 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -818,7 +818,7 @@ class TestPersistence(SoupTest):
 
 
 class TestEncoding(SoupTest):
-    """Test the ability to encode strings."""
+    """Test the ability to encode objects into strings."""
 
     def test_unicode_string_can_be_encoded(self):
         html = u"<b>\N{SNOWMAN}</b>"
@@ -829,5 +829,5 @@ class TestEncoding(SoupTest):
     def test_tag_containing_unicode_string_can_be_encoded(self):
         html = u"<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
-        self.assertEquals(soup.b.encode("utf-8"),
-                          html.encode("utf-8"))
+        self.assertEquals(
+            soup.b.encode("utf-8"), html.encode("utf-8"))
```