From bb9d9c5dc0af0deefc1a77542c007b7040aa55bb Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 11:52:30 -0500 Subject: Ported some more tests demonstrating that entities are converted to Unicode characters on the way in. --- tests/test_lxml.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index e6e015b..455c953 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -114,13 +114,58 @@ class TestLXMLBuilder(SoupTest): soup = BeautifulSoup('' % javascript) self.assertEquals(soup.script.string, javascript) - def test_entities_converted_on_the_way_in(self): + def test_naked_ampersands(self): + # Ampersands are left alone. + text = "

<p>AT&T</p>

" + soup = self.soup(text) + self.assertEquals(soup.p.string, "AT&T") + + # Even if they're in attribute values. + invalid_url = 'foo' + soup = self.soup(invalid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>

" expected = u"

<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>

" self.assertSoupEquals(text, expected) + def test_entities_in_attribute_values_converted_during_parsing(self): + text = '' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '' + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '' + soup = self.soup(text) + self.assertEquals( + soup.x['t'], + u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") + + # This can cause valid HTML to become invalid. + valid_url = 'foo' + soup = self.soup(valid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = "

<p>\x91Foo\x92</p>

" + soup = self.soup(quote) + self.assertEquals( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + # Tests below this line need work. def test_entities_converted_on_the_way_out(self): -- cgit v1.2.3