From 749f01e2b664dcbf4f58dfbdcaa4d314f6e3b9ef Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 09:50:32 -0500 Subject: Added a test to verify that both lxml and html5lib convert entities to Unicode characters during parsing. --- tests/test_lxml.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index d16e8d9..e6e015b 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -114,14 +114,29 @@ class TestLXMLBuilder(SoupTest): soup = BeautifulSoup('<script>%s</script>' % javascript) self.assertEquals(soup.script.string, javascript) + def test_entities_converted_on_the_way_in(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>

" + expected = u"

<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>

" + self.assertSoupEquals(text, expected) + + # Tests below this line need work. + + def test_entities_converted_on_the_way_out(self): + text = "

<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>

" + expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") + soup = BeautifulSoup(text) + str = soup.p.string + #self.assertEquals(str.encode("utf-8"), expected) + def test_foo(self): isolatin = """Sacr\xe9 bleu!""" soup = self.soup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) utf8 = utf8.replace("\xe9", "\xc3\xa9") - - print soup + #print soup class TestLXMLBuilderInvalidMarkup(SoupTest): -- cgit v1.2.3