From 749f01e2b664dcbf4f58dfbdcaa4d314f6e3b9ef Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 09:50:32 -0500 Subject: Added a test to verify that both lxml and html5lib convert entities to Unicode characters during parsing. --- tests/test_lxml.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index d16e8d9..e6e015b 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -114,14 +114,29 @@ class TestLXMLBuilder(SoupTest): soup = BeautifulSoup('' % javascript) self.assertEquals(soup.script.string, javascript) + def test_entities_converted_on_the_way_in(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<<sacré bleu!>>

" + expected = u"

<>

" + self.assertSoupEquals(text, expected) + + # Tests below this line need work. + + def test_entities_converted_on_the_way_out(self): + text = "

<<sacré bleu!>>

" + expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") + soup = BeautifulSoup(text) + str = soup.p.string + #self.assertEquals(str.encode("utf-8"), expected) + def test_foo(self): isolatin = """Sacr\xe9 bleu!""" soup = self.soup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) utf8 = utf8.replace("\xe9", "\xc3\xa9") - - print soup + #print soup class TestLXMLBuilderInvalidMarkup(SoupTest): -- cgit v1.2.3 From bb9d9c5dc0af0deefc1a77542c007b7040aa55bb Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 11:52:30 -0500 Subject: Ported some more tests demonstrating that entities are converted to Unicode characters on the way in. --- tests/test_lxml.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index e6e015b..455c953 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -114,13 +114,58 @@ class TestLXMLBuilder(SoupTest): soup = BeautifulSoup('' % javascript) self.assertEquals(soup.script.string, javascript) - def test_entities_converted_on_the_way_in(self): + def test_naked_ampersands(self): + # Ampersands are left alone. + text = "

AT&T

" + soup = self.soup(text) + self.assertEquals(soup.p.string, "AT&T") + + # Even if they're in attribute values. + invalid_url = 'foo' + soup = self.soup(invalid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = u"

<>

" self.assertSoupEquals(text, expected) + def test_entities_in_attribute_values_converted_during_parsing(self): + text = '' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '' + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '' + soup = self.soup(text) + self.assertEquals( + soup.x['t'], + u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") + + # This can cause valid HTML to become invalid. + valid_url = 'foo' + soup = self.soup(valid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = "

\x91Foo\x92

" + soup = self.soup(quote) + self.assertEquals( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEquals(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + # Tests below this line need work. def test_entities_converted_on_the_way_out(self): -- cgit v1.2.3 From 261f981dc4e62a41bb0e85285d1f40927b34e3d3 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 12:32:19 -0500 Subject: Added tests illustrating the different ways lxml and html5lib handle nested tables. --- tests/test_lxml.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 455c953..76bcd32 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -1,5 +1,7 @@ """Tests to ensure that the lxml tree builder generates good trees.""" +import re + from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder from beautifulsoup.element import Comment @@ -65,6 +67,50 @@ class TestLXMLBuilder(SoupTest): self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') + # This is a tag containing another
tag in one of its + # cells. + TABLE_MARKUP_1 = ('
' + '' + "') + + # This is the same as TABLE_MARKUP_1, but the nested table is + # floating freely rather than being inside a ') + + + def test_nested_tables(self): + # lxml closes the and
Here's another table:" + '' + '' + '
foo
cell. + TABLE_MARKUP_2 = ('' + '' + "" + '
Here's another table:
' + '' + '
foo
tags that weren't closed by + # TABLE_MARKUP. Unlike html5lib, it treats both bits of markup + # as nested tables. + self.assertSoupEquals( + self.TABLE_MARKUP_1, + '
' + '' + "
Here's another table:" + '' + '' + '
foo
' + '
') + + self.assertSoupEquals( + self.TABLE_MARKUP_2, + '' + '' + "" + '
Here's another table:
' + '' + '
foo
' + '') + + def test_collapsed_whitespace(self): """In most tags, whitespace is collapsed.""" self.assertSoupEquals("

", "

") -- cgit v1.2.3 From d444427275459c6be2dc255274831bae26eb5e04 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 13:13:50 -0500 Subject: Added more table tests. --- tests/test_lxml.py | 56 ++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 76bcd32..2af952f 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -76,40 +76,24 @@ class TestLXMLBuilder(SoupTest): 'foo' '') - # This is the same as TABLE_MARKUP_1, but the nested table is - # floating freely rather than being inside a cell. - TABLE_MARKUP_2 = ('' - '' - "" - '
Here's another table:
' - '' - '
foo
') + def test_correctly_nested_tables(self): + markup = ('' + '' + "') - - def test_nested_tables(self): - # lxml closes the and
Here's another table:" + '' + '' + '
foo
tags that weren't closed by - # TABLE_MARKUP. Unlike html5lib, it treats both bits of markup - # as nested tables. self.assertSoupEquals( - self.TABLE_MARKUP_1, - '
' - '' - "
Here's another table:" - '' - '' - '
foo
' + markup, + '
Here\'s another table:' + '
foo
' '
') self.assertSoupEquals( - self.TABLE_MARKUP_2, - '' - '' - "" - '
Here's another table:
' - '' - '
foo
' - '
') - + "" + "" + "
Foo
Bar
Baz
") def test_collapsed_whitespace(self): """In most tags, whitespace is collapsed.""" @@ -240,6 +224,20 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup at all. """ + def test_table_containing_bare_markup(self): + # Markup should be in table cells, not directly in the table. + self.assertSoupEquals("
Foo
") + + def test_incorrectly_nested_table(self): + # The second tag is floating in the tag + # rather than being inside a ') + def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. self.assertSoupEquals( -- cgit v1.2.3 From d89c8878ea86a2575c87e9fad8081cfcd81e0bcd Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 10 Feb 2011 16:41:10 -0500 Subject: Added some elementary doctype handling. --- tests/test_lxml.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tests/test_lxml.py') diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 2af952f..9a65f6a 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -198,6 +198,14 @@ class TestLXMLBuilder(SoupTest): # Tests below this line need work. + #def test_doctype(self): + # xml = 'foo

' + # self.assertSoupEquals(xml) + + + #def test_cdata(self): + # print self.soup("
") + def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") @@ -261,4 +269,7 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '
. + bad_markup = ('' + '' + "" + '
Here's another table:
' + '' + '
foo
', '
') + def test_doctype_in_body(self): + markup = "

onetwo

" + self.assertSoupEquals(markup) -- cgit v1.2.3