Foo
Bar
Baz

Foo

Bar

Baz

" + soup = self.soup(markup) + data = soup.find(text="[CDATA[foo]]") + self.assertEquals(data.__class__, Comment) + + def test_nonsensical_declaration(self): + # Declarations that don't make any sense are turned into comments. + soup = self.soup('

') + self.assertEquals(str(soup), + ("" + "

")) + + soup = self.soup('

') + self.assertEquals(str(soup), + ("

" + "")) + + def test_whitespace_in_doctype(self): + # A declaration that has extra whitespace is turned into a comment. + soup = self.soup(( + '' + '

foo

')) + self.assertEquals( + str(soup), + ('' + '

foo

')) + + def test_incomplete_declaration(self): + # An incomplete declaration is treated as a comment. + markup = 'ac' + self.assertSoupEquals(markup, "ac") + + # Let's spell that out a little more explicitly. + soup = self.soup(markup) + str1, comment, str2 = soup.body.contents + self.assertEquals(str1, 'a') + self.assertEquals(comment.__class__, Comment) + self.assertEquals(comment, 'b a') + # 'Foo' becomes a comment that appears before the HTML. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'Foo') + + self.assertEquals(self.find(text="a") == "a") + + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """baz""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo,

') + # The declaration becomes a comment. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, ' Foo ') + self.assertEquals(soup.p.string, 'a') + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('

a<Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "") + + def test_nonexistent_entity(self): + soup = self.soup("

foo&#bar;baz

") + self.assertEquals(soup.p.string, "foo&#bar;baz") + + # Compare a real entity. + soup = self.soup("

foodbaz

") + self.assertEquals(soup.p.string, "foodbaz") + + def test_entity_out_of_range(self): + # An entity that's out of range will be converted to + # REPLACEMENT CHARACTER. + soup = self.soup("

�

") + self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}") + + soup = self.soup("

�

") + self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}") + + +class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-8 (a + # Hebrew encoding) to UTF-8. + soup = self.soup(self.HEBREW_DOCUMENT, + from_encoding="iso-8859-8") + self.assertEquals(soup.original_encoding, 'iso8859-8') + self.assertEquals( + soup.encode('utf-8'), + self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) -- cgit v1.2.3