From b171ca04063136d2cc399c0335226e5016fbe6a1 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 7 Jul 2011 21:02:16 +0100 Subject: Small fix for test. --- bs4/tests/test_html5lib.py | 226 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 bs4/tests/test_html5lib.py (limited to 'bs4/tests/test_html5lib.py') diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py new file mode 100644 index 0000000..4d8dcc0 --- /dev/null +++ b/bs4/tests/test_html5lib.py @@ -0,0 +1,226 @@ +from bs4.builder import HTML5TreeBuilder +from bs4.element import Comment, SoupStrainer +from test_lxml import ( + TestLXMLBuilder, + TestLXMLBuilderInvalidMarkup, + TestLXMLBuilderEncodingConversion, + ) + +class TestHTML5Builder(TestLXMLBuilder): + """See `BuilderSmokeTest`.""" + + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_soupstrainer(self): + # The html5lib tree builder does not support SoupStrainers. + strainer = SoupStrainer("b") + markup = "

A bold statement.

" + soup = self.soup(markup, + parse_only=strainer) + self.assertEquals( + soup.decode(), self.document_for(markup)) + + def test_bare_string(self): + # A bare string is turned into some kind of HTML document or + # fragment recognizable as the original string. + # + # In this case, lxml puts a

tag around the bare string. + self.assertSoupEquals( + "A bare string", "A bare string") + + def test_correctly_nested_tables(self): + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_literal_in_textarea(self): + markup = '' + soup = self.soup(markup) + self.assertEquals( + soup.textarea.contents, ["Junk like tags and <&<&"]) + + def test_collapsed_whitespace(self): + """Whitespace is preserved even in tags that don't require it.""" + self.assertSoupEquals("

") + self.assertSoupEquals(" ") + + def test_cdata_where_its_ok(self): + # In html5lib 0.9.0, all CDATA sections are converted into + # comments. In a later version (unreleased as of this + # writing), CDATA sections in tags like and will + # be preserved. BUT, I'm not sure how Beautiful Soup needs to + # adjust to transform this preservation into the construction + # of a BS CData object. + markup = "foobar" + + # Eventually we should be able to do a find(text="foobar") and + # get a CData object. + self.assertSoupEquals(markup, "") + + +class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): + """See `BuilderInvalidMarkupSmokeTest`.""" + + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_unclosed_block_level_elements(self): + # The unclosed tag is closed so that the block-level tag + # can be closed, and another tag is inserted after the + # next block-level tag begins. + self.assertSoupEquals( + '

Foo

Bar', + '

Foo

Bar

') + + def test_table_containing_bare_markup(self): + # Markup should be in table cells, not directly in the table. + self.assertSoupEquals("
Foo
", + "
Foo
") + + def test_incorrectly_nested_tables(self): + self.assertSoupEquals( + '
', + ('
' + '
')) + + def test_empty_element_tag_with_contents(self): + self.assertSoupEquals("
foo
", "
foo
") + + def test_doctype_in_body(self): + markup = "

onetwo

" + self.assertSoupEquals(markup, "

onetwo

") + + def test_cdata_where_it_doesnt_belong(self): + # Random CDATA sections are converted into comments. + markup = "
" + soup = self.soup(markup) + data = soup.find(text="[CDATA[foo]]") + self.assertEquals(data.__class__, Comment) + + def test_nonsensical_declaration(self): + # Declarations that don't make any sense are turned into comments. + soup = self.soup('

a

') + self.assertEquals(str(soup), + ("" + "

a

")) + + soup = self.soup('

a

') + self.assertEquals(str(soup), + ("

a

" + "")) + + def test_whitespace_in_doctype(self): + # A declaration that has extra whitespace is turned into a comment. + soup = self.soup(( + '' + '

foo

')) + self.assertEquals( + str(soup), + ('' + '

foo

')) + + def test_incomplete_declaration(self): + # An incomplete declaration is treated as a comment. + markup = 'ac' + self.assertSoupEquals(markup, "ac") + + # Let's spell that out a little more explicitly. + soup = self.soup(markup) + str1, comment, str2 = soup.body.contents + self.assertEquals(str1, 'a') + self.assertEquals(comment.__class__, Comment) + self.assertEquals(comment, 'b a') + # 'Foo' becomes a comment that appears before the HTML. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, 'Foo') + + self.assertEquals(self.find(text="a") == "a") + + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """baz""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEquals(soup.a['href'], 'foo,

a

') + # The declaration becomes a comment. + comment = soup.contents[0] + self.assertTrue(isinstance(comment, Comment)) + self.assertEquals(comment, ' Foo ') + self.assertEquals(soup.p.string, 'a') + + def test_document_ends_with_incomplete_declaration(self): + soup = self.soup('

a<Hello>") + # Compare html5lib, which completes the entity. + self.assertEquals(soup.p.string, "") + + def test_nonexistent_entity(self): + soup = self.soup("

foo&#bar;baz

") + self.assertEquals(soup.p.string, "foo&#bar;baz") + + # Compare a real entity. + soup = self.soup("

foodbaz

") + self.assertEquals(soup.p.string, "foodbaz") + + def test_entity_out_of_range(self): + # An entity that's out of range will be converted to + # REPLACEMENT CHARACTER. + soup = self.soup("

") + self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}") + + soup = self.soup("

") + self.assertEquals(soup.p.string, u"\N{REPLACEMENT CHARACTER}") + + +class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-8 (a + # Hebrew encoding) to UTF-8. + soup = self.soup(self.HEBREW_DOCUMENT, + from_encoding="iso-8859-8") + self.assertEquals(soup.original_encoding, 'iso8859-8') + self.assertEquals( + soup.encode('utf-8'), + self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) -- cgit v1.2.3