From ab7ed77ab3560f6d574d577befc7a1f593e45327 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Mon, 20 Feb 2012 11:43:46 -0500 Subject: Changd the class structure so that the default parser test class uses html.parser. --- bs4/tests/test_html5lib.py | 108 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 10 deletions(-) (limited to 'bs4/tests/test_html5lib.py') diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index f1f3727..3f00d52 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -4,18 +4,14 @@ try: except ImportError, e: HTML5LIB_PRESENT = False from bs4.element import Comment, SoupStrainer -from test_lxml import ( - TestLXMLBuilder, - TestLXMLBuilderInvalidMarkup, - TestLXMLBuilderEncodingConversion, - ) +import test_htmlparser import unittest from bs4.testing import skipIf @skipIf( not HTML5LIB_PRESENT, "html5lib seems not to be present, not testing its tree builder.") -class TestHTML5Builder(TestLXMLBuilder): +class TestHTML5Builder(test_htmlparser.TestHTMLParserTreeBuilder): """See `BuilderSmokeTest`.""" @property @@ -35,7 +31,7 @@ class TestHTML5Builder(TestLXMLBuilder): # A bare string is turned into some kind of HTML document or # fragment recognizable as the original string. # - # In this case, lxml puts a

<p> tag around the bare string. + # In this case, html5lib puts a <p>

tag around the bare string. self.assertSoupEquals( "A bare string", "A bare string") @@ -82,10 +78,35 @@ class TestHTML5Builder(TestLXMLBuilder): # get a CData object. self.assertSoupEquals(markup, "") + def test_entities_in_attribute_values_converted_during_parsing(self): + + # The numeric entity is recognized even without the closing + # semicolon. + text = '' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEqual(soup.x['t'], expected) + + def test_naked_ampersands(self): + # Ampersands are not treated as entities, unlike in html.parser. + text = "

<p>AT&T</p>

" + soup = self.soup(text) + self.assertEqual(soup.p.string, "AT&T") + + def test_namespaced_system_doctype(self): + # Test a namespaced doctype with a system id. + self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') + + @skipIf( not HTML5LIB_PRESENT, "html5lib seems not to be present, not testing it on invalid markup.") -class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): +class TestHTML5BuilderInvalidMarkup( + test_htmlparser.TestHTMLParserTreeBuilderInvalidMarkup): """See `BuilderInvalidMarkupSmokeTest`.""" @property @@ -100,6 +121,29 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): '

Foo

Bar', '

Foo

Bar

') + def test_attribute_value_never_got_closed(self): + markup = ' and blah and blah") + + def test_attribute_value_was_closed_by_subsequent_tag(self): + markup = """baz""" + soup = self.soup(markup) + # The string between the first and second quotes was interpreted + # as the value of the 'href' attribute. + self.assertEqual(soup.a['href'], 'foo,

a

') + # The declaration is ignored altogether. + self.assertEqual(soup.encode(), b"

a

") + def test_table_containing_bare_markup(self): # Markup should be in table cells, not directly in the table. self.assertSoupEquals("
Foo
", @@ -246,11 +290,55 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): soup = self.soup("

") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") + def test_incomplete_declaration(self): + self.assertSoupEquals('ac', 'ac') + + def test_nonsensical_declaration(self): + soup = self.soup('

a

') + self.assertEquals( + soup.decode(), + "

a

") + + def test_unquoted_attribute_value(self): + soup = self.soup('
') + self.assertEqual(soup.a['style'], '{height:21px;}') + + def test_boolean_attribute_with_no_value(self): + soup = self.soup("
foo
") + self.assertEqual(soup.table.td['nowrap'], '') + + def test_cdata_where_it_doesnt_belong(self): + #CDATA sections are ignored. + markup = "
" + self.assertSoupEquals(markup, "
") + + def test_empty_element_tag_with_contents(self): + self.assertSoupEquals("
foo
", "
foo
") + + def test_fake_self_closing_tag(self): + # If a self-closing tag presents as a normal tag, the 'open' + # tag is treated as an instance of the self-closing tag and + # the 'close' tag is ignored. + self.assertSoupEquals( + "http://foo.com/", + "http://foo.com/") + + def test_paragraphs_containing_block_display_elements(self): + markup = self.soup("

this is the definition:" + "

first case
") + # The

tag is closed before the

tag begins. + self.assertEqual(markup.p.contents, ["this is the definition:"]) + + def test_multiple_values_for_the_same_attribute(self): + markup = '' + self.assertSoupEquals(markup, '') + @skipIf( not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing encoding conversion.") -class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): + "html5lib seems not to be present, not testing it on encoding conversion.") +class TestHTML5LibEncodingConversion( + test_htmlparser.TestHTMLParserTreeBuilderEncodingConversion): @property def default_builder(self): return HTML5TreeBuilder() -- cgit v1.2.3