From 55fed485ac9280c2509b418f8d1a8c140d5ec822 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 22 Feb 2012 11:51:17 -0500 Subject: Removed tests that merely illustrated parser behavior, behavior that wouldn't break Beautiful Soup if it changed. --- bs4/tests/test_lxml.py | 234 ++----------------------------------------------- 1 file changed, 7 insertions(+), 227 deletions(-) (limited to 'bs4/tests/test_lxml.py') diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 4d19e7f..d60bd3b 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -12,237 +12,17 @@ from bs4 import BeautifulSoup from bs4.element import Comment, Doctype, SoupStrainer from bs4.testing import skipIf from bs4.tests import test_htmlparser -from bs4.testing import skipIf +from bs4.testing import ( + HTMLTreeBuilderSmokeTest, + SoupTest, + skipIf, +) @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its tree builder.") -class TestLXMLTreeBuilder(test_htmlparser.TestHTMLParserTreeBuilder): - """A smoke test for the LXML tree builder. - - Subclass this to test some other HTML tree builder. Subclasses of - this test ensure that all of Beautiful Soup's tree builders - generate more or less the same trees. - - It's okay for trees to differ--just override the appropriate test - method to demonstrate how one tree builder differs from the LXML - builder. But in general, all HTML tree builders should generate - trees that make most of these tests pass. - """ - - @property - def default_builder(self): - return LXMLTreeBuilder() - - def test_bare_string(self): - # A bare string is turned into some kind of HTML document or - # fragment recognizable as the original string. - # - # In this case, lxml puts a

tag around the bare string. - self.assertSoupEquals( - "A bare string", "

A bare string

") - - def test_cdata_where_its_ok(self): - # lxml strips CDATA sections, no matter where they occur. - markup = "foobar" - self.assertSoupEquals(markup, "") - - def test_empty_element(self): - # HTML's empty-element tags are recognized as such. - self.assertSoupEquals( - "

A tag

", "

A tag

") - - self.assertSoupEquals( - "

Foo
bar

", "

Foo
bar

") - - def test_naked_ampersands(self): - # Ampersands are left alone. - text = "

AT&T

" - soup = self.soup(text) - self.assertEqual(soup.p.string, "AT&T") - - # Even if they're in attribute values. - invalid_url = 'foo' - soup = self.soup(invalid_url) - self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") - - def test_literal_in_textarea(self): - # Anything inside a ' - soup = self.soup(text) - self.assertEqual(len(soup.textarea.contents), 2) - self.assertEqual(soup.textarea.contents[0], u"Junk like ") - self.assertEqual(soup.textarea.contents[1].name, 'b') - self.assertEqual(soup.textarea.b.string, u" tags and ") - - def test_literal_in_script(self): - # The contents of a ' % javascript) - self.assertEqual(soup.script.string, javascript) - - def test_doctype(self): - # Test a normal HTML doctype you'll commonly see in a real document. - self._test_doctype( - 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') - - def test_namespaced_system_doctype(self): - # Test a namespaced doctype with a system id. - self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') - - def test_namespaced_public_doctype(self): - # Test a namespaced doctype with a public id. - self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') - - def test_entities_in_attribute_values_converted_during_parsing(self): - - # The numeric entity isn't recognized without the closing - # semicolon. - text = '' - expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" - soup = self.soup(text) - self.assertEqual(soup.x['t'], expected) - - text = '' - expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" - soup = self.soup(text) - self.assertEqual(soup.x['t'], u"pi\xf1ata") - - text = '' - soup = self.soup(text) - self.assertEqual(soup.x['t'], expected) - - text = '' - soup = self.soup(text) - self.assertEqual( - soup.x['t'], - u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") - - # This can cause valid HTML to become invalid. - valid_url = 'foo' - soup = self.soup(valid_url) - self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") - - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing it on invalid markup.") -class TestLXMLTreeBuilderInvalidMarkup( - test_htmlparser.TestHTMLParserTreeBuilderInvalidMarkup): - - @property - def default_builder(self): - return LXMLTreeBuilder() - - def test_attribute_value_never_got_closed(self): - markup = ' and blah and blah") - - def test_attribute_value_was_closed_by_subsequent_tag(self): - markup = """baz""" - soup = self.soup(markup) - # The string between the first and second quotes was interpreted - # as the value of the 'href' attribute. - self.assertEqual(soup.a['href'], 'foo,

a

') - # The declaration is ignored altogether. - self.assertEqual(soup.encode(), b"

a

") - - def test_incomplete_declaration(self): - # An incomplete declaration will screw up the rest of the document. - self.assertSoupEquals('ac', '

a

') - - def test_nonsensical_declaration(self): - # Declarations that don't make any sense are ignored. - self.assertSoupEquals('

a

', "

a

") - - def test_unquoted_attribute_value(self): - soup = self.soup('
') - self.assertEqual(soup.a['style'], '{height:21px;}') - - def test_whitespace_in_doctype(self): - # A declaration that has extra whitespace is ignored. - self.assertSoupEquals( - ('' - '

foo

'), - '

foo

') - - def test_boolean_attribute_with_no_value(self): - soup = self.soup("
foo
") - self.assertEqual(soup.table.td['nowrap'], '') - - def test_cdata_where_it_doesnt_belong(self): - #CDATA sections are ignored. - markup = "
" - self.assertSoupEquals(markup, "
") - - def test_empty_element_tag_with_contents(self): - self.assertSoupEquals("
foo
", "
foo") - - def test_nonexistent_entity(self): - soup = self.soup("

foo&#bar;baz

") - self.assertEqual(soup.p.string, "foobar;baz") - - # Compare a real entity. - soup = self.soup("

foodbaz

") - self.assertEqual(soup.p.string, "foodbaz") - - # Also compare html5lib, which preserves the &# before the - # entity name. - - def test_entity_was_not_finished(self): - soup = self.soup("

<Hello>") - # Compare html5lib, which completes the entity. - self.assertEqual(soup.p.string, "http://foo.com/", - "http://foo.com/") - - def test_paragraphs_containing_block_display_elements(self): - markup = self.soup("

this is the definition:" - "

first case
") - # The

tag is closed before the

tag begins. - self.assertEqual(markup.p.contents, ["this is the definition:"]) - - def test_multiple_values_for_the_same_attribute(self): - markup = '' - self.assertSoupEquals(markup, '') - - def test_entity_out_of_range(self): - # An entity that's out of range will be ignored. - soup = self.soup("

") - self.assertEqual(0, len(soup.p.contents)) - - soup = self.soup("

") - self.assertEqual(0, len(soup.p.contents)) - - soup = self.soup("

") - self.assertEqual(0, len(soup.p.contents)) - - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing it on encoding conversion.") -class TestLXMLParserTreeBuilderEncodingConversion( - test_htmlparser.TestHTMLParserTreeBuilderEncodingConversion): +class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): -- cgit v1.2.3 From 091cba0d56e089061d49b7790c61f361b2981905 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 22 Feb 2012 13:18:35 -0500 Subject: Minor cleanup. --- bs4/tests/test_lxml.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'bs4/tests/test_lxml.py') diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index d60bd3b..92b7389 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -27,3 +27,11 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): return LXMLTreeBuilder() + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "

foo�bar

", "

foobar

") + self.assertSoupEquals( + "

foo�bar

", "

foobar

") + self.assertSoupEquals( + "

foo�bar

", "

foobar

") -- cgit v1.2.3