From df26dc64d868875d7cd8ca550f1a174d68dd7c67 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 20 Jan 2012 14:22:42 -0500 Subject: Replaced assertEquals with assertEqual to get rid of deprecation notice. --- bs4/tests/test_htmlparser.py | 126 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 bs4/tests/test_htmlparser.py (limited to 'bs4/tests/test_htmlparser.py') diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py new file mode 100644 index 0000000..d2db38e --- /dev/null +++ b/bs4/tests/test_htmlparser.py @@ -0,0 +1,126 @@ +from HTMLParser import HTMLParseError +from bs4.builder import HTMLParserTreeBuilder +from bs4.element import CData +from test_lxml import ( + TestLXMLBuilder, + TestLXMLBuilderEncodingConversion, + TestLXMLBuilderInvalidMarkup, + ) + +class TestHTMLParserTreeBuilder(TestLXMLBuilder): + """See `BuilderSmokeTest`.""" + + @property + def default_builder(self): + return HTMLParserTreeBuilder() + + def test_bare_string(self): + # A bare string is turned into some kind of HTML document or + # fragment recognizable as the original string. + # + # HTMLParser does not modify the bare string at all. + self.assertSoupEquals("A bare string") + + def test_cdata_where_its_ok(self): + # HTMLParser recognizes CDATA sections and passes them through. + markup = "foobar" + self.assertSoupEquals(markup) + soup = self.soup(markup) + string = soup.svg.string + self.assertEqual(string, "foobar") + self.assertTrue(isinstance(string, CData)) + + # These are tests that could be 'fixed' by improving the + # HTMLParserTreeBuilder, but I don't think it's worth it. Users + # will have fewer headaches if they use one of the other tree + # builders. + + def test_empty_element(self): + # HTML's empty-element tags are not recognized as such + # unless they are presented as empty-element tags. + self.assertSoupEquals( + "

A tag

", "

A tag

") + + self.assertSoupEquals( + "

Foo
bar

", "

Foo
bar

") + + def test_entities_in_attribute_values_converted_during_parsing(self): + + # The numeric entity isn't recognized without the closing + # semicolon. + text = '' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEqual(soup.x['t'], "piñata") + + text = '' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEqual(soup.x['t'], u"pi\xf1ata") + + text = '' + soup = self.soup(text) + self.assertEqual(soup.x['t'], expected) + + text = '' + soup = self.soup(text) + self.assertEqual( + soup.x['t'], + u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") + + # This can cause valid HTML to become invalid. + valid_url = 'foo' + soup = self.soup(valid_url) + self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") + + # I think it would be very difficult to 'fix' these tests, judging + # from my experience with previous versions of Beautiful Soup. + def test_naked_ampersands(self): + # Ampersands are treated as entities. + text = "

AT&T

" + soup = self.soup(text) + self.assertEqual(soup.p.string, "AT&T;") + + def test_literal_in_textarea(self): + # Anything inside a ' + soup = self.soup(text) + self.assertEqual(len(soup.textarea.contents), 2) + self.assertEqual(soup.textarea.contents[0], u"Junk like ") + self.assertEqual(soup.textarea.contents[1].name, 'b') + self.assertEqual(soup.textarea.b.string, u" tags and <&<&") + + def test_literal_in_script(self): + # The contents of a ' % javascript) + self.assertEqual(soup.script.contents, + ['if (i < 2) { alert("foo', + '"); }']) + + # Namespaced doctypes cause an HTMLParseError + def test_namespaced_system_doctype(self): + self.assertRaises(HTMLParseError, self._test_doctype, + 'xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + self.assertRaises(HTMLParseError, self._test_doctype, + 'xsl:stylesheet PUBLIC "htmlent.dtd"') + + +class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): + # Oddly enough, HTMLParser seems to handle invalid markup exactly + # the same as lxml. + pass + + +class TestHTMLParserTreeBuilderEncodingConversion( + TestLXMLBuilderEncodingConversion): + # Re-run the lxml tests for HTMLParser + pass -- cgit v1.2.3