from HTMLParser import HTMLParseError
from bs4.builder import HTMLParserTreeBuilder
from bs4.element import CData
from test_lxml import (
    TestLXMLBuilder,
    TestLXMLBuilderEncodingConversion,
    TestLXMLBuilderInvalidMarkup,
    )


class TestHTMLParserTreeBuilder(TestLXMLBuilder):
    """See `BuilderSmokeTest`."""

    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()

    def test_bare_string(self):
        # A bare string is turned into some kind of HTML document or
        # fragment recognizable as the original string.
        #
        # HTMLParser does not modify the bare string at all.
        self.assertSoupEquals("A bare string")

    def test_cdata_where_its_ok(self):
        # HTMLParser recognizes CDATA sections and passes them through.
        markup = "<svg><![CDATA[foobar]]></svg>"
        self.assertSoupEquals(markup)
        soup = self.soup(markup)
        string = soup.svg.string
        self.assertEqual(string, "foobar")
        self.assertTrue(isinstance(string, CData))

    # These are tests that could be 'fixed' by improving the
    # HTMLParserTreeBuilder, but I don't think it's worth it. Users
    # will have fewer headaches if they use one of the other tree
    # builders.

    def test_empty_element(self):
        # HTML's empty-element tags are not recognized as such
        # unless they are presented as empty-element tags.
        self.assertSoupEquals(
            "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")

        self.assertSoupEquals(
            "<p>Foo<br/>bar</p>", "<p>Foo<br/>bar</p>")

    def test_hex_entities_in_text(self):
        # XXX This tests a workaround for a bug in HTMLParser.
        self.assertSoupEquals("&#xf1;", u"\xf1")

    def test_entities_in_attribute_values_converted_during_parsing(self):
        # The numeric entity isn't recognized without the closing
        # semicolon.
        text = '<x t="pi&#241ata">'
        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
        soup = self.soup(text)
        self.assertEqual(soup.x['t'], "pi&#241ata")

        # With the closing semicolon, the numeric entity is converted.
        text = '<x t="pi&#241;ata">'
        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
        soup = self.soup(text)
        self.assertEqual(soup.x['t'], u"pi\xf1ata")

        # So is a hexadecimal numeric entity.
        text = '<x t="pi&#xf1;ata">'
        soup = self.soup(text)
        self.assertEqual(soup.x['t'], expected)

        # And a named entity.
        text = '<x t="sacr&eacute; bleu">'
        soup = self.soup(text)
        self.assertEqual(
            soup.x['t'],
            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")

        # This can cause valid HTML to become invalid.
        valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
        soup = self.soup(valid_url)
        self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3")

    # I think it would be very difficult to 'fix' these tests, judging
    # from my experience with previous versions of Beautiful Soup.
    def test_naked_ampersands(self):
        # Ampersands are treated as entities.
        text = "<p>AT&T</p>"
        soup = self.soup(text)
        self.assertEqual(soup.p.string, "AT&T;")

    def test_literal_in_textarea(self):
        # Anything inside a <textarea> is supposed to be treated as
        # the literal value of the field (XXX citation needed). But
        # HTMLParser, like lxml, does its best to parse the contents
        # of a <textarea> as HTML.
        text = '<textarea>Junk like <b> tags and <&<&amp;</textarea>'
        soup = self.soup(text)
        self.assertEqual(len(soup.textarea.contents), 2)
        self.assertEqual(soup.textarea.contents[0], u"Junk like ")
        self.assertEqual(soup.textarea.contents[1].name, 'b')
        self.assertEqual(soup.textarea.b.string, u" tags and <&<&")

    def test_literal_in_script(self):
        # Some versions of HTMLParser choke on markup like this:
        #  if (i < 2) { alert("<b>foo</b>"); }
        # Some versions of HTMLParser don't.
        #
        # The easiest thing is to just not run this test for HTMLParser.
        pass

    # Namespaced doctypes cause an HTMLParseError.
    def test_namespaced_system_doctype(self):
        self.assertRaises(HTMLParseError, self._test_doctype,
                          'xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        self.assertRaises(HTMLParseError, self._test_doctype,
                          'xsl:stylesheet PUBLIC "htmlent.dtd"')


class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
    # Oddly enough, HTMLParser seems to handle invalid markup exactly
    # the same as lxml.
    pass


class TestHTMLParserTreeBuilderEncodingConversion(
    TestLXMLBuilderEncodingConversion):
    # Re-run the lxml tests for HTMLParser.
    pass