import copy from HTMLParser import HTMLParseError from bs4.element import Comment, Doctype, SoupStrainer from bs4.builder import HTMLParserTreeBuilder from bs4.element import CData from bs4.testing import SoupTest class TestHTMLParserTreeBuilder(SoupTest): """A smoke test for the built-in tree builder. Subclass this to test some other HTML tree builder. Subclasses of this test ensure that all of Beautiful Soup's tree builders generate more or less the same trees. It's okay for trees to differ--just override the appropriate test method to demonstrate how one tree builder differs from the default builder. But in general, all HTML tree builders should generate trees that make most of these tests pass. """ @property def default_builder(self): return HTMLParserTreeBuilder() def test_bare_string(self): # A bare string is turned into some kind of HTML document or # fragment recognizable as the original string. # # HTMLParser does not modify the bare string at all. self.assertSoupEquals("A bare string") def test_cdata_where_its_ok(self): # HTMLParser recognizes CDATA sections and passes them through. markup = "" self.assertSoupEquals(markup) soup = self.soup(markup) string = soup.svg.string self.assertEqual(string, "foobar") self.assertTrue(isinstance(string, CData)) def test_hex_entities_in_text(self): # XXX This tests a workaround for a bug in HTMLParser. self.assertSoupEquals("
ñ
", u"\xf1
") def test_entities_in_attribute_values_converted_during_parsing(self): # The numeric entity isn't recognized without the closing # semicolon. text = 'AT&T
" soup = self.soup(text) self.assertEqual(soup.p.string, "AT&T;") def test_literal_in_textarea(self): # Anything inside a