"""Tests to ensure that the lxml tree builder generates good trees.""" import re try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True except ImportError, e: LXML_PRESENT = False from bs4 import BeautifulSoup from bs4.element import Comment, Doctype, SoupStrainer from bs4.testing import skipIf from bs4.tests import test_htmlparser from bs4.testing import skipIf @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its tree builder.") class TestLXMLTreeBuilder(test_htmlparser.TestHTMLParserTreeBuilder): """A smoke test for the LXML tree builder. Subclass this to test some other HTML tree builder. Subclasses of this test ensure that all of Beautiful Soup's tree builders generate more or less the same trees. It's okay for trees to differ--just override the appropriate test method to demonstrate how one tree builder differs from the LXML builder. But in general, all HTML tree builders should generate trees that make most of these tests pass. """ @property def default_builder(self): return LXMLTreeBuilder() def test_bare_string(self): # A bare string is turned into some kind of HTML document or # fragment recognizable as the original string. # # In this case, lxml puts a

tag around the bare string. self.assertSoupEquals( "A bare string", "

A bare string

") def test_cdata_where_its_ok(self): # lxml strips CDATA sections, no matter where they occur. markup = "

A tag

", "

A tag

") self.assertSoupEquals( "

Foo
bar

", "

Foo
bar

") def test_naked_ampersands(self): # Ampersands are left alone. text = "

AT&T

" soup = self.soup(text) self.assertEqual(soup.p.string, "AT&T") # Even if they're in attribute values. invalid_url = 'foo' soup = self.soup(invalid_url) self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") def test_literal_in_textarea(self): # Anything inside a ' soup = self.soup(text) self.assertEqual(len(soup.textarea.contents), 2) self.assertEqual(soup.textarea.contents[0], u"Junk like ") self.assertEqual(soup.textarea.contents[1].name, 'b') self.assertEqual(soup.textarea.b.string, u" tags and ") def test_literal_in_script(self): # The contents of a ' % javascript) self.assertEqual(soup.script.string, javascript) def test_doctype(self): # Test a normal HTML doctype you'll commonly see in a real document. self._test_doctype( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') def test_namespaced_system_doctype(self): # Test a namespaced doctype with a system id. self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') def test_entities_in_attribute_values_converted_during_parsing(self): # The numeric entity isn't recognized without the closing # semicolon. text = '' expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" soup = self.soup(text) self.assertEqual(soup.x['t'], expected) text = '' expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" soup = self.soup(text) self.assertEqual(soup.x['t'], u"pi\xf1ata") text = '' soup = self.soup(text) self.assertEqual(soup.x['t'], expected) text = '' soup = self.soup(text) self.assertEqual( soup.x['t'], u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") # This can cause valid HTML to become invalid. valid_url = 'foo' soup = self.soup(valid_url) self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing it on invalid markup.") class TestLXMLTreeBuilderInvalidMarkup( test_htmlparser.TestHTMLParserTreeBuilderInvalidMarkup): @property def default_builder(self): return LXMLTreeBuilder() def test_attribute_value_never_got_closed(self): markup = ' and blah and blah") def test_attribute_value_was_closed_by_subsequent_tag(self): markup = """baz""" soup = self.soup(markup) # The string between the first and second quotes was interpreted # as the value of the 'href' attribute. self.assertEqual(soup.a['href'], 'foo,

a

') # The declaration is ignored altogether. self.assertEqual(soup.encode(), b"

a

") def test_incomplete_declaration(self): # An incomplete declaration will screw up the rest of the document. self.assertSoupEquals('ac', '

a

') def test_nonsensical_declaration(self): # Declarations that don't make any sense are ignored. self.assertSoupEquals('

a

', "

a

") def test_unquoted_attribute_value(self): soup = self.soup('') self.assertEqual(soup.a['style'], '{height:21px;}') def test_whitespace_in_doctype(self): # A declaration that has extra whitespace is ignored. self.assertSoupEquals( ('' '

foo

'), '

foo

') def test_boolean_attribute_with_no_value(self): soup = self.soup("

foo

") self.assertEqual(soup.table.td['nowrap'], '') def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "

" self.assertSoupEquals(markup, "

") def test_empty_element_tag_with_contents(self): self.assertSoupEquals("
foo
", "
foo") def test_nonexistent_entity(self): soup = self.soup("

foo&#bar;baz

") self.assertEqual(soup.p.string, "foobar;baz") # Compare a real entity. soup = self.soup("

foodbaz

") self.assertEqual(soup.p.string, "foodbaz") # Also compare html5lib, which preserves the &# before the # entity name. def test_entity_was_not_finished(self): soup = self.soup("

<Hello>") # Compare html5lib, which completes the entity. self.assertEqual(soup.p.string, "http://foo.com/", "http://foo.com/") def test_paragraphs_containing_block_display_elements(self): markup = self.soup("

this is the definition:" "

first case