from HTMLParser import HTMLParseError from bs4.element import Comment, Doctype, SoupStrainer from bs4.builder import HTMLParserTreeBuilder from bs4.element import CData from bs4.testing import SoupTest class TestHTMLParserTreeBuilder(SoupTest): """A smoke test for the built-in tree builder. Subclass this to test some other HTML tree builder. Subclasses of this test ensure that all of Beautiful Soup's tree builders generate more or less the same trees. It's okay for trees to differ--just override the appropriate test method to demonstrate how one tree builder differs from the default builder. But in general, all HTML tree builders should generate trees that make most of these tests pass. """ @property def default_builder(self): return HTMLParserTreeBuilder() def test_bare_string(self): # A bare string is turned into some kind of HTML document or # fragment recognizable as the original string. # # HTMLParser does not modify the bare string at all. self.assertSoupEquals("A bare string") def test_cdata_where_its_ok(self): # HTMLParser recognizes CDATA sections and passes them through. markup = "foobar" self.assertSoupEquals(markup) soup = self.soup(markup) string = soup.svg.string self.assertEqual(string, "foobar") self.assertTrue(isinstance(string, CData)) def test_hex_entities_in_text(self): # XXX This tests a workaround for a bug in HTMLParser. self.assertSoupEquals("

ñ

", u"

\xf1

") def test_entities_in_attribute_values_converted_during_parsing(self): # The numeric entity isn't recognized without the closing # semicolon. text = '' expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" soup = self.soup(text) self.assertEqual(soup.x['t'], "piñata") text = '' expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" soup = self.soup(text) self.assertEqual(soup.x['t'], u"pi\xf1ata") text = '' soup = self.soup(text) self.assertEqual(soup.x['t'], expected) text = '' soup = self.soup(text) self.assertEqual( soup.x['t'], u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") # This can cause valid HTML to become invalid. valid_url = 'foo' soup = self.soup(valid_url) self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") # I think it would be very difficult to 'fix' these tests, judging # from my experience with previous versions of Beautiful Soup. def test_naked_ampersands(self): # Ampersands are treated as entities. text = "

AT&T

" soup = self.soup(text) self.assertEqual(soup.p.string, "AT&T;") def test_literal_in_textarea(self): # Anything inside a ' soup = self.soup(text) self.assertEqual(len(soup.textarea.contents), 2) self.assertEqual(soup.textarea.contents[0], u"Junk like ") self.assertEqual(soup.textarea.contents[1].name, 'b') self.assertEqual(soup.textarea.b.string, u" tags and <&<&") def test_literal_in_script(self): # Some versions of HTMLParser choke on markup like this: # if (i < 2) { alert("foo"); } # Some versions of HTMLParser don't. # # The easiest thing is to just not run this test for HTMLParser. pass # Namespaced doctypes cause an HTMLParseError def test_namespaced_system_doctype(self): self.assertRaises(HTMLParseError, self._test_doctype, 'xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): self.assertRaises(HTMLParseError, self._test_doctype, 'xsl:stylesheet PUBLIC "htmlent.dtd"') def _test_doctype(self, doctype_fragment): """Run a battery of assertions on a given doctype string. HTMLParser doesn't actually behave like this, so this method is never called in this class. But many other builders do behave like this, so I've put the method in the superclass. """ doctype_str = '' % doctype_fragment markup = doctype_str + '

foo

' soup = self.soup(markup) doctype = soup.contents[0] self.assertEqual(doctype.__class__, Doctype) self.assertEqual(doctype, doctype_fragment) self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) # Make sure that the doctype was correctly associated with the # parse tree and that the rest of the document parsed. self.assertEqual(soup.p.contents[0], 'foo') # ------------------------- def test_mixed_case_tags(self): # Mixed-case tags are folded to lowercase. self.assertSoupEquals( "", "") def test_empty_tag_thats_not_an_empty_element_tag(self): # A tag that is empty but not an HTML empty-element tag # is not presented as an empty-element tag. self.assertSoupEquals("

", "

") def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) def test_nested_inline_elements(self): # Inline tags can be nested indefinitely. b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') # This is a tag containing another
tag in one of its # cells. TABLE_MARKUP_1 = ('
' '' "') def test_correctly_nested_tables(self): markup = ('
Here's another table:" '' '' '
foo
' '' "') self.assertSoupEquals( markup, '
Here's another table:" '' '' '
foo
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "" "" "
Foo
Bar
Baz
") def test_collapsed_whitespace(self): """In most tags, whitespace is collapsed.""" self.assertSoupEquals("

", "

") def test_preserved_whitespace_in_pre_and_textarea(self): """In
 and ")

    def test_single_quote_attribute_values_become_double_quotes(self):
        # An attribute value delimited by single quotes in the input is
        # expected to be written back out delimited by double quotes.
        #
        # NOTE(review): both literals were empty strings here, which made
        # the assertion vacuous ("" == ""). The markup below is
        # reconstructed from the test name -- confirm it against the
        # upstream Beautiful Soup test suite.
        markup = "<foo attr='bar'></foo>"
        expected = '<foo attr="bar"></foo>'
        self.assertSoupEquals(markup, expected)

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        # A single-quoted attribute value that itself contains double
        # quotes is expected to round-trip unchanged: the single-quote
        # delimiters are kept so nothing inside needs escaping.
        #
        # NOTE(review): the fixture here was the bare string "a", with no
        # tag or attribute, so nothing about quoting was exercised. The
        # markup below is reconstructed from the test name -- confirm it
        # against the upstream Beautiful Soup test suite.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        # When an attribute value contains BOTH single and double quotes,
        # neither delimiter can be used as-is; the expectation is that the
        # value is wrapped in double quotes and the embedded double quotes
        # are written as &quot;.
        #
        # NOTE(review): the fixture here was the bare string "a". With no
        # <foo> tag in the document, soup.foo is None and the item
        # assignment below raised TypeError before any assertion ran. The
        # markup and expected output are reconstructed from the test name
        # and the surviving code -- confirm against the upstream suite.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)

        # Replace the parsed value with one that mixes both quote styles.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'

        expected = (
            """<foo attr="Brawls happen at &quot;Bob's Bar&quot;">a</foo>""")
        # Re-parse the serialized tag and check it is stable in that form.
        self.assertSoupEquals(soup.foo.decode(), expected)

    def test_ampersand_in_attribute_value_gets_quoted(self):
        # A bare ampersand inside an attribute value is expected to be
        # written out as the &amp; entity.
        #
        # NOTE(review): both literals were empty strings here, which made
        # the assertion vacuous ("" == ""). The markup below is
        # reconstructed from the test name -- confirm it against the
        # upstream Beautiful Soup test suite.
        markup = '<this is="really messed up & stuff"></this>'
        expected = '<this is="really messed up &amp; stuff"></this>'
        self.assertSoupEquals(markup, expected)

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("  ") self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'
'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'
') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) # Tests below this line need work. def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is replaced with a # generic value. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) self.assertEqual(parsed_meta['content'], 'text/html; charset=%SOUP-ENCODING%') self.assertEqual(parsed_meta.contains_substitutions, True) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") soup = self.soup(text) str = soup.p.string #self.assertEqual(str.encode("utf-8"), expected) def test_br_tag_is_empty_element(self): """A
tag is designated as an empty-element tag.""" soup = self.soup("

") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), "
") def test_p_tag_is_not_empty_element(self): """A

tag is not designated as an empty-element tag.""" soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_soupstrainer(self): strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") class TestHTMLParserTreeBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the default tree builder. Subclass this to test other builders. These are very likely to give different results for different tree builders. It's not required that a tree builder handle invalid markup at all. """ @property def default_builder(self): return HTMLParserTreeBuilder() def test_table_containing_bare_markup(self): # Markup should be in table cells, not directly in the table. self.assertSoupEquals("
Foo
") def test_incorrectly_nested_table(self): # The second tag is floating in the tag # rather than being inside a ') def test_unclosed_a_tag(self): # tags really ought to be closed at some point. # # We have all the
tags because HTML5 says to duplicate # the tag rather than closing it, and that's what html5lib # does. markup = """ """ expect = """
""" self.assertSoupEquals(markup, expect) def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. self.assertSoupEquals( '

Foo

Bar', '

Foo

Bar

') def test_fake_self_closing_tag(self): # If a self-closing tag presents as a normal tag, it's treated # as one. self.assertSoupEquals( "http://foo.com/", "http://foo.com/") def test_boolean_attribute_with_no_value(self): soup = self.soup("
. bad_markup = ('' '' "" '
Here's another table:
' '' '
foo
foo
") self.assertEqual(soup.table.td['nowrap'], None) def test_incorrectly_nested_tables(self): self.assertSoupEquals( '
', '
') def test_floating_text_in_table(self): self.assertSoupEquals("foo
bar
") def test_paragraphs_containing_block_display_elements(self): markup = self.soup("

this is the definition:" "

first case
") # The

tag is not closed before the

tag begins. self.assertEqual(len(markup.p.contents), 2) def test_empty_element_tag_with_contents(self): self.assertSoupEquals("
foo
", "
foo
") def test_doctype_in_body(self): markup = "

onetwo

" self.assertSoupEquals(markup) def test_nonsensical_declaration(self): # Declarations that don't make any sense are ignored. self.assertRaises(HTMLParseError, self.soup, '

a

') def test_whitespace_in_doctype(self): # A declaration that has extra whitespace is ignored. self.assertRaises( HTMLParseError, self.soup, '' + '

foo

') def test_incomplete_declaration(self): self.assertRaises(HTMLParseError, self.soup, 'ac') def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "
" soup = self.soup(markup) self.assertEquals(soup.div.contents[0], CData("foo")) def test_attribute_value_never_got_closed(self): markup = '') self.assertEqual(soup.a['b'], '') def test_nonexistent_entity(self): soup = self.soup("

foo&#bar;baz

") # This is very strange. self.assertEqual(soup.p.string, "foofoodbaz

") self.assertEqual(soup.p.string, "foodbaz") # Also compare html5lib, which preserves the &# before the # entity name. def test_entity_out_of_range(self): # An entity that's out of range will be replaced with # REPLACEMENT CHARACTER. soup = self.soup("

") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") soup = self.soup("

") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") soup = self.soup("

") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") def test_entity_was_not_finished(self): soup = self.soup("

<Hello>") # Compare html5lib, which completes the entity. self.assertEqual(soup.p.string, "a

a

') def test_tag_name_contains_unicode(self): # Unicode characters in tag names are stripped. tag_name = u"Joe" self.assertSoupEquals("Joe") def test_multiple_values_for_the_same_attribute(self): markup = '' self.assertSoupEquals(markup, '') class TestHTMLParserTreeBuilderEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various # encodings. @property def default_builder(self): return HTMLParserTreeBuilder() def setUp(self): super(TestHTMLParserTreeBuilderEncodingConversion, self).setUp() self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( self.utf8_data, b"Sacr\xc3\xa9 bleu!") def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set. ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding, "ascii") def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. 
soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) HEBREW_DOCUMENT = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-8 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, from_encoding="iso-8859-8") self.assertEqual(soup.original_encoding, 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))