Hebrew (ISO 8859-8) in Visual Directionality

Foo
Bar
Baz

Foo

Bar

Baz

and ") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_quoted(self): self.assertSoupEquals('', '') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup(" ") self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'

'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'

') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) # Tests below this line need work. def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is replaced with a # generic value. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) self.assertEqual(parsed_meta['content'], 'text/html; charset=%SOUP-ENCODING%') self.assertEqual(parsed_meta.contains_substitutions, True) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") soup = self.soup(text) str = soup.p.string #self.assertEqual(str.encode("utf-8"), expected) def test_br_tag_is_empty_element(self): """A
tag is designated as an empty-element tag.""" soup = self.soup("

") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), "
") def test_p_tag_is_not_empty_element(self): """A

tag is not designated as an empty-element tag.""" soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_soupstrainer(self): strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") class TestHTMLParserTreeBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the default tree builder. Subclass this to test other builders. These are very likely to give different results for different tree builders. It's not required that a tree builder handle invalid markup at all. """ @property def default_builder(self): return HTMLParserTreeBuilder() def test_table_containing_bare_markup(self): # Markup should be in table cells, not directly in the table. self.assertSoupEquals("

Foo

") def test_incorrectly_nested_table(self): # The second tag is floating in the tag # rather than being inside a ') def test_unclosed_a_tag(self): # tags really ought to be closed at some point. # # We have all the

tags because HTML5 says to duplicate # the tag rather than closing it, and that's what html5lib # does. markup = """

""" expect = """

""" self.assertSoupEquals(markup, expect) def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. self.assertSoupEquals( '

Foo

Bar', '

Foo

Bar

') def test_fake_self_closing_tag(self): # If a self-closing tag presents as a normal tag, it's treated # as one. self.assertSoupEquals( "http://foo.com/", "http://foo.com/") def test_boolean_attribute_with_no_value(self): soup = self.soup("

. bad_markup = ('' '' "" '

Here's another table:

' '' '

foo

") self.assertEqual(soup.table.td['nowrap'], None) def test_incorrectly_nested_tables(self): self.assertSoupEquals( '
', '
') def test_floating_text_in_table(self): self.assertSoupEquals("foo
bar
") def test_paragraphs_containing_block_display_elements(self): markup = self.soup("
this is the definition:" "
first case
") # The
tag is not closed before the
tag begins. self.assertEqual(len(markup.p.contents), 2) def test_empty_element_tag_with_contents(self): self.assertSoupEquals("
foo
", "
foo
") def test_doctype_in_body(self): markup = "
onetwo
" self.assertSoupEquals(markup) def test_nonsensical_declaration(self): # Declarations that don't make any sense are ignored. self.assertRaises(HTMLParseError, self.soup, '
a
') def test_whitespace_in_doctype(self): # A declaration that has extra whitespace is ignored. self.assertRaises( HTMLParseError, self.soup, '' + '
foo
') def test_incomplete_declaration(self): self.assertRaises(HTMLParseError, self.soup, 'ac') def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "
" soup = self.soup(markup) self.assertEquals(soup.div.contents[0], CData("foo")) def test_attribute_value_never_got_closed(self): markup = '') self.assertEqual(soup.a['b'], '') def test_nonexistent_entity(self): soup = self.soup("
foo&#bar;baz
") # This is very strange. self.assertEqual(soup.p.string, "foofoodbaz
") self.assertEqual(soup.p.string, "foodbaz") # Also compare html5lib, which preserves the &# before the # entity name. def test_entity_out_of_range(self): # An entity that's out of range will be replaced with # REPLACEMENT CHARACTER. soup = self.soup("
�
") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") soup = self.soup("
�
") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") soup = self.soup("
�
") self.assertEqual(soup.p.string, u"\N{REPLACEMENT CHARACTER}") def test_entity_was_not_finished(self): soup = self.soup("
<Hello>") # Compare html5lib, which completes the entity. self.assertEqual(soup.p.string, "a
a
') def test_tag_name_contains_unicode(self): # Unicode characters in tag names are stripped. tag_name = u"Joe" self.assertSoupEquals("Joe") def test_multiple_values_for_the_same_attribute(self): markup = '' self.assertSoupEquals(markup, '') class TestHTMLParserTreeBuilderEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various # encodings. @property def default_builder(self): return HTMLParserTreeBuilder() def setUp(self): super(TestHTMLParserTreeBuilderEncodingConversion, self).setUp() self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( self.utf8_data, b"Sacr\xc3\xa9 bleu!") def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set. ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding, "ascii") def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) HEBREW_DOCUMENT = b'Hebrew (ISO 8859-8) in Visual Directionality
Hebrew (ISO 8859-8) in Visual Directionality
\xed\xe5\xec\xf9' def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, from_encoding="iso-8859-8") self.assertEqual(soup.original_encoding, 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8"))