Hebrew (ISO 8859-8) in Visual Directionality

Foo
Bar
Baz

Foo

Bar

Baz

and ") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_quoted(self): self.assertSoupEquals('', '') def test_literal_in_textarea(self): # Anything inside a ' soup = self.soup(text) self.assertEqual(len(soup.textarea.contents), 2) self.assertEqual(soup.textarea.contents[0], u"Junk like ") self.assertEqual(soup.textarea.contents[1].name, 'b') self.assertEqual(soup.textarea.b.string, u" tags and ") def test_literal_in_script(self): # The contents of a ' % javascript) self.assertEqual(soup.script.string, javascript) def test_naked_ampersands(self): # Ampersands are left alone. text = "

AT&T

" soup = self.soup(text) self.assertEqual(soup.p.string, "AT&T") # Even if they're in attribute values. invalid_url = 'foo' soup = self.soup(invalid_url) self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3") def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup(" ") self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) def test_cdata_where_its_ok(self): # lxml strips CDATA sections, no matter where they occur. markup = "

foo

' soup = self.soup(markup) doctype = soup.contents[0] self.assertEqual(doctype.__class__, Doctype) self.assertEqual(doctype, doctype_fragment) self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) # Make sure that the doctype was correctly associated with the # parse tree and that the rest of the document parsed. self.assertEqual(soup.p.contents[0], 'foo') def test_doctype(self): # Test a normal HTML doctype you'll commonly see in a real document. self._test_doctype( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') def test_namespaced_system_doctype(self): # Test a namespaced doctype with a system id. self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'

'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'

') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) # Tests below this line need work. def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is replaced with a # generic value. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) self.assertEqual(parsed_meta['content'], 'text/html; charset=%SOUP-ENCODING%') self.assertEqual(parsed_meta.contains_substitutions, True) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") soup = self.soup(text) str = soup.p.string #self.assertEqual(str.encode("utf-8"), expected) def test_br_tag_is_empty_element(self): """A
tag is designated as an empty-element tag.""" soup = self.soup("

") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), "
") def test_p_tag_is_not_empty_element(self): """A

tag is not designated as an empty-element tag.""" soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_soupstrainer(self): strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") class TestLXMLBuilderInvalidMarkup(SoupTest): """Tests of invalid markup for the LXML tree builder. Subclass this to test other builders. These are very likely to give different results for different tree builders. It's not required that a tree builder handle invalid markup at all. """ def test_table_containing_bare_markup(self): # Markup should be in table cells, not directly in the table. self.assertSoupEquals("

Foo

") def test_incorrectly_nested_table(self): # The second tag is floating in the tag # rather than being inside a ') def test_unclosed_a_tag(self): # tags really ought to be closed at some point. # # We have all the

tags because HTML5 says to duplicate # the tag rather than closing it, and that's what html5lib # does. markup = """

""" expect = """

""" self.assertSoupEquals(markup, expect) def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. self.assertSoupEquals( '

Foo

Bar', '

Foo

Bar

') def test_fake_self_closing_tag(self): # If a self-closing tag presents as a normal tag, the 'open' # tag is treated as an instance of the self-closing tag and # the 'close' tag is ignored. self.assertSoupEquals( "http://foo.com/", "http://foo.com/") def test_boolean_attribute_with_no_value_gets_empty_value(self): soup = self.soup("

. bad_markup = ('' '' "" '

Here's another table:

' '' '

foo

") self.assertEqual(soup.table.td['nowrap'], '') def test_incorrectly_nested_tables(self): self.assertSoupEquals( '
', '
') def test_floating_text_in_table(self): self.assertSoupEquals("foo
bar
") def test_paragraphs_containing_block_display_elements(self): markup = self.soup("
this is the definition:" "
first case
") # The
tag is closed before the
tag begins. self.assertEqual(markup.p.contents, ["this is the definition:"]) def test_empty_element_tag_with_contents(self): self.assertSoupEquals("
foo
", "
foo") def test_doctype_in_body(self): markup = "
onetwo
" self.assertSoupEquals(markup) def test_nonsensical_declaration(self): # Declarations that don't make any sense are ignored. self.assertSoupEquals('
a
', "
a
") def test_whitespace_in_doctype(self): # A declaration that has extra whitespace is ignored. self.assertSoupEquals( ('' '
foo
'), '
foo
') def test_incomplete_declaration(self): # An incomplete declaration will screw up the rest of the document. self.assertSoupEquals('ac', '
a
') def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "
" self.assertSoupEquals(markup, "
") def test_attribute_value_never_got_closed(self): markup = ' and blah and blah") def test_attribute_value_was_closed_by_subsequent_tag(self): markup = """baz""" soup = self.soup(markup) # The string between the first and second quotes was interpreted # as the value of the 'href' attribute. self.assertEqual(soup.a['href'], 'foo, ') self.assertEqual(soup.a['style'], '{height:21px;}') def test_attribute_value_with_embedded_brackets(self): soup = self.soup('') self.assertEqual(soup.a['b'], '') def test_nonexistent_entity(self): soup = self.soup("
foo&#bar;baz
") self.assertEqual(soup.p.string, "foobar;baz") # Compare a real entity. soup = self.soup("
foodbaz
") self.assertEqual(soup.p.string, "foodbaz") # Also compare html5lib, which preserves the &# before the # entity name. def test_entity_out_of_range(self): # An entity that's out of range will be ignored. soup = self.soup("
�
") self.assertEqual(soup.p.string, None) soup = self.soup("
�
") self.assertEqual(soup.p.string, None) def test_entity_was_not_finished(self): soup = self.soup("
<Hello>") # Compare html5lib, which completes the entity. self.assertEqual(soup.p.string, "a
a
') # The declaration is ignored altogether. self.assertEqual(soup.encode(), b"
a
") def test_tag_name_contains_unicode(self): # Unicode characters in tag names are stripped. tag_name = u"Joe" self.assertSoupEquals("Joe") class TestLXMLBuilderEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various # encodings. def setUp(self): super(TestLXMLBuilderEncodingConversion, self).setUp() self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( self.utf8_data, b"Sacr\xc3\xa9 bleu!") def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set. ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding, "ascii") def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) HEBREW_DOCUMENT = b'Hebrew (ISO 8859-8) in Visual Directionality
Hebrew (ISO 8859-8) in Visual Directionality
\xed\xe5\xec\xf9' def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, from_encoding="iso-8859-8") self.assertEqual(soup.original_encoding, 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), self.HEBREW_DOCUMENT.decode("iso-8859-8").encode("utf-8")) class TestLXMLXMLBuilder(SoupTest): """Test XML-specific parsing behavior. Most of the tests use HTML as an example, since Beautiful Soup is mainly an HTML parser. This test suite is a base for XML-specific tree builders. """ @property def default_builder(self): return LXMLTreeBuilderForXML() def test_mixed_case_tags(self): # Mixed-case tags are *not* folded to lowercase, but the # end tag is always the same case as the start tag. self.assertSoupEquals( "", "") def test_cdata_becomes_text(self): # LXML sends CData sections as 'data' events, so we can't # create special CData objects for them. We have to use # NavigableString. I would like to fix this, but it's not a # very high priority. markup = "" soup = self.soup(markup) cdata = soup.foo.contents[0] self.assertEqual(cdata.__class__.__name__, 'NavigableString') def test_can_handle_invalid_xml(self): self.assertSoupEquals("", "") def test_empty_element_tag(self): soup = self.soup("
") self.assertTrue(soup.iamselfclosing.is_empty_element) def test_self_empty_tag_treated_as_empty_element(self): soup = self.soup("
") self.assertTrue(soup.iamclosed.is_empty_element) def test_self_nonempty_tag_is_not_empty_element(self): soup = self.soup("
contents
") self.assertFalse(soup.ihavecontents.is_empty_element) def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self): soup = self.soup("") self.assertTrue(soup.bar.is_empty_element) soup.bar.insert(1, "Contents") self.assertFalse(soup.bar.is_empty_element) self.assertEqual(str(soup), self.document_for("Contents")) def test_designated_empty_element_tag_has_no_closing_tag(self): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="") self.assertTrue(soup.bar.is_empty_element) self.assertEqual(str(soup), self.document_for("")) def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self): builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="") self.assertFalse(soup.foo.is_empty_element) self.assertEqual(str(soup), self.document_for("")) def test_designated_empty_element_tag_does_not_change_parser_behavior(self): # The designated list of empty-element tags only affects how # empty tags are presented. It does not affect how tags are # parsed--that's the parser's job. builder = LXMLTreeBuilderForXML(empty_element_tags=['bar']) soup = BeautifulSoup(builder=builder, markup="contents") self.assertEqual(str(soup), self.document_for("contents"))