"""
soup = self.soup(markup)
# The string between the first and second quotes was interpreted
# as the value of the 'href' attribute.
self.assertEqual(soup.a['href'], 'foo,
')
self.assertEqual(soup.a['style'], '{height:21px;}')
def test_attribute_value_with_embedded_brackets(self):
# Parses a piece of markup with self.soup(); judging by the test name it
# is meant to cover an attribute value that contains brackets.
# NOTE(review): the markup literal below is split across two lines and
# contains no tags or attributes, so the original markup appears to have
# been stripped from this file. No assertion follows the parse in the
# visible code -- confirm against the upstream test before relying on it.
soup = self.soup('
')
def test_nonexistent_entity(self):
# Checks what soup.p.string contains after parsing text with an
# unrecognized entity, then repeats the check with a real entity.
# NOTE(review): both markup literals below are split across two lines and
# no longer contain a <p> tag or any entity syntax, so the markup appears
# to have been stripped from this file -- confirm against the upstream
# test.
soup = self.soup("foobar;baz
")
self.assertEqual(soup.p.string, "foobar;baz")
# Compare a real entity.
soup = self.soup("foodbaz
")
self.assertEqual(soup.p.string, "foodbaz")
# Also compare html5lib, which preserves the entity prefix before the
# entity name.
# NOTE(review): the prefix characters themselves are missing from the
# original comment -- presumably "&#"; confirm.
def test_entity_out_of_range(self):
    # Out-of-range entities are ignored, so the parsed <p> is expected to
    # end up without any string for each sample.
    # NOTE(review): both markup samples are empty strings here, which looks
    # like stripped markup -- confirm against the upstream test.
    for markup in ("", ""):
        parsed = self.soup(markup)
        self.assertEqual(parsed.p.string, None)
def test_entity_was_not_finished(self):
# Parses markup and compares soup.p.string and soup.encode() against
# expected values.
# NOTE(review): the expected-value literals below are split across lines
# and their quotes do not pair up ("aa ... '), and the second comment
# talks about a declaration that the visible markup does not contain.
# This looks like the remains of several tests whose markup was stripped;
# restore them from the upstream file before relying on this test.
soup = self.soup("<Hello>")
# Compare html5lib, which completes the entity.
self.assertEqual(soup.p.string, "aa
')
# The declaration is ignored altogether.
self.assertEqual(soup.encode(), b"a
")
def test_tag_name_contains_unicode(self):
    # Unicode characters in tag names are stripped.
    #
    # The prepared markup is what gets parsed; the second argument is the
    # rendering it is expected to produce.
    # NOTE(review): the markup literal contains no tag and no non-ASCII
    # character, so it cannot exercise the stripping described above --
    # the original markup appears to be missing; confirm upstream.
    markup = u"Joe"
    self.assertSoupEquals(markup, "Joe")
class TestLXMLBuilderEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
    # Build the same text twice: once as Unicode, once as UTF-8 bytes.
    super(TestLXMLBuilderEncodingConversion, self).setUp()
    text = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!"
    self.unicode_data = text
    self.utf8_data = text.encode("utf-8")
    # Spell out the encoded bytes so readers can see what they look like.
    expected_bytes = b"Sacr\xc3\xa9 bleu!"
    self.assertEqual(self.utf8_data, expected_bytes)
def test_ascii_in_unicode_out(self):
    # ASCII bytes must come back out as Unicode, with "ascii" recorded
    # as the original encoding.
    ascii_markup = b"a"
    parsed = self.soup(ascii_markup)
    decoded = parsed.decode()
    self.assertTrue(isinstance(decoded, unicode))
    expected_document = self.document_for(ascii_markup.decode())
    self.assertEqual(decoded, expected_document)
    self.assertEqual(parsed.original_encoding, "ascii")
def test_unicode_in_unicode_out(self):
    # Input that is already Unicode decodes back to the same text, and
    # no original encoding is recorded for it.
    parsed = self.soup(self.unicode_data)
    decoded = parsed.decode()
    self.assertEqual(decoded, self.unicode_data)
    foo_text = parsed.foo.string
    self.assertEqual(foo_text, u'Sacr\xe9 bleu!')
    self.assertEqual(parsed.original_encoding, None)
def test_utf8_in_unicode_out(self):
    # UTF-8 bytes are decoded to Unicode on the way in.
    # NOTE(review): original_encoding is not asserted here, unlike in
    # the ASCII test.
    parsed = self.soup(self.utf8_data)
    decoded = parsed.decode()
    self.assertEqual(decoded, self.unicode_data)
    foo_text = parsed.foo.string
    self.assertEqual(foo_text, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
    # A tree built from the Unicode fixture encodes to the UTF-8 fixture.
    encoded = self.soup(self.unicode_data).encode('utf-8')
    self.assertEqual(encoded, self.utf8_data)
# Sample document used by test_real_hebrew_document, which decodes it as
# ISO-8859-8; the trailing bytes \xed\xe5\xec\xf9 are the non-ASCII part.
# NOTE(review): the bytes literal is split across two lines and holds no
# markup (the title text simply appears twice), so the original HTML
# appears to have been stripped from this file -- restore it from the
# upstream test before relying on this fixture.
HEBREW_DOCUMENT = b'Hebrew (ISO 8859-8) in Visual DirectionalityHebrew (ISO 8859-8) in Visual Directionality
\xed\xe5\xec\xf9'
def test_real_hebrew_document(self):
    # A real-world test to make sure we can convert ISO-8859-8 (a
    # Hebrew encoding) to UTF-8.
    #
    # The encoding name is spelled once so the parse hint, the
    # original_encoding check and the reference decode cannot drift apart.
    hebrew_encoding = "iso-8859-8"
    soup = self.soup(self.HEBREW_DOCUMENT, from_encoding=hebrew_encoding)
    self.assertEqual(soup.original_encoding, hebrew_encoding)
    actual_utf8 = soup.encode('utf-8')
    expected_utf8 = self.HEBREW_DOCUMENT.decode(hebrew_encoding).encode("utf-8")
    self.assertEqual(actual_utf8, expected_utf8)
class TestLXMLXMLBuilder(SoupTest):
"""Test XML-specific parsing behavior.
Most of the tests use HTML as an example, since Beautiful Soup is
mainly an HTML parser. This test suite is a base for XML-specific
tree builders.
"""
@property
def default_builder(self):
    # Construct a new XML tree builder on every access.
    xml_builder = LXMLTreeBuilderForXML()
    return xml_builder
def test_mixed_case_tags(self):
    # Tag names keep their mixed case instead of being lowercased,
    # while each end tag takes the case of its start tag.
    # NOTE(review): both literals are empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    markup = ""
    expected = ""
    self.assertSoupEquals(markup, expected)
def test_cdata_becomes_text(self):
    # lxml delivers CData sections as plain 'data' events, so no
    # dedicated CData object can be built for them and the content
    # ends up as a NavigableString. Fixing this would be nice but is
    # not a high priority.
    # NOTE(review): the markup literal is empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    parsed = self.soup("")
    first_child = parsed.foo.contents[0]
    node_type_name = first_child.__class__.__name__
    self.assertEqual(node_type_name, 'NavigableString')
def test_can_handle_invalid_xml(self):
    # The markup must parse and render as the expected document.
    # NOTE(review): both literals are empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    markup, expected = "", ""
    self.assertSoupEquals(markup, expected)
def test_empty_element_tag(self):
# Asserts that the `iamselfclosing` tag found on the parsed soup reports
# is_empty_element.
# NOTE(review): the markup literal below is split across two lines and
# contains no tag, so the original markup appears to have been stripped
# from this file -- confirm against the upstream test.
soup = self.soup("
")
self.assertTrue(soup.iamselfclosing.is_empty_element)
def test_self_empty_tag_treated_as_empty_element(self):
# Asserts that the `iamclosed` tag found on the parsed soup reports
# is_empty_element.
# NOTE(review): the markup literal below is split across two lines and
# contains no tag, so the original markup appears to have been stripped
# from this file -- confirm against the upstream test.
soup = self.soup("
")
self.assertTrue(soup.iamclosed.is_empty_element)
def test_self_nonempty_tag_is_not_empty_element(self):
# Asserts that the `ihavecontents` tag found on the parsed soup does not
# report is_empty_element.
# NOTE(review): the markup literal below is split across two lines and
# holds only the text "contents" with no surrounding tag, so the original
# markup appears to have been stripped -- confirm against the upstream
# test.
soup = self.soup("contents
")
self.assertFalse(soup.ihavecontents.is_empty_element)
def test_empty_tag_that_stops_being_empty_gets_a_closing_tag(self):
    # The `bar` tag starts out as an empty element; after content is
    # inserted it must stop reporting itself as one, and the rendered
    # document is compared against the expected output.
    # NOTE(review): the markup literal is empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    tree = self.soup("")
    bar = tree.bar
    self.assertTrue(bar.is_empty_element)
    bar.insert(1, "Contents")
    self.assertFalse(bar.is_empty_element)
    rendered = str(tree)
    self.assertEqual(rendered, self.document_for("Contents"))
def test_designated_empty_element_tag_has_no_closing_tag(self):
    # With 'bar' listed in empty_element_tags, the parsed `bar` tag must
    # report itself as an empty element and render accordingly.
    # NOTE(review): the markup literals are empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    xml_builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
    tree = BeautifulSoup(markup="", builder=xml_builder)
    self.assertTrue(tree.bar.is_empty_element)
    rendered = str(tree)
    self.assertEqual(rendered, self.document_for(""))
def test_empty_tag_not_in_empty_element_tag_list_has_closing_tag(self):
    # Only 'bar' is listed in empty_element_tags, so the parsed `foo`
    # tag must not report itself as an empty element.
    # NOTE(review): the markup literals are empty here, which looks like
    # stripped markup -- confirm against the upstream test.
    xml_builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
    tree = BeautifulSoup(markup="", builder=xml_builder)
    self.assertFalse(tree.foo.is_empty_element)
    rendered = str(tree)
    self.assertEqual(rendered, self.document_for(""))
def test_designated_empty_element_tag_does_not_change_parser_behavior(self):
    # Listing a tag in empty_element_tags only changes how an empty tag
    # is presented; how markup is parsed remains up to the parser.
    # NOTE(review): the markup holds only the text "contents" with no
    # surrounding tag, which looks like stripped markup -- confirm
    # against the upstream test.
    xml_builder = LXMLTreeBuilderForXML(empty_element_tags=['bar'])
    tree = BeautifulSoup(markup="contents", builder=xml_builder)
    rendered = str(tree)
    self.assertEqual(rendered, self.document_for("contents"))