diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/testing.py | 44 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 8 |
2 files changed, 47 insertions, 5 deletions
diff --git a/bs4/testing.py b/bs4/testing.py index 49f50a5..cc30e17 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -105,6 +105,16 @@ class HTMLTreeBuilderSmokeTest(object): self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "<p></p>") + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("<p>", "<p></p>") + self.assertSoupEquals("<b>", "<b></b>") + + self.assertSoupEquals("<br>", "<br/>") + def test_br_is_always_empty_element_tag(self): """A <br> tag is designated as an empty-element tag. @@ -167,15 +177,32 @@ class HTMLTreeBuilderSmokeTest(object): "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") - def test_hex_entities_in_text(self): - """This mainly tests a BS workaround for a bug in HTMLParser.""" - self.assertSoupEquals("<p>ñ</p>", u"<p>\xf1</p>") + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') + + def test_entities_in_attributes_converted_to_unicode(self): + expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' + self.assertSoupEquals('<p id="piñata"></p>', expect) + self.assertSoupEquals('<p id="piñata"></p>', expect) + self.assertSoupEquals('<p id="piñata"></p>', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' + self.assertSoupEquals("<p>piñata</p>", expect) + self.assertSoupEquals("<p>piñata</p>", expect) + self.assertSoupEquals("<p>piñata</p>", expect) + + def test_out_of_range_entity(self): + expect = u"\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder - # to detect any differences. + # to detect any differences between them. # def test_soupstrainer(self): @@ -201,10 +228,17 @@ class HTMLTreeBuilderSmokeTest(object): soup.foo.decode(), """<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""") - def test_ampersand_in_attribute_value_gets_quoted(self): + def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('<this is="really messed up & stuff"></this>', '<this is="really messed up & stuff"></this>') + self.assertSoupEquals( + '<a href="http://example.org?a=1&b=2;3">foo</a>', + '<a href="http://example.org?a=1&b=2;3">foo</a>') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>') + def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index d60bd3b..92b7389 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -27,3 +27,11 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): return LXMLTreeBuilder() + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") |