diff options
Diffstat (limited to 'bs4/tests/test_htmlparser.py')
-rw-r--r-- | bs4/tests/test_htmlparser.py | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             attrs[key].append(value)
         assert_attribute(accumulate, ["url1", "url2", "url3"])
 
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEquals(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEquals(with_element, expect)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):