diff options
Diffstat (limited to 'bs4/tests/test_html5lib.py')
-rw-r--r-- | bs4/tests/test_html5lib.py | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 7b0a6d4..2adebc8 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): # because there's no way of knowing, when a string is created, # where in the tree it will eventually end up. pass + + def test_html5_attributes(self): + # The html5lib TreeBuilder can convert any entity named in + # the HTML5 spec to a sequence of Unicode characters, and + # convert those Unicode characters to a (potentially + # different) named entity on the way out. + # + # This is a copy of the same test from + # HTMLParserTreeBuilderSmokeTest. It's not in the superclass + # because the lxml HTML TreeBuilder _doesn't_ work this way. + for input_element, output_unicode, output_element in ( + ("⇄", u'\u21c4', b'⇄'), + ('⊧', u'\u22a7', b'⊧'), + ('𝔑', u'\U0001d511', b'𝔑'), + ('≧̸', u'\u2267\u0338', b'≧̸'), + ('¬', u'\xac', b'¬'), + ('⫬', u'\u2aec', b'⫬'), + ('"', u'"', b'"'), + ('∴', u'\u2234', b'∴'), + ('∴', u'\u2234', b'∴'), + ('∴', u'\u2234', b'∴'), + ("fj", u'fj', b'fj'), + ("⊔", u'\u2294', b'⊔'), + ("⊔︀", u'\u2294\ufe00', b'⊔︀'), + ("'", u"'", b"'"), + ("|", u"|", b"|"), + ): + markup = u'<div>%s</div>' % input_element + div = self.soup(markup).div + without_element = div.encode() + expect = b"<div>%s</div>" % output_unicode.encode("utf8") + self.assertEquals(without_element, expect) + + with_element = div.encode(formatter="html") + expect = b"<div>%s</div>" % output_element + self.assertEquals(with_element, expect) |