summary | refs | log | tree | commit | diff
path: root/bs4/tests/test_html5lib.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/tests/test_html5lib.py')
-rw-r--r-- bs4/tests/test_html5lib.py 36
1 files changed, 36 insertions, 0 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..2adebc8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),  # different name out
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),  # astral-plane character
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),  # two code points
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),  # case-sensitive: not the same as &not;
+            ('&quot;', u'"', b'"'),  # expected back as a bare quote
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),  # alias, expected as &there4;
+            ('&therefore;', u'\u2234', b'&there4;'),  # alias, expected as &there4;
+            ("&fjlig;", u'fj', b'fj'),  # expands to plain ASCII letters
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),  # base char + variation selector
+            ("&apos;", u"'", b"'"),  # expected back as a bare apostrophe
+            ("&verbar;", u"|", b"|"),  # expected back as a bare ASCII bar
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()  # no formatter: expect raw UTF-8 bytes
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")  # expect named entities
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)