summaryrefslogtreecommitdiff
path: root/bs4/tests/test_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/tests/test_htmlparser.py')
-rw-r--r--bs4/tests/test_htmlparser.py32
1 files changed, 32 insertions, 0 deletions
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
+    def test_html5_attributes(self):
+        # Each row is (entity as written in the markup, the Unicode text
+        # html.parser decodes it to, the bytes written back out with
+        # formatter="html"). The outgoing entity may be a different name
+        # for the same text; plain ASCII text is written with no entity.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
class TestHTMLParserSubclass(SoupTest):
def test_error(self):