summaryrefslogtreecommitdiff
path: root/bs4/tests/test_html5lib.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2021-05-31 15:49:41 -0400
committerLeonard Richardson <leonardr@segfault.org>2021-05-31 15:49:41 -0400
commita00624d7fc2e29b41b286f46844cb75f4d96ff63 (patch)
tree339396570eeaef7e51454dd5de9c432df29cce36 /bs4/tests/test_html5lib.py
parent8d73b97105bf6534057ee93af6795a2a0aceb993 (diff)
The html.parser tree builder can now handle named entities
found in the HTML5 spec in much the same way that the html5lib tree builder does. Note that the lxml tree builder still handles named entities differently. [bug=1924908]
Diffstat (limited to 'bs4/tests/test_html5lib.py')
-rw-r--r--bs4/tests/test_html5lib.py36
1 files changed, 36 insertions, 0 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..2adebc8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        #
+        # Each case is a 3-tuple:
+        #   input_element  -- the named entity as written in the markup
+        #   output_unicode -- the Unicode string it is expected to parse to
+        #   output_element -- the bytes expected back from formatter="html"
+        #
+        # Note from the cases below that several input names map to one
+        # output name (&Therefore; and &therefore; both come back as
+        # &there4;), and that some characters are expected to be emitted
+        # literally rather than as an entity (", ', |, and "fj").
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+
+            # Encoded without an explicit formatter, the <div> is
+            # expected to contain the UTF-8 bytes of the parsed
+            # Unicode characters.
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            # Use assertEqual: the assertEquals alias is deprecated and
+            # was removed from unittest in Python 3.12.
+            self.assertEqual(without_element, expect)
+
+            # Encoded with formatter="html", the <div> is expected to
+            # contain the named entity (or literal) from the case tuple.
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)