From a00624d7fc2e29b41b286f46844cb75f4d96ff63 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Mon, 31 May 2021 15:49:41 -0400
Subject: The html.parser tree builder can now handle named entities   found
 in the HTML5 spec in much the same way that the html5lib   tree builder does.
 Note that the lxml tree builder still handles   named entities differently.
 [bug=1924908]

---
 bs4/tests/test_htmlparser.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'bs4/tests/test_htmlparser.py')
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             attrs[key].append(value)
         assert_attribute(accumulate, ["url1", "url2", "url3"])            
 
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to Unicode characters, and formatter="html"
+        # converts them back to a (potentially different) named entity.
+        # Each case: (entity in markup, decoded text, "html" output).
+        for input_element, output_unicode, output_element in (
+                ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+                ('&models;', u'\u22a7', b'&models;'),
+                ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+                ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+                ('&not;', u'\xac', b'&not;'),
+                ('&Not;', u'\u2aec', b'&Not;'),
+                ('&quot;', u'"', b'"'),
+                ('&there4;', u'\u2234', b'&there4;'),
+                ('&Therefore;', u'\u2234', b'&there4;'),
+                ('&therefore;', u'\u2234', b'&there4;'),
+                ("&fjlig;", u'fj', b'fj'),
+                ("&sqcup;", u'\u2294', b'&sqcup;'),
+                ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+                ("&apos;", u"'", b"'"),
+                ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
-- 
cgit v1.2.3