The html.parser tree builder can now handle named entities

found in the HTML5 spec in much the same way that the html5lib tree builder does. Note that the lxml tree builder still handles named entities differently. [bug=1924908]
author: Leonard Richardson <leonardr@segfault.org> 2021-05-31 15:49:41 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-05-31 15:49:41 -0400
commit: a00624d7fc2e29b41b286f46844cb75f4d96ff63 (patch)
tree: 339396570eeaef7e51454dd5de9c432df29cce36 /bs4/tests
parent: 8d73b97105bf6534057ee93af6795a2a0aceb993 (diff)
4 files changed, 114 insertions, 1 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..2adebc8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         # because there's no way of knowing, when a string is created,
         # where in the tree it will eventually end up.
         pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest.  It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+                ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+                ('&models;', u'\u22a7', b'&models;'),
+                ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+                ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+                ('&not;', u'\xac', b'&not;'),
+                ('&Not;', u'\u2aec', b'&Not;'),
+                ('&quot;', u'"', b'"'),
+                ('&there4;', u'\u2234', b'&there4;'),
+                ('&Therefore;', u'\u2234', b'&there4;'),
+                ('&therefore;', u'\u2234', b'&there4;'),
+                ("&fjlig;", u'fj', b'fj'),
+                ("&sqcup;", u'\u2294', b'&sqcup;'),
+                ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+                ("&apos;", u"'", b"'"),
+                ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()  # no formatter: raw characters expected
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")  # named entities expected
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             attrs[key].append(value)
         assert_attribute(accumulate, ["url1", "url2", "url3"])            
 
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+                ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+                ('&models;', u'\u22a7', b'&models;'),
+                ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+                ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+                ('&not;', u'\xac', b'&not;'),
+                ('&Not;', u'\u2aec', b'&Not;'),
+                ('&quot;', u'"', b'"'),
+                ('&there4;', u'\u2234', b'&there4;'),
+                ('&Therefore;', u'\u2234', b'&there4;'),
+                ('&therefore;', u'\u2234', b'&there4;'),
+                ("&fjlig;", u'fj', b'fj'),
+                ("&sqcup;", u'\u2294', b'&sqcup;'),
+                ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+                ("&apos;", u"'", b"'"),
+                ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()  # no formatter: raw characters expected
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")  # named entities expected
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index f96e4ae..d8dada4 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
         self.assertSoupEquals(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
-
+        
     def test_entities_in_foreign_document_encoding(self):
         # We can't implement this case correctly because by the time we
         # hear about markup like "&#147;", it's been (incorrectly) converted into
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddb6446..9074bdb 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -368,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
         self.assertEqual(self.sub.substitute_html(dammit.markup),
                           "&lsquo;&rsquo;foo&ldquo;&rdquo;")
 
+    def test_html5_entity(self):
+        # Some HTML5 entities correspond to single- or multi-character
+        # Unicode sequences.
+
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ('&models;', u'\u22a7'),
+            ('&Nfr;', u'\U0001d511'),
+            ('&ngeqq;', u'\u2267\u0338'),
+            ('&not;', u'\xac'),
+            ('&Not;', u'\u2aec'),
+
+            # We _could_ convert | to &verbar;, but we don't, because
+            # | is an ASCII character.
+            ('|', '|'),
+
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ('&gt;', '>'),
+            ('&lt;', '<'),
+            ('&amp;', '&'),
+        ):
+            template = u'3 %s 4'
+            raw = template % u
+            with_entities = template % entity
+            self.assertEqual(self.sub.substitute_html(raw), with_entities)
+            
+    def test_html5_entity_with_variation_selector(self):
+        # An HTML5 entity may stand for a lone character _or_ for that
+        # character followed by U+FE00 (VARIATION SELECTOR 1); each
+        # form must be substituted with its own named entity.
+        cases = (
+            (u"fjords \u2294 penguins", u"fjords &sqcup; penguins"),
+            (u"fjords \u2294\ufe00 penguins", u"fjords &sqcups; penguins"),
+        )
+        for raw_text, expected_markup in cases:
+            substituted = self.sub.substitute_html(raw_text)
+            self.assertEqual(substituted, expected_markup)
+        
     def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
         s = 'Welcome to "my bar"'
         self.assertEqual(self.sub.substitute_xml(s, False), s)
author	Leonard Richardson <leonardr@segfault.org>	2021-05-31 15:49:41 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2021-05-31 15:49:41 -0400
commit	a00624d7fc2e29b41b286f46844cb75f4d96ff63 (patch)
tree	339396570eeaef7e51454dd5de9c432df29cce36 /bs4/tests
parent	8d73b97105bf6534057ee93af6795a2a0aceb993 (diff)