diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-05-31 15:49:41 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-05-31 15:49:41 -0400 |
commit | a00624d7fc2e29b41b286f46844cb75f4d96ff63 (patch) | |
tree | 339396570eeaef7e51454dd5de9c432df29cce36 /bs4/tests/test_soup.py | |
parent | 8d73b97105bf6534057ee93af6795a2a0aceb993 (diff) |
The html.parser tree builder can now handle named entities
found in the HTML5 spec in much the same way that the html5lib
tree builder does. Note that the lxml tree builder still handles
named entities differently. [bug=1924908]
Diffstat (limited to 'bs4/tests/test_soup.py')
-rw-r--r-- | bs4/tests/test_soup.py | 45 |
1 files changed, 45 insertions, 0 deletions
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index ddb6446..9074bdb 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -368,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase): self.assertEqual(self.sub.substitute_html(dammit.markup), "&lsquo;&rsquo;foo&ldquo;&rdquo;") + def test_html5_entity(self): + # Some HTML5 entities correspond to single- or multi-character + # Unicode sequences. + + for entity, u in ( + # A few spot checks of our ability to recognize + # special character sequences and convert them + # to named entities. + ('&models;', u'\u22a7'), + ('&Nfr;', u'\U0001d511'), + ('&ngeqq;', u'\u2267\u0338'), + ('&not;', u'\xac'), + ('&Not;', u'\u2aec'), + + # We _could_ convert | to &verbarr;, but we don't, because + # | is an ASCII character. + ('|' '|'), + + # Similarly for the fj ligature, which we could convert to + # &fjlig;, but we don't. + ("fj", "fj"), + + # We do convert _these_ ASCII characters to HTML entities, + # because that's required to generate valid HTML. + ('&gt;', '>'), + ('&lt;', '<'), + ('&amp;', '&'), + ): + template = u'3 %s 4' + raw = template % u + with_entities = template % entity + self.assertEqual(self.sub.substitute_html(raw), with_entities) + + def test_html5_entity_with_variation_selector(self): + # Some HTML5 entities correspond either to a single-character + # Unicode sequence _or_ to the same character plus U+FE00, + # VARIATION SELECTOR 1. We can handle this. + data = u"fjords \u2294 penguins" + markup = u"fjords &sqcup; penguins" + self.assertEqual(self.sub.substitute_html(data), markup) + + data = u"fjords \u2294\ufe00 penguins" + markup = u"fjords &sqcups; penguins" + self.assertEqual(self.sub.substitute_html(data), markup) + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): s = 'Welcome to "my bar"' self.assertEqual(self.sub.substitute_xml(s, False), s) |