&apos; (which is valid in XML and XHTML, but not HTML 4) is now

recognized as a named entity and converted to a single quote. [bug=1818721]
author: Leonard Richardson <leonardr@segfault.org> 2019-07-07 21:46:36 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-07-07 21:46:36 -0400
commit: 2fcaeb6e916a09fa87b4b2ab57167c39db6cef8c (patch)
tree: 06f7f47a4c6adaa6e33e24c91a5dee61162fd441
parent: fd040bfacc6caa2d0b92edbeed5f32582ad55d83 (diff)
4 files changed, 24 insertions, 6 deletions
diff --git a/CHANGELOG b/CHANGELOG
index da4aaaf..b60b5b5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,9 @@
    attributes are treated -- you can do this with the
    `multi_valued_attributes` argument. [bug=1832978]
 
+* &apos; (which is valid in XML and XHTML, but not HTML 4) is now
+   recognized as a named entity and converted to a single quote. [bug=1818721]
+
 = 4.7.1 (20190106)
 
 * Fixed a significant performance problem introduced in 4.7.0. [bug=1810617]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index fb2f8b8..08109f2 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -57,15 +57,24 @@ class EntitySubstitution(object):
         lookup = {}
         reverse_lookup = {}
         characters_for_re = []
-        for codepoint, name in list(codepoint2name.items()):
+
+        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
+        # entity. We don't want to use it, but we want to recognize it on the way in.
+        #
+        # TODO: Ideally we would be able to recognize all HTML 5 named
+        # entities, but that's a little tricky.
+        extra = [(39, 'apos')]
+        for codepoint, name in list(codepoint2name.items()) + extra:
             character = unichr(codepoint)
-            if codepoint != 34:
+            if codepoint not in (34, 39):
                 # There's no point in turning the quotation mark into
-                # &quot;, unless it happens within an attribute value, which
-                # is handled elsewhere.
+                # &quot; or the single quote into &apos;, unless it
+                # happens within an attribute value, which is handled
+                # elsewhere.
                 characters_for_re.append(character)
                 lookup[character] = name
-            # But we do want to turn &quot; into the quotation mark.
+            # But we do want to recognize those entities on the way in and
+            # convert them to Unicode characters.
             reverse_lookup[name] = character
         re_definition = "[%s]" % "".join(characters_for_re)
         return lookup, reverse_lookup, re.compile(re_definition)
diff --git a/bs4/testing.py b/bs4/testing.py
index e144e7e..9f12e8d 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -491,6 +491,12 @@ Hello, world!
             u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
         )
 
+    def test_apos_entity(self):
+        self.assertSoupEquals(
+            u"<p>Bob&apos;s Bar</p>",
+            u"<p>Bob's Bar</p>",
+        )
+        
     def test_entities_in_foreign_document_encoding(self):
         # &#147; and &#148; are invalid numeric entities referencing
         # Windows-1252 characters. &#45; references a character common
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index a2242da..e50d603 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -279,7 +279,7 @@ class TestEntitySubstitution(unittest.TestCase):
         self.assertEqual(
             self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
             "&Aacute;T&amp;T")
-
+       
     def test_quotes_not_html_substituted(self):
         """There's no need to do this except inside attribute values."""
         text = 'Bob\'s "bar"'
author	Leonard Richardson <leonardr@segfault.org>	2019-07-07 21:46:36 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2019-07-07 21:46:36 -0400
commit	2fcaeb6e916a09fa87b4b2ab57167c39db6cef8c (patch)
tree	06f7f47a4c6adaa6e33e24c91a5dee61162fd441
parent	fd040bfacc6caa2d0b92edbeed5f32582ad55d83 (diff)