diff options
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/dammit.py | 19 | ||||
-rw-r--r-- | bs4/testing.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 2 |
4 files changed, 24 insertions, 6 deletions
@@ -6,6 +6,9 @@ attributes are treated -- you can do this with the `multi_valued_attributes` argument. [bug=1832978] +* &apos; (which is valid in XML and XHTML, but not HTML 4) is now + recognized as a named entity and converted to a single quote. [bug=1818721] + = 4.7.1 (20190106) * Fixed a significant performance problem introduced in 4.7.0. [bug=1810617] diff --git a/bs4/dammit.py b/bs4/dammit.py index fb2f8b8..08109f2 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -57,15 +57,24 @@ class EntitySubstitution(object): lookup = {} reverse_lookup = {} characters_for_re = [] - for codepoint, name in list(codepoint2name.items()): + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: character = unichr(codepoint) - if codepoint != 34: + if codepoint not in (34, 39): # There's no point in turning the quotation mark into - # &quot;, unless it happens within an attribute value, which - # is handled elsewhere. + # &quot; or the single quote into &apos;, unless it + # happens within an attribute value, which is handled + # elsewhere. characters_for_re.append(character) lookup[character] = name - # But we do want to turn &quot; into the quotation mark. + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. reverse_lookup[name] = character re_definition = "[%s]" % "".join(characters_for_re) return lookup, reverse_lookup, re.compile(re_definition) diff --git a/bs4/testing.py b/bs4/testing.py index e144e7e..9f12e8d 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -491,6 +491,12 @@ Hello, world! 
u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>" ) + def test_apos_entity(self): + self.assertSoupEquals( + u"<p>Bob&apos;s Bar</p>", + u"<p>Bob's Bar</p>", + ) + def test_entities_in_foreign_document_encoding(self): # &#147; and &#148; are invalid numeric entities referencing # Windows-1252 characters. &#45; references a character common diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index a2242da..e50d603 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -279,7 +279,7 @@ class TestEntitySubstitution(unittest.TestCase): self.assertEqual( self.sub.substitute_xml_containing_entities("&Aacute;T&T"), "&Aacute;T&amp;T") - + def test_quotes_not_html_substituted(self): """There's no need to do this except inside attribute values.""" text = 'Bob\'s "bar"' |