diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 21:46:36 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 21:46:36 -0400 |
commit | 2fcaeb6e916a09fa87b4b2ab57167c39db6cef8c (patch) | |
tree | 06f7f47a4c6adaa6e33e24c91a5dee61162fd441 /bs4/dammit.py | |
parent | fd040bfacc6caa2d0b92edbeed5f32582ad55d83 (diff) |
&apos; (which is valid in XML and XHTML, but not HTML 4) is now
recognized as a named entity and converted to a single quote. [bug=1818721]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 19 |
1 files changed, 14 insertions, 5 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index fb2f8b8..08109f2 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -57,15 +57,24 @@ class EntitySubstitution(object): lookup = {} reverse_lookup = {} characters_for_re = [] - for codepoint, name in list(codepoint2name.items()): + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: character = unichr(codepoint) - if codepoint != 34: + if codepoint not in (34, 39): # There's no point in turning the quotation mark into - # &quot;, unless it happens within an attribute value, which - # is handled elsewhere. + # &quot; or the single quote into &apos;, unless it + # happens within an attribute value, which is handled + # elsewhere. characters_for_re.append(character) lookup[character] = name - # But we do want to turn &quot; into the quotation mark. + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. reverse_lookup[name] = character re_definition = "[%s]" % "".join(characters_for_re) return lookup, reverse_lookup, re.compile(re_definition) |