summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-07-07 21:46:36 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-07-07 21:46:36 -0400
commit: 2fcaeb6e916a09fa87b4b2ab57167c39db6cef8c (patch)
tree: 06f7f47a4c6adaa6e33e24c91a5dee61162fd441 /bs4/dammit.py
parent: fd040bfacc6caa2d0b92edbeed5f32582ad55d83 (diff)
&apos; (which is valid in XML and XHTML, but not HTML 4) is now
recognized as a named entity and converted to a single quote. [bug=1818721]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py 19
1 files changed, 14 insertions, 5 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index fb2f8b8..08109f2 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -57,15 +57,24 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters_for_re = []
- for codepoint, name in list(codepoint2name.items()):
+
+ # &apos is an XHTML entity and an HTML 5, but not an HTML 4
+ # entity. We don't want to use it, but we want to recognize it on the way in.
+ #
+ # TODO: Ideally we would be able to recognize all HTML 5 named
+ # entities, but that's a little tricky.
+ extra = [(39, 'apos')]
+ for codepoint, name in list(codepoint2name.items()) + extra:
character = unichr(codepoint)
- if codepoint != 34:
+ if codepoint not in (34, 39):
# There's no point in turning the quotation mark into
- # &quot;, unless it happens within an attribute value, which
- # is handled elsewhere.
+ # &quot; or the single quote into &apos;, unless it
+ # happens within an attribute value, which is handled
+ # elsewhere.
characters_for_re.append(character)
lookup[character] = name
- # But we do want to turn &quot; into the quotation mark.
+ # But we do want to recognize those entities on the way in and
+ # convert them to Unicode characters.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)