summary | refs | log | tree | commit | diff
path: root/beautifulsoup/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-22 22:57:39 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-22 22:57:39 -0500
commit: 7db58f67d60bf2675613e38d2d8daeeadc5522db (patch)
tree: 6cacead21b43bb5de23947a414c5d949c0d6c434 /beautifulsoup/dammit.py
parent: f42fef27dc82ce97df0cb7b254595e6771461637 (diff)
parent: 6d7422139b7a60f48761f9a8ef52ed3de7393287 (diff)
Solved the question of how to decide between &apos; (XML) and &squot; (HTML) by cutting the Gordian knot: quote the *double* quotes, which are always &quot;.
Diffstat (limited to 'beautifulsoup/dammit.py')
-rw-r--r-- beautifulsoup/dammit.py 48
1 files changed, 23 insertions, 25 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 67bec17..788f72d 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -35,6 +35,10 @@ class EntitySubstitution(object):
lookup = {}
characters = []
for codepoint, name in codepoint2name.items():
+ if codepoint == 34:
+ # There's no point in turning the quotation mark into &quot; here;
+ # substitute_xml escapes quotes in attribute values when needed.
+ continue;
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
@@ -66,8 +70,7 @@ class EntitySubstitution(object):
entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
- def substitute_xml(self, value, make_quoted_attribute=False,
- destination_is_xml=False):
+ def substitute_xml(self, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
@@ -88,33 +91,28 @@ class EntitySubstitution(object):
Welcome to "my bar" -> 'Welcome to "my bar"'
If the string contains both single and double quotes, the
- single quotes will be escaped (see `destination_is_xml`), and
- the string will be quoted using single quotes.
-
- Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s bar'
- OR
- 'Welcome to "Bob&apos;s bar'
- (depending on the value of `destination_is_xml`)
-
- :param destination_is_xml: If destination_is_xml is True,
- then when a single quote is escaped it will become
- "&apos;". But &apos; is not a valid HTML 4 entity. If
- destination_is_xml is False, then single quotes will be
- turned into "&squot;".
-
- The value of this argument is irrelevant unless
- make_quoted_attribute is True.
+ double quotes will be escaped, and the string will be quoted
+ using double quotes.
+
+ Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
"""
- quote_with = '"'
if make_quoted_attribute:
+ quote_with = '"'
if '"' in value:
- quote_with = "'"
if "'" in value:
- if destination_is_xml:
- replace_with = "&apos;"
- else:
- replace_with = "&squot;"
- value = value.replace("'", replace_with)
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # "&quot;" whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between &apos; and &squot;.
+ replace_with = "&quot;"
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
# Escape angle brackets, and ampersands that aren't part of
# entities.