summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-22 22:57:39 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-22 22:57:39 -0500
commit: 7db58f67d60bf2675613e38d2d8daeeadc5522db (patch)
tree: 6cacead21b43bb5de23947a414c5d949c0d6c434
parent: f42fef27dc82ce97df0cb7b254595e6771461637 (diff)
parent: 6d7422139b7a60f48761f9a8ef52ed3de7393287 (diff)
Solved the question of how to decide between &apos; (XML) and &squot; (HTML) by cutting the Gordian knot: quote the *double* quotes, which are always &quot;.
-rw-r--r-- beautifulsoup/dammit.py 48
-rw-r--r-- beautifulsoup/element.py 3
-rw-r--r-- tests/test_lxml.py 2
-rw-r--r-- tests/test_soup.py 12
4 files changed, 30 insertions, 35 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 67bec17..788f72d 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -35,6 +35,10 @@ class EntitySubstitution(object):
lookup = {}
characters = []
for codepoint, name in codepoint2name.items():
+ if codepoint == 34:
+ # There's no point in turning the quotation mark into
+ # &quot;--even in attribute values we quote it separately.
+ continue;
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
@@ -66,8 +70,7 @@ class EntitySubstitution(object):
entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
- def substitute_xml(self, value, make_quoted_attribute=False,
- destination_is_xml=False):
+ def substitute_xml(self, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
@@ -88,33 +91,28 @@ class EntitySubstitution(object):
Welcome to "my bar" -> 'Welcome to "my bar"'
If the string contains both single and double quotes, the
- single quotes will be escaped (see `destination_is_xml`), and
- the string will be quoted using single quotes.
-
- Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s bar'
- OR
- 'Welcome to "Bob&apos;s bar'
- (depending on the value of `destination_is_xml`)
-
- :param destination_is_xml: If destination_is_xml is True,
- then when a single quote is escaped it will become
- "&apos;". But &apos; is not a valid HTML 4 entity. If
- destination_is_xml is False, then single quotes will be
- turned into "&squot;".
-
- The value of this argument is irrelevant unless
- make_quoted_attribute is True.
+ double quotes will be escaped, and the string will be quoted
+ using double quotes.
+
+ Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
"""
- quote_with = '"'
if make_quoted_attribute:
+ quote_with = '"'
if '"' in value:
- quote_with = "'"
if "'" in value:
- if destination_is_xml:
- replace_with = "&apos;"
- else:
- replace_with = "&squot;"
- value = value.replace("'", replace_with)
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # "&quot;" whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between &apos; and &squot;.
+ replace_with = "&quot;"
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
# Escape angle brackets, and ampersands that aren't part of
# entities.
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 0ef9db1..23f8c33 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -582,8 +582,7 @@ class Tag(PageElement, EntitySubstitution):
and '%SOUP-ENCODING%' in val):
val = self.substituteEncoding(val, eventual_encoding)
- # XXX: Set destination_is_xml based on... something!
- decoded = key + '=' + self.substitute_xml(val, True, False)
+ decoded = key + '=' + self.substitute_xml(val, True)
attrs.append(decoded)
close = ''
closeTag = ''
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index df2f341..8f3d798 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -125,7 +125,7 @@ class TestLXMLBuilder(SoupTest):
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
- """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""")
+ """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_quoted(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
diff --git a/tests/test_soup.py b/tests/test_soup.py
index eaedd94..c4d9c2c 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -53,15 +53,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
- # This one is going into an HTML document.
self.assertEquals(
self.sub.substitute_xml(s, True),
- "'Welcome to \"Bob&squot;s Bar\"'")
-
- # This one is going into an XML document.
- self.assertEquals(
- self.sub.substitute_xml(s, True, destination_is_xml=True),
- "'Welcome to \"Bob&apos;s Bar\"'")
+ '"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
@@ -80,6 +74,10 @@ class TestEntitySubstitution(unittest.TestCase):
self.sub.substitute_xml("&Aacute;T&T"),
"&Aacute;T&amp;T")
+ def test_quotes_not_html_substituted(self):
+ """There's no need to do this except inside attribute values."""
+ text = 'Bob\'s "bar"'
+ self.assertEquals(self.sub.substitute_html(text), text)
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""