diff options
-rw-r--r-- | beautifulsoup/dammit.py | 48 | ||||
-rw-r--r-- | beautifulsoup/element.py | 3 | ||||
-rw-r--r-- | tests/test_lxml.py | 2 | ||||
-rw-r--r-- | tests/test_soup.py | 12 |
4 files changed, 30 insertions, 35 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 67bec17..788f72d 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -35,6 +35,10 @@ class EntitySubstitution(object): lookup = {} characters = [] for codepoint, name in codepoint2name.items(): + if codepoint == 34: + # There's no point in turning the quotation mark into + # &quot;--even in attribute values we quote the + continue; character = unichr(codepoint) characters.append(character) lookup[character] = name @@ -66,8 +70,7 @@ class EntitySubstitution(object): entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] return "&%s;" % entity - def substitute_xml(self, value, make_quoted_attribute=False, - destination_is_xml=False): + def substitute_xml(self, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. :param value: A string to be substituted. The less-than sign will @@ -88,33 +91,28 @@ class EntitySubstitution(object): Welcome to "my bar" -> 'Welcome to "my bar"' If the string contains both single and double quotes, the - single quotes will be escaped (see `destination_is_xml`), and - the string will be quoted using single quotes. - - Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s bar' - OR - 'Welcome to "Bob&apos;s bar' - (depending on the value of `destination_is_xml`) - - :param destination_is_xml: If destination_is_xml is True, - then when a single quote is escaped it will become - "&apos;". But &apos; is not a valid HTML 4 entity. If - destination_is_xml is False, then single quotes will be - turned into "&squot;". - - The value of this argument is irrelevant unless - make_quoted_attribute is True. + double quotes will be escaped, and the string will be quoted + using double quotes. 
+ + Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot; """ - quote_with = '"' if make_quoted_attribute: + quote_with = '"' if '"' in value: - quote_with = "'" if "'" in value: - if destination_is_xml: - replace_with = "&apos;" - else: - replace_with = "&squot;" - value = value.replace("'", replace_with) + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # "&quot;" whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between &apos; and &squot;. + replace_with = "&quot;" + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" # Escape angle brackets, and ampersands that aren't part of # entities. diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 0ef9db1..23f8c33 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -582,8 +582,7 @@ class Tag(PageElement, EntitySubstitution): and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventual_encoding) - # XXX: Set destination_is_xml based on... something! 
- decoded = key + '=' + self.substitute_xml(val, True, False) + decoded = key + '=' + self.substitute_xml(val, True) attrs.append(decoded) close = '' closeTag = '' diff --git a/tests/test_lxml.py b/tests/test_lxml.py index df2f341..8f3d798 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -125,7 +125,7 @@ class TestLXMLBuilder(SoupTest): soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), - """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>""") + """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""") def test_ampersand_in_attribute_value_gets_quoted(self): self.assertSoupEquals('<this is="really messed up & stuff"></this>', diff --git a/tests/test_soup.py b/tests/test_soup.py index eaedd94..c4d9c2c 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -53,15 +53,9 @@ class TestEntitySubstitution(unittest.TestCase): def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): s = 'Welcome to "Bob\'s Bar"' - # This one is going into an HTML document. self.assertEquals( self.sub.substitute_xml(s, True), - "'Welcome to \"Bob&squot;s Bar\"'") - - # This one is going into an XML document. - self.assertEquals( - self.sub.substitute_xml(s, True, destination_is_xml=True), - "'Welcome to \"Bob&apos;s Bar\"'") + '"Welcome to &quot;Bob\'s Bar&quot;"') def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): quoted = 'Welcome to "Bob\'s Bar"' @@ -80,6 +74,10 @@ class TestEntitySubstitution(unittest.TestCase): self.sub.substitute_xml("&Aacute;T&T"), "&Aacute;T&amp;T") + def test_quotes_not_html_substituted(self): + """There's no need to do this except inside attribute values.""" + text = 'Bob\'s "bar"' + self.assertEquals(self.sub.substitute_html(text), text) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" |