diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-16 10:06:26 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-16 10:06:26 -0400 |
commit | 0afe0af7cd8240ab790ccbcea6ecbdf69f21461e (patch) | |
tree | bd6a8fc992d24d144466e74ea9f7b2ac4fb31fa1 /bs4 | |
parent | c40bc98de62545aa8855311a1d046af5cd9020ba (diff) |
Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. [bug=980237]
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/dammit.py | 70 | ||||
-rw-r--r-- | bs4/element.py | 27 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 19 |
3 files changed, 72 insertions, 44 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index a35c213..65fd43d 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -81,58 +81,62 @@ class EntitySubstitution(object): return "&%s;" % entity @classmethod - def substitute_xml(cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. - :param value: A string to be substituted. The less-than sign will - become &lt;, the greater-than sign will become &gt;, and any - ampersands that are not part of an entity defition will - become &amp;. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - - Ordinarily, the string will be quoted using double quotes. + Most strings will be quoted using double quotes. Bob's Bar -> "Bob's Bar" - If the string contains double quotes, it will be quoted using + If a string contains double quotes, it will be quoted using single quotes. Welcome to "my bar" -> 'Welcome to "my bar"' - If the string contains both single and double quotes, the + If a string contains both single and double quotes, the double quotes will be escaped, and the string will be quoted using double quotes. Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot; """ - if make_quoted_attribute: - quote_with = '"' - if '"' in value: - if "'" in value: - # The string contains both single and double - # quotes. Turn the double quotes into - # entities. We quote the double quotes rather than - # the single quotes because the entity name is - # "&quot;" whether this is HTML or XML. If we - # quoted the single quotes, we'd have to decide - # between &apos; and &squot;. - replace_with = "&quot;" - value = value.replace('"', replace_with) - else: - # There are double quotes but no single quotes. - # We can use single quotes to quote the attribute. 
- quote_with = "'" + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # "&quot;" whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between &apos; and &squot;. + replace_with = "&quot;" + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign will + become &lt;, the greater-than sign will become &gt;, and any + ampersands that are not part of an entity defition will + become &amp;. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ # Escape angle brackets, and ampersands that aren't part of # entities. 
value = cls.BARE_AMPERSAND_OR_BRACKET.sub( cls._substitute_xml_entity, value) + if make_quoted_attribute: - return quote_with + value + quote_with - else: - return value + value = cls.quoted_attribute_value(value) + return value @classmethod def substitute_html(cls, s): diff --git a/bs4/element.py b/bs4/element.py index 496f2ad..684da38 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -57,6 +57,18 @@ class PageElement(object): None : None } + @classmethod + def format_string(self, s, formatter='minimal'): + """Format the given string using the given formatter.""" + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = s + else: + output = formatter(s) + return output + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" @@ -617,14 +629,7 @@ class NavigableString(unicode, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): - if not callable(formatter): - formatter = self.FORMATTERS.get( - formatter, EntitySubstitution.substitute_xml) - if formatter is None: - output = self - else: - output = formatter(self) - + output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @@ -950,8 +955,10 @@ class Tag(PageElement): and '%SOUP-ENCODING%' in val): val = self.substitute_encoding(val, eventual_encoding) - decoded = (str(key) + '=' - + EntitySubstitution.substitute_xml(val, True)) + text = self.format_string(val, formatter) + decoded = ( + str(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' closeTag = '' diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index f4fe451..661decb 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1219,6 +1219,23 @@ class TestSubstitutions(SoupTest): decoded, self.document_for(u"<b><FOO></b><b>BAR</b>")) + def 
test_formatter_is_run_on_attribute_values(self): + markup = u'<a href="http://a.com?a=b&c=é">e</a>' + soup = self.soup(markup) + a = soup.a + + expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + + expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) + expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + def test_prettify_accepts_formatter(self): soup = BeautifulSoup("<html><body>foo</body></html>") pretty = soup.prettify(formatter = lambda x: x.upper()) @@ -1309,7 +1326,7 @@ class TestEncoding(SoupTest): def test_encode_contents(self): html = u"<b>\N{SNOWMAN}</b>" soup = self.soup(html) - self.assertEquals( + self.assertEqual( u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( encoding="utf8")) |