summaryrefslogtreecommitdiff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/dammit.py 70
-rw-r--r-- bs4/element.py 27
-rw-r--r-- bs4/tests/test_tree.py 19
3 files changed, 72 insertions, 44 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a35c213..65fd43d 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -81,58 +81,62 @@ class EntitySubstitution(object):
return "&%s;" % entity
@classmethod
- def substitute_xml(cls, value, make_quoted_attribute=False):
- """Substitute XML entities for special XML characters.
+ def quoted_attribute_value(self, value):
+ """Make a value into a quoted XML attribute, possibly escaping it.
- :param value: A string to be substituted. The less-than sign will
- become &lt;, the greater-than sign will become &gt;, and any
- ampersands that are not part of an entity defition will
- become &amp;.
-
- :param make_quoted_attribute: If True, then the string will be
- quoted, as befits an attribute value.
-
- Ordinarily, the string will be quoted using double quotes.
+ Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
- If the string contains double quotes, it will be quoted using
+ If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
- If the string contains both single and double quotes, the
+ If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
- if make_quoted_attribute:
- quote_with = '"'
- if '"' in value:
- if "'" in value:
- # The string contains both single and double
- # quotes. Turn the double quotes into
- # entities. We quote the double quotes rather than
- # the single quotes because the entity name is
- # "&quot;" whether this is HTML or XML. If we
- # quoted the single quotes, we'd have to decide
- # between &apos; and &squot;.
- replace_with = "&quot;"
- value = value.replace('"', replace_with)
- else:
- # There are double quotes but no single quotes.
- # We can use single quotes to quote the attribute.
- quote_with = "'"
+ quote_with = '"'
+ if '"' in value:
+ if "'" in value:
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # "&quot;" whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between &apos; and &squot;.
+ replace_with = "&quot;"
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
+ return quote_with + value + quote_with
+
+ @classmethod
+ def substitute_xml(cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign will
+ become &lt;, the greater-than sign will become &gt;, and any
+ ampersands that are not part of an entity defition will
+ become &amp;.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
+
if make_quoted_attribute:
- return quote_with + value + quote_with
- else:
- return value
+ value = cls.quoted_attribute_value(value)
+ return value
@classmethod
def substitute_html(cls, s):
diff --git a/bs4/element.py b/bs4/element.py
index 496f2ad..684da38 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -57,6 +57,18 @@ class PageElement(object):
None : None
}
+ @classmethod
+ def format_string(self, s, formatter='minimal'):
+ """Format the given string using the given formatter."""
+ if not callable(formatter):
+ formatter = self.FORMATTERS.get(
+ formatter, EntitySubstitution.substitute_xml)
+ if formatter is None:
+ output = s
+ else:
+ output = formatter(s)
+ return output
+
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@@ -617,14 +629,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
- if not callable(formatter):
- formatter = self.FORMATTERS.get(
- formatter, EntitySubstitution.substitute_xml)
- if formatter is None:
- output = self
- else:
- output = formatter(self)
-
+ output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@@ -950,8 +955,10 @@ class Tag(PageElement):
and '%SOUP-ENCODING%' in val):
val = self.substitute_encoding(val, eventual_encoding)
- decoded = (str(key) + '='
- + EntitySubstitution.substitute_xml(val, True))
+ text = self.format_string(val, formatter)
+ decoded = (
+ str(key) + '='
+ + EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f4fe451..661decb 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1219,6 +1219,23 @@ class TestSubstitutions(SoupTest):
decoded,
self.document_for(u"<b><FOO></b><b>BAR</b>"))
+ def test_formatter_is_run_on_attribute_values(self):
+ markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+ soup = self.soup(markup)
+ a = soup.a
+
+ expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+ self.assertEqual(expect_minimal, a.decode())
+ self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+ expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+ self.assertEqual(expect_html, a.decode(formatter="html"))
+
+ self.assertEqual(markup, a.decode(formatter=None))
+ expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+ self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
+
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper())
@@ -1309,7 +1326,7 @@ class TestEncoding(SoupTest):
def test_encode_contents(self):
html = u"<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
- self.assertEquals(
+ self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8"))