summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2013-05-20 14:59:32 -0400
committer Leonard Richardson <leonardr@segfault.org> 2013-05-20 14:59:32 -0400
commit 9ebf90c684990306433b6c364a93b425a88ef2e7 (patch)
tree e8cfba2b7e0166cd77512b031d61794b74491ca9
parent b289252da31f1824ee9c85f1ce53907069d6dd7e (diff)
The default XML formatter will now replace ampersands even if they appear to be part of entities. That is, "&lt;" will become "&amp;lt;". [bug=1182183]
-rw-r--r-- NEWS.txt 10
-rw-r--r-- bs4/builder/__init__.py 2
-rw-r--r-- bs4/dammit.py 25
-rw-r--r-- bs4/tests/test_soup.py 7
4 files changed, 42 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 440869c..a3485e7 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -5,6 +5,16 @@
list of tags and strings without having to check whether each
element is a tag or a string.
+* The default XML formatter will now replace ampersands even if they
+ appear to be part of entities. That is, "&lt;" will become
+ "&amp;lt;". The old code was left over from Beautiful Soup 3, which
+ didn't always turn entities into Unicode characters.
+
+ If you really want the old behavior (maybe because you add new
+ strings to the tree, those strings include entities, and you want
+ the formatter to leave them alone on output), it can be found in
+ EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183]
+
* Gave new_string() the ability to create subclasses of
NavigableString. [bug=1181986]
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index dc7deb9..bae453e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -152,7 +152,7 @@ class TreeBuilder(object):
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
- if cdata_list_attr in dict(attrs):
+ if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
diff --git a/bs4/dammit.py b/bs4/dammit.py
index c199cd5..a733cad 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -81,6 +81,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@@ -134,6 +136,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign
+ will become &lt;, the greater-than sign will become &gt;,
+ and any ampersands will become &amp;. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
@@ -151,6 +175,7 @@ class EntitySubstitution(object):
value = cls.quoted_attribute_value(value)
return value
+
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index dd636d8..b127716 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
- def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
+ def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
+ "&amp;Aacute;T&amp;T")
+
+ def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+ self.assertEqual(
+ self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):