summaryrefslogtreecommitdiff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2013-05-20 14:59:32 -0400
committerLeonard Richardson <leonardr@segfault.org>2013-05-20 14:59:32 -0400
commit9ebf90c684990306433b6c364a93b425a88ef2e7 (patch)
treee8cfba2b7e0166cd77512b031d61794b74491ca9 /bs4/dammit.py
parentb289252da31f1824ee9c85f1ce53907069d6dd7e (diff)
The default XML formatter will now replace ampersands even if they appear to be part of entities. That is, "&lt;" will become "&amp;lt;". [bug=1182183]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r--bs4/dammit.py25
1 files changed, 25 insertions, 0 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index c199cd5..a733cad 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -81,6 +81,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@@ -134,6 +136,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign
+ will become &lt;, the greater-than sign will become &gt;,
+ and any ampersands will become &amp;. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
@@ -151,6 +175,7 @@ class EntitySubstitution(object):
value = cls.quoted_attribute_value(value)
return value
+
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.