diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-20 14:59:32 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-20 14:59:32 -0400 |
commit | 9ebf90c684990306433b6c364a93b425a88ef2e7 (patch) | |
tree | e8cfba2b7e0166cd77512b031d61794b74491ca9 /bs4/dammit.py | |
parent | b289252da31f1824ee9c85f1ce53907069d6dd7e (diff) |
The default XML formatter will now replace ampersands even if they appear to be part of entities. That is, "&lt;" will become "&amp;lt;". [bug=1182183]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index c199cd5..a733cad 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -81,6 +81,8 @@ class EntitySubstitution(object): "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + @classmethod def _substitute_html_entity(cls, matchobj): entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) @@ -134,6 +136,28 @@ class EntitySubstitution(object): def substitute_xml(cls, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign will become <, the greater-than sign will become >, and any ampersands that are not part of an entity defition will @@ -151,6 +175,7 @@ class EntitySubstitution(object): value = cls.quoted_attribute_value(value) return value + @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. |