diff options
-rw-r--r-- | NEWS.txt | 10 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 2 | ||||
-rw-r--r-- | bs4/dammit.py | 25 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 7 |
4 files changed, 42 insertions, 2 deletions
@@ -5,6 +5,16 @@ list of tags and strings without having to check whether each element is a tag or a string. +* The default XML formatter will now replace ampersands even if they + appear to be part of entities. That is, "&lt;" will become + "&amp;lt;". The old code was left over from Beautiful Soup 3, which + didn't always turn entities into Unicode characters. + + If you really want the old behavior (maybe because you add new + strings to the tree, those strings include entities, and you want + the formatter to leave them alone on output), it can be found in + EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183] + * Gave new_string() the ability to create subclasses of NavigableString. [bug=1181986] diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index dc7deb9..bae453e 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -152,7 +152,7 @@ class TreeBuilder(object): tag_specific = self.cdata_list_attributes.get( tag_name.lower(), []) for cdata_list_attr in itertools.chain(universal, tag_specific): - if cdata_list_attr in dict(attrs): + if cdata_list_attr in attrs: # Basically, we have a "class" attribute whose # value is a whitespace-separated list of CSS # classes. Split it into a list. diff --git a/bs4/dammit.py b/bs4/dammit.py index c199cd5..a733cad 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -81,6 +81,8 @@ class EntitySubstitution(object): "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + @classmethod def _substitute_html_entity(cls, matchobj): entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) @@ -134,6 +136,28 @@ class EntitySubstitution(object): def substitute_xml(cls, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign + will become &lt;, the greater-than sign will become &gt;, + and any ampersands will become &amp;. 
If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign will become &lt;, the greater-than sign will become &gt;, and any ampersands that are not part of an entity defition will @@ -151,6 +175,7 @@ class EntitySubstitution(object): value = cls.quoted_attribute_value(value) return value + @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index dd636d8..b127716 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase): def test_xml_quoting_handles_ampersands(self): self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T") - def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): + def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): self.assertEqual( self.sub.substitute_xml("&Aacute;T&T"), + "&amp;Aacute;T&amp;T") + + def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): + self.assertEqual( + self.sub.substitute_xml_containing_entities("&Aacute;T&T"), "&Aacute;T&amp;T") def test_quotes_not_html_substituted(self): |