diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-02-14 15:34:04 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-02-14 15:34:04 -0500 |
commit | 7201eecc09b51df5a0fb704670aa66bcc9d8e635 (patch) | |
tree | 30dd9d9df4d81eff431a53f5c47093934b06dfd1 | |
parent | c876fbf402f15d924b7c0d9a9be5ba80769444a3 (diff) |
The 'html5' formatter now treats attributes whose values are the
empty string as HTML boolean attributes. Previously (and in other
formatters), an attribute value must be set as None to be treated as
a boolean attribute. In a future release, I plan to also give this
behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
-rw-r--r-- | CHANGELOG | 8 | ||||
-rw-r--r-- | bs4/formatter.py | 23 | ||||
-rw-r--r-- | bs4/tests/test_formatter.py | 81 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 39 | ||||
-rw-r--r-- | doc/source/index.rst | 12 |
5 files changed, 117 insertions, 46 deletions
@@ -18,7 +18,13 @@ may now return a different result than calling get_text() on the tag itself. That's because different tags now have different understandings of what counts as 'text'. [bug=1906226] [bug=1868861] - + +* The 'html5' formatter now treats attributes whose values are the + empty string as HTML boolean attributes. Previously (and in other + formatters), an attribute value must be set as None to be treated as + a boolean attribute. In a future release, I plan to also give this + behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424] + * Corrected output when the namespace prefix associated with a namespaced attribute is the empty string, as opposed to None. [bug=1915583] diff --git a/bs4/formatter.py b/bs4/formatter.py index 9a692ec..82d4689 100644 --- a/bs4/formatter.py +++ b/bs4/formatter.py @@ -14,7 +14,8 @@ class Formatter(EntitySubstitution): For HTML documents: * 'html' - HTML entity substitution for generic HTML documents. (default) - * 'html5' - HTML entity substitution for HTML5 documents. + * 'html5' - HTML entity substitution for HTML5 documents, as + well as some optimizations in the way tags are rendered. * 'minimal' - Only make the substitutions necessary to guarantee valid HTML. * None - Do not perform any substitution. This will be faster @@ -48,6 +49,7 @@ class Formatter(EntitySubstitution): def __init__( self, language=None, entity_substitution=None, void_element_close_prefix='/', cdata_containing_tags=None, + empty_attributes_are_booleans=False, ): """Constructor. @@ -64,6 +66,9 @@ class Formatter(EntitySubstitution): as containing CDATA in this dialect. For example, in HTML, <script> and <style> tags are defined as containing CDATA, and their contents should not be formatted. + :param empty_attributes_are_booleans: Render attributes whose value + is the empty string as HTML-style boolean attributes. + (Attributes whose value is None are always rendered this way.) 
""" self.language = language self.entity_substitution = entity_substitution @@ -71,7 +76,8 @@ class Formatter(EntitySubstitution): self.cdata_containing_tags = self._default( language, cdata_containing_tags, 'cdata_containing_tags' ) - + self.empty_attributes_are_booleans=empty_attributes_are_booleans + def substitute(self, ns): """Process a string that needs to undergo entity substitution. This may be a string encountered in an attribute value or as @@ -107,11 +113,17 @@ class Formatter(EntitySubstitution): By default, attributes are sorted alphabetically. This makes behavior consistent between Python 2 and Python 3, and preserves backwards compatibility with older versions of Beautiful Soup. + + If `empty_attributes_are_booleans` is True, then attributes whose + values are set to the empty string will be treated as boolean + attributes. """ if tag.attrs is None: return [] - return sorted(tag.attrs.items()) - + return sorted( + (k, (None if self.empty_attributes_are_booleans and v == '' else v)) + for k, v in tag.attrs.items() + ) class HTMLFormatter(Formatter): """A generic Formatter for HTML.""" @@ -133,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter( ) HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html, - void_element_close_prefix = None + void_element_close_prefix=None, + empty_attributes_are_booleans=True, ) HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_xml diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py new file mode 100644 index 0000000..718989b --- /dev/null +++ b/bs4/tests/test_formatter.py @@ -0,0 +1,81 @@ +from bs4.element import Tag +from bs4.testing import SoupTest +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +class TestFormatter(SoupTest): + + def test_default_attributes(self): + # Test the default behavior of Formatter.attributes(). 
+ formatter = Formatter() + tag = Tag(name="tag") + tag['b'] = 1 + tag['a'] = 2 + + # Attributes come out sorted by name. In Python 3, attributes + # normally come out of a dictionary in the order they were + # added. + self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag)) + + # This works even if Tag.attrs is None, though this shouldn't + # normally happen. + tag.attrs = None + self.assertEquals([], formatter.attributes(tag)) + + def test_sort_attributes(self): + # Test the ability to override Formatter.attributes() to, + # e.g., disable the normal sorting of attributes. + class UnsortedFormatter(Formatter): + def attributes(self, tag): + self.called_with = tag + for k, v in sorted(tag.attrs.items()): + if k == 'ignore': + continue + yield k,v + + soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') + formatter = UnsortedFormatter() + decoded = soup.decode(formatter=formatter) + + # attributes() was called on the <p> tag. It filtered out one + # attribute and sorted the other two. + self.assertEquals(formatter.called_with, soup.p) + self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) + + def test_empty_attributes_are_booleans(self): + # Test the behavior of empty_attributes_are_booleans as well + # as which Formatters have it enabled. + + for name in ('html', 'minimal', None): + formatter = HTMLFormatter.REGISTRY[name] + self.assertEquals(False, formatter.empty_attributes_are_booleans) + + formatter = XMLFormatter.REGISTRY[None] + self.assertEquals(False, formatter.empty_attributes_are_booleans) + + formatter = HTMLFormatter.REGISTRY['html5'] + self.assertEquals(True, formatter.empty_attributes_are_booleans) + + # Verify that the constructor sets the value. + formatter = Formatter(empty_attributes_are_booleans=True) + self.assertEquals(True, formatter.empty_attributes_are_booleans) + + # Now demonstrate what it does to markup. 
+ for markup in ( + "<option selected></option>", + '<option selected=""></option>' + ): + soup = self.soup(markup) + for formatter in ('html', 'minimal', 'xml', None): + self.assertEquals( + b'<option selected=""></option>', + soup.option.encode(formatter='html') + ) + self.assertEquals( + b'<option selected></option>', + soup.option.encode(formatter='html5') + ) + diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 9267a8f..d1ca5ea 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1846,45 +1846,6 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) -class TestFormatter(SoupTest): - - def test_default_attributes(self): - # Test the default behavior of Formatter.attributes(). - formatter = Formatter() - tag = Tag(name="tag") - tag['b'] = 1 - tag['a'] = 2 - - # Attributes come out sorted by name. In Python 3, attributes - # normally come out of a dictionary in the order they were - # added. - self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag)) - - # This works even if Tag.attrs is None, though this shouldn't - # normally happen. - tag.attrs = None - self.assertEquals([], formatter.attributes(tag)) - - def test_sort_attributes(self): - # Test the ability to override Formatter.attributes() to, - # e.g., disable the normal sorting of attributes. - class UnsortedFormatter(Formatter): - def attributes(self, tag): - self.called_with = tag - for k, v in sorted(tag.attrs.items()): - if k == 'ignore': - continue - yield k,v - - soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') - formatter = UnsortedFormatter() - decoded = soup.decode(formatter=formatter) - - # attributes() was called on the <p> tag. It filtered out one - # attribute and sorted the other two. 
- self.assertEquals(formatter.called_with, soup.p) - self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) - class TestNavigableStringSubclasses(SoupTest): diff --git a/doc/source/index.rst b/doc/source/index.rst index 8a1a2d5..2b5843d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2299,7 +2299,7 @@ Unicode characters to HTML entities whenever possible:: # Il a dit <<Sacré bleu!>> # </p> -If you pass in ``formatter="html5"``, it's the same as +If you pass in ``formatter="html5"``, it's similar to ``formatter="html"``, but Beautiful Soup will omit the closing slash in HTML void tags like "br":: @@ -2310,7 +2310,17 @@ omit the closing slash in HTML void tags like "br":: print(br.encode(formatter="html5")) # b'<br>' + +In addition, any attributes whose values are the empty string +will become HTML-style boolean attributes:: + + option = BeautifulSoup('<option selected=""></option>').option + print(option.encode(formatter="html")) + # b'<option selected=""></option>' + print(option.encode(formatter="html5")) + # b'<option selected></option>' + If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: |