The 'html5' formatter now treats attributes whose values are the empty string as HTML boolean attributes. Previously (and in other formatters), an attribute value must be set as None to be treated as a boolean attribute. In a future release, I plan to also give this behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
author: Leonard Richardson <leonardr@segfault.org> 2021-02-14 15:34:04 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-02-14 15:34:04 -0500
commit: 7201eecc09b51df5a0fb704670aa66bcc9d8e635 (patch)
tree: 30dd9d9df4d81eff431a53f5c47093934b06dfd1
parent: c876fbf402f15d924b7c0d9a9be5ba80769444a3 (diff)
5 files changed, 117 insertions, 46 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 9cddc55..dd62294 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -18,7 +18,13 @@
   may now return a different result than calling get_text() on the tag
   itself. That's because different tags now have different
   understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
-	
+
+* The 'html5' formatter now treats attributes whose values are the
+  empty string as HTML boolean attributes. Previously (and in other
+  formatters), an attribute value must be set as None to be treated as
+  a boolean attribute. In a future release, I plan to also give this
+  behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
+
 * Corrected output when the namespace prefix associated with a
   namespaced attribute is the empty string, as opposed to
   None. [bug=1915583]
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 9a692ec..82d4689 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -14,7 +14,8 @@ class Formatter(EntitySubstitution):
 
     For HTML documents:
      * 'html' - HTML entity substitution for generic HTML documents. (default)
-     * 'html5' - HTML entity substitution for HTML5 documents.
+     * 'html5' - HTML entity substitution for HTML5 documents, as
+                 well as some optimizations in the way tags are rendered.
      * 'minimal' - Only make the substitutions necessary to guarantee
                    valid HTML.
      * None - Do not perform any substitution. This will be faster
@@ -48,6 +49,7 @@ class Formatter(EntitySubstitution):
     def __init__(
             self, language=None, entity_substitution=None,
             void_element_close_prefix='/', cdata_containing_tags=None,
+            empty_attributes_are_booleans=False,
     ):
         """Constructor.
 
@@ -64,6 +66,9 @@ class Formatter(EntitySubstitution):
            as containing CDATA in this dialect. For example, in HTML,
            <script> and <style> tags are defined as containing CDATA,
            and their contents should not be formatted.
+        :param empty_attributes_are_booleans: Render attributes whose value
+            is the empty string as HTML-style boolean attributes.
+            (Attributes whose value is None are always rendered this way.)
         """
         self.language = language
         self.entity_substitution = entity_substitution
@@ -71,7 +76,8 @@ class Formatter(EntitySubstitution):
         self.cdata_containing_tags = self._default(
             language, cdata_containing_tags, 'cdata_containing_tags'
         )
-            
+        self.empty_attributes_are_booleans=empty_attributes_are_booleans
+        
     def substitute(self, ns):
         """Process a string that needs to undergo entity substitution.
         This may be a string encountered in an attribute value or as
@@ -107,11 +113,17 @@ class Formatter(EntitySubstitution):
         By default, attributes are sorted alphabetically. This makes
         behavior consistent between Python 2 and Python 3, and preserves
         backwards compatibility with older versions of Beautiful Soup.
+
+        If `empty_attributes_are_booleans` is True, then attributes
+        whose values are set to the empty string will be treated as
+        boolean attributes.
         """
         if tag.attrs is None:
             return []
-        return sorted(tag.attrs.items())
-
+        return sorted(
+            (k, (None if self.empty_attributes_are_booleans and v == '' else v))
+            for k, v in tag.attrs.items()
+        )
    
 class HTMLFormatter(Formatter):
     """A generic Formatter for HTML."""
@@ -133,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
 )
 HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
     entity_substitution=EntitySubstitution.substitute_html,
-    void_element_close_prefix = None
+    void_element_close_prefix=None,
+    empty_attributes_are_booleans=True,
 )
 HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
     entity_substitution=EntitySubstitution.substitute_xml
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
new file mode 100644
index 0000000..718989b
--- /dev/null
+++ b/bs4/tests/test_formatter.py
@@ -0,0 +1,81 @@
+from bs4.element import Tag
+from bs4.testing import SoupTest
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)
+
+class TestFormatter(SoupTest):
+
+    def test_default_attributes(self):
+        # Test the default behavior of Formatter.attributes().
+        formatter = Formatter()
+        tag = Tag(name="tag")
+        tag['b'] = 1
+        tag['a'] = 2
+
+        # Attributes come out sorted by name. In Python 3, attributes
+        # normally come out of a dictionary in the order they were
+        # added.
+        self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
+
+        # This works even if Tag.attrs is None, though this shouldn't
+        # normally happen.
+        tag.attrs = None
+        self.assertEquals([], formatter.attributes(tag))
+        
+    def test_sort_attributes(self):
+        # Test the ability to override Formatter.attributes() to,
+        # e.g., disable the normal sorting of attributes.
+        class UnsortedFormatter(Formatter):
+            def attributes(self, tag):
+                self.called_with = tag
+                for k, v in sorted(tag.attrs.items()):
+                    if k == 'ignore':
+                        continue
+                    yield k,v
+
+        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+        formatter = UnsortedFormatter()
+        decoded = soup.decode(formatter=formatter)
+
+        # attributes() was called on the <p> tag. It filtered out one
+        # attribute and sorted the other two.
+        self.assertEquals(formatter.called_with, soup.p)
+        self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
+
+    def test_empty_attributes_are_booleans(self):
+        # Test the behavior of empty_attributes_are_booleans as well
+        # as which Formatters have it enabled.
+        
+        for name in ('html', 'minimal', None):
+            formatter = HTMLFormatter.REGISTRY[name]
+            self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+        formatter = XMLFormatter.REGISTRY[None]
+        self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+        formatter = HTMLFormatter.REGISTRY['html5']
+        self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+        # Verify that the constructor sets the value.
+        formatter = Formatter(empty_attributes_are_booleans=True)
+        self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+        # Now demonstrate what it does to markup.
+        for markup in (
+                "<option selected></option>",
+                '<option selected=""></option>'
+        ):
+            soup = self.soup(markup)
+            for formatter in ('html', 'minimal', 'xml', None):
+                self.assertEquals(
+                    b'<option selected=""></option>',
+                    soup.option.encode(formatter='html')
+                )
+                self.assertEquals(
+                    b'<option selected></option>',
+                    soup.option.encode(formatter='html5')
+                )
+
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 9267a8f..d1ca5ea 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1846,45 +1846,6 @@ class TestEncoding(SoupTest):
         else:
             self.assertEqual(b'<b>\\u2603</b>', repr(soup))
 
-class TestFormatter(SoupTest):
-
-    def test_default_attributes(self):
-        # Test the default behavior of Formatter.attributes().
-        formatter = Formatter()
-        tag = Tag(name="tag")
-        tag['b'] = 1
-        tag['a'] = 2
-
-        # Attributes come out sorted by name. In Python 3, attributes
-        # normally come out of a dictionary in the order they were
-        # added.
-        self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
-
-        # This works even if Tag.attrs is None, though this shouldn't
-        # normally happen.
-        tag.attrs = None
-        self.assertEquals([], formatter.attributes(tag))
-        
-    def test_sort_attributes(self):
-        # Test the ability to override Formatter.attributes() to,
-        # e.g., disable the normal sorting of attributes.
-        class UnsortedFormatter(Formatter):
-            def attributes(self, tag):
-                self.called_with = tag
-                for k, v in sorted(tag.attrs.items()):
-                    if k == 'ignore':
-                        continue
-                    yield k,v
-
-        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
-        formatter = UnsortedFormatter()
-        decoded = soup.decode(formatter=formatter)
-
-        # attributes() was called on the <p> tag. It filtered out one
-        # attribute and sorted the other two.
-        self.assertEquals(formatter.called_with, soup.p)
-        self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
-
 
 class TestNavigableStringSubclasses(SoupTest):
 
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 8a1a2d5..2b5843d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2299,7 +2299,7 @@ Unicode characters to HTML entities whenever possible::
  #  Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
  # </p>
 
-If you pass in ``formatter="html5"``, it's the same as
+If you pass in ``formatter="html5"``, it's similar to
 ``formatter="html"``, but Beautiful Soup will
 omit the closing slash in HTML void tags like "br"::
 
@@ -2310,7 +2310,17 @@ omit the closing slash in HTML void tags like "br"::
  
  print(br.encode(formatter="html5"))
  # b'<br>'
+
+In addition, any attributes whose values are the empty string
+will become HTML-style boolean attributes::
+
+ option = BeautifulSoup('<option selected=""></option>').option
+ print(option.encode(formatter="html"))
+ # b'<option selected=""></option>'
  
+ print(option.encode(formatter="html5"))
+ # b'<option selected></option>'
+
 If you pass in ``formatter=None``, Beautiful Soup will not modify
 strings at all on output. This is the fastest option, but it may lead
 to Beautiful Soup generating invalid HTML/XML, as in these examples::
author	Leonard Richardson <leonardr@segfault.org>	2021-02-14 15:34:04 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2021-02-14 15:34:04 -0500
commit	7201eecc09b51df5a0fb704670aa66bcc9d8e635 (patch)
tree	30dd9d9df4d81eff431a53f5c47093934b06dfd1
parent	c876fbf402f15d924b7c0d9a9be5ba80769444a3 (diff)