summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2021-02-14 15:34:04 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2021-02-14 15:34:04 -0500
commit: 7201eecc09b51df5a0fb704670aa66bcc9d8e635 (patch)
tree: 30dd9d9df4d81eff431a53f5c47093934b06dfd1
parent: c876fbf402f15d924b7c0d9a9be5ba80769444a3 (diff)
The 'html5' formatter now treats attributes whose values are the
empty string as HTML boolean attributes. Previously (and in other formatters), an attribute value must be set as None to be treated as a boolean attribute. In a future release, I plan to also give this behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
-rw-r--r-- CHANGELOG 8
-rw-r--r-- bs4/formatter.py 23
-rw-r--r-- bs4/tests/test_formatter.py 81
-rw-r--r-- bs4/tests/test_tree.py 39
-rw-r--r-- doc/source/index.rst 12
5 files changed, 117 insertions, 46 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 9cddc55..dd62294 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -18,7 +18,13 @@
may now return a different result than calling get_text() on the tag
itself. That's because different tags now have different
understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
-
+
+* The 'html5' formatter now treats attributes whose values are the
+ empty string as HTML boolean attributes. Previously (and in other
+ formatters), an attribute value must be set as None to be treated as
+ a boolean attribute. In a future release, I plan to also give this
+ behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
+
* Corrected output when the namespace prefix associated with a
namespaced attribute is the empty string, as opposed to
None. [bug=1915583]
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 9a692ec..82d4689 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -14,7 +14,8 @@ class Formatter(EntitySubstitution):
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
- * 'html5' - HTML entity substitution for HTML5 documents.
+ * 'html5' - HTML entity substitution for HTML5 documents, as
+ well as some optimizations in the way tags are rendered.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
@@ -48,6 +49,7 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
+ empty_attributes_are_booleans=False,
):
"""Constructor.
@@ -64,6 +66,9 @@ class Formatter(EntitySubstitution):
as containing CDATA in this dialect. For example, in HTML,
<script> and <style> tags are defined as containing CDATA,
and their contents should not be formatted.
+ :param empty_attributes_are_booleans: Render attributes whose value
+ is the empty string as HTML-style boolean attributes.
+ (Attributes whose value is None are always rendered this way.)
"""
self.language = language
self.entity_substitution = entity_substitution
@@ -71,7 +76,8 @@ class Formatter(EntitySubstitution):
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
-
+ self.empty_attributes_are_booleans=empty_attributes_are_booleans
+
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
@@ -107,11 +113,17 @@ class Formatter(EntitySubstitution):
By default, attributes are sorted alphabetically. This makes
behavior consistent between Python 2 and Python 3, and preserves
backwards compatibility with older versions of Beautiful Soup.
+
+ If `empty_attributes_are_booleans` is True, then attributes whose
+ values are set to the empty string will be treated as boolean
+ attributes.
"""
if tag.attrs is None:
return []
- return sorted(tag.attrs.items())
-
+ return sorted(
+ (k, (None if self.empty_attributes_are_booleans and v == '' else v))
+ for k, v in tag.attrs.items()
+ )
class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
@@ -133,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
- void_element_close_prefix = None
+ void_element_close_prefix=None,
+ empty_attributes_are_booleans=True,
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
new file mode 100644
index 0000000..718989b
--- /dev/null
+++ b/bs4/tests/test_formatter.py
@@ -0,0 +1,81 @@
+from bs4.element import Tag
+from bs4.testing import SoupTest
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
+
+class TestFormatter(SoupTest):
+
+ def test_default_attributes(self):
+ # Test the default behavior of Formatter.attributes().
+ formatter = Formatter()
+ tag = Tag(name="tag")
+ tag['b'] = 1
+ tag['a'] = 2
+
+ # Attributes come out sorted by name. In Python 3, attributes
+ # normally come out of a dictionary in the order they were
+ # added.
+ self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
+
+ # This works even if Tag.attrs is None, though this shouldn't
+ # normally happen.
+ tag.attrs = None
+ self.assertEquals([], formatter.attributes(tag))
+
+ def test_sort_attributes(self):
+ # Test the ability to override Formatter.attributes() to,
+ # e.g., disable the normal sorting of attributes.
+ class UnsortedFormatter(Formatter):
+ def attributes(self, tag):
+ self.called_with = tag
+ for k, v in sorted(tag.attrs.items()):
+ if k == 'ignore':
+ continue
+ yield k,v
+
+ soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+ formatter = UnsortedFormatter()
+ decoded = soup.decode(formatter=formatter)
+
+ # attributes() was called on the <p> tag. It filtered out one
+ # attribute and sorted the other two.
+ self.assertEquals(formatter.called_with, soup.p)
+ self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
+
+ def test_empty_attributes_are_booleans(self):
+ # Test the behavior of empty_attributes_are_booleans as well
+ # as which Formatters have it enabled.
+
+ for name in ('html', 'minimal', None):
+ formatter = HTMLFormatter.REGISTRY[name]
+ self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+ formatter = XMLFormatter.REGISTRY[None]
+ self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+ formatter = HTMLFormatter.REGISTRY['html5']
+ self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+ # Verify that the constructor sets the value.
+ formatter = Formatter(empty_attributes_are_booleans=True)
+ self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+ # Now demonstrate what it does to markup.
+ for markup in (
+ "<option selected></option>",
+ '<option selected=""></option>'
+ ):
+ soup = self.soup(markup)
+ for formatter in ('html', 'minimal', 'xml', None):
+ self.assertEquals(
+ b'<option selected=""></option>',
+ soup.option.encode(formatter='html')
+ )
+ self.assertEquals(
+ b'<option selected></option>',
+ soup.option.encode(formatter='html5')
+ )
+
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 9267a8f..d1ca5ea 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1846,45 +1846,6 @@ class TestEncoding(SoupTest):
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
-class TestFormatter(SoupTest):
-
- def test_default_attributes(self):
- # Test the default behavior of Formatter.attributes().
- formatter = Formatter()
- tag = Tag(name="tag")
- tag['b'] = 1
- tag['a'] = 2
-
- # Attributes come out sorted by name. In Python 3, attributes
- # normally come out of a dictionary in the order they were
- # added.
- self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
-
- # This works even if Tag.attrs is None, though this shouldn't
- # normally happen.
- tag.attrs = None
- self.assertEquals([], formatter.attributes(tag))
-
- def test_sort_attributes(self):
- # Test the ability to override Formatter.attributes() to,
- # e.g., disable the normal sorting of attributes.
- class UnsortedFormatter(Formatter):
- def attributes(self, tag):
- self.called_with = tag
- for k, v in sorted(tag.attrs.items()):
- if k == 'ignore':
- continue
- yield k,v
-
- soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
- formatter = UnsortedFormatter()
- decoded = soup.decode(formatter=formatter)
-
- # attributes() was called on the <p> tag. It filtered out one
- # attribute and sorted the other two.
- self.assertEquals(formatter.called_with, soup.p)
- self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
-
class TestNavigableStringSubclasses(SoupTest):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 8a1a2d5..2b5843d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2299,7 +2299,7 @@ Unicode characters to HTML entities whenever possible::
# Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
# </p>
-If you pass in ``formatter="html5"``, it's the same as
+If you pass in ``formatter="html5"``, it's similar to
``formatter="html"``, but Beautiful Soup will
omit the closing slash in HTML void tags like "br"::
@@ -2310,7 +2310,17 @@ omit the closing slash in HTML void tags like "br"::
print(br.encode(formatter="html5"))
# b'<br>'
+
+In addition, any attributes whose values are the empty string
+will become HTML-style boolean attributes::
+
+ option = BeautifulSoup('<option selected=""></option>').option
+ print(option.encode(formatter="html"))
+ # b'<option selected=""></option>'
+ print(option.encode(formatter="html5"))
+ # b'<option selected></option>'
+
If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
to Beautiful Soup generating invalid HTML/XML, as in these examples::