summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG8
-rw-r--r--bs4/formatter.py23
-rw-r--r--bs4/tests/test_formatter.py81
-rw-r--r--bs4/tests/test_tree.py39
-rw-r--r--doc/source/index.rst12
5 files changed, 117 insertions, 46 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 9cddc55..dd62294 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -18,7 +18,13 @@
may now return a different result than calling get_text() on the tag
itself. That's because different tags now have different
understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
-
+
+* The 'html5' formatter now treats attributes whose values are the
+ empty string as HTML boolean attributes. Previously (and in other
+ formatters), an attribute value must be set as None to be treated as
+ a boolean attribute. In a future release, I plan to also give this
+ behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
+
* Corrected output when the namespace prefix associated with a
namespaced attribute is the empty string, as opposed to
None. [bug=1915583]
diff --git a/bs4/formatter.py b/bs4/formatter.py
index 9a692ec..82d4689 100644
--- a/bs4/formatter.py
+++ b/bs4/formatter.py
@@ -14,7 +14,8 @@ class Formatter(EntitySubstitution):
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
- * 'html5' - HTML entity substitution for HTML5 documents.
+ * 'html5' - HTML entity substitution for HTML5 documents, as
+ well as some optimizations in the way tags are rendered.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
@@ -48,6 +49,7 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
+ empty_attributes_are_booleans=False,
):
"""Constructor.
@@ -64,6 +66,9 @@ class Formatter(EntitySubstitution):
as containing CDATA in this dialect. For example, in HTML,
<script> and <style> tags are defined as containing CDATA,
and their contents should not be formatted.
+ :param empty_attributes_are_booleans: Render attributes whose value
+ is the empty string as HTML-style boolean attributes.
+ (Attributes whose value is None are always rendered this way.)
"""
self.language = language
self.entity_substitution = entity_substitution
@@ -71,7 +76,8 @@ class Formatter(EntitySubstitution):
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
-
+ self.empty_attributes_are_booleans=empty_attributes_are_booleans
+
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
@@ -107,11 +113,17 @@ class Formatter(EntitySubstitution):
By default, attributes are sorted alphabetically. This makes
behavior consistent between Python 2 and Python 3, and preserves
backwards compatibility with older versions of Beautiful Soup.
+
+ If `empty_attributes_are_booleans` is True, then attributes whose
+ values are set to the empty string will be treated as boolean
+ attributes.
"""
if tag.attrs is None:
return []
- return sorted(tag.attrs.items())
-
+ return sorted(
+ (k, (None if self.empty_attributes_are_booleans and v == '' else v))
+ for k, v in tag.attrs.items()
+ )
class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
@@ -133,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
- void_element_close_prefix = None
+ void_element_close_prefix=None,
+ empty_attributes_are_booleans=True,
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py
new file mode 100644
index 0000000..718989b
--- /dev/null
+++ b/bs4/tests/test_formatter.py
@@ -0,0 +1,81 @@
+from bs4.element import Tag
+from bs4.testing import SoupTest
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
+
+class TestFormatter(SoupTest):
+
+ def test_default_attributes(self):
+ # Test the default behavior of Formatter.attributes().
+ formatter = Formatter()
+ tag = Tag(name="tag")
+ tag['b'] = 1
+ tag['a'] = 2
+
+ # Attributes come out sorted by name. In Python 3, attributes
+ # normally come out of a dictionary in the order they were
+ # added.
+ self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
+
+ # This works even if Tag.attrs is None, though this shouldn't
+ # normally happen.
+ tag.attrs = None
+ self.assertEquals([], formatter.attributes(tag))
+
+ def test_sort_attributes(self):
+ # Test the ability to override Formatter.attributes() to,
+ # e.g., disable the normal sorting of attributes.
+ class UnsortedFormatter(Formatter):
+ def attributes(self, tag):
+ self.called_with = tag
+ for k, v in sorted(tag.attrs.items()):
+ if k == 'ignore':
+ continue
+ yield k,v
+
+ soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+ formatter = UnsortedFormatter()
+ decoded = soup.decode(formatter=formatter)
+
+ # attributes() was called on the <p> tag. It filtered out one
+ # attribute and sorted the other two.
+ self.assertEquals(formatter.called_with, soup.p)
+ self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
+
+ def test_empty_attributes_are_booleans(self):
+ # Test the behavior of empty_attributes_are_booleans as well
+ # as which Formatters have it enabled.
+
+ for name in ('html', 'minimal', None):
+ formatter = HTMLFormatter.REGISTRY[name]
+ self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+ formatter = XMLFormatter.REGISTRY[None]
+ self.assertEquals(False, formatter.empty_attributes_are_booleans)
+
+ formatter = HTMLFormatter.REGISTRY['html5']
+ self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+ # Verify that the constructor sets the value.
+ formatter = Formatter(empty_attributes_are_booleans=True)
+ self.assertEquals(True, formatter.empty_attributes_are_booleans)
+
+ # Now demonstrate what it does to markup.
+ for markup in (
+ "<option selected></option>",
+ '<option selected=""></option>'
+ ):
+ soup = self.soup(markup)
+ for formatter in ('html', 'minimal', 'xml', None):
+ self.assertEquals(
+ b'<option selected=""></option>',
+ soup.option.encode(formatter='html')
+ )
+ self.assertEquals(
+ b'<option selected></option>',
+ soup.option.encode(formatter='html5')
+ )
+
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 9267a8f..d1ca5ea 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1846,45 +1846,6 @@ class TestEncoding(SoupTest):
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
-class TestFormatter(SoupTest):
-
- def test_default_attributes(self):
- # Test the default behavior of Formatter.attributes().
- formatter = Formatter()
- tag = Tag(name="tag")
- tag['b'] = 1
- tag['a'] = 2
-
- # Attributes come out sorted by name. In Python 3, attributes
- # normally come out of a dictionary in the order they were
- # added.
- self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag))
-
- # This works even if Tag.attrs is None, though this shouldn't
- # normally happen.
- tag.attrs = None
- self.assertEquals([], formatter.attributes(tag))
-
- def test_sort_attributes(self):
- # Test the ability to override Formatter.attributes() to,
- # e.g., disable the normal sorting of attributes.
- class UnsortedFormatter(Formatter):
- def attributes(self, tag):
- self.called_with = tag
- for k, v in sorted(tag.attrs.items()):
- if k == 'ignore':
- continue
- yield k,v
-
- soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
- formatter = UnsortedFormatter()
- decoded = soup.decode(formatter=formatter)
-
- # attributes() was called on the <p> tag. It filtered out one
- # attribute and sorted the other two.
- self.assertEquals(formatter.called_with, soup.p)
- self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
-
class TestNavigableStringSubclasses(SoupTest):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 8a1a2d5..2b5843d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2299,7 +2299,7 @@ Unicode characters to HTML entities whenever possible::
# Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
# </p>
-If you pass in ``formatter="html5"``, it's the same as
+If you pass in ``formatter="html5"``, it's similar to
``formatter="html"``, but Beautiful Soup will
omit the closing slash in HTML void tags like "br"::
@@ -2310,7 +2310,17 @@ omit the closing slash in HTML void tags like "br"::
print(br.encode(formatter="html5"))
# b'<br>'
+
+In addition, any attributes whose values are the empty string
+will become HTML-style boolean attributes::
+
+ option = BeautifulSoup('<option selected=""></option>').option
+ print(option.encode(formatter="html"))
+ # b'<option selected=""></option>'
+ print(option.encode(formatter="html5"))
+ # b'<option selected></option>'
+
If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
to Beautiful Soup generating invalid HTML/XML, as in these examples::