diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/formatter.py | 23 | ||||
-rw-r--r-- | bs4/tests/test_formatter.py | 81 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 39 |
3 files changed, 99 insertions, 44 deletions
diff --git a/bs4/formatter.py b/bs4/formatter.py index 9a692ec..82d4689 100644 --- a/bs4/formatter.py +++ b/bs4/formatter.py @@ -14,7 +14,8 @@ class Formatter(EntitySubstitution): For HTML documents: * 'html' - HTML entity substitution for generic HTML documents. (default) - * 'html5' - HTML entity substitution for HTML5 documents. + * 'html5' - HTML entity substitution for HTML5 documents, as + well as some optimizations in the way tags are rendered. * 'minimal' - Only make the substitutions necessary to guarantee valid HTML. * None - Do not perform any substitution. This will be faster @@ -48,6 +49,7 @@ class Formatter(EntitySubstitution): def __init__( self, language=None, entity_substitution=None, void_element_close_prefix='/', cdata_containing_tags=None, + empty_attributes_are_booleans=False, ): """Constructor. @@ -64,6 +66,9 @@ class Formatter(EntitySubstitution): as containing CDATA in this dialect. For example, in HTML, <script> and <style> tags are defined as containing CDATA, and their contents should not be formatted. + :param empty_attributes_are_booleans: Render attributes whose value + is the empty string as HTML-style boolean attributes. + (Attributes whose value is None are always rendered this way.) """ self.language = language self.entity_substitution = entity_substitution @@ -71,7 +76,8 @@ class Formatter(EntitySubstitution): self.cdata_containing_tags = self._default( language, cdata_containing_tags, 'cdata_containing_tags' ) - + self.empty_attributes_are_booleans=empty_attributes_are_booleans + def substitute(self, ns): """Process a string that needs to undergo entity substitution. This may be a string encountered in an attribute value or as @@ -107,11 +113,17 @@ class Formatter(EntitySubstitution): By default, attributes are sorted alphabetically. This makes behavior consistent between Python 2 and Python 3, and preserves backwards compatibility with older versions of Beautiful Soup. 
+ + If `empty_attributes_are_booleans` is True, then attributes whose + values are set to the empty string will be treated as boolean + attributes. """ if tag.attrs is None: return [] - return sorted(tag.attrs.items()) - + return sorted( + (k, (None if self.empty_attributes_are_booleans and v == '' else v)) + for k, v in tag.attrs.items() + ) class HTMLFormatter(Formatter): """A generic Formatter for HTML.""" @@ -133,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter( ) HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html, - void_element_close_prefix = None + void_element_close_prefix=None, + empty_attributes_are_booleans=True, ) HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_xml diff --git a/bs4/tests/test_formatter.py b/bs4/tests/test_formatter.py new file mode 100644 index 0000000..718989b --- /dev/null +++ b/bs4/tests/test_formatter.py @@ -0,0 +1,81 @@ +from bs4.element import Tag +from bs4.testing import SoupTest +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +class TestFormatter(SoupTest): + + def test_default_attributes(self): + # Test the default behavior of Formatter.attributes(). + formatter = Formatter() + tag = Tag(name="tag") + tag['b'] = 1 + tag['a'] = 2 + + # Attributes come out sorted by name. In Python 3, attributes + # normally come out of a dictionary in the order they were + # added. + self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag)) + + # This works even if Tag.attrs is None, though this shouldn't + # normally happen. + tag.attrs = None + self.assertEquals([], formatter.attributes(tag)) + + def test_sort_attributes(self): + # Test the ability to override Formatter.attributes() to, + # e.g., disable the normal sorting of attributes. 
+ class UnsortedFormatter(Formatter): + def attributes(self, tag): + self.called_with = tag + for k, v in sorted(tag.attrs.items()): + if k == 'ignore': + continue + yield k,v + + soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') + formatter = UnsortedFormatter() + decoded = soup.decode(formatter=formatter) + + # attributes() was called on the <p> tag. It filtered out one + # attribute and sorted the other two. + self.assertEquals(formatter.called_with, soup.p) + self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) + + def test_empty_attributes_are_booleans(self): + # Test the behavior of empty_attributes_are_booleans as well + # as which Formatters have it enabled. + + for name in ('html', 'minimal', None): + formatter = HTMLFormatter.REGISTRY[name] + self.assertEquals(False, formatter.empty_attributes_are_booleans) + + formatter = XMLFormatter.REGISTRY[None] + self.assertEquals(False, formatter.empty_attributes_are_booleans) + + formatter = HTMLFormatter.REGISTRY['html5'] + self.assertEquals(True, formatter.empty_attributes_are_booleans) + + # Verify that the constructor sets the value. + formatter = Formatter(empty_attributes_are_booleans=True) + self.assertEquals(True, formatter.empty_attributes_are_booleans) + + # Now demonstrate what it does to markup. 
+ for markup in ( + "<option selected></option>", + '<option selected=""></option>' + ): + soup = self.soup(markup) + for formatter in ('html', 'minimal', 'xml', None): + self.assertEquals( + b'<option selected=""></option>', + soup.option.encode(formatter='html') + ) + self.assertEquals( + b'<option selected></option>', + soup.option.encode(formatter='html5') + ) + diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 9267a8f..d1ca5ea 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1846,45 +1846,6 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) -class TestFormatter(SoupTest): - - def test_default_attributes(self): - # Test the default behavior of Formatter.attributes(). - formatter = Formatter() - tag = Tag(name="tag") - tag['b'] = 1 - tag['a'] = 2 - - # Attributes come out sorted by name. In Python 3, attributes - # normally come out of a dictionary in the order they were - # added. - self.assertEquals([('a', 2), ('b', 1)], formatter.attributes(tag)) - - # This works even if Tag.attrs is None, though this shouldn't - # normally happen. - tag.attrs = None - self.assertEquals([], formatter.attributes(tag)) - - def test_sort_attributes(self): - # Test the ability to override Formatter.attributes() to, - # e.g., disable the normal sorting of attributes. - class UnsortedFormatter(Formatter): - def attributes(self, tag): - self.called_with = tag - for k, v in sorted(tag.attrs.items()): - if k == 'ignore': - continue - yield k,v - - soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') - formatter = UnsortedFormatter() - decoded = soup.decode(formatter=formatter) - - # attributes() was called on the <p> tag. It filtered out one - # attribute and sorted the other two. - self.assertEquals(formatter.called_with, soup.p) - self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) - class TestNavigableStringSubclasses(SoupTest): |