diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 22:57:04 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 22:57:04 -0400 |
commit | 519afbe269b671e15a1f1d2aecfe4fc579b61efc (patch) | |
tree | 34009e19c95cae9245678451f3d7dc783f75f59a | |
parent | 2fcaeb6e916a09fa87b4b2ab57167c39db6cef8c (diff) |
A Formatter can now decide how (or whether) to order the attributes
inside a tag. [bug=1812422]
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/element.py | 15 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 22 |
3 files changed, 37 insertions, 3 deletions
@@ -6,6 +6,9 @@ attributes are treated -- you can do this with the `multi_valued_attributes` argument. [bug=1832978] +* A Formatter can now decide how (or whether) to order the attributes + inside a tag. [bug=1812422] + * &apos; (which is valid in XML and XHTML, but not HTML 4) is now recognized as a named entity and converted to a single quote. [bug=1818721] diff --git a/bs4/element.py b/bs4/element.py index 1183f77..e8e48df 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -142,15 +142,20 @@ class Formatter(object): # By default, represent void elements as <tag/> rather than <tag> void_element_close_prefix = '/' - def substitute_entities(self, *args, **kwargs): + def substitute(self, *args, **kwargs): """Transform certain characters into named entities.""" raise NotImplementedError() + + def sort_attributes(self, attributes): + """Reorder a tag's attributes however you want.""" + return sorted(attributes.items()) + class HTMLFormatter(Formatter): """The default HTML formatter.""" def substitute(self, *args, **kwargs): return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - + class MinimalHTMLFormatter(Formatter): """A minimal HTML formatter.""" def substitute(self, *args, **kwargs): @@ -1157,7 +1162,11 @@ class Tag(PageElement): formatter = self._formatter_for_name(formatter) attrs = [] if self.attrs: - for key, val in sorted(self.attrs.items()): + if isinstance(formatter, Formatter): + sorted_attrs = formatter.sort_attributes(self.attrs) + else: + sorted_attrs = self.attrs.items() + for key, val in sorted_attrs: if val is None: decoded = key else: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index a14928e..f7c5e2f 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -24,6 +24,7 @@ from bs4.element import ( CData, Comment, Declaration, + MinimalHTMLFormatter, Doctype, NavigableString, SoupStrainer, @@ -1683,6 +1684,27 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) +class 
TestFormatter(SoupTest): + + def test_sort_attributes(self): + class UnsortedFormatter(MinimalHTMLFormatter): + def sort_attributes(self, attributes): + self.called_with = attributes + for k, v in sorted(attributes.items()): + if k == 'ignore': + continue + yield k,v + + soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') + formatter = UnsortedFormatter() + decoded = soup.decode(formatter=formatter) + + # sort_attributes() was called with all three attributes. It removed one and + # sorted the other two. + self.assertEquals(formatter.called_with, dict(cval="1", aval="2", ignore="ignored")) + self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) + + class TestNavigableStringSubclasses(SoupTest): def test_cdata(self): |