From 519afbe269b671e15a1f1d2aecfe4fc579b61efc Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 7 Jul 2019 22:57:04 -0400 Subject: A Formatter can now decide how (or whether) to order the attributes inside a tag. [bug=1812422] --- CHANGELOG | 3 +++ bs4/element.py | 15 ++++++++++++--- bs4/tests/test_tree.py | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b60b5b5..019ace4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,9 @@ attributes are treated -- you can do this with the `multi_valued_attributes` argument. [bug=1832978] +* A Formatter can now decide how (or whether) to order the attributes + inside a tag. [bug=1812422] + * &apos; (which is valid in XML and XHTML, but not HTML 4) is now recognized as a named entity and converted to a single quote. [bug=1818721] diff --git a/bs4/element.py b/bs4/element.py index 1183f77..e8e48df 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -142,15 +142,20 @@ class Formatter(object): # By default, represent void elements as <tag/> rather than <tag> void_element_close_prefix = '/' - def substitute_entities(self, *args, **kwargs): + def substitute(self, *args, **kwargs): """Transform certain characters into named entities.""" raise NotImplementedError() + + def sort_attributes(self, attributes): + """Reorder a tag's attributes however you want.""" + return sorted(attributes.items()) + class HTMLFormatter(Formatter): """The default HTML formatter.""" def substitute(self, *args, **kwargs): return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - + class MinimalHTMLFormatter(Formatter): """A minimal HTML formatter.""" def substitute(self, *args, **kwargs): @@ -1157,7 +1162,11 @@ class Tag(PageElement): formatter = self._formatter_for_name(formatter) attrs = [] if self.attrs: - for key, val in sorted(self.attrs.items()): + if isinstance(formatter, Formatter): + sorted_attrs = formatter.sort_attributes(self.attrs) + else: + sorted_attrs = 
self.attrs.items() + for key, val in sorted_attrs: if val is None: decoded = key else: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index a14928e..f7c5e2f 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -24,6 +24,7 @@ from bs4.element import ( CData, Comment, Declaration, + MinimalHTMLFormatter, Doctype, NavigableString, SoupStrainer, @@ -1683,6 +1684,27 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'\\u2603', repr(soup)) +class TestFormatter(SoupTest): + + def test_sort_attributes(self): + class UnsortedFormatter(MinimalHTMLFormatter): + def sort_attributes(self, attributes): + self.called_with = attributes + for k, v in sorted(attributes.items()): + if k == 'ignore': + continue + yield k,v + + soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') + formatter = UnsortedFormatter() + decoded = soup.decode(formatter=formatter) + + # sort_attributes() was called with all three attributes. It removed one and + # sorted the other two. + self.assertEquals(formatter.called_with, dict(cval="1", aval="2", ignore="ignored")) + self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) + + class TestNavigableStringSubclasses(SoupTest): def test_cdata(self): -- cgit v1.2.3