Give the Formatter class more control over formatting decisions.

author: Leonard Richardson <leonardr@segfault.org> 2019-07-14 17:09:58 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-07-14 17:09:58 -0400
commit: 0df054db08ef3286482694ee0c9aa85b5313dfd2 (patch)
tree: d1b38991f1148abccb0862484d87d760654cd18f
parent: 519afbe269b671e15a1f1d2aecfe4fc579b61efc (diff)
3 files changed, 158 insertions, 174 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c5e6e84..e087f07 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -7,7 +7,6 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
-    HTMLAwareEntitySubstitution,
     nonwhitespace_re
     )
 
@@ -90,7 +89,6 @@ class TreeBuilder(object):
 
     is_xml = False
     picklable = False
-    preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
     
@@ -98,9 +96,11 @@ class TreeBuilder(object):
     # comma-separated list of CDATA, rather than a single CDATA.
     DEFAULT_CDATA_LIST_ATTRIBUTES = {}
 
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+    
     USE_DEFAULT = object()
     
-    def __init__(self, multi_valued_attributes=USE_DEFAULT):
+    def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
@@ -110,14 +110,19 @@ class TreeBuilder(object):
         for an example.
 
         Internally, these are called "CDATA list attributes", but that
-        probably doesn't make sense to an end-use, so the argument ame
+        probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.
+
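+        :param preserve_whitespace_tags: Names of tags that must not be pretty-printed. If left as USE_DEFAULT, DEFAULT_PRESERVE_WHITESPACE_TAGS is used.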
         """
         self.soup = None
         if multi_valued_attributes is self.USE_DEFAULT:
             multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
         self.cdata_list_attributes = multi_valued_attributes
-
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+            
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
@@ -253,7 +258,6 @@ class HTMLTreeBuilder(TreeBuilder):
     Such as which tags are empty-element tags.
     """
 
-    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set([
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@@ -292,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
         }
 
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+    
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
diff --git a/bs4/element.py b/bs4/element.py
index e8e48df..a233dcd 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -99,134 +99,114 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
 
-class HTMLAwareEntitySubstitution(EntitySubstitution):
 
-    """Entity substitution rules that are aware of some HTML quirks.
+class Formatter(EntitySubstitution):
+    """Describes a strategy to use when outputting a parse tree to a string.
 
-    Specifically, the contents of <script> and <style> tags should not
-    undergo entity substitution.
-
-    Incoming NavigableString objects are checked to see if they're the
-    direct children of a <script> or <style> tag.
+    Some parts of this strategy come from the distinction between
+    HTML4, HTML5, and XML. Others are configurable by the user.
     """
+    # Registries of XML and HTML formatters.
+    XML_FORMATTERS = {}
+    HTML_FORMATTERS = {}
 
-    cdata_containing_tags = set(["script", "style"])
+    HTML = 'html'
+    XML = 'xml'
 
-    preformatted_tags = set(["pre"])
+    HTML_DEFAULTS = dict(
+        cdata_containing_tags=set(["script", "style"]),
+        preformatted_tags=set(["pre"]),
+    )
 
-    preserve_whitespace_tags = set(['pre', 'textarea'])
+    def _default(self, language, value, kwarg):
+        if value is not None:
+            return value
+        if language == self.XML:
+            return set()
+        return self.HTML_DEFAULTS[kwarg]
+
+    def __init__(
+            self, language=None, entity_substitution=None,
+            void_element_close_prefix='/', cdata_containing_tags=None,
+            preformatted_tags=None,
+    ):
+        """
 
-    @classmethod
-    def _substitute_if_appropriate(cls, ns, f):
+        :param void_element_close_prefix: By default, represent void
+        elements as <tag/> rather than <tag>
+        """
+        self.language = language
+        self.entity_substitution = entity_substitution
+        self.void_element_close_prefix = void_element_close_prefix
+        self.cdata_containing_tags = self._default(
+            language, cdata_containing_tags, 'cdata_containing_tags'
+        )
+            
+    def substitute(self, ns):
+        """Process a string that needs to undergo entity substitution."""
+        if not self.entity_substitution:
+            return ns
         if (isinstance(ns, NavigableString)
             and ns.parent is not None
-            and ns.parent.name in cls.cdata_containing_tags):
+            and ns.parent.name in self.cdata_containing_tags):
             # Do nothing.
             return ns
         # Substitute.
-        return f(ns)
-
-    @classmethod
-    def substitute_html(cls, ns):
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_html)
+        return self.entity_substitution(ns)
 
-    @classmethod
-    def substitute_xml(cls, ns):
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_xml)
-
-class Formatter(object):
-    """Contains information about how to format a parse tree."""
+    def attribute_value(self, value):
+        """Process the value of an attribute."""
+        return self.substitute(value)
     
-    # By default, represent void elements as <tag/> rather than <tag>
-    void_element_close_prefix = '/'
-
-    def substitute(self, *args, **kwargs):
-        """Transform certain characters into named entities."""
-        raise NotImplementedError()
-    
-    def sort_attributes(self, attributes):
+    def attributes(self, tag):
         """Reorder a tag's attributes however you want."""
-        return sorted(attributes.items())
-
-
+        return sorted(tag.attrs.items())
+   
 class HTMLFormatter(Formatter):
-    """The default HTML formatter."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-    
-class MinimalHTMLFormatter(Formatter):
-    """A minimal HTML formatter."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
     
-class HTML5Formatter(HTMLFormatter):
-    """An HTML formatter that omits the slash in a void tag."""
-    void_element_close_prefix = None
-
 class XMLFormatter(Formatter):
-    """Substitute only the essential XML entities."""
-    def substitute(self, *args, **kwargs):
-        return EntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTMLXMLFormatter(Formatter):
-    """Format XML using HTML rules."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+# Register the default formatters under the names callers may pass as `formatter`.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix=None  # no slash before '>' in void tags
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+    entity_substitution=None  # null formatter: strings pass through unchanged
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = XMLFormatter(
+    entity_substitution=None  # null formatter, built like the HTML one above
+)
     
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
-
-    # There are five possible values for the "formatter" argument passed in
-    # to methods like encode() and prettify():
-    #
-    # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output.
-    # "html5" - The same as "html", but empty void tags are represented as
-    #   <tag> rather than <tag/>
-    # "minimal" - Bare ampersands and angle brackets are converted to
-    #   XML entities: &amp; &lt; &gt;
-    # None - The null formatter. Unicode characters are never
-    #   converted to entities.  This is not recommended, but it's
-    #   faster than "minimal".
-    # A callable function - it will be called on every string that needs to undergo entity substitution.
-    # A Formatter instance - Formatter.substitute(string) will be called on every string that
-    #  needs to undergo entity substitution.
-    #
-
-    # In an HTML document, the default "html", "html5", and "minimal"
-    # functions will leave the contents of <script> and <style> tags
-    # alone. For an XML document, all tags will be given the same
-    # treatment.
-
-    HTML_FORMATTERS = {
-        "html" : HTMLFormatter(),
-        "html5" : HTML5Formatter(),
-        "minimal" : MinimalHTMLFormatter(),
-        None : None
-        }
-
-    XML_FORMATTERS = {
-        "html" : HTMLXMLFormatter(),
-        "minimal" : XMLFormatter(),
-        None : None
-        }
-
-    def format_string(self, s, formatter='minimal'):
+   
+    def format_string(self, s, formatter):
         """Format the given string using the given formatter."""
-        if isinstance(formatter, basestring):
-            formatter = self._formatter_for_name(formatter)
         if formatter is None:
-            output = s
-        else:
-            if isinstance(formatter, Callable):
-                # Backwards compatibility -- you used to pass in a formatting method.
-                output = formatter(s)
-            else:
-                output = formatter.substitute(s)
+            return s
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        output = formatter.substitute(s)
         return output
 
     @property
@@ -253,13 +233,6 @@ class PageElement(object):
             return getattr(self, 'is_xml', False)
         return self.parent._is_xml
 
-    def _formatter_for_name(self, name):
-        "Look up a formatter function based on its name and the tree."
-        if self._is_xml:
-            return self.XML_FORMATTERS.get(name, XMLFormatter())
-        else:
-            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
-
     def setup(self, parent=None, previous_element=None, next_element=None,
               previous_sibling=None, next_sibling=None):
         """Sets up the initial relations between this element and
@@ -765,10 +738,12 @@ class PreformattedString(NavigableString):
     but the return value will be ignored.
     """
 
-    def output_ready(self, formatter="minimal"):
-        """CData strings are passed into the formatter.
-        But the return value is ignored."""
-        self.format_string(self, formatter)
+    def output_ready(self, formatter=None):
+        """CData strings are passed into the formatter, purely
+        for any side effects. The return value is ignored.
+        """
+        if formatter:
+            self.format_string(self, formatter)
         return self.PREFIX + self + self.SUFFIX
 
 class CData(PreformattedString):
@@ -836,14 +811,6 @@ class Tag(PageElement):
         self.name = name
         self.namespace = namespace
         self.prefix = prefix
-        if builder is not None:
-            preserve_whitespace_tags = builder.preserve_whitespace_tags
-        else:
-            if is_xml:
-                preserve_whitespace_tags = []
-            else:
-                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-        self.preserve_whitespace_tags = preserve_whitespace_tags
         if attrs is None:
             attrs = {}
         elif attrs:
@@ -887,6 +854,10 @@ class Tag(PageElement):
             # (unlike can_be_empty_element), we almost never need
             # to check this.
             self.cdata_list_attributes = builder.cdata_list_attributes
+
+            # Keep track of the names that might cause this tag to be treated as a
+            # whitespace-preserved tag.
+            self.preserve_whitespace_tags = builder.preserve_whitespace_tags
             
     parserClass = _alias("parser_class")  # BS3
 
@@ -1135,14 +1106,6 @@ class Tag(PageElement):
         u = self.decode(indent_level, encoding, formatter)
         return u.encode(encoding, errors)
 
-    def _should_pretty_print(self, indent_level):
-        """Should this tag be pretty-printed?"""
-
-        return (
-            indent_level is not None
-            and self.name not in self.preserve_whitespace_tags
-        )
-
     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                formatter="minimal"):
@@ -1158,32 +1121,29 @@ class Tag(PageElement):
 
         # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
-            formatter = self._formatter_for_name(formatter)
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        attributes = formatter.attributes(self)
         attrs = []
-        if self.attrs:
-            if isinstance(formatter, Formatter):
-                sorted_attrs = formatter.sort_attributes(self.attrs)
+        for key, val in attributes:
+            if val is None:
+                decoded = key
             else:
-                sorted_attrs = self.attrs.items()
-            for key, val in sorted_attrs:
-                if val is None:
-                    decoded = key
-                else:
-                    if isinstance(val, list) or isinstance(val, tuple):
-                        val = ' '.join(val)
-                    elif not isinstance(val, basestring):
-                        val = unicode(val)
-                    elif (
+                if isinstance(val, list) or isinstance(val, tuple):
+                    val = ' '.join(val)
+                elif not isinstance(val, basestring):
+                    val = unicode(val)
+                elif (
                         isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None):
-                        val = val.encode(eventual_encoding)
-
-                    text = self.format_string(val, formatter)
-                    decoded = (
-                        unicode(key) + '='
-                        + EntitySubstitution.quoted_attribute_value(text))
-                attrs.append(decoded)
+                        and eventual_encoding is not None
+                ):
+                    val = val.encode(eventual_encoding)
+
+                text = formatter.attribute_value(val)
+                decoded = (
+                    unicode(key) + '='
+                    + EntitySubstitution.quoted_attribute_value(text))
+            attrs.append(decoded)
         close = ''
         closeTag = ''
 
@@ -1192,9 +1152,7 @@ class Tag(PageElement):
             prefix = self.prefix + ":"
 
         if self.is_empty_element:
-            close = ''
-            if isinstance(formatter, Formatter):
-                close = formatter.void_element_close_prefix or close
+            close = formatter.void_element_close_prefix or ''
         else:
             closeTag = '</%s%s>' % (prefix, self.name)
 
@@ -1241,6 +1199,13 @@ class Tag(PageElement):
             s = ''.join(s)
         return s
 
+    def _should_pretty_print(self, indent_level):
+        """Decide whether this tag's output may be pretty-printed."""
+        if indent_level is None:
+            # No indentation was requested, so never pretty-print.
+            return False
+        return self.name not in self.preserve_whitespace_tags
+
     def prettify(self, encoding=None, formatter="minimal"):
         if encoding is None:
             return self.decode(True, formatter=formatter)
@@ -1267,8 +1232,8 @@ class Tag(PageElement):
         """
         # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
-            formatter = self._formatter_for_name(formatter)
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
 
         pretty_print = (indent_level is not None)
         s = []
@@ -1289,6 +1254,19 @@ class Tag(PageElement):
                     s.append("\n")
         return ''.join(s)
 
+    def formatter_for_name(self, formatter):
+        """Resolve a formatter name or callable into a Formatter object."""
+        if isinstance(formatter, Formatter):
+            return formatter
+        # XML and HTML documents keep separate formatter registries.
+        formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
+        if not callable(formatter):
+            return formatter_class.REGISTRY[formatter]
+        # A bare callable is wrapped as the entity-substitution function.
+        custom_formatter = formatter_class(entity_substitution=formatter)
+        custom_formatter.custom = True
+        return custom_formatter
+        
     def encode_contents(
         self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
         formatter="minimal"):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f7c5e2f..ffbc29e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -24,8 +24,8 @@ from bs4.element import (
     CData,
     Comment,
     Declaration,
-    MinimalHTMLFormatter,
     Doctype,
+    HTMLFormatter,
     NavigableString,
     SoupStrainer,
     Tag,
@@ -1533,7 +1533,7 @@ class TestSubstitutions(SoupTest):
         # callable is called on every string.
         self.assertEqual(
             decoded,
-            self.document_for(u"<b><FOO></b><b>BAR</b><br>"))
+            self.document_for(u"<b><FOO></b><b>BAR</b><br/>"))
 
     def test_formatter_is_run_on_attribute_values(self):
         markup = u'<a href="http://a.com?a=b&c=é">e</a>'
@@ -1687,10 +1687,10 @@ class TestEncoding(SoupTest):
 class TestFormatter(SoupTest):
 
     def test_sort_attributes(self):
-        class UnsortedFormatter(MinimalHTMLFormatter):
-            def sort_attributes(self, attributes):
-                self.called_with = attributes
-                for k, v in sorted(attributes.items()):
+        class UnsortedFormatter(HTMLFormatter):
+            def attributes(self, tag):
+                self.called_with = tag
+                for k, v in sorted(tag.attrs.items()):
                     if k == 'ignore':
                         continue
                     yield k,v
@@ -1699,9 +1699,9 @@ class TestFormatter(SoupTest):
         formatter = UnsortedFormatter()
         decoded = soup.decode(formatter=formatter)
 
-        # sort_attributes() was called with all three attributes. It removed one and
-        # sorted the other two.
-        self.assertEquals(formatter.called_with, dict(cval="1", aval="2", ignore="ignored"))
+        # attributes() was called on the <p> tag. It filtered out one
+        # attribute and sorted the other two.
+        self.assertEquals(formatter.called_with, soup.p)
         self.assertEquals(u'<p aval="2" cval="1"></p>', decoded)
author	Leonard Richardson <leonardr@segfault.org>	2019-07-14 17:09:58 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2019-07-14 17:09:58 -0400
commit	0df054db08ef3286482694ee0c9aa85b5313dfd2 (patch)
tree	d1b38991f1148abccb0862484d87d760654cd18f
parent	519afbe269b671e15a1f1d2aecfe4fc579b61efc (diff)