4 files changed, 198 insertions, 184 deletions
diff --git a/bs4/element.py b/bs4/element.py
index a233dcd..c4b5bc7 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -16,7 +16,11 @@ except ImportError, e:
         'The soupsieve package is not installed. CSS selectors cannot be used.'
     )
 
-from bs4.dammit import EntitySubstitution
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)
 
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
@@ -99,123 +103,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
 
+    
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+   
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
 
-class Formatter(EntitySubstitution):
-    """Describes a strategy to use when outputting a parse tree to a string.
+        self.previous_element = previous_element
+        if previous_element is not None:
+            self.previous_element.next_element = self
 
-    Some parts of this strategy come from the distinction between
-    HTML4, HTML5, and XML. Others are configurable by the user.
-    """
-    # Registries of XML and HTML formatters.
-    XML_FORMATTERS = {}
-    HTML_FORMATTERS = {}
+        self.next_element = next_element
+        if self.next_element is not None:
+            self.next_element.previous_element = self
 
-    HTML = 'html'
-    XML = 'xml'
+        self.next_sibling = next_sibling
+        if self.next_sibling is not None:
+            self.next_sibling.previous_sibling = self
 
-    HTML_DEFAULTS = dict(
-        cdata_containing_tags=set(["script", "style"]),
-        preformatted_tags=set(["pre"]),
-    )
+        if (previous_sibling is None
+            and self.parent is not None and self.parent.contents):
+            previous_sibling = self.parent.contents[-1]
 
-    def _default(self, language, value, kwarg):
-        if value is not None:
-            return value
-        if language == self.XML:
-            return set()
-        return self.HTML_DEFAULTS[kwarg]
-
-    def __init__(
-            self, language=None, entity_substitution=None,
-            void_element_close_prefix='/', cdata_containing_tags=None,
-            preformatted_tags=None,
-    ):
-        """
+        self.previous_sibling = previous_sibling
+        if previous_sibling is not None:
+            self.previous_sibling.next_sibling = self
 
-        :param void_element_close_prefix: By default, represent void
-        elements as <tag/> rather than <tag>
-        """
-        self.language = language
-        self.entity_substitution = entity_substitution
-        self.void_element_close_prefix = void_element_close_prefix
-        self.cdata_containing_tags = self._default(
-            language, cdata_containing_tags, 'cdata_containing_tags'
-        )
-            
-    def substitute(self, ns):
-        """Process a string that needs to undergo entity substitution."""
-        if not self.entity_substitution:
-            return ns
-        if (isinstance(ns, NavigableString)
-            and ns.parent is not None
-            and ns.parent.name in self.cdata_containing_tags):
-            # Do nothing.
-            return ns
-        # Substitute.
-        return self.entity_substitution(ns)
-
-    def attribute_value(self, value):
-        """Process the value of an attribute."""
-        return self.substitute(value)
-    
-    def attributes(self, tag):
-        """Reorder a tag's attributes however you want."""
-        return sorted(tag.attrs.items())
-   
-class HTMLFormatter(Formatter):
-    REGISTRY = {}
-    def __init__(self, *args, **kwargs):
-        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
-    
-class XMLFormatter(Formatter):
-    REGISTRY = {}
-    def __init__(self, *args, **kwargs):
-        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
-
-# Set up aliases for the default formatters.
-HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
-    entity_substitution=EntitySubstitution.substitute_html
-)
-HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
-    entity_substitution=EntitySubstitution.substitute_html,
-    void_element_close_prefix = None
-)
-HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
-    entity_substitution=EntitySubstitution.substitute_xml
-)
-HTMLFormatter.REGISTRY[None] = HTMLFormatter(
-    entity_substitution=None
-)
-XMLFormatter.REGISTRY["html"] =  XMLFormatter(
-    entity_substitution=EntitySubstitution.substitute_html
-)
-XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
-    entity_substitution=EntitySubstitution.substitute_xml
-)
-XMLFormatter.REGISTRY[None] = Formatter(
-    Formatter(Formatter.XML, entity_substitution=None)
-)
-    
-class PageElement(object):
-    """Contains the navigational information for some part of the page
-    (either a tag or a piece of text)"""
-   
     def format_string(self, s, formatter):
         """Format the given string using the given formatter."""
         if formatter is None:
             return s
         if not isinstance(formatter, Formatter):
-            formatter = self.formatter_by_name(formatter)
+            formatter = self.formatter_for_name(formatter)
         output = formatter.substitute(s)
         return output
 
+    def formatter_for_name(self, formatter):
+        """Look up or create a Formatter for the given identifier,
+        if necessary.
+
+        :param formatter: Can be a Formatter object (used as-is), a
+        function (used as the entity substitution hook for an
+        XMLFormatter or HTMLFormatter), or a string (used to look up
+        an XMLFormatter or HTMLFormatter in the appropriate registry).
+        """
+        if isinstance(formatter, Formatter):
+            return formatter
+        if self._is_xml:
+            c = XMLFormatter
+        else:
+            c = HTMLFormatter
+        if callable(formatter):
+            return c(entity_substitution=formatter)
+        return c.REGISTRY[formatter]
+
     @property
     def _is_xml(self):
         """Is this element part of an XML tree or an HTML tree?
 
-        This is used when mapping a formatter name ("minimal") to an
-        appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It can be
+        This is used in formatter_for_name, when deciding whether an
+        XMLFormatter or HTMLFormatter is more appropriate. It can be
         inefficient, but it should be called very rarely.
         """
         if self.known_xml is not None:
@@ -233,32 +185,6 @@ class PageElement(object):
             return getattr(self, 'is_xml', False)
         return self.parent._is_xml
 
-    def setup(self, parent=None, previous_element=None, next_element=None,
-              previous_sibling=None, next_sibling=None):
-        """Sets up the initial relations between this element and
-        other elements."""
-        self.parent = parent
-
-        self.previous_element = previous_element
-        if previous_element is not None:
-            self.previous_element.next_element = self
-
-        self.next_element = next_element
-        if self.next_element is not None:
-            self.next_element.previous_element = self
-
-        self.next_sibling = next_sibling
-        if self.next_sibling is not None:
-            self.next_sibling.previous_sibling = self
-
-        if (previous_sibling is None
-            and self.parent is not None and self.parent.contents):
-            previous_sibling = self.parent.contents[-1]
-
-        self.previous_sibling = previous_sibling
-        if previous_sibling is not None:
-            self.previous_sibling.next_sibling = self
-
     nextSibling = _alias("next_sibling")  # BS3
     previousSibling = _alias("previous_sibling")  # BS3
 
@@ -720,6 +646,7 @@ class NavigableString(unicode, PageElement):
                     self.__class__.__name__, attr))
 
     def output_ready(self, formatter="minimal"):
+        """Run the string through the provided formatter."""
         output = self.format_string(self, formatter)
         return self.PREFIX + output + self.SUFFIX
 
@@ -742,8 +669,8 @@ class PreformattedString(NavigableString):
         """CData strings are passed into the formatter, purely
         for any side effects. The return value is ignored.
         """
-        if formatter:
-            self.format_string(self, formatter)
+        if formatter is not None:
+            ignore = self.format_string(self, formatter)
         return self.PREFIX + self + self.SUFFIX
 
 class CData(PreformattedString):
@@ -1119,8 +1046,9 @@ class Tag(PageElement):
            encoding.
         """
 
-        # First off, turn a string formatter into a Formatter object. This
-        # will stop the lookup from happening over and over again.
+        # First off, turn a non-Formatter `formatter` into a Formatter
+        # object. This will stop the lookup from happening over and
+        # over again.
         if not isinstance(formatter, Formatter):
             formatter = self.formatter_for_name(formatter)
         attributes = formatter.attributes(self)
@@ -1142,7 +1070,7 @@ class Tag(PageElement):
                 text = formatter.attribute_value(val)
                 decoded = (
                     unicode(key) + '='
-                    + EntitySubstitution.quoted_attribute_value(text))
+                    + formatter.quoted_attribute_value(text))
             attrs.append(decoded)
         close = ''
         closeTag = ''
@@ -1167,7 +1095,8 @@ class Tag(PageElement):
         else:
             indent_contents = None
         contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter)
+            indent_contents, eventual_encoding, formatter
+        )
 
         if self.hidden:
             # This is the 'document root' object.
@@ -1221,14 +1150,14 @@ class Tag(PageElement):
            indented this many spaces.
 
         :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. This method is _not_
+           encoded into this encoding. decode_contents() is _not_
            responsible for performing that encoding. This information
            is passed in so that it can be substituted in if the
            document contains a <META> tag that mentions the document's
            encoding.
 
-        :param formatter: The output formatter responsible for converting
-           entities to Unicode characters.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard Formatters.
         """
         # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
@@ -1244,29 +1173,17 @@ class Tag(PageElement):
             elif isinstance(c, Tag):
                 s.append(c.decode(indent_level, eventual_encoding,
                                   formatter))
-            if text and indent_level and not self.name == 'pre':
+            preserve_whitespace = self.name in self.preserve_whitespace_tags
+            if text and indent_level and not preserve_whitespace:
                 text = text.strip()
             if text:
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                     s.append(" " * (indent_level - 1))
                 s.append(text)
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                     s.append("\n")
         return ''.join(s)
-
-    def formatter_for_name(self, formatter):
-        if isinstance(formatter, Formatter):
-            return formatter
-        if self._is_xml:
-            c = XMLFormatter
-        else:
-            c = HTMLFormatter
-        if callable(formatter):
-            formatter = c(entity_substitution=formatter)
-            formatter.custom = True
-            return formatter
-        return c.REGISTRY[formatter]
-        
+       
     def encode_contents(
         self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
         formatter="minimal"):
diff --git a/bs4/formatter.py b/bs4/formatter.py
new file mode 100644
index 0000000..f2724db
--- /dev/null
+++ b/bs4/formatter.py
@@ -0,0 +1,99 @@
+from bs4.dammit import EntitySubstitution
+
+class Formatter(EntitySubstitution):
+    """Describes a strategy to use when outputting a parse tree to a string.
+
+    Some parts of this strategy come from the distinction between
+    HTML4, HTML5, and XML. Others are configurable by the user.
+    """
+    # Registries of XML and HTML formatters.
+    XML_FORMATTERS = {}
+    HTML_FORMATTERS = {}
+
+    HTML = 'html'
+    XML = 'xml'
+
+    HTML_DEFAULTS = dict(
+        cdata_containing_tags=set(["script", "style"]),
+    )
+
+    def _default(self, language, value, kwarg):
+        if value is not None:
+            return value
+        if language == self.XML:
+            return set()
+        return self.HTML_DEFAULTS[kwarg]
+
+    def __init__(
+            self, language=None, entity_substitution=None,
+            void_element_close_prefix='/', cdata_containing_tags=None,
+    ):
+        """
+
+        :param void_element_close_prefix: By default, represent void
+        elements as <tag/> rather than <tag>
+        """
+        self.language = language
+        self.entity_substitution = entity_substitution
+        self.void_element_close_prefix = void_element_close_prefix
+        self.cdata_containing_tags = self._default(
+            language, cdata_containing_tags, 'cdata_containing_tags'
+        )
+            
+    def substitute(self, ns):
+        """Process a string that needs to undergo entity substitution."""
+        if not self.entity_substitution:
+            return ns
+        from bs4.element import NavigableString  # deferred: circular import
+        if (isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in self.cdata_containing_tags):
+            # Strings inside CDATA-containing tags are returned unchanged.
+            return ns
+        # Everything else goes through the entity substitution hook.
+        return self.entity_substitution(ns)
+
+    def attribute_value(self, value):
+        """Process the value of an attribute."""
+        return self.substitute(value)
+    
+    def attributes(self, tag):
+        """Reorder a tag's attributes however you want."""
+        return sorted(tag.attrs.items())
+
+   
+class HTMLFormatter(Formatter):
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+
+    
+class XMLFormatter(Formatter):
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+
+# Set up aliases for the default formatters.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix = None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+    entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] =  XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = XMLFormatter(
+    entity_substitution=None
+)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index c1cd581..6510f85 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1571,11 +1571,11 @@ class TestSubstitutions(SoupTest):
         self.assertTrue(b"< < hey > >" in encoded)
 
     def test_prettify_leaves_preformatted_text_alone(self):
-        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>")
         # Everything outside the <pre> tag is reformatted, but everything
         # inside is left alone.
         self.assertEqual(
-            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
             soup.div.prettify())
 
     def test_prettify_accepts_formatter_function(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 8376549..0c09964 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2264,16 +2264,17 @@ to Beautiful Soup generating invalid HTML/XML, as in these examples::
  print(link_soup.a.encode(formatter=None))
  # <a href="http://example.com/?foo=val1&bar=val2">A link</a>
 
-Finally, if you pass in a function for ``formatter``, Beautiful Soup
-will call that function once for every string and attribute value in
-the document. You can do whatever you want in this function. Here's a
-formatter that converts strings to uppercase and does absolutely
-nothing else::
+If you need more sophisticated control over your output, you can
+use Beautiful Soup's ``Formatter`` class. Here's a formatter that
+converts strings to uppercase, whether they occur in a text node or in an
+attribute value::
 
+ from bs4.formatter import HTMLFormatter
  def uppercase(str):
      return str.upper()
+ formatter = HTMLFormatter(uppercase)
 
- print(soup.prettify(formatter=uppercase))
+ print(soup.prettify(formatter=formatter))
  # <html>
  #  <body>
  #   <p>
@@ -2282,34 +2283,31 @@ nothing else::
  #  </body>
  # </html>
 
- print(link_soup.a.prettify(formatter=uppercase))
+ print(link_soup.a.prettify(formatter=formatter))
  # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
  #  A LINK
  # </a>
 
-If you're writing your own function, you should know about the
-``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
-implements Beautiful Soup's standard formatters as class methods: the
-"html" formatter is ``EntitySubstitution.substitute_html``, and the
-"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can
-use these functions to simulate ``formatter=html`` or
-``formatter==minimal``, but then do something extra.
-
-Here's an example that replaces Unicode characters with HTML entities
-whenever possible, but `also` converts all strings to uppercase::
-
- from bs4.dammit import EntitySubstitution
- def uppercase_and_substitute_html_entities(str):
-     return EntitySubstitution.substitute_html(str.upper())
-
- print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
- # <html>
- #  <body>
- #   <p>
- #    IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt;
- #   </p>
- #  </body>
- # </html>
+Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
+more control over the output. For example, Beautiful Soup sorts the
+attributes in every tag by default::
+
+ attr_soup = BeautifulSoup('<p z="1" m="2" a="3"></p>')
+ print(attr_soup.p.encode())
+ # <p a="3" m="2" z="1"></p>
+
+To turn this off, you can override the ``Formatter.attributes()``
+method, which controls which attributes are output and in what
+order. This implementation also filters out the attribute named ``m``::
+
+ class UnsortedAttributes(HTMLFormatter):
+     def attributes(self, tag):
+         for k, v in tag.attrs.items():
+             if k == 'm':
+                 continue
+             yield k, v
+ print(attr_soup.p.encode(formatter=UnsortedAttributes())) 
+ # <p z="1" a="3"></p>
 
 One last caveat: if you create a ``CData`` object, the text inside
 that object is always presented `exactly as it appears, with no