3 files changed, 89 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ee68e25..21e5d6c 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -297,9 +297,10 @@ class BeautifulSoup(Tag):
 
     def decode(self, pretty_print=False,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               substitute_html_entities=False):
+               formatter="minimal"):
         """Returns a string or Unicode representation of this document.
         To get Unicode, pass None for encoding."""
+
         if self.is_xml:
             # Print the XML declaration
             encoding_part = ''
@@ -313,8 +314,7 @@ class BeautifulSoup(Tag):
         else:
             indent_level = 0
         return prefix + super(BeautifulSoup, self).decode(
-            indent_level, eventual_encoding,
-            substitute_html_entities)
+            indent_level, eventual_encoding, formatter)
 
 
 class StopParsing(Exception):
diff --git a/bs4/element.py b/bs4/element.py
index 7c72894..b176777 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -29,6 +29,24 @@ class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
 
+    # There are four possible values for the "formatter" argument passed in
+    # to methods like encode() and prettify():
+    #
+    # "html" - All Unicode characters with corresponding HTML entities
+    #   are converted to those entities on output.
+    # "minimal" - Bare ampersands and angle brackets are converted to
+    #   XML entities: &amp; &lt; &gt;
+    # None - The null formatter. Unicode characters are never
+    #   converted to entities.  This is not recommended, but it's
+    #   faster than "minimal".
+    # A function - This function will be called on every string that
+    #   needs to undergo entity substitution.
+    FORMATTERS = {
+        "html" : EntitySubstitution.substitute_html,
+        "minimal" : EntitySubstitution.substitute_xml,
+        None : None
+        }
+
     def setup(self, parent=None, previous_element=None):
         """Sets up the initial relations between this element and
         other elements."""
@@ -396,11 +414,15 @@ class NavigableString(unicode, PageElement):
                 "'%s' object has no attribute '%s'" % (
                     self.__class__.__name__, attr))
 
-    def output_ready(self, substitute_html_entities=False):
-        if substitute_html_entities:
-            output = EntitySubstitution.substitute_html(self)
+    def output_ready(self, formatter="minimal"):
+        if not callable(formatter):
+            formatter = self.FORMATTERS.get(
+                formatter, EntitySubstitution.substitute_xml)
+        if formatter is None:
+            output = self
         else:
-            output = EntitySubstitution.substitute_xml(self)
+            output = formatter(self)
+
         return self.PREFIX + output + self.SUFFIX
 
 
@@ -673,13 +695,13 @@ class Tag(PageElement):
         __str__ = __repr__ = __unicode__
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               indent_level=None, substitute_html_entities=False):
+               indent_level=None, formatter="minimal"):
         return self.decode(indent_level, encoding,
-                           substitute_html_entities).encode(encoding)
+                           formatter).encode(encoding)
 
     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               substitute_html_entities=False):
+               formatter="minimal"):
         """Returns a Unicode representation of this tag and its contents.
 
         :param eventual_encoding: The tag is destined to be
@@ -720,7 +742,7 @@ class Tag(PageElement):
             space = ''
             indent_contents = None
         contents = self.decode_contents(
-            indent_contents, eventual_encoding, substitute_html_entities)
+            indent_contents, eventual_encoding, formatter)
 
         if self.hidden:
             # This is the 'document root' object.
@@ -746,12 +768,13 @@ class Tag(PageElement):
             s = ''.join(s)
         return s
 
-    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return self.encode(encoding, True)
+    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                 formatter="minimal"):
+        return self.encode(encoding, True, formatter)
 
     def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-                       substitute_html_entities=False):
+                       formatter="minimal"):
         """Renders the contents of this tag as a Unicode string.
 
         :param eventual_encoding: The tag is destined to be
@@ -766,10 +789,10 @@ class Tag(PageElement):
         for c in self:
             text = None
             if isinstance(c, NavigableString):
-                text = c.output_ready(substitute_html_entities)
+                text = c.output_ready(formatter)
             elif isinstance(c, Tag):
                 s.append(c.decode(indent_level, eventual_encoding,
-                                  substitute_html_entities))
+                                  formatter))
             if text and indent_level:
                 text = text.strip()
             if text:
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 82a3bfa..5552347 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -934,12 +934,57 @@ class TestPersistence(SoupTest):
 
 class TestSubstitutions(SoupTest):
 
-    def test_html_entity_substitution(self):
-        soup = self.soup(
-            u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>")
-        decoded = soup.decode(substitute_html_entities=True)
+    def test_default_formatter_is_minimal(self):
+        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode()
+        # With no formatter given, < becomes &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_html(self):
+        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        self.assertEqual(
+            decoded,
+            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+    def test_formatter_minimal(self):
+        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_null(self):
+        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
         self.assertEqual(decoded,
-                          self.document_for("<b>Sacr&eacute; bleu!</b>"))
+                          self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+
+    def test_formatter_custom(self):
+        markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter = lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        self.assertEqual(
+            decoded,
+            self.document_for(u"<b><FOO></b><b>BAR</b>"))
+
+    def test_prettify_accepts_formatter(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>")
+        pretty = soup.prettify(formatter = lambda x: x.upper())
+        self.assertTrue(b"FOO" in pretty)
 
     def test_html_entity_substitution_off_by_default(self):
         markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
@@ -984,7 +1029,6 @@ class TestSubstitutions(SoupTest):
         soup = self.soup(markup, parse_only=strainer)
         self.assertEqual(soup.contents[0].name, 'pre')
 
-
 class TestEncoding(SoupTest):
     """Test the ability to encode objects into strings."""