7 files changed, 234 insertions, 125 deletions
diff --git a/CHANGELOG b/CHANGELOG
index c9a4ca7..abdf1b1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -25,6 +25,7 @@ So have some arguments to popular methods:
 
  * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...)
  * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...)
+ * Tag.encode(prettyPrint=...) -> Tag.encode(pretty_print=...)
 
 == Generators are now properties ==
 
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index c998924..ce39d33 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -63,10 +63,10 @@ __all__ = ['BeautifulSoup']
 
 import re
 
-from util import isList, isString, buildSet
+from util import isList, buildSet
 from builder import builder_registry
 from dammit import UnicodeDammit
-from element import Entities, NavigableString, Tag
+from element import NavigableString, Tag
 
 
 class BeautifulSoup(Tag):
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index b97c5f9..fb10628 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,7 +1,6 @@
 from collections import defaultdict
 import re
 import sys
-from beautifulsoup.element import Entities
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -73,7 +72,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()
 
 
-class TreeBuilder(Entities):
+class TreeBuilder(object):
     """Turn a document into a Beautiful Soup object tree."""
 
     features = []
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 455b0bf..67bec17 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -7,6 +7,7 @@ encoding; that's the tree builder's job.
 """
 
 import codecs
+from htmlentitydefs import codepoint2name
 import re
 import types
 
@@ -21,18 +22,124 @@ try:
 except ImportError:
     chardet = None
 
-# Both are available from http://cjkpython.i18n.org/
-# They're built in if you use Python 2.4.
-try:
-    import cjkcodecs.aliases
-except ImportError:
-    pass
+# Available from http://cjkpython.i18n.org/.
 try:
     import iconv_codec
 except ImportError:
     pass
 
 
+class EntitySubstitution(object):
+
+    def _populate_class_variables():
+        lookup = {}
+        characters = []
+        for codepoint, name in codepoint2name.items():
+            character = unichr(codepoint)
+            characters.append(character)
+            lookup[character] = name
+        re_definition = "[%s]" % "".join(characters)
+        return lookup, re.compile(re_definition)
+    CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
+        _populate_class_variables())
+
+
+    CHARACTER_TO_XML_ENTITY = {
+        "'" : "apos",
+        '"' : "quot",
+        "&" : "amp",
+        "<" : "lt",
+        ">" : "gt",
+        }
+
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           ")")
+
+    def _substitute_html_entity(self, matchobj):
+        entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+        return "&%s;" % entity
+
+    def _substitute_xml_entity(self, matchobj):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+        return "&%s;" % entity
+
+    def substitute_xml(self, value, make_quoted_attribute=False,
+                       destination_is_xml=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+
+         Ordinarily, the string will be quoted using double quotes.
+
+          Bob's Bar -> "Bob's Bar"
+
+         If the string contains double quotes, it will be quoted using
+         single quotes.
+
+          Welcome to "my bar" -> 'Welcome to "my bar"'
+
+         If the string contains both single and double quotes, the
+         single quotes will be escaped (see `destination_is_xml`), and
+         the string will be quoted using single quotes.
+
+          Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s Bar"'
+                                              OR
+                                    'Welcome to "Bob&apos;s Bar"'
+          (depending on the value of `destination_is_xml`)
+
+        :param destination_is_xml: If destination_is_xml is True,
+          then when a single quote is escaped it will become
+          "&apos;". But &apos; is not a valid HTML 4 entity. If
+          destination_is_xml is False, then single quotes will be
+          turned into "&squot;".
+
+          The value of this argument is irrelevant unless
+          make_quoted_attribute is True.
+        """
+        quote_with = '"'
+        if make_quoted_attribute:
+            if '"' in value:
+                quote_with = "'"
+                if "'" in value:
+                    if destination_is_xml:
+                        replace_with = "&apos;"
+                    else:
+                        replace_with = "&squot;"
+                    value = value.replace("'", replace_with)
+
+        # Escape angle brackets, and ampersands that aren't part of
+        # entities.
+        value = self.BARE_AMPERSAND_OR_BRACKET.sub(
+            self._substitute_xml_entity, value)
+        if make_quoted_attribute:
+            return quote_with + value + quote_with
+        else:
+            return value
+
+    def substitute_html(self, s):
+        """Replace certain Unicode characters with named HTML entities.
+
+        This differs from data.encode(encoding, 'xmlcharrefreplace')
+        in that the goal is to make the result more readable (to those
+        with ASCII displays) rather than to recover from
+        errors. There's absolutely nothing wrong with a UTF-8 string
+        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+        character with "&eacute;" will make it more readable to some
+        people.
+        """
+        return self.CHARACTER_TO_HTML_ENTITY_RE.sub(
+            self._substitute_html_entity, s)
+
+
 class UnicodeDammit:
     """A class for detecting the encoding of a *ML document and
     converting it to a Unicode string. If the source encoding is
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index a70813d..0ef9db1 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -4,32 +4,12 @@ try:
     from htmlentitydefs import name2codepoint
 except ImportError:
     name2codepoint = {}
+from beautifulsoup.dammit import EntitySubstitution
 
-from util import isString, isList
+from util import isList
 
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
-class Entities(object):
-    """A mixin class that knows about XML entities."""
-
-    HTML_ENTITIES = "html"
-    XML_ENTITIES = "xml"
-    XHTML_ENTITIES = "xhtml"
-
-    def _invert(h):
-        "Cheap function to invert a hash."
-        i = {}
-        for k,v in h.items():
-            i[v] = k
-        return i
-
-    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
-                                      "quot" : '"',
-                                      "amp" : "&",
-                                      "lt" : "<",
-                                      "gt" : ">" }
-
-    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 
 class PageElement(object):
     """Contains the navigational information for some part of the page
@@ -378,28 +358,28 @@ class NavigableString(unicode, PageElement):
         else:
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         return self
 
 class CData(NavigableString):
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         return u'<![CDATA[' + self + u']]>'
 
 class ProcessingInstruction(NavigableString):
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         output = self
         if u'%SOUP-ENCODING%' in output:
-            output = self.substituteEncoding(output, eventualEncoding)
+            output = self.substituteEncoding(output, eventual_encoding)
         return u'<?' + output + u'?>'
 
 class Comment(NavigableString):
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         return u'<!--' + self + u'-->'
 
 class Declaration(NavigableString):
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         return u'<!' + self + u'>'
 
 class Doctype(NavigableString):
@@ -414,10 +394,10 @@ class Doctype(NavigableString):
 
         return Doctype(value)
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def decodeGivenEventualEncoding(self, eventual_encoding):
         return u'<!DOCTYPE ' + self + u'>'
 
-class Tag(PageElement, Entities):
+class Tag(PageElement, EntitySubstitution):
 
     """Represents a found HTML tag with its attributes and contents."""
 
@@ -556,7 +536,7 @@ class Tag(PageElement, Entities):
         """Returns true iff this tag has the same name, the same attributes,
         and the same contents (recursively) as the given tag.
 
-        NOTE: right now this will return false if two tags have the
+        XXX: right now this will return false if two tags have the
         same attributes in a different order. Should this be fixed?"""
         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
             return False
@@ -572,16 +552,7 @@ class Tag(PageElement, Entities):
 
     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders this tag as a string."""
-        return self.decode(eventualEncoding=encoding)
-
-    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
-                                           + ")")
-
-    def _sub_entity(self, x):
-        """Used with a regular expression to substitute the
-        appropriate XML entity for an XML special character."""
-        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+        return self.decode(eventual_encoding=encoding)
 
     def __unicode__(self):
         return self.decode()
@@ -590,56 +561,29 @@ class Tag(PageElement, Entities):
         return self.encode()
 
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               prettyPrint=False, indentLevel=0):
-        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+               pretty_print=False, indent_level=0):
+        return self.decode(pretty_print, indent_level, encoding).encode(encoding)
 
-    def decode(self, prettyPrint=False, indentLevel=0,
-               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+    def decode(self, pretty_print=False, indent_level=0,
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING):
         """Returns a string or Unicode representation of this tag and
         its contents. To get Unicode, pass None for encoding."""
 
         attrs = []
         if self.attrs:
             for key, val in self.attrs:
-                fmt = '%s="%s"'
-                if isString(val):
-                    if (self.contains_substitutions
-                        and eventualEncoding is not None
-                        and '%SOUP-ENCODING%' in val):
-                        val = self.substituteEncoding(val, eventualEncoding)
-
-                    # The attribute value either:
-                    #
-                    # * Contains no embedded double quotes or single quotes.
-                    #   No problem: we enclose it in double quotes.
-                    # * Contains embedded single quotes. No problem:
-                    #   double quotes work here too.
-                    # * Contains embedded double quotes. No problem:
-                    #   we enclose it in single quotes.
-                    # * Embeds both single _and_ double quotes. This
-                    #   can't happen naturally, but it can happen if
-                    #   you modify an attribute value after parsing
-                    #   the document. Now we have a bit of a
-                    #   problem. We solve it by enclosing the
-                    #   attribute in single quotes, and escaping any
-                    #   embedded single quotes to XML entities.
-                    if '"' in val:
-                        fmt = "%s='%s'"
-                        if "'" in val:
-                            # TODO: replace with apos when
-                            # appropriate.
-                            val = val.replace("'", "&squot;")
-
-                    # Now we're okay w/r/t quotes. But the attribute
-                    # value might also contain angle brackets, or
-                    # ampersands that aren't part of entities. We need
-                    # to escape those to XML entities too.
-                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
                 if val is None:
-                    # Handle boolean attributes.
                     decoded = key
                 else:
-                    decoded = fmt % (key, val)
+                    if not isinstance(val, basestring):
+                        val = str(val)
+                    if (self.contains_substitutions
+                        and eventual_encoding is not None
+                        and '%SOUP-ENCODING%' in val):
+                        val = self.substituteEncoding(val, eventual_encoding)
+
+                    # XXX: destination_is_xml is hard-coded to False for now.
+                    decoded = key + '=' + self.substitute_xml(val, True, False)
                 attrs.append(decoded)
         close = ''
         closeTag = ''
@@ -649,12 +593,12 @@ class Tag(PageElement, Entities):
             closeTag = '</%s>' % self.name
 
         indentTag, indentContents = 0, 0
-        if prettyPrint:
-            indentTag = indentLevel
+        if pretty_print:
+            indentTag = indent_level
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
-        contents = self.decodeContents(prettyPrint, indentContents,
-                                       eventualEncoding)
+        contents = self.decodeContents(pretty_print, indentContents,
+                                       eventual_encoding)
         if self.hidden:
             s = contents
         else:
@@ -662,18 +606,18 @@ class Tag(PageElement, Entities):
             attributeString = ''
             if attrs:
                 attributeString = ' ' + ' '.join(attrs)
-            if prettyPrint:
+            if pretty_print:
                 s.append(space)
             s.append('<%s%s%s>' % (self.name, attributeString, close))
-            if prettyPrint:
+            if pretty_print:
                 s.append("\n")
             s.append(contents)
-            if prettyPrint and contents and contents[-1] != "\n":
+            if pretty_print and contents and contents[-1] != "\n":
                 s.append("\n")
-            if prettyPrint and closeTag:
+            if pretty_print and closeTag:
                 s.append(space)
             s.append(closeTag)
-            if prettyPrint and closeTag and self.nextSibling:
+            if pretty_print and closeTag and self.nextSibling:
                 s.append("\n")
             s = ''.join(s)
         return s
@@ -692,27 +636,27 @@ class Tag(PageElement, Entities):
         return self.encode(encoding, True)
 
     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       prettyPrint=False, indentLevel=0):
-        return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+                       pretty_print=False, indent_level=0):
+        return self.decodeContents(pretty_print, indent_level).encode(encoding)
 
-    def decodeContents(self, prettyPrint=False, indentLevel=0,
-                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+    def decodeContents(self, pretty_print=False, indent_level=0,
+                       eventual_encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
         for c in self:
             text = None
             if isinstance(c, NavigableString):
-                text = c.decodeGivenEventualEncoding(eventualEncoding)
+                text = c.decodeGivenEventualEncoding(eventual_encoding)
             elif isinstance(c, Tag):
-                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
-            if text and prettyPrint:
+                s.append(c.decode(pretty_print, indent_level, eventual_encoding))
+            if text and pretty_print:
                 text = text.strip()
             if text:
-                if prettyPrint:
-                    s.append(" " * (indentLevel-1))
+                if pretty_print:
+                    s.append(" " * (indent_level-1))
                 s.append(text)
-                if prettyPrint:
+                if pretty_print:
                     s.append("\n")
         return ''.join(s)
 
@@ -790,7 +734,7 @@ class SoupStrainer(object):
 
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
         self.name = name
-        if isString(attrs):
+        if isinstance(attrs, basestring):
             kwargs['class'] = attrs
             attrs = None
         if kwargs:
@@ -863,7 +807,7 @@ class SoupStrainer(object):
                 found = self.searchTag(markup)
         # If it's text, make sure the text matches.
         elif isinstance(markup, NavigableString) or \
-                 isString(markup):
+                 isinstance(markup, basestring):
             if self._matches(markup, self.text):
                 found = markup
         else:
@@ -883,18 +827,19 @@ class SoupStrainer(object):
             #other ways of matching match the tag name as a string.
             if isinstance(markup, Tag):
                 markup = markup.name
-            if markup is not None and not isString(markup):
+            if markup is not None and not isinstance(markup, basestring):
                 markup = unicode(markup)
             #Now we know that chunk is either a string, or None.
             if hasattr(matchAgainst, 'match'):
                 # It's a regexp object.
                 result = markup and matchAgainst.search(markup)
             elif (isList(matchAgainst)
-                  and (markup is not None or not isString(matchAgainst))):
+                  and (markup is not None
+                       or not isinstance(matchAgainst, basestring))):
                 result = markup in matchAgainst
             elif hasattr(matchAgainst, 'items'):
                 result = markup.has_key(matchAgainst)
-            elif matchAgainst and isString(markup):
+            elif matchAgainst and isinstance(markup, basestring):
                 if isinstance(markup, unicode):
                     matchAgainst = unicode(matchAgainst)
                 else:
diff --git a/beautifulsoup/util.py b/beautifulsoup/util.py
index 693a7e2..5978865 100644
--- a/beautifulsoup/util.py
+++ b/beautifulsoup/util.py
@@ -9,17 +9,9 @@ except NameError:
 def isList(l):
     """Convenience method that works with all 2.x versions of Python
     to determine whether or not something is listlike."""
-    return ((hasattr(l, '__iter__') and not isString(l))
+    return ((hasattr(l, '__iter__') and not isinstance(l, basestring))
             or (type(l) in (types.ListType, types.TupleType)))
 
-def isString(s):
-    """Convenience method that works with all 2.x versions of Python
-    to determine whether or not something is stringlike."""
-    try:
-        return isinstance(s, unicode) or isinstance(s, basestring)
-    except NameError:
-        return isinstance(s, str)
-
 def buildSet(args=None):
     """Turns a list or a string into a set."""
     if isinstance(args, str):
diff --git a/tests/test_soup.py b/tests/test_soup.py
index bb2262a..eaedd94 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -3,7 +3,7 @@
 
 import unittest
 from beautifulsoup.element import SoupStrainer
-from beautifulsoup.dammit import UnicodeDammit
+from beautifulsoup.dammit import EntitySubstitution, UnicodeDammit
 from beautifulsoup.testing import SoupTest
 
 
@@ -16,6 +16,71 @@ class TestSelectiveParsing(SoupTest):
         self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
 
 
+class TestEntitySubstitution(unittest.TestCase):
+    """Standalone tests of the EntitySubstitution class."""
+    def setUp(self):
+        self.sub = EntitySubstitution()
+
+    def test_simple_html_substitution(self):
+        # Unicode characters corresponding to named HTML entities
+        # are substituted, and no others.
+        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+        self.assertEquals(self.sub.substitute_html(s),
+                          u"foo&forall;\N{SNOWMAN}&otilde;bar")
+
+    def test_smart_quote_substitution(self):
+        # MS smart quotes are a common source of frustration, so we
+        # give them a special test.
+        quotes = "\x91\x92foo\x93\x94"
+        dammit = UnicodeDammit(quotes)
+        self.assertEquals(self.sub.substitute_html(dammit.markup),
+                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
+
+    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
+        s = 'Welcome to "my bar"'
+        self.assertEquals(self.sub.substitute_xml(s, False), s)
+
+    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
+        self.assertEquals(self.sub.substitute_xml("Welcome", True),
+                          '"Welcome"')
+        self.assertEquals(self.sub.substitute_xml("Bob's Bar", True),
+                          '"Bob\'s Bar"')
+
+    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
+        s = 'Welcome to "my bar"'
+        self.assertEquals(self.sub.substitute_xml(s, True),
+                          "'Welcome to \"my bar\"'")
+
+    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
+        s = 'Welcome to "Bob\'s Bar"'
+        # This one is going into an HTML document.
+        self.assertEquals(
+            self.sub.substitute_xml(s, True),
+            "'Welcome to \"Bob&squot;s Bar\"'")
+
+        # This one is going into an XML document.
+        self.assertEquals(
+            self.sub.substitute_xml(s, True, destination_is_xml=True),
+            "'Welcome to \"Bob&apos;s Bar\"'")
+
+    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
+        quoted = 'Welcome to "Bob\'s Bar"'
+        self.assertEquals(self.sub.substitute_xml(quoted), quoted)
+
+    def test_xml_quoting_handles_angle_brackets(self):
+        self.assertEquals(
+            self.sub.substitute_xml("foo<bar>"),
+            "foo&lt;bar&gt;")
+
+    def test_xml_quoting_handles_ampersands(self):
+        self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&amp;T")
+
+    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEquals(
+            self.sub.substitute_xml("&Aacute;T&T"),
+            "&Aacute;T&amp;T")
+
+
 class TestUnicodeDammit(unittest.TestCase):
     """Standalone tests of Unicode, Dammit."""