Created an EntitySubstitution class that's going to take code away from UnicodeDammit, Entities, and BeautifulSoup.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-21 17:03:13 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-21 17:03:13 -0500
commit: 274d94dc13ffeb80c587f68bbad267f4f5199b9e (patch)
tree: 79d0597c5cb1a5a514b25fbd8de7a55ae5123f94
parent: ce3742abd4c7fe39247569e82e2b3acdd6052bb1 (diff)
2 files changed, 189 insertions, 1 deletions
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 455b0bf..06e142e 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -33,6 +33,129 @@ except ImportError:
     pass
 
 
+from htmlentitydefs import codepoint2name
+import re
+
+class EntitySubstitution(object):
+    CHARACTER_TO_HTML_ENTITY = None
+    CHARACTER_TO_HTML_ENTITY_RE = None
+
+    CHARACTER_TO_XML_ENTITY = {
+        "'" : "apos",
+        '"' : "quot",
+        "&" : "amp",
+        "<" : "lt",
+        ">" : "gt",
+        }
+
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           ")")
+
+    @classmethod
+    def _initialize_lookup(cls):
+        if cls.CHARACTER_TO_HTML_ENTITY is not None:
+            return
+        lookup = {}
+        characters = []
+        for codepoint, name in codepoint2name.items():
+            character = unichr(codepoint)
+            characters.append(character)
+            lookup[character] = name
+        re_definition = "[%s]" % "".join(characters)
+        cls.CHARACTER_TO_HTML_ENTITY = lookup
+        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
+
+    def __init__(self):
+        # Initialize the class variables if not already initialized
+        self._initialize_lookup()
+
+    def _substitute_html_entity(self, matchobj):
+        entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+        return "&%s;" % entity
+
+    def _substitute_xml_entity(self, matchobj):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+        return "&%s;" % entity
+
+    def substitute_xml(self, value, make_quoted_attribute=False,
+                       destination_is_xml=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+
+         Ordinarily, the string will be quoted using double quotes.
+
+          Bob's Bar -> "Bob's Bar"
+
+         If the string contains double quotes, it will be quoted using
+         single quotes.
+
+          Welcome to "my bar" -> 'Welcome to "my bar"'
+
+         If the string contains both single and double quotes, the
+         single quotes will be escaped (see `destination_is_xml`), and
+         the string will be quoted using single quotes.
+
+          Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s Bar"'
+                                              OR
+                                    'Welcome to "Bob&apos;s Bar"'
+          (depending on the value of `destination_is_xml`)
+
+         :param destination_is_xml: If destination_is_xml is True,
+          then when a single quote is escaped it will become
+          "&apos;". But &apos; is not a valid HTML 4 entity. If
+          destination_is_xml is False, then single quotes will be
+          turned into "&squot;".
+
+          The value of this argument is irrelevant unless
+          make_quoted_attribute is True.
+        """
+        quote_with = '"'
+        if make_quoted_attribute:
+            if '"' in value:
+                quote_with = "'"
+                if "'" in value:
+                    if destination_is_xml:
+                        replace_with = "&apos;"
+                    else:
+                        replace_with = "&squot;"
+                    value = value.replace("'", replace_with)
+
+        # Escape angle brackets, and ampersands that aren't part of
+        # entities.
+        value = self.BARE_AMPERSAND_OR_BRACKET.sub(
+            self._substitute_xml_entity, value)
+        if make_quoted_attribute:
+            return quote_with + value + quote_with
+        else:
+            return value
+
+    def substitute_html(self, s):
+        """Replace certain Unicode characters with named HTML entities.
+
+        This differs from s.encode(encoding, 'xmlcharrefreplace')
+        in that the goal is to make the result more readable (to those
+        with ASCII displays) rather than to recover from
+        errors. There's absolutely nothing wrong with a UTF-8 string
+        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+        character with "&eacute;" will make it more readable to some
+        people.
+        """
+        return self.CHARACTER_TO_HTML_ENTITY_RE.sub(
+            self._substitute_html_entity, s)
+
+
+
+
 class UnicodeDammit:
     """A class for detecting the encoding of a *ML document and
     converting it to a Unicode string. If the source encoding is
diff --git a/tests/test_soup.py b/tests/test_soup.py
index bb2262a..eaedd94 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -3,7 +3,7 @@
 
 import unittest
 from beautifulsoup.element import SoupStrainer
-from beautifulsoup.dammit import UnicodeDammit
+from beautifulsoup.dammit import EntitySubstitution, UnicodeDammit
 from beautifulsoup.testing import SoupTest
 
 
@@ -16,6 +16,71 @@ class TestSelectiveParsing(SoupTest):
         self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
 
 
+class TestEntitySubstitution(unittest.TestCase):
+    """Standalone tests of the EntitySubstitution class."""
+    def setUp(self):
+        self.sub = EntitySubstitution()
+
+    def test_simple_html_substitution(self):
+        # Unicode characters corresponding to named HTML entities
+        # are substituted, and no others.
+        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+        self.assertEquals(self.sub.substitute_html(s),
+                          u"foo&forall;\N{SNOWMAN}&otilde;bar")
+
+    def test_smart_quote_substitution(self):
+        # MS smart quotes are a common source of frustration, so we
+        # give them a special test.
+        quotes = "\x91\x92foo\x93\x94"
+        dammit = UnicodeDammit(quotes)
+        self.assertEquals(self.sub.substitute_html(dammit.markup),
+                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
+
+    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
+        s = 'Welcome to "my bar"'
+        self.assertEquals(self.sub.substitute_xml(s, False), s)
+
+    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
+        self.assertEquals(self.sub.substitute_xml("Welcome", True),
+                          '"Welcome"')
+        self.assertEquals(self.sub.substitute_xml("Bob's Bar", True),
+                          '"Bob\'s Bar"')
+
+    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
+        s = 'Welcome to "my bar"'
+        self.assertEquals(self.sub.substitute_xml(s, True),
+                          "'Welcome to \"my bar\"'")
+
+    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
+        s = 'Welcome to "Bob\'s Bar"'
+        # This one is going into an HTML document.
+        self.assertEquals(
+            self.sub.substitute_xml(s, True),
+            "'Welcome to \"Bob&squot;s Bar\"'")
+
+        # This one is going into an XML document.
+        self.assertEquals(
+            self.sub.substitute_xml(s, True, destination_is_xml=True),
+            "'Welcome to \"Bob&apos;s Bar\"'")
+
+    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
+        quoted = 'Welcome to "Bob\'s Bar"'
+        self.assertEquals(self.sub.substitute_xml(quoted), quoted)
+
+    def test_xml_quoting_handles_angle_brackets(self):
+        self.assertEquals(
+            self.sub.substitute_xml("foo<bar>"),
+            "foo&lt;bar&gt;")
+
+    def test_xml_quoting_handles_ampersands(self):
+        self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&amp;T")
+
+    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEquals(
+            self.sub.substitute_xml("&Aacute;T&T"),
+            "&Aacute;T&amp;T")
+
+
 class TestUnicodeDammit(unittest.TestCase):
     """Standalone tests of Unicode, Dammit."""
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-21 17:03:13 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-21 17:03:13 -0500
commit	274d94dc13ffeb80c587f68bbad267f4f5199b9e (patch)
tree	79d0597c5cb1a5a514b25fbd8de7a55ae5123f94
parent	ce3742abd4c7fe39247569e82e2b3acdd6052bb1 (diff)