By default, "Unicode, Dammit" now converts smart quotes to Unicode characters rather than to XML entities.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:53:33 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:53:33 -0500
commit: b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (patch)
tree: f089e9dee8109e0fdfae2589cd8228d4ddee5939
parent: 5962a409b04b8a78d78e9186da97bedbb67df8e6 (diff)
4 files changed, 16 insertions, 10 deletions
diff --git a/CHANGELOG b/CHANGELOG
index dffab7c..5d13a6d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo".
 
 An HTML or XML entity is always converted into the corresponding
 Unicode character. There are no longer any smartQuotesTo or
-convertEntities arguments. (Unicode Dammit still has smartQuotesTo,
-though that may change.)
+convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
+but the default is now to turn smart quotes into Unicode.)
 
 = 3.1.0 =
 
diff --git a/TODO b/TODO
index ea32bbb..887c426 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,11 @@
-html5lib has its own Unicode, Dammit-like system. Converting the input
-to Unicode should be up to the builder. The lxml builder would use
-Unicode, Dammit, and the html5lib builder would be a no-op.
-
 Bare ampersands should be converted to HTML entities upon output.
 
-It should also be possible to convert certain Unicode characters to
-HTML entities upon output.
+It should also be possible to, on output, convert to HTML entities any
+Unicode characters found in htmlentitydefs.codepoint2name. (This
+algorithm would allow me to simplify Unicode, Dammit--convert
+everything to Unicode, and then convert to entities upon output, not
+treating smart quotes differently from any other Unicode character
+that can be represented as an entity.)
 
 XML handling:
 
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 6ff3f84..455b0bf 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -53,7 +53,7 @@ class UnicodeDammit:
         ]
 
     def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to='xml', isHTML=False):
+                 smart_quotes_to=None, isHTML=False):
         self.declared_html_encoding = None
         self.markup, document_encoding, sniffed_encoding = \
                      self._detectEncoding(markup, isHTML)
diff --git a/tests/test_soup.py b/tests/test_soup.py
index c3a19e1..01dff53 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,10 +19,16 @@ class TestSelectiveParsing(SoupTest):
 class TestUnicodeDammit(unittest.TestCase):
     """Standalone tests of Unicode, Dammit."""
 
-    def test_smart_quotes_to_xml_entities(self):
+    def test_smart_quotes_to_unicode(self):
         markup = "<foo>\x91\x92\x93\x94</foo>"
         dammit = UnicodeDammit(markup)
         self.assertEquals(
+            dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+    def test_smart_quotes_to_xml_entities(self):
+        markup = "<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+        self.assertEquals(
             dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
 
     def test_smart_quotes_to_html_entities(self):
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:53:33 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-18 12:53:33 -0500
commit	b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (patch)
tree	f089e9dee8109e0fdfae2589cd8228d4ddee5939
parent	5962a409b04b8a78d78e9186da97bedbb67df8e6 (diff)