summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:53:33 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-18 12:53:33 -0500
commit: b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (patch)
tree: f089e9dee8109e0fdfae2589cd8228d4ddee5939
parent: 5962a409b04b8a78d78e9186da97bedbb67df8e6 (diff)
By default, Unicode Dammit converts smart quotes to Unicode characters, not XML entities.
-rw-r--r-- CHANGELOG 4
-rw-r--r-- TODO 12
-rw-r--r-- beautifulsoup/dammit.py 2
-rw-r--r-- tests/test_soup.py 8
4 files changed, 16 insertions, 10 deletions
diff --git a/CHANGELOG b/CHANGELOG
index dffab7c..5d13a6d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo".
An HTML or XML entity is always converted into the corresponding
Unicode character. There are no longer any smartQuotesTo or
-convertEntities arguments. (Unicode Dammit still has smartQuotesTo,
-though that may change.)
+convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
+but the default is now to turn smart quotes into Unicode.)
= 3.1.0 =
diff --git a/TODO b/TODO
index ea32bbb..887c426 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,11 @@
-html5lib has its own Unicode, Dammit-like system. Converting the input
-to Unicode should be up to the builder. The lxml builder would use
-Unicode, Dammit, and the html5lib builder would be a no-op.
-
Bare ampersands should be converted to HTML entities upon output.
-It should also be possible to convert certain Unicode characters to
-HTML entities upon output.
+It should also be possible to, on output, convert to HTML entities any
+Unicode characters found in htmlentitydefs.codepoint2name. (This
+algorithm would allow me to simplify Unicode, Dammit--convert
+everything to Unicode, and then convert to entities upon output, not
+treating smart quotes differently from any other Unicode character
+that can be represented as an entity.)
XML handling:
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 6ff3f84..455b0bf 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -53,7 +53,7 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to='xml', isHTML=False):
+ smart_quotes_to=None, isHTML=False):
self.declared_html_encoding = None
self.markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, isHTML)
diff --git a/tests/test_soup.py b/tests/test_soup.py
index c3a19e1..01dff53 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,10 +19,16 @@ class TestSelectiveParsing(SoupTest):
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
- def test_smart_quotes_to_xml_entities(self):
+ def test_smart_quotes_to_unicode(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEquals(
+ dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+ def test_smart_quotes_to_xml_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
+ dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+ self.assertEquals(
dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):