summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  CHANGELOG  4
-rw-r--r--  TODO  12
-rw-r--r--  beautifulsoup/dammit.py  2
-rw-r--r--  tests/test_soup.py  8
4 files changed, 16 insertions, 10 deletions
diff --git a/CHANGELOG b/CHANGELOG
index dffab7c..5d13a6d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo".
An HTML or XML entity is always converted into the corresponding
Unicode character. There are no longer any smartQuotesTo or
-convertEntities arguments. (Unicode Dammit still has smartQuotesTo,
-though that may change.)
+convert_entities arguments. (Unicode Dammit still has smart_quotes_to,
+but the default is now to turn smart quotes into Unicode.)
= 3.1.0 =
diff --git a/TODO b/TODO
index ea32bbb..887c426 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,11 @@
-html5lib has its own Unicode, Dammit-like system. Converting the input
-to Unicode should be up to the builder. The lxml builder would use
-Unicode, Dammit, and the html5lib builder would be a no-op.
-
Bare ampersands should be converted to HTML entities upon output.
-It should also be possible to convert certain Unicode characters to
-HTML entities upon output.
+It should also be possible to, on output, convert to HTML entities any
+Unicode characters found in htmlentitydefs.codepoint2name. (This
+algorithm would allow me to simplify Unicode, Dammit--convert
+everything to Unicode, and then convert to entities upon output, not
+treating smart quotes differently from any other Unicode character
+that can be represented as an entity.)
XML handling:
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 6ff3f84..455b0bf 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -53,7 +53,7 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to='xml', isHTML=False):
+ smart_quotes_to=None, isHTML=False):
self.declared_html_encoding = None
self.markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, isHTML)
diff --git a/tests/test_soup.py b/tests/test_soup.py
index c3a19e1..01dff53 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,10 +19,16 @@ class TestSelectiveParsing(SoupTest):
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
- def test_smart_quotes_to_xml_entities(self):
+ def test_smart_quotes_to_unicode(self):
markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEquals(
+ dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+
+ def test_smart_quotes_to_xml_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
+ dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+ self.assertEquals(
dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):