diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 12:53:33 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 12:53:33 -0500 |
commit | b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f (patch) | |
tree | f089e9dee8109e0fdfae2589cd8228d4ddee5939 | |
parent | 5962a409b04b8a78d78e9186da97bedbb67df8e6 (diff) |
By default, Unicode Dammit converts smart quotes to Unicode characters, not XML entities.
-rw-r--r-- | CHANGELOG | 4 | ||||
-rw-r--r-- | TODO | 12 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 2 | ||||
-rw-r--r-- | tests/test_soup.py | 8 |
4 files changed, 16 insertions, 10 deletions
@@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo". An HTML or XML entity is always converted into the corresponding Unicode character. There are no longer any smartQuotesTo or -convertEntities arguments. (Unicode Dammit still has smartQuotesTo, -though that may change.) +convert_entities arguments. (Unicode Dammit still has smart_quotes_to, +but the default is now to turn smart quotes into Unicode.) = 3.1.0 = @@ -1,11 +1,11 @@ -html5lib has its own Unicode, Dammit-like system. Converting the input -to Unicode should be up to the builder. The lxml builder would use -Unicode, Dammit, and the html5lib builder would be a no-op. - Bare ampersands should be converted to HTML entities upon output. -It should also be possible to convert certain Unicode characters to -HTML entities upon output. +It should also be possible to, on output, convert to HTML entities any +Unicode characters found in htmlentitydefs.codepoint2name. (This +algorithm would allow me to simplify Unicode, Dammit--convert +everything to Unicode, and then convert to entities upon output, not +treating smart quotes differently from any other Unicode character +that can be represented as an entity.) 
XML handling: diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 6ff3f84..455b0bf 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -53,7 +53,7 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to='xml', isHTML=False): + smart_quotes_to=None, isHTML=False): self.declared_html_encoding = None self.markup, document_encoding, sniffed_encoding = \ self._detectEncoding(markup, isHTML) diff --git a/tests/test_soup.py b/tests/test_soup.py index c3a19e1..01dff53 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -19,10 +19,16 @@ class TestSelectiveParsing(SoupTest): class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" - def test_smart_quotes_to_xml_entities(self): + def test_smart_quotes_to_unicode(self): markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEquals( + dammit.unicode, u"<foo>\u2018\u2019\u201c\u201d</foo>") + + def test_smart_quotes_to_xml_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEquals( dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>") def test_smart_quotes_to_html_entities(self): |