From b5fa9d7f5579f22f5fe0f7c9dc63e0aa7d29262f Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 18 Feb 2011 12:53:33 -0500 Subject: By default, Unicode Dammit converts smart quotes to Unicode characters, not XML entities. --- CHANGELOG | 4 ++-- TODO | 12 ++++++------ beautifulsoup/dammit.py | 2 +- tests/test_soup.py | 8 +++++++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index dffab7c..5d13a6d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -55,8 +55,8 @@ The value of a.string used to be None, and now it's "foo". An HTML or XML entity is always converted into the corresponding Unicode character. There are no longer any smartQuotesTo or -convertEntities arguments. (Unicode Dammit still has smartQuotesTo, -though that may change.) +convert_entities arguments. (Unicode Dammit still has smart_quotes_to, +but the default is now to turn smart quotes into Unicode.) = 3.1.0 = diff --git a/TODO b/TODO index ea32bbb..887c426 100644 --- a/TODO +++ b/TODO @@ -1,11 +1,11 @@ -html5lib has its own Unicode, Dammit-like system. Converting the input -to Unicode should be up to the builder. The lxml builder would use -Unicode, Dammit, and the html5lib builder would be a no-op. - Bare ampersands should be converted to HTML entities upon output. -It should also be possible to convert certain Unicode characters to -HTML entities upon output. +It should also be possible to, on output, convert to HTML entities any +Unicode characters found in htmlentitydefs.codepoint2name. (This +algorithm would allow me to simplify Unicode, Dammit--convert +everything to Unicode, and then convert to entities upon output, not +treating smart quotes differently from any other Unicode character +that can be represented as an entity.) 
XML handling: diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 6ff3f84..455b0bf 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -53,7 +53,7 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to='xml', isHTML=False): + smart_quotes_to=None, isHTML=False): self.declared_html_encoding = None self.markup, document_encoding, sniffed_encoding = \ self._detectEncoding(markup, isHTML) diff --git a/tests/test_soup.py b/tests/test_soup.py index c3a19e1..01dff53 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -19,9 +19,15 @@ class TestSelectiveParsing(SoupTest): class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" - def test_smart_quotes_to_xml_entities(self): + def test_smart_quotes_to_unicode(self): markup = "\x91\x92\x93\x94" dammit = UnicodeDammit(markup) + self.assertEquals( + dammit.unicode, u"\u2018\u2019\u201c\u201d") + + def test_smart_quotes_to_xml_entities(self): + markup = "\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEquals( dammit.unicode, "&#x2018;&#x2019;&#x201C;&#x201D;") -- cgit v1.2.3