summary refs log tree commit diff
path: root/bs4
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2012-05-03 10:29:55 -0400
committer Leonard Richardson <leonardr@segfault.org> 2012-05-03 10:29:55 -0400
commit 0401057f29c9c8e6ee781aa9ca6fd1a395a4b084 (patch)
tree 7efefe13bf03632255b59f6241bad4c660de3e4f /bs4
parent 32a764727d7ae697945e70a942eab8899bc2f30d (diff)
Fixed the handling of &quot; with the built-in parser. [bug=993871]
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/dammit.py 14
-rw-r--r-- bs4/testing.py 4
-rw-r--r-- bs4/tests/test_docs.py 4
3 files changed, 13 insertions, 9 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a3301ee..66a9e9b 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -41,18 +41,18 @@ class EntitySubstitution(object):
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
- characters = []
+ characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
- if codepoint == 34:
+ character = unichr(codepoint)
+ if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
- continue
- character = unichr(codepoint)
- characters.append(character)
- lookup[character] = name
+ characters_for_re.append(character)
+ lookup[character] = name
+ # But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
- re_definition = "[%s]" % "".join(characters)
+ re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
diff --git a/bs4/testing.py b/bs4/testing.py
index b004c18..40dc976 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -217,6 +217,10 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+ def test_quot_entity_converted_to_quotation_mark(self):
+ self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+ '<p>I said "good day!"</p>')
+
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
diff --git a/bs4/tests/test_docs.py b/bs4/tests/test_docs.py
index b7b427d..5b9f677 100644
--- a/bs4/tests/test_docs.py
+++ b/bs4/tests/test_docs.py
@@ -10,8 +10,8 @@ __all__ = [
import atexit
import doctest
import os
-from pkg_resources import (
- resource_filename, resource_exists, resource_listdir, cleanup_resources)
+#from pkg_resources import (
+# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (