summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt2
-rw-r--r--bs4/dammit.py14
-rw-r--r--bs4/testing.py4
-rw-r--r--bs4/tests/test_docs.py4
4 files changed, 15 insertions, 9 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 3f6ad7a..934246b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,8 @@
* Added experimental support for fixing Windows-1252 characters
embedded in UTF-8 documents.
+* Fixed the handling of &quot; with the built-in parser. [bug=993871]
+
= 4.0.5 (20120427) =
* Added a new method, wrap(), which wraps an element in a tag.
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a3301ee..66a9e9b 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -41,18 +41,18 @@ class EntitySubstitution(object):
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
- characters = []
+ characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
- if codepoint == 34:
+ character = unichr(codepoint)
+ if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
- continue
- character = unichr(codepoint)
- characters.append(character)
- lookup[character] = name
+ characters_for_re.append(character)
+ lookup[character] = name
+ # But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
- re_definition = "[%s]" % "".join(characters)
+ re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
diff --git a/bs4/testing.py b/bs4/testing.py
index b004c18..40dc976 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -217,6 +217,10 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+ def test_quot_entity_converted_to_quotation_mark(self):
+ self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+ '<p>I said "good day!"</p>')
+
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
diff --git a/bs4/tests/test_docs.py b/bs4/tests/test_docs.py
index b7b427d..5b9f677 100644
--- a/bs4/tests/test_docs.py
+++ b/bs4/tests/test_docs.py
@@ -10,8 +10,8 @@ __all__ = [
import atexit
import doctest
import os
-from pkg_resources import (
- resource_filename, resource_exists, resource_listdir, cleanup_resources)
+#from pkg_resources import (
+# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (