diff options
-rw-r--r-- | NEWS.txt | 2 | ||||
-rw-r--r-- | bs4/dammit.py | 14 | ||||
-rw-r--r-- | bs4/testing.py | 4 | ||||
-rw-r--r-- | bs4/tests/test_docs.py | 4 |
4 files changed, 15 insertions, 9 deletions
@@ -3,6 +3,8 @@ * Added experimental support for fixing Windows-1252 characters embedded in UTF-8 documents. +* Fixed the handling of &quot; with the built-in parser. [bug=993871] + = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. diff --git a/bs4/dammit.py b/bs4/dammit.py index a3301ee..66a9e9b 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -41,18 +41,18 @@ class EntitySubstitution(object): def _populate_class_variables(): lookup = {} reverse_lookup = {} - characters = [] + characters_for_re = [] for codepoint, name in list(codepoint2name.items()): - if codepoint == 34: + character = unichr(codepoint) + if codepoint != 34: # There's no point in turning the quotation mark into # &quot;, unless it happens within an attribute value, which # is handled elsewhere. - continue - character = unichr(codepoint) - characters.append(character) - lookup[character] = name + characters_for_re.append(character) + lookup[character] = name + # But we do want to turn &quot; into the quotation mark. 
reverse_lookup[name] = character - re_definition = "[%s]" % "".join(characters) + re_definition = "[%s]" % "".join(characters_for_re) return lookup, reverse_lookup, re.compile(re_definition) (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() diff --git a/bs4/testing.py b/bs4/testing.py index b004c18..40dc976 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -217,6 +217,10 @@ class HTMLTreeBuilderSmokeTest(object): self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect) + def test_quot_entity_converted_to_quotation_mark(self): + self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>", + '<p>I said "good day!"</p>') + def test_out_of_range_entity(self): expect = u"\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("&#10000000000000;", expect) diff --git a/bs4/tests/test_docs.py b/bs4/tests/test_docs.py index b7b427d..5b9f677 100644 --- a/bs4/tests/test_docs.py +++ b/bs4/tests/test_docs.py @@ -10,8 +10,8 @@ __all__ = [ import atexit import doctest import os -from pkg_resources import ( - resource_filename, resource_exists, resource_listdir, cleanup_resources) +#from pkg_resources import ( +# resource_filename, resource_exists, resource_listdir, cleanup_resources) import unittest DOCTEST_FLAGS = ( |