From 6e319a74343b9efb69517ab178dbea921f438ee1 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sat, 28 Jul 2018 16:58:23 -0400
Subject: Correctly handle invalid HTML numeric character entities like &#147; 
  which reference code points that are not Unicode code points. Note   that
 this is only fixed when Beautiful Soup is used with the   html.parser parser
 -- html5lib already worked and I couldn't fix it   with lxml.  [bug=1782933]

---
 bs4/testing.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'bs4/testing.py')
diff --git a/bs4/testing.py b/bs4/testing.py
index bbcc271..745a9c4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Helper classes for tests."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -326,6 +327,18 @@ Hello, world!
             u"<p>&bull; AT&T is in the s&p 500</p>",
             u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
         )
+
+    def test_entities_in_foreign_document_encoding(self):
+        # &#147; and &#148; are invalid numeric entities referencing
+        # Windows-1252 characters (curly quotes). &#45; references a
+        # character common to Windows-1252 and Unicode, and &#9731;
+        # references a character only found in Unicode.
+        #
+        # All of these entities should be converted to Unicode
+        # characters.
+        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+        soup = self.soup(markup)
+        self.assertEqual(u"“Hello” -☃", soup.p.string)
         
     def test_entities_in_attributes_converted_to_unicode(self):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
-- 
cgit v1.2.3