From 6e319a74343b9efb69517ab178dbea921f438ee1 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sat, 28 Jul 2018 16:58:23 -0400
Subject: Correctly handle invalid HTML numeric character entities like &#147; 
  which reference code points that are not Unicode code points. Note   that
 this is only fixed when Beautiful Soup is used with the   html.parser parser
 -- html5lib already worked and I couldn't fix it   with lxml.  [bug=1782933]

---
 bs4/builder/_htmlparser.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'bs4/builder/_htmlparser.py')

diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)
 
-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
+            try:
+                data = unichr(real_name)
+            except (ValueError, OverflowError), e:
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
-- 
cgit v1.2.3