Correctly handle invalid HTML numeric character entities like &#147; which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit: 6e319a74343b9efb69517ab178dbea921f438ee1 (patch)
tree: 7c4e3dbd72b473211d7f9c4509a0a334ae53621f /bs4/builder/_htmlparser.py
parent: 58cffa003e82049b78f14db73518d000cd05e3d6 (diff)
1 files changed, 21 insertions, 5 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)
 
-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
+            try:
+                data = unichr(real_name)
+            except (ValueError, OverflowError), e:
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
author	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
commit	6e319a74343b9efb69517ab178dbea921f438ee1 (patch)
tree	7c4e3dbd72b473211d7f9c4509a0a334ae53621f /bs4/builder/_htmlparser.py
parent	58cffa003e82049b78f14db73518d000cd05e3d6 (diff)