summaryrefslogtreecommitdiff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-07-28 16:58:23 -0400
committerLeonard Richardson <leonardr@segfault.org>2018-07-28 16:58:23 -0400
commit6e319a74343b9efb69517ab178dbea921f438ee1 (patch)
tree7c4e3dbd72b473211d7f9c4509a0a334ae53621f /bs4/builder/_htmlparser.py
parent58cffa003e82049b78f14db73518d000cd05e3d6 (diff)
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r--bs4/builder/_htmlparser.py26
1 files changed, 21 insertions, 5 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError, e:
+ pass
+ if not data:
+ try:
+ data = unichr(real_name)
+ except (ValueError, OverflowError), e:
+ pass
+ data = data or u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):