diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
commit | 6e319a74343b9efb69517ab178dbea921f438ee1 (patch) | |
tree | 7c4e3dbd72b473211d7f9c4509a0a334ae53621f /bs4/builder/_htmlparser.py | |
parent | 58cffa003e82049b78f14db73518d000cd05e3d6 (diff) |
Correctly handle invalid HTML numeric character entities like “
which reference code points that are not Unicode code points. Note
that this is only fixed when Beautiful Soup is used with the
html.parser parser -- html5lib already worked and I couldn't fix it
with lxml. [bug=1782933]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 26 |
1 files changed, 21 insertions, 5 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index eff30ff..ee6c685 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" # Use of this source code is governed by a BSD-style license that can be @@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError, e: + pass + if not data: + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + pass + data = data or u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): |