diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-03-10 09:18:39 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-03-10 09:18:39 -0400 |
commit | 7133d0a1ed1262528baf73fc5f65bcbc8841a041 (patch) | |
tree | 83f0718082ea5816b18acc8db61588f2dd749a7e /bs4/__init__.py | |
parent | 542d5d3d3bd33661d593e224f62ed4950949ce60 (diff) |
Fixed a bug that happened when passing a Unicode filename containing
non-ASCII characters as markup into Beautiful Soup, on a system that
allows Unicode filenames. [bug=1866717]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 30 |
1 files changed, 20 insertions, 10 deletions
```diff
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 3cd2e15..f828cd2 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -306,12 +306,11 @@ class BeautifulSoup(Tag):
                 # system. Just let it go.
                 pass
             if is_file:
-                if isinstance(markup, unicode):
-                    markup = markup.encode("utf8")
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should'
                     ' probably open this file and pass the filehandle into'
-                    ' Beautiful Soup.' % markup)
+                    ' Beautiful Soup.' % self._decode_markup(markup)
+                )
             self._check_markup_is_url(markup)
 
         rejections = []
@@ -360,8 +359,21 @@ class BeautifulSoup(Tag):
             d['builder'] = None
         return d
 
-    @staticmethod
-    def _check_markup_is_url(markup):
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
         """Error-handling method to raise a warning if incoming markup looks
         like a URL.
 
@@ -378,15 +390,13 @@ class BeautifulSoup(Tag):
 
         if any(markup.startswith(prefix) for prefix in cant_start_with):
             if not space in markup:
-                if isinstance(markup, bytes):
-                    decoded_markup = markup.decode('utf-8', 'replace')
-                else:
-                    decoded_markup = markup
                 warnings.warn(
                     '"%s" looks like a URL. Beautiful Soup is not an'
                     ' HTTP client. You should probably use an HTTP client like'
                     ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % decoded_markup
+                    ' that document to Beautiful Soup.' % cls._decode_markup(
+                        markup
+                    )
                 )
 
     def _feed(self):
```