summaryrefslogtreecommitdiff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2020-03-10 09:18:39 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-03-10 09:18:39 -0400
commit: 7133d0a1ed1262528baf73fc5f65bcbc8841a041 (patch)
tree: 83f0718082ea5816b18acc8db61588f2dd749a7e /bs4/__init__.py
parent: 542d5d3d3bd33661d593e224f62ed4950949ce60 (diff)
Fixed a bug that happened when passing a Unicode filename containing
non-ASCII characters as markup into Beautiful Soup, on a system that allows Unicode filenames. [bug=1866717]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 30
1 files changed, 20 insertions, 10 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 3cd2e15..f828cd2 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -306,12 +306,11 @@ class BeautifulSoup(Tag):
# system. Just let it go.
pass
if is_file:
- if isinstance(markup, unicode):
- markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
' probably open this file and pass the filehandle into'
- ' Beautiful Soup.' % markup)
+ ' Beautiful Soup.' % self._decode_markup(markup)
+ )
self._check_markup_is_url(markup)
rejections = []
@@ -360,8 +359,21 @@ class BeautifulSoup(Tag):
d['builder'] = None
return d
- @staticmethod
- def _check_markup_is_url(markup):
+ @classmethod
+ def _decode_markup(cls, markup):
+ """Ensure `markup` is Unicode text (not bytes) so it's safe to send into warnings.warn.
+
+ TODO: warnings.warn had this problem back in 2010 but it might not
+ have it anymore.
+ """
+ if isinstance(markup, bytes):
+ decoded = markup.decode('utf-8', 'replace')
+ else:
+ decoded = markup
+ return decoded
+
+ @classmethod
+ def _check_markup_is_url(cls, markup):
"""Error-handling method to raise a warning if incoming markup looks
like a URL.
@@ -378,15 +390,13 @@ class BeautifulSoup(Tag):
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
- if isinstance(markup, bytes):
- decoded_markup = markup.decode('utf-8', 'replace')
- else:
- decoded_markup = markup
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an'
' HTTP client. You should probably use an HTTP client like'
' requests to get the document behind the URL, and feed'
- ' that document to Beautiful Soup.' % decoded_markup
+ ' that document to Beautiful Soup.' % cls._decode_markup(
+ markup
+ )
)
def _feed(self):