diff options
-rw-r--r-- | CHANGELOG | 4
-rw-r--r-- | bs4/__init__.py | 14
-rw-r--r-- | bs4/tests/test_soup.py | 16
3 files changed, 33 insertions, 1 deletion
@@ -4,6 +4,10 @@
   namespaced attribute is the empty string, as opposed to
   None. [bug=1915583]
 
+* Improve the warning issued when a directory name (as opposed to
+  the name of a regular file) is passed as markup into the BeautifulSoup
+  constructor. [bug=1913628]
+
 = 4.9.3 (20201003)
 
 * Implemented a significant performance optimization to the process of
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 8f78809..e33f62a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -321,14 +321,26 @@ class BeautifulSoup(Tag):
             else:
                 possible_filename = markup
             is_file = False
+            is_directory = False
             try:
                 is_file = os.path.exists(possible_filename)
+                if is_file:
+                    is_directory = os.path.isdir(possible_filename)
             except Exception, e:
                 # This is almost certainly a problem involving
                 # characters not valid in filenames on this
                 # system. Just let it go.
                 pass
-            if is_file:
+            if is_directory:
+                warnings.warn(
+                    '"%s" looks like a directory name, not markup. You may'
+                    ' want to open a file found in this directory and pass'
+                    ' the filehandle into Beautiful Soup.' % (
+                        self._decode_markup(markup)
+                    ),
+                    MarkupResemblesLocatorWarning
+                )
+            elif is_file:
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should'
                     ' probably open this file and pass the filehandle into'
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f21edfa..0603ce7 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -3,6 +3,7 @@
 
 from pdb import set_trace
 import logging
+import os
 import unittest
 import sys
 import tempfile
@@ -291,6 +292,21 @@ class TestWarnings(SoupTest):
             soup = self.soup(filename)
         self.assertEqual([], w)
 
+    def test_directory_warning(self):
+        try:
+            filename = tempfile.mkdtemp()
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a directory" in str(warning.message))
+        finally:
+            os.rmdir(filename)
+
+        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(filename)
+        self.assertEqual([], w)
+
     def test_url_warning_with_bytes_url(self):
         with warnings.catch_warnings(record=True) as warning_list:
             soup = self.soup(b"http://www.crummybytes.com/")