diff options
-rw-r--r-- | CHANGELOG | 6 | ||||
-rw-r--r-- | bs4/__init__.py | 91 | ||||
-rw-r--r-- | bs4/element.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 66 |
4 files changed, 81 insertions, 84 deletions
@@ -3,7 +3,7 @@ Beautiful Soup's official support for Python 2 ended on December 31st, 4.9.3. In the Launchpad Bazaar repository, the final revision to support Python 2 was revision 605. -= 4.11.0 (Unreleased) += 4.11.0 (20220407) * Ported unit tests to use pytest. @@ -50,8 +50,8 @@ Python 2 was revision 605. html5lib parser. [bug=1948488] * Standardized the wording of the MarkupResemblesLocatorWarning - warnings to to make them less judgmental about what you ought to - be doing. [bug=1955450] + warnings to omit untrusted input and make the warnings less + judgmental about what you ought to be doing. [bug=1955450] * Removed support for the iconv_codec library, which doesn't seem to exist anymore and was never put up on PyPI. (The closest diff --git a/bs4/__init__.py b/bs4/__init__.py index 2371ccf..a16ff26 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -15,14 +15,13 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.10.0" -__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson" +__version__ = "4.11.0" +__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" __all__ = ['BeautifulSoup'] - from collections import Counter import os import re @@ -315,43 +314,12 @@ class BeautifulSoup(Tag): (isinstance(markup, bytes) and not b'<' in markup) or (isinstance(markup, str) and not '<' in markup) ): - # Print out warnings for a couple beginner problems + # Issue warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, - # just in case that's what the user really wants. - if (isinstance(markup, str) - and not os.path.supports_unicode_filenames): - possible_filename = markup.encode("utf8") - else: - possible_filename = markup - is_file = False - is_directory = False - try: - is_file = os.path.exists(possible_filename) - if is_file: - is_directory = os.path.isdir(possible_filename) - except Exception as e: - # This is almost certainly a problem involving - # characters not valid in filenames on this - # system. Just let it go. - pass - if is_directory: - warnings.warn( - '"%s" looks like a directory name, not markup. You may' - ' want to open a file found in this directory and pass' - ' the filehandle into Beautiful Soup.' % ( - self._decode_markup(markup) - ), - MarkupResemblesLocatorWarning - ) - elif is_file: - warnings.warn( - '"%s" looks like a filename, not markup. You may' - ' want to open this file and pass the filehandle into' - ' Beautiful Soup.' % self._decode_markup(markup), - MarkupResemblesLocatorWarning - ) - self._check_markup_is_url(markup) + # since that is sometimes the intended behavior. + if not self._markup_is_url(markup): + self._markup_resembles_filename(markup) rejections = [] success = False @@ -414,11 +382,13 @@ class BeautifulSoup(Tag): return decoded @classmethod - def _check_markup_is_url(cls, markup): + def _markup_is_url(cls, markup): """Error-handling method to raise a warning if incoming markup looks like a URL. :param markup: A string. + :return: Whether or not the markup resembles a URL + closely enough to justify a warning. """ if isinstance(markup, bytes): space = b' ' @@ -427,21 +397,50 @@ class BeautifulSoup(Tag): space = ' ' cant_start_with = ("http:", "https:") else: - return + return False if any(markup.startswith(prefix) for prefix in cant_start_with): if not space in markup: warnings.warn( - '"%s" looks like a URL, not markup. You may want to use' + 'The input looks more like a URL than markup. You may want to use' ' an HTTP client like requests to get the document behind' - ' the URL, and feed that document to Beautiful Soup.' % ( - cls._decode_markup( - markup - ) - ), + ' the URL, and feed that document to Beautiful Soup.', MarkupResemblesLocatorWarning ) + return True + return False + + @classmethod + def _markup_resembles_filename(cls, markup): + """Error-handling method to raise a warning if incoming markup + resembles a filename. + :param markup: A bytestring or string. + :return: Whether or not the markup resembles a filename + closely enough to justify a warning. + """ + path_characters = '/\\' + extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt'] + if isinstance(markup, bytes): + path_characters = path_characters.encode("utf8") + extensions = [x.encode('utf8') for x in extensions] + filelike = False + if any(x in markup for x in path_characters): + filelike = True + else: + lower = markup.lower() + if any(lower.endswith(ext) for ext in extensions): + filelike = True + if filelike: + warnings.warn( + 'The input looks more like a filename than markup. You may' + ' want to open this file and pass the filehandle into' + ' Beautiful Soup.', + MarkupResemblesLocatorWarning + ) + return True + return False + def _feed(self): """Internal method that parses previously set markup, creating a large number of Tag and NavigableString objects. diff --git a/bs4/element.py b/bs4/element.py index 86123f8..74b1dc0 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1992,7 +1992,7 @@ class Tag(PageElement): has_key() is gone in Python 3, anyway. """ warnings.warn( - 'has_key is deprecated. Use has_attr("%s") instead.' % key, + 'has_key is deprecated. Use has_attr(key) instead.', DeprecationWarning ) return self.has_attr(key) diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 2f53a30..445f74d 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -224,7 +224,7 @@ class TestWarnings(SoupTest): for w in warnings: if isinstance(w.message, cls): return w - raise Exception("%s warning not found in %r" % cls, warnings) + raise Exception("%s warning not found in %r" % (cls, warnings)) def _assert_no_parser_specified(self, w): warning = self._assert_warning(w, GuessedAtParserWarning) @@ -267,67 +267,65 @@ class TestWarnings(SoupTest): with pytest.raises(TypeError): self.soup("<a>", no_such_argument=True) - def test_disk_file_warning(self): - filehandle = tempfile.NamedTemporaryFile() - filename = filehandle.name - try: - with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) - warning = self._assert_warning(w, MarkupResemblesLocatorWarning) - assert "looks like a filename" in str(warning.message) - finally: - filehandle.close() - - # The file no longer exists, so Beautiful Soup will no longer issue the warning. + @pytest.mark.parametrize( + "extension", + ['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt', + 'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\file"] + ) + def test_resembles_filename_warning(self, extension): + # A warning is issued if the "markup" looks like the name of + # an HTML or text file, or a full path to a file on disk. with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) - assert [] == w - - def test_directory_warning(self): - try: - filename = tempfile.mkdtemp() - with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) + soup = self.soup("markup" + extension) warning = self._assert_warning(w, MarkupResemblesLocatorWarning) - assert "looks like a directory" in str(warning.message) - finally: - os.rmdir(filename) - - # The directory no longer exists, so Beautiful Soup will no longer issue the warning. + assert "looks more like a filename" in str(warning.message) + + @pytest.mark.parametrize( + "extension", + ['markuphtml', 'markup.com', '', 'markup.js'] + ) + def test_resembles_filename_no_warning(self, extension): + # The 'looks more like a filename' warning is not issued if + # the markup looks like a bare string, a domain name, or a + # file that's not an HTML file. with warnings.catch_warnings(record=True) as w: - soup = self.soup(filename) + soup = self.soup("markup" + extension) assert [] == w def test_url_warning_with_bytes_url(self): + url = b"http://www.crummybytes.com/" with warnings.catch_warnings(record=True) as warning_list: - soup = self.soup(b"http://www.crummybytes.com/") + soup = self.soup(url) warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) - assert "looks like a URL" in str(warning.message) - + assert "looks more like a URL" in str(warning.message) + assert url not in str(warning.message).encode("utf8") + def test_url_warning_with_unicode_url(self): + url = "http://www.crummyunicode.com/" with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning - soup = self.soup("http://www.crummyunicode.com/") + soup = self.soup(url) warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) - assert "looks like a URL" in str(warning.message) + assert "looks more like a URL" in str(warning.message) + assert url not in str(warning.message) def test_url_warning_with_bytes_and_space(self): # Here the markup contains something besides a URL, so no warning # is issued. with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/ is great") - assert not any("looks like a URL" in str(w.message) + assert not any("looks more like a URL" in str(w.message) for w in warning_list) def test_url_warning_with_unicode_and_space(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup("http://www.crummyunicode.com/ is great") - assert not any("looks like a URL" in str(w.message) + assert not any("looks more like a URL" in str(w.message) for w in warning_list) |