summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org>, 2022-04-07 19:17:52 -0400
committer: Leonard Richardson <leonardr@segfault.org>, 2022-04-07 19:17:52 -0400
commit: d8e5c8b975102d6567ed172190feacb8811a343f (patch)
tree: eba22a9c1e9492032ad27612106c6f677a3cc43c
parent: 461a7083d2a924a9aaeadb8a8f6b47cf8b3fb511 (diff)
Omit untrusted input when issuing warnings.
-rw-r--r-- CHANGELOG 6
-rw-r--r-- bs4/__init__.py 91
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/tests/test_soup.py 66
4 files changed, 81 insertions, 84 deletions
diff --git a/CHANGELOG b/CHANGELOG
index c8889cd..909d852 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,7 +3,7 @@ Beautiful Soup's official support for Python 2 ended on December 31st,
4.9.3. In the Launchpad Bazaar repository, the final revision to support
Python 2 was revision 605.
-= 4.11.0 (Unreleased)
+= 4.11.0 (20220407)
* Ported unit tests to use pytest.
@@ -50,8 +50,8 @@ Python 2 was revision 605.
html5lib parser. [bug=1948488]
* Standardized the wording of the MarkupResemblesLocatorWarning
- warnings to to make them less judgmental about what you ought to
- be doing. [bug=1955450]
+ warnings to omit untrusted input and make the warnings less
+ judgmental about what you ought to be doing. [bug=1955450]
* Removed support for the iconv_codec library, which doesn't seem
to exist anymore and was never put up on PyPI. (The closest
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 2371ccf..a16ff26 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,14 +15,13 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.10.0"
-__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
+__version__ = "4.11.0"
+__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
-
from collections import Counter
import os
import re
@@ -315,43 +314,12 @@ class BeautifulSoup(Tag):
(isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, str) and not '<' in markup)
):
- # Print out warnings for a couple beginner problems
+ # Issue warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
- # just in case that's what the user really wants.
- if (isinstance(markup, str)
- and not os.path.supports_unicode_filenames):
- possible_filename = markup.encode("utf8")
- else:
- possible_filename = markup
- is_file = False
- is_directory = False
- try:
- is_file = os.path.exists(possible_filename)
- if is_file:
- is_directory = os.path.isdir(possible_filename)
- except Exception as e:
- # This is almost certainly a problem involving
- # characters not valid in filenames on this
- # system. Just let it go.
- pass
- if is_directory:
- warnings.warn(
- '"%s" looks like a directory name, not markup. You may'
- ' want to open a file found in this directory and pass'
- ' the filehandle into Beautiful Soup.' % (
- self._decode_markup(markup)
- ),
- MarkupResemblesLocatorWarning
- )
- elif is_file:
- warnings.warn(
- '"%s" looks like a filename, not markup. You may'
- ' want to open this file and pass the filehandle into'
- ' Beautiful Soup.' % self._decode_markup(markup),
- MarkupResemblesLocatorWarning
- )
- self._check_markup_is_url(markup)
+ # since that is sometimes the intended behavior.
+ if not self._markup_is_url(markup):
+ self._markup_resembles_filename(markup)
rejections = []
success = False
@@ -414,11 +382,13 @@ class BeautifulSoup(Tag):
return decoded
@classmethod
- def _check_markup_is_url(cls, markup):
+ def _markup_is_url(cls, markup):
"""Error-handling method to raise a warning if incoming markup looks
like a URL.
:param markup: A string.
+ :return: Whether or not the markup resembles a URL
+ closely enough to justify a warning.
"""
if isinstance(markup, bytes):
space = b' '
@@ -427,21 +397,50 @@ class BeautifulSoup(Tag):
space = ' '
cant_start_with = ("http:", "https:")
else:
- return
+ return False
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
warnings.warn(
- '"%s" looks like a URL, not markup. You may want to use'
+ 'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
- ' the URL, and feed that document to Beautiful Soup.' % (
- cls._decode_markup(
- markup
- )
- ),
+ ' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning
)
+ return True
+ return False
+
+ @classmethod
+ def _markup_resembles_filename(cls, markup):
+ """Error-handling method to raise a warning if incoming markup
+ resembles a filename.
+ :param markup: A bytestring or string.
+ :return: Whether or not the markup resembles a filename
+ closely enough to justify a warning.
+ """
+ path_characters = '/\\'
+ extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
+ if isinstance(markup, bytes):
+ path_characters = path_characters.encode("utf8")
+ extensions = [x.encode('utf8') for x in extensions]
+ filelike = False
+ if any(x in markup for x in path_characters):
+ filelike = True
+ else:
+ lower = markup.lower()
+ if any(lower.endswith(ext) for ext in extensions):
+ filelike = True
+ if filelike:
+ warnings.warn(
+ 'The input looks more like a filename than markup. You may'
+ ' want to open this file and pass the filehandle into'
+ ' Beautiful Soup.',
+ MarkupResemblesLocatorWarning
+ )
+ return True
+ return False
+
def _feed(self):
"""Internal method that parses previously set markup, creating a large
number of Tag and NavigableString objects.
diff --git a/bs4/element.py b/bs4/element.py
index 86123f8..74b1dc0 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1992,7 +1992,7 @@ class Tag(PageElement):
has_key() is gone in Python 3, anyway.
"""
warnings.warn(
- 'has_key is deprecated. Use has_attr("%s") instead.' % key,
+ 'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning
)
return self.has_attr(key)
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 2f53a30..445f74d 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -224,7 +224,7 @@ class TestWarnings(SoupTest):
for w in warnings:
if isinstance(w.message, cls):
return w
- raise Exception("%s warning not found in %r" % cls, warnings)
+ raise Exception("%s warning not found in %r" % (cls, warnings))
def _assert_no_parser_specified(self, w):
warning = self._assert_warning(w, GuessedAtParserWarning)
@@ -267,67 +267,65 @@ class TestWarnings(SoupTest):
with pytest.raises(TypeError):
self.soup("<a>", no_such_argument=True)
- def test_disk_file_warning(self):
- filehandle = tempfile.NamedTemporaryFile()
- filename = filehandle.name
- try:
- with warnings.catch_warnings(record=True) as w:
- soup = self.soup(filename)
- warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
- assert "looks like a filename" in str(warning.message)
- finally:
- filehandle.close()
-
- # The file no longer exists, so Beautiful Soup will no longer issue the warning.
+ @pytest.mark.parametrize(
+ "extension",
+ ['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt',
+ 'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\file"]
+ )
+ def test_resembles_filename_warning(self, extension):
+ # A warning is issued if the "markup" looks like the name of
+ # an HTML or text file, or a full path to a file on disk.
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(filename)
- assert [] == w
-
- def test_directory_warning(self):
- try:
- filename = tempfile.mkdtemp()
- with warnings.catch_warnings(record=True) as w:
- soup = self.soup(filename)
+ soup = self.soup("markup" + extension)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
- assert "looks like a directory" in str(warning.message)
- finally:
- os.rmdir(filename)
-
- # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
+ assert "looks more like a filename" in str(warning.message)
+
+ @pytest.mark.parametrize(
+ "extension",
+ ['markuphtml', 'markup.com', '', 'markup.js']
+ )
+ def test_resembles_filename_no_warning(self, extension):
+ # The 'looks more like a filename' warning is not issued if
+ # the markup looks like a bare string, a domain name, or a
+ # file that's not an HTML file.
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(filename)
+ soup = self.soup("markup" + extension)
assert [] == w
def test_url_warning_with_bytes_url(self):
+ url = b"http://www.crummybytes.com/"
with warnings.catch_warnings(record=True) as warning_list:
- soup = self.soup(b"http://www.crummybytes.com/")
+ soup = self.soup(url)
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
- assert "looks like a URL" in str(warning.message)
-
+ assert "looks more like a URL" in str(warning.message)
+ assert url not in str(warning.message).encode("utf8")
+
def test_url_warning_with_unicode_url(self):
+ url = "http://www.crummyunicode.com/"
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
- soup = self.soup("http://www.crummyunicode.com/")
+ soup = self.soup(url)
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
- assert "looks like a URL" in str(warning.message)
+ assert "looks more like a URL" in str(warning.message)
+ assert url not in str(warning.message)
def test_url_warning_with_bytes_and_space(self):
# Here the markup contains something besides a URL, so no warning
# is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
- assert not any("looks like a URL" in str(w.message)
+ assert not any("looks more like a URL" in str(w.message)
for w in warning_list)
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup("http://www.crummyunicode.com/ is great")
- assert not any("looks like a URL" in str(w.message)
+ assert not any("looks more like a URL" in str(w.message)
for w in warning_list)