summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2022-04-07 19:17:52 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2022-04-07 19:17:52 -0400
commit: d8e5c8b975102d6567ed172190feacb8811a343f (patch)
tree: eba22a9c1e9492032ad27612106c6f677a3cc43c /bs4/__init__.py
parent: 461a7083d2a924a9aaeadb8a8f6b47cf8b3fb511 (diff)
Omit untrusted input when issuing warnings.
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 91
1 files changed, 45 insertions, 46 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 2371ccf..a16ff26 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,14 +15,13 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.10.0"
-__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
+__version__ = "4.11.0"
+__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
-
from collections import Counter
import os
import re
@@ -315,43 +314,12 @@ class BeautifulSoup(Tag):
(isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, str) and not '<' in markup)
):
- # Print out warnings for a couple beginner problems
+ # Issue warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
- # just in case that's what the user really wants.
- if (isinstance(markup, str)
- and not os.path.supports_unicode_filenames):
- possible_filename = markup.encode("utf8")
- else:
- possible_filename = markup
- is_file = False
- is_directory = False
- try:
- is_file = os.path.exists(possible_filename)
- if is_file:
- is_directory = os.path.isdir(possible_filename)
- except Exception as e:
- # This is almost certainly a problem involving
- # characters not valid in filenames on this
- # system. Just let it go.
- pass
- if is_directory:
- warnings.warn(
- '"%s" looks like a directory name, not markup. You may'
- ' want to open a file found in this directory and pass'
- ' the filehandle into Beautiful Soup.' % (
- self._decode_markup(markup)
- ),
- MarkupResemblesLocatorWarning
- )
- elif is_file:
- warnings.warn(
- '"%s" looks like a filename, not markup. You may'
- ' want to open this file and pass the filehandle into'
- ' Beautiful Soup.' % self._decode_markup(markup),
- MarkupResemblesLocatorWarning
- )
- self._check_markup_is_url(markup)
+ # since that is sometimes the intended behavior.
+ if not self._markup_is_url(markup):
+ self._markup_resembles_filename(markup)
rejections = []
success = False
@@ -414,11 +382,13 @@ class BeautifulSoup(Tag):
return decoded
@classmethod
- def _check_markup_is_url(cls, markup):
+ def _markup_is_url(cls, markup):
"""Error-handling method to raise a warning if incoming markup looks
like a URL.
:param markup: A string.
+ :return: Whether or not the markup resembles a URL
+ closely enough to justify a warning.
"""
if isinstance(markup, bytes):
space = b' '
@@ -427,21 +397,50 @@ class BeautifulSoup(Tag):
space = ' '
cant_start_with = ("http:", "https:")
else:
- return
+ return False
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
warnings.warn(
- '"%s" looks like a URL, not markup. You may want to use'
+ 'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
- ' the URL, and feed that document to Beautiful Soup.' % (
- cls._decode_markup(
- markup
- )
- ),
+ ' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning
)
+ return True
+ return False
+
+ @classmethod
+ def _markup_resembles_filename(cls, markup):
+ """Error-handling method to raise a warning if incoming markup
+ resembles a filename.
+ :param markup: A bytestring or string.
+ :return: Whether or not the markup resembles a filename
+ closely enough to justify a warning.
+ """
+ path_characters = '/\\'
+ extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
+ if isinstance(markup, bytes):
+ path_characters = path_characters.encode("utf8")
+ extensions = [x.encode('utf8') for x in extensions]
+ filelike = False
+ if any(x in markup for x in path_characters):
+ filelike = True
+ else:
+ lower = markup.lower()
+ if any(lower.endswith(ext) for ext in extensions):
+ filelike = True
+ if filelike:
+ warnings.warn(
+ 'The input looks more like a filename than markup. You may'
+ ' want to open this file and pass the filehandle into'
+ ' Beautiful Soup.',
+ MarkupResemblesLocatorWarning
+ )
+ return True
+ return False
+
def _feed(self):
"""Internal method that parses previously set markup, creating a large
number of Tag and NavigableString objects.