summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-04-21 08:17:28 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-04-21 08:17:28 -0400
commit c913ce03775a2e08e970c1c537ad43cf03cbd417 (patch)
tree 9c57cd6b86a29a4927767f3cacf921c40244bb30
parent c39b3d727006259d7933f468cef7cb1ea9ab6bba (diff)
Added two distinct UserWarning subclasses for warnings issued from the BeautifulSoup constructor which a caller may want to filter out. [bug=1873787]
-rw-r--r-- CHANGELOG 9
-rw-r--r-- bs4/__init__.py 24
-rw-r--r-- bs4/tests/test_soup.py 53
3 files changed, 62 insertions, 24 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 6da8b10..9f87315 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,14 @@
= 4.9.1 (unreleased)
+* Added a distinct subclass, GuessedAtParserWarning, for the warning
+ issued when BeautifulSoup is instantiated without a parser being
+ specified. [bug=1873787]
+
+* Added a distinct subclass, MarkupResemblesLocatorWarning, for the
+ warning issued when BeautifulSoup is instantiated with 'markup' that
+ actually seems to be a URL or the path to a file on
+ disk. [bug=1873787]
+
* The new NavigableString subclasses (Stylesheet, Script, and
TemplateString) can now be imported directly from the bs4 package.
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 18aa2d1..834f180 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -51,6 +51,19 @@ from .element import (
# running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+ """The warning issued when BeautifulSoup has to guess what parser to
+ use -- probably because no parser was specified in the constructor.
+ """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+ """The warning issued when BeautifulSoup is given 'markup' that
+ actually looks like a resource locator -- a URL or a path to a file
+ on disk.
+ """
+
+
class BeautifulSoup(Tag):
"""A data structure representing a parsed HTML or XML document.
@@ -272,7 +285,10 @@ class BeautifulSoup(Tag):
parser=builder.NAME,
markup_type=markup_type
)
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+ warnings.warn(
+ self.NO_PARSER_SPECIFIED_WARNING % values,
+ GuessedAtParserWarning, stacklevel=2
+ )
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@@ -312,7 +328,8 @@ class BeautifulSoup(Tag):
warnings.warn(
'"%s" looks like a filename, not markup. You should'
' probably open this file and pass the filehandle into'
- ' Beautiful Soup.' % self._decode_markup(markup)
+ ' Beautiful Soup.' % self._decode_markup(markup),
+ MarkupResemblesLocatorWarning
)
self._check_markup_is_url(markup)
@@ -399,7 +416,8 @@ class BeautifulSoup(Tag):
' requests to get the document behind the URL, and feed'
' that document to Beautiful Soup.' % cls._decode_markup(
markup
- )
+ ),
+ MarkupResemblesLocatorWarning
)
def _feed(self):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 8d0583c..857eb41 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -10,6 +10,8 @@ import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
+ GuessedAtParserWarning,
+ MarkupResemblesLocatorWarning,
)
from bs4.builder import (
TreeBuilder,
@@ -224,25 +226,32 @@ class TestConstructor(SoupTest):
class TestWarnings(SoupTest):
- def _no_parser_specified(self, s, is_there=True):
- v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
- self.assertTrue(v)
+ def _assert_warning(self, warnings, cls):
+ for w in warnings:
+ if isinstance(w.message, cls):
+ return w
+ raise Exception("%s warning not found in %r" % (cls, warnings))
+
+ def _assert_no_parser_specified(self, w):
+ warning = self._assert_warning(w, GuessedAtParserWarning)
+ message = str(warning.message)
+ self.assertTrue(
+ message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
+ )
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup("<a><b></b></a>")
- msg = str(w[0].message)
- self._assert_no_parser_specified(msg)
+ soup = BeautifulSoup("<a><b></b></a>")
+ self._assert_no_parser_specified(w)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup("<a><b></b></a>", "html")
- msg = str(w[0].message)
- self._assert_no_parser_specified(msg)
+ soup = BeautifulSoup("<a><b></b></a>", "html")
+ self._assert_no_parser_specified(w)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup("<a><b></b></a>", "html.parser")
+ soup = BeautifulSoup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -266,41 +275,43 @@ class TestWarnings(SoupTest):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
-class TestWarnings(SoupTest):
-
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
- msg = str(w[0].message)
- self.assertTrue("looks like a filename" in msg)
+ warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+ self.assertTrue("looks like a filename" in str(warning.message))
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
- self.assertEqual(0, len(w))
+ self.assertEqual([], w)
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
- # Be aware this isn't the only warning that can be raised during
- # execution..
- self.assertTrue(any("looks like a URL" in str(w.message)
- for w in warning_list))
+ warning = self._assert_warning(
+ warning_list, MarkupResemblesLocatorWarning
+ )
+ self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(u"http://www.crummyunicode.com/")
- self.assertTrue(any("looks like a URL" in str(w.message)
- for w in warning_list))
+ warning = self._assert_warning(
+ warning_list, MarkupResemblesLocatorWarning
+ )
+ self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_bytes_and_space(self):
+ # Here the markup contains something besides a URL, so no warning
+ # is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)