diff options
-rw-r--r-- | CHANGELOG | 9 | ||||
-rw-r--r-- | bs4/__init__.py | 24 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 53 |
3 files changed, 62 insertions, 24 deletions
@@ -1,5 +1,14 @@ = 4.9.1 (unreleased) +* Added a distinct subclass, GuessedAtParserWarning, for the warning + issued when BeautifulSoup is instantiated without a parser being + specified. [bug=1873787] + +* Added a distinct subclass, MarkupResemblesLocatorWarning, for the + warning issued when BeautifulSoup is instantiated with 'markup' that + actually seems to be a URL or the path to a file on + disk. [bug=1873787] + * The new NavigableString subclasses (Stylesheet, Script, and TemplateString) can now be imported directly from the bs4 package. diff --git a/bs4/__init__.py b/bs4/__init__.py index 18aa2d1..834f180 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -51,6 +51,19 @@ from .element import ( # running this code under Python 3 without converting it. 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +# Define some custom warnings. +class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to + use -- probably because no parser was specified in the constructor. + """ + +class MarkupResemblesLocatorWarning(UserWarning): + """The warning issued when BeautifulSoup is given 'markup' that + actually looks like a resource locator -- a URL or a path to a file + on disk. + """ + + class BeautifulSoup(Tag): """A data structure representing a parsed HTML or XML document. @@ -272,7 +285,10 @@ class BeautifulSoup(Tag): parser=builder.NAME, markup_type=markup_type ) - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + warnings.warn( + self.NO_PARSER_SPECIFIED_WARNING % values, + GuessedAtParserWarning, stacklevel=2 + ) else: if kwargs: warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. 
These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") @@ -312,7 +328,8 @@ class BeautifulSoup(Tag): warnings.warn( '"%s" looks like a filename, not markup. You should' ' probably open this file and pass the filehandle into' - ' Beautiful Soup.' % self._decode_markup(markup) + ' Beautiful Soup.' % self._decode_markup(markup), + MarkupResemblesLocatorWarning ) self._check_markup_is_url(markup) @@ -399,7 +416,8 @@ class BeautifulSoup(Tag): ' requests to get the document behind the URL, and feed' ' that document to Beautiful Soup.' % cls._decode_markup( markup - ) + ), + MarkupResemblesLocatorWarning ) def _feed(self): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 8d0583c..857eb41 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -10,6 +10,8 @@ import tempfile from bs4 import ( BeautifulSoup, BeautifulStoneSoup, + GuessedAtParserWarning, + MarkupResemblesLocatorWarning, ) from bs4.builder import ( TreeBuilder, @@ -224,25 +226,32 @@ class TestConstructor(SoupTest): class TestWarnings(SoupTest): - def _no_parser_specified(self, s, is_there=True): - v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) - self.assertTrue(v) + def _assert_warning(self, warnings, cls): + for w in warnings: + if isinstance(w.message, cls): + return w + raise Exception("%s warning not found in %r" % cls, warnings) + + def _assert_no_parser_specified(self, w): + warning = self._assert_warning(w, GuessedAtParserWarning) + message = str(warning.message) + self.assertTrue( + message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60]) + ) def test_warning_if_no_parser_specified(self): with warnings.catch_warnings(record=True) as w: - soup = self.soup("<a><b></b></a>") - msg = str(w[0].message) - self._assert_no_parser_specified(msg) + soup = BeautifulSoup("<a><b></b></a>") + self._assert_no_parser_specified(w) def test_warning_if_parser_specified_too_vague(self): with 
warnings.catch_warnings(record=True) as w: - soup = self.soup("<a><b></b></a>", "html") - msg = str(w[0].message) - self._assert_no_parser_specified(msg) + soup = BeautifulSoup("<a><b></b></a>", "html") + self._assert_no_parser_specified(w) def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: - soup = self.soup("<a><b></b></a>", "html.parser") + soup = BeautifulSoup("<a><b></b></a>", "html.parser") self.assertEqual([], w) def test_parseOnlyThese_renamed_to_parse_only(self): @@ -266,41 +275,43 @@ class TestWarnings(SoupTest): self.assertRaises( TypeError, self.soup, "<a>", no_such_argument=True) -class TestWarnings(SoupTest): - def test_disk_file_warning(self): filehandle = tempfile.NamedTemporaryFile() filename = filehandle.name try: with warnings.catch_warnings(record=True) as w: soup = self.soup(filename) - msg = str(w[0].message) - self.assertTrue("looks like a filename" in msg) + warning = self._assert_warning(w, MarkupResemblesLocatorWarning) + self.assertTrue("looks like a filename" in str(warning.message)) finally: filehandle.close() # The file no longer exists, so Beautiful Soup will no longer issue the warning. with warnings.catch_warnings(record=True) as w: soup = self.soup(filename) - self.assertEqual(0, len(w)) + self.assertEqual([], w) def test_url_warning_with_bytes_url(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/") - # Be aware this isn't the only warning that can be raised during - # execution.. 
- self.assertTrue(any("looks like a URL" in str(w.message) - for w in warning_list)) + warning = self._assert_warning( + warning_list, MarkupResemblesLocatorWarning + ) + self.assertTrue("looks like a URL" in str(warning.message)) def test_url_warning_with_unicode_url(self): with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning soup = self.soup(u"http://www.crummyunicode.com/") - self.assertTrue(any("looks like a URL" in str(w.message) - for w in warning_list)) + warning = self._assert_warning( + warning_list, MarkupResemblesLocatorWarning + ) + self.assertTrue("looks like a URL" in str(warning.message)) def test_url_warning_with_bytes_and_space(self): + # Here the markup contains something besides a URL, so no warning + # is issued. with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/ is great") self.assertFalse(any("looks like a URL" in str(w.message) |