summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Whitaker <17734409+nathanwhit@users.noreply.github.com>2024-03-29 23:23:48 -0700
committerGitHub <noreply@github.com>2024-03-29 23:23:48 -0700
commit99720d0713ecc17245e9b08652aadcb160d92cb8 (patch)
tree88682deabbc84b43c50ff058a6419f199e816cf2
parent524e451bfbd36d3f5a00a16a7115b31b56805f10 (diff)
chore: Make jupyter integration tests less flaky and avoid hang (#23134)
There's a TOCTOU issue that can happen when selecting unused ports for the server to use (we get assigned an unused port by the OS, and between then and when the server actually binds to the port another test steals it). Improve this by checking if the server existed soon after setup, and if so we retry starting it. Client connection can also fail spuriously (in local testing) so added a retry mechanism. This also fixes a hang, where if the server exited (almost always due to the issue described above) before we connected to it, attempting to connect our client ZMQ sockets to it would just hang. To resolve this, I added a timeout so we can't wait forever.
-rw-r--r--tests/integration/jupyter_tests.rs68
1 files changed, 53 insertions, 15 deletions
diff --git a/tests/integration/jupyter_tests.rs b/tests/integration/jupyter_tests.rs
index 02695c595..af8101ea7 100644
--- a/tests/integration/jupyter_tests.rs
+++ b/tests/integration/jupyter_tests.rs
@@ -191,7 +191,24 @@ async fn connect_socket<S: zeromq::Socket>(
) -> S {
let addr = spec.endpoint(port);
let mut socket = S::new();
- socket.connect(&addr).await.unwrap();
+ let mut connected = false;
+ for _ in 0..5 {
+ match timeout(Duration::from_secs(5), socket.connect(&addr)).await {
+ Ok(Ok(_)) => {
+ connected = true;
+ break;
+ }
+ Ok(Err(e)) => {
+ eprintln!("Failed to connect to {addr}: {e}");
+ }
+ Err(e) => {
+ eprintln!("Timed out connecting to {addr}: {e}");
+ }
+ }
+ }
+ if !connected {
+ panic!("Failed to connect to {addr}");
+ }
socket
}
@@ -369,27 +386,48 @@ impl Drop for JupyterServerProcess {
}
}
-fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
+async fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
let context = TestContextBuilder::new().use_temp_cwd().build();
- let conn = ConnectionSpec::default();
+ let mut conn = ConnectionSpec::default();
let conn_file = context.temp_dir().path().join("connection.json");
conn_file.write_json(&conn);
- let process = context
- .new_command()
- .piped_output()
- .args_vec(vec![
- "jupyter",
- "--kernel",
- "--conn",
- conn_file.to_string().as_str(),
- ])
- .spawn()
- .unwrap();
+
+ let start_process = |conn_file: &test_util::PathRef| {
+ context
+ .new_command()
+ .args_vec(vec![
+ "jupyter",
+ "--kernel",
+ "--conn",
+ conn_file.to_string().as_str(),
+ ])
+ .spawn()
+ .unwrap()
+ };
+
+ // try to start the server, retrying up to 5 times
+ // (this can happen due to TOCTOU errors with selecting unused TCP ports)
+ let mut process = start_process(&conn_file);
+ tokio::time::sleep(Duration::from_millis(1000)).await;
+
+ for _ in 0..5 {
+ if process.try_wait().unwrap().is_none() {
+ break;
+ } else {
+ conn = ConnectionSpec::default();
+ conn_file.write_json(&conn);
+ process = start_process(&conn_file);
+ tokio::time::sleep(Duration::from_millis(1000)).await;
+ }
+ }
+ if process.try_wait().unwrap().is_some() {
+ panic!("Failed to start Jupyter server");
+ }
(context, conn, JupyterServerProcess(Some(process)))
}
async fn setup() -> (TestContext, JupyterClient, JupyterServerProcess) {
- let (context, conn, process) = setup_server();
+ let (context, conn, process) = setup_server().await;
let client = JupyterClient::new(&conn).await;
client.io_subscribe("").await.unwrap();