refactor(cli): move op sanitizer to Rust (#22413)

The format of the sanitizers will change a little bit: - If multiple async ops leak and traces are on, we repeat the async op header once per stack trace. - All leaks are aggregated under a "Leaks detected:" banner as the new timers are eventually going to be added, and these are neither ops nor resources. - `1 async op` is now `An async op` - If ops and resources leak, we show both (rather than op leaks masking resources) Follow-on to https://github.com/denoland/deno/pull/22226
author: Matt Mastracci <matthew@mastracci.com> 2024-02-16 14:22:12 -0700
committer: GitHub <noreply@github.com> 2024-02-16 21:22:12 +0000
commit: 67a4231bb62b7839f39bafc9e2e656f43093619b (patch)
tree: 428c3db10857a5c1785af56ea8997dff8b2adb0b /cli/tools
parent: c92717a1a43d31951cd8a1161afb766ef339aa21 (diff)
2 files changed, 260 insertions, 63 deletions
diff --git a/cli/tools/test/fmt.rs b/cli/tools/test/fmt.rs
index 931caf147..a185bd04f 100644
--- a/cli/tools/test/fmt.rs
+++ b/cli/tools/test/fmt.rs
@@ -82,17 +82,23 @@ pub fn format_test_error(js_error: &JsError) -> String {
   format_js_error(&js_error)
 }
 
-pub fn format_sanitizer_diff(diff: RuntimeActivityDiff) -> Vec<String> {
-  let mut output = format_sanitizer_accum(diff.appeared, true);
-  output.extend(format_sanitizer_accum(diff.disappeared, false));
-  output.sort();
-  output
+pub fn format_sanitizer_diff(
+  diff: RuntimeActivityDiff,
+) -> (Vec<String>, Vec<String>) {
+  let (mut messages, trailers) = format_sanitizer_accum(diff.appeared, true);
+  let disappeared = format_sanitizer_accum(diff.disappeared, false);
+  messages.extend(disappeared.0);
+  messages.sort();
+  let mut trailers = BTreeSet::from_iter(trailers);
+  trailers.extend(disappeared.1);
+  (messages, trailers.into_iter().collect::<Vec<_>>())
 }
 
 fn format_sanitizer_accum(
   activities: Vec<RuntimeActivity>,
   appeared: bool,
-) -> Vec<String> {
+) -> (Vec<String>, Vec<String>) {
+  // Aggregate the sanitizer information
   let mut accum = HashMap::new();
   for activity in activities {
     let item = format_sanitizer_accum_item(activity);
@@ -100,21 +106,46 @@ fn format_sanitizer_accum(
   }
 
   let mut output = vec![];
-  for ((item_type, item_name), count) in accum.into_iter() {
+  let mut needs_trace_ops = false;
+  for ((item_type, item_name, trace), count) in accum.into_iter() {
     if item_type == RuntimeActivityType::Resource {
-      // TODO(mmastrac): until we implement the new timers and op sanitization, these must be ignored in this path
-      if item_name == "timer" {
-        continue;
-      }
       let (name, action1, action2) = pretty_resource_name(&item_name);
       let hint = resource_close_hint(&item_name);
 
-      if appeared {
-        output.push(format!("{name} was {action1} during the test, but not {action2} during the test. {hint}"));
+      let value = if appeared {
+        format!("{name} was {action1} during the test, but not {action2} during the test. {hint}")
       } else {
-        output.push(format!("{name} was {action1} before the test started, but was {action2} during the test. \
-          Do not close resources in a test that were not created during that test."));
-      }
+        format!("{name} was {action1} before the test started, but was {action2} during the test. \
+          Do not close resources in a test that were not created during that test.")
+      };
+      output.push(value);
+    } else if item_type == RuntimeActivityType::AsyncOp {
+      let (count_str, plural, tense) = if count == 1 {
+        (Cow::Borrowed("An"), "", "was")
+      } else {
+        (Cow::Owned(count.to_string()), "s", "were")
+      };
+      let phrase = if appeared {
+        "started in this test, but never completed"
+      } else {
+        "started before the test, but completed during the test. Async operations should not complete in a test if they were not started in that test"
+      };
+      let mut value = if let Some([operation, hint]) =
+        OP_DETAILS.get(&item_name)
+      {
+        format!("{count_str} async operation{plural} to {operation} {tense} {phrase}. This is often caused by not {hint}.")
+      } else {
+        format!(
+          "{count_str} async call{plural} to {item_name} {tense} {phrase}."
+        )
+      };
+      value += &if let Some(trace) = trace {
+        format!(" The operation {tense} started here:\n{trace}")
+      } else {
+        needs_trace_ops = true;
+        String::new()
+      };
+      output.push(value);
     } else {
       // TODO(mmastrac): this will be done in a later PR
       unimplemented!(
@@ -125,18 +156,25 @@ fn format_sanitizer_accum(
       );
     }
   }
-  output
+  if needs_trace_ops {
+    (output, vec!["To get more details where ops were leaked, run again with --trace-ops flag.".to_owned()])
+  } else {
+    (output, vec![])
+  }
 }
 
 fn format_sanitizer_accum_item(
   activity: RuntimeActivity,
-) -> (RuntimeActivityType, Cow<'static, str>) {
+) -> (RuntimeActivityType, Cow<'static, str>, Option<String>) {
   let activity_type = activity.activity();
   match activity {
-    RuntimeActivity::AsyncOp(_, name, _) => (activity_type, name.into()),
-    RuntimeActivity::Interval(_) => (activity_type, "".into()),
-    RuntimeActivity::Resource(_, name) => (activity_type, name.into()),
-    RuntimeActivity::Timer(_) => (activity_type, "".into()),
+    // TODO(mmastrac): OpCallTrace needs to be Eq
+    RuntimeActivity::AsyncOp(_, name, trace) => {
+      (activity_type, name.into(), trace.map(|x| x.to_string()))
+    }
+    RuntimeActivity::Interval(_) => (activity_type, "".into(), None),
+    RuntimeActivity::Resource(_, name) => (activity_type, name.into(), None),
+    RuntimeActivity::Timer(_) => (activity_type, "".into(), None),
   }
 }
 
@@ -215,9 +253,6 @@ fn resource_close_hint(name: &str) -> &'static str {
   }
 }
 
-// An async operation to $0 was started in this test, but never completed. This is often caused by not $1.
-// An async operation to $0 was started in this test, but never completed. Async operations should not complete in a test if they were not started in that test.
-// deno-fmt-ignore
 pub const OP_DETAILS: phf::Map<&'static str, [&'static str; 2]> = phf_map! {
   "op_blob_read_part" => ["read from a Blob or File", "awaiting the result of a Blob or File read"],
   "op_broadcast_recv" => ["receive a message from a BroadcastChannel", "closing the BroadcastChannel"],
@@ -295,3 +330,31 @@ pub const OP_DETAILS: phf::Map<&'static str, [&'static str; 2]> = phf_map! {
   "op_ws_send_ping" => ["send a message on a WebSocket", "closing a `WebSocket` or `WebSocketStream`"],
   "op_spawn_wait" => ["wait for a subprocess to exit", "awaiting the result of a `Deno.Process#status` call"],
 };
+
+#[cfg(test)]
+mod tests {
+  use deno_core::stats::RuntimeActivity;
+
+  macro_rules! leak_format_test {
+    ($name:ident, $appeared:literal, [$($activity:expr),*], $expected:literal) => {
+      #[test]
+      fn $name() {
+        let (leaks, trailer_notes) = super::format_sanitizer_accum(vec![$($activity),*], $appeared);
+        let mut output = String::new();
+        for leak in leaks {
+          output += &format!(" - {leak}\n");
+        }
+        for trailer in trailer_notes {
+          output += &format!("{trailer}\n");
+        }
+        assert_eq!(output, $expected);
+      }
+    }
+  }
+
+  // https://github.com/denoland/deno/issues/13729
+  // https://github.com/denoland/deno/issues/13938
+  leak_format_test!(op_unknown, true, [RuntimeActivity::AsyncOp(0, "op_unknown", None)], 
+    " - An async call to op_unknown was started in this test, but never completed.\n\
+    To get more details where ops were leaked, run again with --trace-ops flag.\n");
+}
diff --git a/cli/tools/test/mod.rs b/cli/tools/test/mod.rs
index 2cf663b5f..6e22b894a 100644
--- a/cli/tools/test/mod.rs
+++ b/cli/tools/test/mod.rs
@@ -34,13 +34,15 @@ use deno_core::error::AnyError;
 use deno_core::error::JsError;
 use deno_core::futures::future;
 use deno_core::futures::stream;
-use deno_core::futures::task::noop_waker;
 use deno_core::futures::FutureExt;
 use deno_core::futures::StreamExt;
 use deno_core::located_script_name;
 use deno_core::parking_lot::Mutex;
 use deno_core::serde_v8;
+use deno_core::stats::RuntimeActivity;
+use deno_core::stats::RuntimeActivityDiff;
 use deno_core::stats::RuntimeActivityStats;
+use deno_core::stats::RuntimeActivityStatsFactory;
 use deno_core::stats::RuntimeActivityStatsFilter;
 use deno_core::unsync::spawn;
 use deno_core::unsync::spawn_blocking;
@@ -68,6 +70,7 @@ use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Write as _;
+use std::future::poll_fn;
 use std::io::Read;
 use std::io::Write;
 use std::num::NonZeroUsize;
@@ -76,7 +79,7 @@ use std::sync::atomic::AtomicBool;
 use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::task::Context;
+use std::task::Poll;
 use std::time::Duration;
 use std::time::Instant;
 use std::time::SystemTime;
@@ -98,6 +101,9 @@ use reporters::PrettyTestReporter;
 use reporters::TapTestReporter;
 use reporters::TestReporter;
 
+/// How many times we're allowed to spin the event loop before considering something a leak.
+const MAX_SANITIZER_LOOP_SPINS: usize = 16;
+
 /// The test mode is used to determine how a specifier is to be tested.
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub enum TestMode {
@@ -210,8 +216,7 @@ pub enum TestFailure {
   JsError(Box<JsError>),
   FailedSteps(usize),
   IncompleteSteps,
-  LeakedOps(Vec<String>, bool), // Details, isOpCallTracingEnabled
-  LeakedResources(Vec<String>), // Details
+  Leaked(Vec<String>, Vec<String>), // Details, trailer notes
   // The rest are for steps only.
   Incomplete,
   OverlapsWithSanitizers(IndexSet<String>), // Long names of overlapped tests
@@ -226,20 +231,13 @@ impl ToString for TestFailure {
       TestFailure::FailedSteps(n) => format!("{} test steps failed.", n),
       TestFailure::IncompleteSteps => "Completed while steps were still running. Ensure all steps are awaited with `await t.step(...)`.".to_string(),
       TestFailure::Incomplete => "Didn't complete before parent. Await step with `await t.step(...)`.".to_string(),
-      TestFailure::LeakedOps(details, is_op_call_tracing_enabled) => {
-        let mut string = "Leaking async ops:".to_string();
+      TestFailure::Leaked(details, trailer_notes) => {
+        let mut string = "Leaks detected:".to_string();
         for detail in details {
-          string.push_str(&format!("\n  - {}", detail));
+          string.push_str(&format!("\n  - {detail}"));
         }
-        if !is_op_call_tracing_enabled {
-          string.push_str("\nTo get more details where ops were leaked, run again with --trace-ops flag.");
-        }
-        string
-      }
-      TestFailure::LeakedResources(details) => {
-        let mut string = "Leaking resources:".to_string();
-        for detail in details {
-          string.push_str(&format!("\n  - {}", detail));
+        for trailer in trailer_notes {
+          string.push_str(&format!("\n{trailer}"));
         }
         string
       }
@@ -540,6 +538,25 @@ pub fn worker_has_tests(worker: &mut MainWorker) -> bool {
   !state.borrow::<ops::testing::TestContainer>().0.is_empty()
 }
 
+/// Yields to tokio to allow async work to process, and then polls
+/// the event loop once.
+#[must_use = "The event loop result should be checked"]
+pub async fn poll_event_loop(worker: &mut MainWorker) -> Result<(), AnyError> {
+  // Allow any ops that to do work in the tokio event loop to do so
+  tokio::task::yield_now().await;
+  // Spin the event loop once
+  poll_fn(|cx| {
+    if let Poll::Ready(Err(err)) = worker
+      .js_runtime
+      .poll_event_loop(cx, PollEventLoopOptions::default())
+    {
+      return Poll::Ready(Err(err));
+    }
+    Poll::Ready(Ok(()))
+  })
+  .await
+}
+
 pub async fn run_tests_for_worker(
   worker: &mut MainWorker,
   specifier: &ModuleSpecifier,
@@ -574,11 +591,43 @@ pub async fn run_tests_for_worker(
   }))?;
   let mut had_uncaught_error = false;
   let stats = worker.js_runtime.runtime_activity_stats_factory();
+  let ops = worker.js_runtime.op_names();
+
+  // These particular ops may start and stop independently of tests, so we just filter them out
+  // completely.
+  let op_id_host_recv_message = ops
+    .iter()
+    .position(|op| *op == "op_host_recv_message")
+    .unwrap();
+  let op_id_host_recv_ctrl = ops
+    .iter()
+    .position(|op| *op == "op_host_recv_ctrl")
+    .unwrap();
+
+  // For consistency between tests with and without sanitizers, we _always_ include
+  // the actual sanitizer capture before and after a test, but a test that ignores resource
+  // or op sanitization simply doesn't throw if one of these constraints is violated.
+  let mut filter = RuntimeActivityStatsFilter::default();
+  filter = filter.with_resources();
+  filter = filter.with_ops();
+  filter = filter.omit_op(op_id_host_recv_ctrl as _);
+  filter = filter.omit_op(op_id_host_recv_message as _);
 
   for (desc, function) in tests {
     if fail_fast_tracker.should_stop() {
       break;
     }
+
+    // Each test needs a fresh reqwest connection pool to avoid inter-test weirdness with connections
+    // failing. If we don't do this, a connection to a test server we just tore down might be re-used in
+    // the next test.
+    // TODO(mmastrac): this should be some sort of callback that we can implement for any subsystem
+    worker
+      .js_runtime
+      .op_state()
+      .borrow_mut()
+      .try_take::<deno_runtime::deno_fetch::reqwest::Client>();
+
     if desc.ignore {
       sender.send(TestEvent::Result(desc.id, TestResult::Ignored, 0))?;
       continue;
@@ -593,24 +642,10 @@ pub async fn run_tests_for_worker(
     // responded to settle.
     // TODO(mmastrac): we should provide an API to poll the event loop until no futher
     // progress is made.
-    {
-      let waker = noop_waker();
-      let mut cx = Context::from_waker(&waker);
-      let _ = worker
-        .js_runtime
-        .poll_event_loop(&mut cx, PollEventLoopOptions::default());
-    }
+    poll_event_loop(worker).await?;
 
-    let mut filter = RuntimeActivityStatsFilter::default();
-    if desc.sanitize_resources {
-      filter = filter.with_resources();
-    }
-
-    let before = if !filter.is_empty() {
-      Some(stats.clone().capture(&filter))
-    } else {
-      None
-    };
+    // We always capture stats, regardless of sanitization state
+    let before = stats.clone().capture(&filter);
 
     let earlier = SystemTime::now();
     let call = worker.js_runtime.call(&function);
@@ -635,12 +670,21 @@ pub async fn run_tests_for_worker(
         }
       }
     };
-    if let Some(before) = before {
-      let after = stats.clone().capture(&filter);
-      let diff = RuntimeActivityStats::diff(&before, &after);
-      let formatted = format_sanitizer_diff(diff);
+
+    // Await activity stabilization
+    if let Some(diff) = wait_for_activity_to_stabilize(
+      worker,
+      &stats,
+      &filter,
+      before,
+      desc.sanitize_ops,
+      desc.sanitize_resources,
+    )
+    .await?
+    {
+      let (formatted, trailer_notes) = format_sanitizer_diff(diff);
       if !formatted.is_empty() {
-        let failure = TestFailure::LeakedResources(formatted);
+        let failure = TestFailure::Leaked(formatted, trailer_notes);
         let elapsed = SystemTime::now().duration_since(earlier)?.as_millis();
         sender.send(TestEvent::Result(
           desc.id,
@@ -663,6 +707,96 @@ pub async fn run_tests_for_worker(
   Ok(())
 }
 
+/// Removes timer resources and op_sleep_interval calls. When an interval is started before a test
+/// and resolves during a test, there's a false alarm.
+fn preprocess_timer_activity(activities: &mut Vec<RuntimeActivity>) {
+  // TODO(mmastrac): Once we get to the new timer implementation, all of this
+  // code can go away and be replaced by a proper timer sanitizer.
+  let mut timer_resource_leaked = false;
+
+  // First, search for any timer resources which will indicate that we have an interval leak
+  activities.retain(|activity| {
+    if let RuntimeActivity::Resource(_, name) = activity {
+      if name == "timer" {
+        timer_resource_leaked = true;
+        return false;
+      }
+    }
+    true
+  });
+
+  // If we've leaked a timer resource, we un-mute op_sleep_interval calls. Otherwise, we remove
+  // them.
+  if !timer_resource_leaked {
+    activities.retain(|activity| {
+      if let RuntimeActivity::AsyncOp(_, op, _) = activity {
+        *op != "op_sleep_interval"
+      } else {
+        true
+      }
+    })
+  }
+}
+
+async fn wait_for_activity_to_stabilize(
+  worker: &mut MainWorker,
+  stats: &RuntimeActivityStatsFactory,
+  filter: &RuntimeActivityStatsFilter,
+  before: RuntimeActivityStats,
+  sanitize_ops: bool,
+  sanitize_resources: bool,
+) -> Result<Option<RuntimeActivityDiff>, AnyError> {
+  // First, check to see if there's any diff at all. If not, just continue.
+  let after = stats.clone().capture(filter);
+  let mut diff = RuntimeActivityStats::diff(&before, &after);
+  preprocess_timer_activity(&mut diff.appeared);
+  preprocess_timer_activity(&mut diff.disappeared);
+  if diff.appeared.is_empty() && diff.disappeared.is_empty() {
+    // No activity, so we return early
+    return Ok(None);
+  }
+
+  // We allow for up to MAX_SANITIZER_LOOP_SPINS to get to a point where there is no difference.
+  // TODO(mmastrac): We could be much smarter about this if we had the concept of "progress" in
+  // an event loop tick. Ideally we'd be able to tell if we were spinning and doing nothing, or
+  // spinning and resolving ops.
+  for _ in 0..MAX_SANITIZER_LOOP_SPINS {
+    // There was a diff, so let the event loop run once
+    poll_event_loop(worker).await?;
+
+    let after = stats.clone().capture(filter);
+    diff = RuntimeActivityStats::diff(&before, &after);
+    preprocess_timer_activity(&mut diff.appeared);
+    preprocess_timer_activity(&mut diff.disappeared);
+    if diff.appeared.is_empty() && diff.disappeared.is_empty() {
+      return Ok(None);
+    }
+  }
+
+  if !sanitize_ops {
+    diff
+      .appeared
+      .retain(|activity| !matches!(activity, RuntimeActivity::AsyncOp(..)));
+    diff
+      .disappeared
+      .retain(|activity| !matches!(activity, RuntimeActivity::AsyncOp(..)));
+  }
+  if !sanitize_resources {
+    diff
+      .appeared
+      .retain(|activity| !matches!(activity, RuntimeActivity::Resource(..)));
+    diff
+      .disappeared
+      .retain(|activity| !matches!(activity, RuntimeActivity::Resource(..)));
+  }
+
+  Ok(if diff.appeared.is_empty() && diff.disappeared.is_empty() {
+    None
+  } else {
+    Some(diff)
+  })
+}
+
 fn extract_files_from_regex_blocks(
   specifier: &ModuleSpecifier,
   source: &str,
author	Matt Mastracci <matthew@mastracci.com>	2024-02-16 14:22:12 -0700
committer	GitHub <noreply@github.com>	2024-02-16 21:22:12 +0000
commit	67a4231bb62b7839f39bafc9e2e656f43093619b (patch)
tree	428c3db10857a5c1785af56ea8997dff8b2adb0b /cli/tools
parent	c92717a1a43d31951cd8a1161afb766ef339aa21 (diff)