author    David Sherret <dsherret@users.noreply.github.com>  2024-06-02 21:39:13 -0400
committer GitHub <noreply@github.com>                        2024-06-03 01:39:13 +0000
commit    b1f776adef6f0d0caa0b2badf9fb707cf5efa6e7 (patch)
tree      df801e53bb5e43268933d883f049546256ef8e7f /cli/npm/managed/cache
parent    eda43c46de12ed589fdbe62ba0574887cfbb3574 (diff)

refactor: extract structs for downloading tarballs and npm registry packuments (#24067)
Diffstat (limited to 'cli/npm/managed/cache')
-rw-r--r--  cli/npm/managed/cache/mod.rs              254
-rw-r--r--  cli/npm/managed/cache/registry_info.rs    284
-rw-r--r--  cli/npm/managed/cache/tarball.rs          210
-rw-r--r--  cli/npm/managed/cache/tarball_extract.rs  324
4 files changed, 1072 insertions, 0 deletions
diff --git a/cli/npm/managed/cache/mod.rs b/cli/npm/managed/cache/mod.rs
new file mode 100644
index 000000000..f409744b9
--- /dev/null
+++ b/cli/npm/managed/cache/mod.rs
@@ -0,0 +1,254 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use deno_ast::ModuleSpecifier;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::AnyError;
+use deno_core::parking_lot::Mutex;
+use deno_core::serde_json;
+use deno_core::url::Url;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageInfo;
+use deno_npm::NpmPackageCacheFolderId;
+use deno_semver::package::PackageNv;
+
+use crate::args::CacheSetting;
+use crate::cache::CACHE_PERM;
+use crate::npm::NpmCacheDir;
+use crate::util::fs::atomic_write_file_with_retries;
+use crate::util::fs::hard_link_dir_recursive;
+
+mod registry_info;
+mod tarball;
+mod tarball_extract;
+
+pub use registry_info::RegistryInfoDownloader;
+pub use tarball::TarballCache;
+
+/// Stores a single copy of each npm package in a cache.
+#[derive(Debug)]
+pub struct NpmCache {
+ cache_dir: NpmCacheDir,
+ cache_setting: CacheSetting,
+ npmrc: Arc<ResolvedNpmRc>,
+ /// ensures a package is only downloaded once per run
+ previously_reloaded_packages: Mutex<HashSet<PackageNv>>,
+}
+
+impl NpmCache {
+ pub fn new(
+ cache_dir: NpmCacheDir,
+ cache_setting: CacheSetting,
+ npmrc: Arc<ResolvedNpmRc>,
+ ) -> Self {
+ Self {
+ cache_dir,
+ cache_setting,
+ previously_reloaded_packages: Default::default(),
+ npmrc,
+ }
+ }
+
+ pub fn cache_setting(&self) -> &CacheSetting {
+ &self.cache_setting
+ }
+
+ pub fn root_dir_url(&self) -> &Url {
+ self.cache_dir.root_dir_url()
+ }
+
+ /// Checks if the cache should be used for the provided name and version.
+  /// NOTE: Subsequent calls for the same package will always return `true`
+  /// to ensure a package is only downloaded once per run of the CLI. This
+  /// prevents re-downloads from occurring when someone runs with `--reload`
+  /// and a dynamic import loads the same package again, for example.
+ pub fn should_use_cache_for_package(&self, package: &PackageNv) -> bool {
+ self.cache_setting.should_use_for_npm_package(&package.name)
+ || !self
+ .previously_reloaded_packages
+ .lock()
+ .insert(package.clone())
+ }
+
+ /// Ensures a copy of the package exists in the global cache.
+ ///
+ /// This assumes that the original package folder being hard linked
+ /// from exists before this is called.
+ pub fn ensure_copy_package(
+ &self,
+ folder_id: &NpmPackageCacheFolderId,
+ ) -> Result<(), AnyError> {
+ let registry_url = self.npmrc.get_registry_url(&folder_id.nv.name);
+ assert_ne!(folder_id.copy_index, 0);
+ let package_folder = self
+ .cache_dir
+ .package_folder_for_id(folder_id, registry_url);
+
+ if package_folder.exists()
+    // if this file exists, then the package didn't successfully initialize
+    // the first time, or another process is currently extracting the tarball
+ && !package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME).exists()
+ && self.cache_setting.should_use_for_npm_package(&folder_id.nv.name)
+ {
+ return Ok(());
+ }
+
+ let original_package_folder = self
+ .cache_dir
+ .package_folder_for_nv(&folder_id.nv, registry_url);
+
+    // Windows seems to return an "AccessDenied" error when moving a directory
+    // with hard links, which is why we hard link recursively instead of moving
+ with_folder_sync_lock(&folder_id.nv, &package_folder, || {
+ hard_link_dir_recursive(&original_package_folder, &package_folder)
+ })?;
+ Ok(())
+ }
+
+ pub fn package_folder_for_id(&self, id: &NpmPackageCacheFolderId) -> PathBuf {
+ let registry_url = self.npmrc.get_registry_url(&id.nv.name);
+ self.cache_dir.package_folder_for_id(id, registry_url)
+ }
+
+ pub fn package_folder_for_nv(&self, package: &PackageNv) -> PathBuf {
+ let registry_url = self.npmrc.get_registry_url(&package.name);
+ self.package_folder_for_nv_and_url(package, registry_url)
+ }
+
+ pub fn package_folder_for_nv_and_url(
+ &self,
+ package: &PackageNv,
+ registry_url: &Url,
+ ) -> PathBuf {
+ self.cache_dir.package_folder_for_nv(package, registry_url)
+ }
+
+ pub fn package_name_folder(&self, name: &str) -> PathBuf {
+ let registry_url = self.npmrc.get_registry_url(name);
+ self.cache_dir.package_name_folder(name, registry_url)
+ }
+
+ pub fn root_folder(&self) -> PathBuf {
+ self.cache_dir.root_dir().to_owned()
+ }
+
+ pub fn resolve_package_folder_id_from_specifier(
+ &self,
+ specifier: &ModuleSpecifier,
+ ) -> Option<NpmPackageCacheFolderId> {
+ self
+ .cache_dir
+ .resolve_package_folder_id_from_specifier(specifier)
+ }
+
+ pub fn load_package_info(
+ &self,
+ name: &str,
+ ) -> Result<Option<NpmPackageInfo>, AnyError> {
+ let file_cache_path = self.get_registry_package_info_file_cache_path(name);
+
+ let file_text = match fs::read_to_string(file_cache_path) {
+ Ok(file_text) => file_text,
+ Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
+ Err(err) => return Err(err.into()),
+ };
+ Ok(serde_json::from_str(&file_text)?)
+ }
+
+ pub fn save_package_info(
+ &self,
+ name: &str,
+ package_info: &NpmPackageInfo,
+ ) -> Result<(), AnyError> {
+ let file_cache_path = self.get_registry_package_info_file_cache_path(name);
+ let file_text = serde_json::to_string(&package_info)?;
+ atomic_write_file_with_retries(&file_cache_path, file_text, CACHE_PERM)?;
+ Ok(())
+ }
+
+ fn get_registry_package_info_file_cache_path(&self, name: &str) -> PathBuf {
+ let name_folder_path = self.package_name_folder(name);
+ name_folder_path.join("registry.json")
+ }
+}
+
+const NPM_PACKAGE_SYNC_LOCK_FILENAME: &str = ".deno_sync_lock";
+
+fn with_folder_sync_lock(
+ package: &PackageNv,
+ output_folder: &Path,
+ action: impl FnOnce() -> Result<(), AnyError>,
+) -> Result<(), AnyError> {
+ fn inner(
+ output_folder: &Path,
+ action: impl FnOnce() -> Result<(), AnyError>,
+ ) -> Result<(), AnyError> {
+ fs::create_dir_all(output_folder).with_context(|| {
+ format!("Error creating '{}'.", output_folder.display())
+ })?;
+
+    // This sync lock file is a way to ensure that partially created
+    // npm package directories aren't considered valid. This could maybe
+    // be smarter in the future: if another process has taken the lock
+    // in the past X seconds, don't bother extracting here and instead
+    // wait for that process to finish (try to create the file with
+    // `create_new(true)`; if it already exists, check its metadata and
+    // wait until the other process finishes, with a timeout), but for
+    // now this is good enough.
+ let sync_lock_path = output_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME);
+ match fs::OpenOptions::new()
+ .write(true)
+ .create(true)
+ .truncate(false)
+ .open(&sync_lock_path)
+ {
+ Ok(_) => {
+ action()?;
+ // extraction succeeded, so only now delete this file
+ let _ignore = std::fs::remove_file(&sync_lock_path);
+ Ok(())
+ }
+ Err(err) => {
+ bail!(
+ concat!(
+ "Error creating package sync lock file at '{}'. ",
+ "Maybe try manually deleting this folder.\n\n{:#}",
+ ),
+ output_folder.display(),
+ err
+ );
+ }
+ }
+ }
+
+ match inner(output_folder, action) {
+ Ok(()) => Ok(()),
+ Err(err) => {
+ if let Err(remove_err) = fs::remove_dir_all(output_folder) {
+ if remove_err.kind() != std::io::ErrorKind::NotFound {
+ bail!(
+ concat!(
+ "Failed setting up package cache directory for {}, then ",
+ "failed cleaning it up.\n\nOriginal error:\n\n{}\n\n",
+ "Remove error:\n\n{}\n\nPlease manually ",
+ "delete this folder or you will run into issues using this ",
+ "package in the future:\n\n{}"
+ ),
+ package,
+ err,
+ remove_err,
+ output_folder.display(),
+ );
+ }
+ }
+ Err(err)
+ }
+ }
+}
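
The sync-lock dance in with_folder_sync_lock is worth seeing in isolation. Below is a minimal, std-only sketch of the same pattern — create the folder, drop a marker file, run the action, and delete the marker only on success — using illustrative names (`with_sync_lock`, `SYNC_LOCK`, `npm_pkg_example`) that are not part of this commit:

    use std::fs;
    use std::path::Path;

    const SYNC_LOCK: &str = ".deno_sync_lock";

    fn with_sync_lock(
        folder: &Path,
        action: impl FnOnce() -> std::io::Result<()>,
    ) -> std::io::Result<()> {
        fs::create_dir_all(folder)?;
        // the marker flags the folder as partially initialized; readers
        // treat any folder still containing it as invalid
        let lock = folder.join(SYNC_LOCK);
        fs::OpenOptions::new()
            .write(true)
            .create(true)
            .truncate(false)
            .open(&lock)?;
        action()?;
        // only remove the marker once the action fully succeeded
        let _ = fs::remove_file(&lock);
        Ok(())
    }

    fn main() -> std::io::Result<()> {
        let dir = std::env::temp_dir().join("npm_pkg_example");
        with_sync_lock(&dir, || fs::write(dir.join("package.json"), "{}"))
    }

This is what the check in ensure_copy_package relies on: a folder that exists but still contains the lock file is treated as not yet initialized.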
diff --git a/cli/npm/managed/cache/registry_info.rs b/cli/npm/managed/cache/registry_info.rs
new file mode 100644
index 000000000..ea6b47969
--- /dev/null
+++ b/cli/npm/managed/cache/registry_info.rs
@@ -0,0 +1,284 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use deno_core::anyhow::anyhow;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::custom_error;
+use deno_core::error::AnyError;
+use deno_core::futures::future::BoxFuture;
+use deno_core::futures::future::Shared;
+use deno_core::futures::FutureExt;
+use deno_core::parking_lot::Mutex;
+use deno_core::serde_json;
+use deno_core::url::Url;
+use deno_npm::npm_rc::RegistryConfig;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageInfo;
+
+use crate::args::CacheSetting;
+use crate::http_util::HttpClient;
+use crate::npm::common::maybe_auth_header_for_npm_registry;
+use crate::util::progress_bar::ProgressBar;
+
+use super::NpmCache;
+
+// todo(dsherret): create seams and unit test this
+
+#[derive(Debug, Clone)]
+enum MemoryCacheItem {
+ /// The cache item hasn't loaded yet.
+ PendingFuture(Shared<PendingRegistryLoadFuture>),
+ /// The item has loaded in the past and was stored in the file system cache.
+ /// There is no reason to request this package from the npm registry again
+ /// for the duration of execution.
+ FsCached,
+  /// An item is cached in memory when saving it to the file system cache
+  /// fails or when the package does not exist.
+ MemoryCached(Result<Option<Arc<NpmPackageInfo>>, Arc<AnyError>>),
+}
+
+#[derive(Debug, Clone)]
+enum FutureResult {
+ PackageNotExists,
+ SavedFsCache(Arc<NpmPackageInfo>),
+ ErroredFsCache(Arc<NpmPackageInfo>),
+}
+
+type PendingRegistryLoadFuture =
+ BoxFuture<'static, Result<FutureResult, Arc<AnyError>>>;
+
+/// Downloads packuments from the npm registry.
+///
+/// This is shared amongst all the workers.
+#[derive(Debug)]
+pub struct RegistryInfoDownloader {
+ cache: Arc<NpmCache>,
+ npmrc: Arc<ResolvedNpmRc>,
+ progress_bar: ProgressBar,
+ memory_cache: Mutex<HashMap<String, MemoryCacheItem>>,
+}
+
+impl RegistryInfoDownloader {
+ pub fn new(
+ cache: Arc<NpmCache>,
+ npmrc: Arc<ResolvedNpmRc>,
+ progress_bar: ProgressBar,
+ ) -> Self {
+ Self {
+ cache,
+ npmrc,
+ progress_bar,
+ memory_cache: Default::default(),
+ }
+ }
+
+ pub async fn load_package_info(
+ &self,
+ name: &str,
+ current_runtime_http_client: &Arc<HttpClient>,
+ ) -> Result<Option<Arc<NpmPackageInfo>>, AnyError> {
+ let registry_url = self.npmrc.get_registry_url(name);
+ let registry_config = self.npmrc.get_registry_config(name);
+
+ self
+ .load_package_info_inner(
+ name,
+ registry_url,
+ registry_config,
+ current_runtime_http_client,
+ )
+ .await
+ .with_context(|| {
+ format!(
+ "Error getting response at {} for package \"{}\"",
+ self.get_package_url(name, registry_url),
+ name
+ )
+ })
+ }
+
+ async fn load_package_info_inner(
+ &self,
+ name: &str,
+ registry_url: &Url,
+ registry_config: &RegistryConfig,
+ current_runtime_http_client: &Arc<HttpClient>,
+ ) -> Result<Option<Arc<NpmPackageInfo>>, AnyError> {
+ if *self.cache.cache_setting() == CacheSetting::Only {
+ return Err(custom_error(
+ "NotCached",
+ format!(
+ "An npm specifier not found in cache: \"{name}\", --cached-only is specified."
+ )
+ ));
+ }
+
+ let (created, cache_item) = {
+ let mut mem_cache = self.memory_cache.lock();
+ if let Some(cache_item) = mem_cache.get(name) {
+ (false, cache_item.clone())
+ } else {
+ let future = self.create_load_future(
+ name,
+ registry_url,
+ registry_config,
+ current_runtime_http_client,
+ );
+ let cache_item = MemoryCacheItem::PendingFuture(future);
+ mem_cache.insert(name.to_string(), cache_item.clone());
+ (true, cache_item)
+ }
+ };
+ match cache_item {
+ MemoryCacheItem::FsCached => {
+        // this packument was previously downloaded from the registry, so we can load it from the file system cache
+ self
+ .load_file_cached_package_info(name)
+ .await
+ .map(|info| Some(Arc::new(info)))
+ }
+ MemoryCacheItem::MemoryCached(maybe_info) => {
+ maybe_info.clone().map_err(|e| anyhow!("{}", e))
+ }
+ MemoryCacheItem::PendingFuture(future) => {
+ if created {
+ match future.await {
+ Ok(FutureResult::SavedFsCache(info)) => {
+              // mark this package as having been saved in the file
+              // system cache for the next time it's requested
+ *self.memory_cache.lock().get_mut(name).unwrap() =
+ MemoryCacheItem::FsCached;
+ Ok(Some(info))
+ }
+ Ok(FutureResult::ErroredFsCache(info)) => {
+ // since saving to the fs cache failed, keep the package information in memory
+ *self.memory_cache.lock().get_mut(name).unwrap() =
+ MemoryCacheItem::MemoryCached(Ok(Some(info.clone())));
+ Ok(Some(info))
+ }
+ Ok(FutureResult::PackageNotExists) => {
+ *self.memory_cache.lock().get_mut(name).unwrap() =
+ MemoryCacheItem::MemoryCached(Ok(None));
+ Ok(None)
+ }
+ Err(err) => {
+ let return_err = anyhow!("{}", err);
+ *self.memory_cache.lock().get_mut(name).unwrap() =
+ MemoryCacheItem::MemoryCached(Err(err));
+ Err(return_err)
+ }
+ }
+ } else {
+ match future.await {
+ Ok(FutureResult::SavedFsCache(info)) => Ok(Some(info)),
+ Ok(FutureResult::ErroredFsCache(info)) => Ok(Some(info)),
+ Ok(FutureResult::PackageNotExists) => Ok(None),
+ Err(err) => Err(anyhow!("{}", err)),
+ }
+ }
+ }
+ }
+ }
+
+ async fn load_file_cached_package_info(
+ &self,
+ name: &str,
+ ) -> Result<NpmPackageInfo, AnyError> {
+    // this scenario failing should be exceptionally rare, so only
+    // bother improving it if someone actually runs into an issue
+ let maybe_package_info = deno_core::unsync::spawn_blocking({
+ let cache = self.cache.clone();
+ let name = name.to_string();
+ move || cache.load_package_info(&name)
+ })
+ .await
+ .unwrap()
+ .with_context(|| {
+ format!(
+ "Previously saved '{}' from the npm cache, but now it fails to load.",
+ name
+ )
+ })?;
+ match maybe_package_info {
+ Some(package_info) => Ok(package_info),
+ None => {
+ bail!("The package '{}' previously saved its registry information to the file system cache, but that file no longer exists.", name)
+ }
+ }
+ }
+
+ fn create_load_future(
+ &self,
+ name: &str,
+ registry_url: &Url,
+ registry_config: &RegistryConfig,
+ current_runtime_http_client: &Arc<HttpClient>,
+ ) -> Shared<PendingRegistryLoadFuture> {
+ let package_url = self.get_package_url(name, registry_url);
+ let maybe_auth_header = maybe_auth_header_for_npm_registry(registry_config);
+ let guard = self.progress_bar.update(package_url.as_str());
+ let cache = self.cache.clone();
+ let http_client = current_runtime_http_client.clone();
+ let name = name.to_string();
+    // force this future to be polled on the current runtime because it's
+    // not safe to share `HttpClient`s across runtimes, and because a
+    // restart of npm resolution might mean this package is never resolved
+    // again, which would leave the future unpolled
+ deno_core::unsync::spawn(async move {
+ let maybe_bytes = http_client
+ .download_with_progress(package_url, maybe_auth_header, &guard)
+ .await?;
+ match maybe_bytes {
+ Some(bytes) => {
+ let future_result = deno_core::unsync::spawn_blocking(
+ move || -> Result<FutureResult, AnyError> {
+ let package_info = serde_json::from_slice(&bytes)?;
+ match cache.save_package_info(&name, &package_info) {
+ Ok(()) => {
+ Ok(FutureResult::SavedFsCache(Arc::new(package_info)))
+ }
+ Err(err) => {
+ log::debug!(
+ "Error saving package {} to cache: {:#}",
+ name,
+ err
+ );
+ Ok(FutureResult::ErroredFsCache(Arc::new(package_info)))
+ }
+ }
+ },
+ )
+ .await??;
+ Ok(future_result)
+ }
+ None => Ok(FutureResult::PackageNotExists),
+ }
+ })
+ .map(|result| result.unwrap().map_err(Arc::new))
+ .boxed()
+ .shared()
+ }
+
+ fn get_package_url(&self, name: &str, registry_url: &Url) -> Url {
+    // list of all characters used in npm package names:
+    // !, ', (, ), *, -, ., /, [0-9], @, [A-Za-z], _, ~
+ const ASCII_SET: percent_encoding::AsciiSet =
+ percent_encoding::NON_ALPHANUMERIC
+ .remove(b'!')
+ .remove(b'\'')
+ .remove(b'(')
+ .remove(b')')
+ .remove(b'*')
+ .remove(b'-')
+ .remove(b'.')
+ .remove(b'/')
+ .remove(b'@')
+ .remove(b'_')
+ .remove(b'~');
+ let name = percent_encoding::utf8_percent_encode(name, &ASCII_SET);
+ registry_url.join(&name.to_string()).unwrap()
+ }
+}
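
The memory cache above hinges on `futures::future::Shared`: the first caller for a name creates and stores the pending future, and every concurrent caller awaits a clone of it, so a packument is requested at most once per run. A minimal sketch of that deduplication, assuming the `futures` and `tokio` crates and a hypothetical `Deduper` type (not Deno's actual internals):

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    use futures::future::{BoxFuture, FutureExt, Shared};

    type LoadFuture = Shared<BoxFuture<'static, Result<Arc<String>, Arc<String>>>>;

    #[derive(Default)]
    struct Deduper {
        memory_cache: Mutex<HashMap<String, LoadFuture>>,
    }

    impl Deduper {
        // returns the in-flight future for `name`, creating it on first
        // call, so concurrent callers all await a single download
        fn load(&self, name: &str) -> LoadFuture {
            let mut cache = self.memory_cache.lock().unwrap();
            if let Some(fut) = cache.get(name) {
                return fut.clone();
            }
            let key = name.to_string();
            let fut = async move {
                // stand-in for the real registry request
                Ok(Arc::new(format!("packument for {}", key)))
            }
            .boxed()
            .shared();
            cache.insert(name.to_string(), fut.clone());
            fut
        }
    }

    #[tokio::main]
    async fn main() {
        let deduper = Deduper::default();
        // the second call reuses the future created by the first
        let (a, b) = tokio::join!(deduper.load("chalk"), deduper.load("chalk"));
        assert_eq!(a.unwrap(), b.unwrap());
    }

The real code additionally replaces the map entry once the future resolves (FsCached, MemoryCached, etc.) so later callers skip the future entirely; the sketch keeps only the deduplication step.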
diff --git a/cli/npm/managed/cache/tarball.rs b/cli/npm/managed/cache/tarball.rs
new file mode 100644
index 000000000..9848aca13
--- /dev/null
+++ b/cli/npm/managed/cache/tarball.rs
@@ -0,0 +1,210 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use deno_core::anyhow::anyhow;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::custom_error;
+use deno_core::error::AnyError;
+use deno_core::futures::future::BoxFuture;
+use deno_core::futures::future::Shared;
+use deno_core::futures::FutureExt;
+use deno_core::parking_lot::Mutex;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageVersionDistInfo;
+use deno_runtime::deno_fs::FileSystem;
+use deno_semver::package::PackageNv;
+
+use crate::args::CacheSetting;
+use crate::http_util::HttpClient;
+use crate::npm::common::maybe_auth_header_for_npm_registry;
+use crate::util::progress_bar::ProgressBar;
+
+use super::tarball_extract::verify_and_extract_tarball;
+use super::tarball_extract::TarballExtractionMode;
+use super::NpmCache;
+
+// todo(dsherret): create seams and unit test this
+
+#[derive(Debug, Clone)]
+enum MemoryCacheItem {
+ /// The cache item hasn't finished yet.
+ PendingFuture(Shared<BoxFuture<'static, Result<(), Arc<AnyError>>>>),
+ /// The result errored.
+ Errored(Arc<AnyError>),
+ /// This package has already been cached.
+ Cached,
+}
+
+/// Coordinates caching of tarballs being loaded from
+/// the npm registry.
+///
+/// This is shared amongst all the workers.
+#[derive(Debug)]
+pub struct TarballCache {
+ cache: Arc<NpmCache>,
+ fs: Arc<dyn FileSystem>,
+ npmrc: Arc<ResolvedNpmRc>,
+ progress_bar: ProgressBar,
+ memory_cache: Mutex<HashMap<PackageNv, MemoryCacheItem>>,
+}
+
+impl TarballCache {
+ pub fn new(
+ cache: Arc<NpmCache>,
+ fs: Arc<dyn FileSystem>,
+ npmrc: Arc<ResolvedNpmRc>,
+ progress_bar: ProgressBar,
+ ) -> Self {
+ Self {
+ cache,
+ fs,
+ npmrc,
+ progress_bar,
+ memory_cache: Default::default(),
+ }
+ }
+
+ pub async fn ensure_package(
+ &self,
+ package: &PackageNv,
+ dist: &NpmPackageVersionDistInfo,
+ // it's not safe to share these across runtimes
+ http_client_for_runtime: &Arc<HttpClient>,
+ ) -> Result<(), AnyError> {
+ self
+ .ensure_package_inner(package, dist, http_client_for_runtime)
+ .await
+ .with_context(|| format!("Failed caching npm package '{}'.", package))
+ }
+
+ async fn ensure_package_inner(
+ &self,
+ package_nv: &PackageNv,
+ dist: &NpmPackageVersionDistInfo,
+ http_client_for_runtime: &Arc<HttpClient>,
+ ) -> Result<(), AnyError> {
+ let (created, cache_item) = {
+ let mut mem_cache = self.memory_cache.lock();
+ if let Some(cache_item) = mem_cache.get(package_nv) {
+ (false, cache_item.clone())
+ } else {
+ let future = self.create_setup_future(
+ package_nv.clone(),
+ dist.clone(),
+ http_client_for_runtime.clone(),
+ );
+ let cache_item = MemoryCacheItem::PendingFuture(future);
+ mem_cache.insert(package_nv.clone(), cache_item.clone());
+ (true, cache_item)
+ }
+ };
+
+ match cache_item {
+ MemoryCacheItem::Cached => Ok(()),
+ MemoryCacheItem::Errored(err) => Err(anyhow!("{}", err)),
+ MemoryCacheItem::PendingFuture(future) => {
+ if created {
+ match future.await {
+ Ok(_) => {
+ *self.memory_cache.lock().get_mut(package_nv).unwrap() =
+ MemoryCacheItem::Cached;
+ Ok(())
+ }
+ Err(err) => {
+ let result_err = anyhow!("{}", err);
+ *self.memory_cache.lock().get_mut(package_nv).unwrap() =
+ MemoryCacheItem::Errored(err);
+ Err(result_err)
+ }
+ }
+ } else {
+ future.await.map_err(|err| anyhow!("{}", err))
+ }
+ }
+ }
+ }
+
+ fn create_setup_future(
+ &self,
+ package_nv: PackageNv,
+ dist: NpmPackageVersionDistInfo,
+ http_client_for_runtime: Arc<HttpClient>,
+ ) -> Shared<BoxFuture<'static, Result<(), Arc<AnyError>>>> {
+ let registry_url = self.npmrc.get_registry_url(&package_nv.name);
+ let registry_config =
+ self.npmrc.get_registry_config(&package_nv.name).clone();
+
+ let cache = self.cache.clone();
+ let fs = self.fs.clone();
+ let progress_bar = self.progress_bar.clone();
+ let package_folder =
+ cache.package_folder_for_nv_and_url(&package_nv, registry_url);
+
+ deno_core::unsync::spawn(async move {
+ let should_use_cache = cache.should_use_cache_for_package(&package_nv);
+ let package_folder_exists = fs.exists_sync(&package_folder);
+ if should_use_cache && package_folder_exists {
+ return Ok(());
+ } else if cache.cache_setting() == &CacheSetting::Only {
+ return Err(custom_error(
+ "NotCached",
+ format!(
+ "An npm specifier not found in cache: \"{}\", --cached-only is specified.",
+ &package_nv.name
+ )
+ )
+ );
+ }
+
+ if dist.tarball.is_empty() {
+ bail!("Tarball URL was empty.");
+ }
+
+ let maybe_auth_header =
+ maybe_auth_header_for_npm_registry(&registry_config);
+
+ let guard = progress_bar.update(&dist.tarball);
+ let maybe_bytes = http_client_for_runtime
+ .download_with_progress(&dist.tarball, maybe_auth_header, &guard)
+ .await?;
+ match maybe_bytes {
+ Some(bytes) => {
+ let extraction_mode = if should_use_cache || !package_folder_exists {
+ TarballExtractionMode::SiblingTempDir
+ } else {
+ // The user ran with `--reload`, so overwrite the package instead of
+ // deleting it since the package might get corrupted if a user kills
+ // their deno process while it's deleting a package directory
+ //
+ // We can't rename this folder and delete it because the folder
+ // may be in use by another process or may now contain hardlinks,
+ // which will cause windows to throw an "AccessDenied" error when
+ // renaming. So we settle for overwriting.
+ TarballExtractionMode::Overwrite
+ };
+ let dist = dist.clone();
+ let package_nv = package_nv.clone();
+ deno_core::unsync::spawn_blocking(move || {
+ verify_and_extract_tarball(
+ &package_nv,
+ &bytes,
+ &dist,
+ &package_folder,
+ extraction_mode,
+ )
+ })
+ .await?
+ }
+ None => {
+ bail!("Could not find npm package tarball at: {}", dist.tarball);
+ }
+ }
+ })
+ .map(|result| result.unwrap().map_err(Arc::new))
+ .boxed()
+ .shared()
+ }
+}
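
Both caches end their setup futures with `.map(|result| result.unwrap().map_err(Arc::new)).boxed().shared()`. The `unwrap` is on tokio's join result (the eagerly spawned task is never cancelled), and the `Arc` wrap exists because `AnyError` isn't `Clone` while `Shared` requires a `Clone` output. A runnable sketch of the same shape, assuming the `tokio` and `futures` crates and substituting `std::io::Error` for `AnyError`:

    use std::io::{Error, ErrorKind};
    use std::sync::Arc;

    use futures::future::{BoxFuture, FutureExt, Shared};

    fn shared_setup() -> Shared<BoxFuture<'static, Result<(), Arc<Error>>>> {
        tokio::spawn(async {
            // stand-in for download + extract
            Err::<(), Error>(Error::new(ErrorKind::Other, "network down"))
        })
        // the task is never cancelled, so the JoinHandle result is safe to
        // unwrap; the inner error is wrapped in Arc so the output is Clone
        .map(|result| result.unwrap().map_err(Arc::new))
        .boxed()
        .shared()
    }

    #[tokio::main]
    async fn main() {
        let fut = shared_setup();
        // both awaits observe a clone of the same Arc'd error
        assert!(fut.clone().await.is_err());
        assert!(fut.await.is_err());
    }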
diff --git a/cli/npm/managed/cache/tarball_extract.rs b/cli/npm/managed/cache/tarball_extract.rs
new file mode 100644
index 000000000..e2d242e66
--- /dev/null
+++ b/cli/npm/managed/cache/tarball_extract.rs
@@ -0,0 +1,324 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::path::PathBuf;
+
+use base64::prelude::BASE64_STANDARD;
+use base64::Engine;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::AnyError;
+use deno_npm::registry::NpmPackageVersionDistInfo;
+use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
+use deno_semver::package::PackageNv;
+use flate2::read::GzDecoder;
+use tar::Archive;
+use tar::EntryType;
+
+use crate::util::path::get_atomic_dir_path;
+
+#[derive(Debug, Copy, Clone)]
+pub enum TarballExtractionMode {
+ /// Overwrites the destination directory without deleting any files.
+ Overwrite,
+ /// Creates and writes to a sibling temporary directory. When done, moves
+ /// it to the final destination.
+ ///
+ /// This is more robust than `Overwrite` as it better handles multiple
+ /// processes writing to the directory at the same time.
+ SiblingTempDir,
+}
+
+pub fn verify_and_extract_tarball(
+ package_nv: &PackageNv,
+ data: &[u8],
+ dist_info: &NpmPackageVersionDistInfo,
+ output_folder: &Path,
+ extraction_mode: TarballExtractionMode,
+) -> Result<(), AnyError> {
+ verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
+
+ match extraction_mode {
+ TarballExtractionMode::Overwrite => extract_tarball(data, output_folder),
+ TarballExtractionMode::SiblingTempDir => {
+ let temp_dir = get_atomic_dir_path(output_folder);
+ extract_tarball(data, &temp_dir)?;
+ rename_with_retries(&temp_dir, output_folder)
+ .map_err(AnyError::from)
+ .context("Failed moving extracted tarball to final destination.")
+ }
+ }
+}
+
+fn rename_with_retries(
+ temp_dir: &Path,
+ output_folder: &Path,
+) -> Result<(), std::io::Error> {
+ fn already_exists(err: &std::io::Error, output_folder: &Path) -> bool {
+ // Windows will do an "Access is denied" error
+ err.kind() == ErrorKind::AlreadyExists || output_folder.exists()
+ }
+
+ let mut count = 0;
+ // renaming might be flaky if a lot of processes are trying
+ // to do this, so retry a few times
+ loop {
+ match fs::rename(temp_dir, output_folder) {
+ Ok(_) => return Ok(()),
+ Err(err) if already_exists(&err, output_folder) => {
+        // another process already moved its folder here, so just clean up the temp dir
+ let _ = fs::remove_dir_all(temp_dir);
+ return Ok(());
+ }
+ Err(err) => {
+ count += 1;
+ if count > 5 {
+ // too many retries, cleanup and return the error
+ let _ = fs::remove_dir_all(temp_dir);
+ return Err(err);
+ }
+
+ // wait a bit before retrying... this should be very rare or only
+ // in error cases, so ok to sleep a bit
+ let sleep_ms = std::cmp::min(100, 20 * count);
+ std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
+ }
+ }
+ }
+}
+
+fn verify_tarball_integrity(
+ package: &PackageNv,
+ data: &[u8],
+ npm_integrity: &NpmPackageVersionDistInfoIntegrity,
+) -> Result<(), AnyError> {
+ use ring::digest::Context;
+ let (tarball_checksum, expected_checksum) = match npm_integrity {
+ NpmPackageVersionDistInfoIntegrity::Integrity {
+ algorithm,
+ base64_hash,
+ } => {
+ let algo = match *algorithm {
+ "sha512" => &ring::digest::SHA512,
+ "sha1" => &ring::digest::SHA1_FOR_LEGACY_USE_ONLY,
+ hash_kind => bail!(
+ "Not implemented hash function for {}: {}",
+ package,
+ hash_kind
+ ),
+ };
+ let mut hash_ctx = Context::new(algo);
+ hash_ctx.update(data);
+ let digest = hash_ctx.finish();
+ let tarball_checksum = BASE64_STANDARD.encode(digest.as_ref());
+ (tarball_checksum, base64_hash)
+ }
+ NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(hex) => {
+ let mut hash_ctx = Context::new(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY);
+ hash_ctx.update(data);
+ let digest = hash_ctx.finish();
+ let tarball_checksum = faster_hex::hex_string(digest.as_ref());
+ (tarball_checksum, hex)
+ }
+ NpmPackageVersionDistInfoIntegrity::UnknownIntegrity(integrity) => {
+ bail!(
+ "Not implemented integrity kind for {}: {}",
+ package,
+ integrity
+ )
+ }
+ };
+
+ if tarball_checksum != *expected_checksum {
+ bail!(
+ "Tarball checksum did not match what was provided by npm registry for {}.\n\nExpected: {}\nActual: {}",
+ package,
+ expected_checksum,
+ tarball_checksum,
+ )
+ }
+ Ok(())
+}
+
+fn extract_tarball(data: &[u8], output_folder: &Path) -> Result<(), AnyError> {
+ fs::create_dir_all(output_folder)?;
+ let output_folder = fs::canonicalize(output_folder)?;
+ let tar = GzDecoder::new(data);
+ let mut archive = Archive::new(tar);
+ archive.set_overwrite(true);
+ archive.set_preserve_permissions(true);
+ let mut created_dirs = HashSet::new();
+
+ for entry in archive.entries()? {
+ let mut entry = entry?;
+ let path = entry.path()?;
+ let entry_type = entry.header().entry_type();
+
+    // Some package tarballs contain a "pax_global_header" entry;
+    // these entries should be skipped.
+ if entry_type == EntryType::XGlobalHeader {
+ continue;
+ }
+
+ // skip the first component which will be either "package" or the name of the package
+ let relative_path = path.components().skip(1).collect::<PathBuf>();
+ let absolute_path = output_folder.join(relative_path);
+ let dir_path = if entry_type == EntryType::Directory {
+ absolute_path.as_path()
+ } else {
+ absolute_path.parent().unwrap()
+ };
+ if created_dirs.insert(dir_path.to_path_buf()) {
+ fs::create_dir_all(dir_path)?;
+ let canonicalized_dir = fs::canonicalize(dir_path)?;
+ if !canonicalized_dir.starts_with(&output_folder) {
+ bail!(
+ "Extracted directory '{}' of npm tarball was not in output directory.",
+ canonicalized_dir.display()
+ )
+ }
+ }
+
+ let entry_type = entry.header().entry_type();
+ match entry_type {
+ EntryType::Regular => {
+ entry.unpack(&absolute_path)?;
+ }
+ EntryType::Symlink | EntryType::Link => {
+ // At the moment, npm doesn't seem to support uploading hardlinks or
+ // symlinks to the npm registry. If ever adding symlink or hardlink
+ // support, we will need to validate that the hardlink and symlink
+ // target are within the package directory.
+ log::warn!(
+ "Ignoring npm tarball entry type {:?} for '{}'",
+ entry_type,
+ absolute_path.display()
+ )
+ }
+ _ => {
+ // ignore
+ }
+ }
+ }
+ Ok(())
+}
+
+#[cfg(test)]
+mod test {
+ use deno_semver::Version;
+ use test_util::TempDir;
+
+ use super::*;
+
+ #[test]
+ pub fn test_verify_tarball() {
+ let package = PackageNv {
+ name: "package".to_string(),
+ version: Version::parse_from_npm("1.0.0").unwrap(),
+ };
+ let actual_checksum =
+ "z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg/SpIdNs6c5H0NE8XYXysP+DGNKHfuwvY7kxvUdBeoGlODJ6+SfaPg==";
+ assert_eq!(
+ verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::UnknownIntegrity("test")
+ )
+ .unwrap_err()
+ .to_string(),
+ "Not implemented integrity kind for package@1.0.0: test",
+ );
+ assert_eq!(
+ verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::Integrity {
+ algorithm: "notimplemented",
+ base64_hash: "test"
+ }
+ )
+ .unwrap_err()
+ .to_string(),
+ "Not implemented hash function for package@1.0.0: notimplemented",
+ );
+ assert_eq!(
+ verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::Integrity {
+ algorithm: "sha1",
+ base64_hash: "test"
+ }
+ )
+ .unwrap_err()
+ .to_string(),
+ concat!(
+ "Tarball checksum did not match what was provided by npm ",
+ "registry for package@1.0.0.\n\nExpected: test\nActual: 2jmj7l5rSw0yVb/vlWAYkK/YBwk=",
+ ),
+ );
+ assert_eq!(
+ verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::Integrity {
+ algorithm: "sha512",
+ base64_hash: "test"
+ }
+ )
+ .unwrap_err()
+ .to_string(),
+ format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_checksum}"),
+ );
+ assert!(verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::Integrity {
+ algorithm: "sha512",
+ base64_hash: actual_checksum,
+ },
+ )
+ .is_ok());
+ let actual_hex = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
+ assert_eq!(
+ verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex("test"),
+ )
+ .unwrap_err()
+ .to_string(),
+ format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_hex}"),
+ );
+ assert!(verify_tarball_integrity(
+ &package,
+ &Vec::new(),
+ &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(actual_hex),
+ )
+ .is_ok());
+ }
+
+ #[test]
+ fn rename_with_retries_succeeds_exists() {
+ let temp_dir = TempDir::new();
+ let folder_1 = temp_dir.path().join("folder_1");
+ let folder_2 = temp_dir.path().join("folder_2");
+
+ folder_1.create_dir_all();
+ folder_1.join("a.txt").write("test");
+ folder_2.create_dir_all();
+    // this will not end up in the output because rename_with_retries assumes
+    // any folder already at the destination has the same contents
+ folder_2.join("b.txt").write("test2");
+
+ let dest_folder = temp_dir.path().join("dest_folder");
+
+ rename_with_retries(folder_1.as_path(), dest_folder.as_path()).unwrap();
+ rename_with_retries(folder_2.as_path(), dest_folder.as_path()).unwrap();
+ assert!(dest_folder.join("a.txt").exists());
+ assert!(!dest_folder.join("b.txt").exists());
+ }
+}
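
The sha512 branch of verify_tarball_integrity reduces to hashing the tarball bytes and base64-encoding the digest. A standalone sketch, assuming the `ring` and `base64` crates (the same ones the file imports) and a hypothetical `matches_sha512` helper; the expected string is the digest of empty input used by the test above:

    use base64::prelude::BASE64_STANDARD;
    use base64::Engine;
    use ring::digest;

    // returns true when `data` hashes to the base64 sha512 digest advertised
    // by an npm packument (the part after "sha512-" in `dist.integrity`)
    fn matches_sha512(data: &[u8], expected_base64: &str) -> bool {
        let digest = digest::digest(&digest::SHA512, data);
        BASE64_STANDARD.encode(digest.as_ref()) == expected_base64
    }

    fn main() {
        // the sha512 of empty input, as used in test_verify_tarball above
        let expected = "z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg/SpIdNs6c5H0NE8XYXysP+DGNKHfuwvY7kxvUdBeoGlODJ6+SfaPg==";
        assert!(matches_sha512(&[], expected));
    }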