Diffstat (limited to 'cli/npm/managed/cache')
-rw-r--r-- | cli/npm/managed/cache/mod.rs             | 254
-rw-r--r-- | cli/npm/managed/cache/registry_info.rs   | 284
-rw-r--r-- | cli/npm/managed/cache/tarball.rs         | 210
-rw-r--r-- | cli/npm/managed/cache/tarball_extract.rs | 324
4 files changed, 1072 insertions, 0 deletions
diff --git a/cli/npm/managed/cache/mod.rs b/cli/npm/managed/cache/mod.rs
new file mode 100644
index 000000000..f409744b9
--- /dev/null
+++ b/cli/npm/managed/cache/mod.rs
@@ -0,0 +1,254 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use deno_ast::ModuleSpecifier;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::AnyError;
+use deno_core::parking_lot::Mutex;
+use deno_core::serde_json;
+use deno_core::url::Url;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageInfo;
+use deno_npm::NpmPackageCacheFolderId;
+use deno_semver::package::PackageNv;
+
+use crate::args::CacheSetting;
+use crate::cache::CACHE_PERM;
+use crate::npm::NpmCacheDir;
+use crate::util::fs::atomic_write_file_with_retries;
+use crate::util::fs::hard_link_dir_recursive;
+
+mod registry_info;
+mod tarball;
+mod tarball_extract;
+
+pub use registry_info::RegistryInfoDownloader;
+pub use tarball::TarballCache;
+
+/// Stores a single copy of npm packages in a cache.
+#[derive(Debug)]
+pub struct NpmCache {
+  cache_dir: NpmCacheDir,
+  cache_setting: CacheSetting,
+  npmrc: Arc<ResolvedNpmRc>,
+  /// ensures a package is only downloaded once per run
+  previously_reloaded_packages: Mutex<HashSet<PackageNv>>,
+}
+
+impl NpmCache {
+  pub fn new(
+    cache_dir: NpmCacheDir,
+    cache_setting: CacheSetting,
+    npmrc: Arc<ResolvedNpmRc>,
+  ) -> Self {
+    Self {
+      cache_dir,
+      cache_setting,
+      previously_reloaded_packages: Default::default(),
+      npmrc,
+    }
+  }
+
+  pub fn cache_setting(&self) -> &CacheSetting {
+    &self.cache_setting
+  }
+
+  pub fn root_dir_url(&self) -> &Url {
+    self.cache_dir.root_dir_url()
+  }
+
+  /// Checks if the cache should be used for the provided name and version.
+  /// NOTE: Subsequent calls for the same package will always return `true`
+  /// to ensure a package is only downloaded once per run of the CLI. This
+  /// prevents downloads from re-occurring when someone has `--reload` and
+  /// a dynamic import imports the same package again, for example.
+  pub fn should_use_cache_for_package(&self, package: &PackageNv) -> bool {
+    self.cache_setting.should_use_for_npm_package(&package.name)
+      || !self
+        .previously_reloaded_packages
+        .lock()
+        .insert(package.clone())
+  }
+
+  /// Ensures a copy of the package exists in the global cache.
+  ///
+  /// This assumes that the original package folder being hard linked
+  /// from exists before this is called.
+  pub fn ensure_copy_package(
+    &self,
+    folder_id: &NpmPackageCacheFolderId,
+  ) -> Result<(), AnyError> {
+    let registry_url = self.npmrc.get_registry_url(&folder_id.nv.name);
+    assert_ne!(folder_id.copy_index, 0);
+    let package_folder = self
+      .cache_dir
+      .package_folder_for_id(folder_id, registry_url);
+
+    if package_folder.exists()
+      // if this file exists, then the package didn't successfully initialize
+      // the first time, or another process is currently extracting the zip file
+      && !package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME).exists()
+      && self.cache_setting.should_use_for_npm_package(&folder_id.nv.name)
+    {
+      return Ok(());
+    }
+
+    let original_package_folder = self
+      .cache_dir
+      .package_folder_for_nv(&folder_id.nv, registry_url);
+
+    // it seems Windows does an "AccessDenied" error when moving a
+    // directory with hard links, so that's why this solution is done
+    with_folder_sync_lock(&folder_id.nv, &package_folder, || {
+      hard_link_dir_recursive(&original_package_folder, &package_folder)
+    })?;
+    Ok(())
+  }
+
+  pub fn package_folder_for_id(&self, id: &NpmPackageCacheFolderId) -> PathBuf {
+    let registry_url = self.npmrc.get_registry_url(&id.nv.name);
+    self.cache_dir.package_folder_for_id(id, registry_url)
+  }
+
+  pub fn package_folder_for_nv(&self, package: &PackageNv) -> PathBuf {
+    let registry_url = self.npmrc.get_registry_url(&package.name);
+    self.package_folder_for_nv_and_url(package, registry_url)
+  }
+
+  pub fn package_folder_for_nv_and_url(
+    &self,
+    package: &PackageNv,
+    registry_url: &Url,
+  ) -> PathBuf {
+    self.cache_dir.package_folder_for_nv(package, registry_url)
+  }
+
+  pub fn package_name_folder(&self, name: &str) -> PathBuf {
+    let registry_url = self.npmrc.get_registry_url(name);
+    self.cache_dir.package_name_folder(name, registry_url)
+  }
+
+  pub fn root_folder(&self) -> PathBuf {
+    self.cache_dir.root_dir().to_owned()
+  }
+
+  pub fn resolve_package_folder_id_from_specifier(
+    &self,
+    specifier: &ModuleSpecifier,
+  ) -> Option<NpmPackageCacheFolderId> {
+    self
+      .cache_dir
+      .resolve_package_folder_id_from_specifier(specifier)
+  }
+
+  pub fn load_package_info(
+    &self,
+    name: &str,
+  ) -> Result<Option<NpmPackageInfo>, AnyError> {
+    let file_cache_path = self.get_registry_package_info_file_cache_path(name);
+
+    let file_text = match fs::read_to_string(file_cache_path) {
+      Ok(file_text) => file_text,
+      Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
+      Err(err) => return Err(err.into()),
+    };
+    Ok(serde_json::from_str(&file_text)?)
+  }
+
+  pub fn save_package_info(
+    &self,
+    name: &str,
+    package_info: &NpmPackageInfo,
+  ) -> Result<(), AnyError> {
+    let file_cache_path = self.get_registry_package_info_file_cache_path(name);
+    let file_text = serde_json::to_string(&package_info)?;
+    atomic_write_file_with_retries(&file_cache_path, file_text, CACHE_PERM)?;
+    Ok(())
+  }
+
+  fn get_registry_package_info_file_cache_path(&self, name: &str) -> PathBuf {
+    let name_folder_path = self.package_name_folder(name);
+    name_folder_path.join("registry.json")
+  }
+}
+
+const NPM_PACKAGE_SYNC_LOCK_FILENAME: &str = ".deno_sync_lock";
+
+fn with_folder_sync_lock(
+  package: &PackageNv,
+  output_folder: &Path,
+  action: impl FnOnce() -> Result<(), AnyError>,
+) -> Result<(), AnyError> {
+  fn inner(
+    output_folder: &Path,
+    action: impl FnOnce() -> Result<(), AnyError>,
+  ) -> Result<(), AnyError> {
+    fs::create_dir_all(output_folder).with_context(|| {
+      format!("Error creating '{}'.", output_folder.display())
+    })?;
+
+    // This sync lock file is a way to ensure that partially created
+    // npm package directories aren't considered valid. This could maybe
+    // be a bit smarter in the future to not bother extracting here
+    // if another process has taken the lock in the past X seconds and
+    // wait for the other process to finish (it could try to create the
+    // file with `create_new(true)` then if it exists, check the metadata
+    // then wait until the other process finishes with a timeout), but
+    // for now this is good enough.
+    let sync_lock_path = output_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME);
+    match fs::OpenOptions::new()
+      .write(true)
+      .create(true)
+      .truncate(false)
+      .open(&sync_lock_path)
+    {
+      Ok(_) => {
+        action()?;
+        // extraction succeeded, so only now delete this file
+        let _ignore = std::fs::remove_file(&sync_lock_path);
+        Ok(())
+      }
+      Err(err) => {
+        bail!(
+          concat!(
+            "Error creating package sync lock file at '{}'. ",
+            "Maybe try manually deleting this folder.\n\n{:#}",
+          ),
+          output_folder.display(),
+          err
+        );
+      }
+    }
+  }
+
+  match inner(output_folder, action) {
+    Ok(()) => Ok(()),
+    Err(err) => {
+      if let Err(remove_err) = fs::remove_dir_all(output_folder) {
+        if remove_err.kind() != std::io::ErrorKind::NotFound {
+          bail!(
+            concat!(
+              "Failed setting up package cache directory for {}, then ",
+              "failed cleaning it up.\n\nOriginal error:\n\n{}\n\n",
+              "Remove error:\n\n{}\n\nPlease manually ",
+              "delete this folder or you will run into issues using this ",
+              "package in the future:\n\n{}"
+            ),
+            package,
+            err,
+            remove_err,
+            output_folder.display(),
+          );
+        }
+      }
+      Err(err)
+    }
+  }
+}
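
The sync-lock protocol in mod.rs above is the piece worth internalizing: a package folder only counts as valid when it exists and carries no leftover `.deno_sync_lock` file. A minimal std-only sketch of that idea follows; `is_valid_package_dir` and `with_sync_lock` are illustrative names, not code from this diff.

    use std::fs;
    use std::io;
    use std::path::Path;

    const SYNC_LOCK: &str = ".deno_sync_lock";

    // A folder is only trusted when a previous initialization ran to
    // completion: it exists and no leftover lock file marks it as partial.
    fn is_valid_package_dir(dir: &Path) -> bool {
      dir.exists() && !dir.join(SYNC_LOCK).exists()
    }

    // Create the folder, drop a lock file, run the initialization action,
    // and delete the lock only after the action succeeds.
    fn with_sync_lock(
      dir: &Path,
      action: impl FnOnce() -> io::Result<()>,
    ) -> io::Result<()> {
      fs::create_dir_all(dir)?;
      let lock = dir.join(SYNC_LOCK);
      fs::OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(false)
        .open(&lock)?;
      action()?;
      // a crash before this line leaves the lock behind, so the folder
      // stays marked as partially initialized
      let _ = fs::remove_file(&lock);
      Ok(())
    }

If the process dies between creating the lock and the action completing, the lock file survives, so the next run treats the folder as partial and redoes the work (the real helper above additionally deletes the whole folder when the action returns an error).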
diff --git a/cli/npm/managed/cache/registry_info.rs b/cli/npm/managed/cache/registry_info.rs
new file mode 100644
index 000000000..ea6b47969
--- /dev/null
+++ b/cli/npm/managed/cache/registry_info.rs
@@ -0,0 +1,284 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use deno_core::anyhow::anyhow;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::custom_error;
+use deno_core::error::AnyError;
+use deno_core::futures::future::BoxFuture;
+use deno_core::futures::future::Shared;
+use deno_core::futures::FutureExt;
+use deno_core::parking_lot::Mutex;
+use deno_core::serde_json;
+use deno_core::url::Url;
+use deno_npm::npm_rc::RegistryConfig;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageInfo;
+
+use crate::args::CacheSetting;
+use crate::http_util::HttpClient;
+use crate::npm::common::maybe_auth_header_for_npm_registry;
+use crate::util::progress_bar::ProgressBar;
+
+use super::NpmCache;
+
+// todo(dsherret): create seams and unit test this
+
+#[derive(Debug, Clone)]
+enum MemoryCacheItem {
+  /// The cache item hasn't loaded yet.
+  PendingFuture(Shared<PendingRegistryLoadFuture>),
+  /// The item has loaded in the past and was stored in the file system cache.
+  /// There is no reason to request this package from the npm registry again
+  /// for the duration of execution.
+  FsCached,
+  /// An item is memory cached when it fails saving to the file system cache
+  /// or the package does not exist.
+  MemoryCached(Result<Option<Arc<NpmPackageInfo>>, Arc<AnyError>>),
+}
+
+#[derive(Debug, Clone)]
+enum FutureResult {
+  PackageNotExists,
+  SavedFsCache(Arc<NpmPackageInfo>),
+  ErroredFsCache(Arc<NpmPackageInfo>),
+}
+
+type PendingRegistryLoadFuture =
+  BoxFuture<'static, Result<FutureResult, Arc<AnyError>>>;
+
+/// Downloads packuments from the npm registry.
+///
+/// This is shared amongst all the workers.
+#[derive(Debug)]
+pub struct RegistryInfoDownloader {
+  cache: Arc<NpmCache>,
+  npmrc: Arc<ResolvedNpmRc>,
+  progress_bar: ProgressBar,
+  memory_cache: Mutex<HashMap<String, MemoryCacheItem>>,
+}
+
+impl RegistryInfoDownloader {
+  pub fn new(
+    cache: Arc<NpmCache>,
+    npmrc: Arc<ResolvedNpmRc>,
+    progress_bar: ProgressBar,
+  ) -> Self {
+    Self {
+      cache,
+      npmrc,
+      progress_bar,
+      memory_cache: Default::default(),
+    }
+  }
+
+  pub async fn load_package_info(
+    &self,
+    name: &str,
+    current_runtime_http_client: &Arc<HttpClient>,
+  ) -> Result<Option<Arc<NpmPackageInfo>>, AnyError> {
+    let registry_url = self.npmrc.get_registry_url(name);
+    let registry_config = self.npmrc.get_registry_config(name);
+
+    self
+      .load_package_info_inner(
+        name,
+        registry_url,
+        registry_config,
+        current_runtime_http_client,
+      )
+      .await
+      .with_context(|| {
+        format!(
+          "Error getting response at {} for package \"{}\"",
+          self.get_package_url(name, registry_url),
+          name
+        )
+      })
+  }
+
+  async fn load_package_info_inner(
+    &self,
+    name: &str,
+    registry_url: &Url,
+    registry_config: &RegistryConfig,
+    current_runtime_http_client: &Arc<HttpClient>,
+  ) -> Result<Option<Arc<NpmPackageInfo>>, AnyError> {
+    if *self.cache.cache_setting() == CacheSetting::Only {
+      return Err(custom_error(
+        "NotCached",
+        format!(
+          "An npm specifier not found in cache: \"{name}\", --cached-only is specified."
+        )
+      ));
+    }
+
+    let (created, cache_item) = {
+      let mut mem_cache = self.memory_cache.lock();
+      if let Some(cache_item) = mem_cache.get(name) {
+        (false, cache_item.clone())
+      } else {
+        let future = self.create_load_future(
+          name,
+          registry_url,
+          registry_config,
+          current_runtime_http_client,
+        );
+        let cache_item = MemoryCacheItem::PendingFuture(future);
+        mem_cache.insert(name.to_string(), cache_item.clone());
+        (true, cache_item)
+      }
+    };
+    match cache_item {
+      MemoryCacheItem::FsCached => {
+        // this struct previously loaded from the registry, so we can load it from the file system cache
+        self
+          .load_file_cached_package_info(name)
+          .await
+          .map(|info| Some(Arc::new(info)))
+      }
+      MemoryCacheItem::MemoryCached(maybe_info) => {
+        maybe_info.clone().map_err(|e| anyhow!("{}", e))
+      }
+      MemoryCacheItem::PendingFuture(future) => {
+        if created {
+          match future.await {
+            Ok(FutureResult::SavedFsCache(info)) => {
+              // return back the future and mark this package as having
+              // been saved in the cache for next time it's requested
+              *self.memory_cache.lock().get_mut(name).unwrap() =
+                MemoryCacheItem::FsCached;
+              Ok(Some(info))
+            }
+            Ok(FutureResult::ErroredFsCache(info)) => {
+              // since saving to the fs cache failed, keep the package information in memory
+              *self.memory_cache.lock().get_mut(name).unwrap() =
+                MemoryCacheItem::MemoryCached(Ok(Some(info.clone())));
+              Ok(Some(info))
+            }
+            Ok(FutureResult::PackageNotExists) => {
+              *self.memory_cache.lock().get_mut(name).unwrap() =
+                MemoryCacheItem::MemoryCached(Ok(None));
+              Ok(None)
+            }
+            Err(err) => {
+              let return_err = anyhow!("{}", err);
+              *self.memory_cache.lock().get_mut(name).unwrap() =
+                MemoryCacheItem::MemoryCached(Err(err));
+              Err(return_err)
+            }
+          }
+        } else {
+          match future.await {
+            Ok(FutureResult::SavedFsCache(info)) => Ok(Some(info)),
+            Ok(FutureResult::ErroredFsCache(info)) => Ok(Some(info)),
+            Ok(FutureResult::PackageNotExists) => Ok(None),
+            Err(err) => Err(anyhow!("{}", err)),
+          }
+        }
+      }
+    }
+  }
+
+  async fn load_file_cached_package_info(
+    &self,
+    name: &str,
+  ) -> Result<NpmPackageInfo, AnyError> {
+    // this scenario failing should be exceptionally rare so let's
+    // deal with improving it only when anyone runs into an issue
+    let maybe_package_info = deno_core::unsync::spawn_blocking({
+      let cache = self.cache.clone();
+      let name = name.to_string();
+      move || cache.load_package_info(&name)
+    })
+    .await
+    .unwrap()
+    .with_context(|| {
+      format!(
+        "Previously saved '{}' from the npm cache, but now it fails to load.",
+        name
+      )
+    })?;
+    match maybe_package_info {
+      Some(package_info) => Ok(package_info),
+      None => {
+        bail!("The package '{}' previously saved its registry information to the file system cache, but that file no longer exists.", name)
+      }
+    }
+  }
+
+  fn create_load_future(
+    &self,
+    name: &str,
+    registry_url: &Url,
+    registry_config: &RegistryConfig,
+    current_runtime_http_client: &Arc<HttpClient>,
+  ) -> Shared<PendingRegistryLoadFuture> {
+    let package_url = self.get_package_url(name, registry_url);
+    let maybe_auth_header = maybe_auth_header_for_npm_registry(registry_config);
+    let guard = self.progress_bar.update(package_url.as_str());
+    let cache = self.cache.clone();
+    let http_client = current_runtime_http_client.clone();
+    let name = name.to_string();
+    // force this future to be polled on the current runtime because it's not
+    // safe to share `HttpClient`s across runtimes and because a restart of
+    // npm resolution might cause this package not to be resolved again
+    // causing the future to never be polled
+    deno_core::unsync::spawn(async move {
+      let maybe_bytes = http_client
+        .download_with_progress(package_url, maybe_auth_header, &guard)
+        .await?;
+      match maybe_bytes {
+        Some(bytes) => {
+          let future_result = deno_core::unsync::spawn_blocking(
+            move || -> Result<FutureResult, AnyError> {
+              let package_info = serde_json::from_slice(&bytes)?;
+              match cache.save_package_info(&name, &package_info) {
+                Ok(()) => {
+                  Ok(FutureResult::SavedFsCache(Arc::new(package_info)))
+                }
+                Err(err) => {
+                  log::debug!(
+                    "Error saving package {} to cache: {:#}",
+                    name,
+                    err
+                  );
+                  Ok(FutureResult::ErroredFsCache(Arc::new(package_info)))
+                }
+              }
+            },
+          )
+          .await??;
+          Ok(future_result)
+        }
+        None => Ok(FutureResult::PackageNotExists),
+      }
+    })
+    .map(|result| result.unwrap().map_err(Arc::new))
+    .boxed()
+    .shared()
+  }
+
+  fn get_package_url(&self, name: &str, registry_url: &Url) -> Url {
+    // list of all characters used in npm packages:
+    //  !, ', (, ), *, -, ., /, [0-9], @, [A-Za-z], _, ~
+    const ASCII_SET: percent_encoding::AsciiSet =
+      percent_encoding::NON_ALPHANUMERIC
+        .remove(b'!')
+        .remove(b'\'')
+        .remove(b'(')
+        .remove(b')')
+        .remove(b'*')
+        .remove(b'-')
+        .remove(b'.')
+        .remove(b'/')
+        .remove(b'@')
+        .remove(b'_')
+        .remove(b'~');
+    let name = percent_encoding::utf8_percent_encode(name, &ASCII_SET);
+    registry_url.join(&name.to_string()).unwrap()
+  }
+}
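
registry_info.rs above deduplicates concurrent packument requests by parking a `Shared` future in a map: the first caller performs the download, later callers await a clone of the same future. A condensed sketch of that pattern, using the `futures` crate directly instead of the `deno_core::futures` re-export and a std `Mutex` in place of `parking_lot`; `PackumentMemo` and `SharedLoad` are illustrative names, and the real code additionally downgrades the map entry to `FsCached` or `MemoryCached` once the future resolves:

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    use futures::future::{BoxFuture, FutureExt, Shared};

    // `Shared` requires a cloneable output, which is why errors get
    // wrapped in `Arc` here, just like `Arc<AnyError>` in the diff.
    type SharedLoad =
      Shared<BoxFuture<'static, Result<Arc<String>, Arc<String>>>>;

    #[derive(Default)]
    struct PackumentMemo {
      entries: Mutex<HashMap<String, SharedLoad>>,
    }

    impl PackumentMemo {
      // Every concurrent caller for the same name awaits a clone of one
      // future, so the underlying registry request runs at most once.
      fn load(&self, name: &str) -> SharedLoad {
        let mut entries = self.entries.lock().unwrap();
        if let Some(existing) = entries.get(name) {
          return existing.clone();
        }
        let owned = name.to_string();
        let fut = async move {
          // stand-in for the HTTP fetch of the packument
          Ok(Arc::new(format!("packument for {owned}")))
        }
        .boxed()
        .shared();
        entries.insert(name.to_string(), fut.clone());
        fut
      }
    }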
diff --git a/cli/npm/managed/cache/tarball.rs b/cli/npm/managed/cache/tarball.rs
new file mode 100644
index 000000000..9848aca13
--- /dev/null
+++ b/cli/npm/managed/cache/tarball.rs
@@ -0,0 +1,210 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use deno_core::anyhow::anyhow;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::custom_error;
+use deno_core::error::AnyError;
+use deno_core::futures::future::BoxFuture;
+use deno_core::futures::future::Shared;
+use deno_core::futures::FutureExt;
+use deno_core::parking_lot::Mutex;
+use deno_npm::npm_rc::ResolvedNpmRc;
+use deno_npm::registry::NpmPackageVersionDistInfo;
+use deno_runtime::deno_fs::FileSystem;
+use deno_semver::package::PackageNv;
+
+use crate::args::CacheSetting;
+use crate::http_util::HttpClient;
+use crate::npm::common::maybe_auth_header_for_npm_registry;
+use crate::util::progress_bar::ProgressBar;
+
+use super::tarball_extract::verify_and_extract_tarball;
+use super::tarball_extract::TarballExtractionMode;
+use super::NpmCache;
+
+// todo(dsherret): create seams and unit test this
+
+#[derive(Debug, Clone)]
+enum MemoryCacheItem {
+  /// The cache item hasn't finished yet.
+  PendingFuture(Shared<BoxFuture<'static, Result<(), Arc<AnyError>>>>),
+  /// The result errored.
+  Errored(Arc<AnyError>),
+  /// This package has already been cached.
+  Cached,
+}
+
+/// Coordinates caching of tarballs being loaded from
+/// the npm registry.
+///
+/// This is shared amongst all the workers.
+#[derive(Debug)]
+pub struct TarballCache {
+  cache: Arc<NpmCache>,
+  fs: Arc<dyn FileSystem>,
+  npmrc: Arc<ResolvedNpmRc>,
+  progress_bar: ProgressBar,
+  memory_cache: Mutex<HashMap<PackageNv, MemoryCacheItem>>,
+}
+
+impl TarballCache {
+  pub fn new(
+    cache: Arc<NpmCache>,
+    fs: Arc<dyn FileSystem>,
+    npmrc: Arc<ResolvedNpmRc>,
+    progress_bar: ProgressBar,
+  ) -> Self {
+    Self {
+      cache,
+      fs,
+      npmrc,
+      progress_bar,
+      memory_cache: Default::default(),
+    }
+  }
+
+  pub async fn ensure_package(
+    &self,
+    package: &PackageNv,
+    dist: &NpmPackageVersionDistInfo,
+    // it's not safe to share these across runtimes
+    http_client_for_runtime: &Arc<HttpClient>,
+  ) -> Result<(), AnyError> {
+    self
+      .ensure_package_inner(package, dist, http_client_for_runtime)
+      .await
+      .with_context(|| format!("Failed caching npm package '{}'.", package))
+  }
+
+  async fn ensure_package_inner(
+    &self,
+    package_nv: &PackageNv,
+    dist: &NpmPackageVersionDistInfo,
+    http_client_for_runtime: &Arc<HttpClient>,
+  ) -> Result<(), AnyError> {
+    let (created, cache_item) = {
+      let mut mem_cache = self.memory_cache.lock();
+      if let Some(cache_item) = mem_cache.get(package_nv) {
+        (false, cache_item.clone())
+      } else {
+        let future = self.create_setup_future(
+          package_nv.clone(),
+          dist.clone(),
+          http_client_for_runtime.clone(),
+        );
+        let cache_item = MemoryCacheItem::PendingFuture(future);
+        mem_cache.insert(package_nv.clone(), cache_item.clone());
+        (true, cache_item)
+      }
+    };
+
+    match cache_item {
+      MemoryCacheItem::Cached => Ok(()),
+      MemoryCacheItem::Errored(err) => Err(anyhow!("{}", err)),
+      MemoryCacheItem::PendingFuture(future) => {
+        if created {
+          match future.await {
+            Ok(_) => {
+              *self.memory_cache.lock().get_mut(package_nv).unwrap() =
+                MemoryCacheItem::Cached;
+              Ok(())
+            }
+            Err(err) => {
+              let result_err = anyhow!("{}", err);
+              *self.memory_cache.lock().get_mut(package_nv).unwrap() =
+                MemoryCacheItem::Errored(err);
+              Err(result_err)
+            }
+          }
+        } else {
+          future.await.map_err(|err| anyhow!("{}", err))
+        }
+      }
+    }
+  }
+
+  fn create_setup_future(
+    &self,
+    package_nv: PackageNv,
+    dist: NpmPackageVersionDistInfo,
+    http_client_for_runtime: Arc<HttpClient>,
+  ) -> Shared<BoxFuture<'static, Result<(), Arc<AnyError>>>> {
+    let registry_url = self.npmrc.get_registry_url(&package_nv.name);
+    let registry_config =
+      self.npmrc.get_registry_config(&package_nv.name).clone();
+
+    let cache = self.cache.clone();
+    let fs = self.fs.clone();
+    let progress_bar = self.progress_bar.clone();
+    let package_folder =
+      cache.package_folder_for_nv_and_url(&package_nv, registry_url);
+
+    deno_core::unsync::spawn(async move {
+      let should_use_cache = cache.should_use_cache_for_package(&package_nv);
+      let package_folder_exists = fs.exists_sync(&package_folder);
+      if should_use_cache && package_folder_exists {
+        return Ok(());
+      } else if cache.cache_setting() == &CacheSetting::Only {
+        return Err(custom_error(
+          "NotCached",
+          format!(
+            "An npm specifier not found in cache: \"{}\", --cached-only is specified.",
+            &package_nv.name
+          )
+        ));
+      }
+
+      if dist.tarball.is_empty() {
+        bail!("Tarball URL was empty.");
+      }
+
+      let maybe_auth_header =
+        maybe_auth_header_for_npm_registry(&registry_config);
+
+      let guard = progress_bar.update(&dist.tarball);
+      let maybe_bytes = http_client_for_runtime
+        .download_with_progress(&dist.tarball, maybe_auth_header, &guard)
+        .await?;
+      match maybe_bytes {
+        Some(bytes) => {
+          let extraction_mode = if should_use_cache || !package_folder_exists {
+            TarballExtractionMode::SiblingTempDir
+          } else {
+            // The user ran with `--reload`, so overwrite the package instead of
+            // deleting it since the package might get corrupted if a user kills
+            // their deno process while it's deleting a package directory
+            //
+            // We can't rename this folder and delete it because the folder
+            // may be in use by another process or may now contain hardlinks,
+            // which will cause windows to throw an "AccessDenied" error when
+            // renaming. So we settle for overwriting.
+            TarballExtractionMode::Overwrite
+          };
+          let dist = dist.clone();
+          let package_nv = package_nv.clone();
+          deno_core::unsync::spawn_blocking(move || {
+            verify_and_extract_tarball(
+              &package_nv,
+              &bytes,
+              &dist,
+              &package_folder,
+              extraction_mode,
+            )
+          })
+          .await?
+        }
+        None => {
+          bail!("Could not find npm package tarball at: {}", dist.tarball);
+        }
+      }
+    })
+    .map(|result| result.unwrap().map_err(Arc::new))
+    .boxed()
+    .shared()
+  }
+}
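
tarball.rs above picks `SiblingTempDir` for first-time extraction and reserves `Overwrite` for a `--reload` over an already-populated folder, for the Windows rename reasons in the comment. The sibling-temp-dir idea reduces to the following sketch; `extract_via_sibling_temp_dir` is an illustrative name, and the `with_extension("tmp")` naming stands in for the diff's `get_atomic_dir_path()` helper:

    use std::fs;
    use std::io;
    use std::path::Path;

    // Extract into a temp dir next to the destination (same filesystem),
    // then promote it with a rename so readers never observe a
    // half-extracted package.
    fn extract_via_sibling_temp_dir(
      output: &Path,
      extract: impl FnOnce(&Path) -> io::Result<()>,
    ) -> io::Result<()> {
      let temp = output.with_extension("tmp"); // hypothetical temp naming
      extract(&temp)?;
      match fs::rename(&temp, output) {
        Ok(()) => Ok(()),
        Err(_) if output.exists() => {
          // another process finished first; discard our copy
          let _ = fs::remove_dir_all(&temp);
          Ok(())
        }
        Err(err) => {
          let _ = fs::remove_dir_all(&temp);
          Err(err)
        }
      }
    }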
diff --git a/cli/npm/managed/cache/tarball_extract.rs b/cli/npm/managed/cache/tarball_extract.rs
new file mode 100644
index 000000000..e2d242e66
--- /dev/null
+++ b/cli/npm/managed/cache/tarball_extract.rs
@@ -0,0 +1,324 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::path::PathBuf;
+
+use base64::prelude::BASE64_STANDARD;
+use base64::Engine;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::AnyError;
+use deno_npm::registry::NpmPackageVersionDistInfo;
+use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
+use deno_semver::package::PackageNv;
+use flate2::read::GzDecoder;
+use tar::Archive;
+use tar::EntryType;
+
+use crate::util::path::get_atomic_dir_path;
+
+#[derive(Debug, Copy, Clone)]
+pub enum TarballExtractionMode {
+  /// Overwrites the destination directory without deleting any files.
+  Overwrite,
+  /// Creates and writes to a sibling temporary directory. When done, moves
+  /// it to the final destination.
+  ///
+  /// This is more robust than `Overwrite` as it better handles multiple
+  /// processes writing to the directory at the same time.
+  SiblingTempDir,
+}
+
+pub fn verify_and_extract_tarball(
+  package_nv: &PackageNv,
+  data: &[u8],
+  dist_info: &NpmPackageVersionDistInfo,
+  output_folder: &Path,
+  extraction_mode: TarballExtractionMode,
+) -> Result<(), AnyError> {
+  verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
+
+  match extraction_mode {
+    TarballExtractionMode::Overwrite => extract_tarball(data, output_folder),
+    TarballExtractionMode::SiblingTempDir => {
+      let temp_dir = get_atomic_dir_path(output_folder);
+      extract_tarball(data, &temp_dir)?;
+      rename_with_retries(&temp_dir, output_folder)
+        .map_err(AnyError::from)
+        .context("Failed moving extracted tarball to final destination.")
+    }
+  }
+}
+
+fn rename_with_retries(
+  temp_dir: &Path,
+  output_folder: &Path,
+) -> Result<(), std::io::Error> {
+  fn already_exists(err: &std::io::Error, output_folder: &Path) -> bool {
+    // Windows will do an "Access is denied" error
+    err.kind() == ErrorKind::AlreadyExists || output_folder.exists()
+  }
+
+  let mut count = 0;
+  // renaming might be flaky if a lot of processes are trying
+  // to do this, so retry a few times
+  loop {
+    match fs::rename(temp_dir, output_folder) {
+      Ok(_) => return Ok(()),
+      Err(err) if already_exists(&err, output_folder) => {
+        // another process copied here, just cleanup
+        let _ = fs::remove_dir_all(temp_dir);
+        return Ok(());
+      }
+      Err(err) => {
+        count += 1;
+        if count > 5 {
+          // too many retries, cleanup and return the error
+          let _ = fs::remove_dir_all(temp_dir);
+          return Err(err);
+        }
+
+        // wait a bit before retrying... this should be very rare or only
+        // in error cases, so ok to sleep a bit
+        let sleep_ms = std::cmp::min(100, 20 * count);
+        std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
+      }
+    }
+  }
+}
+
+fn verify_tarball_integrity(
+  package: &PackageNv,
+  data: &[u8],
+  npm_integrity: &NpmPackageVersionDistInfoIntegrity,
+) -> Result<(), AnyError> {
+  use ring::digest::Context;
+  let (tarball_checksum, expected_checksum) = match npm_integrity {
+    NpmPackageVersionDistInfoIntegrity::Integrity {
+      algorithm,
+      base64_hash,
+    } => {
+      let algo = match *algorithm {
+        "sha512" => &ring::digest::SHA512,
+        "sha1" => &ring::digest::SHA1_FOR_LEGACY_USE_ONLY,
+        hash_kind => bail!(
+          "Not implemented hash function for {}: {}",
+          package,
+          hash_kind
+        ),
+      };
+      let mut hash_ctx = Context::new(algo);
+      hash_ctx.update(data);
+      let digest = hash_ctx.finish();
+      let tarball_checksum = BASE64_STANDARD.encode(digest.as_ref());
+      (tarball_checksum, base64_hash)
+    }
+    NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(hex) => {
+      let mut hash_ctx = Context::new(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY);
+      hash_ctx.update(data);
+      let digest = hash_ctx.finish();
+      let tarball_checksum = faster_hex::hex_string(digest.as_ref());
+      (tarball_checksum, hex)
+    }
+    NpmPackageVersionDistInfoIntegrity::UnknownIntegrity(integrity) => {
+      bail!(
+        "Not implemented integrity kind for {}: {}",
+        package,
+        integrity
+      )
+    }
+  };
+
+  if tarball_checksum != *expected_checksum {
+    bail!(
+      "Tarball checksum did not match what was provided by npm registry for {}.\n\nExpected: {}\nActual: {}",
+      package,
+      expected_checksum,
+      tarball_checksum,
+    )
+  }
+  Ok(())
+}
+
+fn extract_tarball(data: &[u8], output_folder: &Path) -> Result<(), AnyError> {
+  fs::create_dir_all(output_folder)?;
+  let output_folder = fs::canonicalize(output_folder)?;
+  let tar = GzDecoder::new(data);
+  let mut archive = Archive::new(tar);
+  archive.set_overwrite(true);
+  archive.set_preserve_permissions(true);
+  let mut created_dirs = HashSet::new();
+
+  for entry in archive.entries()? {
+    let mut entry = entry?;
+    let path = entry.path()?;
+    let entry_type = entry.header().entry_type();
+
+    // Some package tarballs contain "pax_global_header", these entries
+    // should be skipped.
+    if entry_type == EntryType::XGlobalHeader {
+      continue;
+    }
+
+    // skip the first component which will be either "package" or the name of the package
+    let relative_path = path.components().skip(1).collect::<PathBuf>();
+    let absolute_path = output_folder.join(relative_path);
+    let dir_path = if entry_type == EntryType::Directory {
+      absolute_path.as_path()
+    } else {
+      absolute_path.parent().unwrap()
+    };
+    if created_dirs.insert(dir_path.to_path_buf()) {
+      fs::create_dir_all(dir_path)?;
+      let canonicalized_dir = fs::canonicalize(dir_path)?;
+      if !canonicalized_dir.starts_with(&output_folder) {
+        bail!(
+          "Extracted directory '{}' of npm tarball was not in output directory.",
+          canonicalized_dir.display()
+        )
+      }
+    }
+
+    let entry_type = entry.header().entry_type();
+    match entry_type {
+      EntryType::Regular => {
+        entry.unpack(&absolute_path)?;
+      }
+      EntryType::Symlink | EntryType::Link => {
+        // At the moment, npm doesn't seem to support uploading hardlinks or
+        // symlinks to the npm registry. If ever adding symlink or hardlink
+        // support, we will need to validate that the hardlink and symlink
+        // target are within the package directory.
+        log::warn!(
+          "Ignoring npm tarball entry type {:?} for '{}'",
+          entry_type,
+          absolute_path.display()
+        )
+      }
+      _ => {
+        // ignore
+      }
+    }
+  }
+  Ok(())
+}
+
+#[cfg(test)]
+mod test {
+  use deno_semver::Version;
+  use test_util::TempDir;
+
+  use super::*;
+
+  #[test]
+  pub fn test_verify_tarball() {
+    let package = PackageNv {
+      name: "package".to_string(),
+      version: Version::parse_from_npm("1.0.0").unwrap(),
+    };
+    let actual_checksum =
+      "z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg/SpIdNs6c5H0NE8XYXysP+DGNKHfuwvY7kxvUdBeoGlODJ6+SfaPg==";
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::UnknownIntegrity("test")
+      )
+      .unwrap_err()
+      .to_string(),
+      "Not implemented integrity kind for package@1.0.0: test",
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "notimplemented",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      "Not implemented hash function for package@1.0.0: notimplemented",
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "sha1",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      concat!(
+        "Tarball checksum did not match what was provided by npm ",
+        "registry for package@1.0.0.\n\nExpected: test\nActual: 2jmj7l5rSw0yVb/vlWAYkK/YBwk=",
+      ),
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "sha512",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_checksum}"),
+    );
+    assert!(verify_tarball_integrity(
+      &package,
+      &Vec::new(),
+      &NpmPackageVersionDistInfoIntegrity::Integrity {
+        algorithm: "sha512",
+        base64_hash: actual_checksum,
+      },
+    )
+    .is_ok());
+    let actual_hex = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex("test"),
+      )
+      .unwrap_err()
+      .to_string(),
+      format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_hex}"),
+    );
+    assert!(verify_tarball_integrity(
+      &package,
+      &Vec::new(),
+      &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(actual_hex),
+    )
+    .is_ok());
+  }
+
+  #[test]
+  fn rename_with_retries_succeeds_exists() {
+    let temp_dir = TempDir::new();
+    let folder_1 = temp_dir.path().join("folder_1");
+    let folder_2 = temp_dir.path().join("folder_2");
+
+    folder_1.create_dir_all();
+    folder_1.join("a.txt").write("test");
+    folder_2.create_dir_all();
+    // this will not end up in the output as rename_with_retries assumes
+    // the folders ending up at the destination are the same
+    folder_2.join("b.txt").write("test2");
+
+    let dest_folder = temp_dir.path().join("dest_folder");
+
+    rename_with_retries(folder_1.as_path(), dest_folder.as_path()).unwrap();
+    rename_with_retries(folder_2.as_path(), dest_folder.as_path()).unwrap();
+    assert!(dest_folder.join("a.txt").exists());
+    assert!(!dest_folder.join("b.txt").exists());
+  }
+}
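
The integrity check exercised by `test_verify_tarball` above boils down to hashing the tarball bytes and comparing the encoded digest with the registry-reported value. A sketch of the sha512 case; `sha512_integrity_matches` is an illustrative name, and the `sha512-` prefix of an npm integrity string is assumed to have been parsed off already (the diff receives it pre-parsed as `NpmPackageVersionDistInfoIntegrity`):

    use base64::prelude::BASE64_STANDARD;
    use base64::Engine;

    // Hash the tarball bytes with sha512 and compare the base64 digest
    // against the value the registry reported in dist.integrity.
    fn sha512_integrity_matches(data: &[u8], expected_base64: &str) -> bool {
      let digest = ring::digest::digest(&ring::digest::SHA512, data);
      BASE64_STANDARD.encode(digest.as_ref()) == expected_base64
    }

For empty input this returns true against the test's `actual_checksum` value above, the base64-encoded sha512 digest of zero bytes.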