author | David Sherret <dsherret@users.noreply.github.com> | 2024-06-02 21:39:13 -0400
---|---|---
committer | GitHub <noreply@github.com> | 2024-06-03 01:39:13 +0000
commit | b1f776adef6f0d0caa0b2badf9fb707cf5efa6e7 (patch) |
tree | df801e53bb5e43268933d883f049546256ef8e7f /cli/npm/managed/cache/tarball_extract.rs |
parent | eda43c46de12ed589fdbe62ba0574887cfbb3574 (diff) |
refactor: extract structs for downloading tarballs and npm registry packuments (#24067)
Diffstat (limited to 'cli/npm/managed/cache/tarball_extract.rs')
-rw-r--r-- | cli/npm/managed/cache/tarball_extract.rs | 324 |
1 file changed, 324 insertions, 0 deletions
diff --git a/cli/npm/managed/cache/tarball_extract.rs b/cli/npm/managed/cache/tarball_extract.rs
new file mode 100644
index 000000000..e2d242e66
--- /dev/null
+++ b/cli/npm/managed/cache/tarball_extract.rs
@@ -0,0 +1,324 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::path::PathBuf;
+
+use base64::prelude::BASE64_STANDARD;
+use base64::Engine;
+use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
+use deno_core::error::AnyError;
+use deno_npm::registry::NpmPackageVersionDistInfo;
+use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
+use deno_semver::package::PackageNv;
+use flate2::read::GzDecoder;
+use tar::Archive;
+use tar::EntryType;
+
+use crate::util::path::get_atomic_dir_path;
+
+#[derive(Debug, Copy, Clone)]
+pub enum TarballExtractionMode {
+  /// Overwrites the destination directory without deleting any files.
+  Overwrite,
+  /// Creates and writes to a sibling temporary directory. When done, moves
+  /// it to the final destination.
+  ///
+  /// This is more robust than `Overwrite` as it better handles multiple
+  /// processes writing to the directory at the same time.
+  SiblingTempDir,
+}
+
+pub fn verify_and_extract_tarball(
+  package_nv: &PackageNv,
+  data: &[u8],
+  dist_info: &NpmPackageVersionDistInfo,
+  output_folder: &Path,
+  extraction_mode: TarballExtractionMode,
+) -> Result<(), AnyError> {
+  verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
+
+  match extraction_mode {
+    TarballExtractionMode::Overwrite => extract_tarball(data, output_folder),
+    TarballExtractionMode::SiblingTempDir => {
+      let temp_dir = get_atomic_dir_path(output_folder);
+      extract_tarball(data, &temp_dir)?;
+      rename_with_retries(&temp_dir, output_folder)
+        .map_err(AnyError::from)
+        .context("Failed moving extracted tarball to final destination.")
+    }
+  }
+}
+
+fn rename_with_retries(
+  temp_dir: &Path,
+  output_folder: &Path,
+) -> Result<(), std::io::Error> {
+  fn already_exists(err: &std::io::Error, output_folder: &Path) -> bool {
+    // Windows will do an "Access is denied" error
+    err.kind() == ErrorKind::AlreadyExists || output_folder.exists()
+  }
+
+  let mut count = 0;
+  // renaming might be flaky if a lot of processes are trying
+  // to do this, so retry a few times
+  loop {
+    match fs::rename(temp_dir, output_folder) {
+      Ok(_) => return Ok(()),
+      Err(err) if already_exists(&err, output_folder) => {
+        // another process copied here, just cleanup
+        let _ = fs::remove_dir_all(temp_dir);
+        return Ok(());
+      }
+      Err(err) => {
+        count += 1;
+        if count > 5 {
+          // too many retries, cleanup and return the error
+          let _ = fs::remove_dir_all(temp_dir);
+          return Err(err);
+        }
+
+        // wait a bit before retrying... this should be very rare or only
+        // in error cases, so ok to sleep a bit
+        let sleep_ms = std::cmp::min(100, 20 * count);
+        std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
+      }
+    }
+  }
+}
+
+fn verify_tarball_integrity(
+  package: &PackageNv,
+  data: &[u8],
+  npm_integrity: &NpmPackageVersionDistInfoIntegrity,
+) -> Result<(), AnyError> {
+  use ring::digest::Context;
+  let (tarball_checksum, expected_checksum) = match npm_integrity {
+    NpmPackageVersionDistInfoIntegrity::Integrity {
+      algorithm,
+      base64_hash,
+    } => {
+      let algo = match *algorithm {
+        "sha512" => &ring::digest::SHA512,
+        "sha1" => &ring::digest::SHA1_FOR_LEGACY_USE_ONLY,
+        hash_kind => bail!(
+          "Not implemented hash function for {}: {}",
+          package,
+          hash_kind
+        ),
+      };
+      let mut hash_ctx = Context::new(algo);
+      hash_ctx.update(data);
+      let digest = hash_ctx.finish();
+      let tarball_checksum = BASE64_STANDARD.encode(digest.as_ref());
+      (tarball_checksum, base64_hash)
+    }
+    NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(hex) => {
+      let mut hash_ctx = Context::new(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY);
+      hash_ctx.update(data);
+      let digest = hash_ctx.finish();
+      let tarball_checksum = faster_hex::hex_string(digest.as_ref());
+      (tarball_checksum, hex)
+    }
+    NpmPackageVersionDistInfoIntegrity::UnknownIntegrity(integrity) => {
+      bail!(
+        "Not implemented integrity kind for {}: {}",
+        package,
+        integrity
+      )
+    }
+  };
+
+  if tarball_checksum != *expected_checksum {
+    bail!(
+      "Tarball checksum did not match what was provided by npm registry for {}.\n\nExpected: {}\nActual: {}",
+      package,
+      expected_checksum,
+      tarball_checksum,
+    )
+  }
+  Ok(())
+}
+
+fn extract_tarball(data: &[u8], output_folder: &Path) -> Result<(), AnyError> {
+  fs::create_dir_all(output_folder)?;
+  let output_folder = fs::canonicalize(output_folder)?;
+  let tar = GzDecoder::new(data);
+  let mut archive = Archive::new(tar);
+  archive.set_overwrite(true);
+  archive.set_preserve_permissions(true);
+  let mut created_dirs = HashSet::new();
+
+  for entry in archive.entries()? {
+    let mut entry = entry?;
+    let path = entry.path()?;
+    let entry_type = entry.header().entry_type();
+
+    // Some package tarballs contain "pax_global_header", these entries
+    // should be skipped.
+    if entry_type == EntryType::XGlobalHeader {
+      continue;
+    }
+
+    // skip the first component which will be either "package" or the name of the package
+    let relative_path = path.components().skip(1).collect::<PathBuf>();
+    let absolute_path = output_folder.join(relative_path);
+    let dir_path = if entry_type == EntryType::Directory {
+      absolute_path.as_path()
+    } else {
+      absolute_path.parent().unwrap()
+    };
+    if created_dirs.insert(dir_path.to_path_buf()) {
+      fs::create_dir_all(dir_path)?;
+      let canonicalized_dir = fs::canonicalize(dir_path)?;
+      if !canonicalized_dir.starts_with(&output_folder) {
+        bail!(
+          "Extracted directory '{}' of npm tarball was not in output directory.",
+          canonicalized_dir.display()
+        )
+      }
+    }
+
+    let entry_type = entry.header().entry_type();
+    match entry_type {
+      EntryType::Regular => {
+        entry.unpack(&absolute_path)?;
+      }
+      EntryType::Symlink | EntryType::Link => {
+        // At the moment, npm doesn't seem to support uploading hardlinks or
+        // symlinks to the npm registry. If ever adding symlink or hardlink
+        // support, we will need to validate that the hardlink and symlink
+        // target are within the package directory.
+        log::warn!(
+          "Ignoring npm tarball entry type {:?} for '{}'",
+          entry_type,
+          absolute_path.display()
+        )
+      }
+      _ => {
+        // ignore
+      }
+    }
+  }
+  Ok(())
+}
+
+#[cfg(test)]
+mod test {
+  use deno_semver::Version;
+  use test_util::TempDir;
+
+  use super::*;
+
+  #[test]
+  pub fn test_verify_tarball() {
+    let package = PackageNv {
+      name: "package".to_string(),
+      version: Version::parse_from_npm("1.0.0").unwrap(),
+    };
+    let actual_checksum =
+      "z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg/SpIdNs6c5H0NE8XYXysP+DGNKHfuwvY7kxvUdBeoGlODJ6+SfaPg==";
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::UnknownIntegrity("test")
+      )
+      .unwrap_err()
+      .to_string(),
+      "Not implemented integrity kind for package@1.0.0: test",
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "notimplemented",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      "Not implemented hash function for package@1.0.0: notimplemented",
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "sha1",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      concat!(
+        "Tarball checksum did not match what was provided by npm ",
+        "registry for package@1.0.0.\n\nExpected: test\nActual: 2jmj7l5rSw0yVb/vlWAYkK/YBwk=",
+      ),
+    );
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::Integrity {
+          algorithm: "sha512",
+          base64_hash: "test"
+        }
+      )
+      .unwrap_err()
+      .to_string(),
+      format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_checksum}"),
+    );
+    assert!(verify_tarball_integrity(
+      &package,
+      &Vec::new(),
+      &NpmPackageVersionDistInfoIntegrity::Integrity {
+        algorithm: "sha512",
+        base64_hash: actual_checksum,
+      },
+    )
+    .is_ok());
+    let actual_hex = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
+    assert_eq!(
+      verify_tarball_integrity(
+        &package,
+        &Vec::new(),
+        &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex("test"),
+      )
+      .unwrap_err()
+      .to_string(),
+      format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_hex}"),
+    );
+    assert!(verify_tarball_integrity(
+      &package,
+      &Vec::new(),
+      &NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(actual_hex),
+    )
+    .is_ok());
+  }
+
+  #[test]
+  fn rename_with_retries_succeeds_exists() {
+    let temp_dir = TempDir::new();
+    let folder_1 = temp_dir.path().join("folder_1");
+    let folder_2 = temp_dir.path().join("folder_2");
+
+    folder_1.create_dir_all();
+    folder_1.join("a.txt").write("test");
+    folder_2.create_dir_all();
+    // this will not end up in the output as rename_with_retries assumes
+    // the folders ending up at the destination are the same
+    folder_2.join("b.txt").write("test2");
+
+    let dest_folder = temp_dir.path().join("dest_folder");
+
+    rename_with_retries(folder_1.as_path(), dest_folder.as_path()).unwrap();
+    rename_with_retries(folder_2.as_path(), dest_folder.as_path()).unwrap();
+    assert!(dest_folder.join("a.txt").exists());
+    assert!(!dest_folder.join("b.txt").exists());
+  }
+}
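For orientation, below is a minimal standalone sketch of the two techniques the new module combines: hashing the tarball bytes with `ring` and comparing the base64-encoded SHA-512 digest against the registry-provided integrity value, and extracting into a sibling temporary directory that is renamed into place afterwards. It assumes the same `ring`, `base64`, `flate2`, and `tar` crates the commit uses; `checksum_matches`, `extract_atomically`, and the `.tmp` suffix are hypothetical stand-ins for illustration, not APIs from this file (the commit itself uses `verify_tarball_integrity`, `extract_tarball`, and `get_atomic_dir_path`).

```rust
use std::fs;
use std::path::Path;

use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use flate2::read::GzDecoder;
use ring::digest;
use tar::Archive;

/// Hashes the tarball bytes with SHA-512 and compares the base64-encoded
/// digest against the integrity value reported by the registry.
fn checksum_matches(data: &[u8], expected_base64: &str) -> bool {
  let mut ctx = digest::Context::new(&digest::SHA512);
  ctx.update(data);
  BASE64_STANDARD.encode(ctx.finish().as_ref()) == expected_base64
}

/// Extracts a gzipped tarball into a sibling temp directory, then renames it
/// into place so readers never observe a half-extracted folder.
fn extract_atomically(data: &[u8], output: &Path) -> std::io::Result<()> {
  // hypothetical temp location; the commit derives it via get_atomic_dir_path()
  let temp = output.with_extension("tmp");
  fs::create_dir_all(&temp)?;
  Archive::new(GzDecoder::new(data)).unpack(&temp)?;
  match fs::rename(&temp, output) {
    Ok(()) => Ok(()),
    Err(_) if output.exists() => {
      // another process won the race; discard our copy
      let _ = fs::remove_dir_all(&temp);
      Ok(())
    }
    Err(err) => Err(err),
  }
}
```

A caller would check `checksum_matches` before calling `extract_atomically`, mirroring the order in `verify_and_extract_tarball` above.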