author | David Sherret <dsherret@users.noreply.github.com> | 2024-01-08 12:18:42 -0500
committer | GitHub <noreply@github.com> | 2024-01-08 17:18:42 +0000
commit | e212e1fc35ddae63f457f0f2a2e95154e008941f (patch)
tree | 6c1f553fbc529bfcab6413049af8f32ed31d1dfd /cli/util/fs.rs
parent | ee45d5bf8f2e1826b2a106b030abe891cfc0b37c (diff)
perf: skip expanding exclude globs (#21817)
We were calling `expand_glob` on our excludes, which is very expensive
and unnecessary because we can pattern match while traversing instead.
1. Doesn't expand "exclude" globs. Instead, it pattern matches while walking
the directory tree (see the sketch after this list).
2. Splits the "include" into base paths and the file patterns that apply under
each base. This reduces pattern matching because we only test patterns that
could match under the directory being walked, not patterns rooted in completely
unrelated directories.
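To make the first point concrete, here is a minimal standalone sketch of matching-while-walking. It is not the code from this diff: it uses the public `walkdir` and `glob` crates instead of the internal `FilePatterns`/`PathOrPatternSet` types, and `collect_under_base` plus the patterns in `main` are hypothetical names chosen for illustration.

```rust
use std::path::{Path, PathBuf};

use glob::Pattern; // glob = "0.3"
use walkdir::WalkDir; // walkdir = "2"

// Walk a single include base, matching patterns during traversal instead of
// expanding exclude globs into concrete paths up front.
fn collect_under_base(
  base: &Path,
  include: Option<&[Pattern]>,
  exclude: &[Pattern],
) -> Vec<PathBuf> {
  let mut out = Vec::new();
  let mut iter = WalkDir::new(base).follow_links(false).into_iter();
  loop {
    let entry = match iter.next() {
      None => break,
      Some(Err(_)) => continue, // unreadable entry: skip it
      Some(Ok(entry)) => entry,
    };
    let path = entry.path();
    // Excludes are checked against each visited path; nothing is expanded.
    if exclude.iter().any(|p| p.matches_path(path)) {
      if entry.file_type().is_dir() {
        // Prune the whole subtree of an excluded directory.
        iter.skip_current_dir();
      }
      continue;
    }
    // Include patterns only need to be consulted for files under this base.
    if entry.file_type().is_file()
      && include
        .map(|pats| pats.iter().any(|p| p.matches_path(path)))
        .unwrap_or(true)
    {
      out.push(path.to_path_buf());
    }
  }
  out
}

fn main() {
  // Hypothetical usage: while walking "src", only patterns rooted under "src"
  // are tested; patterns for unrelated directories never get consulted.
  let include = vec![Pattern::new("src/**/*.rs").unwrap()];
  let exclude = vec![Pattern::new("src/**/generated").unwrap()];
  for file in collect_under_base(Path::new("src"), Some(include.as_slice()), &exclude) {
    println!("{}", file.display());
  }
}
```

The win comes from pruning: an excluded directory's subtree is never visited, so nothing beneath it is ever pattern matched, and because each walk starts at an include base, only the patterns relevant to that base are consulted.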
Diffstat (limited to 'cli/util/fs.rs')
-rw-r--r-- | cli/util/fs.rs | 358
1 file changed, 201 insertions(+), 157 deletions(-)
diff --git a/cli/util/fs.rs b/cli/util/fs.rs
index f9fe9424f..86b17754b 100644
--- a/cli/util/fs.rs
+++ b/cli/util/fs.rs
@@ -1,5 +1,6 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+use deno_core::anyhow::anyhow;
 use deno_core::anyhow::Context;
 use deno_core::error::AnyError;
 pub use deno_core::normalize_path;
@@ -8,7 +9,7 @@ use deno_core::ModuleSpecifier;
 use deno_runtime::deno_crypto::rand;
 use deno_runtime::deno_fs::FileSystem;
 use deno_runtime::deno_node::PathClean;
-use std::borrow::Cow;
+use std::collections::HashSet;
 use std::env::current_dir;
 use std::fmt::Write as FmtWrite;
 use std::fs::OpenOptions;
@@ -21,11 +22,13 @@ use std::sync::Arc;
 use std::time::Duration;
 use walkdir::WalkDir;
-use crate::args::FilesConfig;
 use crate::util::progress_bar::ProgressBar;
 use crate::util::progress_bar::ProgressBarStyle;
 use crate::util::progress_bar::ProgressMessagePrompt;
+use super::glob::FilePatterns;
+use super::glob::PathOrPattern;
+use super::glob::PathOrPatternSet;
 use super::path::specifier_to_file_path;
 /// Writes the file to the file system at a temporary path, then
@@ -244,18 +247,16 @@ pub fn resolve_from_cwd(path: &Path) -> Result<PathBuf, AnyError> {
 /// Collects file paths that satisfy the given predicate, by recursively walking `files`.
 /// If the walker visits a path that is listed in `ignore`, it skips descending into the directory.
-pub struct FileCollector<TFilter: Fn(&Path) -> bool> {
-  canonicalized_ignore: Vec<PathBuf>,
+pub struct FileCollector<TFilter: Fn(&Path, &FilePatterns) -> bool> {
   file_filter: TFilter,
   ignore_git_folder: bool,
   ignore_node_modules: bool,
   ignore_vendor_folder: bool,
 }
-impl<TFilter: Fn(&Path) -> bool> FileCollector<TFilter> {
+impl<TFilter: Fn(&Path, &FilePatterns) -> bool> FileCollector<TFilter> {
   pub fn new(file_filter: TFilter) -> Self {
     Self {
-      canonicalized_ignore: Default::default(),
       file_filter,
       ignore_git_folder: false,
       ignore_node_modules: false,
@@ -263,14 +264,6 @@ impl<TFilter: Fn(&Path) -> bool> FileCollector<TFilter> {
     }
   }
-  pub fn add_ignore_paths(mut self, paths: &[PathBuf]) -> Self {
-    // retain only the paths which exist and ignore the rest
-    self
-      .canonicalized_ignore
-      .extend(paths.iter().filter_map(|i| canonicalize_path(i).ok()));
-    self
-  }
-
   pub fn ignore_node_modules(mut self) -> Self {
     self.ignore_node_modules = true;
     self
@@ -286,58 +279,62 @@ impl<TFilter: Fn(&Path) -> bool> FileCollector<TFilter> {
     self
   }
-  pub fn collect_files(
+  pub fn collect_file_patterns(
     &self,
-    files: Option<&[PathBuf]>,
+    file_patterns: FilePatterns,
   ) -> Result<Vec<PathBuf>, AnyError> {
     let mut target_files = Vec::new();
-    let files = if let Some(files) = files {
-      Cow::Borrowed(files)
-    } else {
-      Cow::Owned(vec![PathBuf::from(".")])
-    };
-    for file in files.iter() {
-      if let Ok(file) = canonicalize_path(file) {
-        // use an iterator like this in order to minimize the number of file system operations
-        let mut iterator = WalkDir::new(&file).into_iter();
-        loop {
-          let e = match iterator.next() {
-            None => break,
-            Some(Err(_)) => continue,
-            Some(Ok(entry)) => entry,
-          };
-          let file_type = e.file_type();
-          let is_dir = file_type.is_dir();
-          if let Ok(c) = canonicalize_path(e.path()) {
-            if self.canonicalized_ignore.iter().any(|i| c.starts_with(i)) {
-              if is_dir {
-                iterator.skip_current_dir();
-              }
-            } else if is_dir {
-              let should_ignore_dir = c
-                .file_name()
-                .map(|dir_name| {
-                  let dir_name = dir_name.to_string_lossy().to_lowercase();
-                  let is_ignored_file = match dir_name.as_str() {
-                    "node_modules" => self.ignore_node_modules,
-                    "vendor" => self.ignore_vendor_folder,
-                    ".git" => self.ignore_git_folder,
-                    _ => false,
-                  };
-                  // allow the user to opt out of ignoring by explicitly specifying the dir
-                  file != c && is_ignored_file
-                })
-                .unwrap_or(false);
-              if should_ignore_dir {
-                iterator.skip_current_dir();
-              }
-            } else if (self.file_filter)(e.path()) {
-              target_files.push(c);
-            }
-          } else if is_dir {
-            // failed canonicalizing, so skip it
+    let mut visited_paths = HashSet::new();
+    let file_patterns_by_base = file_patterns.split_by_base();
+    for (base, file_patterns) in file_patterns_by_base {
+      let file = normalize_path(base);
+      // use an iterator in order to minimize the number of file system operations
+      let mut iterator = WalkDir::new(&file)
+        .follow_links(false) // the default, but be explicit
+        .into_iter();
+      loop {
+        let e = match iterator.next() {
+          None => break,
+          Some(Err(_)) => continue,
+          Some(Ok(entry)) => entry,
+        };
+        let file_type = e.file_type();
+        let is_dir = file_type.is_dir();
+        let c = e.path().to_path_buf();
+        if file_patterns.exclude.matches_path(&c)
+          || !is_dir
+            && !file_patterns
+              .include
+              .as_ref()
+              .map(|i| i.matches_path(&c))
+              .unwrap_or(true)
+        {
+          if is_dir {
             iterator.skip_current_dir();
           }
+        } else if is_dir {
+          let should_ignore_dir = c
+            .file_name()
+            .map(|dir_name| {
+              let dir_name = dir_name.to_string_lossy().to_lowercase();
+              let is_ignored_file = match dir_name.as_str() {
+                "node_modules" => self.ignore_node_modules,
+                "vendor" => self.ignore_vendor_folder,
+                ".git" => self.ignore_git_folder,
+                _ => false,
+              };
+              // allow the user to opt out of ignoring by explicitly specifying the dir
+              file != c && is_ignored_file
+            })
+            .unwrap_or(false)
+            || !visited_paths.insert(c.clone());
+          if should_ignore_dir {
+            iterator.skip_current_dir();
+          }
+        } else if (self.file_filter)(&c, &file_patterns)
+          && visited_paths.insert(c.clone())
+        {
+          target_files.push(c);
         }
       }
     }
@@ -349,54 +346,68 @@ impl<TFilter: Fn(&Path) -> bool> FileCollector<TFilter> {
 /// Specifiers that start with http and https are left intact.
 /// Note: This ignores all .git and node_modules folders.
 pub fn collect_specifiers(
-  files: &FilesConfig,
-  predicate: impl Fn(&Path) -> bool,
+  mut files: FilePatterns,
+  predicate: impl Fn(&Path, &FilePatterns) -> bool,
 ) -> Result<Vec<ModuleSpecifier>, AnyError> {
   let mut prepared = vec![];
-  let file_collector = FileCollector::new(predicate)
-    .add_ignore_paths(&files.exclude)
-    .ignore_git_folder()
-    .ignore_node_modules()
-    .ignore_vendor_folder();
-
-  let root_path = current_dir()?;
-  let include_files = if let Some(include) = &files.include {
-    Cow::Borrowed(include)
-  } else {
-    Cow::Owned(vec![root_path.clone()])
-  };
-  for path in include_files.iter() {
-    let path = path.to_string_lossy();
-    let lowercase_path = path.to_lowercase();
-    if lowercase_path.starts_with("http://")
-      || lowercase_path.starts_with("https://")
-    {
-      let url = ModuleSpecifier::parse(&path)?;
-      prepared.push(url);
-      continue;
-    }
-    let p = if lowercase_path.starts_with("file://") {
-      specifier_to_file_path(&ModuleSpecifier::parse(&path)?)?
-    } else {
-      root_path.join(path.as_ref())
-    };
-    let p = normalize_path(p);
-    if p.is_dir() {
-      let test_files = file_collector.collect_files(Some(&[p]))?;
-      let mut test_files_as_urls = test_files
-        .iter()
-        .map(|f| ModuleSpecifier::from_file_path(f).unwrap())
-        .collect::<Vec<ModuleSpecifier>>();
-
-      test_files_as_urls.sort();
-      prepared.extend(test_files_as_urls);
-    } else {
-      let url = ModuleSpecifier::from_file_path(p).unwrap();
-      prepared.push(url);
+  // break out the remote specifiers
+  if let Some(include_mut) = &mut files.include {
+    let includes = std::mem::take(include_mut);
+    let path_or_patterns = includes.into_path_or_patterns();
+    let mut result = Vec::with_capacity(path_or_patterns.len());
+    for path_or_pattern in path_or_patterns {
+      match path_or_pattern {
+        PathOrPattern::Path(path) => {
+          // todo(dsherret): we should improve this to not store URLs in a PathBuf
+          let path_str = path.to_string_lossy();
+          let lowercase_path = path_str.to_lowercase();
+          if lowercase_path.starts_with("http://")
+            || lowercase_path.starts_with("https://")
+          {
+            // take out the url
+            let url = ModuleSpecifier::parse(&path_str)
+              .with_context(|| format!("Invalid URL '{}'", path_str))?;
+            prepared.push(url);
+          } else if lowercase_path.starts_with("file://") {
+            let url = ModuleSpecifier::parse(&path_str)
+              .with_context(|| format!("Invalid URL '{}'", path_str))?;
+            let p = specifier_to_file_path(&url)?;
+            if p.is_dir() {
+              result.push(PathOrPattern::Path(p));
+            } else {
+              prepared.push(url)
+            }
+          } else if path.is_dir() {
+            result.push(PathOrPattern::Path(path));
+          } else if !files.exclude.matches_path(&path) {
+            let url = ModuleSpecifier::from_file_path(&path)
+              .map_err(|_| anyhow!("Invalid file path '{}'", path.display()))?;
+            prepared.push(url);
+          }
+        }
+        PathOrPattern::Pattern(pattern) => {
+          // add it back
+          result.push(PathOrPattern::Pattern(pattern));
+        }
+      }
     }
+    *include_mut = PathOrPatternSet::new(result);
   }
+  let collected_files = FileCollector::new(predicate)
+    .ignore_git_folder()
+    .ignore_node_modules()
+    .ignore_vendor_folder()
+    .collect_file_patterns(files)?;
+  let mut collected_files_as_urls = collected_files
+    .iter()
+    .map(|f| ModuleSpecifier::from_file_path(f).unwrap())
+    .collect::<Vec<ModuleSpecifier>>();
+
+  collected_files_as_urls.sort();
+  prepared.extend(collected_files_as_urls);
+
   Ok(prepared)
 }
@@ -812,18 +823,29 @@ mod tests {
     let ignore_dir_files = ["g.d.ts", ".gitignore"];
     create_files(&ignore_dir_path, &ignore_dir_files);
-    let file_collector = FileCollector::new(|path| {
+    let file_patterns = FilePatterns {
+      include: Some(
+        PathOrPatternSet::from_absolute_paths(
+          vec![root_dir_path.to_path_buf()],
+        )
+        .unwrap(),
+      ),
+      exclude: PathOrPatternSet::from_absolute_paths(vec![
+        ignore_dir_path.to_path_buf()
+      ])
+      .unwrap(),
+    };
+    let file_collector = FileCollector::new(|path, _| {
       // exclude dotfiles
       path
         .file_name()
         .and_then(|f| f.to_str())
         .map(|f| !f.starts_with('.'))
         .unwrap_or(false)
-    })
-    .add_ignore_paths(&[ignore_dir_path.to_path_buf()]);
+    });
     let result = file_collector
-      .collect_files(Some(&[root_dir_path.to_path_buf()]))
+      .collect_file_patterns(file_patterns.clone())
       .unwrap();
     let expected = [
       "README.md",
@@ -850,7 +872,7 @@ mod tests {
       .ignore_node_modules()
       .ignore_vendor_folder();
     let result = file_collector
-      .collect_files(Some(&[root_dir_path.to_path_buf()]))
+      .collect_file_patterns(file_patterns.clone())
      .unwrap();
     let expected = [
       "README.md",
@@ -869,12 +891,20 @@ mod tests {
     assert_eq!(file_names, expected);
     // test opting out of ignoring by specifying the dir
-    let result = file_collector
-      .collect_files(Some(&[
-        root_dir_path.to_path_buf(),
-        root_dir_path.to_path_buf().join("child/node_modules/"),
-      ]))
-      .unwrap();
+    let file_patterns = FilePatterns {
+      include: Some(
+        PathOrPatternSet::from_absolute_paths(vec![
+          root_dir_path.to_path_buf(),
+          root_dir_path.to_path_buf().join("child/node_modules/"),
+        ])
+        .unwrap(),
+      ),
+      exclude: PathOrPatternSet::from_absolute_paths(vec![
+        ignore_dir_path.to_path_buf()
+      ])
+      .unwrap(),
+    };
+    let result = file_collector.collect_file_patterns(file_patterns).unwrap();
     let expected = [
       "README.md",
       "a.ts",
@@ -930,7 +960,7 @@ mod tests {
     let ignore_dir_files = ["g.d.ts", ".gitignore"];
     create_files(&ignore_dir_path, &ignore_dir_files);
-    let predicate = |path: &Path| {
+    let predicate = |path: &Path, _: &FilePatterns| {
       // exclude dotfiles
       path
         .file_name()
@@ -940,38 +970,46 @@ mod tests {
     };
     let result = collect_specifiers(
-      &FilesConfig {
-        include: Some(vec![
-          PathBuf::from("http://localhost:8080"),
-          root_dir_path.to_path_buf(),
-          PathBuf::from("https://localhost:8080".to_string()),
-        ]),
-        exclude: vec![ignore_dir_path.to_path_buf()],
+      FilePatterns {
+        include: Some(
+          PathOrPatternSet::from_absolute_paths(vec![
+            PathBuf::from("http://localhost:8080"),
+            root_dir_path.to_path_buf(),
+            PathBuf::from("https://localhost:8080".to_string()),
+          ])
+          .unwrap(),
+        ),
+        exclude: PathOrPatternSet::from_absolute_paths(vec![
+          ignore_dir_path.to_path_buf()
+        ])
+        .unwrap(),
       },
       predicate,
     )
     .unwrap();
-    let root_dir_url =
-      ModuleSpecifier::from_file_path(root_dir_path.canonicalize())
-        .unwrap()
-        .to_string();
-    let expected: Vec<ModuleSpecifier> = [
-      "http://localhost:8080",
-      &format!("{root_dir_url}/a.ts"),
-      &format!("{root_dir_url}/b.js"),
-      &format!("{root_dir_url}/c.tsx"),
-      &format!("{root_dir_url}/child/README.md"),
-      &format!("{root_dir_url}/child/e.mjs"),
-      &format!("{root_dir_url}/child/f.mjsx"),
-      &format!("{root_dir_url}/d.jsx"),
-      "https://localhost:8080",
-    ]
-    .iter()
-    .map(|f| ModuleSpecifier::parse(f).unwrap())
-    .collect::<Vec<_>>();
+    let root_dir_url = ModuleSpecifier::from_file_path(&root_dir_path)
+      .unwrap()
+      .to_string();
+    let expected = vec![
+      "http://localhost:8080/".to_string(),
+      "https://localhost:8080/".to_string(),
+      format!("{root_dir_url}/a.ts"),
+      format!("{root_dir_url}/b.js"),
+      format!("{root_dir_url}/c.tsx"),
+      format!("{root_dir_url}/child/README.md"),
+      format!("{root_dir_url}/child/e.mjs"),
+      format!("{root_dir_url}/child/f.mjsx"),
+      format!("{root_dir_url}/d.jsx"),
+    ];
-    assert_eq!(result, expected);
+    assert_eq!(
+      result
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>(),
+      expected
+    );
     let scheme = if cfg!(target_os = "windows") {
       "file:///"
@@ -979,28 +1017,34 @@
       "file://"
     };
     let result = collect_specifiers(
-      &FilesConfig {
-        include: Some(vec![PathBuf::from(format!(
-          "{}{}",
-          scheme,
-          root_dir_path.join("child").to_string().replace('\\', "/")
-        ))]),
-        exclude: vec![],
+      FilePatterns {
+        include: Some(
+          PathOrPatternSet::from_absolute_paths(vec![PathBuf::from(format!(
+            "{}{}",
+            scheme,
+            root_dir_path.join("child").to_string().replace('\\', "/")
+          ))])
+          .unwrap(),
+        ),
+        exclude: Default::default(),
       },
       predicate,
     )
     .unwrap();
-    let expected: Vec<ModuleSpecifier> = [
-      &format!("{root_dir_url}/child/README.md"),
-      &format!("{root_dir_url}/child/e.mjs"),
-      &format!("{root_dir_url}/child/f.mjsx"),
-    ]
-    .iter()
-    .map(|f| ModuleSpecifier::parse(f).unwrap())
-    .collect::<Vec<_>>();
+    let expected = vec![
+      format!("{root_dir_url}/child/README.md"),
+      format!("{root_dir_url}/child/e.mjs"),
+      format!("{root_dir_url}/child/f.mjsx"),
+    ];
-    assert_eq!(result, expected);
+    assert_eq!(
+      result
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>(),
+      expected
+    );
   }
   #[tokio::test]