diff options
Diffstat (limited to 'cli/file_fetcher.rs')
-rw-r--r-- | cli/file_fetcher.rs | 346 |
1 files changed, 273 insertions, 73 deletions
diff --git a/cli/file_fetcher.rs b/cli/file_fetcher.rs index 7d72d5cd4..63743f5ab 100644 --- a/cli/file_fetcher.rs +++ b/cli/file_fetcher.rs @@ -7,10 +7,12 @@ use crate::http_util::FetchOnceResult; use crate::msg; use crate::op_error::OpError; use crate::permissions::Permissions; +use crate::text_encoding; use deno_core::ErrBox; use deno_core::ModuleSpecifier; use futures::future::FutureExt; use log::info; +use std::borrow::Cow; use std::collections::HashMap; use std::fs; use std::future::Future; @@ -24,6 +26,47 @@ use std::sync::Arc; use std::sync::Mutex; use url::Url; +/// Structure representing a text document. +#[derive(Debug, Clone)] +pub struct TextDocument { + bytes: Vec<u8>, + charset: Cow<'static, str>, +} + +impl TextDocument { + pub fn new( + bytes: Vec<u8>, + charset: Option<impl Into<Cow<'static, str>>>, + ) -> TextDocument { + let charset = charset + .map(|cs| cs.into()) + .unwrap_or_else(|| text_encoding::detect_charset(&bytes).into()); + TextDocument { bytes, charset } + } + + pub fn as_bytes(&self) -> &Vec<u8> { + &self.bytes + } + + pub fn into_bytes(self) -> Vec<u8> { + self.bytes + } + + pub fn to_str(&self) -> Result<Cow<str>, std::io::Error> { + text_encoding::convert_to_utf8(&self.bytes, &self.charset) + } + + pub fn to_string(&self) -> Result<String, std::io::Error> { + self.to_str().map(String::from) + } +} + +impl From<Vec<u8>> for TextDocument { + fn from(bytes: Vec<u8>) -> Self { + TextDocument::new(bytes, Option::<&str>::None) + } +} + /// Structure representing local or remote file. /// /// In case of remote file `url` might be different than originally requested URL, if so @@ -34,7 +77,7 @@ pub struct SourceFile { pub filename: PathBuf, pub types_header: Option<String>, pub media_type: msg::MediaType, - pub source_code: Vec<u8>, + pub source_code: TextDocument, } /// Simple struct implementing in-process caching to prevent multiple @@ -180,8 +223,9 @@ impl SourceFileFetcher { match result { Ok(mut file) => { // TODO: move somewhere? - if file.source_code.starts_with(b"#!") { - file.source_code = filter_shebang(file.source_code); + if file.source_code.bytes.starts_with(b"#!") { + file.source_code = + filter_shebang(&file.source_code.to_str().unwrap()[..]).into(); } // Cache in-process for subsequent access. @@ -313,12 +357,12 @@ impl SourceFileFetcher { Err(e) => return Err(e.into()), }; - let media_type = map_content_type(&filepath, None); + let (media_type, charset) = map_content_type(&filepath, None); Ok(SourceFile { url: module_url.clone(), filename: filepath, media_type, - source_code, + source_code: TextDocument::new(source_code, charset), types_header: None, }) } @@ -380,7 +424,7 @@ impl SourceFileFetcher { let cache_filename = self.http_cache.get_cache_filename(module_url); let fake_filepath = PathBuf::from(module_url.path()); - let media_type = map_content_type( + let (media_type, charset) = map_content_type( &fake_filepath, headers.get("content-type").map(|e| e.as_str()), ); @@ -389,7 +433,7 @@ impl SourceFileFetcher { url: module_url.clone(), filename: cache_filename, media_type, - source_code, + source_code: TextDocument::new(source_code, charset), types_header, })) } @@ -490,7 +534,7 @@ impl SourceFileFetcher { let cache_filepath = dir.http_cache.get_cache_filename(&module_url); // Used to sniff out content type from file extension - probably to be removed let fake_filepath = PathBuf::from(module_url.path()); - let media_type = map_content_type( + let (media_type, charset) = map_content_type( &fake_filepath, headers.get("content-type").map(String::as_str), ); @@ -502,7 +546,7 @@ impl SourceFileFetcher { url: module_url.clone(), filename: cache_filepath, media_type, - source_code: source, + source_code: TextDocument::new(source, charset), types_header, }; @@ -532,16 +576,19 @@ pub fn map_file_extension(path: &Path) -> msg::MediaType { } } -// convert a ContentType string into a enumerated MediaType -fn map_content_type(path: &Path, content_type: Option<&str>) -> msg::MediaType { +// convert a ContentType string into a enumerated MediaType + optional charset +fn map_content_type( + path: &Path, + content_type: Option<&str>, +) -> (msg::MediaType, Option<String>) { match content_type { Some(content_type) => { - // sometimes there is additional data after the media type in + // Sometimes there is additional data after the media type in // Content-Type so we have to do a bit of manipulation so we are only - // dealing with the actual media type - let ct_vector: Vec<&str> = content_type.split(';').collect(); - let ct: &str = ct_vector.first().unwrap(); - match ct.to_lowercase().as_ref() { + // dealing with the actual media type. + let mut ct_iter = content_type.split(';'); + let ct = ct_iter.next().unwrap(); + let media_type = match ct.to_lowercase().as_ref() { "application/typescript" | "text/typescript" | "video/vnd.dlna.mpeg-tts" @@ -565,9 +612,16 @@ fn map_content_type(path: &Path, content_type: Option<&str>) -> msg::MediaType { debug!("unknown content type: {}", content_type); msg::MediaType::Unknown } - } + }; + + let charset = ct_iter + .map(str::trim) + .find_map(|s| s.strip_prefix("charset=")) + .map(String::from); + + (media_type, charset) } - None => map_file_extension(path), + None => (map_file_extension(path), None), } } @@ -586,8 +640,7 @@ fn map_js_like_extension( } } -fn filter_shebang(bytes: Vec<u8>) -> Vec<u8> { - let string = str::from_utf8(&bytes).unwrap(); +fn filter_shebang(string: &str) -> Vec<u8> { if let Some(i) = string.find('\n') { let (_, rest) = string.split_at(i); rest.as_bytes().to_owned() @@ -767,7 +820,7 @@ mod tests { assert!(result.is_ok()); let r = result.unwrap(); assert_eq!( - r.source_code, + r.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); @@ -794,7 +847,7 @@ mod tests { assert!(result2.is_ok()); let r2 = result2.unwrap(); assert_eq!( - r2.source_code, + r2.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); // If get_source_file does not call remote, this should be JavaScript @@ -823,7 +876,7 @@ mod tests { assert!(result3.is_ok()); let r3 = result3.unwrap(); assert_eq!( - r3.source_code, + r3.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); // If get_source_file does not call remote, this should be JavaScript @@ -850,7 +903,7 @@ mod tests { assert!(result4.is_ok()); let r4 = result4.unwrap(); let expected4 = &b"export { printHello } from \"./print_hello.ts\";\n"[..]; - assert_eq!(r4.source_code, expected4); + assert_eq!(r4.source_code.bytes, expected4); // Resolved back to TypeScript assert_eq!(&(r4.media_type), &msg::MediaType::TypeScript); @@ -880,7 +933,7 @@ mod tests { assert!(result.is_ok()); let r = result.unwrap(); let expected = b"export const loaded = true;\n"; - assert_eq!(r.source_code, expected); + assert_eq!(r.source_code.bytes, expected); assert_eq!(&(r.media_type), &msg::MediaType::JavaScript); let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/javascript"); @@ -906,7 +959,7 @@ mod tests { assert!(result2.is_ok()); let r2 = result2.unwrap(); let expected2 = b"export const loaded = true;\n"; - assert_eq!(r2.source_code, expected2); + assert_eq!(r2.source_code.bytes, expected2); // If get_source_file does not call remote, this should be TypeScript // as we modified before! (we do not overwrite .headers.json due to no http // fetch) @@ -932,7 +985,7 @@ mod tests { assert!(result3.is_ok()); let r3 = result3.unwrap(); let expected3 = b"export const loaded = true;\n"; - assert_eq!(r3.source_code, expected3); + assert_eq!(r3.source_code.bytes, expected3); // Now the old .headers.json file should be overwritten back to JavaScript! // (due to http fetch) assert_eq!(&(r3.media_type), &msg::MediaType::JavaScript); @@ -1352,7 +1405,7 @@ mod tests { .await; assert!(result.is_ok()); let r = result.unwrap(); - assert_eq!(r.source_code, b"export const loaded = true;\n"); + assert_eq!(r.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); // Modify .metadata.json, make sure read from local @@ -1368,7 +1421,7 @@ mod tests { let result2 = fetcher.fetch_cached_remote_source(&module_url, 1); assert!(result2.is_ok()); let r2 = result2.unwrap().unwrap(); - assert_eq!(r2.source_code, b"export const loaded = true;\n"); + assert_eq!(r2.source_code.bytes, b"export const loaded = true;\n"); // Not MediaType::TypeScript due to .headers.json modification assert_eq!(&(r2.media_type), &msg::MediaType::JavaScript); @@ -1392,7 +1445,7 @@ mod tests { .await; assert!(result.is_ok()); let r = result.unwrap(); - assert_eq!(r.source_code, b"export const loaded = true;\n"); + assert_eq!(r.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/typescript"); @@ -1417,7 +1470,7 @@ mod tests { .await; assert!(result.is_ok()); let r2 = result.unwrap(); - assert_eq!(r2.source_code, b"export const loaded = true;\n"); + assert_eq!(r2.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r2.media_type), &msg::MediaType::JavaScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/javascript"); @@ -1442,7 +1495,7 @@ mod tests { .await; assert!(result.is_ok()); let r3 = result.unwrap(); - assert_eq!(r3.source_code, b"export const loaded = true;\n"); + assert_eq!(r3.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r3.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/typescript"); @@ -1523,6 +1576,63 @@ mod tests { } } + async fn test_fetch_source_file_from_disk_nonstandard_encoding( + charset: &str, + expected_content: String, + ) { + let (_temp_dir, fetcher) = test_setup(); + + let p = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(format!("tests/encoding/{}.ts", charset)); + let specifier = + ModuleSpecifier::resolve_url_or_path(p.to_str().unwrap()).unwrap(); + let r = fetcher + .fetch_source_file(&specifier, None, Permissions::allow_all()) + .await; + assert!(r.is_ok()); + let fetched_file = r.unwrap(); + let source_code = fetched_file.source_code.to_str(); + assert!(source_code.is_ok()); + let actual = source_code.unwrap(); + assert_eq!(expected_content, actual); + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_16_be() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-16be", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_16_le() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-16le", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_8_with_bom() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-8", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + #[test] fn test_map_file_extension() { assert_eq!( @@ -1571,43 +1681,43 @@ mod tests { fn test_map_content_type_extension_only() { // Extension only assert_eq!( - map_content_type(Path::new("foo/bar.ts"), None), + map_content_type(Path::new("foo/bar.ts"), None).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.tsx"), None), + map_content_type(Path::new("foo/bar.tsx"), None).0, msg::MediaType::TSX ); assert_eq!( - map_content_type(Path::new("foo/bar.d.ts"), None), + map_content_type(Path::new("foo/bar.d.ts"), None).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.js"), None), + map_content_type(Path::new("foo/bar.js"), None).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar.txt"), None), + map_content_type(Path::new("foo/bar.txt"), None).0, msg::MediaType::Unknown ); assert_eq!( - map_content_type(Path::new("foo/bar.jsx"), None), + map_content_type(Path::new("foo/bar.jsx"), None).0, msg::MediaType::JSX ); assert_eq!( - map_content_type(Path::new("foo/bar.json"), None), + map_content_type(Path::new("foo/bar.json"), None).0, msg::MediaType::Json ); assert_eq!( - map_content_type(Path::new("foo/bar.wasm"), None), + map_content_type(Path::new("foo/bar.wasm"), None).0, msg::MediaType::Wasm ); assert_eq!( - map_content_type(Path::new("foo/bar.cjs"), None), + map_content_type(Path::new("foo/bar.cjs"), None).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), None), + map_content_type(Path::new("foo/bar"), None).0, msg::MediaType::Unknown ); } @@ -1616,140 +1726,154 @@ mod tests { fn test_map_content_type_media_type_with_no_extension() { // Media Type assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/typescript")), + map_content_type(Path::new("foo/bar"), Some("application/typescript")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/typescript")), + map_content_type(Path::new("foo/bar"), Some("text/typescript")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("video/vnd.dlna.mpeg-tts")), + map_content_type(Path::new("foo/bar"), Some("video/vnd.dlna.mpeg-tts")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("video/mp2t")), + map_content_type(Path::new("foo/bar"), Some("video/mp2t")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/x-typescript")), + map_content_type(Path::new("foo/bar"), Some("application/x-typescript")) + .0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/javascript")), + map_content_type(Path::new("foo/bar"), Some("application/javascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/javascript")), + map_content_type(Path::new("foo/bar"), Some("text/javascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/ecmascript")), + map_content_type(Path::new("foo/bar"), Some("application/ecmascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/ecmascript")), + map_content_type(Path::new("foo/bar"), Some("text/ecmascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/x-javascript")), + map_content_type(Path::new("foo/bar"), Some("application/x-javascript")) + .0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/json")), + map_content_type(Path::new("foo/bar"), Some("application/json")).0, msg::MediaType::Json ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/node")), + map_content_type(Path::new("foo/bar"), Some("application/node")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/json")), + map_content_type(Path::new("foo/bar"), Some("text/json")).0, msg::MediaType::Json ); + assert_eq!( + map_content_type(Path::new("foo/bar"), Some("text/json; charset=utf-8 ")), + (msg::MediaType::Json, Some("utf-8".to_owned())) + ); } #[test] fn test_map_file_extension_media_type_with_extension() { assert_eq!( - map_content_type(Path::new("foo/bar.ts"), Some("text/plain")), + map_content_type(Path::new("foo/bar.ts"), Some("text/plain")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.ts"), Some("foo/bar")), + map_content_type(Path::new("foo/bar.ts"), Some("foo/bar")).0, msg::MediaType::Unknown ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/typescript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/javascript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/x-typescript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("video/vnd.dlna.mpeg-tts"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( - map_content_type(Path::new("foo/bar.tsx"), Some("video/mp2t")), + map_content_type(Path::new("foo/bar.tsx"), Some("video/mp2t")).0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/javascript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/x-typescript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/ecmascript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( - map_content_type(Path::new("foo/bar.jsx"), Some("text/ecmascript")), + map_content_type(Path::new("foo/bar.jsx"), Some("text/ecmascript")).0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/x-javascript"), - ), + ) + .0, msg::MediaType::JSX ); } #[test] fn test_filter_shebang() { - assert_eq!(filter_shebang(b"#!"[..].to_owned()), b""); - assert_eq!(filter_shebang(b"#!\n\n"[..].to_owned()), b"\n\n"); - let code = b"#!/usr/bin/env deno\nconsole.log('hello');\n"[..].to_owned(); + assert_eq!(filter_shebang("#!"), b""); + assert_eq!(filter_shebang("#!\n\n"), b"\n\n"); + let code = "#!/usr/bin/env deno\nconsole.log('hello');\n"; assert_eq!(filter_shebang(code), b"\nconsole.log('hello');\n"); } @@ -1771,7 +1895,7 @@ mod tests { .await; assert!(source.is_ok()); let source = source.unwrap(); - assert_eq!(source.source_code, b"console.log('etag')"); + assert_eq!(source.source_code.bytes, b"console.log('etag')"); assert_eq!(&(source.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); @@ -1798,7 +1922,7 @@ mod tests { ) .await .unwrap(); - assert_eq!(cached_source.source_code, b"changed content"); + assert_eq!(cached_source.source_code.bytes, b"changed content"); let modified2 = metadata_path.metadata().unwrap().modified().unwrap(); @@ -1825,7 +1949,7 @@ mod tests { .await; assert!(source.is_ok()); let source = source.unwrap(); - assert_eq!(source.source_code, b"export const foo = 'foo';"); + assert_eq!(source.source_code.bytes, b"export const foo = 'foo';"); assert_eq!(&(source.media_type), &msg::MediaType::JavaScript); assert_eq!( source.types_header, @@ -1833,4 +1957,80 @@ mod tests { ); drop(http_server_guard); } + + #[tokio::test] + async fn test_fetch_source_file_from_net_utf16_le() { + let content = + std::str::from_utf8(b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A") + .unwrap(); + test_fetch_non_utf8_source_file_from_net( + "utf-16le", + "utf-16le.ts", + content, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_net_utf16_be() { + let content = + std::str::from_utf8(b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A") + .unwrap(); + test_fetch_non_utf8_source_file_from_net( + "utf-16be", + "utf-16be.ts", + content, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_net_windows_1255() { + let content = "console.log(\"\u{5E9}\u{5DC}\u{5D5}\u{5DD} \ + \u{5E2}\u{5D5}\u{5DC}\u{5DD}\");\u{A}"; + test_fetch_non_utf8_source_file_from_net( + "windows-1255", + "windows-1255", + content, + ) + .await; + } + + async fn test_fetch_non_utf8_source_file_from_net( + charset: &str, + file_name: &str, + expected_content: &str, + ) { + let http_server_guard = test_util::http_server(); + let (_temp_dir, fetcher) = test_setup(); + let module_url = Url::parse(&format!( + "http://127.0.0.1:4545/cli/tests/encoding/{}", + file_name + )) + .unwrap(); + + let source = fetcher + .fetch_remote_source( + &module_url, + false, + false, + 1, + &Permissions::allow_all(), + ) + .await; + assert!(source.is_ok()); + let source = source.unwrap(); + assert_eq!(&source.source_code.charset.to_lowercase()[..], charset); + let text = &source.source_code.to_str().unwrap(); + assert_eq!(text, expected_content); + assert_eq!(&(source.media_type), &msg::MediaType::TypeScript); + + let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); + assert_eq!( + headers.get("content-type").unwrap(), + &format!("application/typescript;charset={}", charset) + ); + + drop(http_server_guard); + } } |