summaryrefslogtreecommitdiff
path: root/cli/text_encoding.rs
diff options
context:
space:
mode:
author Maayan Hanin <maayan.asa.hanin@gmail.com> 2020-08-04 00:39:48 +0300
committer GitHub <noreply@github.com> 2020-08-03 23:39:48 +0200
commit 5fc5e7b54a9fba421dfc473016625a4f592403ed (patch)
tree e66bbbdaa30b34b7cef8072ded8ea3f0575c47f0 /cli/text_encoding.rs
parent d615ebefe2e306f2877afb40dc603f71263407d6 (diff)
fix(cli): add support for non-UTF8 source files (#6789)
Fixes: #5542
Diffstat (limited to 'cli/text_encoding.rs')
-rw-r--r-- cli/text_encoding.rs 94
1 files changed, 94 insertions, 0 deletions
diff --git a/cli/text_encoding.rs b/cli/text_encoding.rs
new file mode 100644
index 000000000..9d8e5bcc9
--- /dev/null
+++ b/cli/text_encoding.rs
@@ -0,0 +1,94 @@
+// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license.
+use encoding_rs::*;
+use std::{
+ borrow::Cow,
+ io::{Error, ErrorKind},
+};
+
+/// Guesses the charset label of `bytes` from a leading byte order mark.
+///
+/// An `FF FE` prefix yields `"utf-16le"` and an `FE FF` prefix yields
+/// `"utf-16be"`. Anything else, including empty or one-byte input, is
+/// reported as `"utf-8"`. Only the first two bytes are inspected; the
+/// remaining content is never validated.
+pub fn detect_charset(bytes: &'_ [u8]) -> &'static str {
+  // The `..` rest pattern requires both BOM bytes to be present, so inputs
+  // shorter than a BOM fall through to the default arm.
+  match bytes {
+    [0xFF, 0xFE, ..] => "utf-16le",
+    [0xFE, 0xFF, ..] => "utf-16be",
+    // No recognised BOM: assume UTF-8.
+    _ => "utf-8",
+  }
+}
+
+/// Decodes `bytes` from the encoding labelled `charset` into UTF-8 text.
+///
+/// Labels are resolved by encoding_rs, which covers exactly the encodings of
+/// the WHATWG Encoding Standard (see: https://encoding.spec.whatwg.org/).
+///
+/// Fails with `ErrorKind::InvalidInput` for an unknown label and with
+/// `ErrorKind::InvalidData` when `bytes` is malformed for that encoding.
+pub fn convert_to_utf8<'a>(
+  bytes: &'a [u8],
+  charset: &'_ str,
+) -> Result<Cow<'a, str>, Error> {
+  let unsupported = || format!("Unsupported charset: {}", charset);
+  // Strict decode: no BOM handling, and malformed input is an error rather
+  // than being patched with replacement characters.
+  Encoding::for_label(charset.as_bytes())
+    .ok_or_else(|| Error::new(ErrorKind::InvalidInput, unsupported()))?
+    .decode_without_bom_handling_and_without_replacement(bytes)
+    .ok_or_else(|| ErrorKind::InvalidData.into())
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  /// Asserts `detect_charset(input)` equals `expected`, ignoring case.
+  fn test_detection(input: &[u8], expected: &str) {
+    let detected = detect_charset(input);
+    assert_eq!(expected.to_lowercase(), detected.to_lowercase());
+  }
+
+  /// Asserts that decoding `input` as `charset` fails with `expected`.
+  fn test_decoding_failure(input: &[u8], charset: &str, expected: ErrorKind) {
+    let err = convert_to_utf8(input, charset).expect_err("Err expected");
+    assert_eq!(err.kind(), expected);
+  }
+
+  #[test]
+  fn test_detection_utf8_no_bom() {
+    let text = "Hello UTF-8 it is \u{23F0} for Deno!";
+    test_detection(text.as_bytes(), "utf-8");
+  }
+
+  #[test]
+  fn test_detection_utf16_little_endian() {
+    test_detection(b"\xFF\xFEHello UTF-16LE", "utf-16le");
+  }
+
+  #[test]
+  fn test_detection_utf16_big_endian() {
+    test_detection(b"\xFE\xFFHello UTF-16BE", "utf-16be");
+  }
+
+  #[test]
+  fn test_decoding_unsupported_charset() {
+    // UTF-32 is not part of the WHATWG Encoding Standard.
+    test_decoding_failure(&[], "utf-32le", ErrorKind::InvalidInput);
+  }
+
+  #[test]
+  fn test_decoding_invalid_utf8() {
+    // 0xFE and 0xFF can never occur in well-formed UTF-8.
+    test_decoding_failure(b"\xFE\xFE\xFF\xFF", "utf-8", ErrorKind::InvalidData);
+  }
+
+  #[test]
+  fn test_decoding_utf16_little_endian() {
+    let decoded = convert_to_utf8(b"h\x00i\x00", "utf-16le");
+    assert_eq!(decoded.expect("Ok expected"), "hi");
+  }
+}