diff options
author | Kitson Kelly <me@kitsonkelly.com> | 2021-01-22 21:03:16 +1100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-22 21:03:16 +1100 |
commit | 1a9209d1e3ed297c96a698550ab833c54c02a4ee (patch) | |
tree | 21be94f78196af33dd4a59c40fbfe2e7fa744922 /cli/lsp/text.rs | |
parent | ffa920e4b9594f201756f9eeca542e5dfb8576d1 (diff) |
fix(lsp): handle mbc documents properly (#9151)
Co-authored-by: Ryan Dahl <ry@tinyclouds.org>
Diffstat (limited to 'cli/lsp/text.rs')
-rw-r--r-- | cli/lsp/text.rs | 617 |
1 files changed, 427 insertions, 190 deletions
diff --git a/cli/lsp/text.rs b/cli/lsp/text.rs index e871cb265..1d350c12f 100644 --- a/cli/lsp/text.rs +++ b/cli/lsp/text.rs @@ -1,123 +1,233 @@ // Copyright 2018-2021 the Deno authors. All rights reserved. MIT license. +use deno_core::error::custom_error; +use deno_core::error::AnyError; use deno_core::serde_json::json; use deno_core::serde_json::Value; use dissimilar::diff; use dissimilar::Chunk; +use lspower::jsonrpc; use lspower::lsp_types; use lspower::lsp_types::TextEdit; +use std::collections::HashMap; use std::ops::Bound; -use std::ops::Range; use std::ops::RangeBounds; +use text_size::TextRange; +use text_size::TextSize; -// TODO(@kitson) in general all of these text handling routines don't handle -// JavaScript encoding in the same way and likely cause issues when trying to -// arbitrate between chars and Unicode graphemes. There be dragons. - -/// Generate a character position for the start of each line. For example: -/// -/// ```rust -/// let actual = index_lines("a\nb\n"); -/// assert_eq!(actual, vec![0, 2, 4]); -/// ``` -/// -pub fn index_lines(text: &str) -> Vec<u32> { - let mut indexes = vec![0_u32]; - for (i, c) in text.chars().enumerate() { - if c == '\n' { - indexes.push((i + 1) as u32); +fn partition_point<T, P>(slice: &[T], mut predicate: P) -> usize +where + P: FnMut(&T) -> bool, +{ + let mut left = 0; + let mut right = slice.len(); + + while left != right { + let mid = left + (right - left) / 2; + // SAFETY: + // When left < right, left <= mid < right. + // Therefore left always increases and right always decreases, + // and either of them is selected. + // In both cases left <= right is satisfied. + // Therefore if left < right in a step, + // left <= right is satisfied in the next step. + // Therefore as long as left != right, 0 <= left < right <= len is satisfied + // and if this case 0 <= mid < len is satisfied too. 
+ let value = unsafe { slice.get_unchecked(mid) }; + if predicate(value) { + left = mid + 1; + } else { + right = mid; } } - indexes + + left } -enum IndexValid { - All, - UpTo(u32), +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct Utf16Char { + pub start: TextSize, + pub end: TextSize, } -impl IndexValid { - fn covers(&self, line: u32) -> bool { - match *self { - IndexValid::UpTo(to) => to > line, - IndexValid::All => true, +impl Utf16Char { + fn len(&self) -> TextSize { + self.end - self.start + } + + fn len_utf16(&self) -> usize { + if self.len() == TextSize::from(4) { + 2 + } else { + 1 } } } -fn to_range(line_index: &[u32], range: lsp_types::Range) -> Range<usize> { - let start = - (line_index[range.start.line as usize] + range.start.character) as usize; - let end = - (line_index[range.end.line as usize] + range.end.character) as usize; - Range { start, end } +#[derive(Debug, Clone, Default, Eq, PartialEq)] +pub struct LineIndex { + utf8_offsets: Vec<TextSize>, + utf16_lines: HashMap<u32, Vec<Utf16Char>>, + utf16_offsets: Vec<TextSize>, } -pub fn to_position(line_index: &[u32], char_pos: u32) -> lsp_types::Position { - let mut line = 0_usize; - let mut line_start = 0_u32; - for (pos, v) in line_index.iter().enumerate() { - if char_pos < *v { - break; +impl LineIndex { + pub fn new(text: &str) -> LineIndex { + let mut utf16_lines = HashMap::new(); + let mut utf16_chars = Vec::new(); + + let mut utf8_offsets = vec![0.into()]; + let mut utf16_offsets = vec![0.into()]; + let mut curr_row = 0.into(); + let mut curr_col = 0.into(); + let mut curr_offset_u16 = 0.into(); + let mut line = 0; + for c in text.chars() { + let c_len = TextSize::of(c); + curr_row += c_len; + curr_offset_u16 += TextSize::from(c.len_utf16() as u32); + if c == '\n' { + utf8_offsets.push(curr_row); + utf16_offsets.push(curr_offset_u16); + + if !utf16_chars.is_empty() { + utf16_lines.insert(line, utf16_chars); + utf16_chars = Vec::new(); + } + + curr_col = 0.into(); + line += 1; + 
continue; + } + + if !c.is_ascii() { + utf16_chars.push(Utf16Char { + start: curr_col, + end: curr_col + c_len, + }); + } + curr_col += c_len; + } + + if !utf16_chars.is_empty() { + utf16_lines.insert(line, utf16_chars); + } + + LineIndex { + utf8_offsets, + utf16_lines, + utf16_offsets, } - line_start = *v; - line = pos; } - lsp_types::Position { - line: line as u32, - character: char_pos - line_start, + /// Convert a u16 based range to a u8 TextRange. + pub fn get_text_range( + &self, + range: lsp_types::Range, + ) -> Result<TextRange, AnyError> { + let start = self.offset(range.start)?; + let end = self.offset(range.end)?; + Ok(TextRange::new(start, end)) + } + + /// Return a u8 offset based on a u16 position. + pub fn offset( + &self, + position: lsp_types::Position, + ) -> Result<TextSize, AnyError> { + let col = self.utf16_to_utf8_col(position.line, position.character); + if let Some(line_offset) = self.utf8_offsets.get(position.line as usize) { + Ok(line_offset + col) + } else { + Err(custom_error("OutOfRange", "The position is out of range.")) + } } -} -pub fn to_char_pos(line_index: &[u32], position: lsp_types::Position) -> u32 { - if let Some(line_start) = line_index.get(position.line as usize) { - line_start + position.character - } else { - 0_u32 + /// Convert an lsp Position into a tsc/TypeScript "position", which is really + /// an u16 byte offset from the start of the string represented as an u32. + pub fn offset_tsc( + &self, + position: lsp_types::Position, + ) -> jsonrpc::Result<u32> { + self + .offset_utf16(position) + .map(|ts| ts.into()) + .map_err(|err| jsonrpc::Error::invalid_params(err.to_string())) } -} -/// Apply a vector of document changes to the supplied string. 
-pub fn apply_content_changes( - content: &mut String, - content_changes: Vec<lsp_types::TextDocumentContentChangeEvent>, -) { - let mut line_index = index_lines(&content); - let mut index_valid = IndexValid::All; - for change in content_changes { - if let Some(range) = change.range { - if !index_valid.covers(range.start.line) { - line_index = index_lines(&content); - } - let range = to_range(&line_index, range); - content.replace_range(range, &change.text); + fn offset_utf16( + &self, + position: lsp_types::Position, + ) -> Result<TextSize, AnyError> { + if let Some(line_offset) = self.utf16_offsets.get(position.line as usize) { + Ok(line_offset + TextSize::from(position.character)) } else { - *content = change.text; - index_valid = IndexValid::UpTo(0); + Err(custom_error("OutOfRange", "The position is out of range.")) } } + + /// Returns a u16 position based on a u16 offset, which TypeScript offsets are + /// returned as u16. + pub fn position_tsc(&self, offset: TextSize) -> lsp_types::Position { + let line = partition_point(&self.utf16_offsets, |&it| it <= offset) - 1; + let line_start_offset = self.utf16_offsets[line]; + let col = offset - line_start_offset; + + lsp_types::Position { + line: line as u32, + character: col.into(), + } + } + + /// Returns a u16 position based on a u8 offset. 
+ pub fn position_utf16(&self, offset: TextSize) -> lsp_types::Position { + let line = partition_point(&self.utf8_offsets, |&it| it <= offset) - 1; + let line_start_offset = self.utf8_offsets[line]; + let col = offset - line_start_offset; + + lsp_types::Position { + line: line as u32, + character: col.into(), + } + } + + fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize { + if let Some(utf16_chars) = self.utf16_lines.get(&line) { + for c in utf16_chars { + if col > u32::from(c.start) { + col += u32::from(c.len()) - c.len_utf16() as u32; + } else { + break; + } + } + } + + col.into() + } } /// Compare two strings and return a vector of text edit records which are /// supported by the Language Server Protocol. pub fn get_edits(a: &str, b: &str) -> Vec<TextEdit> { + if a == b { + return vec![]; + } let chunks = diff(a, b); let mut text_edits = Vec::<TextEdit>::new(); - let line_index = index_lines(a); + let line_index = LineIndex::new(a); let mut iter = chunks.iter().peekable(); - let mut a_pos = 0_u32; + let mut a_pos = TextSize::from(0); loop { let chunk = iter.next(); match chunk { None => break, Some(Chunk::Equal(e)) => { - a_pos += e.chars().count() as u32; + a_pos += TextSize::from(e.encode_utf16().count() as u32); } Some(Chunk::Delete(d)) => { - let start = to_position(&line_index, a_pos); - a_pos += d.chars().count() as u32; - let end = to_position(&line_index, a_pos); + let start = line_index.position_utf16(a_pos); + a_pos += TextSize::from(d.encode_utf16().count() as u32); + let end = line_index.position_utf16(a_pos); let range = lsp_types::Range { start, end }; match iter.peek() { Some(Chunk::Insert(i)) => { @@ -134,7 +244,7 @@ pub fn get_edits(a: &str, b: &str) -> Vec<TextEdit> { } } Some(Chunk::Insert(i)) => { - let pos = to_position(&line_index, a_pos); + let pos = line_index.position_utf16(a_pos); let range = lsp_types::Range { start: pos, end: pos, @@ -153,6 +263,9 @@ pub fn get_edits(a: &str, b: &str) -> Vec<TextEdit> { /// Convert a 
difference between two strings into a change range used by the /// TypeScript Language Service. pub fn get_range_change(a: &str, b: &str) -> Value { + if a == b { + return json!(null); + } let chunks = diff(a, b); let mut iter = chunks.iter().peekable(); let mut started = false; @@ -162,12 +275,12 @@ pub fn get_range_change(a: &str, b: &str) -> Value { let mut equal = 0; let mut a_pos = 0; loop { - let chunk = iter.next(); - match chunk { + let diff = iter.next(); + match diff { None => break, Some(Chunk::Equal(e)) => { - a_pos += e.chars().count(); - equal += e.chars().count(); + a_pos += e.encode_utf16().count(); + equal += e.encode_utf16().count(); } Some(Chunk::Delete(d)) => { if !started { @@ -175,7 +288,7 @@ pub fn get_range_change(a: &str, b: &str) -> Value { started = true; equal = 0; } - a_pos += d.chars().count(); + a_pos += d.encode_utf16().count(); if started { end = a_pos; new_length += equal; @@ -191,7 +304,7 @@ pub fn get_range_change(a: &str, b: &str) -> Value { } else { end += equal; } - new_length += i.chars().count() + equal; + new_length += i.encode_utf16().count() + equal; equal = 0; } } @@ -215,7 +328,7 @@ pub fn slice(s: &str, range: impl RangeBounds<usize>) -> &str { let len = match range.end_bound() { Bound::Included(bound) => *bound + 1, Bound::Excluded(bound) => *bound, - Bound::Unbounded => s.len(), + Bound::Unbounded => s.encode_utf16().count(), } - start; substring(s, start, start + len) } @@ -231,7 +344,7 @@ pub fn substring(s: &str, start: usize, end: usize) -> &str { break; } if let Some(c) = it.next() { - char_pos += 1; + char_pos += c.len_utf16(); byte_start += c.len_utf8(); } else { break; @@ -244,7 +357,7 @@ pub fn substring(s: &str, start: usize, end: usize) -> &str { break; } if let Some(c) = it.next() { - char_pos += 1; + char_pos += c.len_utf16(); byte_end += c.len_utf8(); } else { break; @@ -258,24 +371,194 @@ mod tests { use super::*; #[test] - fn test_apply_content_changes() { - let mut content = "a\nb\nc\nd".to_string(); 
- let content_changes = vec![lsp_types::TextDocumentContentChangeEvent { - range: Some(lsp_types::Range { - start: lsp_types::Position { - line: 1, - character: 0, - }, - end: lsp_types::Position { - line: 1, - character: 1, - }, - }), - range_length: Some(1), - text: "e".to_string(), - }]; - apply_content_changes(&mut content, content_changes); - assert_eq!(content, "a\ne\nc\nd"); + fn test_line_index() { + let text = "hello\nworld"; + let index = LineIndex::new(text); + assert_eq!( + index.position_utf16(0.into()), + lsp_types::Position { + line: 0, + character: 0 + } + ); + assert_eq!( + index.position_utf16(1.into()), + lsp_types::Position { + line: 0, + character: 1 + } + ); + assert_eq!( + index.position_utf16(5.into()), + lsp_types::Position { + line: 0, + character: 5 + } + ); + assert_eq!( + index.position_utf16(6.into()), + lsp_types::Position { + line: 1, + character: 0 + } + ); + assert_eq!( + index.position_utf16(7.into()), + lsp_types::Position { + line: 1, + character: 1 + } + ); + assert_eq!( + index.position_utf16(8.into()), + lsp_types::Position { + line: 1, + character: 2 + } + ); + assert_eq!( + index.position_utf16(10.into()), + lsp_types::Position { + line: 1, + character: 4 + } + ); + assert_eq!( + index.position_utf16(11.into()), + lsp_types::Position { + line: 1, + character: 5 + } + ); + assert_eq!( + index.position_utf16(12.into()), + lsp_types::Position { + line: 1, + character: 6 + } + ); + + let text = "\nhello\nworld"; + let index = LineIndex::new(text); + assert_eq!( + index.position_utf16(0.into()), + lsp_types::Position { + line: 0, + character: 0 + } + ); + assert_eq!( + index.position_utf16(1.into()), + lsp_types::Position { + line: 1, + character: 0 + } + ); + assert_eq!( + index.position_utf16(2.into()), + lsp_types::Position { + line: 1, + character: 1 + } + ); + assert_eq!( + index.position_utf16(6.into()), + lsp_types::Position { + line: 1, + character: 5 + } + ); + assert_eq!( + index.position_utf16(7.into()), + 
lsp_types::Position { + line: 2, + character: 0 + } + ); + } + + #[test] + fn test_char_len() { + assert_eq!('メ'.len_utf8(), 3); + assert_eq!('メ'.len_utf16(), 1); + assert_eq!('编'.len_utf8(), 3); + assert_eq!('编'.len_utf16(), 1); + assert_eq!('🦕'.len_utf8(), 4); + assert_eq!('🦕'.len_utf16(), 2); + } + + #[test] + fn test_empty_index() { + let col_index = LineIndex::new( + " +const C: char = 'x'; +", + ); + assert_eq!(col_index.utf16_lines.len(), 0); + } + + #[test] + fn test_single_char() { + let col_index = LineIndex::new( + " +const C: char = 'メ'; +", + ); + + assert_eq!(col_index.utf16_lines.len(), 1); + assert_eq!(col_index.utf16_lines[&1].len(), 1); + assert_eq!( + col_index.utf16_lines[&1][0], + Utf16Char { + start: 17.into(), + end: 20.into() + } + ); + + // UTF-16 to UTF-8, no changes + assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15)); + + // UTF-16 to UTF-8 + assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); + + let col_index = LineIndex::new("a𐐏b"); + assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5)); + } + + #[test] + fn test_string() { + let col_index = LineIndex::new( + " +const C: char = \"メ メ\"; +", + ); + + assert_eq!(col_index.utf16_lines.len(), 1); + assert_eq!(col_index.utf16_lines[&1].len(), 2); + assert_eq!( + col_index.utf16_lines[&1][0], + Utf16Char { + start: 17.into(), + end: 20.into() + } + ); + assert_eq!( + col_index.utf16_lines[&1][1], + Utf16Char { + start: 21.into(), + end: 24.into() + } + ); + + // UTF-16 to UTF-8 + assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15)); + + // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1 + assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20 + assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space + assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24 + + assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15)); } #[test] @@ 
-319,6 +602,11 @@ mod tests { #[test] fn test_get_range_change() { let a = "abcdefg"; + let b = "abcdefg"; + let actual = get_range_change(a, b); + assert_eq!(actual, json!(null)); + + let a = "abcdefg"; let b = "abedcfg"; let actual = get_range_change(a, b); assert_eq!( @@ -401,108 +689,56 @@ mod tests { "newLength": 3 }) ); - } - #[test] - fn test_index_lines() { - let actual = index_lines("a\nb\r\nc"); - assert_eq!(actual, vec![0, 2, 5]); - } - - #[test] - fn test_to_position() { - let line_index = index_lines("a\nb\r\nc\n"); - assert_eq!( - to_position(&line_index, 6), - lsp_types::Position { - line: 2, - character: 1, - } - ); - assert_eq!( - to_position(&line_index, 0), - lsp_types::Position { - line: 0, - character: 0, - } - ); + let a = "hello 🦕!"; + let b = "hello deno!"; + let actual = get_range_change(a, b); assert_eq!( - to_position(&line_index, 3), - lsp_types::Position { - line: 1, - character: 1, - } + actual, + json!({ + "span": { + "start": 6, + "length": 2, + }, + "newLength": 4 + }) ); - } - #[test] - fn test_to_position_mbc() { - let line_index = index_lines("y̆\n😱🦕\n🤯\n"); - assert_eq!( - to_position(&line_index, 0), - lsp_types::Position { - line: 0, - character: 0, - } - ); - assert_eq!( - to_position(&line_index, 2), - lsp_types::Position { - line: 0, - character: 2, - } - ); - assert_eq!( - to_position(&line_index, 3), - lsp_types::Position { - line: 1, - character: 0, - } - ); - assert_eq!( - to_position(&line_index, 4), - lsp_types::Position { - line: 1, - character: 1, - } - ); - assert_eq!( - to_position(&line_index, 5), - lsp_types::Position { - line: 1, - character: 2, - } - ); - assert_eq!( - to_position(&line_index, 6), - lsp_types::Position { - line: 2, - character: 0, - } - ); - assert_eq!( - to_position(&line_index, 7), - lsp_types::Position { - line: 2, - character: 1, - } - ); + let a = "hello deno!"; + let b = "hello deno🦕!"; + let actual = get_range_change(a, b); assert_eq!( - to_position(&line_index, 8), - 
lsp_types::Position { - line: 3, - character: 0, - } + actual, + json!({ + "span": { + "start": 10, + "length": 0, + }, + "newLength": 2 + }) ); + + // TODO(@kitsonk): https://github.com/dtolnay/dissimilar/issues/5 + // let a = r#" 🦕🇺🇸👍 "#; + // let b = r#" 🇺🇸👍 "#; + // let actual = get_range_change(a, b); + // assert_eq!( + // actual, + // json!({ + // "span": { + // "start": 1, + // "length": 2, + // }, + // "newLength": 0 + // }) + // ); } #[test] fn test_substring() { assert_eq!(substring("Deno", 1, 3), "en"); assert_eq!(substring("y̆y̆", 2, 4), "y̆"); - // this doesn't work like JavaScript, as 🦕 is treated as a single char in - // Rust, but as two chars in JavaScript. - // assert_eq!(substring("🦕🦕", 2, 4), "🦕"); + assert_eq!(substring("🦕🦕", 2, 4), "🦕"); } #[test] @@ -511,5 +747,6 @@ mod tests { assert_eq!(slice("Deno", 1..=3), "eno"); assert_eq!(slice("Deno Land", 1..), "eno Land"); assert_eq!(slice("Deno", ..3), "Den"); + assert_eq!(slice("Hello 🦕", 6..8), "🦕"); } } |