From 648dec1ceb3f276b4507cce3a016f55d21cf07a1 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 17:29:17 +0200 Subject: [PATCH] move Utf32String to nucleo-matcher --- matcher/src/lib.rs | 2 +- matcher/src/utf32_str.rs | 176 +++++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 +- 3 files changed, 178 insertions(+), 4 deletions(-) diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 6aea293..efae388 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -27,7 +27,7 @@ mod utf32_str; mod tests; pub use crate::config::MatcherConfig; -pub use crate::utf32_str::Utf32Str; +pub use crate::utf32_str::{Utf32Str, Utf32String}; use crate::chars::{AsciiChar, Char}; use crate::matrix::MatrixSlab; diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index fe4f44e..9602b27 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,6 +1,10 @@ +use std::borrow::Cow; +use std::mem::take; use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; +use crate::chars; + /// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. /// /// Usually rusts' utf8 encoded strings are great. However during fuzzy matching @@ -209,3 +213,175 @@ impl DoubleEndedIterator for Chars<'_> { } } } + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +pub enum Utf32String { + /// A string represented as ASCII encoded bytes. + /// Correctness invariant: must only contain valid ASCII (<=127) + Ascii(Box), + /// A string represented as an array of unicode codepoints (basically UTF-32). + Unicode(Box<[char]>), +} + +impl Default for Utf32String { + fn default() -> Self { + Self::Ascii(String::new().into_boxed_str()) + } +} + +impl Utf32String { + #[inline] + pub fn len(&self) -> usize { + match self { + Utf32String::Unicode(codepoints) => codepoints.len(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Utf32String::Unicode(codepoints) => codepoints.is_empty(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } + + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher + #[inline] + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start as usize, + Bound::Excluded(&start) => start as usize + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, + Bound::Unbounded => self.len(), + }; + match self { + Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]), + Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + #[inline] + pub fn is_ascii(&self) -> bool { + matches!(self, Utf32String::Ascii(_)) + } + + #[inline] + pub fn get(&self, idx: u32) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, + Utf32String::Unicode(codepoints) => codepoints[idx as usize], + } + } + + #[inline] + pub fn last(&self) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, + Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], + } + } + + #[inline] + pub fn chars(&self) -> Chars<'_> { + match self { + Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), + Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), + } + } + + #[inline] + pub fn push_str(&mut self, text: &str) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if text.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push_str(text); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.extend(chars::graphemes(text)); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } + + #[inline] + pub fn push(&mut self, c: char) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if c.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push(c); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.push(c); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } +} + +impl From<&str> for Utf32String { + #[inline] + fn from(value: &str) -> Self { + if value.is_ascii() { + Self::Ascii(value.to_owned().into_boxed_str()) + } else { + Self::Unicode(chars::graphemes(value).collect()) + } + } +} + +impl From> for Utf32String { + fn from(value: Box) -> Self { + if value.is_ascii() { + Self::Ascii(value) + } else { + Self::Unicode(chars::graphemes(&value).collect()) + } + } +} + +impl From for Utf32String { + #[inline] + fn from(value: String) -> Self { + value.into_boxed_str().into() + } +} + +impl<'a> From> for Utf32String { + #[inline] + fn from(value: Cow<'a, str>) -> Self { + match value { + Cow::Borrowed(value) => value.into(), + Cow::Owned(value) => value.into(), + } + } +} + +impl fmt::Debug for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "\"")?; + for c in self.chars() { + for c in c.escape_debug() { + write!(f, "{c}")? + } + } + write!(f, "\"") + } +} + +impl fmt::Display for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for c in self.chars() { + write!(f, "{c}")? + } + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 7331089..cb87352 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,14 +8,12 @@ use parking_lot::Mutex; use rayon::ThreadPool; pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; -pub use crate::utf32_string::Utf32String; use crate::worker::Worker; -pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; +pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str, Utf32String}; mod boxcar; mod par_sort; mod pattern; -mod utf32_string; mod worker; pub struct Item<'a, T> {