diff --git a/Cargo.lock b/Cargo.lock
index 0be607f..24abffc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -165,6 +165,7 @@ version = "0.1.0"
 dependencies = [
  "cov-mark",
  "memchr",
+ "unicode-segmentation",
 ]
 
 [[package]]
@@ -277,6 +278,12 @@ dependencies = [
  "once_cell",
 ]
 
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
+
 [[package]]
 name = "unicode-width"
 version = "0.1.10"
diff --git a/matcher/Cargo.toml b/matcher/Cargo.toml
index c609a21..bdcfb8e 100644
--- a/matcher/Cargo.toml
+++ b/matcher/Cargo.toml
@@ -9,6 +9,7 @@ edition = "2021"
 [dependencies]
 memchr = "2.5.0"
 cov-mark = { version = "1.1.0", default-features = false }
+unicode-segmentation = "1.10"
 
 [dev-dependencies]
 cov-mark = { version = "1.1.0", default-features = true }
diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs
index 378ab90..4d61777 100644
--- a/matcher/src/chars.rs
+++ b/matcher/src/chars.rs
@@ -135,6 +135,7 @@ impl Char for char {
 }
 
 pub use normalize::normalize;
+use unicode_segmentation::UnicodeSegmentation;
 
 #[inline(always)]
 pub fn to_lower_case(c: char) -> char {
@@ -154,3 +155,14 @@ pub enum CharClass {
     Letter,
     Number,
 }
+
+/// nucleo can not match graphemes as single units to work around
+/// that we only use the first codepoint of each grapheme
+pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
+    text.graphemes(true).map(|grapheme| {
+        grapheme
+            .chars()
+            .next()
+            .expect("graphemes must be non-empty")
+    })
+}
diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs
index 70945be..cfd73db 100644
--- a/matcher/src/utf32_str.rs
+++ b/matcher/src/utf32_str.rs
@@ -43,7 +43,10 @@ impl<'a> Utf32Str<'a> {
             Utf32Str::Ascii(str.as_bytes())
         } else {
             buf.clear();
-            buf.extend(str.chars());
+            buf.extend(crate::chars::graphemes(str));
+            if buf.iter().all(|c| c.is_ascii()) {
+                return Utf32Str::Ascii(str.as_bytes());
+            }
             Utf32Str::Unicode(&*buf)
         }
     }
diff --git a/src/lib.rs b/src/lib.rs
index ff9ee33..50b2569 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,11 +9,11 @@ use crate::worker::Worker;
 use parking_lot::lock_api::ArcMutexGuard;
 use rayon::ThreadPool;
 
-pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
+pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
 pub use crate::utf32_string::Utf32String;
 
 mod items;
-mod query;
+mod pattern;
 mod utf32_string;
 mod worker;
 pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
@@ -124,14 +124,14 @@ impl Nucleo {
         self.should_notify.store(false, atomic::Ordering::Relaxed);
         let status = self.pattern.status();
         let items = self.items.cache.lock_arc();
-        let canceled = status != query::Status::Unchanged || items.cleared();
+        let canceled = status != pattern::Status::Unchanged || items.cleared();
         let res = self.tick_inner(timeout, canceled, items, status);
         if !canceled {
             self.should_notify.store(true, atomic::Ordering::Relaxed);
             return res;
         }
         let items = self.items.cache.lock_arc();
-        let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
+        let res = self.tick_inner(timeout, false, items, pattern::Status::Unchanged);
         self.should_notify.store(true, atomic::Ordering::Relaxed);
         res
     }
@@ -141,7 +141,7 @@ impl Nucleo {
         timeout: u64,
         canceled: bool,
         items: ArcMutexGuard<RawMutex, ItemCache>,
-        status: query::Status,
+        status: pattern::Status,
     ) -> Status {
         let mut inner = if canceled {
             self.pattern.reset_status();
diff --git a/src/query.rs b/src/pattern.rs
similarity index 94%
rename from src/query.rs
rename to src/pattern.rs
index 1563ba5..ab2cf17 100644
--- a/src/query.rs
+++ b/src/pattern.rs
@@ -65,7 +65,7 @@ impl PatternAtom {
             let mut needle_ = Vec::with_capacity(needle.len());
             if escape_whitespace {
                 let mut saw_backslash = false;
-                for mut c in needle.chars() {
+                for mut c in chars::graphemes(needle) {
                     if saw_backslash {
                         if c == ' ' {
                             needle_.push(' ');
@@ -88,6 +88,21 @@ impl PatternAtom {
                     }
                     needle_.push(c);
                 }
+            } else {
+                let chars = chars::graphemes(needle).map(|mut c| {
+                    if normalize {
+                        c = chars::normalize(c);
+                    }
+                    match case {
+                        CaseMatching::Ignore => c = chars::to_lower_case(c),
+                        CaseMatching::Smart => {
+                            ignore_case = ignore_case && !c.is_uppercase();
+                        }
+                        CaseMatching::Respect => (),
+                    }
+                    c
+                });
+                needle_.extend(chars);
             };
             Utf32String::Unicode(needle_.into_boxed_slice())
         };
diff --git a/src/utf32_string.rs b/src/utf32_string.rs
index 734e428..d7e9935 100644
--- a/src/utf32_string.rs
+++ b/src/utf32_string.rs
@@ -4,7 +4,7 @@ use std::fmt;
 use std::mem::take;
 use std::ops::{Bound, RangeBounds};
 
-use nucleo_matcher::Utf32Str;
+use nucleo_matcher::{chars, Utf32Str};
 
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 pub enum Utf32String {
@@ -94,12 +94,13 @@ impl Utf32String {
                 *self = Self::Ascii(bytes.into_boxed_str());
                 return;
             }
-            Utf32String::Ascii(bytes) => bytes.chars().collect(),
+            Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
             Utf32String::Unicode(codepoints) => Vec::from(codepoints),
         };
-        codeboints.extend(text.chars());
+        codeboints.extend(chars::graphemes(text));
         *self = Utf32String::Unicode(codeboints.into_boxed_slice());
     }
+
     #[inline]
     pub fn push(&mut self, c: char) {
         let mut codeboints = match take(self) {
@@ -109,7 +110,7 @@ impl Utf32String {
                 *self = Self::Ascii(bytes.into_boxed_str());
                 return;
             }
-            Utf32String::Ascii(bytes) => bytes.chars().collect(),
+            Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
             Utf32String::Unicode(codepoints) => Vec::from(codepoints),
         };
         codeboints.push(c);
@@ -123,7 +124,7 @@ impl From<&str> for Utf32String {
         if value.is_ascii() {
             Self::Ascii(value.to_owned().into_boxed_str())
         } else {
-            Self::Unicode(value.chars().collect())
+            Self::Unicode(chars::graphemes(value).collect())
         }
     }
 }
@@ -133,7 +134,7 @@ impl From<Box<str>> for Utf32String {
         if value.is_ascii() {
             Self::Ascii(value)
         } else {
-            Self::Unicode(value.chars().collect())
+            Self::Unicode(chars::graphemes(&value).collect())
         }
     }
 }
diff --git a/src/worker.rs b/src/worker.rs
index af0c3ab..d7babdc 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -8,7 +8,7 @@ use parking_lot::RawMutex;
 use rayon::{prelude::*, ThreadPool};
 
 use crate::items::{ItemCache, ItemsSnapshot};
-use crate::query::{self, MultiPattern};
+use crate::pattern::{self, MultiPattern};
 use crate::Match;
 
 struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
@@ -76,7 +76,7 @@ impl Worker {
     pub(crate) unsafe fn run(
         &mut self,
         items_lock: ArcMutexGuard<RawMutex, ItemCache>,
-        query_status: query::Status,
+        query_status: pattern::Status,
     ) {
         self.running = true;
         let mut last_scored_item = self.items.len();
@@ -84,7 +84,7 @@ impl Worker {
         drop(items_lock);
 
         // TODO: be smarter around reusing past results for rescoring
-        if cleared || query_status == query::Status::Rescore {
+        if cleared || query_status == pattern::Status::Rescore {
             self.matches.clear();
             last_scored_item = 0;
         }
@@ -103,7 +103,7 @@ impl Worker {
             }
             return;
         }
-        if query_status != query::Status::Unchanged && !self.matches.is_empty() {
+        if query_status != pattern::Status::Unchanged && !self.matches.is_empty() {
             self.matches
                 .par_iter_mut()
                 .take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))