From 1ce8850f7e561a10487a63580d8f8f5539324ae2 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 30 Jul 2023 04:52:44 +0200 Subject: [PATCH] cleanup bugs --- Cargo.lock | 1 - Cargo.toml | 18 ++- {benches => bench}/Cargo.toml | 3 +- {benches => bench}/src/main.rs | 8 +- matcher/src/config.rs | 6 +- matcher/src/lib.rs | 93 +++++++++---- matcher/src/matrix.rs | 2 + matcher/src/prefilter.rs | 3 + {worker => src}/Cargo.toml | 0 {worker/src => src}/items.rs | 13 +- src/lib.rs | 207 ++++++++++++++++++++++++++++ {worker/src => src}/matcher.rs | 0 {worker/src => src}/query.rs | 82 +++++++---- {worker/src => src}/results.rs | 0 {worker/src => src}/utf32_string.rs | 59 ++++++++ {worker/src => src}/worker.rs | 55 +++++--- worker/src/lib.rs | 131 ------------------ 17 files changed, 463 insertions(+), 218 deletions(-) rename {benches => bench}/Cargo.toml (64%) rename {benches => bench}/src/main.rs (90%) rename {worker => src}/Cargo.toml (100%) rename {worker/src => src}/items.rs (89%) create mode 100644 src/lib.rs rename {worker/src => src}/matcher.rs (100%) rename {worker/src => src}/query.rs (80%) rename {worker/src => src}/results.rs (100%) rename {worker/src => src}/utf32_string.rs (69%) rename {worker/src => src}/worker.rs (72%) delete mode 100644 worker/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 478f6e4..0be607f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,7 +15,6 @@ dependencies = [ "brunch", "fuzzy-matcher", "nucleo", - "nucleo-matcher", "walkdir", ] diff --git a/Cargo.toml b/Cargo.toml index b9a2d3a..af4e3b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,18 @@ +[package] +name = "nucleo" +description = "plug and play high performance fuzzy matcher" +authors = ["Pascal Kuthe "] +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" +repository = "https://github.com/helix-editor/nucleo" + +[lib] + +[dependencies] +nucleo-matcher = { version = "0.1", path = "matcher" } +parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]} +rayon = "1.7.0" + [workspace] -members = [ "matcher", "worker", "benches" ] +members = [ "matcher", "bench" ] diff --git a/benches/Cargo.toml b/bench/Cargo.toml similarity index 64% rename from benches/Cargo.toml rename to bench/Cargo.toml index 89f10b4..3c0c356 100644 --- a/benches/Cargo.toml +++ b/bench/Cargo.toml @@ -6,8 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -nucleo-matcher = { version = "0.1", path = "../matcher" } -nucleo = { version = "0.1", path = "../worker" } +nucleo = { version = "0.1", path = "../" } brunch = "0.5.0" fuzzy-matcher = "0.3.7" walkdir = "2" \ No newline at end of file diff --git a/benches/src/main.rs b/bench/src/main.rs similarity index 90% rename from benches/src/main.rs rename to bench/src/main.rs index 34f83f2..148d353 100644 --- a/benches/src/main.rs +++ b/bench/src/main.rs @@ -4,8 +4,7 @@ use std::process::Command; use brunch::{Bench, Benches}; use fuzzy_matcher::FuzzyMatcher; -use nucleo::Utf32String; -use nucleo_matcher::Utf32Str; +use nucleo::{Utf32Str, Utf32String}; fn bench_dir() -> PathBuf { std::env::var_os("BENCHMARK_DIR") @@ -44,9 +43,8 @@ fn main() { Some((path.as_str().into(), path)) }) .unzip(); - let mut nucleo = - nucleo_matcher::Matcher::new(nucleo_matcher::MatcherConfig::DEFAULT.match_paths()); - let skim = fuzzy_matcher::skim::SkimMatcherV2::default().ignore_case(); + let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths()); + let skim = fuzzy_matcher::skim::SkimMatcherV2::default(); // TODO: unicode? let needles = ["never_matches", "copying", "/doc/kernel", "//.h"]; diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 931b032..38c4ae8 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -11,9 +11,7 @@ pub struct MatcherConfig { /// Extra bonus for word boundary after slash, colon, semi-colon, and comma pub(crate) bonus_boundary_delimiter: u16, pub initial_char_class: CharClass, - /// Whether to normalize latin script characters to ASCII - /// this significantly degrades performance so its not recommended - /// to be turned on by default + /// Whether to normalize latin script characters to ASCII (enabled by default) pub normalize: bool, /// whether to ignore casing pub ignore_case: bool, @@ -26,7 +24,7 @@ impl MatcherConfig { bonus_boundary_white: BONUS_BOUNDARY + 2, bonus_boundary_delimiter: BONUS_BOUNDARY + 1, initial_char_class: CharClass::Whitespace, - normalize: false, + normalize: true, ignore_case: true, } }; diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index dda12b8..6e6efcc 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -125,26 +125,41 @@ impl Matcher { fn fuzzy_matcher_impl( &mut self, - haystack: Utf32Str<'_>, + haystack_: Utf32Str<'_>, needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if needle_.len() > haystack.len() || needle_.is_empty() { + if needle_.len() > haystack_.len() || needle_.is_empty() { return None; } - if needle_.len() == haystack.len() { - return self.exact_match_impl::(haystack, needle_, indices); + if needle_.len() == haystack_.len() { + return self.exact_match_impl::( + haystack_, + needle_, + 0, + haystack_.len(), + indices, + ); } assert!( - haystack.len() <= u32::MAX as usize, + haystack_.len() <= u32::MAX as usize, "fuzzy matching is only support for up to 2^32-1 codepoints" ); - match (haystack, needle_) { + match (haystack_, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { if let &[needle] = needle { return self.substring_match_1_ascii::(haystack, needle, indices); } let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; + if needle_.len() == end - start { + return Some(self.calculate_score::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } self.fuzzy_match_optimal::( AsciiChar::cast(haystack), AsciiChar::cast(needle), @@ -171,6 +186,10 @@ impl Matcher { return Some(res); } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::(haystack_, needle_, start, end, indices); + } self.fuzzy_match_optimal::( haystack, AsciiChar::cast(needle), @@ -188,6 +207,10 @@ impl Matcher { return Some(res); } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::(haystack_, needle_, start, end, indices); + } self.fuzzy_match_optimal::( haystack, needle, @@ -243,7 +266,7 @@ impl Matcher { return None; } if needle_.len() == haystack.len() { - return self.exact_match_impl::(haystack, needle_, indices); + return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } assert!( haystack.len() <= u32::MAX as usize, @@ -252,6 +275,15 @@ impl Matcher { match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; + if needle_.len() == greedy_end - start { + return Some(self.calculate_score::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } self.fuzzy_match_greedy_::( AsciiChar::cast(haystack), AsciiChar::cast(needle), @@ -330,7 +362,7 @@ impl Matcher { return None; } if needle_.len() == haystack.len() { - return self.exact_match_impl::(haystack, needle_, indices); + return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } assert!( haystack.len() <= u32::MAX as usize, @@ -393,7 +425,7 @@ impl Matcher { /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - self.exact_match_impl::(haystack, needle, &mut Vec::new()) + self.exact_match_impl::(haystack, needle, 0, haystack.len(), &mut Vec::new()) } /// Checks whether needle and haystack match exactly and compute the matches indices. @@ -407,7 +439,7 @@ impl Matcher { needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - self.exact_match_impl::(haystack, needle, indices) + self.exact_match_impl::(haystack, needle, 0, haystack.len(), indices) } /// Checks whether needle is a prefix of the haystack. @@ -419,7 +451,7 @@ impl Matcher { if haystack.len() < needle.len() { None } else { - self.exact_match_impl::(haystack.slice(..needle.len()), needle, &mut Vec::new()) + self.exact_match_impl::(haystack, needle, 0, needle.len(), &mut Vec::new()) } } @@ -437,7 +469,7 @@ impl Matcher { if haystack.len() < needle.len() { None } else { - self.exact_match_impl::(haystack.slice(..needle.len()), needle, indices) + self.exact_match_impl::(haystack, needle, 0, needle.len(), indices) } } @@ -451,8 +483,10 @@ impl Matcher { None } else { self.exact_match_impl::( - haystack.slice(haystack.len() - needle.len()..), + haystack, needle, + haystack.len() - needle.len(), + haystack.len(), &mut Vec::new(), ) } @@ -473,8 +507,10 @@ impl Matcher { None } else { self.exact_match_impl::( - haystack.slice(haystack.len() - needle.len()..), + haystack, needle, + haystack.len() - needle.len(), + haystack.len(), indices, ) } @@ -484,9 +520,11 @@ impl Matcher { &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, + start: usize, + end: usize, indices: &mut Vec, ) -> Option { - if needle_.len() != haystack.len() || needle_.is_empty() { + if needle_.len() != end - start || needle_.is_empty() { return None; } assert!( @@ -496,7 +534,7 @@ impl Matcher { let score = match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { let matched = if self.config.ignore_case { - AsciiChar::cast(haystack) + AsciiChar::cast(haystack)[start..end] .iter() .map(|c| c.normalize(&self.config)) .eq(AsciiChar::cast(needle) @@ -511,8 +549,8 @@ impl Matcher { self.calculate_score::( AsciiChar::cast(haystack), AsciiChar::cast(needle), - 0, - haystack.len(), + start, + end, indices, ) } @@ -522,13 +560,12 @@ impl Matcher { return None; } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - let matched = - haystack + let matched = haystack[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) .iter() - .map(|c| c.normalize(&self.config)) - .eq(AsciiChar::cast(needle) - .iter() - .map(|c| c.normalize(&self.config))); + .map(|c| c.normalize(&self.config))); if !matched { return None; } @@ -536,20 +573,20 @@ impl Matcher { self.calculate_score::( haystack, AsciiChar::cast(needle), - 0, - haystack.len(), + start, + end, indices, ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - let matched = haystack + let matched = haystack[start..end] .iter() .map(|c| c.normalize(&self.config)) .eq(needle.iter().map(|c| c.normalize(&self.config))); if !matched { return None; } - self.calculate_score::(haystack, needle, 0, haystack.len(), indices) + self.calculate_score::(haystack, needle, start, end, indices) } }; Some(score) diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs index 412f1bf..99ff3bc 100644 --- a/matcher/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -25,6 +25,8 @@ struct MatrixLayout { } impl MatrixLayout { fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout { + assert!(haystack_len >= needle_len); + assert!(haystack_len <= u32::MAX as usize); let mut layout = Layout::from_size_align(0, 1).unwrap(); let haystack_layout = Layout::array::(haystack_len).unwrap(); let bonus_layout = Layout::array::(haystack_len).unwrap(); diff --git a/matcher/src/prefilter.rs b/matcher/src/prefilter.rs index 92fbb7a..7e4de94 100644 --- a/matcher/src/prefilter.rs +++ b/matcher/src/prefilter.rs @@ -85,6 +85,9 @@ impl Matcher { .iter() .rev() .position(|c| c.normalize(&self.config) == needle_char)?; + if end - start < needle.len() { + return None; + } Some((start, end)) } diff --git a/worker/Cargo.toml b/src/Cargo.toml similarity index 100% rename from worker/Cargo.toml rename to src/Cargo.toml diff --git a/worker/src/items.rs b/src/items.rs similarity index 89% rename from worker/src/items.rs rename to src/items.rs index 82c9853..7b819d9 100644 --- a/worker/src/items.rs +++ b/src/items.rs @@ -89,9 +89,16 @@ pub(crate) struct ItemsSnapshot { } impl ItemsSnapshot { - pub(crate) fn new() -> Self { + pub(crate) fn new(items: &ItemCache) -> Self { Self { - items: Vec::with_capacity(1024), + items: items + .live + .iter() + .map(|item| ItemSnapshot { + cols: item.cols, + len: item.cols().iter().map(|s| s.len() as u32).sum(), + }) + .collect(), } } @@ -104,7 +111,7 @@ impl ItemsSnapshot { } pub(crate) fn update(&mut self, items: &ItemCache) -> bool { - let cleared = items.evicted.is_empty(); + let cleared = !items.evicted.is_empty(); // drop in another thread to ensure we don't wait for a long drop here if cleared { self.items.clear(); diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..f0f0d72 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,207 @@ +use std::cmp::Reverse; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::Arc; +use std::time::Duration; + +use crate::items::{Item, ItemCache}; +use crate::worker::Worker; +use parking_lot::lock_api::ArcMutexGuard; +use rayon::ThreadPool; + +pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind}; +pub use crate::utf32_string::Utf32String; + +mod items; +mod query; +mod utf32_string; +mod worker; +pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; + +use parking_lot::{Mutex, MutexGuard, RawMutex}; + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub struct Match { + pub score: u32, + pub idx: u32, +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub struct Status { + pub changed: bool, + pub running: bool, +} + +#[derive(Clone)] +pub struct Items { + cache: Arc>, + items: Arc>>, + notify: Arc<(dyn Fn() + Sync + Send)>, +} + +impl Items { + pub fn clear(&mut self) { + self.items.lock().clear(); + self.cache.lock().clear(); + } + + pub fn append(&mut self, items: impl Iterator)>) { + let mut cache = self.cache.lock(); + let mut items_ = self.items.lock(); + items_.extend(items.map(|(item, text)| { + cache.push(text); + item + })); + // notify that a new tick will be necessary + (self.notify)(); + } + + pub fn get(&self) -> impl Deref + '_ { + MutexGuard::map(self.items.lock(), |items| items.as_mut_slice()) + } + + pub fn get_matcher_items(&self) -> impl Deref + '_ { + MutexGuard::map(self.cache.lock(), |items| items.get()) + } +} + +pub struct Nucleo { + // the way the API is build we totally don't actually neeed these to be Arcs + // but this lets us avoid some unsafe + worker: Arc>, + canceled: Arc, + pool: ThreadPool, + pub items: Items, + pub matches: Vec, + pub pattern: MultiPattern, + should_notify: Arc, +} + +impl Nucleo { + pub fn new( + config: MatcherConfig, + notify: Arc<(dyn Fn() + Sync + Send)>, + num_threads: Option, + case_matching: CaseMatching, + cols: usize, + items: impl Iterator)>, + ) -> Self { + let mut cache = ItemCache::new(); + let items: Vec<_> = items + .map(|(item, text)| { + cache.push(text); + item + }) + .collect(); + let matches: Vec<_> = (0..items.len()) + .map(|i| Match { + score: 0, + idx: i as u32, + }) + .collect(); + let (pool, worker) = + Worker::new(notify.clone(), num_threads, config, matches.clone(), &cache); + Self { + canceled: worker.canceled.clone(), + should_notify: worker.should_notify.clone(), + items: Items { + cache: Arc::new(Mutex::new(cache)), + items: Arc::new(Mutex::new(items)), + notify, + }, + pool, + matches, + pattern: MultiPattern::new(&config, case_matching, cols), + worker: Arc::new(Mutex::new(worker)), + } + } + + pub fn update_config(&mut self, config: MatcherConfig) { + self.worker.lock().update_config(config) + } + + pub fn tick(&mut self, timeout: u64) -> Status { + self.should_notify.store(false, atomic::Ordering::Relaxed); + let status = self.pattern.status(); + let items = self.items.cache.lock_arc(); + let canceled = status != query::Status::Unchanged || items.cleared(); + let res = self.tick_inner(timeout, canceled, items, status); + if !canceled { + self.should_notify.store(true, atomic::Ordering::Relaxed); + return res; + } + let items = self.items.cache.lock_arc(); + let res = self.tick_inner(timeout, false, items, query::Status::Unchanged); + self.should_notify.store(true, atomic::Ordering::Relaxed); + res + } + + fn tick_inner( + &mut self, + timeout: u64, + canceled: bool, + items: ArcMutexGuard, + status: query::Status, + ) -> Status { + let mut inner = if canceled { + self.pattern.reset_status(); + self.canceled.store(true, atomic::Ordering::Relaxed); + self.worker.lock_arc() + } else { + let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else { + return Status{ changed: false, running: true }; + }; + worker + }; + + let changed = inner.running; + if inner.running { + inner.running = false; + self.matches.clone_from(&inner.matches); + } + + let running = canceled || inner.items.outdated(&items); + if running { + inner.pattern.clone_from(&self.pattern); + self.canceled.store(false, atomic::Ordering::Relaxed); + self.pool.spawn(move || unsafe { inner.run(items, status) }) + } + Status { changed, running } + } +} + +impl Drop for Nucleo { + fn drop(&mut self) { + // we ensure the worker quits before dropping items to ensure that + // the worker can always assume the items outlife it + self.canceled.store(true, atomic::Ordering::Relaxed); + let lock = self.worker.try_lock_for(Duration::from_secs(1)); + if lock.is_none() { + unreachable!("thread pool failed to shutdown properly") + } + } +} +/// convenicne function to easily fuzzy match +/// on a (relatievly small list of inputs). This is not recommended for building a full tui +/// application that can match large numbers of matches as all matching is done on the current +/// thread, effectively blocking the UI +pub fn fuzzy_match>( + matcher: &mut Matcher, + pattern: &str, + items: impl IntoIterator, + case_matching: CaseMatching, +) -> Vec<(T, u32)> { + let mut pattern_ = Pattern::new(&matcher.config, case_matching); + pattern_.set_literal(pattern, PatternKind::Fuzzy, false); + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + pattern_ + .score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(item, score)| (Reverse(*score), item.as_ref().len())); + items +} diff --git a/worker/src/matcher.rs b/src/matcher.rs similarity index 100% rename from worker/src/matcher.rs rename to src/matcher.rs diff --git a/worker/src/query.rs b/src/query.rs similarity index 80% rename from worker/src/query.rs rename to src/query.rs index 474540c..1563ba5 100644 --- a/worker/src/query.rs +++ b/src/query.rs @@ -54,7 +54,9 @@ impl PatternAtom { match case { CaseMatching::Ignore => needle.make_ascii_lowercase(), - CaseMatching::Smart => ignore_case = needle.bytes().any(|b| b.is_ascii_uppercase()), + CaseMatching::Smart => { + ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) + } CaseMatching::Respect => (), } @@ -80,7 +82,7 @@ impl PatternAtom { match case { CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Smart => { - ignore_case = ignore_case || c.is_uppercase(); + ignore_case = ignore_case && !c.is_uppercase(); } CaseMatching::Respect => (), } @@ -149,22 +151,18 @@ pub enum Status { } #[derive(Debug, Clone)] -pub struct Query { +pub struct MultiPattern { pub cols: Vec, } -impl Query { - pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching, cols: usize) -> Query { - Query { - cols: vec![ - Pattern { - terms: Vec::new(), - case_matching, - normalize: matcher_config.normalize, - status: Status::Unchanged, - }; - cols - ], +impl MultiPattern { + pub fn new( + matcher_config: &MatcherConfig, + case_matching: CaseMatching, + cols: usize, + ) -> MultiPattern { + MultiPattern { + cols: vec![Pattern::new(matcher_config, case_matching); cols], } } @@ -201,7 +199,30 @@ pub struct Pattern { } impl Pattern { - pub(crate) fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern { + Pattern { + terms: Vec::new(), + case_matching, + normalize: matcher_config.normalize, + status: Status::Unchanged, + } + } + pub fn new_fuzzy_literal( + matcher_config: &MatcherConfig, + case_matching: CaseMatching, + pattern: &str, + ) -> Pattern { + let mut res = Pattern { + terms: Vec::new(), + case_matching, + normalize: matcher_config.normalize, + status: Status::Unchanged, + }; + res.set_literal(pattern, PatternKind::Fuzzy, false); + res + } + + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { if self.terms.is_empty() { return Some(0); } @@ -215,7 +236,7 @@ impl Pattern { matcher.substring_match(haystack, pattern.needle.slice(..)) } PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), - PatternKind::Postfix => matcher.prefix_match(haystack, pattern.needle.slice(..)), + PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)), }; if pattern.invert { if pattern_score.is_some() { @@ -249,7 +270,7 @@ impl Pattern { } PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), PatternKind::Postfix => { - matcher.prefix_match(haystack, pattern.needle.slice(..)) + matcher.postfix_match(haystack, pattern.needle.slice(..)) } }; if pattern_score.is_some() { @@ -262,16 +283,16 @@ impl Pattern { matcher.exact_indices(haystack, pattern.needle.slice(..), indices) } PatternKind::Fuzzy => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices) } PatternKind::Substring => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + matcher.substring_indices(haystack, pattern.needle.slice(..), indices) } PatternKind::Prefix => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + matcher.prefix_indices(haystack, pattern.needle.slice(..), indices) } PatternKind::Postfix => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + matcher.postfix_indices(haystack, pattern.needle.slice(..), indices) } }; score += pattern_score? as u32 @@ -282,10 +303,15 @@ impl Pattern { pub fn parse_from(&mut self, pattern: &str, append: bool) { self.terms.clear(); let invert = self.terms.last().map_or(false, |pat| pat.invert); - for atom in pattern_atoms(pattern) { - self.terms - .push(PatternAtom::parse(atom, self.normalize, self.case_matching)); - } + let atoms = pattern_atoms(pattern).filter_map(|atom| { + let atom = PatternAtom::parse(atom, self.normalize, self.case_matching); + if atom.needle.is_empty() { + return None; + } + Some(atom) + }); + self.terms.extend(atoms); + self.status = if append && !invert && self.status != Status::Rescore { Status::Update } else { @@ -304,6 +330,10 @@ impl Pattern { Status::Rescore }; } + + pub fn is_empty(&self) -> bool { + self.terms.is_empty() + } } fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { diff --git a/worker/src/results.rs b/src/results.rs similarity index 100% rename from worker/src/results.rs rename to src/results.rs diff --git a/worker/src/utf32_string.rs b/src/utf32_string.rs similarity index 69% rename from worker/src/utf32_string.rs rename to src/utf32_string.rs index 1a2604a..734e428 100644 --- a/worker/src/utf32_string.rs +++ b/src/utf32_string.rs @@ -1,5 +1,7 @@ use core::slice; +use std::borrow::Cow; use std::fmt; +use std::mem::take; use std::ops::{Bound, RangeBounds}; use nucleo_matcher::Utf32Str; @@ -12,6 +14,12 @@ pub enum Utf32String { /// A string represented as an array of unicode codepoints (basically UTF-32). Unicode(Box<[char]>), } + +impl Default for Utf32String { + fn default() -> Self { + Self::Ascii(String::new().into_boxed_str()) + } +} impl Utf32String { #[inline] pub fn len(&self) -> usize { @@ -48,31 +56,69 @@ impl Utf32String { } } + #[inline] pub fn is_ascii(&self) -> bool { matches!(self, Utf32String::Ascii(_)) } + #[inline] pub fn get(&self, idx: u32) -> char { match self { Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, Utf32String::Unicode(codepoints) => codepoints[idx as usize], } } + + #[inline] pub fn last(&self) -> char { match self { Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], } } + + #[inline] pub fn chars(&self) -> Chars<'_> { match self { Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), } } + + #[inline] + pub fn push_str(&mut self, text: &str) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if text.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push_str(text); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.chars().collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.extend(text.chars()); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } + #[inline] + pub fn push(&mut self, c: char) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if c.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push(c); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.chars().collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.push(c); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } } impl From<&str> for Utf32String { + #[inline] fn from(value: &str) -> Self { if value.is_ascii() { Self::Ascii(value.to_owned().into_boxed_str()) @@ -91,12 +137,24 @@ impl From> for Utf32String { } } } + impl From for Utf32String { + #[inline] fn from(value: String) -> Self { value.into_boxed_str().into() } } +impl<'a> From> for Utf32String { + #[inline] + fn from(value: Cow<'a, str>) -> Self { + match value { + Cow::Borrowed(value) => value.into(), + Cow::Owned(value) => value.into(), + } + } +} + pub enum Chars<'a> { Ascii(slice::Iter<'a, u8>), Unicode(slice::Iter<'a, char>), @@ -104,6 +162,7 @@ pub enum Chars<'a> { impl<'a> Iterator for Chars<'a> { type Item = char; + #[inline] fn next(&mut self) -> Option { match self { Chars::Ascii(iter) => iter.next().map(|&c| c as char), diff --git a/worker/src/worker.rs b/src/worker.rs similarity index 72% rename from worker/src/worker.rs rename to src/worker.rs index 500b001..9165080 100644 --- a/worker/src/worker.rs +++ b/src/worker.rs @@ -8,7 +8,7 @@ use parking_lot::RawMutex; use rayon::{prelude::*, ThreadPool}; use crate::items::{ItemCache, ItemsSnapshot}; -use crate::query::{self, Query}; +use crate::query::{self, MultiPattern}; use crate::Match; struct Matchers(Box<[UnsafeCell]>); @@ -30,15 +30,24 @@ pub(crate) struct Worker { pub(crate) items: ItemsSnapshot, matchers: Matchers, pub(crate) matches: Vec, - pub(crate) query: Query, + pub(crate) pattern: MultiPattern, pub(crate) canceled: Arc, + pub(crate) should_notify: Arc, } impl Worker { + pub(crate) fn update_config(&mut self, config: MatcherConfig) { + for matcher in self.matchers.0.iter_mut() { + matcher.get_mut().config = config; + } + } + pub(crate) fn new( notify: Arc<(dyn Fn() + Sync + Send)>, worker_threads: Option, config: MatcherConfig, + matches: Vec, + items: &ItemCache, ) -> (ThreadPool, Worker) { let worker_threads = worker_threads .unwrap_or_else(|| std::thread::available_parallelism().map_or(4, |it| it.get())); @@ -53,15 +62,17 @@ impl Worker { let worker = Worker { notify, running: false, - items: ItemsSnapshot::new(), + items: ItemsSnapshot::new(items), matchers: Matchers(matchers), - matches: Vec::with_capacity(1024), + matches, // just a placeholder - query: Query::new(&config, crate::CaseMatching::Ignore, 0), + pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0), canceled: Arc::new(AtomicBool::new(false)), + should_notify: Arc::new(AtomicBool::new(false)), }; (pool, worker) } + pub(crate) unsafe fn run( &mut self, items_lock: ArcMutexGuard, @@ -77,48 +88,56 @@ impl Worker { self.matches.clear(); last_scored_item = 0; } - let matchers = &self.matchers; - let query = &self.query; + let pattern = &self.pattern; let items = unsafe { self.items.get() }; + if self.pattern.cols.iter().all(|pat| pat.is_empty()) { + self.matches.clear(); + self.matches.extend((0..items.len()).map(|i| Match { + score: 0, + idx: i as u32, + })); + if self.should_notify.load(atomic::Ordering::Relaxed) { + (self.notify)(); + } + return; + } if query_status != query::Status::Unchanged && !self.matches.is_empty() { self.matches .par_iter_mut() - .take_any_while(|_| self.canceled.load(atomic::Ordering::Relaxed)) + .take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed)) .for_each(|match_| { let item = &items[match_.idx as usize]; - match_.score = query + match_.score = pattern .score(item.cols(), unsafe { matchers.get() }) .unwrap_or(u32::MAX); }); // TODO: do this in parallel? - self.matches.retain(|m| m.score != u32::MAX) + self.matches.retain(|m| m.score != u32::MAX); } - if last_scored_item != self.items.len() { - self.running = true; let items = items[last_scored_item..] .par_iter() .enumerate() .filter_map(|(i, item)| { let score = if self.canceled.load(atomic::Ordering::Relaxed) { - 0 + u32::MAX - 1 } else { - query.score(item.cols(), unsafe { matchers.get() })? + pattern.score(item.cols(), unsafe { matchers.get() })? }; Some(Match { score, idx: i as u32, }) }); - self.matches.par_extend(items) + self.matches.par_extend(items); } if !self.canceled.load(atomic::Ordering::Relaxed) { // TODO: cancel sort in progess? self.matches.par_sort_unstable_by(|match1, match2| { - match2.idx.cmp(&match1.idx).then_with(|| { + match2.score.cmp(&match1.score).then_with(|| { // the tie breaker is comparitevly rarely needed so we keep it // in a branch especially beacuse we need to acceess the items // array here which invovles some pointer chasing @@ -129,6 +148,8 @@ impl Worker { }); } - (self.notify)(); + if self.should_notify.load(atomic::Ordering::Relaxed) { + (self.notify)(); + } } } diff --git a/worker/src/lib.rs b/worker/src/lib.rs deleted file mode 100644 index 43f4123..0000000 --- a/worker/src/lib.rs +++ /dev/null @@ -1,131 +0,0 @@ -use std::ops::Deref; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::Arc; -use std::time::Duration; - -use crate::items::{Item, ItemCache}; -use crate::worker::Worker; -use rayon::ThreadPool; - -pub use crate::query::{CaseMatching, Pattern, PatternKind, Query}; -pub use crate::utf32_string::Utf32String; - -mod items; -mod query; -mod utf32_string; -mod worker; -pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; - -use parking_lot::{Mutex, MutexGuard}; - -#[derive(PartialEq, Eq, Debug, Clone, Copy)] -pub struct Match { - pub score: u32, - pub idx: u32, -} - -#[derive(Clone)] -pub struct Items { - cache: Arc>, - items: Arc>>, - notify: Arc<(dyn Fn() + Sync + Send)>, -} - -impl Items { - pub fn clear(&mut self) { - self.items.lock().clear(); - self.cache.lock().clear(); - } - - pub fn append(&mut self, items: impl Iterator)>) { - let mut cache = self.cache.lock(); - let mut items_ = self.items.lock(); - items_.extend(items.map(|(item, text)| { - cache.push(text); - item - })); - // notify that a new tick will be necessary - (self.notify)(); - } - - pub fn get(&self) -> impl Deref + '_ { - MutexGuard::map(self.items.lock(), |items| items.as_mut_slice()) - } - - pub fn get_matcher_items(&self) -> impl Deref + '_ { - MutexGuard::map(self.cache.lock(), |items| items.get()) - } -} - -pub struct Nucleo { - // the way the API is build we totally don't actually neeed these to be Arcs - // but this lets us avoid some unsafe - worker: Arc>, - canceled: Arc, - pool: ThreadPool, - pub items: Items, - pub matches: Vec, - pub query: Query, -} - -impl Nucleo { - pub fn new( - config: MatcherConfig, - notify: Arc<(dyn Fn() + Sync + Send)>, - num_threads: Option, - case_matching: CaseMatching, - cols: usize, - ) -> Self { - let (pool, worker) = Worker::new(notify.clone(), num_threads, config); - Self { - canceled: worker.canceled.clone(), - items: Items { - cache: Arc::new(Mutex::new(ItemCache::new())), - items: Arc::new(Mutex::new(Vec::with_capacity(1024))), - notify, - }, - pool, - matches: Vec::with_capacity(1024), - query: Query::new(&config, case_matching, cols), - worker: Arc::new(Mutex::new(worker)), - } - } - - pub fn tick(&mut self, timeout: u64) -> bool { - let status = self.query.status(); - let items = self.items.cache.lock_arc(); - let canceled = status != query::Status::Unchanged || items.cleared(); - let mut inner = if canceled { - self.query.reset_status(); - self.canceled.store(true, atomic::Ordering::Relaxed); - self.worker.lock_arc() - } else { - let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else { - return true; - }; - worker - }; - - if inner.running { - inner.running = false; - self.matches.clone_from(&inner.matches); - } else if !canceled { - // nothing has changed - return false; - } - - if canceled || inner.items.outdated(&items) { - self.pool.spawn(move || unsafe { inner.run(items, status) }) - } - true - } -} - -impl Drop for Nucleo { - fn drop(&mut self) { - // we ensure the worker quits before dropping items to ensure that - // the worker can always assume the items outlife it - self.canceled.store(true, atomic::Ordering::Relaxed); - drop(self.worker.lock()); - } -}