From 6b8ee0f5850d71b1620c274c7903ccb7ea245b13 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sat, 29 Jul 2023 21:43:02 +0200 Subject: [PATCH] progress on high level API --- matcher/src/chars.rs | 12 +-- worker/Cargo.toml | 5 +- worker/src/items.rs | 33 +++++- worker/src/lib.rs | 150 +++++++-------------------- worker/src/query.rs | 241 ++++++++++++++++++++++++++++++++++++++++++- worker/src/worker.rs | 134 ++++++++++++++++++++++++ 6 files changed, 448 insertions(+), 127 deletions(-) create mode 100644 worker/src/worker.rs diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index a26ef93..378ab90 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -138,15 +138,9 @@ pub use normalize::normalize; #[inline(always)] pub fn to_lower_case(c: char) -> char { - if c >= 'A' && c <= 'Z' { - char::from_u32(c as u32 + 32).unwrap() - } else if !c.is_ascii() { - CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |(upper, _)| *upper) - .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) - } else { - c - } + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] diff --git a/worker/Cargo.toml b/worker/Cargo.toml index 483c133..ed1429c 100644 --- a/worker/Cargo.toml +++ b/worker/Cargo.toml @@ -1,10 +1,11 @@ [package] name = "nucleo" +description = "plug and play high performance fuzzy matcher" authors = ["Pascal Kuthe "] version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +license = "MPL-2.0" +repository = "https://github.com/helix-editor/nucleo" [dependencies] nucleo-matcher = { version = "0.1", path = "../matcher" } diff --git a/worker/src/items.rs b/worker/src/items.rs index 27ec6a1..82c9853 100644 --- a/worker/src/items.rs +++ b/worker/src/items.rs @@ -8,13 +8,22 @@ pub(crate) struct ItemCache { evicted: Vec, } impl ItemCache { + pub(crate) fn new() -> Self { 
+ Self { + live: Vec::with_capacity(1024), + evicted: Vec::new(), + } + } + pub(crate) fn clear(&mut self) { if self.evicted.is_empty() { + self.evicted.reserve(1024); swap(&mut self.evicted, &mut self.live) } else { self.evicted.append(&mut self.live) } } + pub(crate) fn cleared(&self) -> bool { !self.evicted.is_empty() } @@ -24,19 +33,31 @@ impl ItemCache { cols: Box::leak(item).into(), }) } + + pub(crate) fn get(&mut self) -> &mut [Item] { + &mut self.live + } } -#[derive(PartialEq, Eq, Debug, Clone)] -pub(crate) struct Item { +#[derive(PartialEq, Eq, Clone)] +pub struct Item { // TODO: small vec optimization?? cols: NonNull<[Utf32String]>, } +impl std::fmt::Debug for Item { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ItemText") + .field("cols", &self.cols()) + .finish() + } +} + unsafe impl Send for Item {} unsafe impl Sync for Item {} impl Item { - fn cols(&self) -> &[Utf32String] { + pub fn cols(&self) -> &[Utf32String] { // safety: cols is basically a box and treated the same as a box, // however there can be other references so using a box (unique ptr) // would be an alias violation @@ -68,6 +89,12 @@ pub(crate) struct ItemsSnapshot { } impl ItemsSnapshot { + pub(crate) fn new() -> Self { + Self { + items: Vec::with_capacity(1024), + } + } + pub(crate) fn outdated(&self, items: &ItemCache) -> bool { items.live.len() != self.items.len() } diff --git a/worker/src/lib.rs b/worker/src/lib.rs index aa047fc..43f4123 100644 --- a/worker/src/lib.rs +++ b/worker/src/lib.rs @@ -1,123 +1,30 @@ -use std::cell::UnsafeCell; use std::ops::Deref; use std::sync::atomic::{self, AtomicBool}; use std::sync::Arc; use std::time::Duration; -use crate::items::{ItemCache, ItemsSnapshot}; -use crate::query::Query; +use crate::items::{Item, ItemCache}; +use crate::worker::Worker; +use rayon::ThreadPool; + +pub use crate::query::{CaseMatching, Pattern, PatternKind, Query}; pub use crate::utf32_string::Utf32String; -use 
parking_lot::lock_api::ArcMutexGuard; -use rayon::prelude::*; mod items; mod query; mod utf32_string; +mod worker; +pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; -use parking_lot::{Mutex, MutexGuard, RawMutex}; +use parking_lot::{Mutex, MutexGuard}; #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub struct Match { - score: u32, - idx: u32, -} - -struct Matchers(Box<[UnsafeCell]>); - -impl Matchers { - // thiss is not a true mut from ref, we use a cell here - #[allow(clippy::mut_from_ref)] - unsafe fn get(&self) -> &mut nucleo_matcher::Matcher { - &mut *self.0[rayon::current_thread_index().unwrap()].get() - } -} - -unsafe impl Sync for Matchers {} -unsafe impl Send for Matchers {} - -struct Worker { - notify: Arc<(dyn Fn() + Sync + Send)>, - running: bool, - items: ItemsSnapshot, - matchers: Matchers, - matches: Vec, - query: Query, - canceled: Arc, -} - -impl Worker { - unsafe fn run( - &mut self, - items_lock: ArcMutexGuard, - query_status: query::Status, - canceled: Arc, - ) { - self.running = true; - let mut last_scored_item = self.items.len(); - let cleared = self.items.update(&items_lock); - drop(items_lock); - - // TODO: be smarter around reusing past results for rescoring - if cleared || query_status == query::Status::Rescore { - self.matches.clear(); - last_scored_item = 0; - } - - let matchers = &self.matchers; - let query = &self.query; - let items = unsafe { self.items.get() }; - - if query_status != query::Status::Unchanged && !self.matches.is_empty() { - self.matches - .par_iter_mut() - .take_any_while(|_| canceled.load(atomic::Ordering::Relaxed)) - .for_each(|match_| { - let item = &items[match_.idx as usize]; - match_.score = query - .score(item.cols(), unsafe { matchers.get() }) - .unwrap_or(u32::MAX); - }); - // TODO: do this in parallel? - self.matches.retain(|m| m.score != u32::MAX) - } - - if last_scored_item != self.items.len() { - self.running = true; - let items = items[last_scored_item..] 
- .par_iter() - .enumerate() - .filter_map(|(i, item)| { - let score = if canceled.load(atomic::Ordering::Relaxed) { - 0 - } else { - query.score(item.cols(), unsafe { matchers.get() })? - }; - Some(Match { - score, - idx: i as u32, - }) - }); - self.matches.par_extend(items) - } - - if !self.canceled.load(atomic::Ordering::Relaxed) { - // TODO: cancel sort in progess? - self.matches.par_sort_unstable_by(|match1, match2| { - match2.idx.cmp(&match1.idx).then_with(|| { - // the tie breaker is comparitevly rarely needed so we keep it - // in a branch especially beacuse we need to acceess the items - // array here which invovles some pointer chasing - let item1 = &items[match1.idx as usize]; - let item2 = &items[match2.idx as usize]; - (item1.len, match1.idx).cmp(&(item2.len, match2.idx)) - }) - }); - } - - (self.notify)(); - } + pub score: u32, + pub idx: u32, } +#[derive(Clone)] pub struct Items { cache: Arc>, items: Arc>>, @@ -145,7 +52,9 @@ impl Items { MutexGuard::map(self.items.lock(), |items| items.as_mut_slice()) } - pub fn push() {} + pub fn get_matcher_items(&self) -> impl Deref + '_ { + MutexGuard::map(self.cache.lock(), |items| items.get()) + } } pub struct Nucleo { @@ -153,18 +62,41 @@ pub struct Nucleo { // but this lets us avoid some unsafe worker: Arc>, canceled: Arc, - items: Items, - thread_pool: rayon::ThreadPool, + pool: ThreadPool, + pub items: Items, pub matches: Vec, pub query: Query, } impl Nucleo { + pub fn new( + config: MatcherConfig, + notify: Arc<(dyn Fn() + Sync + Send)>, + num_threads: Option, + case_matching: CaseMatching, + cols: usize, + ) -> Self { + let (pool, worker) = Worker::new(notify.clone(), num_threads, config); + Self { + canceled: worker.canceled.clone(), + items: Items { + cache: Arc::new(Mutex::new(ItemCache::new())), + items: Arc::new(Mutex::new(Vec::with_capacity(1024))), + notify, + }, + pool, + matches: Vec::with_capacity(1024), + query: Query::new(&config, case_matching, cols), + worker: Arc::new(Mutex::new(worker)), 
+ } + } + pub fn tick(&mut self, timeout: u64) -> bool { let status = self.query.status(); let items = self.items.cache.lock_arc(); let canceled = status != query::Status::Unchanged || items.cleared(); let mut inner = if canceled { + self.query.reset_status(); self.canceled.store(true, atomic::Ordering::Relaxed); self.worker.lock_arc() } else { @@ -183,9 +115,7 @@ impl Nucleo { } if canceled || inner.items.outdated(&items) { - let canceled = self.canceled.clone(); - self.thread_pool - .spawn(move || unsafe { inner.run(items, status, canceled) }) + self.pool.spawn(move || unsafe { inner.run(items, status) }) } true } diff --git a/worker/src/query.rs b/worker/src/query.rs index 936c6bf..474540c 100644 --- a/worker/src/query.rs +++ b/worker/src/query.rs @@ -1,9 +1,18 @@ -use nucleo_matcher::{Matcher, Utf32Str}; +use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; use crate::Utf32String; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaseMatching { + Ignore, + Smart, + Respect, +} + #[derive(Debug, PartialEq, Eq, Clone, Copy)] -enum PatternKind { +#[non_exhaustive] +pub enum PatternKind { Exact, Fuzzy, Substring, @@ -16,6 +25,120 @@ struct PatternAtom { kind: PatternKind, needle: Utf32String, invert: bool, + ignore_case: bool, +} +impl PatternAtom { + fn literal( + needle: &str, + normalize: bool, + case: CaseMatching, + kind: PatternKind, + escape_whitespace: bool, + ) -> PatternAtom { + let mut ignore_case = case == CaseMatching::Ignore; + let needle = if needle.is_ascii() { + let mut needle = if escape_whitespace { + if let Some((start, rem)) = needle.split_once("\\ ") { + let mut needle = start.to_owned(); + for rem in rem.split("\\ ") { + needle.push(' '); + needle.push_str(rem); + } + needle + } else { + needle.to_owned() + } + } else { + needle.to_owned() + }; + + match case { + CaseMatching::Ignore => needle.make_ascii_lowercase(), + CaseMatching::Smart => ignore_case = needle.bytes().any(|b| b.is_ascii_uppercase()), + 
CaseMatching::Respect => (), + } + + Utf32String::Ascii(needle.into_boxed_str()) + } else { + let mut needle_ = Vec::with_capacity(needle.len()); + if escape_whitespace { + let mut saw_backslash = false; + for mut c in needle.chars() { + if saw_backslash { + if c == ' ' { + needle_.push(' '); + saw_backslash = false; + continue; + } else { + needle_.push('\\'); + } + } + saw_backslash = c == '\\'; + if normalize { + c = chars::normalize(c); + } + match case { + CaseMatching::Ignore => c = chars::to_lower_case(c), + CaseMatching::Smart => { + ignore_case = ignore_case || c.is_uppercase(); + } + CaseMatching::Respect => (), + } + needle_.push(c); + } + }; + Utf32String::Unicode(needle_.into_boxed_slice()) + }; + PatternAtom { + kind, + needle, + invert: false, + ignore_case, + } + } + + fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom { + let mut atom = raw; + let inverse = atom.starts_with('!'); + if inverse { + atom = &atom[1..]; + } + + let mut kind = match atom.as_bytes() { + [b'^', ..] => { + atom = &atom[1..]; + PatternKind::Prefix + } + [b'\'', ..] => { + atom = &atom[1..]; + PatternKind::Substring + } + [b'\\', b'^' | b'\'', ..] 
=> { + atom = &atom[1..]; + PatternKind::Fuzzy + } + _ => PatternKind::Fuzzy, + }; + + match atom.as_bytes() { + [.., b'\\', b'$'] => (), + [.., b'$'] => { + kind = if kind == PatternKind::Fuzzy { + PatternKind::Postfix + } else { + PatternKind::Exact + }; + atom = &atom[..atom.len() - 1] + } + _ => (), + } + + if inverse && kind == PatternKind::Fuzzy { + kind = PatternKind::Substring + } + + PatternAtom::literal(atom, normalize, case, kind, true) + } } #[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] @@ -31,6 +154,20 @@ pub struct Query { } impl Query { + pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching, cols: usize) -> Query { + Query { + cols: vec![ + Pattern { + terms: Vec::new(), + case_matching, + normalize: matcher_config.normalize, + status: Status::Unchanged, + }; + cols + ], + } + } + pub(crate) fn status(&self) -> Status { self.cols .iter() @@ -39,7 +176,13 @@ impl Query { .unwrap_or(Status::Unchanged) } - pub(crate) fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { + pub(crate) fn reset_status(&mut self) { + for col in &mut self.cols { + col.status = Status::Unchanged + } + } + + pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { // TODO: wheight columns? 
let mut score = 0; for (pattern, haystack) in self.cols.iter().zip(haystack) { @@ -52,6 +195,8 @@ impl Query { #[derive(Clone, Debug)] pub struct Pattern { terms: Vec, + case_matching: CaseMatching, + normalize: bool, status: Status, } @@ -62,6 +207,7 @@ impl Pattern { } let mut score = 0; for pattern in &self.terms { + matcher.config.ignore_case = pattern.ignore_case; let pattern_score = match pattern.kind { PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), @@ -81,4 +227,93 @@ impl Pattern { } Some(score) } + + pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec, + ) -> Option { + if self.terms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.terms { + matcher.config.ignore_case = pattern.ignore_case; + if pattern.invert { + let pattern_score = match pattern.kind { + PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), + PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), + PatternKind::Substring => { + matcher.substring_match(haystack, pattern.needle.slice(..)) + } + PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), + PatternKind::Postfix => { + matcher.prefix_match(haystack, pattern.needle.slice(..)) + } + }; + if pattern_score.is_some() { + return None; + } + continue; + } + let pattern_score = match pattern.kind { + PatternKind::Exact => { + matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + } + PatternKind::Fuzzy => { + matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + } + PatternKind::Substring => { + matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + } + PatternKind::Prefix => { + matcher.exact_indices(haystack, pattern.needle.slice(..), indices) + } + PatternKind::Postfix => { + matcher.exact_indices(haystack, pattern.needle.slice(..), 
indices) + } + }; + score += pattern_score? as u32 + } + Some(score) + } + + pub fn parse_from(&mut self, pattern: &str, append: bool) { + self.terms.clear(); + let invert = self.terms.last().map_or(false, |pat| pat.invert); + for atom in pattern_atoms(pattern) { + self.terms + .push(PatternAtom::parse(atom, self.normalize, self.case_matching)); + } + self.status = if append && !invert && self.status != Status::Rescore { + Status::Update + } else { + Status::Rescore + }; + } + + pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) { + self.terms.clear(); + let pattern = + PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false); + self.terms.push(pattern); + self.status = if append && self.status != Status::Rescore { + Status::Update + } else { + Status::Rescore + }; + } +} + +fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { + let mut saw_backslash = false; + pattern.split(move |c| { + saw_backslash = match c { + ' ' if !saw_backslash => return true, + '\\' => true, + _ => false, + }; + false + }) } diff --git a/worker/src/worker.rs b/worker/src/worker.rs new file mode 100644 index 0000000..500b001 --- /dev/null +++ b/worker/src/worker.rs @@ -0,0 +1,134 @@ +use std::cell::UnsafeCell; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::Arc; + +use nucleo_matcher::MatcherConfig; +use parking_lot::lock_api::ArcMutexGuard; +use parking_lot::RawMutex; +use rayon::{prelude::*, ThreadPool}; + +use crate::items::{ItemCache, ItemsSnapshot}; +use crate::query::{self, Query}; +use crate::Match; + +struct Matchers(Box<[UnsafeCell]>); + +impl Matchers { + // thiss is not a true mut from ref, we use a cell here + #[allow(clippy::mut_from_ref)] + unsafe fn get(&self) -> &mut nucleo_matcher::Matcher { + &mut *self.0[rayon::current_thread_index().unwrap()].get() + } +} + +unsafe impl Sync for Matchers {} +unsafe impl Send for Matchers {} + +pub(crate) struct Worker { + notify: Arc<(dyn Fn() + Sync + Send)>, + pub(crate) 
running: bool, + pub(crate) items: ItemsSnapshot, + matchers: Matchers, + pub(crate) matches: Vec, + pub(crate) query: Query, + pub(crate) canceled: Arc, +} + +impl Worker { + pub(crate) fn new( + notify: Arc<(dyn Fn() + Sync + Send)>, + worker_threads: Option, + config: MatcherConfig, + ) -> (ThreadPool, Worker) { + let worker_threads = worker_threads + .unwrap_or_else(|| std::thread::available_parallelism().map_or(4, |it| it.get())); + let pool = rayon::ThreadPoolBuilder::new() + .thread_name(|i| format!("nucleo worker {i}")) + .num_threads(worker_threads) + .build() + .expect("creating threadpool failed"); + let matchers = (0..worker_threads) + .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config))) + .collect(); + let worker = Worker { + notify, + running: false, + items: ItemsSnapshot::new(), + matchers: Matchers(matchers), + matches: Vec::with_capacity(1024), + // just a placeholder + query: Query::new(&config, crate::CaseMatching::Ignore, 0), + canceled: Arc::new(AtomicBool::new(false)), + }; + (pool, worker) + } + pub(crate) unsafe fn run( + &mut self, + items_lock: ArcMutexGuard, + query_status: query::Status, + ) { + self.running = true; + let mut last_scored_item = self.items.len(); + let cleared = self.items.update(&items_lock); + drop(items_lock); + + // TODO: be smarter around reusing past results for rescoring + if cleared || query_status == query::Status::Rescore { + self.matches.clear(); + last_scored_item = 0; + } + + let matchers = &self.matchers; + let query = &self.query; + let items = unsafe { self.items.get() }; + + if query_status != query::Status::Unchanged && !self.matches.is_empty() { + self.matches + .par_iter_mut() + .take_any_while(|_| self.canceled.load(atomic::Ordering::Relaxed)) + .for_each(|match_| { + let item = &items[match_.idx as usize]; + match_.score = query + .score(item.cols(), unsafe { matchers.get() }) + .unwrap_or(u32::MAX); + }); + // TODO: do this in parallel? 
+ self.matches.retain(|m| m.score != u32::MAX) + } + + if last_scored_item != self.items.len() { + self.running = true; + let items = items[last_scored_item..] + .par_iter() + .enumerate() + .filter_map(|(i, item)| { + let score = if self.canceled.load(atomic::Ordering::Relaxed) { + 0 + } else { + query.score(item.cols(), unsafe { matchers.get() })? + }; + Some(Match { + score, + idx: i as u32, + }) + }); + self.matches.par_extend(items) + } + + if !self.canceled.load(atomic::Ordering::Relaxed) { + // TODO: cancel sort in progress? + self.matches.par_sort_unstable_by(|match1, match2| { + match2.idx.cmp(&match1.idx).then_with(|| { + // the tie breaker is comparatively rarely needed so we keep it + // in a branch especially because we need to access the items + // array here which involves some pointer chasing + let item1 = &items[match1.idx as usize]; + let item2 = &items[match2.idx as usize]; + (item1.len, match1.idx).cmp(&(item2.len, match2.idx)) + }) + }); + } + + (self.notify)(); + } +}