cleanup bugs

Pascal Kuthe 2023-07-30 04:52:44 +02:00
parent 6b8ee0f585
commit 1ce8850f7e
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
17 changed files with 463 additions and 218 deletions

Cargo.lock generated
View File

@@ -15,7 +15,6 @@ dependencies = [
  "brunch",
  "fuzzy-matcher",
  "nucleo",
- "nucleo-matcher",
  "walkdir",
 ]

View File

@@ -1,2 +1,18 @@
+[package]
+name = "nucleo"
+description = "plug and play high performance fuzzy matcher"
+authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
+version = "0.1.0"
+edition = "2021"
+license = "MPL-2.0"
+repository = "https://github.com/helix-editor/nucleo"
+
+[lib]
+
+[dependencies]
+nucleo-matcher = { version = "0.1", path = "matcher" }
+parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]}
+rayon = "1.7.0"
+
 [workspace]
-members = [ "matcher", "worker", "benches" ]
+members = [ "matcher", "bench" ]

View File

@@ -6,8 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-nucleo-matcher = { version = "0.1", path = "../matcher" }
-nucleo = { version = "0.1", path = "../worker" }
+nucleo = { version = "0.1", path = "../" }
 brunch = "0.5.0"
 fuzzy-matcher = "0.3.7"
 walkdir = "2"

View File

@@ -4,8 +4,7 @@ use std::process::Command;
 use brunch::{Bench, Benches};
 use fuzzy_matcher::FuzzyMatcher;
-use nucleo::Utf32String;
-use nucleo_matcher::Utf32Str;
+use nucleo::{Utf32Str, Utf32String};

 fn bench_dir() -> PathBuf {
     std::env::var_os("BENCHMARK_DIR")
@@ -44,9 +43,8 @@ fn main() {
             Some((path.as_str().into(), path))
         })
         .unzip();
-    let mut nucleo =
-        nucleo_matcher::Matcher::new(nucleo_matcher::MatcherConfig::DEFAULT.match_paths());
-    let skim = fuzzy_matcher::skim::SkimMatcherV2::default().ignore_case();
+    let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths());
+    let skim = fuzzy_matcher::skim::SkimMatcherV2::default();

     // TODO: unicode?
     let needles = ["never_matches", "copying", "/doc/kernel", "//.h"];

View File

@@ -11,9 +11,7 @@ pub struct MatcherConfig {
     /// Extra bonus for word boundary after slash, colon, semi-colon, and comma
     pub(crate) bonus_boundary_delimiter: u16,
     pub initial_char_class: CharClass,
-    /// Whether to normalize latin script characters to ASCII
-    /// this significantly degrades performance so its not recommended
-    /// to be turned on by default
+    /// Whether to normalize latin script characters to ASCII (enabled by default)
     pub normalize: bool,
     /// whether to ignore casing
     pub ignore_case: bool,
@@ -26,7 +24,7 @@ impl MatcherConfig {
         bonus_boundary_white: BONUS_BOUNDARY + 2,
         bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
         initial_char_class: CharClass::Whitespace,
-        normalize: false,
+        normalize: true,
         ignore_case: true,
         }
     };
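Since this flips `normalize` on by default, callers that relied on the old default now have to opt out explicitly. A minimal caller-side sketch, not part of this commit, assuming the `MatcherConfig::DEFAULT` and `Matcher::new` API used elsewhere in this diff:

use nucleo_matcher::{Matcher, MatcherConfig};

fn main() {
    // `normalize` now defaults to true; opt out explicitly when latin-script
    // normalization to ASCII is not wanted (illustrative caller-side snippet).
    let mut config = MatcherConfig::DEFAULT;
    config.normalize = false;
    let _matcher = Matcher::new(config);
}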

View File

@@ -125,26 +125,41 @@ impl Matcher {
     fn fuzzy_matcher_impl<const INDICES: bool>(
         &mut self,
-        haystack: Utf32Str<'_>,
+        haystack_: Utf32Str<'_>,
         needle_: Utf32Str<'_>,
         indices: &mut Vec<u32>,
     ) -> Option<u16> {
-        if needle_.len() > haystack.len() || needle_.is_empty() {
+        if needle_.len() > haystack_.len() || needle_.is_empty() {
             return None;
         }
-        if needle_.len() == haystack.len() {
-            return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
+        if needle_.len() == haystack_.len() {
+            return self.exact_match_impl::<INDICES>(
+                haystack_,
+                needle_,
+                0,
+                haystack_.len(),
+                indices,
+            );
         }
         assert!(
-            haystack.len() <= u32::MAX as usize,
+            haystack_.len() <= u32::MAX as usize,
             "fuzzy matching is only support for up to 2^32-1 codepoints"
         );
-        match (haystack, needle_) {
+        match (haystack_, needle_) {
             (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
                 if let &[needle] = needle {
                     return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices);
                 }
                 let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?;
+                if needle_.len() == end - start {
+                    return Some(self.calculate_score::<INDICES, _, _>(
+                        AsciiChar::cast(haystack),
+                        AsciiChar::cast(needle),
+                        start,
+                        greedy_end,
+                        indices,
+                    ));
+                }
                 self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
                     AsciiChar::cast(haystack),
                     AsciiChar::cast(needle),
@@ -171,6 +186,10 @@ impl Matcher {
                     return Some(res);
                 }
                 let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
+                if needle_.len() == end - start {
+                    return self
+                        .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
+                }
                 self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
                     haystack,
                     AsciiChar::cast(needle),
@@ -188,6 +207,10 @@ impl Matcher {
                     return Some(res);
                 }
                 let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
+                if needle_.len() == end - start {
+                    return self
+                        .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
+                }
                 self.fuzzy_match_optimal::<INDICES, char, char>(
                     haystack,
                     needle,
@@ -243,7 +266,7 @@ impl Matcher {
             return None;
         }
         if needle_.len() == haystack.len() {
-            return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
+            return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
         }
         assert!(
             haystack.len() <= u32::MAX as usize,
@@ -252,6 +275,15 @@ impl Matcher {
         match (haystack, needle_) {
             (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
                 let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?;
+                if needle_.len() == greedy_end - start {
+                    return Some(self.calculate_score::<INDICES, _, _>(
+                        AsciiChar::cast(haystack),
+                        AsciiChar::cast(needle),
+                        start,
+                        greedy_end,
+                        indices,
+                    ));
+                }
                 self.fuzzy_match_greedy_::<INDICES, AsciiChar, AsciiChar>(
                     AsciiChar::cast(haystack),
                     AsciiChar::cast(needle),
@@ -330,7 +362,7 @@ impl Matcher {
             return None;
         }
         if needle_.len() == haystack.len() {
-            return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
+            return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
         }
         assert!(
             haystack.len() <= u32::MAX as usize,
@@ -393,7 +425,7 @@ impl Matcher {
     ///
     /// See the [matcher documentation](crate::Matcher) for more details.
     pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
-        self.exact_match_impl::<false>(haystack, needle, &mut Vec::new())
+        self.exact_match_impl::<false>(haystack, needle, 0, haystack.len(), &mut Vec::new())
     }

     /// Checks whether needle and haystack match exactly and compute the matches indices.
@@ -407,7 +439,7 @@ impl Matcher {
         needle: Utf32Str<'_>,
         indices: &mut Vec<u32>,
     ) -> Option<u16> {
-        self.exact_match_impl::<true>(haystack, needle, indices)
+        self.exact_match_impl::<true>(haystack, needle, 0, haystack.len(), indices)
     }

     /// Checks whether needle is a prefix of the haystack.
@@ -419,7 +451,7 @@ impl Matcher {
         if haystack.len() < needle.len() {
             None
         } else {
-            self.exact_match_impl::<false>(haystack.slice(..needle.len()), needle, &mut Vec::new())
+            self.exact_match_impl::<false>(haystack, needle, 0, needle.len(), &mut Vec::new())
         }
     }
@@ -437,7 +469,7 @@ impl Matcher {
         if haystack.len() < needle.len() {
             None
         } else {
-            self.exact_match_impl::<true>(haystack.slice(..needle.len()), needle, indices)
+            self.exact_match_impl::<true>(haystack, needle, 0, needle.len(), indices)
         }
     }
@@ -451,8 +483,10 @@ impl Matcher {
             None
         } else {
             self.exact_match_impl::<false>(
-                haystack.slice(haystack.len() - needle.len()..),
+                haystack,
                 needle,
+                haystack.len() - needle.len(),
+                haystack.len(),
                 &mut Vec::new(),
             )
         }
@@ -473,8 +507,10 @@ impl Matcher {
             None
         } else {
             self.exact_match_impl::<true>(
-                haystack.slice(haystack.len() - needle.len()..),
+                haystack,
                 needle,
+                haystack.len() - needle.len(),
+                haystack.len(),
                 indices,
             )
         }
@@ -484,9 +520,11 @@ impl Matcher {
         &mut self,
         haystack: Utf32Str<'_>,
         needle_: Utf32Str<'_>,
+        start: usize,
+        end: usize,
         indices: &mut Vec<u32>,
     ) -> Option<u16> {
-        if needle_.len() != haystack.len() || needle_.is_empty() {
+        if needle_.len() != end - start || needle_.is_empty() {
             return None;
         }
         assert!(
@@ -496,7 +534,7 @@ impl Matcher {
         let score = match (haystack, needle_) {
             (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
                 let matched = if self.config.ignore_case {
-                    AsciiChar::cast(haystack)
+                    AsciiChar::cast(haystack)[start..end]
                         .iter()
                         .map(|c| c.normalize(&self.config))
                         .eq(AsciiChar::cast(needle)
@@ -511,8 +549,8 @@ impl Matcher {
                 self.calculate_score::<INDICES, _, _>(
                     AsciiChar::cast(haystack),
                     AsciiChar::cast(needle),
-                    0,
-                    haystack.len(),
+                    start,
+                    end,
                     indices,
                 )
             }
@@ -522,8 +560,7 @@ impl Matcher {
                     return None;
                 }
             (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
-                let matched =
-                    haystack
+                let matched = haystack[start..end]
                     .iter()
                     .map(|c| c.normalize(&self.config))
                     .eq(AsciiChar::cast(needle)
@@ -536,20 +573,20 @@ impl Matcher {
                 self.calculate_score::<INDICES, _, _>(
                     haystack,
                     AsciiChar::cast(needle),
-                    0,
-                    haystack.len(),
+                    start,
+                    end,
                     indices,
                 )
             }
             (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
-                let matched = haystack
+                let matched = haystack[start..end]
                     .iter()
                     .map(|c| c.normalize(&self.config))
                     .eq(needle.iter().map(|c| c.normalize(&self.config)));
                 if !matched {
                     return None;
                 }
-                self.calculate_score::<INDICES, _, _>(haystack, needle, 0, haystack.len(), indices)
+                self.calculate_score::<INDICES, _, _>(haystack, needle, start, end, indices)
             }
         };
         Some(score)

View File

@@ -25,6 +25,8 @@ struct MatrixLayout<C: Char> {
 }
 impl<C: Char> MatrixLayout<C> {
     fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout<C> {
+        assert!(haystack_len >= needle_len);
+        assert!(haystack_len <= u32::MAX as usize);
         let mut layout = Layout::from_size_align(0, 1).unwrap();
         let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
         let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();

View File

@@ -85,6 +85,9 @@ impl Matcher {
             .iter()
             .rev()
             .position(|c| c.normalize(&self.config) == needle_char)?;
+        if end - start < needle.len() {
+            return None;
+        }
         Some((start, end))
     }

View File

@@ -89,9 +89,16 @@ pub(crate) struct ItemsSnapshot {
 }

 impl ItemsSnapshot {
-    pub(crate) fn new() -> Self {
+    pub(crate) fn new(items: &ItemCache) -> Self {
         Self {
-            items: Vec::with_capacity(1024),
+            items: items
+                .live
+                .iter()
+                .map(|item| ItemSnapshot {
+                    cols: item.cols,
+                    len: item.cols().iter().map(|s| s.len() as u32).sum(),
+                })
+                .collect(),
         }
     }
@@ -104,7 +111,7 @@ impl ItemsSnapshot {
     }

     pub(crate) fn update(&mut self, items: &ItemCache) -> bool {
-        let cleared = items.evicted.is_empty();
+        let cleared = !items.evicted.is_empty();
         // drop in another thread to ensure we don't wait for a long drop here
         if cleared {
             self.items.clear();

src/lib.rs Normal file
View File

@@ -0,0 +1,207 @@
use std::cmp::Reverse;
use std::ops::Deref;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use crate::items::{Item, ItemCache};
use crate::worker::Worker;
use parking_lot::lock_api::ArcMutexGuard;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::utf32_string::Utf32String;
mod items;
mod query;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use parking_lot::{Mutex, MutexGuard, RawMutex};
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
pub score: u32,
pub idx: u32,
}
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Status {
pub changed: bool,
pub running: bool,
}
#[derive(Clone)]
pub struct Items<T> {
cache: Arc<Mutex<ItemCache>>,
items: Arc<Mutex<Vec<T>>>,
notify: Arc<(dyn Fn() + Sync + Send)>,
}
impl<T: Sync + Send> Items<T> {
pub fn clear(&mut self) {
self.items.lock().clear();
self.cache.lock().clear();
}
pub fn append(&mut self, items: impl Iterator<Item = (T, Box<[Utf32String]>)>) {
let mut cache = self.cache.lock();
let mut items_ = self.items.lock();
items_.extend(items.map(|(item, text)| {
cache.push(text);
item
}));
// notify that a new tick will be necessary
(self.notify)();
}
pub fn get(&self) -> impl Deref<Target = [T]> + '_ {
MutexGuard::map(self.items.lock(), |items| items.as_mut_slice())
}
pub fn get_matcher_items(&self) -> impl Deref<Target = [Item]> + '_ {
MutexGuard::map(self.cache.lock(), |items| items.get())
}
}
pub struct Nucleo<T: Sync + Send> {
// the way the API is build we totally don't actually neeed these to be Arcs
// but this lets us avoid some unsafe
worker: Arc<Mutex<Worker>>,
canceled: Arc<AtomicBool>,
pool: ThreadPool,
pub items: Items<T>,
pub matches: Vec<Match>,
pub pattern: MultiPattern,
should_notify: Arc<AtomicBool>,
}
impl<T: Sync + Send> Nucleo<T> {
pub fn new(
config: MatcherConfig,
notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>,
case_matching: CaseMatching,
cols: usize,
items: impl Iterator<Item = (T, Box<[Utf32String]>)>,
) -> Self {
let mut cache = ItemCache::new();
let items: Vec<_> = items
.map(|(item, text)| {
cache.push(text);
item
})
.collect();
let matches: Vec<_> = (0..items.len())
.map(|i| Match {
score: 0,
idx: i as u32,
})
.collect();
let (pool, worker) =
Worker::new(notify.clone(), num_threads, config, matches.clone(), &cache);
Self {
canceled: worker.canceled.clone(),
should_notify: worker.should_notify.clone(),
items: Items {
cache: Arc::new(Mutex::new(cache)),
items: Arc::new(Mutex::new(items)),
notify,
},
pool,
matches,
pattern: MultiPattern::new(&config, case_matching, cols),
worker: Arc::new(Mutex::new(worker)),
}
}
pub fn update_config(&mut self, config: MatcherConfig) {
self.worker.lock().update_config(config)
}
pub fn tick(&mut self, timeout: u64) -> Status {
self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status();
let items = self.items.cache.lock_arc();
let canceled = status != query::Status::Unchanged || items.cleared();
let res = self.tick_inner(timeout, canceled, items, status);
if !canceled {
self.should_notify.store(true, atomic::Ordering::Relaxed);
return res;
}
let items = self.items.cache.lock_arc();
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
self.should_notify.store(true, atomic::Ordering::Relaxed);
res
}
fn tick_inner(
&mut self,
timeout: u64,
canceled: bool,
items: ArcMutexGuard<RawMutex, ItemCache>,
status: query::Status,
) -> Status {
let mut inner = if canceled {
self.pattern.reset_status();
self.canceled.store(true, atomic::Ordering::Relaxed);
self.worker.lock_arc()
} else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
return Status{ changed: false, running: true };
};
worker
};
let changed = inner.running;
if inner.running {
inner.running = false;
self.matches.clone_from(&inner.matches);
}
let running = canceled || inner.items.outdated(&items);
if running {
inner.pattern.clone_from(&self.pattern);
self.canceled.store(false, atomic::Ordering::Relaxed);
self.pool.spawn(move || unsafe { inner.run(items, status) })
}
Status { changed, running }
}
}
impl<T: Sync + Send> Drop for Nucleo<T> {
fn drop(&mut self) {
// we ensure the worker quits before dropping items to ensure that
// the worker can always assume the items outlife it
self.canceled.store(true, atomic::Ordering::Relaxed);
let lock = self.worker.try_lock_for(Duration::from_secs(1));
if lock.is_none() {
unreachable!("thread pool failed to shutdown properly")
}
}
}
/// convenicne function to easily fuzzy match
/// on a (relatievly small list of inputs). This is not recommended for building a full tui
/// application that can match large numbers of matches as all matching is done on the current
/// thread, effectively blocking the UI
pub fn fuzzy_match<T: AsRef<str>>(
matcher: &mut Matcher,
pattern: &str,
items: impl IntoIterator<Item = T>,
case_matching: CaseMatching,
) -> Vec<(T, u32)> {
let mut pattern_ = Pattern::new(&matcher.config, case_matching);
pattern_.set_literal(pattern, PatternKind::Fuzzy, false);
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
pattern_
.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(item, score)| (Reverse(*score), item.as_ref().len()));
items
}
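A minimal usage sketch for the `fuzzy_match` convenience helper above; this is not part of the commit and assumes the crate-root re-exports declared in this file:

use nucleo::{fuzzy_match, CaseMatching, Matcher, MatcherConfig};

fn main() {
    let mut matcher = Matcher::new(MatcherConfig::DEFAULT);
    // Scores and sorts a small, fixed list on the current thread; for large
    // interactive lists the `Nucleo` worker above is the intended path.
    let ranked = fuzzy_match(
        &mut matcher,
        "conf",
        ["Cargo.toml", "src/config.rs", "README.md"],
        CaseMatching::Smart,
    );
    for (item, score) in ranked {
        println!("{score:>4} {item}");
    }
}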

View File

@@ -54,7 +54,9 @@ impl PatternAtom {
         match case {
             CaseMatching::Ignore => needle.make_ascii_lowercase(),
-            CaseMatching::Smart => ignore_case = needle.bytes().any(|b| b.is_ascii_uppercase()),
+            CaseMatching::Smart => {
+                ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
+            }
             CaseMatching::Respect => (),
         }
@@ -80,7 +82,7 @@ impl PatternAtom {
             match case {
                 CaseMatching::Ignore => c = chars::to_lower_case(c),
                 CaseMatching::Smart => {
-                    ignore_case = ignore_case || c.is_uppercase();
+                    ignore_case = ignore_case && !c.is_uppercase();
                 }
                 CaseMatching::Respect => (),
             }
@@ -149,22 +151,18 @@ pub enum Status {
 }

 #[derive(Debug, Clone)]
-pub struct Query {
+pub struct MultiPattern {
     pub cols: Vec<Pattern>,
 }

-impl Query {
-    pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching, cols: usize) -> Query {
-        Query {
-            cols: vec![
-                Pattern {
-                    terms: Vec::new(),
-                    case_matching,
-                    normalize: matcher_config.normalize,
-                    status: Status::Unchanged,
-                };
-                cols
-            ],
+impl MultiPattern {
+    pub fn new(
+        matcher_config: &MatcherConfig,
+        case_matching: CaseMatching,
+        cols: usize,
+    ) -> MultiPattern {
+        MultiPattern {
+            cols: vec![Pattern::new(matcher_config, case_matching); cols],
         }
     }
@@ -201,7 +199,30 @@ pub struct Pattern {
 }

 impl Pattern {
-    pub(crate) fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
+    pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
+        Pattern {
+            terms: Vec::new(),
+            case_matching,
+            normalize: matcher_config.normalize,
+            status: Status::Unchanged,
+        }
+    }
+
+    pub fn new_fuzzy_literal(
+        matcher_config: &MatcherConfig,
+        case_matching: CaseMatching,
+        pattern: &str,
+    ) -> Pattern {
+        let mut res = Pattern {
+            terms: Vec::new(),
+            case_matching,
+            normalize: matcher_config.normalize,
+            status: Status::Unchanged,
+        };
+        res.set_literal(pattern, PatternKind::Fuzzy, false);
+        res
+    }
+
+    pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
         if self.terms.is_empty() {
             return Some(0);
         }
@@ -215,7 +236,7 @@ impl Pattern {
                     matcher.substring_match(haystack, pattern.needle.slice(..))
                 }
                 PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
-                PatternKind::Postfix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
+                PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)),
             };
             if pattern.invert {
                 if pattern_score.is_some() {
@@ -249,7 +270,7 @@ impl Pattern {
                 }
                 PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
                 PatternKind::Postfix => {
-                    matcher.prefix_match(haystack, pattern.needle.slice(..))
+                    matcher.postfix_match(haystack, pattern.needle.slice(..))
                 }
             };
             if pattern_score.is_some() {
@@ -262,16 +283,16 @@ impl Pattern {
                     matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
                 }
                 PatternKind::Fuzzy => {
-                    matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
+                    matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices)
                 }
                 PatternKind::Substring => {
-                    matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
+                    matcher.substring_indices(haystack, pattern.needle.slice(..), indices)
                 }
                 PatternKind::Prefix => {
-                    matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
+                    matcher.prefix_indices(haystack, pattern.needle.slice(..), indices)
                 }
                 PatternKind::Postfix => {
-                    matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
+                    matcher.postfix_indices(haystack, pattern.needle.slice(..), indices)
                 }
             };
             score += pattern_score? as u32
@@ -282,10 +303,15 @@ impl Pattern {
     pub fn parse_from(&mut self, pattern: &str, append: bool) {
         self.terms.clear();
         let invert = self.terms.last().map_or(false, |pat| pat.invert);
-        for atom in pattern_atoms(pattern) {
-            self.terms
-                .push(PatternAtom::parse(atom, self.normalize, self.case_matching));
-        }
+        let atoms = pattern_atoms(pattern).filter_map(|atom| {
+            let atom = PatternAtom::parse(atom, self.normalize, self.case_matching);
+            if atom.needle.is_empty() {
+                return None;
+            }
+            Some(atom)
+        });
+        self.terms.extend(atoms);
         self.status = if append && !invert && self.status != Status::Rescore {
             Status::Update
         } else {
@@ -304,6 +330,10 @@ impl Pattern {
             Status::Rescore
         };
     }
+
+    pub fn is_empty(&self) -> bool {
+        self.terms.is_empty()
+    }
 }

 fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
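The now-public `Pattern` constructors make single-pattern scoring usable without the full `Nucleo` worker. A hedged sketch of that call pattern, not part of this commit, with names taken from the signatures above:

use nucleo::{CaseMatching, Matcher, MatcherConfig, Pattern, Utf32Str};

fn main() {
    let mut matcher = Matcher::new(MatcherConfig::DEFAULT);
    let pattern = Pattern::new_fuzzy_literal(&MatcherConfig::DEFAULT, CaseMatching::Smart, "lib");
    let mut buf = Vec::new();
    // `score` returns `None` when the haystack does not match all atoms.
    let score = pattern.score(Utf32Str::new("src/lib.rs", &mut buf), &mut matcher);
    println!("{score:?}");
}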

View File

@@ -1,5 +1,7 @@
 use core::slice;
+use std::borrow::Cow;
 use std::fmt;
+use std::mem::take;
 use std::ops::{Bound, RangeBounds};

 use nucleo_matcher::Utf32Str;
@@ -12,6 +14,12 @@ pub enum Utf32String {
     /// A string represented as an array of unicode codepoints (basically UTF-32).
     Unicode(Box<[char]>),
 }

+impl Default for Utf32String {
+    fn default() -> Self {
+        Self::Ascii(String::new().into_boxed_str())
+    }
+}
+
 impl Utf32String {
     #[inline]
     pub fn len(&self) -> usize {
@@ -48,31 +56,69 @@
         }
     }

+    #[inline]
     pub fn is_ascii(&self) -> bool {
         matches!(self, Utf32String::Ascii(_))
     }

+    #[inline]
     pub fn get(&self, idx: u32) -> char {
         match self {
             Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char,
             Utf32String::Unicode(codepoints) => codepoints[idx as usize],
         }
     }

+    #[inline]
     pub fn last(&self) -> char {
         match self {
             Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char,
             Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1],
         }
     }

+    #[inline]
     pub fn chars(&self) -> Chars<'_> {
         match self {
             Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()),
             Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
         }
     }
+
+    #[inline]
+    pub fn push_str(&mut self, text: &str) {
+        let mut codeboints = match take(self) {
+            Utf32String::Ascii(bytes) if text.is_ascii() => {
+                let mut bytes = bytes.into_string();
+                bytes.push_str(text);
+                *self = Self::Ascii(bytes.into_boxed_str());
+                return;
+            }
+            Utf32String::Ascii(bytes) => bytes.chars().collect(),
+            Utf32String::Unicode(codepoints) => Vec::from(codepoints),
+        };
+        codeboints.extend(text.chars());
+        *self = Utf32String::Unicode(codeboints.into_boxed_slice());
+    }
+
+    #[inline]
+    pub fn push(&mut self, c: char) {
+        let mut codeboints = match take(self) {
+            Utf32String::Ascii(bytes) if c.is_ascii() => {
+                let mut bytes = bytes.into_string();
+                bytes.push(c);
+                *self = Self::Ascii(bytes.into_boxed_str());
+                return;
+            }
+            Utf32String::Ascii(bytes) => bytes.chars().collect(),
+            Utf32String::Unicode(codepoints) => Vec::from(codepoints),
+        };
+        codeboints.push(c);
+        *self = Utf32String::Unicode(codeboints.into_boxed_slice());
+    }
 }

 impl From<&str> for Utf32String {
+    #[inline]
     fn from(value: &str) -> Self {
         if value.is_ascii() {
             Self::Ascii(value.to_owned().into_boxed_str())
@@ -91,12 +137,24 @@ impl From<Box<str>> for Utf32String {
         }
     }
 }

 impl From<String> for Utf32String {
+    #[inline]
     fn from(value: String) -> Self {
         value.into_boxed_str().into()
     }
 }

+impl<'a> From<Cow<'a, str>> for Utf32String {
+    #[inline]
+    fn from(value: Cow<'a, str>) -> Self {
+        match value {
+            Cow::Borrowed(value) => value.into(),
+            Cow::Owned(value) => value.into(),
+        }
+    }
+}
+
 pub enum Chars<'a> {
     Ascii(slice::Iter<'a, u8>),
     Unicode(slice::Iter<'a, char>),
@@ -104,6 +162,7 @@ pub enum Chars<'a> {
 impl<'a> Iterator for Chars<'a> {
     type Item = char;

+    #[inline]
     fn next(&mut self) -> Option<Self::Item> {
         match self {
             Chars::Ascii(iter) => iter.next().map(|&c| c as char),
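The new `push`/`push_str` methods keep the ASCII representation when possible and promote to the codepoint representation otherwise. A small illustrative sketch, not part of this commit:

use nucleo::Utf32String;

fn main() {
    let mut s = Utf32String::from("path/to");
    assert!(s.is_ascii());
    s.push('/');
    s.push_str("file.rs");
    assert!(s.is_ascii());
    // Appending non-ASCII text switches to the Unicode (codepoint) variant.
    s.push_str("→ü");
    assert!(!s.is_ascii());
    assert_eq!(s.last(), 'ü');
}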

View File

@@ -8,7 +8,7 @@ use parking_lot::RawMutex;
 use rayon::{prelude::*, ThreadPool};

 use crate::items::{ItemCache, ItemsSnapshot};
-use crate::query::{self, Query};
+use crate::query::{self, MultiPattern};
 use crate::Match;

 struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
@@ -30,15 +30,24 @@ pub(crate) struct Worker {
     pub(crate) items: ItemsSnapshot,
     matchers: Matchers,
     pub(crate) matches: Vec<Match>,
-    pub(crate) query: Query,
+    pub(crate) pattern: MultiPattern,
     pub(crate) canceled: Arc<AtomicBool>,
+    pub(crate) should_notify: Arc<AtomicBool>,
 }

 impl Worker {
+    pub(crate) fn update_config(&mut self, config: MatcherConfig) {
+        for matcher in self.matchers.0.iter_mut() {
+            matcher.get_mut().config = config;
+        }
+    }
+
     pub(crate) fn new(
         notify: Arc<(dyn Fn() + Sync + Send)>,
         worker_threads: Option<usize>,
         config: MatcherConfig,
+        matches: Vec<Match>,
+        items: &ItemCache,
     ) -> (ThreadPool, Worker) {
         let worker_threads = worker_threads
             .unwrap_or_else(|| std::thread::available_parallelism().map_or(4, |it| it.get()));
@@ -53,15 +62,17 @@ impl Worker {
         let worker = Worker {
             notify,
             running: false,
-            items: ItemsSnapshot::new(),
+            items: ItemsSnapshot::new(items),
             matchers: Matchers(matchers),
-            matches: Vec::with_capacity(1024),
+            matches,
             // just a placeholder
-            query: Query::new(&config, crate::CaseMatching::Ignore, 0),
+            pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0),
             canceled: Arc::new(AtomicBool::new(false)),
+            should_notify: Arc::new(AtomicBool::new(false)),
         };
         (pool, worker)
     }

     pub(crate) unsafe fn run(
         &mut self,
         items_lock: ArcMutexGuard<RawMutex, ItemCache>,
@@ -77,48 +88,56 @@ impl Worker {
             self.matches.clear();
             last_scored_item = 0;
         }

         let matchers = &self.matchers;
-        let query = &self.query;
+        let pattern = &self.pattern;
         let items = unsafe { self.items.get() };
+
+        if self.pattern.cols.iter().all(|pat| pat.is_empty()) {
+            self.matches.clear();
+            self.matches.extend((0..items.len()).map(|i| Match {
+                score: 0,
+                idx: i as u32,
+            }));
+            if self.should_notify.load(atomic::Ordering::Relaxed) {
+                (self.notify)();
+            }
+            return;
+        }
+
         if query_status != query::Status::Unchanged && !self.matches.is_empty() {
             self.matches
                 .par_iter_mut()
-                .take_any_while(|_| self.canceled.load(atomic::Ordering::Relaxed))
+                .take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
                 .for_each(|match_| {
                     let item = &items[match_.idx as usize];
-                    match_.score = query
+                    match_.score = pattern
                         .score(item.cols(), unsafe { matchers.get() })
                         .unwrap_or(u32::MAX);
                 });
             // TODO: do this in parallel?
-            self.matches.retain(|m| m.score != u32::MAX)
+            self.matches.retain(|m| m.score != u32::MAX);
         }

         if last_scored_item != self.items.len() {
+            self.running = true;
             let items = items[last_scored_item..]
                 .par_iter()
                 .enumerate()
                 .filter_map(|(i, item)| {
                     let score = if self.canceled.load(atomic::Ordering::Relaxed) {
-                        0
+                        u32::MAX - 1
                     } else {
-                        query.score(item.cols(), unsafe { matchers.get() })?
+                        pattern.score(item.cols(), unsafe { matchers.get() })?
                     };
                     Some(Match {
                         score,
                         idx: i as u32,
                     })
                 });
-            self.matches.par_extend(items)
+            self.matches.par_extend(items);
         }

         if !self.canceled.load(atomic::Ordering::Relaxed) {
             // TODO: cancel sort in progess?
             self.matches.par_sort_unstable_by(|match1, match2| {
-                match2.idx.cmp(&match1.idx).then_with(|| {
+                match2.score.cmp(&match1.score).then_with(|| {
                     // the tie breaker is comparitevly rarely needed so we keep it
                     // in a branch especially beacuse we need to acceess the items
                     // array here which invovles some pointer chasing
@@ -129,6 +148,8 @@ impl Worker {
             });
         }

+        if self.should_notify.load(atomic::Ordering::Relaxed) {
             (self.notify)();
+        }
     }
 }

View File

@@ -1,131 +0,0 @@
use std::ops::Deref;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use crate::items::{Item, ItemCache};
use crate::worker::Worker;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, Pattern, PatternKind, Query};
pub use crate::utf32_string::Utf32String;
mod items;
mod query;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use parking_lot::{Mutex, MutexGuard};
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
pub score: u32,
pub idx: u32,
}
#[derive(Clone)]
pub struct Items<T> {
cache: Arc<Mutex<ItemCache>>,
items: Arc<Mutex<Vec<T>>>,
notify: Arc<(dyn Fn() + Sync + Send)>,
}
impl<T: Sync + Send> Items<T> {
pub fn clear(&mut self) {
self.items.lock().clear();
self.cache.lock().clear();
}
pub fn append(&mut self, items: impl Iterator<Item = (T, Box<[Utf32String]>)>) {
let mut cache = self.cache.lock();
let mut items_ = self.items.lock();
items_.extend(items.map(|(item, text)| {
cache.push(text);
item
}));
// notify that a new tick will be necessary
(self.notify)();
}
pub fn get(&self) -> impl Deref<Target = [T]> + '_ {
MutexGuard::map(self.items.lock(), |items| items.as_mut_slice())
}
pub fn get_matcher_items(&self) -> impl Deref<Target = [Item]> + '_ {
MutexGuard::map(self.cache.lock(), |items| items.get())
}
}
pub struct Nucleo<T: Sync + Send> {
// the way the API is build we totally don't actually neeed these to be Arcs
// but this lets us avoid some unsafe
worker: Arc<Mutex<Worker>>,
canceled: Arc<AtomicBool>,
pool: ThreadPool,
pub items: Items<T>,
pub matches: Vec<Match>,
pub query: Query,
}
impl<T: Sync + Send> Nucleo<T> {
pub fn new(
config: MatcherConfig,
notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>,
case_matching: CaseMatching,
cols: usize,
) -> Self {
let (pool, worker) = Worker::new(notify.clone(), num_threads, config);
Self {
canceled: worker.canceled.clone(),
items: Items {
cache: Arc::new(Mutex::new(ItemCache::new())),
items: Arc::new(Mutex::new(Vec::with_capacity(1024))),
notify,
},
pool,
matches: Vec::with_capacity(1024),
query: Query::new(&config, case_matching, cols),
worker: Arc::new(Mutex::new(worker)),
}
}
pub fn tick(&mut self, timeout: u64) -> bool {
let status = self.query.status();
let items = self.items.cache.lock_arc();
let canceled = status != query::Status::Unchanged || items.cleared();
let mut inner = if canceled {
self.query.reset_status();
self.canceled.store(true, atomic::Ordering::Relaxed);
self.worker.lock_arc()
} else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
return true;
};
worker
};
if inner.running {
inner.running = false;
self.matches.clone_from(&inner.matches);
} else if !canceled {
// nothing has changed
return false;
}
if canceled || inner.items.outdated(&items) {
self.pool.spawn(move || unsafe { inner.run(items, status) })
}
true
}
}
impl<T: Sync + Send> Drop for Nucleo<T> {
fn drop(&mut self) {
// we ensure the worker quits before dropping items to ensure that
// the worker can always assume the items outlife it
self.canceled.store(true, atomic::Ordering::Relaxed);
drop(self.worker.lock());
}
}