cleanup bugs

This commit is contained in:
Pascal Kuthe 2023-07-30 04:52:44 +02:00
parent 6b8ee0f585
commit 1ce8850f7e
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
17 changed files with 463 additions and 218 deletions

1
Cargo.lock generated
View File

@ -15,7 +15,6 @@ dependencies = [
"brunch",
"fuzzy-matcher",
"nucleo",
"nucleo-matcher",
"walkdir",
]

View File

@ -1,2 +1,18 @@
[package]
name = "nucleo"
description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.0"
edition = "2021"
license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo"
[lib]
[dependencies]
nucleo-matcher = { version = "0.1", path = "matcher" }
parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]}
rayon = "1.7.0"
[workspace]
members = [ "matcher", "worker", "benches" ]
members = [ "matcher", "bench" ]

View File

@ -6,8 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
nucleo-matcher = { version = "0.1", path = "../matcher" }
nucleo = { version = "0.1", path = "../worker" }
nucleo = { version = "0.1", path = "../" }
brunch = "0.5.0"
fuzzy-matcher = "0.3.7"
walkdir = "2"

View File

@ -4,8 +4,7 @@ use std::process::Command;
use brunch::{Bench, Benches};
use fuzzy_matcher::FuzzyMatcher;
use nucleo::Utf32String;
use nucleo_matcher::Utf32Str;
use nucleo::{Utf32Str, Utf32String};
fn bench_dir() -> PathBuf {
std::env::var_os("BENCHMARK_DIR")
@ -44,9 +43,8 @@ fn main() {
Some((path.as_str().into(), path))
})
.unzip();
let mut nucleo =
nucleo_matcher::Matcher::new(nucleo_matcher::MatcherConfig::DEFAULT.match_paths());
let skim = fuzzy_matcher::skim::SkimMatcherV2::default().ignore_case();
let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths());
let skim = fuzzy_matcher::skim::SkimMatcherV2::default();
// TODO: unicode?
let needles = ["never_matches", "copying", "/doc/kernel", "//.h"];

View File

@ -11,9 +11,7 @@ pub struct MatcherConfig {
/// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub(crate) bonus_boundary_delimiter: u16,
pub initial_char_class: CharClass,
/// Whether to normalize latin script characters to ASCII
/// this significantly degrades performance so its not recommended
/// to be turned on by default
/// Whether to normalize latin script characters to ASCII (enabled by default)
pub normalize: bool,
/// whether to ignore casing
pub ignore_case: bool,
@ -26,7 +24,7 @@ impl MatcherConfig {
bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
initial_char_class: CharClass::Whitespace,
normalize: false,
normalize: true,
ignore_case: true,
}
};

View File

@ -125,26 +125,41 @@ impl Matcher {
fn fuzzy_matcher_impl<const INDICES: bool>(
&mut self,
haystack: Utf32Str<'_>,
haystack_: Utf32Str<'_>,
needle_: Utf32Str<'_>,
indices: &mut Vec<u32>,
) -> Option<u16> {
if needle_.len() > haystack.len() || needle_.is_empty() {
if needle_.len() > haystack_.len() || needle_.is_empty() {
return None;
}
if needle_.len() == haystack.len() {
return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
if needle_.len() == haystack_.len() {
return self.exact_match_impl::<INDICES>(
haystack_,
needle_,
0,
haystack_.len(),
indices,
);
}
assert!(
haystack.len() <= u32::MAX as usize,
haystack_.len() <= u32::MAX as usize,
"fuzzy matching is only support for up to 2^32-1 codepoints"
);
match (haystack, needle_) {
match (haystack_, needle_) {
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
if let &[needle] = needle {
return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices);
}
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?;
if needle_.len() == end - start {
return Some(self.calculate_score::<INDICES, _, _>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
start,
greedy_end,
indices,
));
}
self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
@ -171,6 +186,10 @@ impl Matcher {
return Some(res);
}
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
if needle_.len() == end - start {
return self
.exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
}
self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
haystack,
AsciiChar::cast(needle),
@ -188,6 +207,10 @@ impl Matcher {
return Some(res);
}
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
if needle_.len() == end - start {
return self
.exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
}
self.fuzzy_match_optimal::<INDICES, char, char>(
haystack,
needle,
@ -243,7 +266,7 @@ impl Matcher {
return None;
}
if needle_.len() == haystack.len() {
return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
}
assert!(
haystack.len() <= u32::MAX as usize,
@ -252,6 +275,15 @@ impl Matcher {
match (haystack, needle_) {
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?;
if needle_.len() == greedy_end - start {
return Some(self.calculate_score::<INDICES, _, _>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
start,
greedy_end,
indices,
));
}
self.fuzzy_match_greedy_::<INDICES, AsciiChar, AsciiChar>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
@ -330,7 +362,7 @@ impl Matcher {
return None;
}
if needle_.len() == haystack.len() {
return self.exact_match_impl::<INDICES>(haystack, needle_, indices);
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
}
assert!(
haystack.len() <= u32::MAX as usize,
@ -393,7 +425,7 @@ impl Matcher {
///
/// See the [matcher documentation](crate::Matcher) for more details.
pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
self.exact_match_impl::<false>(haystack, needle, &mut Vec::new())
self.exact_match_impl::<false>(haystack, needle, 0, haystack.len(), &mut Vec::new())
}
/// Checks whether needle and haystack match exactly and compute the matches indices.
@ -407,7 +439,7 @@ impl Matcher {
needle: Utf32Str<'_>,
indices: &mut Vec<u32>,
) -> Option<u16> {
self.exact_match_impl::<true>(haystack, needle, indices)
self.exact_match_impl::<true>(haystack, needle, 0, haystack.len(), indices)
}
/// Checks whether needle is a prefix of the haystack.
@ -419,7 +451,7 @@ impl Matcher {
if haystack.len() < needle.len() {
None
} else {
self.exact_match_impl::<false>(haystack.slice(..needle.len()), needle, &mut Vec::new())
self.exact_match_impl::<false>(haystack, needle, 0, needle.len(), &mut Vec::new())
}
}
@ -437,7 +469,7 @@ impl Matcher {
if haystack.len() < needle.len() {
None
} else {
self.exact_match_impl::<true>(haystack.slice(..needle.len()), needle, indices)
self.exact_match_impl::<true>(haystack, needle, 0, needle.len(), indices)
}
}
@ -451,8 +483,10 @@ impl Matcher {
None
} else {
self.exact_match_impl::<false>(
haystack.slice(haystack.len() - needle.len()..),
haystack,
needle,
haystack.len() - needle.len(),
haystack.len(),
&mut Vec::new(),
)
}
@ -473,8 +507,10 @@ impl Matcher {
None
} else {
self.exact_match_impl::<true>(
haystack.slice(haystack.len() - needle.len()..),
haystack,
needle,
haystack.len() - needle.len(),
haystack.len(),
indices,
)
}
@ -484,9 +520,11 @@ impl Matcher {
&mut self,
haystack: Utf32Str<'_>,
needle_: Utf32Str<'_>,
start: usize,
end: usize,
indices: &mut Vec<u32>,
) -> Option<u16> {
if needle_.len() != haystack.len() || needle_.is_empty() {
if needle_.len() != end - start || needle_.is_empty() {
return None;
}
assert!(
@ -496,7 +534,7 @@ impl Matcher {
let score = match (haystack, needle_) {
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
let matched = if self.config.ignore_case {
AsciiChar::cast(haystack)
AsciiChar::cast(haystack)[start..end]
.iter()
.map(|c| c.normalize(&self.config))
.eq(AsciiChar::cast(needle)
@ -511,8 +549,8 @@ impl Matcher {
self.calculate_score::<INDICES, _, _>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
0,
haystack.len(),
start,
end,
indices,
)
}
@ -522,13 +560,12 @@ impl Matcher {
return None;
}
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
let matched =
haystack
let matched = haystack[start..end]
.iter()
.map(|c| c.normalize(&self.config))
.eq(AsciiChar::cast(needle)
.iter()
.map(|c| c.normalize(&self.config))
.eq(AsciiChar::cast(needle)
.iter()
.map(|c| c.normalize(&self.config)));
.map(|c| c.normalize(&self.config)));
if !matched {
return None;
}
@ -536,20 +573,20 @@ impl Matcher {
self.calculate_score::<INDICES, _, _>(
haystack,
AsciiChar::cast(needle),
0,
haystack.len(),
start,
end,
indices,
)
}
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
let matched = haystack
let matched = haystack[start..end]
.iter()
.map(|c| c.normalize(&self.config))
.eq(needle.iter().map(|c| c.normalize(&self.config)));
if !matched {
return None;
}
self.calculate_score::<INDICES, _, _>(haystack, needle, 0, haystack.len(), indices)
self.calculate_score::<INDICES, _, _>(haystack, needle, start, end, indices)
}
};
Some(score)

View File

@ -25,6 +25,8 @@ struct MatrixLayout<C: Char> {
}
impl<C: Char> MatrixLayout<C> {
fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout<C> {
assert!(haystack_len >= needle_len);
assert!(haystack_len <= u32::MAX as usize);
let mut layout = Layout::from_size_align(0, 1).unwrap();
let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();

View File

@ -85,6 +85,9 @@ impl Matcher {
.iter()
.rev()
.position(|c| c.normalize(&self.config) == needle_char)?;
if end - start < needle.len() {
return None;
}
Some((start, end))
}

View File

@ -89,9 +89,16 @@ pub(crate) struct ItemsSnapshot {
}
impl ItemsSnapshot {
pub(crate) fn new() -> Self {
pub(crate) fn new(items: &ItemCache) -> Self {
Self {
items: Vec::with_capacity(1024),
items: items
.live
.iter()
.map(|item| ItemSnapshot {
cols: item.cols,
len: item.cols().iter().map(|s| s.len() as u32).sum(),
})
.collect(),
}
}
@ -104,7 +111,7 @@ impl ItemsSnapshot {
}
pub(crate) fn update(&mut self, items: &ItemCache) -> bool {
let cleared = items.evicted.is_empty();
let cleared = !items.evicted.is_empty();
// drop in another thread to ensure we don't wait for a long drop here
if cleared {
self.items.clear();

207
src/lib.rs Normal file
View File

@ -0,0 +1,207 @@
use std::cmp::Reverse;
use std::ops::Deref;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use crate::items::{Item, ItemCache};
use crate::worker::Worker;
use parking_lot::lock_api::ArcMutexGuard;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::utf32_string::Utf32String;
mod items;
mod query;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use parking_lot::{Mutex, MutexGuard, RawMutex};
/// A single scored entry in the list of matches.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
/// Score assigned to the item by the current pattern (`0` for the initial,
/// unfiltered match list built in `Nucleo::new`).
pub score: u32,
/// Position of the matched item in the item list.
pub idx: u32,
}
/// Result of a call to `Nucleo::tick`.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Status {
/// `true` when the tick copied a fresh set of matches from the worker.
pub changed: bool,
/// `true` when a worker run was started by this tick, or when the worker
/// could not be locked within the timeout because it is still busy.
pub running: bool,
}
/// Cloneable handle to the shared item storage.
///
/// All clones refer to the same underlying data because every field is an `Arc`.
#[derive(Clone)]
pub struct Items<T> {
// matcher text (one boxed slice of columns per item) pushed alongside each item
cache: Arc<Mutex<ItemCache>>,
// the user supplied values, stored in the same order as their text in `cache`
items: Arc<Mutex<Vec<T>>>,
// callback invoked after new items were appended
notify: Arc<(dyn Fn() + Sync + Send)>,
}
impl<T: Sync + Send> Items<T> {
    /// Removes every item together with its cached matcher text.
    ///
    /// Both locks are acquired in the same order as in [`Items::append`]
    /// (cache first, then items) and held until both collections are cleared.
    /// This stops a concurrent `append` on a cloned handle from slipping in
    /// between the two clears and leaving the collections with different
    /// lengths.
    pub fn clear(&mut self) {
        let mut cache = self.cache.lock();
        let mut items = self.items.lock();
        items.clear();
        cache.clear();
    }

    /// Appends new items and their matcher text, then invokes the notify
    /// callback to signal that a new tick is necessary.
    ///
    /// Each element of `items` is a pair of the user value and the text
    /// columns that are pushed into the matcher cache for it.
    pub fn append(&mut self, items: impl Iterator<Item = (T, Box<[Utf32String]>)>) {
        {
            let mut cache = self.cache.lock();
            let mut items_ = self.items.lock();
            items_.extend(items.map(|(item, text)| {
                cache.push(text);
                item
            }));
            // Both guards are released here, before the callback runs: the
            // mutexes are not reentrant, so a callback that touches the items
            // must not run while they are held.
        }
        // notify that a new tick will be necessary
        (self.notify)();
    }

    /// Locks the item list and returns a guard that dereferences to a slice
    /// of all items. The list stays locked for as long as the guard lives.
    pub fn get(&self) -> impl Deref<Target = [T]> + '_ {
        MutexGuard::map(self.items.lock(), |items| items.as_mut_slice())
    }

    /// Locks the matcher cache and returns a guard that dereferences to the
    /// cached matcher items. The cache stays locked for as long as the guard
    /// lives.
    pub fn get_matcher_items(&self) -> impl Deref<Target = [Item]> + '_ {
        MutexGuard::map(self.cache.lock(), |items| items.get())
    }
}
/// Front end that owns the items, the current pattern and the background
/// worker that computes the matches.
pub struct Nucleo<T: Sync + Send> {
// the way the API is built we don't actually need these to be Arcs
// but this lets us avoid some unsafe
worker: Arc<Mutex<Worker>>,
// flag shared with the worker; set to ask a running worker to stop early
canceled: Arc<AtomicBool>,
// thread pool the worker runs are spawned on
pool: ThreadPool,
/// Handle to the item storage.
pub items: Items<T>,
/// Matches copied from the worker by the most recent successful tick.
pub matches: Vec<Match>,
/// The pattern the items are matched against.
pub pattern: MultiPattern,
// flag shared with the worker; the worker only calls the notify callback while this is set
should_notify: Arc<AtomicBool>,
}
impl<T: Sync + Send> Nucleo<T> {
/// Creates a new instance that is pre-populated with `items`.
///
/// * `config` - matcher configuration handed to the worker and used to build the pattern
/// * `notify` - callback shared by the worker and the item handle
/// * `num_threads` - size of the worker thread pool (`None` lets `Worker::new` choose)
/// * `case_matching` - case handling used by the pattern
/// * `cols` - number of pattern columns
/// * `items` - initial items, each paired with its matcher text columns
///
/// The initial match list contains every item with a score of `0`.
pub fn new(
config: MatcherConfig,
notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>,
case_matching: CaseMatching,
cols: usize,
items: impl Iterator<Item = (T, Box<[Utf32String]>)>,
) -> Self {
let mut cache = ItemCache::new();
// split the input: the text goes into the matcher cache, the value into `items`
let items: Vec<_> = items
.map(|(item, text)| {
cache.push(text);
item
})
.collect();
// NOTE(review): `i as u32` silently truncates for more than u32::MAX items
let matches: Vec<_> = (0..items.len())
.map(|i| Match {
score: 0,
idx: i as u32,
})
.collect();
let (pool, worker) =
Worker::new(notify.clone(), num_threads, config, matches.clone(), &cache);
Self {
canceled: worker.canceled.clone(),
should_notify: worker.should_notify.clone(),
items: Items {
cache: Arc::new(Mutex::new(cache)),
items: Arc::new(Mutex::new(items)),
notify,
},
pool,
matches,
pattern: MultiPattern::new(&config, case_matching, cols),
worker: Arc::new(Mutex::new(worker)),
}
}
/// Replaces the matcher configuration used by the worker.
///
/// This takes the worker lock without a timeout, so it blocks while a
/// worker run holds that lock.
pub fn update_config(&mut self, config: MatcherConfig) {
self.worker.lock().update_config(config)
}
/// Synchronizes with the worker: copies finished matches into
/// `self.matches` and starts a new worker run if the pattern or the items
/// changed.
///
/// `timeout` is the maximum time in milliseconds to wait for the worker
/// lock when no cancellation is required.
pub fn tick(&mut self, timeout: u64) -> Status {
// suppress worker notifications while the tick itself is in progress
self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status();
let items = self.items.cache.lock_arc();
// a changed pattern or cleared items invalidate whatever the worker is doing
let canceled = status != query::Status::Unchanged || items.cleared();
let res = self.tick_inner(timeout, canceled, items, status);
if !canceled {
self.should_notify.store(true, atomic::Ordering::Relaxed);
return res;
}
// The first pass only canceled the old run and started a new one; run a
// second, regular pass to try to pick up its results within the timeout.
// NOTE(review): the `Status` of the first pass is discarded here.
let items = self.items.cache.lock_arc();
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
self.should_notify.store(true, atomic::Ordering::Relaxed);
res
}
/// Performs one synchronization pass with the worker.
///
/// When `canceled` is set the running worker is asked to stop and the
/// worker lock is awaited without a timeout; otherwise the lock is only
/// awaited for `timeout` milliseconds.
fn tick_inner(
&mut self,
timeout: u64,
canceled: bool,
items: ArcMutexGuard<RawMutex, ItemCache>,
status: query::Status,
) -> Status {
let mut inner = if canceled {
self.pattern.reset_status();
self.canceled.store(true, atomic::Ordering::Relaxed);
self.worker.lock_arc()
} else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
// the worker is still busy, report that and try again on the next tick
return Status{ changed: false, running: true };
};
worker
};
// `running` is still set when the worker finished a run whose results
// have not been copied yet
let changed = inner.running;
if inner.running {
inner.running = false;
self.matches.clone_from(&inner.matches);
}
let running = canceled || inner.items.outdated(&items);
if running {
inner.pattern.clone_from(&self.pattern);
// clear the cancellation request before the new run starts
self.canceled.store(false, atomic::Ordering::Relaxed);
// both guards move into the task, so worker and item cache stay locked by it
// NOTE(review): `Worker::run` is unsafe; presumably it relies on the items
// outliving the worker (see the `Drop` impl) - confirm against `Worker::run`
self.pool.spawn(move || unsafe { inner.run(items, status) })
}
Status { changed, running }
}
}
impl<T: Sync + Send> Drop for Nucleo<T> {
    /// Asks a running worker to stop and waits (for at most one second) until
    /// the worker lock can be taken.
    fn drop(&mut self) {
        // The worker must have quit before the items are dropped so that it
        // can always assume the items outlive it.
        self.canceled.store(true, atomic::Ordering::Relaxed);
        let worker_guard = self.worker.try_lock_for(Duration::from_secs(1));
        match worker_guard {
            Some(_) => (),
            None => unreachable!("thread pool failed to shutdown properly"),
        }
    }
}
/// convenicne function to easily fuzzy match
/// on a (relatievly small list of inputs). This is not recommended for building a full tui
/// application that can match large numbers of matches as all matching is done on the current
/// thread, effectively blocking the UI
pub fn fuzzy_match<T: AsRef<str>>(
matcher: &mut Matcher,
pattern: &str,
items: impl IntoIterator<Item = T>,
case_matching: CaseMatching,
) -> Vec<(T, u32)> {
let mut pattern_ = Pattern::new(&matcher.config, case_matching);
pattern_.set_literal(pattern, PatternKind::Fuzzy, false);
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
pattern_
.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(item, score)| (Reverse(*score), item.as_ref().len()));
items
}

View File

@ -54,7 +54,9 @@ impl PatternAtom {
match case {
CaseMatching::Ignore => needle.make_ascii_lowercase(),
CaseMatching::Smart => ignore_case = needle.bytes().any(|b| b.is_ascii_uppercase()),
CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
}
CaseMatching::Respect => (),
}
@ -80,7 +82,7 @@ impl PatternAtom {
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case || c.is_uppercase();
ignore_case = ignore_case && !c.is_uppercase();
}
CaseMatching::Respect => (),
}
@ -149,22 +151,18 @@ pub enum Status {
}
#[derive(Debug, Clone)]
pub struct Query {
pub struct MultiPattern {
pub cols: Vec<Pattern>,
}
impl Query {
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching, cols: usize) -> Query {
Query {
cols: vec![
Pattern {
terms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
};
cols
],
impl MultiPattern {
pub fn new(
matcher_config: &MatcherConfig,
case_matching: CaseMatching,
cols: usize,
) -> MultiPattern {
MultiPattern {
cols: vec![Pattern::new(matcher_config, case_matching); cols],
}
}
@ -201,7 +199,30 @@ pub struct Pattern {
}
impl Pattern {
pub(crate) fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
Pattern {
terms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
}
}
pub fn new_fuzzy_literal(
matcher_config: &MatcherConfig,
case_matching: CaseMatching,
pattern: &str,
) -> Pattern {
let mut res = Pattern {
terms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
};
res.set_literal(pattern, PatternKind::Fuzzy, false);
res
}
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.terms.is_empty() {
return Some(0);
}
@ -215,7 +236,7 @@ impl Pattern {
matcher.substring_match(haystack, pattern.needle.slice(..))
}
PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
PatternKind::Postfix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)),
};
if pattern.invert {
if pattern_score.is_some() {
@ -249,7 +270,7 @@ impl Pattern {
}
PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
PatternKind::Postfix => {
matcher.prefix_match(haystack, pattern.needle.slice(..))
matcher.postfix_match(haystack, pattern.needle.slice(..))
}
};
if pattern_score.is_some() {
@ -262,16 +283,16 @@ impl Pattern {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Fuzzy => {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Substring => {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
matcher.substring_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Prefix => {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
matcher.prefix_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Postfix => {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
matcher.postfix_indices(haystack, pattern.needle.slice(..), indices)
}
};
score += pattern_score? as u32
@ -282,10 +303,15 @@ impl Pattern {
pub fn parse_from(&mut self, pattern: &str, append: bool) {
self.terms.clear();
let invert = self.terms.last().map_or(false, |pat| pat.invert);
for atom in pattern_atoms(pattern) {
self.terms
.push(PatternAtom::parse(atom, self.normalize, self.case_matching));
}
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = PatternAtom::parse(atom, self.normalize, self.case_matching);
if atom.needle.is_empty() {
return None;
}
Some(atom)
});
self.terms.extend(atoms);
self.status = if append && !invert && self.status != Status::Rescore {
Status::Update
} else {
@ -304,6 +330,10 @@ impl Pattern {
Status::Rescore
};
}
pub fn is_empty(&self) -> bool {
self.terms.is_empty()
}
}
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {

View File

@ -1,5 +1,7 @@
use core::slice;
use std::borrow::Cow;
use std::fmt;
use std::mem::take;
use std::ops::{Bound, RangeBounds};
use nucleo_matcher::Utf32Str;
@ -12,6 +14,12 @@ pub enum Utf32String {
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(Box<[char]>),
}
impl Default for Utf32String {
fn default() -> Self {
Self::Ascii(String::new().into_boxed_str())
}
}
impl Utf32String {
#[inline]
pub fn len(&self) -> usize {
@ -48,31 +56,69 @@ impl Utf32String {
}
}
#[inline]
pub fn is_ascii(&self) -> bool {
matches!(self, Utf32String::Ascii(_))
}
#[inline]
pub fn get(&self, idx: u32) -> char {
match self {
Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char,
Utf32String::Unicode(codepoints) => codepoints[idx as usize],
}
}
#[inline]
pub fn last(&self) -> char {
match self {
Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char,
Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1],
}
}
#[inline]
pub fn chars(&self) -> Chars<'_> {
match self {
Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()),
Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
}
}
#[inline]
pub fn push_str(&mut self, text: &str) {
let mut codeboints = match take(self) {
Utf32String::Ascii(bytes) if text.is_ascii() => {
let mut bytes = bytes.into_string();
bytes.push_str(text);
*self = Self::Ascii(bytes.into_boxed_str());
return;
}
Utf32String::Ascii(bytes) => bytes.chars().collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
};
codeboints.extend(text.chars());
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
}
#[inline]
pub fn push(&mut self, c: char) {
let mut codeboints = match take(self) {
Utf32String::Ascii(bytes) if c.is_ascii() => {
let mut bytes = bytes.into_string();
bytes.push(c);
*self = Self::Ascii(bytes.into_boxed_str());
return;
}
Utf32String::Ascii(bytes) => bytes.chars().collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
};
codeboints.push(c);
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
}
}
impl From<&str> for Utf32String {
#[inline]
fn from(value: &str) -> Self {
if value.is_ascii() {
Self::Ascii(value.to_owned().into_boxed_str())
@ -91,12 +137,24 @@ impl From<Box<str>> for Utf32String {
}
}
}
impl From<String> for Utf32String {
#[inline]
fn from(value: String) -> Self {
value.into_boxed_str().into()
}
}
impl<'a> From<Cow<'a, str>> for Utf32String {
#[inline]
fn from(value: Cow<'a, str>) -> Self {
match value {
Cow::Borrowed(value) => value.into(),
Cow::Owned(value) => value.into(),
}
}
}
pub enum Chars<'a> {
Ascii(slice::Iter<'a, u8>),
Unicode(slice::Iter<'a, char>),
@ -104,6 +162,7 @@ pub enum Chars<'a> {
impl<'a> Iterator for Chars<'a> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
match self {
Chars::Ascii(iter) => iter.next().map(|&c| c as char),

View File

@ -8,7 +8,7 @@ use parking_lot::RawMutex;
use rayon::{prelude::*, ThreadPool};
use crate::items::{ItemCache, ItemsSnapshot};
use crate::query::{self, Query};
use crate::query::{self, MultiPattern};
use crate::Match;
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
@ -30,15 +30,24 @@ pub(crate) struct Worker {
pub(crate) items: ItemsSnapshot,
matchers: Matchers,
pub(crate) matches: Vec<Match>,
pub(crate) query: Query,
pub(crate) pattern: MultiPattern,
pub(crate) canceled: Arc<AtomicBool>,
pub(crate) should_notify: Arc<AtomicBool>,
}
impl Worker {
pub(crate) fn update_config(&mut self, config: MatcherConfig) {
for matcher in self.matchers.0.iter_mut() {
matcher.get_mut().config = config;
}
}
pub(crate) fn new(
notify: Arc<(dyn Fn() + Sync + Send)>,
worker_threads: Option<usize>,
config: MatcherConfig,
matches: Vec<Match>,
items: &ItemCache,
) -> (ThreadPool, Worker) {
let worker_threads = worker_threads
.unwrap_or_else(|| std::thread::available_parallelism().map_or(4, |it| it.get()));
@ -53,15 +62,17 @@ impl Worker {
let worker = Worker {
notify,
running: false,
items: ItemsSnapshot::new(),
items: ItemsSnapshot::new(items),
matchers: Matchers(matchers),
matches: Vec::with_capacity(1024),
matches,
// just a placeholder
query: Query::new(&config, crate::CaseMatching::Ignore, 0),
pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0),
canceled: Arc::new(AtomicBool::new(false)),
should_notify: Arc::new(AtomicBool::new(false)),
};
(pool, worker)
}
pub(crate) unsafe fn run(
&mut self,
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
@ -77,48 +88,56 @@ impl Worker {
self.matches.clear();
last_scored_item = 0;
}
let matchers = &self.matchers;
let query = &self.query;
let pattern = &self.pattern;
let items = unsafe { self.items.get() };
if self.pattern.cols.iter().all(|pat| pat.is_empty()) {
self.matches.clear();
self.matches.extend((0..items.len()).map(|i| Match {
score: 0,
idx: i as u32,
}));
if self.should_notify.load(atomic::Ordering::Relaxed) {
(self.notify)();
}
return;
}
if query_status != query::Status::Unchanged && !self.matches.is_empty() {
self.matches
.par_iter_mut()
.take_any_while(|_| self.canceled.load(atomic::Ordering::Relaxed))
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
.for_each(|match_| {
let item = &items[match_.idx as usize];
match_.score = query
match_.score = pattern
.score(item.cols(), unsafe { matchers.get() })
.unwrap_or(u32::MAX);
});
// TODO: do this in parallel?
self.matches.retain(|m| m.score != u32::MAX)
self.matches.retain(|m| m.score != u32::MAX);
}
if last_scored_item != self.items.len() {
self.running = true;
let items = items[last_scored_item..]
.par_iter()
.enumerate()
.filter_map(|(i, item)| {
let score = if self.canceled.load(atomic::Ordering::Relaxed) {
0
u32::MAX - 1
} else {
query.score(item.cols(), unsafe { matchers.get() })?
pattern.score(item.cols(), unsafe { matchers.get() })?
};
Some(Match {
score,
idx: i as u32,
})
});
self.matches.par_extend(items)
self.matches.par_extend(items);
}
if !self.canceled.load(atomic::Ordering::Relaxed) {
// TODO: cancel sort in progress?
self.matches.par_sort_unstable_by(|match1, match2| {
match2.idx.cmp(&match1.idx).then_with(|| {
match2.score.cmp(&match1.score).then_with(|| {
// the tie breaker is comparatively rarely needed so we keep it
// in a branch, especially because we need to access the items
// array here which involves some pointer chasing
@ -129,6 +148,8 @@ impl Worker {
});
}
(self.notify)();
if self.should_notify.load(atomic::Ordering::Relaxed) {
(self.notify)();
}
}
}

View File

@ -1,131 +0,0 @@
use std::ops::Deref;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use crate::items::{Item, ItemCache};
use crate::worker::Worker;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, Pattern, PatternKind, Query};
pub use crate::utf32_string::Utf32String;
mod items;
mod query;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use parking_lot::{Mutex, MutexGuard};
/// A single scored entry in the list of matches.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
/// Score assigned to the item.
pub score: u32,
/// Position of the matched item in the item list.
pub idx: u32,
}
/// Cloneable handle to the shared item storage.
///
/// All clones refer to the same underlying data because every field is an `Arc`.
#[derive(Clone)]
pub struct Items<T> {
// matcher text (one boxed slice of columns per item) pushed alongside each item
cache: Arc<Mutex<ItemCache>>,
// the user supplied values, stored in the same order as their text in `cache`
items: Arc<Mutex<Vec<T>>>,
// callback invoked after new items were appended
notify: Arc<(dyn Fn() + Sync + Send)>,
}
impl<T: Sync + Send> Items<T> {
    /// Removes every item together with its cached matcher text.
    ///
    /// Both locks are acquired in the same order as in [`Items::append`]
    /// (cache first, then items) and held until both collections are cleared.
    /// This stops a concurrent `append` on a cloned handle from slipping in
    /// between the two clears and leaving the collections with different
    /// lengths.
    pub fn clear(&mut self) {
        let mut cache = self.cache.lock();
        let mut items = self.items.lock();
        items.clear();
        cache.clear();
    }

    /// Appends new items and their matcher text, then invokes the notify
    /// callback to signal that a new tick is necessary.
    ///
    /// Each element of `items` is a pair of the user value and the text
    /// columns that are pushed into the matcher cache for it.
    pub fn append(&mut self, items: impl Iterator<Item = (T, Box<[Utf32String]>)>) {
        {
            let mut cache = self.cache.lock();
            let mut items_ = self.items.lock();
            items_.extend(items.map(|(item, text)| {
                cache.push(text);
                item
            }));
            // Both guards are released here, before the callback runs: the
            // mutexes are not reentrant, so a callback that touches the items
            // must not run while they are held.
        }
        // notify that a new tick will be necessary
        (self.notify)();
    }

    /// Locks the item list and returns a guard that dereferences to a slice
    /// of all items. The list stays locked for as long as the guard lives.
    pub fn get(&self) -> impl Deref<Target = [T]> + '_ {
        MutexGuard::map(self.items.lock(), |items| items.as_mut_slice())
    }

    /// Locks the matcher cache and returns a guard that dereferences to the
    /// cached matcher items. The cache stays locked for as long as the guard
    /// lives.
    pub fn get_matcher_items(&self) -> impl Deref<Target = [Item]> + '_ {
        MutexGuard::map(self.cache.lock(), |items| items.get())
    }
}
/// Front end that owns the items, the current query and the background
/// worker that computes the matches.
pub struct Nucleo<T: Sync + Send> {
// the way the API is built we don't actually need these to be Arcs
// but this lets us avoid some unsafe
worker: Arc<Mutex<Worker>>,
// flag shared with the worker; set to ask a running worker to stop early
canceled: Arc<AtomicBool>,
// thread pool the worker runs are spawned on
pool: ThreadPool,
/// Handle to the item storage.
pub items: Items<T>,
/// Matches copied from the worker by the most recent successful tick.
pub matches: Vec<Match>,
/// The query the items are matched against.
pub query: Query,
}
impl<T: Sync + Send> Nucleo<T> {
/// Creates a new, empty instance.
///
/// * `config` - matcher configuration handed to the worker and used to build the query
/// * `notify` - callback shared by the worker and the item handle
/// * `num_threads` - size of the worker thread pool (`None` lets `Worker::new` choose)
/// * `case_matching` - case handling used by the query
/// * `cols` - number of query columns
pub fn new(
config: MatcherConfig,
notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>,
case_matching: CaseMatching,
cols: usize,
) -> Self {
let (pool, worker) = Worker::new(notify.clone(), num_threads, config);
Self {
canceled: worker.canceled.clone(),
items: Items {
cache: Arc::new(Mutex::new(ItemCache::new())),
items: Arc::new(Mutex::new(Vec::with_capacity(1024))),
notify,
},
pool,
matches: Vec::with_capacity(1024),
query: Query::new(&config, case_matching, cols),
worker: Arc::new(Mutex::new(worker)),
}
}
/// Synchronizes with the worker: copies finished matches into
/// `self.matches` and starts a new worker run if the query or the items
/// changed.
///
/// `timeout` is the maximum time in milliseconds to wait for the worker
/// lock when no cancellation is required. Returns `false` only when the
/// worker was idle and nothing had to be canceled; returns `true` in every
/// other case, including when the worker lock could not be taken in time.
pub fn tick(&mut self, timeout: u64) -> bool {
let status = self.query.status();
let items = self.items.cache.lock_arc();
// a changed query or cleared items invalidate whatever the worker is doing
let canceled = status != query::Status::Unchanged || items.cleared();
let mut inner = if canceled {
self.query.reset_status();
self.canceled.store(true, atomic::Ordering::Relaxed);
self.worker.lock_arc()
} else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
// the worker is still busy
return true;
};
worker
};
if inner.running {
inner.running = false;
self.matches.clone_from(&inner.matches);
} else if !canceled {
// nothing has changed
return false;
}
if canceled || inner.items.outdated(&items) {
// both guards move into the task, so worker and item cache stay locked by it
// NOTE(review): `Worker::run` is unsafe; presumably it relies on the items
// outliving the worker (see the `Drop` impl) - confirm against `Worker::run`
self.pool.spawn(move || unsafe { inner.run(items, status) })
}
true
}
}
impl<T: Sync + Send> Drop for Nucleo<T> {
    /// Asks a running worker to stop and blocks until the worker lock can be
    /// taken.
    fn drop(&mut self) {
        // The worker must have quit before the items are dropped so that it
        // can always assume the items outlive it.
        self.canceled.store(true, atomic::Ordering::Relaxed);
        let worker_guard = self.worker.lock();
        drop(worker_guard);
    }
}