progress on high level API

This commit is contained in:
Pascal Kuthe 2023-07-29 21:43:02 +02:00
parent 39982dc81a
commit 6b8ee0f585
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
6 changed files with 448 additions and 127 deletions

View File

@ -138,15 +138,9 @@ pub use normalize::normalize;
/// Converts `c` to lower case.
///
/// ASCII characters take a fast arithmetic path; all other characters are
/// looked up in the `CASE_FOLDING_SIMPLE` table (simple Unicode case
/// folding). Characters without a folding entry are returned unchanged.
#[inline(always)]
pub fn to_lower_case(c: char) -> char {
    if c.is_ascii() {
        c.to_ascii_lowercase()
    } else {
        CASE_FOLDING_SIMPLE
            .binary_search_by_key(&c, |(upper, _)| *upper)
            .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
    }
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]

View File

@ -1,10 +1,11 @@
[package]
name = "nucleo"
description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo"
[dependencies]
nucleo-matcher = { version = "0.1", path = "../matcher" }

View File

@ -8,13 +8,22 @@ pub(crate) struct ItemCache {
evicted: Vec<Item>,
}
impl ItemCache {
pub(crate) fn new() -> Self {
Self {
live: Vec::with_capacity(1024),
evicted: Vec::new(),
}
}
/// Moves all live items into the evicted list.
///
/// On the first clear `evicted` is still empty: reserving 1024 slots and
/// then swapping hands the freshly allocated buffer to `live`, while the
/// old live vector (with its contents) becomes `evicted` without copying.
/// Later clears simply append, since `evicted` already holds items.
pub(crate) fn clear(&mut self) {
    if self.evicted.is_empty() {
        self.evicted.reserve(1024);
        swap(&mut self.evicted, &mut self.live)
    } else {
        self.evicted.append(&mut self.live)
    }
}
/// Returns `true` if the cache has been cleared since the evicted items
/// were last drained (i.e. there are evicted items pending).
pub(crate) fn cleared(&self) -> bool {
    !self.evicted.is_empty()
}
@ -24,19 +33,31 @@ impl ItemCache {
cols: Box::leak(item).into(),
})
}
pub(crate) fn get(&mut self) -> &mut [Item] {
&mut self.live
}
}
#[derive(PartialEq, Eq, Debug, Clone)]
pub(crate) struct Item {
#[derive(PartialEq, Eq, Clone)]
pub struct Item {
// TODO: small vec optimization??
cols: NonNull<[Utf32String]>,
}
impl std::fmt::Debug for Item {
    /// Debug-prints the column texts instead of the raw `NonNull` pointer.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // fix: the struct was renamed to `Item`; the debug name still said
        // "ItemText" which was misleading in debug output
        f.debug_struct("Item")
            .field("cols", &self.cols())
            .finish()
    }
}
unsafe impl Send for Item {}
unsafe impl Sync for Item {}
impl Item {
fn cols(&self) -> &[Utf32String] {
pub fn cols(&self) -> &[Utf32String] {
// safety: cols is basically a box and treated the same as a box,
// however there can be other references so using a box (unique ptr)
// would be an alias violation
@ -68,6 +89,12 @@ pub(crate) struct ItemsSnapshot {
}
impl ItemsSnapshot {
/// Creates an empty snapshot with the same initial capacity as `ItemCache`.
pub(crate) fn new() -> Self {
    let items = Vec::with_capacity(1024);
    Self { items }
}
/// Whether the snapshot is stale compared to the live cache.
///
/// Only compares lengths — NOTE(review): this presumably relies on the live
/// list being append-only between clears; confirm against `ItemCache` usage.
pub(crate) fn outdated(&self, items: &ItemCache) -> bool {
    items.live.len() != self.items.len()
}

View File

@ -1,123 +1,30 @@
use std::cell::UnsafeCell;
use std::ops::Deref;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use crate::items::{ItemCache, ItemsSnapshot};
use crate::query::Query;
use crate::items::{Item, ItemCache};
use crate::worker::Worker;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, Pattern, PatternKind, Query};
pub use crate::utf32_string::Utf32String;
use parking_lot::lock_api::ArcMutexGuard;
use rayon::prelude::*;
mod items;
mod query;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use parking_lot::{Mutex, MutexGuard, RawMutex};
use parking_lot::{Mutex, MutexGuard};
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
score: u32,
idx: u32,
}
/// One matcher per worker thread, accessed without locking.
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);

impl Matchers {
    // this is not a true mut from ref, we use a cell here
    #[allow(clippy::mut_from_ref)]
    unsafe fn get(&self) -> &mut nucleo_matcher::Matcher {
        // SAFETY (caller contract): must be called from a rayon pool thread,
        // and each pool thread only touches the cell at its own index, so the
        // returned mutable reference is never aliased. `current_thread_index`
        // is `Some` only inside a pool, hence the `unwrap`.
        &mut *self.0[rayon::current_thread_index().unwrap()].get()
    }
}

// SAFETY: each pool thread only ever accesses its own slot (see `get`)
unsafe impl Sync for Matchers {}
unsafe impl Send for Matchers {}
struct Worker {
notify: Arc<(dyn Fn() + Sync + Send)>,
running: bool,
items: ItemsSnapshot,
matchers: Matchers,
matches: Vec<Match>,
query: Query,
canceled: Arc<AtomicBool>,
}
impl Worker {
unsafe fn run(
&mut self,
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
query_status: query::Status,
canceled: Arc<AtomicBool>,
) {
self.running = true;
let mut last_scored_item = self.items.len();
let cleared = self.items.update(&items_lock);
drop(items_lock);
// TODO: be smarter around reusing past results for rescoring
if cleared || query_status == query::Status::Rescore {
self.matches.clear();
last_scored_item = 0;
}
let matchers = &self.matchers;
let query = &self.query;
let items = unsafe { self.items.get() };
if query_status != query::Status::Unchanged && !self.matches.is_empty() {
self.matches
.par_iter_mut()
.take_any_while(|_| canceled.load(atomic::Ordering::Relaxed))
.for_each(|match_| {
let item = &items[match_.idx as usize];
match_.score = query
.score(item.cols(), unsafe { matchers.get() })
.unwrap_or(u32::MAX);
});
// TODO: do this in parallel?
self.matches.retain(|m| m.score != u32::MAX)
}
if last_scored_item != self.items.len() {
self.running = true;
let items = items[last_scored_item..]
.par_iter()
.enumerate()
.filter_map(|(i, item)| {
let score = if canceled.load(atomic::Ordering::Relaxed) {
0
} else {
query.score(item.cols(), unsafe { matchers.get() })?
};
Some(Match {
score,
idx: i as u32,
})
});
self.matches.par_extend(items)
}
if !self.canceled.load(atomic::Ordering::Relaxed) {
// TODO: cancel sort in progess?
self.matches.par_sort_unstable_by(|match1, match2| {
match2.idx.cmp(&match1.idx).then_with(|| {
// the tie breaker is comparitevly rarely needed so we keep it
// in a branch especially beacuse we need to acceess the items
// array here which invovles some pointer chasing
let item1 = &items[match1.idx as usize];
let item2 = &items[match2.idx as usize];
(item1.len, match1.idx).cmp(&(item2.len, match2.idx))
})
});
}
(self.notify)();
}
pub score: u32,
pub idx: u32,
}
#[derive(Clone)]
pub struct Items<T> {
cache: Arc<Mutex<ItemCache>>,
items: Arc<Mutex<Vec<T>>>,
@ -145,7 +52,9 @@ impl<T: Sync + Send> Items<T> {
MutexGuard::map(self.items.lock(), |items| items.as_mut_slice())
}
// TODO(review): unimplemented stub — takes no arguments and does nothing yet
pub fn push() {}
/// Locks the item cache and returns the matcher-side view of all live
/// items. The returned guard holds the cache lock until dropped.
pub fn get_matcher_items(&self) -> impl Deref<Target = [Item]> + '_ {
    MutexGuard::map(self.cache.lock(), |items| items.get())
}
}
pub struct Nucleo<T: Sync + Send> {
@ -153,18 +62,41 @@ pub struct Nucleo<T: Sync + Send> {
// but this lets us avoid some unsafe
worker: Arc<Mutex<Worker>>,
canceled: Arc<AtomicBool>,
items: Items<T>,
thread_pool: rayon::ThreadPool,
pool: ThreadPool,
pub items: Items<T>,
pub matches: Vec<Match>,
pub query: Query,
}
impl<T: Sync + Send> Nucleo<T> {
/// Creates a new matcher frontend together with its background worker pool.
///
/// * `notify` is invoked whenever new results become available.
/// * `num_threads` overrides the worker thread count (see `Worker::new`).
/// * `case_matching` and `cols` configure the initial (empty) query.
pub fn new(
    config: MatcherConfig,
    notify: Arc<(dyn Fn() + Sync + Send)>,
    num_threads: Option<usize>,
    case_matching: CaseMatching,
    cols: usize,
) -> Self {
    let (pool, worker) = Worker::new(notify.clone(), num_threads, config);
    let canceled = worker.canceled.clone();
    let items = Items {
        cache: Arc::new(Mutex::new(ItemCache::new())),
        items: Arc::new(Mutex::new(Vec::with_capacity(1024))),
        notify,
    };
    Self {
        canceled,
        items,
        pool,
        matches: Vec::with_capacity(1024),
        query: Query::new(&config, case_matching, cols),
        worker: Arc::new(Mutex::new(worker)),
    }
}
pub fn tick(&mut self, timeout: u64) -> bool {
let status = self.query.status();
let items = self.items.cache.lock_arc();
let canceled = status != query::Status::Unchanged || items.cleared();
let mut inner = if canceled {
self.query.reset_status();
self.canceled.store(true, atomic::Ordering::Relaxed);
self.worker.lock_arc()
} else {
@ -183,9 +115,7 @@ impl<T: Sync + Send> Nucleo<T> {
}
if canceled || inner.items.outdated(&items) {
let canceled = self.canceled.clone();
self.thread_pool
.spawn(move || unsafe { inner.run(items, status, canceled) })
self.pool.spawn(move || unsafe { inner.run(items, status) })
}
true
}

View File

@ -1,9 +1,18 @@
use nucleo_matcher::{Matcher, Utf32Str};
use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
use crate::Utf32String;
/// How letter case in the needle is matched against the haystack.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum CaseMatching {
    /// Case is always ignored.
    Ignore,
    /// Case sensitivity depends on the needle (see `PatternAtom::literal`).
    Smart,
    /// Case is matched exactly as typed.
    Respect,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum PatternKind {
#[non_exhaustive]
pub enum PatternKind {
Exact,
Fuzzy,
Substring,
@ -16,6 +25,120 @@ struct PatternAtom {
kind: PatternKind,
needle: Utf32String,
invert: bool,
ignore_case: bool,
}
impl PatternAtom {
    /// Builds an atom that matches `needle` literally (no meta characters).
    ///
    /// * `normalize` applies unicode normalization to each character.
    /// * `case` controls case folding; for `Smart`, case is respected only
    ///   when the needle contains an upper case character.
    /// * `kind` selects the match primitive (fuzzy/exact/prefix/...).
    /// * `escape_whitespace` collapses the sequence `\ ` to a literal space.
    fn literal(
        needle: &str,
        normalize: bool,
        case: CaseMatching,
        kind: PatternKind,
        escape_whitespace: bool,
    ) -> PatternAtom {
        let mut ignore_case;
        let needle = if needle.is_ascii() {
            let mut needle = if escape_whitespace {
                if let Some((start, rem)) = needle.split_once("\\ ") {
                    let mut needle = start.to_owned();
                    for rem in rem.split("\\ ") {
                        needle.push(' ');
                        needle.push_str(rem);
                    }
                    needle
                } else {
                    needle.to_owned()
                }
            } else {
                needle.to_owned()
            };
            ignore_case = match case {
                CaseMatching::Ignore => {
                    needle.make_ascii_lowercase();
                    true
                }
                // bug fix: smart case must *respect* case when the needle
                // contains upper case characters (the check was inverted)
                CaseMatching::Smart => !needle.bytes().any(|b| b.is_ascii_uppercase()),
                CaseMatching::Respect => false,
            };
            Utf32String::Ascii(needle.into_boxed_str())
        } else {
            // start permissive for Ignore/Smart; Smart flips below on upper case
            ignore_case = case != CaseMatching::Respect;
            let mut needle_ = Vec::with_capacity(needle.len());
            let mut saw_backslash = false;
            // bug fix: this loop previously only ran when `escape_whitespace`
            // was set, leaving the needle empty otherwise
            for mut c in needle.chars() {
                if escape_whitespace {
                    if saw_backslash {
                        saw_backslash = false;
                        if c == ' ' {
                            // `\ ` collapses to a literal space
                            needle_.push(' ');
                            continue;
                        }
                        // the backslash escaped nothing: emit it literally
                        // and fall through to process the current char
                        needle_.push('\\');
                    }
                    if c == '\\' {
                        // defer: the next char decides whether this backslash
                        // escapes a space or stays literal (it was previously
                        // pushed eagerly AND again on the next iteration)
                        saw_backslash = true;
                        continue;
                    }
                }
                if normalize {
                    c = chars::normalize(c);
                }
                match case {
                    CaseMatching::Ignore => c = chars::to_lower_case(c),
                    // bug fix (see ASCII branch): upper case disables folding
                    CaseMatching::Smart => ignore_case = ignore_case && !c.is_uppercase(),
                    CaseMatching::Respect => (),
                }
                needle_.push(c);
            }
            if saw_backslash {
                // a trailing backslash escapes nothing, keep it literally
                needle_.push('\\');
            }
            Utf32String::Unicode(needle_.into_boxed_slice())
        };
        PatternAtom {
            kind,
            needle,
            invert: false,
            ignore_case,
        }
    }

    /// Parses a single query atom with fzf-style syntax: `!` inverts,
    /// leading `^` anchors to the start, leading `'` forces substring,
    /// trailing `$` anchors to the end; `\` escapes these characters.
    fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom {
        let mut atom = raw;
        let invert = atom.starts_with('!');
        if invert {
            atom = &atom[1..];
        }
        let mut kind = match atom.as_bytes() {
            [b'^', ..] => {
                atom = &atom[1..];
                PatternKind::Prefix
            }
            [b'\'', ..] => {
                atom = &atom[1..];
                PatternKind::Substring
            }
            [b'\\', b'^' | b'\'', ..] => {
                // escaped leading `^` or `'`: drop the backslash, keep the char
                atom = &atom[1..];
                PatternKind::Fuzzy
            }
            _ => PatternKind::Fuzzy,
        };
        match atom.as_bytes() {
            // NOTE(review): the backslash of an escaped `\$` currently stays
            // in the needle; confirm whether it should be stripped here
            [.., b'\\', b'$'] => (),
            [.., b'$'] => {
                kind = if kind == PatternKind::Fuzzy {
                    PatternKind::Postfix
                } else {
                    PatternKind::Exact
                };
                atom = &atom[..atom.len() - 1]
            }
            _ => (),
        }
        // an inverted fuzzy match is not useful, treat `!foo` as substring
        if invert && kind == PatternKind::Fuzzy {
            kind = PatternKind::Substring
        }
        let mut atom = PatternAtom::literal(atom, normalize, case, kind, true);
        // bug fix: the parsed `!` was previously discarded (`literal` always
        // sets `invert: false`), so inverted atoms matched normally
        atom.invert = invert;
        atom
    }
}
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
@ -31,6 +154,20 @@ pub struct Query {
}
impl Query {
/// Creates a query consisting of `cols` empty patterns that all share the
/// same case-matching and normalization settings.
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching, cols: usize) -> Query {
    let pattern = Pattern {
        terms: Vec::new(),
        case_matching,
        normalize: matcher_config.normalize,
        status: Status::Unchanged,
    };
    Query {
        cols: vec![pattern; cols],
    }
}
pub(crate) fn status(&self) -> Status {
self.cols
.iter()
@ -39,7 +176,13 @@ impl Query {
.unwrap_or(Status::Unchanged)
}
pub(crate) fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option<u32> {
/// Marks every column pattern as unchanged (called once a worker run has
/// consumed the current status).
pub(crate) fn reset_status(&mut self) {
    self.cols
        .iter_mut()
        .for_each(|col| col.status = Status::Unchanged);
}
pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option<u32> {
// TODO: weight columns?
let mut score = 0;
for (pattern, haystack) in self.cols.iter().zip(haystack) {
@ -52,6 +195,8 @@ impl Query {
/// The parsed query pattern for a single column.
#[derive(Clone, Debug)]
pub struct Pattern {
    // the parsed atoms of this column's query string
    terms: Vec<PatternAtom>,
    // settings applied whenever the query string is (re)parsed
    case_matching: CaseMatching,
    normalize: bool,
    // tracks whether existing matches need updating or a full rescore
    status: Status,
}
@ -62,6 +207,7 @@ impl Pattern {
}
let mut score = 0;
for pattern in &self.terms {
matcher.config.ignore_case = pattern.ignore_case;
let pattern_score = match pattern.kind {
PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)),
@ -81,4 +227,93 @@ impl Pattern {
}
Some(score)
}
/// Scores `haystack` against this pattern and records the indices of the
/// matched characters into `indices`.
///
/// Inverted atoms contribute no indices; if any inverted atom matches, or
/// any regular atom fails to match, `None` is returned.
pub fn indices(
    &self,
    haystack: Utf32Str<'_>,
    matcher: &mut Matcher,
    indices: &mut Vec<u32>,
) -> Option<u32> {
    if self.terms.is_empty() {
        return Some(0);
    }
    let mut score = 0;
    for pattern in &self.terms {
        matcher.config.ignore_case = pattern.ignore_case;
        if pattern.invert {
            // inverted atoms only veto a match, no indices are collected
            let pattern_score = match pattern.kind {
                PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
                PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)),
                PatternKind::Substring => {
                    matcher.substring_match(haystack, pattern.needle.slice(..))
                }
                PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
                PatternKind::Postfix => {
                    // bug fix: postfix atoms called `prefix_match`
                    matcher.postfix_match(haystack, pattern.needle.slice(..))
                }
            };
            if pattern_score.is_some() {
                return None;
            }
            continue;
        }
        // bug fix: every kind below previously called `exact_indices`,
        // so fuzzy/substring/prefix/postfix atoms produced wrong results
        let pattern_score = match pattern.kind {
            PatternKind::Exact => {
                matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
            }
            PatternKind::Fuzzy => {
                matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices)
            }
            PatternKind::Substring => {
                matcher.substring_indices(haystack, pattern.needle.slice(..), indices)
            }
            PatternKind::Prefix => {
                matcher.prefix_indices(haystack, pattern.needle.slice(..), indices)
            }
            PatternKind::Postfix => {
                matcher.postfix_indices(haystack, pattern.needle.slice(..), indices)
            }
        };
        score += pattern_score? as u32
    }
    Some(score)
}
/// Replaces this pattern by parsing `pattern`.
///
/// `append` signals that `pattern` only extends the previous query string;
/// in that case (and if the last atom is not inverted) the previous matches
/// are a superset of the new ones, so they only need updating, not a full
/// rescore.
pub fn parse_from(&mut self, pattern: &str, append: bool) {
    self.terms.clear();
    for atom in pattern_atoms(pattern) {
        self.terms
            .push(PatternAtom::parse(atom, self.normalize, self.case_matching));
    }
    // bug fix: this was computed right after `clear()` (before parsing), so
    // it always saw an empty list and was unconditionally `false`
    let invert = self.terms.last().map_or(false, |pat| pat.invert);
    self.status = if append && !invert && self.status != Status::Rescore {
        Status::Update
    } else {
        Status::Rescore
    };
}
/// Replaces this pattern with a single literal (non-parsed) atom of the
/// given `kind`; `append` has the same meaning as in `parse_from`.
pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) {
    let atom = PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false);
    self.terms.clear();
    self.terms.push(atom);
    self.status = if append && self.status != Status::Rescore {
        Status::Update
    } else {
        Status::Rescore
    };
}
}
/// Splits a query string into atoms at spaces, keeping an escaped space
/// (`\ `) inside the surrounding atom (the escape itself is resolved later
/// by `PatternAtom::literal`).
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
    let mut prev_was_backslash = false;
    pattern.split(move |c: char| {
        if c == ' ' && !prev_was_backslash {
            return true;
        }
        prev_was_backslash = c == '\\';
        false
    })
}

134
worker/src/worker.rs Normal file
View File

@ -0,0 +1,134 @@
use std::cell::UnsafeCell;
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use nucleo_matcher::MatcherConfig;
use parking_lot::lock_api::ArcMutexGuard;
use parking_lot::RawMutex;
use rayon::{prelude::*, ThreadPool};
use crate::items::{ItemCache, ItemsSnapshot};
use crate::query::{self, Query};
use crate::Match;
/// One matcher per worker thread, accessed without locking.
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);

impl Matchers {
    // this is not a true mut from ref, we use a cell here
    #[allow(clippy::mut_from_ref)]
    unsafe fn get(&self) -> &mut nucleo_matcher::Matcher {
        // SAFETY (caller contract): must be called from a rayon pool thread,
        // and each pool thread only touches the cell at its own index, so the
        // returned mutable reference is never aliased. `current_thread_index`
        // is `Some` only inside a pool, hence the `unwrap`.
        &mut *self.0[rayon::current_thread_index().unwrap()].get()
    }
}

// SAFETY: each pool thread only ever accesses its own slot (see `get`)
unsafe impl Sync for Matchers {}
unsafe impl Send for Matchers {}
/// Matcher state that runs scoring passes on the background thread pool.
pub(crate) struct Worker {
    // called whenever new results are available
    notify: Arc<(dyn Fn() + Sync + Send)>,
    // set at the start of a scoring run — NOTE(review): never reset to
    // `false` in the visible code; confirm intended lifecycle
    pub(crate) running: bool,
    // snapshot of the item cache taken at the start of each run
    pub(crate) items: ItemsSnapshot,
    // one matcher per pool thread (see `Matchers::get`)
    matchers: Matchers,
    pub(crate) matches: Vec<Match>,
    pub(crate) query: Query,
    // set by the frontend to abort an in-flight run early
    pub(crate) canceled: Arc<AtomicBool>,
}
impl Worker {
    /// Creates the rayon thread pool and the worker state that runs on it.
    ///
    /// `worker_threads` defaults to the available parallelism (or 4 when
    /// that cannot be determined).
    pub(crate) fn new(
        notify: Arc<(dyn Fn() + Sync + Send)>,
        worker_threads: Option<usize>,
        config: MatcherConfig,
    ) -> (ThreadPool, Worker) {
        let worker_threads = worker_threads
            .unwrap_or_else(|| std::thread::available_parallelism().map_or(4, |it| it.get()));
        let pool = rayon::ThreadPoolBuilder::new()
            .thread_name(|i| format!("nucleo worker {i}"))
            .num_threads(worker_threads)
            .build()
            .expect("creating threadpool failed");
        // one matcher per pool thread so scoring needs no locking
        let matchers = (0..worker_threads)
            .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config)))
            .collect();
        let worker = Worker {
            notify,
            running: false,
            items: ItemsSnapshot::new(),
            matchers: Matchers(matchers),
            matches: Vec::with_capacity(1024),
            // just a placeholder
            query: Query::new(&config, crate::CaseMatching::Ignore, 0),
            canceled: Arc::new(AtomicBool::new(false)),
        };
        (pool, worker)
    }

    /// Scores all items against the current query and sorts the matches by
    /// descending score, notifying the frontend when done.
    ///
    /// # Safety
    /// Must only be called from inside the thread pool returned by
    /// [`Worker::new`]: `Matchers::get` indexes by `rayon::current_thread_index`.
    pub(crate) unsafe fn run(
        &mut self,
        items_lock: ArcMutexGuard<RawMutex, ItemCache>,
        query_status: query::Status,
    ) {
        self.running = true;
        let mut last_scored_item = self.items.len();
        let cleared = self.items.update(&items_lock);
        // release the lock as early as possible so producers are not blocked
        drop(items_lock);
        // TODO: be smarter around reusing past results for rescoring
        if cleared || query_status == query::Status::Rescore {
            self.matches.clear();
            last_scored_item = 0;
        }
        let matchers = &self.matchers;
        let query = &self.query;
        let items = unsafe { self.items.get() };
        if query_status != query::Status::Unchanged && !self.matches.is_empty() {
            self.matches
                .par_iter_mut()
                // bug fix: keep rescoring while NOT canceled (the predicate
                // was inverted, so rescoring was skipped in the normal case)
                .take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
                .for_each(|match_| {
                    let item = &items[match_.idx as usize];
                    match_.score = query
                        .score(item.cols(), unsafe { matchers.get() })
                        // u32::MAX marks items that no longer match, removed below
                        .unwrap_or(u32::MAX);
                });
            // TODO: do this in parallel?
            self.matches.retain(|m| m.score != u32::MAX)
        }
        if last_scored_item != self.items.len() {
            self.running = true;
            let new_matches = items[last_scored_item..]
                .par_iter()
                .enumerate()
                .filter_map(|(i, item)| {
                    let score = if self.canceled.load(atomic::Ordering::Relaxed) {
                        // canceled: keep the item with a placeholder score so
                        // the next run can rescore it
                        0
                    } else {
                        query.score(item.cols(), unsafe { matchers.get() })?
                    };
                    Some(Match {
                        score,
                        // bug fix: `i` is relative to the slice starting at
                        // `last_scored_item`, but `idx` indexes the full item
                        // list everywhere else — add the offset
                        idx: (last_scored_item + i) as u32,
                    })
                });
            self.matches.par_extend(new_matches)
        }
        if !self.canceled.load(atomic::Ordering::Relaxed) {
            // TODO: cancel sort in progress?
            self.matches.par_sort_unstable_by(|match1, match2| {
                // bug fix: the primary sort key is the score (descending),
                // not the index — the index made the tie breaker unreachable
                match2.score.cmp(&match1.score).then_with(|| {
                    // the tie breaker is comparatively rarely needed so we keep
                    // it in a branch, especially because we need to access the
                    // items array here which involves some pointer chasing
                    let item1 = &items[match1.idx as usize];
                    let item2 = &items[match2.idx as usize];
                    (item1.len, match1.idx).cmp(&(item2.len, match2.idx))
                })
            });
        }
        (self.notify)();
    }
}