From b38fdfa8d7ea23d384651335d9bbb15fd23e84a6 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 16:21:51 +0200 Subject: [PATCH 1/6] add option to prefer prefix matches --- Cargo.lock | 2 +- matcher/src/config.rs | 9 +++++++ matcher/src/fuzzy_optimal.rs | 40 ++++++++++++++++++++++++------- matcher/src/lib.rs | 34 ++++++++++++++++++++++---- matcher/src/score.rs | 14 ++++++++++- matcher/src/tests.rs | 46 +++++++++++++++++++++++++++++++++++- 6 files changed, 129 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24abffc..9590fda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -152,7 +152,7 @@ dependencies = [ [[package]] name = "nucleo" -version = "0.1.0" +version = "0.1.1" dependencies = [ "nucleo-matcher", "parking_lot", diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 7065262..67e07b7 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -15,6 +15,14 @@ pub struct MatcherConfig { pub normalize: bool, /// whether to ignore casing pub ignore_case: bool, + /// Whether to provide a bonus to matches by their distance from the start + /// of the haystack. The bonus is fairly small compared to the normal gap + /// penalty to avoid messing with the normal score heuristic. This setting + /// is not turned on by default and only recommended for autocompletion + /// usecases where the expectation is that the user is typing the entire + /// match. For a full fzf-like fuzzy matcher/picker word segmentation and + /// explicit prefix literals should be used instead. 
+ pub prefer_prefix: bool, } impl MatcherConfig { @@ -26,6 +34,7 @@ impl MatcherConfig { initial_char_class: CharClass::Whitespace, normalize: true, ignore_case: true, + prefer_prefix: false, } }; } diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs index 65fe95b..f007d79 100644 --- a/matcher/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -3,8 +3,8 @@ use std::cmp::max; use crate::chars::{Char, CharClass}; use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; use crate::score::{ - BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, - PENALTY_GAP_START, SCORE_MATCH, + BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, + PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, }; use crate::{Matcher, MatcherConfig}; @@ -35,7 +35,7 @@ impl Matcher { .checked_sub(1) .map(|i| haystack[i].char_class(&self.config)) .unwrap_or(self.config.initial_char_class); - let matched = matrix.setup::(needle, prev_class, &self.config); + let matched = matrix.setup::(needle, prev_class, &self.config, start as u32); // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects if !matched { assert!( @@ -117,6 +117,7 @@ impl MatcherDataView<'_, H> { needle: &[N], mut prev_class: CharClass, config: &MatcherConfig, + start: u32, ) -> bool where H: PartialEq, @@ -167,6 +168,17 @@ impl MatcherDataView<'_, H> { 0, needle[0], needle[1], + if config.prefer_prefix { + if start == 0 { + MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE + } else { + (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub( + (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION, + ) + } + } else { + 0 + }, ); true } @@ -182,6 +194,7 @@ impl MatcherDataView<'_, H> { needle_idx: u16, needle_char: N, next_needle_char: N, + mut prefix_bonus: u16, ) where H: PartialEq, { @@ -198,15 +211,19 @@ impl MatcherDataView<'_, H> { for (((&c, bonus), score_cell), 
matrix_cell) in skipped_col_iter { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { - if c == needle_char { + let cell = if c == needle_char { ScoreCell { - score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, + score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, matched: false, consecutive_bonus: *bonus, } } else { UNMATCHED - } + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell } else { *score_cell }; @@ -224,15 +241,19 @@ impl MatcherDataView<'_, H> { for (((c, bonus), score_cell), matrix_cell) in col_iter { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { - if c[0] == needle_char { + let cell = if c[0] == needle_char { ScoreCell { - score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, + score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, matched: false, consecutive_bonus: bonus[0], } } else { UNMATCHED - } + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell } else { *score_cell }; @@ -271,6 +292,7 @@ impl MatcherDataView<'_, H> { needle_idx as u16 + 1, needle_char, next_needle_char, + 0, ); let len = self.current_row.len() + needle_idx + 1 - row_off as usize; matrix_cells = &mut matrix_cells[len..]; diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index f966b1a..6aea293 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -1,12 +1,11 @@ /*! `nucleo_matcher` is a low level crate that contains the matcher implementation -used by the other nucleo crates. +used by the high level `nucleo` crate. The matcher is hightly optimized and can significantly outperform `fzf` and `skim` (the `fuzzy-matcher` crate). However some of these optimizations require -a slightly less convenient API. 
Particularly, `nucleo_matcher` requires that -needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead -of rusts normal utf32 strings. +a slightly less convenient API. Be sure to carefully read the documentation of the +[`Matcher`] to avoid unexpected behaviour. */ // sadly ranges don't optmimzie well @@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab; /// multiple different matches on the same haystack and merging the indices by /// sorting and deduplicating the vector. /// +/// The `needle` argument for each function must always be normalized by the caller +/// (unicode normalization and case folding if a case insensitive match is produced). +/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module +/// provides utilities to preprocess needles. +/// +/// Additionally it's recommended to perform separate matches for each word in +/// the needle. Consider the following example: If `foo bar` is used as the +/// needle it matches both `foo test baaar` and `foo hello-world bar`. However, +/// `foo test baaar` will receive a higher score than `foo hello-world bar`. `baaar` contains a +/// 2 character gap which will receive a penalty and therefore the user will +/// likely expect it to rank lower. However, if `foo bar` is matched as a single +/// query `hello-world` and `test` are both considered gaps too. As +/// `hello-world` is a much longer gap than `test`, the extra penalty for `baaar` is +/// outweighed. If both words are matched individually the interspersed words +/// do not receive a penalty and `foo hello-world bar` ranks higher. +/// +/// In general nucleo is a **substring matching tool** with no penalty assigned +/// to matches that start later within the same pattern (which enables the +/// usecase shown above). This may be undesirable in one very particular usecase: +/// For automatic suggestions for commands (like a shell). In this case the +/// assumption is that the user is actually typing the full haystack. 
In other words: +/// The matcher should prefer a prefix match. To accommodate that usecase the +/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set +/// to true. Note that the penalty given is quite small (and capped to a maximum) +/// to avoid overriding the normal scoring heuristic. +/// +/// /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than /// that the matcher *will panic*. The caller must decide whether it wants to /// filter out long haystacks or truncate them. diff --git a/matcher/src/score.rs b/matcher/src/score.rs index 7a7c0c3..eba054b 100644 --- a/matcher/src/score.rs +++ b/matcher/src/score.rs @@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig}; pub(crate) const SCORE_MATCH: u16 = 16; pub(crate) const PENALTY_GAP_START: u16 = 3; pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; +/// If the prefer_prefix option is enabled we want to penalize +/// the initial gap. The prefix bonus should not be too large +pub(crate) const PREFIX_BONUS_SCALE: u16 = 2; +pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY; // We prefer matches at the beginning of a word, but the bonus should not be // too great to prevent the longer acronym matches from always winning over @@ -140,7 +144,15 @@ impl Matcher { } prev_class = class; } - + if self.config.prefer_prefix { + if start != 0 { + let penalty = PENALTY_GAP_START + + PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16; + score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE); + } else { + score += MAX_PREFIX_BONUS; + } + } score } } diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index d2bfaea..691230c 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -1,7 +1,7 @@ use crate::chars::Char; use crate::score::{ BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, - PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, + MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, 
}; use crate::utf32_str::Utf32Str; use crate::{Matcher, MatcherConfig}; @@ -23,11 +23,13 @@ fn assert_matches( normalize: bool, case_sensitive: bool, path: bool, + prefer_prefix: bool, cases: &[(&str, &str, &[u32], u16)], ) { let mut config = MatcherConfig { normalize, ignore_case: !case_sensitive, + prefer_prefix, ..MatcherConfig::DEFAULT }; if path { @@ -142,6 +144,7 @@ fn test_fuzzy() { false, false, false, + false, &[ ( "fooBarbaz1", @@ -250,6 +253,7 @@ fn empty_needle() { false, false, false, + false, &[("foo bar baz", "", &[], 0)], ); } @@ -261,6 +265,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo bar baz", @@ -287,6 +292,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo bar baz", @@ -313,6 +319,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo", @@ -339,6 +346,7 @@ fn test_substring() { false, false, false, + false, &[ ( "fooBarbaz1", @@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() { false, true, false, + false, &[ ( "fooBarbaz1", @@ -418,6 +427,7 @@ fn test_normalize() { true, false, false, + false, &[ ( "Só Danço Samba", @@ -464,6 +474,7 @@ fn test_unicode() { true, false, false, + false, &[ ( "你好世界", @@ -488,6 +499,7 @@ fn test_long_str() { false, false, false, + false, &[( &"x".repeat(u16::MAX as usize + 1), "xx", @@ -504,6 +516,7 @@ fn test_casing() { false, false, false, + false, &[ // these two have the same score ( @@ -536,6 +549,7 @@ fn test_casing() { ], ) } + #[test] fn test_optimal() { assert_matches( @@ -543,6 +557,7 @@ fn test_optimal() { false, false, false, + false, &[ ( "axxx xx ", @@ -624,3 +639,32 @@ fn test_reject() { ); assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]); } + +#[test] +fn test_prefer_prefix() { + assert_matches( + &[FuzzyOptimal, FuzzyGreedy], + false, + false, + false, + true, + &[ + ( + "Moby Dick", + "md", + &[0, 5], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS + - PENALTY_GAP_START + - 3 * PENALTY_GAP_EXTENSION, + ), + 
( + "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage", + "md", + &[82, 85], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ], + ); +} From 14014ed88345a19b1e327acc9e7a9df7e490aabd Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 17:22:21 +0200 Subject: [PATCH 2/6] reformat with rustfmt 1.71 --- matcher/src/fuzzy_greedy.rs | 2 +- matcher/src/fuzzy_optimal.rs | 8 ++------ matcher/src/matrix.rs | 2 +- src/lib.rs | 5 ++++- src/worker.rs | 10 ++++++++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/matcher/src/fuzzy_greedy.rs b/matcher/src/fuzzy_greedy.rs index 963818d..8215bf3 100644 --- a/matcher/src/fuzzy_greedy.rs +++ b/matcher/src/fuzzy_greedy.rs @@ -21,7 +21,7 @@ impl Matcher { if c.normalize(&self.config) == needle_char { let Some(next_needle_char) = needle_iter.next() else { // we found a match so we are now in the same state - // as the prefilter would produce + // as the prefilter would produce end = first_char_end + i + 1; break 'nonascii; }; diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs index f007d79..10c7bcd 100644 --- a/matcher/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -23,11 +23,7 @@ impl Matcher { // us to treat needle indices as u16 let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else { return self.fuzzy_match_greedy_::( - haystack, - needle, - start, - greedy_end, - indices, + haystack, needle, start, greedy_end, indices, ); }; @@ -339,7 +335,7 @@ impl MatcherDataView<'_, H> { } let next_matched = row[col as usize].get(matched); if matched { - let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else{ + let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else { break; }; col += row_off - next_row_off; diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs index 
d60e2a6..4af4535 100644 --- a/matcher/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -148,7 +148,7 @@ impl MatrixSlab { let layout = Layout::new::(); // safety: the matrix is never zero sized (hardcoded constants) let ptr = unsafe { alloc_zeroed(layout) }; - let Some(ptr) = NonNull::new(ptr) else{ + let Some(ptr) = NonNull::new(ptr) else { handle_alloc_error(layout) }; MatrixSlab(ptr.cast()) diff --git a/src/lib.rs b/src/lib.rs index 59386bd..7331089 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -278,7 +278,10 @@ impl Nucleo { } else { let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else { self.should_notify.store(true, Ordering::Release); - return Status{ changed: false, running: true }; + return Status { + changed: false, + running: true, + }; }; worker }; diff --git a/src/worker.rs b/src/worker.rs index 7b72257..e343be1 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -102,14 +102,20 @@ impl Worker { let Some(item) = item else { in_flight.lock().push(idx); unmatched.fetch_add(1, atomic::Ordering::Relaxed); - return Match { score: 0, idx: u32::MAX }; + return Match { + score: 0, + idx: u32::MAX, + }; }; if self.canceled.load(atomic::Ordering::Relaxed) { return Match { score: 0, idx }; } let Some(score) = pattern.score(item.matcher_columns, matchers.get()) else { unmatched.fetch_add(1, atomic::Ordering::Relaxed); - return Match { score: 0, idx: u32::MAX }; + return Match { + score: 0, + idx: u32::MAX, + }; }; Match { score, idx } }); From 648dec1ceb3f276b4507cce3a016f55d21cf07a1 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 17:29:17 +0200 Subject: [PATCH 3/6] move Utf32String to nucleo-matcher --- matcher/src/lib.rs | 2 +- matcher/src/utf32_str.rs | 176 +++++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 +- 3 files changed, 178 insertions(+), 4 deletions(-) diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 6aea293..efae388 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -27,7 +27,7 @@ mod 
utf32_str; mod tests; pub use crate::config::MatcherConfig; -pub use crate::utf32_str::Utf32Str; +pub use crate::utf32_str::{Utf32Str, Utf32String}; use crate::chars::{AsciiChar, Char}; use crate::matrix::MatrixSlab; diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index fe4f44e..9602b27 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,6 +1,10 @@ +use std::borrow::Cow; +use std::mem::take; use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; +use crate::chars; + /// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. /// /// Usually rusts' utf8 encoded strings are great. However during fuzzy matching @@ -209,3 +213,175 @@ impl DoubleEndedIterator for Chars<'_> { } } } + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +pub enum Utf32String { + /// A string represented as ASCII encoded bytes. + /// Correctness invariant: must only contain valid ASCII (<=127) + Ascii(Box), + /// A string represented as an array of unicode codepoints (basically UTF-32). 
+ Unicode(Box<[char]>), +} + +impl Default for Utf32String { + fn default() -> Self { + Self::Ascii(String::new().into_boxed_str()) + } +} + +impl Utf32String { + #[inline] + pub fn len(&self) -> usize { + match self { + Utf32String::Unicode(codepoints) => codepoints.len(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Utf32String::Unicode(codepoints) => codepoints.is_empty(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } + + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher + #[inline] + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start as usize, + Bound::Excluded(&start) => start as usize + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, + Bound::Unbounded => self.len(), + }; + match self { + Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]), + Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + #[inline] + pub fn is_ascii(&self) -> bool { + matches!(self, Utf32String::Ascii(_)) + } + + #[inline] + pub fn get(&self, idx: u32) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, + Utf32String::Unicode(codepoints) => codepoints[idx as usize], + } + } + + #[inline] + pub fn last(&self) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, + Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], + } + } + + #[inline] + pub fn chars(&self) -> Chars<'_> { + match self { + Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), + Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), + } + } + + #[inline] + pub fn 
push_str(&mut self, text: &str) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if text.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push_str(text); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.extend(chars::graphemes(text)); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } + + #[inline] + pub fn push(&mut self, c: char) { + let mut codeboints = match take(self) { + Utf32String::Ascii(bytes) if c.is_ascii() => { + let mut bytes = bytes.into_string(); + bytes.push(c); + *self = Self::Ascii(bytes.into_boxed_str()); + return; + } + Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), + Utf32String::Unicode(codepoints) => Vec::from(codepoints), + }; + codeboints.push(c); + *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + } +} + +impl From<&str> for Utf32String { + #[inline] + fn from(value: &str) -> Self { + if value.is_ascii() { + Self::Ascii(value.to_owned().into_boxed_str()) + } else { + Self::Unicode(chars::graphemes(value).collect()) + } + } +} + +impl From> for Utf32String { + fn from(value: Box) -> Self { + if value.is_ascii() { + Self::Ascii(value) + } else { + Self::Unicode(chars::graphemes(&value).collect()) + } + } +} + +impl From for Utf32String { + #[inline] + fn from(value: String) -> Self { + value.into_boxed_str().into() + } +} + +impl<'a> From> for Utf32String { + #[inline] + fn from(value: Cow<'a, str>) -> Self { + match value { + Cow::Borrowed(value) => value.into(), + Cow::Owned(value) => value.into(), + } + } +} + +impl fmt::Debug for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "\"")?; + for c in self.chars() { + for c in c.escape_debug() { + write!(f, "{c}")? 
+ } + } + write!(f, "\"") + } +} + +impl fmt::Display for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for c in self.chars() { + write!(f, "{c}")? + } + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 7331089..cb87352 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,14 +8,12 @@ use parking_lot::Mutex; use rayon::ThreadPool; pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; -pub use crate::utf32_string::Utf32String; use crate::worker::Worker; -pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; +pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str, Utf32String}; mod boxcar; mod par_sort; mod pattern; -mod utf32_string; mod worker; pub struct Item<'a, T> { From 3e48c9f1ee977c44d52b5dbeb7fefb58ac20ed42 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 17:29:34 +0200 Subject: [PATCH 4/6] fix clippy lint --- matcher/src/matrix.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs index 4af4535..a91ed95 100644 --- a/matcher/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -74,7 +74,7 @@ impl MatrixLayout { let base = ptr.as_ptr(); let haystack = base.add(self.haystack_off) as *mut C; let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len); - let bonus = base.add(self.bonus_off) as *mut u8; + let bonus = base.add(self.bonus_off); let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); let rows = base.add(self.rows_off) as *mut u16; let rows = slice_from_raw_parts_mut(rows, self.needle_len); From de844d6acec4c3e4435c8f9595796db9470ea1a8 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Mon, 28 Aug 2023 01:33:47 +0200 Subject: [PATCH 5/6] move pattern API to nucleo-matcher --- bench/src/main.rs | 2 +- matcher/src/chars.rs | 33 ++- matcher/src/chars/normalize.rs | 10 + matcher/src/config.rs | 24 +- matcher/src/fuzzy_optimal.rs | 4 +- matcher/src/lib.rs | 16 +- matcher/src/pattern.rs | 469 
+++++++++++++++++++++++++++++++++ matcher/src/pattern/tests.rs | 114 ++++++++ matcher/src/score.rs | 4 +- matcher/src/tests.rs | 14 +- matcher/src/utf32_str.rs | 134 ++++------ src/lib.rs | 44 +--- src/pattern.rs | 408 +++------------------------- src/pattern/tests.rs | 149 +---------- src/worker.rs | 14 +- typos.toml | 2 +- 16 files changed, 766 insertions(+), 675 deletions(-) create mode 100644 matcher/src/pattern.rs create mode 100644 matcher/src/pattern/tests.rs diff --git a/bench/src/main.rs b/bench/src/main.rs index 148d353..bc77b03 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -43,7 +43,7 @@ fn main() { Some((path.as_str().into(), path)) }) .unzip(); - let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths()); + let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths()); let skim = fuzzy_matcher::skim::SkimMatcherV2::default(); // TODO: unicode? diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index a469fc1..710c212 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -1,7 +1,9 @@ +//! 
Utilities for working with (unicode) characters/codepoints + use std::fmt::{self, Debug, Display}; use crate::chars::case_fold::CASE_FOLDING_SIMPLE; -use crate::MatcherConfig; +use crate::Config; //autogenerated by generate-ucd #[allow(warnings)] @@ -11,9 +13,9 @@ mod normalize; pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { const ASCII: bool; - fn char_class(self, config: &MatcherConfig) -> CharClass; - fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); - fn normalize(self, config: &MatcherConfig) -> Self; + fn char_class(self, config: &Config) -> CharClass; + fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); + fn normalize(self, config: &Config) -> Self; } /// repr tansparent wrapper around u8 with better formatting and `PartialEq` implementation @@ -42,7 +44,7 @@ impl PartialEq for char { impl Char for AsciiChar { const ASCII: bool = true; #[inline] - fn char_class(self, config: &MatcherConfig) -> CharClass { + fn char_class(self, config: &Config) -> CharClass { let c = self.0; // using manual if conditions instead optimizes better if c >= b'a' && c <= b'z' { @@ -61,7 +63,7 @@ impl Char for AsciiChar { } #[inline(always)] - fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { let char_class = self.char_class(config); if config.ignore_case && char_class == CharClass::Upper { self.0 += 32 @@ -70,7 +72,7 @@ impl Char for AsciiChar { } #[inline(always)] - fn normalize(mut self, config: &MatcherConfig) -> Self { + fn normalize(mut self, config: &Config) -> Self { if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { self.0 += 32 } @@ -95,7 +97,7 @@ fn char_class_non_ascii(c: char) -> CharClass { impl Char for char { const ASCII: bool = false; #[inline(always)] - fn char_class(self, config: &MatcherConfig) -> CharClass { + fn char_class(self, config: &Config) -> CharClass { if self.is_ascii() 
{ return AsciiChar(self as u8).char_class(config); } @@ -103,7 +105,7 @@ impl Char for char { } #[inline(always)] - fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { if self.is_ascii() { let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); return (c.0 as char, class); @@ -123,7 +125,7 @@ impl Char for char { } #[inline(always)] - fn normalize(mut self, config: &MatcherConfig) -> Self { + fn normalize(mut self, config: &Config) -> Self { if config.normalize { self = normalize::normalize(self); } @@ -138,12 +140,14 @@ pub use normalize::normalize; use unicode_segmentation::UnicodeSegmentation; #[inline(always)] +/// Converts a character to lower case using simple unicode case folding pub fn to_lower_case(c: char) -> char { CASE_FOLDING_SIMPLE .binary_search_by_key(&c, |(upper, _)| *upper) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } +/// Checks whether a character is upper case according to simple unicode case folding #[inline(always)] pub fn is_upper_case(c: char) -> bool { CASE_FOLDING_SIMPLE @@ -152,8 +156,7 @@ pub fn is_upper_case(c: char) -> bool { } #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] -#[non_exhaustive] -pub enum CharClass { +pub(crate) enum CharClass { Whitespace, NonWord, Delimiter, @@ -163,8 +166,10 @@ pub enum CharClass { Number, } -/// nucleo cannot match graphemes as single units to work around -/// that we only use the first codepoint of each grapheme +/// Nucleo cannot match graphemes as single units. To work around +/// that we only use the first codepoint of each grapheme. This +/// iterator returns the first character of each unicode grapheme +/// in a string and is used for constructing `Utf32Str(ing)`. 
pub fn graphemes(text: &str) -> impl Iterator + '_ { text.graphemes(true).map(|grapheme| { grapheme diff --git a/matcher/src/chars/normalize.rs b/matcher/src/chars/normalize.rs index 66a4db1..d3df40e 100644 --- a/matcher/src/chars/normalize.rs +++ b/matcher/src/chars/normalize.rs @@ -495,6 +495,16 @@ const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1; const LEN3: usize = (DATA3_END - DATA3_START) as usize; static TABLE3: [char; LEN3] = generate_table(&DATA3); +/// Normalizes a unicode character by converting latin characters +/// which are variants of ASCII characters to their ASCII equivalent. +/// +/// # Example +/// +/// ``` rust +/// # use nucleo_matcher::chars::normalize; +/// +/// assert_eq!(normalize('ä'), 'a'); +/// ``` pub fn normalize(c: char) -> char { let i = c as u32; if i < DATA1_START || i >= DATA3_END { diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 67e07b7..eca7ae3 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -1,16 +1,19 @@ use crate::chars::CharClass; use crate::score::BONUS_BOUNDARY; +/// Configuration data that controls how a matcher behaves #[non_exhaustive] -#[derive(PartialEq, Eq, Debug, Clone, Copy)] -pub struct MatcherConfig { - pub delimiter_chars: &'static [u8], +#[derive(PartialEq, Eq, Debug, Clone)] +pub struct Config { + /// Characters that act as delimiters and provide bonus + /// for matching the following char + pub(crate) delimiter_chars: &'static [u8], /// Extra bonus for word boundary after whitespace character or beginning of the string pub(crate) bonus_boundary_white: u16, - /// Extra bonus for word boundary after slash, colon, semi-colon, and comma pub(crate) bonus_boundary_delimiter: u16, - pub initial_char_class: CharClass, + pub(crate) initial_char_class: CharClass, + /// Whether to normalize latin script characters to ASCII (enabled by default) pub normalize: bool, /// whether to ignore casing @@ -25,9 +28,11 @@ pub struct MatcherConfig { pub prefer_prefix: bool, } -impl 
MatcherConfig { +impl Config { + /// The default config for nucleo, implemented as a constant since + /// Default::default can not be called in a const context pub const DEFAULT: Self = { - MatcherConfig { + Config { delimiter_chars: b"/,:;|", bonus_boundary_white: BONUS_BOUNDARY + 2, bonus_boundary_delimiter: BONUS_BOUNDARY + 1, @@ -39,9 +44,9 @@ impl MatcherConfig { }; } -impl MatcherConfig { +impl Config { + /// Configures the matcher with bonuses appropriate for matching file paths. pub fn set_match_paths(&mut self) { - // compared to fzf we include if cfg!(windows) { self.delimiter_chars = b"/:\\"; } else { @@ -51,6 +56,7 @@ impl MatcherConfig { self.initial_char_class = CharClass::Delimiter; } + /// Configures the matcher with bonuses appropriate for matching file paths. pub const fn match_paths(mut self) -> Self { if cfg!(windows) { self.delimiter_chars = b"/\\"; diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs index 10c7bcd..aba7bbe 100644 --- a/matcher/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -6,7 +6,7 @@ use crate::score::{ BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, }; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; impl Matcher { pub(crate) fn fuzzy_match_optimal, N: Char>( @@ -112,7 +112,7 @@ impl MatcherDataView<'_, H> { &mut self, needle: &[N], mut prev_class: CharClass, - config: &MatcherConfig, + config: &Config, start: u32, ) -> bool where diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index efae388..7feff93 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -10,6 +10,7 @@ a slightly less convenient API. 
Be sure to carefully read the documentation of t // sadly ranges don't optmimzie well #![allow(clippy::manual_range_contains)] +#![warn(missing_docs)] pub mod chars; mod config; @@ -19,6 +20,7 @@ mod exact; mod fuzzy_greedy; mod fuzzy_optimal; mod matrix; +pub mod pattern; mod prefilter; mod score; mod utf32_str; @@ -26,7 +28,7 @@ mod utf32_str; #[cfg(test)] mod tests; -pub use crate::config::MatcherConfig; +pub use crate::config::Config; pub use crate::utf32_str::{Utf32Str, Utf32String}; use crate::chars::{AsciiChar, Char}; @@ -80,7 +82,8 @@ use crate::matrix::MatrixSlab; /// that the matcher *will panic*. The caller must decide whether it wants to /// filter out long haystacks or truncate them. pub struct Matcher { - pub config: MatcherConfig, + #[allow(missing_docs)] + pub config: Config, slab: MatrixSlab, } @@ -88,7 +91,7 @@ pub struct Matcher { impl Clone for Matcher { fn clone(&self) -> Self { Matcher { - config: self.config, + config: self.config.clone(), slab: MatrixSlab::new(), } } @@ -105,14 +108,17 @@ impl std::fmt::Debug for Matcher { impl Default for Matcher { fn default() -> Self { Matcher { - config: MatcherConfig::DEFAULT, + config: Config::DEFAULT, slab: MatrixSlab::new(), } } } impl Matcher { - pub fn new(config: MatcherConfig) -> Self { + /// Creates a new matcher instance, note that this will eagerly allocate + /// a fairly large chunk of heap memory (135KB currently but subject to + /// change) so matchers should be reused if used in a loop. + pub fn new(config: Config) -> Self { Self { config, slab: MatrixSlab::new(), diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs new file mode 100644 index 0000000..3583ebe --- /dev/null +++ b/matcher/src/pattern.rs @@ -0,0 +1,469 @@ +//! This module provides a slightly higher level API for matching strings. 
+ +use std::cmp::Reverse; + +use crate::{chars, Matcher, Utf32Str}; + +#[cfg(test)] +mod tests; + +use crate::Utf32String; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[non_exhaustive] +/// How nucleo will treat case mismatch +pub enum CaseMatching { + /// Characters always match their case folded version (`a == A`) + Ignore, + /// Characters never match their case folded version (`a != A`) + Respect, + /// Acts like `Ignore` if all characters in a pattern atom are + /// lowercase and like `Respect` otherwise + #[default] + Smart, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[non_exhaustive] +/// The kind of matching algorithm to run for this atom +pub enum AtomKind { + /// Fuzzy matching where the needle must match any haystack characters + /// (match can contain gaps). This atom kind is used by default if no + /// special syntax is used. There is no negated fuzzy matching (too + /// many false positives). + /// + /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match). + Fuzzy, + /// The needle must match a contiguous sequence of haystack characters + /// without gaps. This atom kind is parsed from the following syntax: + /// `'foo` and `!foo` (negated). + /// + /// See also [`Matcher::substring_match`](crate::Matcher::substring_match). + Substring, + /// The needle must match all leading haystack characters without gaps or + /// prefix. This atom kind is parsed from the following syntax: `^foo` and + /// `!^foo` (negated). + /// + /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match). + Prefix, + /// The needle must match all trailing haystack characters without gaps or + /// postfix. This atom kind is parsed from the following syntax: `foo$` and + /// `!foo$` (negated). + /// + /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match). + Postfix, + /// The needle must match all haystack characters without gaps or prefix. 
+ /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$` + /// (negated). + /// + /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). + Exact, +} + +/// A single pattern component that is matched with a single [`Matcher`](crate::Matcher) function +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Atom { + /// Whether this pattern atom is a negative match. + /// A negative pattern atom will prevent haystacks matching it from + /// being matched. It does not contribute to scoring/indices + pub negative: bool, + /// The kind of match that this pattern performs + pub kind: AtomKind, + needle: Utf32String, + ignore_case: bool, +} + +impl Atom { + /// Creates a single [`Atom`] from a string by performing unicode + /// normalization + pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom { + Atom::new_inner(needle, case, kind, escape_whitespace, false) + } + + fn new_inner( + needle: &str, + case: CaseMatching, + kind: AtomKind, + escape_whitespace: bool, + append_dollar: bool, + ) -> Atom { + let mut ignore_case; + let needle = if needle.is_ascii() { + let mut needle = if escape_whitespace { + if let Some((start, rem)) = needle.split_once("\\ ") { + let mut needle = start.to_owned(); + for rem in rem.split("\\ ") { + needle.push(' '); + needle.push_str(rem); + } + needle + } else { + needle.to_owned() + } + } else { + needle.to_owned() + }; + + match case { + CaseMatching::Ignore => { + ignore_case = true; + needle.make_ascii_lowercase() + } + CaseMatching::Smart => { + ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) + } + CaseMatching::Respect => ignore_case = false, + } + if append_dollar { + needle.push('$'); + } + Utf32String::Ascii(needle.into_boxed_str()) + } else { + let mut needle_ = Vec::with_capacity(needle.len()); + ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); + if escape_whitespace { + let mut saw_backslash = false; + for mut c in 
chars::graphemes(needle) { + if saw_backslash { + if c == ' ' { + needle_.push(' '); + saw_backslash = false; + continue; + } else { + needle_.push('\\'); + } + } + saw_backslash = c == '\\'; + match case { + CaseMatching::Ignore => c = chars::to_lower_case(c), + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c) + } + CaseMatching::Respect => (), + } + needle_.push(c); + } + } else { + let chars = chars::graphemes(needle).map(|mut c| { + match case { + CaseMatching::Ignore => c = chars::to_lower_case(c), + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c); + } + CaseMatching::Respect => (), + } + c + }); + needle_.extend(chars); + }; + if append_dollar { + needle_.push('$'); + } + Utf32String::Unicode(needle_.into_boxed_slice()) + }; + Atom { + kind, + needle, + negative: false, + ignore_case, + } + } + + /// Parse a pattern atom from a string. Some special trailing and leading + /// characters can be used to control the atom kind. See [`AtomKind`] for + /// details. + pub fn parse(raw: &str, case: CaseMatching) -> Atom { + let mut atom = raw; + let invert = match atom.as_bytes() { + [b'!', ..] => { + atom = &atom[1..]; + true + } + [b'\\', b'!', ..] => { + atom = &atom[1..]; + false + } + _ => false, + }; + + let mut kind = match atom.as_bytes() { + [b'^', ..] => { + atom = &atom[1..]; + AtomKind::Prefix + } + [b'\'', ..] => { + atom = &atom[1..]; + AtomKind::Substring + } + [b'\\', b'^' | b'\'', ..] 
=> { + atom = &atom[1..]; + AtomKind::Fuzzy + } + _ => AtomKind::Fuzzy, + }; + + let mut append_dollar = false; + match atom.as_bytes() { + [.., b'\\', b'$'] => { + append_dollar = true; + atom = &atom[..atom.len() - 2] + } + [.., b'$'] => { + kind = if kind == AtomKind::Fuzzy { + AtomKind::Postfix + } else { + AtomKind::Exact + }; + atom = &atom[..atom.len() - 1] + } + _ => (), + } + + if invert && kind == AtomKind::Fuzzy { + kind = AtomKind::Substring + } + + let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar); + pattern.negative = invert; + pattern + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`](crate::Matcher) + /// documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// this pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + matcher.config.ignore_case = self.ignore_case; + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + if self.negative { + if pattern_score.is_some() { + return None; + } + Some(0) + } else { + pattern_score + } + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the match + /// indices. See the [`Matcher`](crate::Matcher) documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// this pattern atom. 
+ pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec, + ) -> Option { + matcher.config.ignore_case = self.ignore_case; + if self.negative { + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + pattern_score.is_none().then_some(0) + } else { + match self.kind { + AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices), + AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices), + AtomKind::Substring => { + matcher.substring_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Prefix => { + matcher.prefix_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Postfix => { + matcher.postfix_indices(haystack, self.needle.slice(..), indices) + } + } + } + } + + /// Returns the needle text that is passed to the matcher. All indices + /// produced by the `indices` functions produce char indices used to index + /// this text + pub fn needle_text(&self) -> Utf32Str<'_> { + self.needle.slice(..) + } + /// Convenience function to easily match on a (relatively small) list of + /// inputs. This is not recommended for building a full fuzzy matching + /// application that can match large numbers of matches (like all files in + /// a directory) as all matching is done on the current thread, effectively + /// blocking the UI. 
+ pub fn match_list>( + &self, + matcher: &mut Matcher, + items: impl IntoIterator, + ) -> Vec<(T, u16)> { + if self.needle.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } +} + +fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { + let mut saw_backslash = false; + pattern.split(move |c| { + saw_backslash = match c { + ' ' if !saw_backslash => return true, + '\\' => true, + _ => false, + }; + false + }) +} + +#[derive(Debug, Default)] +/// A fuzzy match pattern +#[non_exhaustive] +pub struct Pattern { + /// The individual pattern atoms (words) in this pattern + pub atoms: Vec, +} + +impl Pattern { + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, ' + /// and ^ don't receive special treatment). If you want to match the entire + /// pattern as a single needle use a single [`Atom`] instead + pub fn new(case_matching: CaseMatching, kind: AtomKind, pattern: &str) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::new(pat, case_matching, kind, true); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will + /// cause different matching behaviour (see [`AtomKind`]). These can be + /// escaped with backslash. 
+ pub fn parse(case_matching: CaseMatching, pattern: &str) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::parse(pat, case_matching); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + + /// Convenience function to easily match on a (relatively small) list of + /// inputs. This is not recommended for building a full fuzzy matching + /// application that can match large numbers of matches (like all files in + /// a directory) as all matching is done on the current thread, effectively + /// blocking the UI. + pub fn match_list>( + &self, + matcher: &mut Matcher, + items: impl IntoIterator, + ) -> Vec<(T, u32)> { + if self.atoms.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`](crate::Matcher) + /// documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.score(haystack, matcher)? as u32; + } + Some(score) + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the match + /// indices. See the [`Matcher`](crate::Matcher) documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. 
+ /// + /// *Note:* The indices for each pattern are calculated individually + /// and simply appended to the `indices` vector. This allows + /// associating indices with their source atom (they are not sorted or deduplicated). + pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec, + ) -> Option { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.indices(haystack, matcher, indices)? as u32; + } + Some(score) + } + + /// Refreshes this pattern by reparsing a new pattern string. + pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) { + self.atoms.clear(); + let atoms = pattern_atoms(pattern).filter_map(|atom| { + let atom = Atom::parse(atom, case_matching); + if atom.needle.is_empty() { + return None; + } + Some(atom) + }); + self.atoms.extend(atoms); + } +} + +impl Clone for Pattern { + fn clone(&self) -> Self { + Self { + atoms: self.atoms.clone(), + } + } + + fn clone_from(&mut self, source: &Self) { + self.atoms.clone_from(&source.atoms); + } +} diff --git a/matcher/src/pattern/tests.rs b/matcher/src/pattern/tests.rs new file mode 100644 index 0000000..8fcd0a9 --- /dev/null +++ b/matcher/src/pattern/tests.rs @@ -0,0 +1,114 @@ +use crate::pattern::{Atom, AtomKind, CaseMatching}; + +#[test] +fn negative() { + let pat = Atom::parse("!foo", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!foo$", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo$", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + 
assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn pattern_kinds() { + let pat = Atom::parse("foo", 
CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Fuzzy); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("'foo", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("foo$", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo$", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn case_matching() { + let pat = Atom::parse("foo", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Äxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "äxx"); + let pat = Atom::parse("Äxx", CaseMatching::Respect); + assert!(!pat.ignore_case); + let pat = Atom::parse("Axx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = Atom::parse("你xx", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("你xx", CaseMatching::Ignore); + 
assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Ⲽxx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "ⲽxx"); +} + +#[test] +fn escape() { + let pat = Atom::parse("foo\\ bar", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo bar"); + let pat = Atom::parse("\\!foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "!foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\'foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "'foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\^foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Prefix); + let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "!^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Substring); +} diff --git a/matcher/src/score.rs b/matcher/src/score.rs index eba054b..c934a8e 100644 --- a/matcher/src/score.rs +++ b/matcher/src/score.rs @@ -1,7 +1,7 @@ use std::cmp::max; use crate::chars::{Char, CharClass}; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; pub(crate) const SCORE_MATCH: u16 = 16; pub(crate) const PENALTY_GAP_START: u16 = 3; @@ 
-47,7 +47,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS // still respected. pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; -impl MatcherConfig { +impl Config { #[inline] pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { if class > CharClass::Delimiter { diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index 691230c..058b497 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -4,7 +4,7 @@ use crate::score::{ MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, }; use crate::utf32_str::Utf32Str; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; use Algorithm::*; @@ -26,11 +26,11 @@ fn assert_matches( prefer_prefix: bool, cases: &[(&str, &str, &[u32], u16)], ) { - let mut config = MatcherConfig { + let mut config = Config { normalize, ignore_case: !case_sensitive, prefer_prefix, - ..MatcherConfig::DEFAULT + ..Config::DEFAULT }; if path { config.set_match_paths(); @@ -89,10 +89,10 @@ pub fn assert_not_matches( path: bool, cases: &[(&str, &str)], ) { - let mut config = MatcherConfig { + let mut config = Config { normalize, ignore_case: !case_sensitive, - ..MatcherConfig::DEFAULT + ..Config::DEFAULT }; if path { config.set_match_paths(); @@ -134,8 +134,8 @@ pub fn assert_not_matches( } } -const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; -const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; +const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white; +const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter; #[test] fn test_fuzzy() { diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 9602b27..1821b46 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::mem::take; use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; @@ -55,6 +54,7 @@ impl<'a> 
Utf32Str<'a> { } } + /// Returns the number of characters in this string. #[inline] pub fn len(self) -> usize { match self { @@ -62,6 +62,8 @@ impl<'a> Utf32Str<'a> { Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), } } + + /// Returns whether this string is empty. #[inline] pub fn is_empty(self) -> bool { match self { @@ -70,6 +72,8 @@ impl<'a> Utf32Str<'a> { } } + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. #[inline] pub fn slice(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { @@ -90,7 +94,7 @@ impl<'a> Utf32Str<'a> { /// Returns the number of leading whitespaces in this string #[inline] - pub fn leading_white_space(self) -> usize { + pub(crate) fn leading_white_space(self) -> usize { match self { Utf32Str::Ascii(bytes) => bytes .iter() @@ -105,7 +109,7 @@ impl<'a> Utf32Str<'a> { /// Returns the number of leading whitespaces in this string #[inline] - pub fn trailing_white_space(self) -> usize { + pub(crate) fn trailing_white_space(self) -> usize { match self { Utf32Str::Ascii(bytes) => bytes .iter() @@ -121,7 +125,7 @@ impl<'a> Utf32Str<'a> { } /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher + /// those are the indices returned by the matcher. #[inline] pub fn slice_u32(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { @@ -139,29 +143,34 @@ impl<'a> Utf32Str<'a> { Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), } } + + /// Returns whether this string only contains ascii text. pub fn is_ascii(self) -> bool { matches!(self, Utf32Str::Ascii(_)) } - pub fn get(self, idx: u32) -> char { + /// Returns the `n`th character in this string. 
+ pub fn get(self, n: u32) -> char { match self { - Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, - Utf32Str::Unicode(codepoints) => codepoints[idx as usize], + Utf32Str::Ascii(bytes) => bytes[n as usize] as char, + Utf32Str::Unicode(codepoints) => codepoints[n as usize], } } - pub fn last(self) -> char { + pub(crate) fn last(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], } } - pub fn first(self) -> char { + + pub(crate) fn first(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[0] as char, Utf32Str::Unicode(codepoints) => codepoints[0], } } + /// Returns an iterator over the characters in this string pub fn chars(self) -> Chars<'a> { match self { Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), @@ -169,6 +178,7 @@ impl<'a> Utf32Str<'a> { } } } + impl fmt::Debug for Utf32Str<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "\"")?; @@ -215,6 +225,7 @@ impl DoubleEndedIterator for Chars<'_> { } #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +/// An owned version of [`Utf32Str`]. pub enum Utf32String { /// A string represented as ASCII encoded bytes. /// Correctness invariant: must only contain valid ASCII (<=127) @@ -230,6 +241,7 @@ impl Default for Utf32String { } impl Utf32String { + /// Returns the number of characters in this string. #[inline] pub fn len(&self) -> usize { match self { @@ -237,6 +249,8 @@ impl Utf32String { Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), } } + + /// Returns whether this string is empty. #[inline] pub fn is_empty(&self) -> bool { match self { @@ -245,18 +259,18 @@ impl Utf32String { } } - /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. 
#[inline] - pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { let start = match range.start_bound() { - Bound::Included(&start) => start as usize, - Bound::Excluded(&start) => start as usize + 1, + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, Bound::Unbounded => 0, }; let end = match range.end_bound() { - Bound::Included(&end) => end as usize + 1, - Bound::Excluded(&end) => end as usize, + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, Bound::Unbounded => self.len(), }; match self { @@ -265,65 +279,28 @@ impl Utf32String { } } + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher. #[inline] - pub fn is_ascii(&self) -> bool { - matches!(self, Utf32String::Ascii(_)) - } - - #[inline] - pub fn get(&self, idx: u32) -> char { - match self { - Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, - Utf32String::Unicode(codepoints) => codepoints[idx as usize], - } - } - - #[inline] - pub fn last(&self) -> char { - match self { - Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, - Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], - } - } - - #[inline] - pub fn chars(&self) -> Chars<'_> { - match self { - Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), - Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), - } - } - - #[inline] - pub fn push_str(&mut self, text: &str) { - let mut codeboints = match take(self) { - Utf32String::Ascii(bytes) if text.is_ascii() => { - let mut bytes = bytes.into_string(); - bytes.push_str(text); - *self = Self::Ascii(bytes.into_boxed_str()); - return; - } - Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), - Utf32String::Unicode(codepoints) => Vec::from(codepoints), + pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { + let start = match 
range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, }; - codeboints.extend(chars::graphemes(text)); - *self = Utf32String::Unicode(codeboints.into_boxed_slice()); - } - - #[inline] - pub fn push(&mut self, c: char) { - let mut codeboints = match take(self) { - Utf32String::Ascii(bytes) if c.is_ascii() => { - let mut bytes = bytes.into_string(); - bytes.push(c); - *self = Self::Ascii(bytes.into_boxed_str()); - return; - } - Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), - Utf32String::Unicode(codepoints) => Vec::from(codepoints), + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len() as u32, }; - codeboints.push(c); - *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + match self { + Utf32String::Ascii(bytes) => { + Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize]) + } + Utf32String::Unicode(codepoints) => { + Utf32Str::Unicode(&codepoints[start as usize..end as usize]) + } + } } } @@ -367,21 +344,12 @@ impl<'a> From> for Utf32String { impl fmt::Debug for Utf32String { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "\"")?; - for c in self.chars() { - for c in c.escape_debug() { - write!(f, "{c}")? - } - } - write!(f, "\"") + write!(f, "{:?}", self.slice(..)) } } impl fmt::Display for Utf32String { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for c in self.chars() { - write!(f, "{c}")? 
- } - Ok(()) + write!(f, "{}", self.slice(..)) } } diff --git a/src/lib.rs b/src/lib.rs index cb87352..61f18db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,3 @@ -use std::cmp::Reverse; use std::ops::{Bound, RangeBounds}; use std::sync::atomic::{self, AtomicBool, Ordering}; use std::sync::Arc; @@ -7,13 +6,13 @@ use std::time::Duration; use parking_lot::Mutex; use rayon::ThreadPool; -pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; +use crate::pattern::MultiPattern; use crate::worker::Worker; -pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str, Utf32String}; +pub use nucleo_matcher::{chars, Config, Matcher, Utf32Str, Utf32String}; mod boxcar; mod par_sort; -mod pattern; +pub mod pattern; mod worker; pub struct Item<'a, T> { @@ -195,10 +194,9 @@ pub struct Nucleo { impl Nucleo { pub fn new( - config: MatcherConfig, + config: Config, notify: Arc<(dyn Fn() + Sync + Send)>, num_threads: Option, - case_matching: CaseMatching, columns: u32, ) -> Self { let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns); @@ -207,10 +205,10 @@ impl Nucleo { should_notify: worker.should_notify.clone(), items: worker.items.clone(), pool, - pattern: MultiPattern::new(&config, case_matching, columns as usize), + pattern: MultiPattern::new(columns as usize), snapshot: Snapshot { matches: Vec::with_capacity(2 * 1024), - pattern: MultiPattern::new(&config, case_matching, columns as usize), + pattern: MultiPattern::new(columns as usize), item_count: 0, items: worker.items.clone(), }, @@ -252,7 +250,7 @@ impl Nucleo { } } - pub fn update_config(&mut self, config: MatcherConfig) { + pub fn update_config(&mut self, config: Config) { self.worker.lock().update_config(config) } @@ -321,31 +319,3 @@ impl Drop for Nucleo { } } } - -/// convenience function to easily fuzzy match -/// on a (relatively small) list of inputs. 
This is not recommended for building a full tui -/// application that can match large numbers of matches as all matching is done on the current -/// thread, effectively blocking the UI -pub fn fuzzy_match>( - matcher: &mut Matcher, - pattern: &str, - items: impl IntoIterator, - case_matching: CaseMatching, -) -> Vec<(T, u32)> { - let mut pattern_ = Pattern::new(&matcher.config, case_matching); - pattern_.set_literal(pattern, PatternKind::Fuzzy, false); - if pattern_.is_empty() { - return items.into_iter().map(|item| (item, 0)).collect(); - } - let mut buf = Vec::new(); - let mut items: Vec<_> = items - .into_iter() - .filter_map(|item| { - pattern_ - .score(Utf32Str::new(item.as_ref(), &mut buf), matcher) - .map(|score| (item, score)) - }) - .collect(); - items.sort_by_key(|(_, score)| Reverse(*score)); - items -} diff --git a/src/pattern.rs b/src/pattern.rs index f9939c6..07620e9 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,188 +1,12 @@ -use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; +pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern}; +use nucleo_matcher::{Matcher, Utf32String}; #[cfg(test)] mod tests; -use crate::Utf32String; - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -#[non_exhaustive] -pub enum CaseMatching { - Ignore, - Smart, - Respect, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -#[non_exhaustive] -pub enum PatternKind { - Exact, - Fuzzy, - Substring, - Prefix, - Postfix, -} - -#[derive(Debug, PartialEq, Eq, Clone)] -struct PatternAtom { - kind: PatternKind, - needle: Utf32String, - invert: bool, - ignore_case: bool, -} -impl PatternAtom { - fn literal( - needle: &str, - normalize: bool, - case: CaseMatching, - kind: PatternKind, - escape_whitespace: bool, - ) -> PatternAtom { - let mut ignore_case; - let needle = if needle.is_ascii() { - let mut needle = if escape_whitespace { - if let Some((start, rem)) = needle.split_once("\\ ") { - let mut needle = start.to_owned(); - for rem in rem.split("\\ 
") { - needle.push(' '); - needle.push_str(rem); - } - needle - } else { - needle.to_owned() - } - } else { - needle.to_owned() - }; - - match case { - CaseMatching::Ignore => { - ignore_case = true; - needle.make_ascii_lowercase() - } - CaseMatching::Smart => { - ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) - } - CaseMatching::Respect => ignore_case = false, - } - - Utf32String::Ascii(needle.into_boxed_str()) - } else { - let mut needle_ = Vec::with_capacity(needle.len()); - ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); - if escape_whitespace { - let mut saw_backslash = false; - for mut c in chars::graphemes(needle) { - if saw_backslash { - if c == ' ' { - needle_.push(' '); - saw_backslash = false; - continue; - } else { - needle_.push('\\'); - } - } - saw_backslash = c == '\\'; - if normalize { - c = chars::normalize(c); - } - match case { - CaseMatching::Ignore => c = chars::to_lower_case(c), - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c) - } - CaseMatching::Respect => (), - } - needle_.push(c); - } - } else { - let chars = chars::graphemes(needle).map(|mut c| { - if normalize { - c = chars::normalize(c); - } - match case { - CaseMatching::Ignore => c = chars::to_lower_case(c), - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c); - } - CaseMatching::Respect => (), - } - c - }); - needle_.extend(chars); - }; - Utf32String::Unicode(needle_.into_boxed_slice()) - }; - PatternAtom { - kind, - needle, - invert: false, - ignore_case, - } - } - - fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom { - let mut atom = raw; - let invert = match atom.as_bytes() { - [b'!', ..] => { - atom = &atom[1..]; - true - } - [b'\\', b'!', ..] => { - atom = &atom[1..]; - false - } - _ => false, - }; - - let mut kind = match atom.as_bytes() { - [b'^', ..] => { - atom = &atom[1..]; - PatternKind::Prefix - } - [b'\'', ..] 
=> { - atom = &atom[1..]; - PatternKind::Substring - } - [b'\\', b'^' | b'\'', ..] => { - atom = &atom[1..]; - PatternKind::Fuzzy - } - _ => PatternKind::Fuzzy, - }; - - let mut append_dollar = false; - match atom.as_bytes() { - [.., b'\\', b'$'] => { - append_dollar = true; - atom = &atom[..atom.len() - 2] - } - [.., b'$'] => { - kind = if kind == PatternKind::Fuzzy { - PatternKind::Postfix - } else { - PatternKind::Exact - }; - atom = &atom[..atom.len() - 1] - } - _ => (), - } - - if invert && kind == PatternKind::Fuzzy { - kind = PatternKind::Substring - } - - let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true); - pattern.invert = invert; - if append_dollar { - pattern.needle.push('$'); - } - pattern - } -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Default)] pub enum Status { + #[default] Unchanged, Update, Rescore, @@ -190,7 +14,7 @@ pub enum Status { #[derive(Debug)] pub struct MultiPattern { - pub cols: Vec, + cols: Vec<(Pattern, Status)>, } impl Clone for MultiPattern { @@ -206,214 +30,64 @@ impl Clone for MultiPattern { } impl MultiPattern { - pub fn new( - matcher_config: &MatcherConfig, - case_matching: CaseMatching, - columns: usize, - ) -> MultiPattern { - MultiPattern { - cols: vec![Pattern::new(matcher_config, case_matching); columns], + /// Creates a multi pattern with `columns` empty column patterns. + pub fn new(columns: usize) -> Self { + Self { + cols: vec![Default::default(); columns], } } + /// Reparses a column. By specifying `append` the caller promises that text passed + /// to the previous `reparse` invocation is a prefix of `new_text`. This enables + /// additional optimizations but can lead to missing matches if an incorrect value + /// is passed. 
+ pub fn reparse( + &mut self, + column: usize, + new_text: &str, + case_matching: CaseMatching, + append: bool, + ) { + let old_status = self.cols[column].1; + if append + && old_status != Status::Rescore + && self.cols[column] + .0 + .atoms + .last() + .map_or(true, |last| !last.negative) + { + self.cols[column].1 = Status::Update; + } else { + self.cols[column].1 = Status::Rescore; + } + self.cols[column].0.reparse(new_text, case_matching); + } + pub(crate) fn status(&self) -> Status { self.cols .iter() - .map(|col| col.status) + .map(|&(_, status)| status) .max() .unwrap_or(Status::Unchanged) } pub(crate) fn reset_status(&mut self) { - for col in &mut self.cols { - col.status = Status::Unchanged + for (_, status) in &mut self.cols { + *status = Status::Unchanged } } pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { // TODO: wheight columns? let mut score = 0; - for (pattern, haystack) in self.cols.iter().zip(haystack) { + for ((pattern, _), haystack) in self.cols.iter().zip(haystack) { score += pattern.score(haystack.slice(..), matcher)? 
} Some(score) } -} - -#[derive(Debug)] -pub struct Pattern { - atoms: Vec, - case_matching: CaseMatching, - normalize: bool, - status: Status, -} - -impl Pattern { - pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern { - Pattern { - atoms: Vec::new(), - case_matching, - normalize: matcher_config.normalize, - status: Status::Unchanged, - } - } - pub fn new_fuzzy_literal( - matcher_config: &MatcherConfig, - case_matching: CaseMatching, - pattern: &str, - ) -> Pattern { - let mut res = Pattern { - atoms: Vec::new(), - case_matching, - normalize: matcher_config.normalize, - status: Status::Unchanged, - }; - res.set_literal(pattern, PatternKind::Fuzzy, false); - res - } - - pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - matcher.config.ignore_case = pattern.ignore_case; - let pattern_score = match pattern.kind { - PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), - PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), - PatternKind::Substring => { - matcher.substring_match(haystack, pattern.needle.slice(..)) - } - PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), - PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)), - }; - if pattern.invert { - if pattern_score.is_some() { - return None; - } - } else { - score += pattern_score? 
as u32 - } - } - Some(score) - } - - pub fn indices( - &self, - haystack: Utf32Str<'_>, - matcher: &mut Matcher, - indices: &mut Vec, - ) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - matcher.config.ignore_case = pattern.ignore_case; - if pattern.invert { - let pattern_score = match pattern.kind { - PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), - PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), - PatternKind::Substring => { - matcher.substring_match(haystack, pattern.needle.slice(..)) - } - PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), - PatternKind::Postfix => { - matcher.postfix_match(haystack, pattern.needle.slice(..)) - } - }; - if pattern_score.is_some() { - return None; - } - continue; - } - let pattern_score = match pattern.kind { - PatternKind::Exact => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Fuzzy => { - matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Substring => { - matcher.substring_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Prefix => { - matcher.prefix_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Postfix => { - matcher.postfix_indices(haystack, pattern.needle.slice(..), indices) - } - }; - score += pattern_score? 
as u32 - } - Some(score) - } - - pub fn parse_from(&mut self, pattern: &str, append: bool) { - let invert = self.atoms.last().map_or(false, |pat| pat.invert); - self.atoms.clear(); - let atoms = pattern_atoms(pattern).filter_map(|atom| { - let atom = PatternAtom::parse(atom, self.normalize, self.case_matching); - if atom.needle.is_empty() { - return None; - } - Some(atom) - }); - self.atoms.extend(atoms); - - self.status = if append && !invert && self.status != Status::Rescore { - Status::Update - } else { - Status::Rescore - }; - } - - pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) { - self.atoms.clear(); - let pattern = - PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false); - if !pattern.needle.is_empty() { - self.atoms.push(pattern); - } - self.status = if append && self.status != Status::Rescore { - Status::Update - } else { - Status::Rescore - }; - } pub fn is_empty(&self) -> bool { - self.atoms.is_empty() + self.cols.iter().all(|(pat, _)| pat.atoms.is_empty()) } } - -impl Clone for Pattern { - fn clone(&self) -> Self { - Self { - atoms: self.atoms.clone(), - case_matching: self.case_matching, - normalize: self.normalize, - status: self.status, - } - } - - fn clone_from(&mut self, source: &Self) { - self.atoms.clone_from(&source.atoms); - self.case_matching = source.case_matching; - self.normalize = source.normalize; - self.status = source.status; - } -} - -fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { - let mut saw_backslash = false; - pattern.split(move |c| { - saw_backslash = match c { - ' ' if !saw_backslash => return true, - '\\' => true, - _ => false, - }; - false - }) -} diff --git a/src/pattern/tests.rs b/src/pattern/tests.rs index 4eaeb40..3854e15 100644 --- a/src/pattern/tests.rs +++ b/src/pattern/tests.rs @@ -1,145 +1,14 @@ -use crate::pattern::{PatternAtom, Status}; -use crate::{CaseMatching, Pattern, PatternKind}; +use nucleo_matcher::pattern::CaseMatching; -fn parse_atom(pat: 
&str) -> PatternAtom { - parse_atom_with(pat, CaseMatching::Smart) -} - -fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom { - let mut pat = parse_with(pat, case_matching, false); - assert_eq!(pat.atoms.len(), 1); - pat.atoms.remove(0) -} - -fn parse_with(pat: &str, case_matching: CaseMatching, append: bool) -> Pattern { - let mut res = Pattern::new(&nucleo_matcher::MatcherConfig::DEFAULT, case_matching); - res.parse_from(pat, append); - res -} - -#[test] -fn negative() { - let pat = parse_atom("!foo"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!^foo"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!foo$"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!^foo$"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn pattern_kinds() { - let pat = parse_atom("foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Fuzzy); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("'foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("^foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("foo$"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("^foo$"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn case_matching() { - let pat = parse_atom_with("foo", CaseMatching::Smart); - assert!(pat.ignore_case); - 
assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom_with("Foo", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Foo", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom_with("Foo", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Foo", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Äxx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("你xx", CaseMatching::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = parse_atom_with("你xx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = parse_atom_with("Ⲽxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Ⲽxx"); - let pat = parse_atom_with("Ⲽxx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "ⲽxx"); -} - -#[test] -fn escape() { - let pat = parse_atom("foo\\ bar"); - assert_eq!(pat.needle.to_string(), "foo bar"); - let pat = parse_atom("\\!foo"); - assert_eq!(pat.needle.to_string(), "!foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("\\'foo"); - assert_eq!(pat.needle.to_string(), "'foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = 
parse_atom("\\^foo"); - assert_eq!(pat.needle.to_string(), "^foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("foo\\$"); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("^foo\\$"); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, PatternKind::Prefix); - let pat = parse_atom("\\^foo\\$"); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("\\!^foo\\$"); - assert_eq!(pat.needle.to_string(), "!^foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("!\\^foo\\$"); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, PatternKind::Substring); -} +use crate::pattern::{MultiPattern, Status}; #[test] fn append() { - let mut pat = parse_with("!", CaseMatching::Smart, true); - assert_eq!(pat.status, Status::Update); - pat.parse_from("!f", true); - assert_eq!(pat.status, Status::Update); - pat.parse_from("!fo", true); - assert_eq!(pat.status, Status::Rescore); + let mut pat = MultiPattern::new(1); + pat.reparse(0, "!", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Update); + pat.reparse(0, "!f", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Update); + pat.reparse(0, "!fo", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Rescore); } diff --git a/src/worker.rs b/src/worker.rs index e343be1..478dce7 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -3,7 +3,7 @@ use std::mem::take; use std::sync::atomic::{self, AtomicBool, AtomicU32}; use std::sync::Arc; -use nucleo_matcher::MatcherConfig; +use nucleo_matcher::Config; use parking_lot::Mutex; use rayon::{prelude::*, ThreadPool}; @@ -42,15 +42,15 @@ impl Worker { pub(crate) fn item_count(&self) -> u32 { self.last_snapshot - self.in_flight.len() as u32 } - pub(crate) fn update_config(&mut self, config: MatcherConfig) { + pub(crate) fn update_config(&mut self, config: Config) { 
for matcher in self.matchers.0.iter_mut() { - matcher.get_mut().config = config; + matcher.get_mut().config = config.clone(); } } pub(crate) fn new( worker_threads: Option, - config: MatcherConfig, + config: Config, notify: Arc<(dyn Fn() + Sync + Send)>, cols: u32, ) -> (ThreadPool, Self) { @@ -62,7 +62,7 @@ impl Worker { .build() .expect("creating threadpool failed"); let matchers = (0..worker_threads) - .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config))) + .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config.clone()))) .collect(); let worker = Worker { running: false, @@ -70,7 +70,7 @@ impl Worker { last_snapshot: 0, matches: Vec::new(), // just a placeholder - pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0), + pattern: MultiPattern::new(cols as usize), canceled: Arc::new(AtomicBool::new(false)), should_notify: Arc::new(AtomicBool::new(false)), was_canceled: false, @@ -162,7 +162,7 @@ impl Worker { } // TODO: be smarter around reusing past results for rescoring - if self.pattern.cols.iter().all(|pat| pat.is_empty()) { + if self.pattern.is_empty() { self.reset_matches(); self.process_new_items_trivial(); if self.should_notify.load(atomic::Ordering::Relaxed) { diff --git a/typos.toml b/typos.toml index 900e3df..14fc504 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,3 @@ default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"] [files] -extend-exclude = ["matcher/src/tests.rs", "*.html"] +extend-exclude = ["matcher/src/tests.rs","src/pattern/tests.rs", "*.html"] From 20bf02f0acf85fabe31df282239f5dbe3eeef980 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Mon, 28 Aug 2023 01:50:55 +0200 Subject: [PATCH 6/6] Prepare for 0.2 release Co-authored-by: Michael Davis --- CHANGELOG.md | 10 +++ Cargo.lock | 4 +- Cargo.toml | 4 +- README.md | 11 ++-- bench/Cargo.toml | 2 +- matcher/Cargo.toml | 2 +- matcher/src/chars.rs | 2 +- matcher/src/lib.rs | 139 ++++++++++++++++++++++++++++------------- matcher/src/pattern.rs | 89 
+++++++++++++++----------- src/lib.rs | 83 +++++++++++++++++++++--- src/pattern.rs | 2 +- 11 files changed, 246 insertions(+), 102 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..938534f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog + +## nucleo-matcher + +# [0.2.0] - 2023-09-01 + +*initial public release* + + +[0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0 diff --git a/Cargo.lock b/Cargo.lock index 9590fda..f729731 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -152,7 +152,7 @@ dependencies = [ [[package]] name = "nucleo" -version = "0.1.1" +version = "0.2.0" dependencies = [ "nucleo-matcher", "parking_lot", @@ -161,7 +161,7 @@ dependencies = [ [[package]] name = "nucleo-matcher" -version = "0.1.0" +version = "0.2.0" dependencies = [ "cov-mark", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 43e6806..e100e7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "nucleo" description = "plug and play high performance fuzzy matcher" authors = ["Pascal Kuthe "] -version = "0.1.1" +version = "0.2.0" edition = "2021" license = "MPL-2.0" repository = "https://github.com/helix-editor/nucleo" @@ -11,7 +11,7 @@ readme = "README.md" [lib] [dependencies] -nucleo-matcher = { version = "0.1", path = "matcher" } +nucleo-matcher = { version = "0.2.0", path = "matcher" } parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]} rayon = "1.7.0" diff --git a/README.md b/README.md index 374b2f5..134aaf6 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,5 @@ # Nucleo -> Disclaimer: An 0.1 version has been published to crates.io. -> This allows us to merge the `nucleo` integration into helix. -> However, the public API is not yet final and will likely -> change quite a bit in the next release. The documentation -> is also not yet complete `nucleo` is a highly performant fuzzy matcher written in rust. 
It aims to fill the same use case as `fzf` and `skim`. Compared to `fzf` `nucleo` has a significantly faster matching algorithm. This mainly makes a difference when matching patterns with low selectivity on many items. An (unscientific) comparison is shown in the benchmark section below. @@ -14,6 +9,12 @@ Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly). +## Status + +Nucleo is used in the helix-editor and therefore has a large user base with lots of real-world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use; breaking changes should be very rare (a 1.0 release should not be far away). + +While the high level `nucleo` crate also works well (and is also used in helix), there are still additional features that will be added in the future. The high level crate also needs better documentation and will likely see a few API changes in the future.
+ ## Benchmarks > WIP currently more of a demonstration than a comprehensive benchmark suit diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 3c0c356..6df9706 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -nucleo = { version = "0.1", path = "../" } +nucleo = { version = "0.2", path = "../" } brunch = "0.5.0" fuzzy-matcher = "0.3.7" walkdir = "2" \ No newline at end of file diff --git a/matcher/Cargo.toml b/matcher/Cargo.toml index eee20da..663a493 100644 --- a/matcher/Cargo.toml +++ b/matcher/Cargo.toml @@ -2,7 +2,7 @@ name = "nucleo-matcher" description = "plug and play high performance fuzzy matcher" authors = ["Pascal Kuthe "] -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MPL-2.0" repository = "https://github.com/helix-editor/nucleo" diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index 710c212..9b3bc69 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -169,7 +169,7 @@ pub(crate) enum CharClass { /// Nucleo cannot match graphemes as single units. To work around /// that we only use the first codepoint of each grapheme. This /// iterator returns the first character of each unicode grapheme -// in a string and is used for constructing `Utf32Str(ing)`. +/// in a string and is used for constructing `Utf32Str(ing)`. pub fn graphemes(text: &str) -> impl Iterator + '_ { text.graphemes(true).map(|grapheme| { grapheme diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 7feff93..c38cffa 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -4,8 +4,58 @@ used by the high level `nucleo` crate. The matcher is hightly optimized and can significantly outperform `fzf` and `skim` (the `fuzzy-matcher` crate). However some of these optimizations require -a slightly less convenient API. 
Be sure to carefully read the documentation of the -[`Matcher`] to avoid unexpected behaviour.. +a slightly less convenient API. Be sure to carefully read the documentation of +the [`Matcher`] to avoid unexpected behaviour. +# Examples + +For almost all usecases the [`pattern`] API should be used instead of calling +the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will +construct a single Atom (a single match operation) for each word. The pattern +can contain special characters to control what kind of match is performed (see +[`AtomKind`](crate::pattern::AtomKind)). + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, CaseMatching}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]); +``` + +If the pattern should be matched literally (without this special parsing) +[`Pattern::new`](pattern::Pattern::new) can be used instead. 
+ +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let paths = ["^foo/bar", "bar/^foo", "foobar"]; +let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]); +``` + +If word segmentation is also not desired, a single `Atom` can be constructed directly. + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind}; +let paths = ["foobar", "foo bar"]; +let mut matcher = Matcher::new(Config::DEFAULT); +let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo bar", 192)]); +``` + + +# Status + +Nucleo is used in the helix-editor and therefore has a large user base with lots of real-world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use; breaking changes should be very rare (a 1.0 release should not be far away). + */ // sadly ranges don't optmimzie well @@ -40,46 +90,44 @@ use crate::matrix::MatrixSlab; /// matching. This scratch memory allows the matcher to guarantee that it will /// **never allocate** during matching (with the exception of pushing to the /// `indices` vector if there isn't enough capacity). However this scratch -/// memory is fairly large (around 135KB) so creating a matcher is expensive and -/// should be reused. +/// memory is fairly large (around 135KB) so creating a matcher is expensive.
/// -/// All `.._match` functions will not compute the indices of the matched chars -/// and are therefore significantly faster. These should be used to prefitler -/// and sort all matches. All `.._indices` functions will compute the indices of -/// the computed chars. These should be used when rendering the best N matches. -/// Note that the `indices` argument is **never cleared**. This allows running -/// multiple different matches on the same haystack and merging the indices by -/// sorting and deduplicating the vector. +/// All `.._match` functions will not compute the indices of the matched +/// characters. These should be used to filter and rank all +/// matches. All `.._indices` functions will also compute the indices of the +/// matched characters but are slower compared to the `.._match` variants. These +/// should be used when rendering the best N matches. Note that the `indices` +/// argument is **never cleared**. This allows running multiple different +/// matches on the same haystack and merging the indices by sorting and +/// deduplicating the vector. /// -/// The `needle` argument for each function must always be normalized by the caller -/// (unicode normalization and case folding if a case insesnitive match is produced). -/// Otherwise, the matcher may fail to produce a match. The [`pattern`] modules -/// provides utilities to preprocess needles. +/// The `needle` argument for each function must always be normalized by the +/// caller (unicode normalization and case folding). Otherwise, the matcher +/// may fail to produce a match. The [`pattern`] module provides utilities +/// to preprocess needles and **should usually be preferred over invoking the +/// matcher directly**. Additionally it's recommended to perform separate matches +/// for each word in the needle. Consider the following example: /// -/// Additionally it's recommend to perform separate matches for each word in -/// the needle.
Consider the folloling example: If `foo bar` as used at the -/// needle it matches both `foo test baaar` and `foo hello-world bar`. However, -/// `foo test baaar` will receive a lower score/rank lower. `baaar` contains a -/// 2 character gap which will receive a penalty and therefore the user will -/// likely expect it to rank lower. However, if `foo bar` is matched as a single -/// query `hello-world` and `test` are both considered gaps too. As `hello- -/// world` is a much longer gap then `test` the extra penalty for `baaar` is -/// outweigh. If both words are matched individually the interspersed words -/// do not receive a penalty and `foo hello-world bar` ranks higher. -/// -/// In general nucleo is a **substring matching tool** with no penalty assigned -/// to matches that start later within the same pattern (which enables the -/// usecase shown above). This may be undesirable in one very particular usecase: -/// For automatic suggestions for commands (like a shell). In these case the -/// assumption is that the user is actually typing the full haystack. In other words: -/// The matcher should prefer a prefix match. To accomedate that usecase the -/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set -/// to true. Note that the penalty given is quite small (and capped to a maximum) -/// to avoid overwriting the normal scoring heuristic. +/// If `foo bar` is used as the needle it matches both `foo test baaar` and +/// `foo hello-world bar`. However, `foo test baaar` will receive a higher +/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which +/// will receive a penalty and therefore the user will likely expect it to rank +/// lower. However, if `foo bar` is matched as a single query `hello-world` and +/// `test` are both considered gaps too. As `hello-world` is a much longer gap +/// than `test` the extra penalty for `baaar` is canceled out.
If both words +/// are matched individually the interspersed words do not receive a penalty and +/// `foo hello-world bar` ranks higher. /// +/// In general nucleo is a **substring matching tool** (except for the prefix/ +/// postfix matching modes) with no penalty assigned to matches that start +/// later within the same pattern (which enables matching words individually +/// as shown above). If patterns show a large variety in length and the syntax +/// described above is not used it may be preferable to give preference to +/// matches closer to the start of a haystack. To accommodate that usecase the +/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true. /// /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than -/// that the matcher *will panic*. The caller must decide whether it wants to +/// that the matcher **will panic**. The caller must decide whether it wants to /// filter out long haystacks or truncate them. pub struct Matcher { #[allow(missing_docs)] @@ -115,9 +163,9 @@ impl Default for Matcher { } impl Matcher { - /// Creates a new matcher instance, note that this will eagerly allocate - /// a fairly large chunk of heap memory (135KB currently but subject to - /// change) so matchers should be reused if used in a loop. + /// Creates a new matcher instance, note that this will eagerly allocate a + /// fairly large chunk of heap memory (around 135KB currently but subject to + /// change) so matchers should be reused if called often (like in a loop). pub fn new(config: Config) -> Self { Self { config, @@ -127,9 +175,10 @@ impl Matcher { /// Find the fuzzy match with the highest score in the `haystack`. /// - /// This functions has `O(mn)` time complexity for short inputs. To - /// avoid slowdowns it automatically falls back to [greedy matching] - /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks + /// This functions has `O(mn)` time complexity for short inputs. 
+ /// To avoid slowdowns it automatically falls back to + /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large + /// needles and haystacks. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { @@ -261,7 +310,7 @@ impl Matcher { /// Greedly find a fuzzy match in the `haystack`. /// /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) - /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should + /// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should /// be preferred. /// /// See the [matcher documentation](crate::Matcher) for more details. @@ -277,7 +326,7 @@ impl Matcher { /// Greedly find a fuzzy match in the `haystack` and compute its indices. /// /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) - /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should + /// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should /// be preferred. /// /// See the [matcher documentation](crate::Matcher) for more details. @@ -361,7 +410,7 @@ impl Matcher { /// Finds the substring match with the highest score in the `haystack`. /// /// This functions has `O(nm)` time complexity. However many cases can - /// be significantly accelerated using prefilters so it's usually fast + /// be significantly accelerated using prefilters so it's usually very fast /// in practice. /// /// See the [matcher documentation](crate::Matcher) for more details. 
diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs index 3583ebe..1d6f3bf 100644 --- a/matcher/src/pattern.rs +++ b/matcher/src/pattern.rs @@ -11,28 +11,28 @@ use crate::Utf32String; #[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[non_exhaustive] -/// How nucleo will treat case mismatch +/// How to treat a case mismatch between two characters. pub enum CaseMatching { - /// Characters always match their case folded version (`a == A`) + /// Characters always match their case folded version (`a == A`). Ignore, - /// Characters never match their case folded version (`a != A`) + /// Characters never match their case folded version (`a != A`). Respect, - /// Acts like `Ignore` if all characters in a pattern atom are - /// lowercase and like `Respect` otherwire + /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are + /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise. #[default] Smart, } #[derive(Debug, PartialEq, Eq, Clone, Copy)] #[non_exhaustive] -/// The kind of matching algorithm to run for this atom +/// The kind of matching algorithm to run for an atom. pub enum AtomKind { /// Fuzzy matching where the needle must match any haystack characters /// (match can contain gaps). This atom kind is used by default if no /// special syntax is used. There is no negated fuzzy matching (too /// many false positives). /// - /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). + /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match). Fuzzy, /// The needle must match a contiguous sequence of haystack characters /// without gaps. This atom kind is parsed from the following syntax: @@ -41,8 +41,8 @@ pub enum AtomKind { /// See also [`Matcher::substring_match`](crate::Matcher::substring_match). Substring, /// The needle must match all leading haystack characters without gaps or - /// prefix. This atom kind is parsed from the following syntax: `foo$` and - /// `!foo$` (negated). 
+ /// prefix. This atom kind is parsed from the following syntax: `^foo` and + /// `!^foo` (negated). /// /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match). Prefix, @@ -56,7 +56,7 @@ pub enum AtomKind { /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$` /// (negated). /// - /// See also [`Matcher::exact_match`] (crate::Matcher::exact_match). + /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). Exact, } @@ -74,8 +74,9 @@ pub struct Atom { } impl Atom { - /// Creates a single [`PatternAtom`] from a string by performing unicode - /// normalization + /// Creates a single [`Atom`] from a string by performing unicode + /// normalization and case folding (if necessary). Optionally `\ ` can + /// be escaped to ` `. pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom { Atom::new_inner(needle, case, kind, escape_whitespace, false) } @@ -254,12 +255,14 @@ impl Atom { } /// Matches this pattern against `haystack` (using the allocation and - /// configuration from `matcher`), calculates a ranking score and the matche + /// configuration from `matcher`), calculates a ranking score and the match /// indices. See the [`Matcher`](crate::Matcher). Documentation for more /// details. /// /// *Note:* The `ignore_case` setting is overwritten to match the casing of - /// this pattern atom. + /// each pattern atom. + /// + /// *Note:* The `indices` vector is not cleared by this function. pub fn indices( &self, haystack: Utf32Str<'_>, @@ -299,15 +302,18 @@ impl Atom { pub fn needle_text(&self) -> Utf32Str<'_> { self.needle.slice(..) } - /// Convenience function to easily match on a (relatively small) list of - /// inputs. This is not recommended for building a full fuzzy matching - /// application that can match large numbers of matches (like all files in - /// a directory) as all matching is done on the current thread, effectively - /// blocking the UI. 
+ /// Convenience function to easily match (and sort) a (relatively small) + /// list of inputs. + /// + /// *Note* This function is not recommended for building a full fuzzy + /// matching application that can match large numbers of matches (like all + /// files in a directory) as all matching is done on the current thread, + /// effectively blocking the UI. For such applications the high level + /// `nucleo` crate can be used instead. pub fn match_list>( &self, - matcher: &mut Matcher, items: impl IntoIterator, + matcher: &mut Matcher, ) -> Vec<(T, u16)> { if self.needle.is_empty() { return items.into_iter().map(|item| (item, 0)).collect(); @@ -338,7 +344,7 @@ fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { } #[derive(Debug, Default)] -/// A fuzzy match pattern +/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom). #[non_exhaustive] pub struct Pattern { /// The individual pattern (words) in this pattern @@ -348,9 +354,9 @@ pub struct Pattern { impl Pattern { /// Creates a pattern where each word is matched individually (whitespaces /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, ' - /// and ^ don't receive special treatment). If you want to match the entiru - /// pattern as a single needle use a single [`PatternAtom`] instead - pub fn new(case_matching: CaseMatching, kind: AtomKind, pattern: &str) -> Pattern { + /// and ^ don't receive special treatment). If you want to match the entire + /// pattern as a single needle use a single [`Atom`] instead. + pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern { let atoms = pattern_atoms(pattern) .filter_map(|pat| { let pat = Atom::new(pat, case_matching, kind, true); @@ -361,9 +367,9 @@ impl Pattern { } /// Creates a pattern where each word is matched individually (whitespaces /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will - /// cause different matching behaviour (see [`PatternAtomKind`]). 
These can be + /// cause different matching behaviour (see [`AtomKind`]). These can be /// escaped with backslash. - pub fn parse(case_matching: CaseMatching, pattern: &str) -> Pattern { + pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern { let atoms = pattern_atoms(pattern) .filter_map(|pat| { let pat = Atom::parse(pat, case_matching); @@ -373,15 +379,18 @@ impl Pattern { Pattern { atoms } } - /// Convenience function to easily match on a (relatively small) list of - /// inputs. This is not recommended for building a full fuzzy matching - /// application that can match large numbers of matches (like all files in - /// a directory) as all matching is done on the current thread, effectively - /// blocking the UI. + /// Convenience function to easily match (and sort) a (relatively small) + /// list of inputs. + /// + /// *Note* This function is not recommended for building a full fuzzy + /// matching application that can match large numbers of matches (like all + /// files in a directory) as all matching is done on the current thread, + /// effectively blocking the UI. For such applications the high level + /// `nucleo` crate can be used instead. pub fn match_list>( &self, - matcher: &mut Matcher, items: impl IntoIterator, + matcher: &mut Matcher, ) -> Vec<(T, u32)> { if self.atoms.is_empty() { return items.into_iter().map(|item| (item, 0)).collect(); @@ -416,7 +425,7 @@ impl Pattern { } /// Matches this pattern against `haystack` (using the allocation and - /// configuration from `matcher`), calculates a ranking score and the matche + /// configuration from `matcher`), calculates a ranking score and the match /// indices. See the [`Matcher`](crate::Matcher). Documentation for more /// details. /// @@ -424,8 +433,16 @@ impl Pattern { /// each pattern atom. /// /// *Note:* The indices for each pattern are calculated individually - /// and simply appended to the `indices` vector. 
This allows + /// and simply appended to the `indices` vector and not deduplicated/sorted. + /// This allows associating the match indices to their source pattern. If + /// required (like for highlighting) unique/sorted indices can be obtained + /// as follows: /// + /// ``` + /// # let mut indices: Vec = Vec::new(); + /// indices.sort_unstable(); + /// indices.dedup(); + /// ``` pub fn indices( &self, haystack: Utf32Str<'_>, @@ -442,7 +459,9 @@ impl Pattern { Some(score) } - /// Refreshes this pattern by reparsing a + /// Refreshes this pattern by reparsing it from a string. This is mostly + /// equivalent to just constructing a new pattern using [`Pattern::parse`] + /// but is slightly more efficient by reusing some allocations. pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) { self.atoms.clear(); let atoms = pattern_atoms(pattern).filter_map(|atom| { diff --git a/src/lib.rs b/src/lib.rs index 61f18db..dfec2d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,31 @@ +/*! +`nucleo` is a high level crate that provides a high level matcher API that +provides a highly effective (parallel) matcher worker. It's designed to allow +quickly plugging a fully featured (and faster) fzf/skim like fuzzy matcher into +your TUI application. + +It's designed to run matching on a background threadpool while providing a +snapshot of the last complete match. That means the matcher can update the +results live while the user is typing while never blocking the main UI thread +(beyond a user provided timeout). Nucleo also supports fully concurrent lock-free +(and wait-free) streaming of input items. + +The [`Nucleo`] struct serves as the main API entrypoint for this crate. + +# Status + +Nucleo is used in the helix-editor and therefore has a large user base with lots +of real world testing. The core matcher implementation is considered complete +and is unlikely to see major changes.
The `nucleo-matcher` crate is finished and +ready for widespread use, breaking changes should be very rare (a 1.0 release +should not be far away). + +While the high level `nucleo` crate also works well (and is also used in helix), +there are still additional features that will be added in the future. The high +level crate also needs better documentation and will likely see a few API +changes in the future. + +*/ use std::ops::{Bound, RangeBounds}; use std::sync::atomic::{self, AtomicBool, Ordering}; use std::sync::Arc; @@ -15,11 +43,16 @@ mod par_sort; pub mod pattern; mod worker; +/// A match candidate stored in a [`Nucleo`] worker. pub struct Item<'a, T> { pub data: &'a T, pub matcher_columns: &'a [Utf32String], } +/// A handle that allows adding new items to a [`Nucleo`] worker. +/// +/// It's internally reference counted and can be cheaply cloned +/// and sent across threads. pub struct Injector { items: Arc>, notify: Arc<(dyn Fn() + Sync + Send)>, @@ -35,15 +68,17 @@ impl Clone for Injector { } impl Injector { - /// Appends an element to the back of the vector. + /// Appends an element to the list of matched items. + /// This function is lock-free and wait-free. pub fn push(&self, value: T, fill_columns: impl FnOnce(&mut [Utf32String])) -> u32 { let idx = self.items.push(value, fill_columns); (self.notify)(); idx } - /// Returns the total number of items in the current - /// queue + /// Returns the total number of items injected in the matcher. This might + /// not match the number of items in the match snapshot (if the matcher + /// is still running). pub fn injected_items(&self) -> u32 { self.items.count() } @@ -66,18 +101,24 @@ impl Injector { } } +/// An [item](crate::Item) that was successfully matched by a [`Nucleo`] worker. #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub struct Match { pub score: u32, pub idx: u32, } +/// The status of a [`Nucleo`] worker after a match.
#[derive(PartialEq, Eq, Debug, Clone, Copy)] pub struct Status { + /// Whether the current snapshot has changed. pub changed: bool, + /// Whether the matcher is still processing in the background. pub running: bool, } +/// A snapshot represents the results of a [`Nucleo`] worker after +/// finishing a [`tick`](Nucleo::tick). pub struct Snapshot { item_count: u32, matches: Vec, @@ -178,6 +219,8 @@ impl Snapshot { } } +/// A high level matcher worker that quickly computes matches in a background +/// threadpool. pub struct Nucleo { // the way the API is build we totally don't actually need these to be Arcs // but this lets us avoid some unsafe @@ -189,10 +232,27 @@ pub struct Nucleo { items: Arc>, notify: Arc<(dyn Fn() + Sync + Send)>, snapshot: Snapshot, + /// The pattern matched by this matcher. To update the match pattern + /// [`MultiPattern::reparse`](`pattern::MultiPattern::reparse`) should be used. + /// Note that the matcher worker will only become aware of the new pattern + /// after a call to [`tick`](Nucleo::tick). pub pattern: MultiPattern, } impl Nucleo { + /// Constructs a new `nucleo` worker threadpool with the provided `config`. + /// + /// `notify` is called every time new information is available and + /// [`tick`](Nucleo::tick) should be called. Note that `notify` is not + /// debounced, that should be handled by the downstream crate (for example + /// debouncing to only redraw at most every 1/60 seconds). + /// + /// If `None` is passed for the number of worker threads, nucleo will use + /// one thread per hardware thread. + /// + /// Nucleo can match items with multiple orthogonal properties. `columns` + /// indicates how many matching columns each item (and the pattern) has. The + /// number of columns cannot be changed after construction. pub fn new( config: Config, notify: Arc<(dyn Fn() + Sync + Send)>, @@ -218,11 +278,12 @@ impl Nucleo { } } - /// Returns a snapshot of all items + /// Returns a snapshot of the current matcher state.
pub fn snapshot(&self) -> &Snapshot { &self.snapshot } + /// Returns an injector that can be used for adding candidates to the matcher. pub fn injector(&self) -> Injector { Injector { items: self.items.clone(), @@ -230,11 +291,11 @@ impl Nucleo { } } - /// Restart the the item stream. Removes all items disconnects all - /// previously created injectors from this instance. If `clear_snapshot` is - /// `true` then all items and matched are removed from the - /// [`Snapshot`](crate::Snapshot) immediately. Otherwise the snapshot will - /// keep the current matches until the matcher has run again. + /// Restart the item stream. Removes all items and disconnects all + /// previously created injectors from this instance. If `clear_snapshot` + /// is `true` then all items and matches are removed from the + /// [`Snapshot`](crate::Snapshot) immediately. Otherwise the snapshot will + /// keep the current matches until the matcher has run again. /// /// # Note /// @@ -254,6 +315,10 @@ impl Nucleo { self.worker.lock().update_config(config) } + /// The main way to interact with the matcher, this should be called + /// regularly (for example each time a frame is rendered). To avoid + /// excessive redraws this method will wait `timeout` milliseconds for the + /// worker thread to finish. It is recommended to set the timeout to 10ms. pub fn tick(&mut self, timeout: u64) -> Status { self.should_notify.store(false, atomic::Ordering::Relaxed); let status = self.pattern.status(); diff --git a/src/pattern.rs b/src/pattern.rs index 07620e9..71f0700 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -5,7 +5,7 @@ use nucleo_matcher::{Matcher, Utf32String}; mod tests; #[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Default)] -pub enum Status { +pub(crate) enum Status { #[default] Unchanged, Update,