From b38fdfa8d7ea23d384651335d9bbb15fd23e84a6 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 27 Aug 2023 16:21:51 +0200 Subject: [PATCH] add option to prefer prefix matches --- Cargo.lock | 2 +- matcher/src/config.rs | 9 +++++++ matcher/src/fuzzy_optimal.rs | 40 ++++++++++++++++++++++++------- matcher/src/lib.rs | 34 ++++++++++++++++++++++---- matcher/src/score.rs | 14 ++++++++++- matcher/src/tests.rs | 46 +++++++++++++++++++++++++++++++++++- 6 files changed, 129 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24abffc..9590fda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -152,7 +152,7 @@ dependencies = [ [[package]] name = "nucleo" -version = "0.1.0" +version = "0.1.1" dependencies = [ "nucleo-matcher", "parking_lot", diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 7065262..67e07b7 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -15,6 +15,14 @@ pub struct MatcherConfig { pub normalize: bool, /// whether to ignore casing pub ignore_case: bool, + /// Whether to provide a bonus to matches by their distance from the start + /// of the haystack. The bonus is fairly small compared to the normal gap + /// penalty to avoid messing with the normal score heuristic. This setting + /// is not turned on by default and only recommended for autocompletion + /// usecases where the expectation is that the user is typing the entire + /// match. For a full fzf-like fuzzy matcher/picker word segmentation and + /// explicit prefix literals should be used instead. 
+ pub prefer_prefix: bool, } impl MatcherConfig { @@ -26,6 +34,7 @@ impl MatcherConfig { initial_char_class: CharClass::Whitespace, normalize: true, ignore_case: true, + prefer_prefix: false, } }; } diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs index 65fe95b..f007d79 100644 --- a/matcher/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -3,8 +3,8 @@ use std::cmp::max; use crate::chars::{Char, CharClass}; use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; use crate::score::{ - BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, - PENALTY_GAP_START, SCORE_MATCH, + BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, + PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, }; use crate::{Matcher, MatcherConfig}; @@ -35,7 +35,7 @@ impl Matcher { .checked_sub(1) .map(|i| haystack[i].char_class(&self.config)) .unwrap_or(self.config.initial_char_class); - let matched = matrix.setup::(needle, prev_class, &self.config); + let matched = matrix.setup::(needle, prev_class, &self.config, start as u32); // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects if !matched { assert!( @@ -117,6 +117,7 @@ impl MatcherDataView<'_, H> { needle: &[N], mut prev_class: CharClass, config: &MatcherConfig, + start: u32, ) -> bool where H: PartialEq, @@ -167,6 +168,17 @@ impl MatcherDataView<'_, H> { 0, needle[0], needle[1], + if config.prefer_prefix { + if start == 0 { + MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE + } else { + (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub( + (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION, + ) + } + } else { + 0 + }, ); true } @@ -182,6 +194,7 @@ impl MatcherDataView<'_, H> { needle_idx: u16, needle_char: N, next_needle_char: N, + mut prefix_bonus: u16, ) where H: PartialEq, { @@ -198,15 +211,19 @@ impl MatcherDataView<'_, H> { for (((&c, bonus), score_cell), 
matrix_cell) in skipped_col_iter { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { - if c == needle_char { + let cell = if c == needle_char { ScoreCell { - score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, + score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, matched: false, consecutive_bonus: *bonus, } } else { UNMATCHED - } + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell } else { *score_cell }; @@ -224,15 +241,19 @@ impl MatcherDataView<'_, H> { for (((c, bonus), score_cell), matrix_cell) in col_iter { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { - if c[0] == needle_char { + let cell = if c[0] == needle_char { ScoreCell { - score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, + score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, matched: false, consecutive_bonus: bonus[0], } } else { UNMATCHED - } + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell } else { *score_cell }; @@ -271,6 +292,7 @@ impl MatcherDataView<'_, H> { needle_idx as u16 + 1, needle_char, next_needle_char, + 0, ); let len = self.current_row.len() + needle_idx + 1 - row_off as usize; matrix_cells = &mut matrix_cells[len..]; diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index f966b1a..6aea293 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -1,12 +1,11 @@ /*! `nucleo_matcher` is a low level crate that contains the matcher implementation -used by the other nucleo crates. +used by the high level `nucleo` crate. The matcher is hightly optimized and can significantly outperform `fzf` and `skim` (the `fuzzy-matcher` crate). However some of these optimizations require -a slightly less convenient API. 
Particularly, `nucleo_matcher` requires that -needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead -of rusts normal utf32 strings. +a slightly less convenient API. Be sure to carefully read the documentation of the +[`Matcher`] to avoid unexpected behaviour. */ // sadly ranges don't optmimzie well @@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab; /// multiple different matches on the same haystack and merging the indices by /// sorting and deduplicating the vector. /// +/// The `needle` argument for each function must always be normalized by the caller +/// (unicode normalization and case folding if a case-insensitive match is produced). +/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module +/// provides utilities to preprocess needles. +/// +/// Additionally it's recommended to perform separate matches for each word in +/// the needle. Consider the following example: If `foo bar` is used as the +/// needle it matches both `foo test baaar` and `foo hello-world bar`. However, +/// `foo test baaar` will receive the higher score. `baaar` contains a +/// 2 character gap which will receive a penalty and therefore the user will +/// likely expect it to rank lower. However, if `foo bar` is matched as a single +/// query `hello-world` and `test` are both considered gaps too. As +/// `hello-world` is a much longer gap than `test` the extra penalty for `baaar` is +/// outweighed. If both words are matched individually the interspersed words +/// do not receive a penalty and `foo hello-world bar` ranks higher. +/// +/// In general nucleo is a **substring matching tool** with no penalty assigned +/// to matches that start later within the same pattern (which enables the +/// usecase shown above). This may be undesirable in one very particular usecase: +/// For automatic suggestions for commands (like a shell). In this case the +/// assumption is that the user is actually typing the full haystack. 
In other words: +/// The matcher should prefer a prefix match. To accommodate that usecase the +/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set +/// to true. Note that the penalty given is quite small (and capped to a maximum) +/// to avoid overriding the normal scoring heuristic. +/// +/// /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than /// that the matcher *will panic*. The caller must decide whether it wants to /// filter out long haystacks or truncate them. diff --git a/matcher/src/score.rs b/matcher/src/score.rs index 7a7c0c3..eba054b 100644 --- a/matcher/src/score.rs +++ b/matcher/src/score.rs @@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig}; pub(crate) const SCORE_MATCH: u16 = 16; pub(crate) const PENALTY_GAP_START: u16 = 3; pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; +/// If the prefer_prefix option is enabled we want to penalize +/// the initial gap. The prefix bonus should not be too large. +pub(crate) const PREFIX_BONUS_SCALE: u16 = 2; +pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY; // We prefer matches at the beginning of a word, but the bonus should not be // too great to prevent the longer acronym matches from always winning over @@ -140,7 +144,15 @@ impl Matcher { } prev_class = class; } - + if self.config.prefer_prefix { + if start != 0 { + let penalty = PENALTY_GAP_START + + PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16; + score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE); + } else { + score += MAX_PREFIX_BONUS; + } + } score } } diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index d2bfaea..691230c 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -1,7 +1,7 @@ use crate::chars::Char; use crate::score::{ BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, - PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, + MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, 
}; use crate::utf32_str::Utf32Str; use crate::{Matcher, MatcherConfig}; @@ -23,11 +23,13 @@ fn assert_matches( normalize: bool, case_sensitive: bool, path: bool, + prefer_prefix: bool, cases: &[(&str, &str, &[u32], u16)], ) { let mut config = MatcherConfig { normalize, ignore_case: !case_sensitive, + prefer_prefix, ..MatcherConfig::DEFAULT }; if path { @@ -142,6 +144,7 @@ fn test_fuzzy() { false, false, false, + false, &[ ( "fooBarbaz1", @@ -250,6 +253,7 @@ fn empty_needle() { false, false, false, + false, &[("foo bar baz", "", &[], 0)], ); } @@ -261,6 +265,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo bar baz", @@ -287,6 +292,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo bar baz", @@ -313,6 +319,7 @@ fn test_substring() { false, false, false, + false, &[ ( "foo", @@ -339,6 +346,7 @@ fn test_substring() { false, false, false, + false, &[ ( "fooBarbaz1", @@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() { false, true, false, + false, &[ ( "fooBarbaz1", @@ -418,6 +427,7 @@ fn test_normalize() { true, false, false, + false, &[ ( "Só Danço Samba", @@ -464,6 +474,7 @@ fn test_unicode() { true, false, false, + false, &[ ( "你好世界", @@ -488,6 +499,7 @@ fn test_long_str() { false, false, false, + false, &[( &"x".repeat(u16::MAX as usize + 1), "xx", @@ -504,6 +516,7 @@ fn test_casing() { false, false, false, + false, &[ // these two have the same score ( @@ -536,6 +549,7 @@ fn test_casing() { ], ) } + #[test] fn test_optimal() { assert_matches( @@ -543,6 +557,7 @@ fn test_optimal() { false, false, false, + false, &[ ( "axxx xx ", @@ -624,3 +639,32 @@ fn test_reject() { ); assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]); } + +#[test] +fn test_prefer_prefix() { + assert_matches( + &[FuzzyOptimal, FuzzyGreedy], + false, + false, + false, + true, + &[ + ( + "Moby Dick", + "md", + &[0, 5], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS + - PENALTY_GAP_START + - 3 * PENALTY_GAP_EXTENSION, + ), + 
( + "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage", + "md", + &[82, 85], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ], + ); +}