add option to prefer prefix matches

2024-12-22 01:47:49 +00:00 · 2023-08-27 16:21:51 +02:00 · 2023-08-27 16:21:51 +02:00 · b38fdfa8d7
commit b38fdfa8d7
parent f18c19cd53
6 changed files with 129 additions and 16 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -152,7 +152,7 @@ dependencies = [

 [[package]]
 name = "nucleo"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
 "nucleo-matcher",
 "parking_lot",
--- a/matcher/src/config.rs
+++ b/matcher/src/config.rs
@ -15,6 +15,14 @@ pub struct MatcherConfig {
    pub normalize: bool,
    /// whether to ignore casing
    pub ignore_case: bool,
+    /// Whether to provide a bonus to matches by their distance from the start
+    /// of the haystack. The bonus is fairly small compared to the normal gap
+    /// penalty to avoid messing with the normal score heuristic. This setting
+    /// is not turned on by default and only recommended for autocompletion
+    /// usecases where the expectation is that the user is typing the entire
+    /// match. For a full fzf-like fuzzy matcher/picker word segmentation and
+    /// explicit prefix literals should be used instead.
+    pub prefer_prefix: bool,
 }

 impl MatcherConfig {
@ -26,6 +34,7 @@ impl MatcherConfig {
            initial_char_class: CharClass::Whitespace,
            normalize: true,
            ignore_case: true,
+            prefer_prefix: false,
        }
    };
 }
--- a/matcher/src/fuzzy_optimal.rs
+++ b/matcher/src/fuzzy_optimal.rs
@ -3,8 +3,8 @@ use std::cmp::max;
 use crate::chars::{Char, CharClass};
 use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
 use crate::score::{
-    BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
-    PENALTY_GAP_START, SCORE_MATCH,
+    BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
+    PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
 };
 use crate::{Matcher, MatcherConfig};

@ -35,7 +35,7 @@ impl Matcher {
            .checked_sub(1)
            .map(|i| haystack[i].char_class(&self.config))
            .unwrap_or(self.config.initial_char_class);
-        let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
+        let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
        // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
        if !matched {
            assert!(
@ -117,6 +117,7 @@ impl<H: Char> MatcherDataView<'_, H> {
        needle: &[N],
        mut prev_class: CharClass,
        config: &MatcherConfig,
+        start: u32,
    ) -> bool
    where
        H: PartialEq<N>,
@ -167,6 +168,17 @@ impl<H: Char> MatcherDataView<'_, H> {
            0,
            needle[0],
            needle[1],
+            if config.prefer_prefix {
+                if start == 0 {
+                    MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
+                } else {
+                    (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
+                        (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
+                    )
+                }
+            } else {
+                0
+            },
        );
        true
    }
@ -182,6 +194,7 @@ impl<H: Char> MatcherDataView<'_, H> {
        needle_idx: u16,
        needle_char: N,
        next_needle_char: N,
+        mut prefix_bonus: u16,
    ) where
        H: PartialEq<N>,
    {
@ -198,15 +211,19 @@ impl<H: Char> MatcherDataView<'_, H> {
        for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
            let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
            let m_cell = if FIRST_ROW {
-                if c == needle_char {
+                let cell = if c == needle_char {
                    ScoreCell {
-                        score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
+                        score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+                            + SCORE_MATCH
+                            + prefix_bonus / PREFIX_BONUS_SCALE,
                        matched: false,
                        consecutive_bonus: *bonus,
                    }
                } else {
                    UNMATCHED
-                }
+                };
+                prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
+                cell
            } else {
                *score_cell
            };
@ -224,15 +241,19 @@ impl<H: Char> MatcherDataView<'_, H> {
        for (((c, bonus), score_cell), matrix_cell) in col_iter {
            let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
            let m_cell = if FIRST_ROW {
-                if c[0] == needle_char {
+                let cell = if c[0] == needle_char {
                    ScoreCell {
-                        score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
+                        score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+                            + SCORE_MATCH
+                            + prefix_bonus / PREFIX_BONUS_SCALE,
                        matched: false,
                        consecutive_bonus: bonus[0],
                    }
                } else {
                    UNMATCHED
-                }
+                };
+                prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
+                cell
            } else {
                *score_cell
            };
@ -271,6 +292,7 @@ impl<H: Char> MatcherDataView<'_, H> {
                needle_idx as u16 + 1,
                needle_char,
                next_needle_char,
+                0,
            );
            let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
            matrix_cells = &mut matrix_cells[len..];
--- a/matcher/src/lib.rs
+++ b/matcher/src/lib.rs
@ -1,12 +1,11 @@
 /*!
 `nucleo_matcher` is a low level crate that contains the matcher implementation
-used by the other nucleo crates.
+used by the high level `nucleo` crate.

 The matcher is hightly optimized and can significantly outperform `fzf` and
 `skim` (the `fuzzy-matcher` crate). However some of these optimizations require
-a slightly less convenient API. Particularly, `nucleo_matcher` requires that
-needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
-of rusts normal utf32 strings.
+a slightly less convenient API. Be sure to carefully read the documentation of the
+[`Matcher`] to avoid unexpected behaviour.
 */

 // sadly ranges don't optmimzie well
@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab;
 /// multiple different matches on the same haystack and merging the indices by
 /// sorting and deduplicating the vector.
 ///
+/// The `needle` argument for each function must always be normalized by the caller
+/// (unicode normalization and case folding if a case insensitive match is produced).
+/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
+/// provides utilities to preprocess needles.
+///
+/// Additionally it's recommended to perform separate matches for each word in
+/// the needle. Consider the following example: If `foo bar` is used as the
+/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
+/// `foo test baaar` will receive a higher score and rank higher. `baaar` contains a
+/// 2 character gap which will receive a penalty and therefore the user will
+/// likely expect it to rank lower. However, if `foo bar` is matched as a single
+/// query `hello-world` and `test` are both considered gaps too. As
+/// `hello-world` is a much longer gap than `test` the extra penalty for `baaar`
+/// is outweighed. If both words are matched individually the interspersed words
+/// do not receive a penalty and `foo hello-world bar` ranks higher.
+///
+/// In general nucleo is a **substring matching tool** with no penalty assigned
+/// to matches that start later within the same pattern (which enables the
+/// usecase shown above). This may be undesirable in one very particular usecase:
+/// For automatic suggestions for commands (like a shell). In this case the
+/// assumption is that the user is actually typing the full haystack. In other words:
+/// The matcher should prefer a prefix match. To accommodate that usecase the
+/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
+/// to true. Note that the penalty given is quite small (and capped to a maximum)
+/// to avoid overwriting the normal scoring heuristic.
+///
+///
 /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
 /// that the matcher *will panic*. The caller must decide whether it wants to
 /// filter out long haystacks or truncate them.
--- a/matcher/src/score.rs
+++ b/matcher/src/score.rs
@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig};
 pub(crate) const SCORE_MATCH: u16 = 16;
 pub(crate) const PENALTY_GAP_START: u16 = 3;
 pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
+/// If the prefer_prefix option is enabled we want to penalize
+/// the initial gap. The prefix bonus should not be too large; this scale divides the gap penalty applied to it.
+pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
+pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;

 // We prefer matches at the beginning of a word, but the bonus should not be
 // too great to prevent the longer acronym matches from always winning over
@ -140,7 +144,15 @@ impl Matcher {
            }
            prev_class = class;
        }
-
+        if self.config.prefer_prefix {
+            if start != 0 {
+                let penalty = PENALTY_GAP_START
+                    + PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16;
+                score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
+            } else {
+                score += MAX_PREFIX_BONUS;
+            }
+        }
        score
    }
 }
--- a/matcher/src/tests.rs
+++ b/matcher/src/tests.rs
@ -1,7 +1,7 @@
 use crate::chars::Char;
 use crate::score::{
    BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
-    PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
+    MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
 };
 use crate::utf32_str::Utf32Str;
 use crate::{Matcher, MatcherConfig};
@ -23,11 +23,13 @@ fn assert_matches(
    normalize: bool,
    case_sensitive: bool,
    path: bool,
+    prefer_prefix: bool,
    cases: &[(&str, &str, &[u32], u16)],
 ) {
    let mut config = MatcherConfig {
        normalize,
        ignore_case: !case_sensitive,
+        prefer_prefix,
        ..MatcherConfig::DEFAULT
    };
    if path {
@ -142,6 +144,7 @@ fn test_fuzzy() {
        false,
        false,
        false,
+        false,
        &[
            (
                "fooBarbaz1",
@ -250,6 +253,7 @@ fn empty_needle() {
        false,
        false,
        false,
+        false,
        &[("foo bar baz", "", &[], 0)],
    );
 }
@ -261,6 +265,7 @@ fn test_substring() {
        false,
        false,
        false,
+        false,
        &[
            (
                "foo bar baz",
@ -287,6 +292,7 @@ fn test_substring() {
        false,
        false,
        false,
+        false,
        &[
            (
                "foo bar baz",
@ -313,6 +319,7 @@ fn test_substring() {
        false,
        false,
        false,
+        false,
        &[
            (
                "foo",
@ -339,6 +346,7 @@ fn test_substring() {
        false,
        false,
        false,
+        false,
        &[
            (
                "fooBarbaz1",
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
        false,
        true,
        false,
+        false,
        &[
            (
                "fooBarbaz1",
@ -418,6 +427,7 @@ fn test_normalize() {
        true,
        false,
        false,
+        false,
        &[
            (
                "Só Danço Samba",
@ -464,6 +474,7 @@ fn test_unicode() {
        true,
        false,
        false,
+        false,
        &[
            (
                "你好世界",
@ -488,6 +499,7 @@ fn test_long_str() {
        false,
        false,
        false,
+        false,
        &[(
            &"x".repeat(u16::MAX as usize + 1),
            "xx",
@ -504,6 +516,7 @@ fn test_casing() {
        false,
        false,
        false,
+        false,
        &[
            // these two have the same score
            (
@ -536,6 +549,7 @@ fn test_casing() {
        ],
    )
 }
+
 #[test]
 fn test_optimal() {
    assert_matches(
@ -543,6 +557,7 @@ fn test_optimal() {
        false,
        false,
        false,
+        false,
        &[
            (
                "axxx xx ",
@ -624,3 +639,32 @@ fn test_reject() {
    );
    assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
 }
+
+#[test]
+fn test_prefer_prefix() {
+    assert_matches(
+        &[FuzzyOptimal, FuzzyGreedy],
+        false,
+        false,
+        false,
+        true,
+        &[
+            (
+                "Moby Dick",
+                "md",
+                &[0, 5],
+                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)  + MAX_PREFIX_BONUS
+                    - PENALTY_GAP_START
+                    - 3 * PENALTY_GAP_EXTENSION,
+            ),
+            (
+                "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
+                "md",
+                &[82, 85],
+                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
+                    - PENALTY_GAP_START
+                    - PENALTY_GAP_EXTENSION,
+            ),
+        ],
+    );
+}