From b38fdfa8d7ea23d384651335d9bbb15fd23e84a6 Mon Sep 17 00:00:00 2001
From: Pascal Kuthe <pascal.kuthe@semimod.de>
Date: Sun, 27 Aug 2023 16:21:51 +0200
Subject: [PATCH] add option to prefer prefix matches

---
 Cargo.lock                   |  2 +-
 matcher/src/config.rs        |  9 +++++++
 matcher/src/fuzzy_optimal.rs | 40 ++++++++++++++++++++++++-------
 matcher/src/lib.rs           | 34 ++++++++++++++++++++++----
 matcher/src/score.rs         | 14 ++++++++++-
 matcher/src/tests.rs         | 46 +++++++++++++++++++++++++++++++++++-
 6 files changed, 129 insertions(+), 16 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 24abffc..9590fda 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -152,7 +152,7 @@ dependencies = [
 
 [[package]]
 name = "nucleo"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "nucleo-matcher",
  "parking_lot",
diff --git a/matcher/src/config.rs b/matcher/src/config.rs
index 7065262..67e07b7 100644
--- a/matcher/src/config.rs
+++ b/matcher/src/config.rs
@@ -15,6 +15,14 @@ pub struct MatcherConfig {
     pub normalize: bool,
     /// whether to ignore casing
     pub ignore_case: bool,
+    /// Whether to provide a bonus to matches based on their distance from the
+    /// start of the haystack. The bonus is fairly small compared to the normal
+    /// gap penalty to avoid interfering with the normal score heuristic. This
+    /// setting is not turned on by default and is only recommended for
+    /// autocompletion use cases where the expectation is that the user is typing
+    /// the entire match. For a full fzf-like fuzzy matcher/picker, word
+    /// segmentation and explicit prefix literals should be used instead.
+    pub prefer_prefix: bool,
 }
 
 impl MatcherConfig {
@@ -26,6 +34,7 @@ impl MatcherConfig {
             initial_char_class: CharClass::Whitespace,
             normalize: true,
             ignore_case: true,
+            prefer_prefix: false,
         }
     };
 }
diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs
index 65fe95b..f007d79 100644
--- a/matcher/src/fuzzy_optimal.rs
+++ b/matcher/src/fuzzy_optimal.rs
@@ -3,8 +3,8 @@ use std::cmp::max;
 use crate::chars::{Char, CharClass};
 use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
 use crate::score::{
-    BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
-    PENALTY_GAP_START, SCORE_MATCH,
+    BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
+    PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
 };
 use crate::{Matcher, MatcherConfig};
 
@@ -35,7 +35,7 @@ impl Matcher {
             .checked_sub(1)
             .map(|i| haystack[i].char_class(&self.config))
             .unwrap_or(self.config.initial_char_class);
-        let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
+        let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
         // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
         if !matched {
             assert!(
@@ -117,6 +117,7 @@ impl<H: Char> MatcherDataView<'_, H> {
         needle: &[N],
         mut prev_class: CharClass,
         config: &MatcherConfig,
+        start: u32,
     ) -> bool
     where
         H: PartialEq<N>,
@@ -167,6 +168,17 @@ impl<H: Char> MatcherDataView<'_, H> {
             0,
             needle[0],
             needle[1],
+            if config.prefer_prefix {
+                if start == 0 {
+                    MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
+                } else {
+                    (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
+                        (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
+                    )
+                }
+            } else {
+                0
+            },
         );
         true
     }
@@ -182,6 +194,7 @@ impl<H: Char> MatcherDataView<'_, H> {
         needle_idx: u16,
         needle_char: N,
         next_needle_char: N,
+        mut prefix_bonus: u16,
     ) where
         H: PartialEq<N>,
     {
@@ -198,15 +211,19 @@ impl<H: Char> MatcherDataView<'_, H> {
         for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
             let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
             let m_cell = if FIRST_ROW {
-                if c == needle_char {
+                let cell = if c == needle_char {
                     ScoreCell {
-                        score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
+                        score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+                            + SCORE_MATCH
+                            + prefix_bonus / PREFIX_BONUS_SCALE,
                         matched: false,
                         consecutive_bonus: *bonus,
                     }
                 } else {
                     UNMATCHED
-                }
+                };
+                prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
+                cell
             } else {
                 *score_cell
             };
@@ -224,15 +241,19 @@ impl<H: Char> MatcherDataView<'_, H> {
         for (((c, bonus), score_cell), matrix_cell) in col_iter {
             let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
             let m_cell = if FIRST_ROW {
-                if c[0] == needle_char {
+                let cell = if c[0] == needle_char {
                     ScoreCell {
-                        score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
+                        score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+                            + SCORE_MATCH
+                            + prefix_bonus / PREFIX_BONUS_SCALE,
                         matched: false,
                         consecutive_bonus: bonus[0],
                     }
                 } else {
                     UNMATCHED
-                }
+                };
+                prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
+                cell
             } else {
                 *score_cell
             };
@@ -271,6 +292,7 @@ impl<H: Char> MatcherDataView<'_, H> {
                 needle_idx as u16 + 1,
                 needle_char,
                 next_needle_char,
+                0,
             );
             let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
             matrix_cells = &mut matrix_cells[len..];
diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs
index f966b1a..6aea293 100644
--- a/matcher/src/lib.rs
+++ b/matcher/src/lib.rs
@@ -1,12 +1,11 @@
 /*!
 `nucleo_matcher` is a low level crate that contains the matcher implementation
-used by the other nucleo crates.
+used by the high level `nucleo` crate.
 
 The matcher is hightly optimized and can significantly outperform `fzf` and
 `skim` (the `fuzzy-matcher` crate). However some of these optimizations require
-a slightly less convenient API. Particularly, `nucleo_matcher` requires that
-needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
-of rusts normal utf32 strings.
+a slightly less convenient API. Be sure to carefully read the documentation of
+the [`Matcher`] to avoid unexpected behaviour.
 */
 
 // sadly ranges don't optmimzie well
@@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab;
 /// multiple different matches on the same haystack and merging the indices by
 /// sorting and deduplicating the vector.
 ///
+/// The `needle` argument for each function must always be normalized by the caller
+/// (unicode normalization and case folding if a case insensitive match is produced).
+/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
+/// provides utilities to preprocess needles.
+///
+/// Additionally it's recommended to perform separate matches for each word in
+/// the needle. Consider the following example: If `foo bar` is used as the
+/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
+/// `foo test baaar` will receive a higher score than `foo hello-world bar`.
+/// `baaar` contains a 2 character gap which will receive a penalty and
+/// therefore the user will likely expect it to rank lower. However, if
+/// `foo bar` is matched as a single query `hello-world` and `test` are both
+/// considered gaps too. As `hello-world` is a much longer gap than `test` the
+/// extra penalty for `baaar` is outweighed. If both words are matched
+/// individually the interspersed words do not receive a penalty and
+/// `foo hello-world bar` ranks higher.
+///
+/// In general nucleo is a **substring matching tool** with no penalty assigned
+/// to matches that start later within the same pattern (which enables the
+/// use case shown above). This may be undesirable in one very particular use case:
+/// automatic suggestions for commands (like a shell). In this case the
+/// assumption is that the user is actually typing the full haystack. In other words:
+/// the matcher should prefer a prefix match. To accommodate that use case the
+/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
+/// to true. Note that the penalty given is quite small (and capped to a maximum)
+/// to avoid overriding the normal scoring heuristic.
+///
 /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
 /// that the matcher *will panic*. The caller must decide whether it wants to
 /// filter out long haystacks or truncate them.
diff --git a/matcher/src/score.rs b/matcher/src/score.rs
index 7a7c0c3..eba054b 100644
--- a/matcher/src/score.rs
+++ b/matcher/src/score.rs
@@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig};
 pub(crate) const SCORE_MATCH: u16 = 16;
 pub(crate) const PENALTY_GAP_START: u16 = 3;
 pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
+/// If the prefer_prefix option is enabled we want to penalize
+/// the initial gap. The prefix bonus should not be too large.
+pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
+pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
 
 // We prefer matches at the beginning of a word, but the bonus should not be
 // too great to prevent the longer acronym matches from always winning over
@@ -140,7 +144,15 @@ impl Matcher {
             }
             prev_class = class;
         }
-
+        if self.config.prefer_prefix {
+            // Saturate: plain `*`/`+` overflows u16 when the match starts very late.
+            let gap = start.saturating_sub(1).min(u16::MAX as usize) as u16;
+            let penalty = match start {
+                0 => 0, // match begins at the haystack start: full bonus
+                _ => PENALTY_GAP_START.saturating_add(PENALTY_GAP_START.saturating_mul(gap)),
+            };
+            score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
+        }
         score
     }
 }
diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs
index d2bfaea..691230c 100644
--- a/matcher/src/tests.rs
+++ b/matcher/src/tests.rs
@@ -1,7 +1,7 @@
 use crate::chars::Char;
 use crate::score::{
     BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
-    PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
+    MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
 };
 use crate::utf32_str::Utf32Str;
 use crate::{Matcher, MatcherConfig};
@@ -23,11 +23,13 @@ fn assert_matches(
     normalize: bool,
     case_sensitive: bool,
     path: bool,
+    prefer_prefix: bool,
     cases: &[(&str, &str, &[u32], u16)],
 ) {
     let mut config = MatcherConfig {
         normalize,
         ignore_case: !case_sensitive,
+        prefer_prefix,
         ..MatcherConfig::DEFAULT
     };
     if path {
@@ -142,6 +144,7 @@ fn test_fuzzy() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "fooBarbaz1",
@@ -250,6 +253,7 @@ fn empty_needle() {
         false,
         false,
         false,
+        false,
         &[("foo bar baz", "", &[], 0)],
     );
 }
@@ -261,6 +265,7 @@ fn test_substring() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "foo bar baz",
@@ -287,6 +292,7 @@ fn test_substring() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "foo bar baz",
@@ -313,6 +319,7 @@ fn test_substring() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "foo",
@@ -339,6 +346,7 @@ fn test_substring() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "fooBarbaz1",
@@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
         false,
         true,
         false,
+        false,
         &[
             (
                 "fooBarbaz1",
@@ -418,6 +427,7 @@ fn test_normalize() {
         true,
         false,
         false,
+        false,
         &[
             (
                 "Só Danço Samba",
@@ -464,6 +474,7 @@ fn test_unicode() {
         true,
         false,
         false,
+        false,
         &[
             (
                 "你好世界",
@@ -488,6 +499,7 @@ fn test_long_str() {
         false,
         false,
         false,
+        false,
         &[(
             &"x".repeat(u16::MAX as usize + 1),
             "xx",
@@ -504,6 +516,7 @@ fn test_casing() {
         false,
         false,
         false,
+        false,
         &[
             // these two have the same score
             (
@@ -536,6 +549,7 @@ fn test_casing() {
         ],
     )
 }
+
 #[test]
 fn test_optimal() {
     assert_matches(
@@ -543,6 +557,7 @@ fn test_optimal() {
         false,
         false,
         false,
+        false,
         &[
             (
                 "axxx xx ",
@@ -624,3 +639,32 @@ fn test_reject() {
     );
     assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
 }
+
+#[test]
+fn test_prefer_prefix() {
+    assert_matches(
+        &[FuzzyOptimal, FuzzyGreedy],
+        false, // normalize
+        false, // case_sensitive
+        false, // path
+        true, // prefer_prefix
+        &[
+            (
+                "Moby Dick",
+                "md",
+                &[0, 5], // match starts at the haystack start: full prefix bonus
+                BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS
+                    - PENALTY_GAP_START
+                    - 3 * PENALTY_GAP_EXTENSION,
+            ),
+            (
+                "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
+                "md",
+                &[82, 85], // match starts late: no prefix bonus is expected
+                BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
+                    - PENALTY_GAP_START
+                    - PENALTY_GAP_EXTENSION,
+            ),
+        ],
+    );
+}