add option to prefer prefix matches

This commit is contained in:
Pascal Kuthe 2023-08-27 16:21:51 +02:00
parent f18c19cd53
commit b38fdfa8d7
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
6 changed files with 129 additions and 16 deletions

2
Cargo.lock generated
View File

@ -152,7 +152,7 @@ dependencies = [
[[package]]
name = "nucleo"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"nucleo-matcher",
"parking_lot",

View File

@ -15,6 +15,14 @@ pub struct MatcherConfig {
pub normalize: bool,
/// whether to ignore casing
pub ignore_case: bool,
/// Whether to provide a bonus to matches by their distance from the start
/// of the haystack. The bonus is fairly small compared to the normal gap
/// penalty to avoid messing with the normal score heuristic. This setting
/// is not turned on by default and is only recommended for autocompletion
/// use cases where the expectation is that the user is typing the entire
/// match. For a full fzf-like fuzzy matcher/picker, word segmentation and
/// explicit prefix literals should be used instead.
pub prefer_prefix: bool,
}
impl MatcherConfig {
@ -26,6 +34,7 @@ impl MatcherConfig {
initial_char_class: CharClass::Whitespace,
normalize: true,
ignore_case: true,
prefer_prefix: false,
}
};
}

View File

@ -3,8 +3,8 @@ use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
use crate::score::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
};
use crate::{Matcher, MatcherConfig};
@ -35,7 +35,7 @@ impl Matcher {
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.initial_char_class);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
if !matched {
assert!(
@ -117,6 +117,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle: &[N],
mut prev_class: CharClass,
config: &MatcherConfig,
start: u32,
) -> bool
where
H: PartialEq<N>,
@ -167,6 +168,17 @@ impl<H: Char> MatcherDataView<'_, H> {
0,
needle[0],
needle[1],
if config.prefer_prefix {
if start == 0 {
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
} else {
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
)
}
} else {
0
},
);
true
}
@ -182,6 +194,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx: u16,
needle_char: N,
next_needle_char: N,
mut prefix_bonus: u16,
) where
H: PartialEq<N>,
{
@ -198,15 +211,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
if c == needle_char {
let cell = if c == needle_char {
ScoreCell {
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: *bonus,
}
} else {
UNMATCHED
}
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
@ -224,15 +241,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((c, bonus), score_cell), matrix_cell) in col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
if c[0] == needle_char {
let cell = if c[0] == needle_char {
ScoreCell {
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: bonus[0],
}
} else {
UNMATCHED
}
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
@ -271,6 +292,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx as u16 + 1,
needle_char,
next_needle_char,
0,
);
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
matrix_cells = &mut matrix_cells[len..];

View File

@ -1,12 +1,11 @@
/*!
`nucleo_matcher` is a low level crate that contains the matcher implementation
used by the other nucleo crates.
used by the high level `nucleo` crate.
The matcher is highly optimized and can significantly outperform `fzf` and
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
a slightly less convenient API. Particularly, `nucleo_matcher` requires that
needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
of rusts normal utf32 strings.
a slightly less convenient API. Be sure to carefully read the documentation of the
[`Matcher`] to avoid unexpected behaviour.
*/
// sadly ranges don't optimize well
@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab;
/// multiple different matches on the same haystack and merging the indices by
/// sorting and deduplicating the vector.
///
/// The `needle` argument for each function must always be normalized by the caller
/// (unicode normalization and case folding if a case insensitive match is produced).
/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
/// provides utilities to preprocess needles.
///
/// Additionally it's recommended to perform separate matches for each word in
/// the needle. Consider the following example: If `foo bar` is used as the
/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
/// `foo test baaar` will receive a higher score than `foo hello-world bar`.
/// `baaar` contains a 2 character gap which will receive a penalty and
/// therefore the user will likely expect it to rank lower. However, if
/// `foo bar` is matched as a single query, `hello-world` and `test` are both
/// considered gaps too. As `hello-world` is a much longer gap than `test`,
/// the extra penalty for `baaar` is outweighed. If both words are matched
/// individually the interspersed words do not receive a penalty and
/// `foo hello-world bar` ranks higher.
///
/// In general nucleo is a **substring matching tool** with no penalty assigned
/// to matches that start later within the same pattern (which enables the
/// use case shown above). This may be undesirable in one very particular use case:
/// automatic suggestions for commands (like a shell). In this case the
/// assumption is that the user is actually typing the full haystack. In other words:
/// The matcher should prefer a prefix match. To accommodate that use case the
/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
/// to true. Note that the penalty given is quite small (and capped to a maximum)
/// to avoid overriding the normal scoring heuristic.
///
///
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
/// that the matcher *will panic*. The caller must decide whether it wants to
/// filter out long haystacks or truncate them.

View File

@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig};
pub(crate) const SCORE_MATCH: u16 = 16;
pub(crate) const PENALTY_GAP_START: u16 = 3;
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
/// If the prefer_prefix option is enabled we want to penalize
/// the initial gap. The prefix bonus should not be too large.
pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
@ -140,7 +144,15 @@ impl Matcher {
}
prev_class = class;
}
if self.config.prefer_prefix {
if start != 0 {
let penalty = PENALTY_GAP_START
+ PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16;
score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
} else {
score += MAX_PREFIX_BONUS;
}
}
score
}
}

View File

@ -1,7 +1,7 @@
use crate::chars::Char;
use crate::score::{
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
};
use crate::utf32_str::Utf32Str;
use crate::{Matcher, MatcherConfig};
@ -23,11 +23,13 @@ fn assert_matches(
normalize: bool,
case_sensitive: bool,
path: bool,
prefer_prefix: bool,
cases: &[(&str, &str, &[u32], u16)],
) {
let mut config = MatcherConfig {
normalize,
ignore_case: !case_sensitive,
prefer_prefix,
..MatcherConfig::DEFAULT
};
if path {
@ -142,6 +144,7 @@ fn test_fuzzy() {
false,
false,
false,
false,
&[
(
"fooBarbaz1",
@ -250,6 +253,7 @@ fn empty_needle() {
false,
false,
false,
false,
&[("foo bar baz", "", &[], 0)],
);
}
@ -261,6 +265,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo bar baz",
@ -287,6 +292,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo bar baz",
@ -313,6 +319,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo",
@ -339,6 +346,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"fooBarbaz1",
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
false,
true,
false,
false,
&[
(
"fooBarbaz1",
@ -418,6 +427,7 @@ fn test_normalize() {
true,
false,
false,
false,
&[
(
"Só Danço Samba",
@ -464,6 +474,7 @@ fn test_unicode() {
true,
false,
false,
false,
&[
(
"你好世界",
@ -488,6 +499,7 @@ fn test_long_str() {
false,
false,
false,
false,
&[(
&"x".repeat(u16::MAX as usize + 1),
"xx",
@ -504,6 +516,7 @@ fn test_casing() {
false,
false,
false,
false,
&[
// these two have the same score
(
@ -536,6 +549,7 @@ fn test_casing() {
],
)
}
#[test]
fn test_optimal() {
assert_matches(
@ -543,6 +557,7 @@ fn test_optimal() {
false,
false,
false,
false,
&[
(
"axxx xx ",
@ -624,3 +639,32 @@ fn test_reject() {
);
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
}
/// Exercises scoring with `prefer_prefix` enabled, for both the optimal and
/// the greedy fuzzy algorithm.
#[test]
fn test_prefer_prefix() {
    // The match begins at index 0 of the haystack, so the expected score
    // carries the full `MAX_PREFIX_BONUS`. The matched indices 0 and 5 are
    // separated by four unmatched characters: one gap start plus three gap
    // extensions.
    let match_at_start: (&str, &str, &[u32], u16) = (
        "Moby Dick",
        "md",
        &[0, 5],
        BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS
            - PENALTY_GAP_START
            - 3 * PENALTY_GAP_EXTENSION,
    );

    // The match begins at index 82, deep inside the haystack. The expected
    // score contains no prefix bonus term at all; only the two-character gap
    // between indices 82 and 85 is penalized.
    let match_far_from_start: (&str, &str, &[u32], u16) = (
        "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
        "md",
        &[82, 85],
        BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
            - PENALTY_GAP_START
            - PENALTY_GAP_EXTENSION,
    );

    let algorithms = [FuzzyOptimal, FuzzyGreedy];
    let normalize = false;
    let case_sensitive = false;
    let path = false;
    let prefer_prefix = true;
    assert_matches(
        &algorithms,
        normalize,
        case_sensitive,
        path,
        prefer_prefix,
        &[match_at_start, match_far_from_start],
    );
}