mirror of
https://github.com/solaeus/nucleo.git
synced 2025-01-21 23:37:47 +00:00
add option to prefer prefix matches
This commit is contained in:
parent
f18c19cd53
commit
b38fdfa8d7
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -152,7 +152,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "nucleo"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"nucleo-matcher",
|
||||
"parking_lot",
|
||||
|
@ -15,6 +15,14 @@ pub struct MatcherConfig {
|
||||
pub normalize: bool,
|
||||
/// whether to ignore casing
|
||||
pub ignore_case: bool,
|
||||
/// Whether to provide a bonus to matches by their distance from the start
|
||||
/// of the haystack. The bonus is fairly small compared to the normal gap
|
||||
/// penalty to avoid messing with the normal score heuristic. This setting
|
||||
/// is not turned on by default and only recommended for autocompletion
|
||||
/// usecases where the expectation is that the user is typing the entire
|
||||
/// match. For a full fzf-like fuzzy matcher/picker word segmentation and
|
||||
/// explicit prefix literals should be used instead.
|
||||
pub prefer_prefix: bool,
|
||||
}
|
||||
|
||||
impl MatcherConfig {
|
||||
@ -26,6 +34,7 @@ impl MatcherConfig {
|
||||
initial_char_class: CharClass::Whitespace,
|
||||
normalize: true,
|
||||
ignore_case: true,
|
||||
prefer_prefix: false,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -3,8 +3,8 @@ use std::cmp::max;
|
||||
use crate::chars::{Char, CharClass};
|
||||
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
|
||||
use crate::score::{
|
||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
||||
PENALTY_GAP_START, SCORE_MATCH,
|
||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
|
||||
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
|
||||
};
|
||||
use crate::{Matcher, MatcherConfig};
|
||||
|
||||
@ -35,7 +35,7 @@ impl Matcher {
|
||||
.checked_sub(1)
|
||||
.map(|i| haystack[i].char_class(&self.config))
|
||||
.unwrap_or(self.config.initial_char_class);
|
||||
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
|
||||
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
|
||||
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
|
||||
if !matched {
|
||||
assert!(
|
||||
@ -117,6 +117,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
needle: &[N],
|
||||
mut prev_class: CharClass,
|
||||
config: &MatcherConfig,
|
||||
start: u32,
|
||||
) -> bool
|
||||
where
|
||||
H: PartialEq<N>,
|
||||
@ -167,6 +168,17 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
0,
|
||||
needle[0],
|
||||
needle[1],
|
||||
if config.prefer_prefix {
|
||||
if start == 0 {
|
||||
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
|
||||
} else {
|
||||
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
|
||||
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
0
|
||||
},
|
||||
);
|
||||
true
|
||||
}
|
||||
@ -182,6 +194,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
needle_idx: u16,
|
||||
needle_char: N,
|
||||
next_needle_char: N,
|
||||
mut prefix_bonus: u16,
|
||||
) where
|
||||
H: PartialEq<N>,
|
||||
{
|
||||
@ -198,15 +211,19 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
|
||||
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||
let m_cell = if FIRST_ROW {
|
||||
if c == needle_char {
|
||||
let cell = if c == needle_char {
|
||||
ScoreCell {
|
||||
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
|
||||
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
|
||||
+ SCORE_MATCH
|
||||
+ prefix_bonus / PREFIX_BONUS_SCALE,
|
||||
matched: false,
|
||||
consecutive_bonus: *bonus,
|
||||
}
|
||||
} else {
|
||||
UNMATCHED
|
||||
}
|
||||
};
|
||||
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
|
||||
cell
|
||||
} else {
|
||||
*score_cell
|
||||
};
|
||||
@ -224,15 +241,19 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
for (((c, bonus), score_cell), matrix_cell) in col_iter {
|
||||
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||
let m_cell = if FIRST_ROW {
|
||||
if c[0] == needle_char {
|
||||
let cell = if c[0] == needle_char {
|
||||
ScoreCell {
|
||||
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
|
||||
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
|
||||
+ SCORE_MATCH
|
||||
+ prefix_bonus / PREFIX_BONUS_SCALE,
|
||||
matched: false,
|
||||
consecutive_bonus: bonus[0],
|
||||
}
|
||||
} else {
|
||||
UNMATCHED
|
||||
}
|
||||
};
|
||||
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
|
||||
cell
|
||||
} else {
|
||||
*score_cell
|
||||
};
|
||||
@ -271,6 +292,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
||||
needle_idx as u16 + 1,
|
||||
needle_char,
|
||||
next_needle_char,
|
||||
0,
|
||||
);
|
||||
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
|
||||
matrix_cells = &mut matrix_cells[len..];
|
||||
|
@ -1,12 +1,11 @@
|
||||
/*!
|
||||
`nucleo_matcher` is a low level crate that contains the matcher implementation
|
||||
used by the other nucleo crates.
|
||||
used by the high level `nucleo` crate.
|
||||
|
||||
The matcher is highly optimized and can significantly outperform `fzf` and
|
||||
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
|
||||
a slightly less convenient API. Particularly, `nucleo_matcher` requires that
|
||||
needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
|
||||
of rusts normal utf32 strings.
|
||||
a slightly less convenient API. Be sure to carefully read the documentation of the
|
||||
[`Matcher`] to avoid unexpected behaviour.
|
||||
*/
|
||||
|
||||
// sadly ranges don't optimize well
|
||||
@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab;
|
||||
/// multiple different matches on the same haystack and merging the indices by
|
||||
/// sorting and deduplicating the vector.
|
||||
///
|
||||
/// The `needle` argument for each function must always be normalized by the caller
|
||||
/// (unicode normalization and case folding if a case insensitive match is produced).
|
||||
/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
|
||||
/// provides utilities to preprocess needles.
|
||||
///
|
||||
/// Additionally it's recommended to perform separate matches for each word in
|
||||
/// the needle. Consider the following example: If `foo bar` is used as the
|
||||
/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
|
||||
/// `foo test baaar` will receive a lower score/rank. `baaar` contains a
|
||||
/// 2 character gap which will receive a penalty and therefore the user will
|
||||
/// likely expect it to rank lower. However, if `foo bar` is matched as a single
|
||||
/// query `hello-world` and `test` are both considered gaps too. As `hello-
|
||||
/// world` is a much longer gap than `test` the extra penalty for `baaar` is
|
||||
/// outweighed. If both words are matched individually the interspersed words
|
||||
/// do not receive a penalty and `foo hello-world bar` ranks higher.
|
||||
///
|
||||
/// In general nucleo is a **substring matching tool** with no penalty assigned
|
||||
/// to matches that start later within the same pattern (which enables the
|
||||
/// usecase shown above). This may be undesirable in one very particular usecase:
|
||||
/// For automatic suggestions for commands (like a shell). In this case the
|
||||
/// assumption is that the user is actually typing the full haystack. In other words:
|
||||
/// The matcher should prefer a prefix match. To accommodate that usecase the
|
||||
/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
|
||||
/// to true. Note that the penalty given is quite small (and capped to a maximum)
|
||||
/// to avoid overwriting the normal scoring heuristic.
|
||||
///
|
||||
///
|
||||
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
|
||||
/// that the matcher *will panic*. The caller must decide whether it wants to
|
||||
/// filter out long haystacks or truncate them.
|
||||
|
@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig};
|
||||
pub(crate) const SCORE_MATCH: u16 = 16;
|
||||
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
||||
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
||||
/// If the prefer_prefix option is enabled we want to penalize
|
||||
/// the initial gap. The prefix bonus should not be too large
|
||||
pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
|
||||
pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
|
||||
|
||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
||||
// too great to prevent the longer acronym matches from always winning over
|
||||
@ -140,7 +144,15 @@ impl Matcher {
|
||||
}
|
||||
prev_class = class;
|
||||
}
|
||||
|
||||
if self.config.prefer_prefix {
|
||||
if start != 0 {
|
||||
let penalty = PENALTY_GAP_START
|
||||
+ PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16;
|
||||
score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
|
||||
} else {
|
||||
score += MAX_PREFIX_BONUS;
|
||||
}
|
||||
}
|
||||
score
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use crate::chars::Char;
|
||||
use crate::score::{
|
||||
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
|
||||
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
||||
MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
||||
};
|
||||
use crate::utf32_str::Utf32Str;
|
||||
use crate::{Matcher, MatcherConfig};
|
||||
@ -23,11 +23,13 @@ fn assert_matches(
|
||||
normalize: bool,
|
||||
case_sensitive: bool,
|
||||
path: bool,
|
||||
prefer_prefix: bool,
|
||||
cases: &[(&str, &str, &[u32], u16)],
|
||||
) {
|
||||
let mut config = MatcherConfig {
|
||||
normalize,
|
||||
ignore_case: !case_sensitive,
|
||||
prefer_prefix,
|
||||
..MatcherConfig::DEFAULT
|
||||
};
|
||||
if path {
|
||||
@ -142,6 +144,7 @@ fn test_fuzzy() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"fooBarbaz1",
|
||||
@ -250,6 +253,7 @@ fn empty_needle() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[("foo bar baz", "", &[], 0)],
|
||||
);
|
||||
}
|
||||
@ -261,6 +265,7 @@ fn test_substring() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"foo bar baz",
|
||||
@ -287,6 +292,7 @@ fn test_substring() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"foo bar baz",
|
||||
@ -313,6 +319,7 @@ fn test_substring() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"foo",
|
||||
@ -339,6 +346,7 @@ fn test_substring() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"fooBarbaz1",
|
||||
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"fooBarbaz1",
|
||||
@ -418,6 +427,7 @@ fn test_normalize() {
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"Só Danço Samba",
|
||||
@ -464,6 +474,7 @@ fn test_unicode() {
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"你好世界",
|
||||
@ -488,6 +499,7 @@ fn test_long_str() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[(
|
||||
&"x".repeat(u16::MAX as usize + 1),
|
||||
"xx",
|
||||
@ -504,6 +516,7 @@ fn test_casing() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
// these two have the same score
|
||||
(
|
||||
@ -536,6 +549,7 @@ fn test_casing() {
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_optimal() {
|
||||
assert_matches(
|
||||
@ -543,6 +557,7 @@ fn test_optimal() {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
&[
|
||||
(
|
||||
"axxx xx ",
|
||||
@ -624,3 +639,32 @@ fn test_reject() {
|
||||
);
|
||||
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefer_prefix() {
|
||||
assert_matches(
|
||||
&[FuzzyOptimal, FuzzyGreedy],
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
&[
|
||||
(
|
||||
"Moby Dick",
|
||||
"md",
|
||||
&[0, 5],
|
||||
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS
|
||||
- PENALTY_GAP_START
|
||||
- 3 * PENALTY_GAP_EXTENSION,
|
||||
),
|
||||
(
|
||||
"Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
|
||||
"md",
|
||||
&[82, 85],
|
||||
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
|
||||
- PENALTY_GAP_START
|
||||
- PENALTY_GAP_EXTENSION,
|
||||
),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user