mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 01:47:49 +00:00
add option to prefer prefix matches
This commit is contained in:
parent
f18c19cd53
commit
b38fdfa8d7
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -152,7 +152,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nucleo"
|
name = "nucleo"
|
||||||
version = "0.1.0"
|
version = "0.1.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"nucleo-matcher",
|
"nucleo-matcher",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
|
@ -15,6 +15,14 @@ pub struct MatcherConfig {
|
|||||||
pub normalize: bool,
|
pub normalize: bool,
|
||||||
/// whether to ignore casing
|
/// whether to ignore casing
|
||||||
pub ignore_case: bool,
|
pub ignore_case: bool,
|
||||||
|
/// Whether to provide a bonus to matches by their distance from the start
|
||||||
|
/// of the haystack. The bonus is fairly small compared to the normal gap
|
||||||
|
/// penalty to avoid messing with the normal score heuristic. This setting
|
||||||
|
/// is not turned on by default and only recommended for autocompletion
|
||||||
|
/// usecases where the expectation is that the user is typing the entire
|
||||||
|
/// match. For a full fzf-like fuzzy matcher/picker word segmentation and
|
||||||
|
/// explicit prefix literals should be used instead.
|
||||||
|
pub prefer_prefix: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MatcherConfig {
|
impl MatcherConfig {
|
||||||
@ -26,6 +34,7 @@ impl MatcherConfig {
|
|||||||
initial_char_class: CharClass::Whitespace,
|
initial_char_class: CharClass::Whitespace,
|
||||||
normalize: true,
|
normalize: true,
|
||||||
ignore_case: true,
|
ignore_case: true,
|
||||||
|
prefer_prefix: false,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -3,8 +3,8 @@ use std::cmp::max;
|
|||||||
use crate::chars::{Char, CharClass};
|
use crate::chars::{Char, CharClass};
|
||||||
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
|
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
|
||||||
use crate::score::{
|
use crate::score::{
|
||||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
|
||||||
PENALTY_GAP_START, SCORE_MATCH,
|
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
|
||||||
};
|
};
|
||||||
use crate::{Matcher, MatcherConfig};
|
use crate::{Matcher, MatcherConfig};
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ impl Matcher {
|
|||||||
.checked_sub(1)
|
.checked_sub(1)
|
||||||
.map(|i| haystack[i].char_class(&self.config))
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
.unwrap_or(self.config.initial_char_class);
|
.unwrap_or(self.config.initial_char_class);
|
||||||
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
|
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
|
||||||
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
|
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
|
||||||
if !matched {
|
if !matched {
|
||||||
assert!(
|
assert!(
|
||||||
@ -117,6 +117,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
needle: &[N],
|
needle: &[N],
|
||||||
mut prev_class: CharClass,
|
mut prev_class: CharClass,
|
||||||
config: &MatcherConfig,
|
config: &MatcherConfig,
|
||||||
|
start: u32,
|
||||||
) -> bool
|
) -> bool
|
||||||
where
|
where
|
||||||
H: PartialEq<N>,
|
H: PartialEq<N>,
|
||||||
@ -167,6 +168,17 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
0,
|
0,
|
||||||
needle[0],
|
needle[0],
|
||||||
needle[1],
|
needle[1],
|
||||||
|
if config.prefer_prefix {
|
||||||
|
if start == 0 {
|
||||||
|
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
|
||||||
|
} else {
|
||||||
|
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
|
||||||
|
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
},
|
||||||
);
|
);
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
@ -182,6 +194,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
needle_idx: u16,
|
needle_idx: u16,
|
||||||
needle_char: N,
|
needle_char: N,
|
||||||
next_needle_char: N,
|
next_needle_char: N,
|
||||||
|
mut prefix_bonus: u16,
|
||||||
) where
|
) where
|
||||||
H: PartialEq<N>,
|
H: PartialEq<N>,
|
||||||
{
|
{
|
||||||
@ -198,15 +211,19 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
|
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
|
||||||
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||||
let m_cell = if FIRST_ROW {
|
let m_cell = if FIRST_ROW {
|
||||||
if c == needle_char {
|
let cell = if c == needle_char {
|
||||||
ScoreCell {
|
ScoreCell {
|
||||||
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
|
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
|
||||||
|
+ SCORE_MATCH
|
||||||
|
+ prefix_bonus / PREFIX_BONUS_SCALE,
|
||||||
matched: false,
|
matched: false,
|
||||||
consecutive_bonus: *bonus,
|
consecutive_bonus: *bonus,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
UNMATCHED
|
UNMATCHED
|
||||||
}
|
};
|
||||||
|
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
|
||||||
|
cell
|
||||||
} else {
|
} else {
|
||||||
*score_cell
|
*score_cell
|
||||||
};
|
};
|
||||||
@ -224,15 +241,19 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
for (((c, bonus), score_cell), matrix_cell) in col_iter {
|
for (((c, bonus), score_cell), matrix_cell) in col_iter {
|
||||||
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||||
let m_cell = if FIRST_ROW {
|
let m_cell = if FIRST_ROW {
|
||||||
if c[0] == needle_char {
|
let cell = if c[0] == needle_char {
|
||||||
ScoreCell {
|
ScoreCell {
|
||||||
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
|
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
|
||||||
|
+ SCORE_MATCH
|
||||||
|
+ prefix_bonus / PREFIX_BONUS_SCALE,
|
||||||
matched: false,
|
matched: false,
|
||||||
consecutive_bonus: bonus[0],
|
consecutive_bonus: bonus[0],
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
UNMATCHED
|
UNMATCHED
|
||||||
}
|
};
|
||||||
|
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
|
||||||
|
cell
|
||||||
} else {
|
} else {
|
||||||
*score_cell
|
*score_cell
|
||||||
};
|
};
|
||||||
@ -271,6 +292,7 @@ impl<H: Char> MatcherDataView<'_, H> {
|
|||||||
needle_idx as u16 + 1,
|
needle_idx as u16 + 1,
|
||||||
needle_char,
|
needle_char,
|
||||||
next_needle_char,
|
next_needle_char,
|
||||||
|
0,
|
||||||
);
|
);
|
||||||
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
|
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
|
||||||
matrix_cells = &mut matrix_cells[len..];
|
matrix_cells = &mut matrix_cells[len..];
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
/*!
|
/*!
|
||||||
`nucleo_matcher` is a low level crate that contains the matcher implementation
|
`nucleo_matcher` is a low level crate that contains the matcher implementation
|
||||||
used by the other nucleo crates.
|
used by the high level `nucleo` crate.
|
||||||
|
|
||||||
The matcher is highly optimized and can significantly outperform `fzf` and
|
The matcher is highly optimized and can significantly outperform `fzf` and
|
||||||
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
|
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
|
||||||
a slightly less convenient API. Particularly, `nucleo_matcher` requires that
|
a slightly less convenient API. Be sure to carefully read the documentation of the
|
||||||
needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
|
[`Matcher`] to avoid unexpected behaviour.
|
||||||
of rusts normal utf32 strings.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// sadly ranges don't optimize well
|
// sadly ranges don't optimize well
|
||||||
@ -50,6 +49,33 @@ use crate::matrix::MatrixSlab;
|
|||||||
/// multiple different matches on the same haystack and merging the indices by
|
/// multiple different matches on the same haystack and merging the indices by
|
||||||
/// sorting and deduplicating the vector.
|
/// sorting and deduplicating the vector.
|
||||||
///
|
///
|
||||||
|
/// The `needle` argument for each function must always be normalized by the caller
|
||||||
|
/// (unicode normalization and case folding if a case insensitive match is produced).
|
||||||
|
/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
|
||||||
|
/// provides utilities to preprocess needles.
|
||||||
|
///
|
||||||
|
/// Additionally it's recommended to perform separate matches for each word in
|
||||||
|
/// the needle. Consider the following example: If `foo bar` is used as the
|
||||||
|
/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
|
||||||
|
/// `foo test baaar` will receive a higher score/rank higher. `baaar` contains a
|
||||||
|
/// 2 character gap which will receive a penalty and therefore the user will
|
||||||
|
/// likely expect it to rank lower. However, if `foo bar` is matched as a single
|
||||||
|
/// query `hello-world` and `test` are both considered gaps too. As `hello-
|
||||||
|
/// world` is a much longer gap than `test` the extra penalty for `baaar` is
|
||||||
|
/// outweighed. If both words are matched individually the interspersed words
|
||||||
|
/// do not receive a penalty and `foo hello-world bar` ranks higher.
|
||||||
|
///
|
||||||
|
/// In general nucleo is a **substring matching tool** with no penalty assigned
|
||||||
|
/// to matches that start later within the same pattern (which enables the
|
||||||
|
/// usecase shown above). This may be undesirable in one very particular usecase:
|
||||||
|
/// For automatic suggestions for commands (like a shell). In this case the
|
||||||
|
/// assumption is that the user is actually typing the full haystack. In other words:
|
||||||
|
/// The matcher should prefer a prefix match. To accommodate that usecase the
|
||||||
|
/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
|
||||||
|
/// to true. Note that the penalty given is quite small (and capped to a maximum)
|
||||||
|
/// to avoid overwriting the normal scoring heuristic.
|
||||||
|
///
|
||||||
|
///
|
||||||
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
|
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
|
||||||
/// that the matcher *will panic*. The caller must decide whether it wants to
|
/// that the matcher *will panic*. The caller must decide whether it wants to
|
||||||
/// filter out long haystacks or truncate them.
|
/// filter out long haystacks or truncate them.
|
||||||
|
@ -6,6 +6,10 @@ use crate::{Matcher, MatcherConfig};
|
|||||||
pub(crate) const SCORE_MATCH: u16 = 16;
|
pub(crate) const SCORE_MATCH: u16 = 16;
|
||||||
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
||||||
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
||||||
|
/// If the prefer_prefix option is enabled we want to penalize
|
||||||
|
/// the initial gap. The prefix bonus should not be too large.
|
||||||
|
pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
|
||||||
|
pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
|
||||||
|
|
||||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
// We prefer matches at the beginning of a word, but the bonus should not be
|
||||||
// too great to prevent the longer acronym matches from always winning over
|
// too great to prevent the longer acronym matches from always winning over
|
||||||
@ -140,7 +144,15 @@ impl Matcher {
|
|||||||
}
|
}
|
||||||
prev_class = class;
|
prev_class = class;
|
||||||
}
|
}
|
||||||
|
if self.config.prefer_prefix {
|
||||||
|
if start != 0 {
|
||||||
|
let penalty = PENALTY_GAP_START
|
||||||
|
+ PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16;
|
||||||
|
score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
|
||||||
|
} else {
|
||||||
|
score += MAX_PREFIX_BONUS;
|
||||||
|
}
|
||||||
|
}
|
||||||
score
|
score
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use crate::chars::Char;
|
use crate::chars::Char;
|
||||||
use crate::score::{
|
use crate::score::{
|
||||||
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
|
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
|
||||||
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
||||||
};
|
};
|
||||||
use crate::utf32_str::Utf32Str;
|
use crate::utf32_str::Utf32Str;
|
||||||
use crate::{Matcher, MatcherConfig};
|
use crate::{Matcher, MatcherConfig};
|
||||||
@ -23,11 +23,13 @@ fn assert_matches(
|
|||||||
normalize: bool,
|
normalize: bool,
|
||||||
case_sensitive: bool,
|
case_sensitive: bool,
|
||||||
path: bool,
|
path: bool,
|
||||||
|
prefer_prefix: bool,
|
||||||
cases: &[(&str, &str, &[u32], u16)],
|
cases: &[(&str, &str, &[u32], u16)],
|
||||||
) {
|
) {
|
||||||
let mut config = MatcherConfig {
|
let mut config = MatcherConfig {
|
||||||
normalize,
|
normalize,
|
||||||
ignore_case: !case_sensitive,
|
ignore_case: !case_sensitive,
|
||||||
|
prefer_prefix,
|
||||||
..MatcherConfig::DEFAULT
|
..MatcherConfig::DEFAULT
|
||||||
};
|
};
|
||||||
if path {
|
if path {
|
||||||
@ -142,6 +144,7 @@ fn test_fuzzy() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"fooBarbaz1",
|
"fooBarbaz1",
|
||||||
@ -250,6 +253,7 @@ fn empty_needle() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[("foo bar baz", "", &[], 0)],
|
&[("foo bar baz", "", &[], 0)],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -261,6 +265,7 @@ fn test_substring() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"foo bar baz",
|
"foo bar baz",
|
||||||
@ -287,6 +292,7 @@ fn test_substring() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"foo bar baz",
|
"foo bar baz",
|
||||||
@ -313,6 +319,7 @@ fn test_substring() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"foo",
|
"foo",
|
||||||
@ -339,6 +346,7 @@ fn test_substring() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"fooBarbaz1",
|
"fooBarbaz1",
|
||||||
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
|
|||||||
false,
|
false,
|
||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"fooBarbaz1",
|
"fooBarbaz1",
|
||||||
@ -418,6 +427,7 @@ fn test_normalize() {
|
|||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"Só Danço Samba",
|
"Só Danço Samba",
|
||||||
@ -464,6 +474,7 @@ fn test_unicode() {
|
|||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"你好世界",
|
"你好世界",
|
||||||
@ -488,6 +499,7 @@ fn test_long_str() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[(
|
&[(
|
||||||
&"x".repeat(u16::MAX as usize + 1),
|
&"x".repeat(u16::MAX as usize + 1),
|
||||||
"xx",
|
"xx",
|
||||||
@ -504,6 +516,7 @@ fn test_casing() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
// these two have the same score
|
// these two have the same score
|
||||||
(
|
(
|
||||||
@ -536,6 +549,7 @@ fn test_casing() {
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_optimal() {
|
fn test_optimal() {
|
||||||
assert_matches(
|
assert_matches(
|
||||||
@ -543,6 +557,7 @@ fn test_optimal() {
|
|||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
&[
|
&[
|
||||||
(
|
(
|
||||||
"axxx xx ",
|
"axxx xx ",
|
||||||
@ -624,3 +639,32 @@ fn test_reject() {
|
|||||||
);
|
);
|
||||||
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
|
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_prefer_prefix() {
|
||||||
|
assert_matches(
|
||||||
|
&[FuzzyOptimal, FuzzyGreedy],
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
&[
|
||||||
|
(
|
||||||
|
"Moby Dick",
|
||||||
|
"md",
|
||||||
|
&[0, 5],
|
||||||
|
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS
|
||||||
|
- PENALTY_GAP_START
|
||||||
|
- 3 * PENALTY_GAP_EXTENSION,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
|
||||||
|
"md",
|
||||||
|
&[82, 85],
|
||||||
|
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
|
||||||
|
- PENALTY_GAP_START
|
||||||
|
- PENALTY_GAP_EXTENSION,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user