hardcode score config, fix some oversights and perf

This commit is contained in:
Pascal Kuthe 2023-07-18 19:20:15 +02:00
parent d0703bb6e0
commit 6837b4e2cb
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
2 changed files with 97 additions and 98 deletions

View File

@ -1,44 +1,44 @@
// Scoring constants shared by the fuzzy matchers. All values are unsigned
// 16-bit magnitudes: penalties are stored as positive numbers and are
// subtracted from the running score rather than added as negative values.

// Score awarded for every needle character that matches a haystack character.
pub(crate) const SCORE_MATCH: u16 = 16;
// Penalty subtracted for the first unmatched character of a gap.
pub(crate) const PENALTY_GAP_START: u16 = 3;
// Penalty subtracted for each further unmatched character of the same gap.
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
// We prefer matches at the beginning of a word, but the bonus should not be
// so great that longer acronym matches always win over shorter fuzzy
// matches. The bonus point here was specifically chosen so that the bonus is
// cancelled when the gap between the acronyms grows over 8 characters, which
// is approximately the average length of the words found in the web2
// dictionary and my file system.
// Evaluates to 8.
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
// Although the bonus point for non-word characters is non-contextual, we need
// it for computing bonus points for consecutive chunks starting with a
// non-word character.
// Evaluates to 8.
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
// Edge-triggered bonus for matches in camelCase words.
// Compared to the word-boundary case, they are not accompanied by
// single-character gaps (e.g. FooBar vs. foo-bar), so we deduct the gap
// extension penalty from the bonus accordingly.
// Evaluates to 7.
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches would not have been needed
// if we used a fixed match score as in the original algorithm.
// Evaluates to 4.
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
// The first character in the typed pattern usually has more significance
// than the rest, so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
// The bonus of the first needle character is multiplied by this factor.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
pub struct MatcherConfig {
pub score_match: i16,
pub score_gap_start: i16,
pub score_gap_extension: i16,
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
// shorter fuzzy matches. The bonus point here was specifically chosen that
// the bonus is cancelled when the gap between the acronyms grows over
// 8 characters, which is approximately the average length of the words found
// in web2 dictionary and my file system.
pub bonus_boundary: i16,
// Although bonus point for non-word characters is non-contextual, we need it
// for computing bonus points for consecutive chunks starting with a non-word
// character.
pub bonus_non_word: i16,
// Edge-triggered bonus for matches in camelCase words.
// Compared to word-boundary case, they don't accompany single-character gaps
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
pub bonus_camel123: i16,
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches shouldn't have needed if we
// used fixed match score as in the original algorithm.
pub bonus_consecutive: i16,
// The first character in the typed pattern usually has more significance
// than the rest so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub bonus_first_char_multiplier: i16,
pub delimeter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string
pub bonus_boundary_white: i16,
pub bonus_boundary_white: u16,
// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub bonus_boundary_delimiter: i16,
pub bonus_boundary_delimiter: u16,
pub inital_char_class: CharClass,
/// Whether to normalize latin script characters to ASCII.
/// This significantly degrades performance, so it's not recommended.
@ -74,22 +74,10 @@ pub enum CaseMatching {
impl MatcherConfig {
pub const DEFAULT: Self = {
let score_match = 16;
let score_gap_start = -3;
let score_gap_extension = -1;
let bonus_boundary = score_match / 2;
MatcherConfig {
score_match,
score_gap_start,
score_gap_extension,
bonus_boundary,
bonus_non_word: score_match / 2,
bonus_camel123: bonus_boundary + score_gap_extension,
bonus_consecutive: -(score_gap_start + score_gap_extension),
bonus_first_char_multiplier: 2,
delimeter_chars: b"/,:;|",
bonus_boundary_white: bonus_boundary + 2,
bonus_boundary_delimiter: bonus_boundary + 1,
bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
inital_char_class: CharClass::Whitespace,
normalize: false,
use_v1: false,
@ -105,7 +93,7 @@ impl MatcherConfig {
} else {
self.delimeter_chars = b"/";
}
self.bonus_boundary_white = self.bonus_boundary;
self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter;
}
@ -115,7 +103,7 @@ impl MatcherConfig {
} else {
self.delimeter_chars = b"/";
}
self.bonus_boundary_white = self.bonus_boundary;
self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter;
self
}
@ -161,13 +149,13 @@ impl MatcherConfig {
}
}
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> i16 {
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
if class > CharClass::NonWord {
// transition from non word to word
match prev_class {
CharClass::Whitespace => return self.bonus_boundary_white,
CharClass::Delimiter => return self.bonus_boundary_delimiter,
CharClass::NonWord => return self.bonus_boundary,
CharClass::NonWord => return BONUS_BOUNDARY,
_ => (),
}
}
@ -175,9 +163,9 @@ impl MatcherConfig {
|| prev_class != CharClass::Number && class == CharClass::Number
{
// camelCase letter123
self.bonus_camel123
BONUS_CAMEL123
} else if class == CharClass::NonWord {
self.bonus_non_word
BONUS_NON_WORD
} else if class == CharClass::Whitespace {
self.bonus_boundary_white
} else {

View File

@ -16,19 +16,24 @@ mod normalize;
pub use config::{CaseMatching, CharClass, MatcherConfig};
use crate::config::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
};
// Size limits for the matcher's working buffers. Both values count elements,
// not bytes; the trailing byte figures only hold for a particular element size.

// NOTE(review): 75 * 1024 elements is 300KB only if each element is 4 bytes
// (e.g. two 16-bit fields) — confirm against the matrix allocation site.
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
// NOTE(review): 8192 elements is 64KB only if each element is 8 bytes
// (e.g. a `char` plus a 16-bit field, padded) — confirm against the
// haystack buffer allocation site.
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
#[derive(Clone, Copy, PartialEq, Eq)]
struct MatrixCell {
score: i16,
score: u16,
consecutive_chars: u16,
}
#[derive(Clone, Copy, PartialEq, Eq)]
struct HaystackChar {
char: char,
bonus: i16,
bonus: u16,
}
pub struct Matcher {
@ -134,7 +139,7 @@ impl Matcher {
);
}
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<i32> {
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<u16> {
if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize]
}
@ -156,7 +161,7 @@ impl Matcher {
query: &Query,
mut haystack: &str,
indicies: &mut Vec<u32>,
) -> Option<i32> {
) -> Option<u16> {
if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize]
}
@ -236,7 +241,7 @@ impl Matcher {
query: &Query,
haystack: &str,
indicies: &mut Vec<u32>,
) -> Option<i32> {
) -> Option<u16> {
let (start, end) = self.prefilter(query, haystack)?;
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
query, haystack, start, end, indicies,
@ -250,7 +255,7 @@ impl Matcher {
mut start: usize,
mut end: usize,
indicies: &mut Vec<u32>,
) -> Option<i32> {
) -> Option<u16> {
let first_char_end = if ASCII_ONLY { start + 1 } else { end };
if !ASCII_ONLY && query.needle_chars.len() != 1 {
let mut needle_iter = query.needle_chars[1..].iter().copied();
@ -297,7 +302,7 @@ impl Matcher {
match_start: usize,
match_end: usize,
indicies: &mut Vec<u32>,
) -> i32 {
) -> u16 {
if INDICIES {
indicies.reserve(query.needle_chars.len());
}
@ -307,10 +312,10 @@ impl Matcher {
.map(|c| self.config.char_class(c))
.unwrap_or(self.config.inital_char_class);
let mut needle_idx = 0;
let mut score = 0i32;
let mut score = 0u16;
let mut in_gap = false;
let mut consecutive = 0;
let mut first_bonus = 0i16;
let mut first_bonus = 0u16;
for (i, mut c) in text[match_start..match_end].char_indices() {
let class = self.config.char_class(c);
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
@ -323,33 +328,36 @@ impl Matcher {
if INDICIES {
indicies.push(i as u32)
}
score += self.config.score_match as i32;
score += SCORE_MATCH;
let mut bonus = self.config.bonus_for(prev_class, class);
if consecutive == 0 {
first_bonus = bonus
} else {
// Break consecutive chunk
if bonus >= self.config.bonus_boundary && bonus > first_bonus {
first_bonus = bonus
if bonus > first_bonus {
if bonus >= BONUS_BOUNDARY {
first_bonus = bonus;
} else {
bonus = max(bonus, BONUS_CONSECUTIVE);
}
} else {
bonus = max(first_bonus, BONUS_CONSECUTIVE);
}
bonus = max(
max(bonus, first_bonus),
self.config.bonus_first_char_multiplier,
);
}
if needle_idx == 0 {
bonus *= self.config.bonus_first_char_multiplier
bonus *= BONUS_FIRST_CHAR_MULTIPLIER;
}
score += bonus as i32;
score += bonus;
needle_idx += 1;
in_gap = false;
consecutive += 1;
} else {
if in_gap {
score += self.config.score_gap_extension as i32
let penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
score += self.config.score_gap_start as i32
}
PENALTY_GAP_START
};
score = score.saturating_sub(penalty);
in_gap = true;
consecutive = 0;
first_bonus = 0;
@ -365,7 +373,7 @@ impl Matcher {
query: &Query,
text: &str,
indicies: &mut Vec<u32>,
) -> Option<i32> {
) -> Option<u16> {
let (start, prefilter_end) = self.prefilter(query, text)?;
let text_len = text.len() - start;
// fallback to v1 algorithms for long haystacks
@ -407,7 +415,7 @@ impl Matcher {
let mut max_score = 0;
let mut max_score_pos = 0;
let mut in_gap = false;
let mut prev_score = 0;
let mut prev_score = 0u16;
let mut matched = false;
let first_needle_char = query.needle_chars[0];
@ -438,25 +446,24 @@ impl Matcher {
last_matched_idx = i;
}
if c == first_needle_char {
let score =
self.config.score_match + bonus * self.config.bonus_first_char_multiplier;
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
matrix_cell.consecutive_chars = 1;
if query.needle_chars.len() == 1 && score > max_score {
max_score = score;
max_score_pos = i;
// can't get better than this
if bonus >= self.config.bonus_boundary {
if bonus >= BONUS_BOUNDARY {
break;
}
}
in_gap = false;
} else {
let gap_score = if in_gap {
self.config.score_gap_extension
let gap_penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
self.config.score_gap_start
PENALTY_GAP_START
};
matrix_cell.score = max(0, gap_score + prev_score);
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
matrix_cell.consecutive_chars = 0;
in_gap = true;
}
@ -468,7 +475,7 @@ impl Matcher {
}
if query.needle_chars.len() == 1 {
indicies.push(max_score_pos as u32);
return Some(max_score as i32);
return Some(max_score);
}
assert_eq!(
self.first_needle_occurance[0], 0,
@ -517,10 +524,10 @@ impl Matcher {
}
}
Some(max_score as i32)
Some(max_score)
}
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (i16, u16) {
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) {
let mut max_score = 0;
let mut max_score_end = 0;
let mut iter = query
@ -548,24 +555,28 @@ impl Matcher {
.enumerate()
{
let col = j + first_occurance as usize;
let gap_score = if in_gap {
self.config.score_gap_extension
let gap_penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
self.config.score_gap_start
PENALTY_GAP_START
};
let mut score1 = 0;
let score2 = prev_matrix_cell.score + gap_score;
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
let mut consecutive = 0;
if haystack_char.char == needle_char {
score1 = diag_matrix_cell.score + self.config.score_match;
score1 = diag_matrix_cell.score + SCORE_MATCH;
let mut bonus = haystack_char.bonus;
consecutive = diag_matrix_cell.consecutive_chars + 1;
if consecutive > 1 {
let first_bonus = self.haystack[col - consecutive as usize].bonus;
if bonus > self.config.bonus_boundary && bonus > first_bonus {
consecutive = 1
if bonus > first_bonus {
if bonus > BONUS_BOUNDARY {
consecutive = 1
} else {
bonus = max(bonus, BONUS_CONSECUTIVE)
}
} else {
bonus = max(bonus, max(self.config.bonus_consecutive, first_bonus))
bonus = max(first_bonus, BONUS_CONSECUTIVE)
}
}
if score1 + bonus < score2 {