hardcode score config, fix some oversights and improve perf

This commit is contained in:
Pascal Kuthe 2023-07-18 19:20:15 +02:00
parent d0703bb6e0
commit 6837b4e2cb
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
2 changed files with 97 additions and 98 deletions

View File

@ -1,44 +1,44 @@
// Score added for every needle character that matches a haystack character.
pub(crate) const SCORE_MATCH: u16 = 16;
// Penalty subtracted for the first non-matching character that opens a gap.
pub(crate) const PENALTY_GAP_START: u16 = 3;
// Penalty subtracted for each further non-matching character while already
// inside a gap.
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
// We prefer matches at the beginning of a word, but the bonus should not be
// too great, so that longer acronym matches do not always win over shorter
// fuzzy matches. The bonus point here was specifically chosen so that
// the bonus is cancelled when the gap between the acronyms grows over
// 8 characters, which is approximately the average length of the words found
// in the web2 dictionary and my file system.
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
// Although the bonus point for non-word characters is non-contextual, we need
// it for computing bonus points for consecutive chunks starting with a
// non-word character.
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
// Edge-triggered bonus for matches in camelCase words.
// Compared to the word-boundary case, they are not accompanied by
// single-character gaps (e.g. FooBar vs. foo-bar), so we deduct the bonus
// point accordingly.
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches wouldn't have been needed if
// we used a fixed match score as in the original algorithm.
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
// The first character in the typed pattern usually has more significance
// than the rest, so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
pub struct MatcherConfig { pub struct MatcherConfig {
pub score_match: i16,
pub score_gap_start: i16,
pub score_gap_extension: i16,
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
// shorter fuzzy matches. The bonus point here was specifically chosen that
// the bonus is cancelled when the gap between the acronyms grows over
// 8 characters, which is approximately the average length of the words found
// in web2 dictionary and my file system.
pub bonus_boundary: i16,
// Although bonus point for non-word characters is non-contextual, we need it
// for computing bonus points for consecutive chunks starting with a non-word
// character.
pub bonus_non_word: i16,
// Edge-triggered bonus for matches in camelCase words.
// Compared to word-boundary case, they don't accompany single-character gaps
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
pub bonus_camel123: i16,
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches shouldn't have needed if we
// used fixed match score as in the original algorithm.
pub bonus_consecutive: i16,
// The first character in the typed pattern usually has more significance
// than the rest so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub bonus_first_char_multiplier: i16,
pub delimeter_chars: &'static [u8], pub delimeter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string /// Extra bonus for word boundary after whitespace character or beginning of the string
pub bonus_boundary_white: i16, pub bonus_boundary_white: u16,
// Extra bonus for word boundary after slash, colon, semi-colon, and comma // Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub bonus_boundary_delimiter: i16, pub bonus_boundary_delimiter: u16,
pub inital_char_class: CharClass, pub inital_char_class: CharClass,
/// Whether to normalize Latin script characters to ASCII /// Whether to normalize Latin script characters to ASCII
/// this significantly degrades performance so it's not recommended /// this significantly degrades performance so it's not recommended
@ -74,22 +74,10 @@ pub enum CaseMatching {
impl MatcherConfig { impl MatcherConfig {
pub const DEFAULT: Self = { pub const DEFAULT: Self = {
let score_match = 16;
let score_gap_start = -3;
let score_gap_extension = -1;
let bonus_boundary = score_match / 2;
MatcherConfig { MatcherConfig {
score_match,
score_gap_start,
score_gap_extension,
bonus_boundary,
bonus_non_word: score_match / 2,
bonus_camel123: bonus_boundary + score_gap_extension,
bonus_consecutive: -(score_gap_start + score_gap_extension),
bonus_first_char_multiplier: 2,
delimeter_chars: b"/,:;|", delimeter_chars: b"/,:;|",
bonus_boundary_white: bonus_boundary + 2, bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: bonus_boundary + 1, bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
inital_char_class: CharClass::Whitespace, inital_char_class: CharClass::Whitespace,
normalize: false, normalize: false,
use_v1: false, use_v1: false,
@ -105,7 +93,7 @@ impl MatcherConfig {
} else { } else {
self.delimeter_chars = b"/"; self.delimeter_chars = b"/";
} }
self.bonus_boundary_white = self.bonus_boundary; self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter; self.inital_char_class = CharClass::Delimiter;
} }
@ -115,7 +103,7 @@ impl MatcherConfig {
} else { } else {
self.delimeter_chars = b"/"; self.delimeter_chars = b"/";
} }
self.bonus_boundary_white = self.bonus_boundary; self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter; self.inital_char_class = CharClass::Delimiter;
self self
} }
@ -161,13 +149,13 @@ impl MatcherConfig {
} }
} }
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> i16 { pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
if class > CharClass::NonWord { if class > CharClass::NonWord {
// transition from non word to word // transition from non word to word
match prev_class { match prev_class {
CharClass::Whitespace => return self.bonus_boundary_white, CharClass::Whitespace => return self.bonus_boundary_white,
CharClass::Delimiter => return self.bonus_boundary_delimiter, CharClass::Delimiter => return self.bonus_boundary_delimiter,
CharClass::NonWord => return self.bonus_boundary, CharClass::NonWord => return BONUS_BOUNDARY,
_ => (), _ => (),
} }
} }
@ -175,9 +163,9 @@ impl MatcherConfig {
|| prev_class != CharClass::Number && class == CharClass::Number || prev_class != CharClass::Number && class == CharClass::Number
{ {
// camelCase letter123 // camelCase letter123
self.bonus_camel123 BONUS_CAMEL123
} else if class == CharClass::NonWord { } else if class == CharClass::NonWord {
self.bonus_non_word BONUS_NON_WORD
} else if class == CharClass::Whitespace { } else if class == CharClass::Whitespace {
self.bonus_boundary_white self.bonus_boundary_white
} else { } else {

View File

@ -16,19 +16,24 @@ mod normalize;
pub use config::{CaseMatching, CharClass, MatcherConfig}; pub use config::{CaseMatching, CharClass, MatcherConfig};
use crate::config::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
};
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
#[derive(Clone, Copy, PartialEq, Eq)] #[derive(Clone, Copy, PartialEq, Eq)]
struct MatrixCell { struct MatrixCell {
score: i16, score: u16,
consecutive_chars: u16, consecutive_chars: u16,
} }
#[derive(Clone, Copy, PartialEq, Eq)] #[derive(Clone, Copy, PartialEq, Eq)]
struct HaystackChar { struct HaystackChar {
char: char, char: char,
bonus: i16, bonus: u16,
} }
pub struct Matcher { pub struct Matcher {
@ -134,7 +139,7 @@ impl Matcher {
); );
} }
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<i32> { pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<u16> {
if haystack.len() > u32::MAX as usize { if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize] haystack = &haystack[..u32::MAX as usize]
} }
@ -156,7 +161,7 @@ impl Matcher {
query: &Query, query: &Query,
mut haystack: &str, mut haystack: &str,
indicies: &mut Vec<u32>, indicies: &mut Vec<u32>,
) -> Option<i32> { ) -> Option<u16> {
if haystack.len() > u32::MAX as usize { if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize] haystack = &haystack[..u32::MAX as usize]
} }
@ -236,7 +241,7 @@ impl Matcher {
query: &Query, query: &Query,
haystack: &str, haystack: &str,
indicies: &mut Vec<u32>, indicies: &mut Vec<u32>,
) -> Option<i32> { ) -> Option<u16> {
let (start, end) = self.prefilter(query, haystack)?; let (start, end) = self.prefilter(query, haystack)?;
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>( self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
query, haystack, start, end, indicies, query, haystack, start, end, indicies,
@ -250,7 +255,7 @@ impl Matcher {
mut start: usize, mut start: usize,
mut end: usize, mut end: usize,
indicies: &mut Vec<u32>, indicies: &mut Vec<u32>,
) -> Option<i32> { ) -> Option<u16> {
let first_char_end = if ASCII_ONLY { start + 1 } else { end }; let first_char_end = if ASCII_ONLY { start + 1 } else { end };
if !ASCII_ONLY && query.needle_chars.len() != 1 { if !ASCII_ONLY && query.needle_chars.len() != 1 {
let mut needle_iter = query.needle_chars[1..].iter().copied(); let mut needle_iter = query.needle_chars[1..].iter().copied();
@ -297,7 +302,7 @@ impl Matcher {
match_start: usize, match_start: usize,
match_end: usize, match_end: usize,
indicies: &mut Vec<u32>, indicies: &mut Vec<u32>,
) -> i32 { ) -> u16 {
if INDICIES { if INDICIES {
indicies.reserve(query.needle_chars.len()); indicies.reserve(query.needle_chars.len());
} }
@ -307,10 +312,10 @@ impl Matcher {
.map(|c| self.config.char_class(c)) .map(|c| self.config.char_class(c))
.unwrap_or(self.config.inital_char_class); .unwrap_or(self.config.inital_char_class);
let mut needle_idx = 0; let mut needle_idx = 0;
let mut score = 0i32; let mut score = 0u16;
let mut in_gap = false; let mut in_gap = false;
let mut consecutive = 0; let mut consecutive = 0;
let mut first_bonus = 0i16; let mut first_bonus = 0u16;
for (i, mut c) in text[match_start..match_end].char_indices() { for (i, mut c) in text[match_start..match_end].char_indices() {
let class = self.config.char_class(c); let class = self.config.char_class(c);
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case { if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
@ -323,33 +328,36 @@ impl Matcher {
if INDICIES { if INDICIES {
indicies.push(i as u32) indicies.push(i as u32)
} }
score += self.config.score_match as i32; score += SCORE_MATCH;
let mut bonus = self.config.bonus_for(prev_class, class); let mut bonus = self.config.bonus_for(prev_class, class);
if consecutive == 0 { if consecutive == 0 {
first_bonus = bonus first_bonus = bonus
} else { } else {
// Break consecutive chunk // Break consecutive chunk
if bonus >= self.config.bonus_boundary && bonus > first_bonus { if bonus > first_bonus {
first_bonus = bonus if bonus >= BONUS_BOUNDARY {
first_bonus = bonus;
} else {
bonus = max(bonus, BONUS_CONSECUTIVE);
}
} else {
bonus = max(first_bonus, BONUS_CONSECUTIVE);
} }
bonus = max(
max(bonus, first_bonus),
self.config.bonus_first_char_multiplier,
);
} }
if needle_idx == 0 { if needle_idx == 0 {
bonus *= self.config.bonus_first_char_multiplier bonus *= BONUS_FIRST_CHAR_MULTIPLIER;
} }
score += bonus as i32; score += bonus;
needle_idx += 1; needle_idx += 1;
in_gap = false; in_gap = false;
consecutive += 1; consecutive += 1;
} else { } else {
if in_gap { let penalty = if in_gap {
score += self.config.score_gap_extension as i32 PENALTY_GAP_EXTENSION
} else { } else {
score += self.config.score_gap_start as i32 PENALTY_GAP_START
} };
score = score.saturating_sub(penalty);
in_gap = true; in_gap = true;
consecutive = 0; consecutive = 0;
first_bonus = 0; first_bonus = 0;
@ -365,7 +373,7 @@ impl Matcher {
query: &Query, query: &Query,
text: &str, text: &str,
indicies: &mut Vec<u32>, indicies: &mut Vec<u32>,
) -> Option<i32> { ) -> Option<u16> {
let (start, prefilter_end) = self.prefilter(query, text)?; let (start, prefilter_end) = self.prefilter(query, text)?;
let text_len = text.len() - start; let text_len = text.len() - start;
// fallback to v1 algorithms for long haystacks // fallback to v1 algorithms for long haystacks
@ -407,7 +415,7 @@ impl Matcher {
let mut max_score = 0; let mut max_score = 0;
let mut max_score_pos = 0; let mut max_score_pos = 0;
let mut in_gap = false; let mut in_gap = false;
let mut prev_score = 0; let mut prev_score = 0u16;
let mut matched = false; let mut matched = false;
let first_needle_char = query.needle_chars[0]; let first_needle_char = query.needle_chars[0];
@ -438,25 +446,24 @@ impl Matcher {
last_matched_idx = i; last_matched_idx = i;
} }
if c == first_needle_char { if c == first_needle_char {
let score = let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
self.config.score_match + bonus * self.config.bonus_first_char_multiplier;
matrix_cell.consecutive_chars = 1; matrix_cell.consecutive_chars = 1;
if query.needle_chars.len() == 1 && score > max_score { if query.needle_chars.len() == 1 && score > max_score {
max_score = score; max_score = score;
max_score_pos = i; max_score_pos = i;
// can't get better than this // can't get better than this
if bonus >= self.config.bonus_boundary { if bonus >= BONUS_BOUNDARY {
break; break;
} }
} }
in_gap = false; in_gap = false;
} else { } else {
let gap_score = if in_gap { let gap_penalty = if in_gap {
self.config.score_gap_extension PENALTY_GAP_EXTENSION
} else { } else {
self.config.score_gap_start PENALTY_GAP_START
}; };
matrix_cell.score = max(0, gap_score + prev_score); matrix_cell.score = prev_score.saturating_sub(gap_penalty);
matrix_cell.consecutive_chars = 0; matrix_cell.consecutive_chars = 0;
in_gap = true; in_gap = true;
} }
@ -468,7 +475,7 @@ impl Matcher {
} }
if query.needle_chars.len() == 1 { if query.needle_chars.len() == 1 {
indicies.push(max_score_pos as u32); indicies.push(max_score_pos as u32);
return Some(max_score as i32); return Some(max_score);
} }
assert_eq!( assert_eq!(
self.first_needle_occurance[0], 0, self.first_needle_occurance[0], 0,
@ -517,10 +524,10 @@ impl Matcher {
} }
} }
Some(max_score as i32) Some(max_score)
} }
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (i16, u16) { fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) {
let mut max_score = 0; let mut max_score = 0;
let mut max_score_end = 0; let mut max_score_end = 0;
let mut iter = query let mut iter = query
@ -548,24 +555,28 @@ impl Matcher {
.enumerate() .enumerate()
{ {
let col = j + first_occurance as usize; let col = j + first_occurance as usize;
let gap_score = if in_gap { let gap_penalty = if in_gap {
self.config.score_gap_extension PENALTY_GAP_EXTENSION
} else { } else {
self.config.score_gap_start PENALTY_GAP_START
}; };
let mut score1 = 0; let mut score1 = 0;
let score2 = prev_matrix_cell.score + gap_score; let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
let mut consecutive = 0; let mut consecutive = 0;
if haystack_char.char == needle_char { if haystack_char.char == needle_char {
score1 = diag_matrix_cell.score + self.config.score_match; score1 = diag_matrix_cell.score + SCORE_MATCH;
let mut bonus = haystack_char.bonus; let mut bonus = haystack_char.bonus;
consecutive = diag_matrix_cell.consecutive_chars + 1; consecutive = diag_matrix_cell.consecutive_chars + 1;
if consecutive > 1 { if consecutive > 1 {
let first_bonus = self.haystack[col - consecutive as usize].bonus; let first_bonus = self.haystack[col - consecutive as usize].bonus;
if bonus > self.config.bonus_boundary && bonus > first_bonus { if bonus > first_bonus {
consecutive = 1 if bonus > BONUS_BOUNDARY {
consecutive = 1
} else {
bonus = max(bonus, BONUS_CONSECUTIVE)
}
} else { } else {
bonus = max(bonus, max(self.config.bonus_consecutive, first_bonus)) bonus = max(first_bonus, BONUS_CONSECUTIVE)
} }
} }
if score1 + bonus < score2 { if score1 + bonus < score2 {