From 6837b4e2cb88c16096d680355201d1f7236bbca1 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Tue, 18 Jul 2023 19:20:15 +0200 Subject: [PATCH] hardcode score config, fix some overights and perf --- src/config.rs | 100 ++++++++++++++++++++++---------------------------- src/lib.rs | 95 ++++++++++++++++++++++++++--------------------- 2 files changed, 97 insertions(+), 98 deletions(-) diff --git a/src/config.rs b/src/config.rs index ef2c707..ca076ae 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,44 +1,44 @@ +pub(crate) const SCORE_MATCH: u16 = 16; +pub(crate) const PENALTY_GAP_START: u16 = 3; +pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; + +// We prefer matches at the beginning of a word, but the bonus should not be +// too great to prevent the longer acronym matches from always winning over +// shorter fuzzy matches. The bonus point here was specifically chosen that +// the bonus is cancelled when the gap between the acronyms grows over +// 8 characters, which is approximately the average length of the words found +// in web2 dictionary and my file system. +pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; + +// Although bonus point for non-word characters is non-contextual, we need it +// for computing bonus points for consecutive chunks starting with a non-word +// character. +pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2; + +// Edge-triggered bonus for matches in camelCase words. +// Compared to word-boundary case, they don't accompany single-character gaps +// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly. +pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION; + +// Minimum bonus point given to characters in consecutive chunks. +// Note that bonus points for consecutive matches shouldn't have needed if we +// used fixed match score as in the original algorithm. +pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; + +// The first character in the typed pattern usually has more significance +// than the rest so it's important that it appears at special positions where +// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". +// The amount of the extra bonus should be limited so that the gap penalty is +// still respected. +pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; + pub struct MatcherConfig { - pub score_match: i16, - pub score_gap_start: i16, - pub score_gap_extension: i16, - - // We prefer matches at the beginning of a word, but the bonus should not be - // too great to prevent the longer acronym matches from always winning over - // shorter fuzzy matches. The bonus point here was specifically chosen that - // the bonus is cancelled when the gap between the acronyms grows over - // 8 characters, which is approximately the average length of the words found - // in web2 dictionary and my file system. - pub bonus_boundary: i16, - - // Although bonus point for non-word characters is non-contextual, we need it - // for computing bonus points for consecutive chunks starting with a non-word - // character. - pub bonus_non_word: i16, - - // Edge-triggered bonus for matches in camelCase words. - // Compared to word-boundary case, they don't accompany single-character gaps - // (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly. - pub bonus_camel123: i16, - - // Minimum bonus point given to characters in consecutive chunks. - // Note that bonus points for consecutive matches shouldn't have needed if we - // used fixed match score as in the original algorithm. - pub bonus_consecutive: i16, - - // The first character in the typed pattern usually has more significance - // than the rest so it's important that it appears at special positions where - // bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". - // The amount of the extra bonus should be limited so that the gap penalty is - // still respected. - pub bonus_first_char_multiplier: i16, - pub delimeter_chars: &'static [u8], /// Extra bonus for word boundary after whitespace character or beginning of the string - pub bonus_boundary_white: i16, + pub bonus_boundary_white: u16, // Extra bonus for word boundary after slash, colon, semi-colon, and comma - pub bonus_boundary_delimiter: i16, + pub bonus_boundary_delimiter: u16, pub inital_char_class: CharClass, /// Whether to normalize latin script charaters to ASCII /// this significantly degrades performance so its not recommended @@ -74,22 +74,10 @@ pub enum CaseMatching { impl MatcherConfig { pub const DEFAULT: Self = { - let score_match = 16; - let score_gap_start = -3; - let score_gap_extension = -1; - let bonus_boundary = score_match / 2; MatcherConfig { - score_match, - score_gap_start, - score_gap_extension, - bonus_boundary, - bonus_non_word: score_match / 2, - bonus_camel123: bonus_boundary + score_gap_extension, - bonus_consecutive: -(score_gap_start + score_gap_extension), - bonus_first_char_multiplier: 2, delimeter_chars: b"/,:;|", - bonus_boundary_white: bonus_boundary + 2, - bonus_boundary_delimiter: bonus_boundary + 1, + bonus_boundary_white: BONUS_BOUNDARY + 2, + bonus_boundary_delimiter: BONUS_BOUNDARY + 1, inital_char_class: CharClass::Whitespace, normalize: false, use_v1: false, @@ -105,7 +93,7 @@ impl MatcherConfig { } else { self.delimeter_chars = b"/"; } - self.bonus_boundary_white = self.bonus_boundary; + self.bonus_boundary_white = BONUS_BOUNDARY; self.inital_char_class = CharClass::Delimiter; } @@ -115,7 +103,7 @@ impl MatcherConfig { } else { self.delimeter_chars = b"/"; } - self.bonus_boundary_white = self.bonus_boundary; + self.bonus_boundary_white = BONUS_BOUNDARY; self.inital_char_class = CharClass::Delimiter; self } @@ -161,13 +149,13 @@ impl MatcherConfig { } } - pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> i16 { + pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { if class > CharClass::NonWord { // transition from non word to word match prev_class { CharClass::Whitespace => return self.bonus_boundary_white, CharClass::Delimiter => return self.bonus_boundary_delimiter, - CharClass::NonWord => return self.bonus_boundary, + CharClass::NonWord => return BONUS_BOUNDARY, _ => (), } } @@ -175,9 +163,9 @@ impl MatcherConfig { || prev_class != CharClass::Number && class == CharClass::Number { // camelCase letter123 - self.bonus_camel123 + BONUS_CAMEL123 } else if class == CharClass::NonWord { - self.bonus_non_word + BONUS_NON_WORD } else if class == CharClass::Whitespace { self.bonus_boundary_white } else { diff --git a/src/lib.rs b/src/lib.rs index fc20f7d..0ff4c2c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,19 +16,24 @@ mod normalize; pub use config::{CaseMatching, CharClass, MatcherConfig}; +use crate::config::{ + BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, + PENALTY_GAP_START, SCORE_MATCH, +}; + const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB const MAX_HAYSTACK_LEN: usize = 8192; // 64KB #[derive(Clone, Copy, PartialEq, Eq)] struct MatrixCell { - score: i16, + score: u16, consecutive_chars: u16, } #[derive(Clone, Copy, PartialEq, Eq)] struct HaystackChar { char: char, - bonus: i16, + bonus: u16, } pub struct Matcher { @@ -134,7 +139,7 @@ impl Matcher { ); } - pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option { + pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option { if haystack.len() > u32::MAX as usize { haystack = &haystack[..u32::MAX as usize] } @@ -156,7 +161,7 @@ impl Matcher { query: &Query, mut haystack: &str, indicies: &mut Vec, - ) -> Option { + ) -> Option { if haystack.len() > u32::MAX as usize { haystack = &haystack[..u32::MAX as usize] } @@ -236,7 +241,7 @@ impl Matcher { query: &Query, haystack: &str, indicies: &mut Vec, - ) -> Option { + ) -> Option { let (start, end) = self.prefilter(query, haystack)?; self.fuzzy_matcher_v1_with_prefilter::( query, haystack, start, end, indicies, @@ -250,7 +255,7 @@ impl Matcher { mut start: usize, mut end: usize, indicies: &mut Vec, - ) -> Option { + ) -> Option { let first_char_end = if ASCII_ONLY { start + 1 } else { end }; if !ASCII_ONLY && query.needle_chars.len() != 1 { let mut needle_iter = query.needle_chars[1..].iter().copied(); @@ -297,7 +302,7 @@ impl Matcher { match_start: usize, match_end: usize, indicies: &mut Vec, - ) -> i32 { + ) -> u16 { if INDICIES { indicies.reserve(query.needle_chars.len()); } @@ -307,10 +312,10 @@ impl Matcher { .map(|c| self.config.char_class(c)) .unwrap_or(self.config.inital_char_class); let mut needle_idx = 0; - let mut score = 0i32; + let mut score = 0u16; let mut in_gap = false; let mut consecutive = 0; - let mut first_bonus = 0i16; + let mut first_bonus = 0u16; for (i, mut c) in text[match_start..match_end].char_indices() { let class = self.config.char_class(c); if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case { @@ -323,33 +328,36 @@ impl Matcher { if INDICIES { indicies.push(i as u32) } - score += self.config.score_match as i32; + score += SCORE_MATCH; let mut bonus = self.config.bonus_for(prev_class, class); if consecutive == 0 { first_bonus = bonus } else { // Break consecutive chunk - if bonus >= self.config.bonus_boundary && bonus > first_bonus { - first_bonus = bonus + if bonus > first_bonus { + if bonus >= BONUS_BOUNDARY { + first_bonus = bonus; + } else { + bonus = max(bonus, BONUS_CONSECUTIVE); + } + } else { + bonus = max(first_bonus, BONUS_CONSECUTIVE); } - bonus = max( - max(bonus, first_bonus), - self.config.bonus_first_char_multiplier, - ); } if needle_idx == 0 { - bonus *= self.config.bonus_first_char_multiplier + bonus *= BONUS_FIRST_CHAR_MULTIPLIER; } - score += bonus as i32; + score += bonus; needle_idx += 1; in_gap = false; consecutive += 1; } else { - if in_gap { - score += self.config.score_gap_extension as i32 + let penalty = if in_gap { + PENALTY_GAP_EXTENSION } else { - score += self.config.score_gap_start as i32 - } + PENALTY_GAP_START + }; + score = score.saturating_sub(penalty); in_gap = true; consecutive = 0; first_bonus = 0; @@ -365,7 +373,7 @@ impl Matcher { query: &Query, text: &str, indicies: &mut Vec, - ) -> Option { + ) -> Option { let (start, prefilter_end) = self.prefilter(query, text)?; let text_len = text.len() - start; // fallback to v1 algorithms for long haystacks @@ -407,7 +415,7 @@ impl Matcher { let mut max_score = 0; let mut max_score_pos = 0; let mut in_gap = false; - let mut prev_score = 0; + let mut prev_score = 0u16; let mut matched = false; let first_needle_char = query.needle_chars[0]; @@ -438,25 +446,24 @@ impl Matcher { last_matched_idx = i; } if c == first_needle_char { - let score = - self.config.score_match + bonus * self.config.bonus_first_char_multiplier; + let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; matrix_cell.consecutive_chars = 1; if query.needle_chars.len() == 1 && score > max_score { max_score = score; max_score_pos = i; // can't get better than this - if bonus >= self.config.bonus_boundary { + if bonus >= BONUS_BOUNDARY { break; } } in_gap = false; } else { - let gap_score = if in_gap { - self.config.score_gap_extension + let gap_penalty = if in_gap { + PENALTY_GAP_EXTENSION } else { - self.config.score_gap_start + PENALTY_GAP_START }; - matrix_cell.score = max(0, gap_score + prev_score); + matrix_cell.score = prev_score.saturating_sub(gap_penalty); matrix_cell.consecutive_chars = 0; in_gap = true; } @@ -468,7 +475,7 @@ impl Matcher { } if query.needle_chars.len() == 1 { indicies.push(max_score_pos as u32); - return Some(max_score as i32); + return Some(max_score); } assert_eq!( self.first_needle_occurance[0], 0, @@ -517,10 +524,10 @@ impl Matcher { } } - Some(max_score as i32) + Some(max_score) } - fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (i16, u16) { + fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) { let mut max_score = 0; let mut max_score_end = 0; let mut iter = query @@ -548,24 +555,28 @@ impl Matcher { .enumerate() { let col = j + first_occurance as usize; - let gap_score = if in_gap { - self.config.score_gap_extension + let gap_penalty = if in_gap { + PENALTY_GAP_EXTENSION } else { - self.config.score_gap_start + PENALTY_GAP_START }; let mut score1 = 0; - let score2 = prev_matrix_cell.score + gap_score; + let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty); let mut consecutive = 0; if haystack_char.char == needle_char { - score1 = diag_matrix_cell.score + self.config.score_match; + score1 = diag_matrix_cell.score + SCORE_MATCH; let mut bonus = haystack_char.bonus; consecutive = diag_matrix_cell.consecutive_chars + 1; if consecutive > 1 { let first_bonus = self.haystack[col - consecutive as usize].bonus; - if bonus > self.config.bonus_boundary && bonus > first_bonus { - consecutive = 1 + if bonus > first_bonus { + if bonus > BONUS_BOUNDARY { + consecutive = 1 + } else { + bonus = max(bonus, BONUS_CONSECUTIVE) + } } else { - bonus = max(bonus, max(self.config.bonus_consecutive, first_bonus)) + bonus = max(first_bonus, BONUS_CONSECUTIVE) } } if score1 + bonus < score2 {