nucleo/src/config.rs

188 lines
6.3 KiB
Rust
Raw Normal View History

2023-07-17 15:26:27 +00:00
pub struct MatcherConfig {
pub score_match: i16,
pub score_gap_start: i16,
pub score_gap_extension: i16,
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
// shorter fuzzy matches. The bonus point here was specifically chosen that
// the bonus is cancelled when the gap between the acronyms grows over
// 8 characters, which is approximately the average length of the words found
// in web2 dictionary and my file system.
pub bonus_boundary: i16,
// Although bonus point for non-word characters is non-contextual, we need it
// for computing bonus points for consecutive chunks starting with a non-word
// character.
pub bonus_non_word: i16,
// Edge-triggered bonus for matches in camelCase words.
// Compared to word-boundary case, they don't accompany single-character gaps
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
pub bonus_camel123: i16,
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches shouldn't have needed if we
// used fixed match score as in the original algorithm.
pub bonus_consecutive: i16,
// The first character in the typed pattern usually has more significance
// than the rest so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub bonus_first_char_multiplier: i16,
pub delimeter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string
pub bonus_boundary_white: i16,
// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub bonus_boundary_delimiter: i16,
pub inital_char_class: CharClass,
/// Whether to normalize latin script charaters to ASCII
/// this significantly degrades performance so its not recommended
/// to be truned on by default
pub normalize: bool,
/// use faster/simpler algorithm at the cost of (potentially) much worse results
/// For long inputs this algorith is always used as a fallbach to avoid
/// blowups in time complexity
pub use_v1: bool,
/// The case matching to perform
pub case_matching: CaseMatching,
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CharClass {
Whitespace,
NonWord,
Delimiter,
Lower,
Upper,
Letter,
Number,
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CaseMatching {
Respect,
Ignore,
Smart,
}
impl MatcherConfig {
pub const DEFAULT: Self = {
let score_match = 16;
let score_gap_start = -3;
let score_gap_extension = -1;
let bonus_boundary = score_match / 2;
MatcherConfig {
score_match,
score_gap_start,
score_gap_extension,
bonus_boundary,
bonus_non_word: score_match / 2,
bonus_camel123: bonus_boundary + score_gap_extension,
bonus_consecutive: -(score_gap_start + score_gap_extension),
bonus_first_char_multiplier: 2,
delimeter_chars: b"/,:;|",
bonus_boundary_white: bonus_boundary + 2,
bonus_boundary_delimiter: bonus_boundary + 1,
inital_char_class: CharClass::Whitespace,
normalize: false,
use_v1: false,
case_matching: CaseMatching::Smart,
}
};
}
impl MatcherConfig {
pub fn set_match_paths(&mut self) {
if cfg!(windows) {
self.delimeter_chars = b"/\\";
} else {
self.delimeter_chars = b"/";
}
self.bonus_boundary_white = self.bonus_boundary;
self.inital_char_class = CharClass::Delimiter;
}
pub const fn match_paths(mut self) -> Self {
if cfg!(windows) {
self.delimeter_chars = b"/\\";
} else {
self.delimeter_chars = b"/";
}
self.bonus_boundary_white = self.bonus_boundary;
self.inital_char_class = CharClass::Delimiter;
self
}
fn char_class_non_ascii(c: char) -> CharClass {
if c.is_lowercase() {
CharClass::Lower
} else if c.is_uppercase() {
CharClass::Upper
} else if c.is_numeric() {
CharClass::Number
} else if c.is_alphabetic() {
CharClass::Letter
} else if c.is_whitespace() {
CharClass::Whitespace
} else {
CharClass::NonWord
}
}
fn char_class_ascii(&self, c: char) -> CharClass {
// using manual if conditions instead optimizes better
if c >= 'a' && c <= 'z' {
CharClass::Lower
} else if c >= 'A' && c <= 'Z' {
CharClass::Upper
} else if c >= '0' && c <= '9' {
CharClass::Number
} else if c.is_ascii_whitespace() {
CharClass::Whitespace
} else if self.delimeter_chars.contains(&(c as u8)) {
CharClass::Delimiter
} else {
CharClass::NonWord
}
}
pub(crate) fn char_class(&self, c: char) -> CharClass {
if c.is_ascii() {
self.char_class_ascii(c)
} else {
Self::char_class_non_ascii(c)
}
}
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> i16 {
if class > CharClass::NonWord {
// transition from non word to word
match prev_class {
CharClass::Whitespace => return self.bonus_boundary_white,
CharClass::Delimiter => return self.bonus_boundary_delimiter,
CharClass::NonWord => return self.bonus_boundary,
_ => (),
}
}
if prev_class == CharClass::Lower && class == CharClass::Upper
|| prev_class != CharClass::Number && class == CharClass::Number
{
// camelCase letter123
self.bonus_camel123
} else if class == CharClass::NonWord {
self.bonus_non_word
} else if class == CharClass::Whitespace {
self.bonus_boundary_white
} else {
0
}
}
}