From 2ce871b70c58fdb1a7fe004636232cba25bd038e Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Tue, 25 Jul 2023 03:10:49 +0200 Subject: [PATCH] fairly stable and consistent fuzzy matching --- generate_case_fold_table.sh | 2 +- src/chars.rs | 10 ++++++---- src/chars/case_fold.rs | 6 +++++- src/exact.rs | 5 +++-- src/fuzzy_optimal.rs | 18 +++++++++++++----- src/score.rs | 2 +- src/tests.rs | 33 ++++++++++++++++++++++++++++++++- 7 files changed, 61 insertions(+), 15 deletions(-) diff --git a/generate_case_fold_table.sh b/generate_case_fold_table.sh index 8739eae..32a2669 100755 --- a/generate_case_fold_table.sh +++ b/generate_case_fold_table.sh @@ -9,5 +9,5 @@ unzip UCD.zip cd "${dir}" cargo install ucd-generate -ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/case_fold.rs +ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/chars/case_fold.rs rm -rf /tmp/ucd-15.0.0 diff --git a/src/chars.rs b/src/chars.rs index 7764e33..a26ef93 100644 --- a/src/chars.rs +++ b/src/chars.rs @@ -109,14 +109,16 @@ impl Char for char { return (c.0 as char, class); } let char_class = char_class_non_ascii(self); - if char_class == CharClass::Upper && config.ignore_case { + let mut case_fold = char_class == CharClass::Upper; + if config.normalize { + self = normalize::normalize(self); + case_fold = true + } + if case_fold && config.ignore_case { self = CASE_FOLDING_SIMPLE .binary_search_by_key(&self, |(upper, _)| *upper) .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) } - if config.normalize { - self = normalize::normalize(self); - } (self, char_class) } diff --git a/src/chars/case_fold.rs b/src/chars/case_fold.rs index 3c6d01b..aacbe46 100644 --- a/src/chars/case_fold.rs +++ b/src/chars/case_fold.rs @@ -7,7 +7,11 @@ // ucd-generate 0.3.0 is available on crates.io. pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[ - ('µ', 'μ'), ('À', 'à'), ('Á', 'á'), + ('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), + ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'), + ('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), + ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'), + ('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'), ('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'), ('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'), ('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'), diff --git a/src/exact.rs b/src/exact.rs index ec28763..70e42d1 100644 --- a/src/exact.rs +++ b/src/exact.rs @@ -78,9 +78,10 @@ impl Matcher { .checked_sub(1) .map(|i| haystack[i].char_class(&self.config)) .unwrap_or(self.config.initial_char_class); - for (i, &c) in haystack.iter().enumerate() { + for (i, &c) in haystack[start..].iter().enumerate() { let (c, char_class) = c.char_class_and_normalize(&self.config); if c != needle { + println!("ups {c} {needle}"); continue; } let bonus = self.config.bonus_for(prev_class, char_class); @@ -100,7 +101,7 @@ impl Matcher { if INDICES { indices.clear(); - indices.push(max_pos); + indices.push(max_pos + start as u32); } max_score } diff --git a/src/fuzzy_optimal.rs b/src/fuzzy_optimal.rs index 04e570e..e56ff46 100644 --- a/src/fuzzy_optimal.rs +++ b/src/fuzzy_optimal.rs @@ -65,7 +65,7 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE)); let score_match = m_score + consecutive_bonus as i32; let score_skip = p_score + next_bonus as i32; - if score_match > score_skip { + if score_match >= score_skip { ScoreCell { score: score_match + SCORE_MATCH as i32, bonus: consecutive_bonus, @@ -74,16 +74,24 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor } else { ScoreCell { score: score_skip + SCORE_MATCH as i32, - bonus: consecutive_bonus, + bonus: next_bonus, matched: false, } } } fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) { - let score_match = prev_m_score - PENALTY_GAP_START as i32; - let score_skip = prev_p_score - PENALTY_GAP_EXTENSION as i32; - if score_match > score_skip { + let score_match = if prev_m_score >= 0 { + (prev_m_score - PENALTY_GAP_START as i32).max(0) + } else { + i32::MIN / 2 + }; + let score_skip = if prev_p_score >= 0 { + (prev_p_score - PENALTY_GAP_EXTENSION as i32).max(0) + } else { + i32::MIN / 2 + }; + if score_match >= score_skip { (score_match, true) } else { (score_skip, false) diff --git a/src/score.rs b/src/score.rs index fd5d61f..4a14c7c 100644 --- a/src/score.rs +++ b/src/score.rs @@ -35,7 +35,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS // bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". // The amount of the extra bonus should be limited so that the gap penalty is // still respected. -pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; +pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 1; impl MatcherConfig { #[inline] diff --git a/src/tests.rs b/src/tests.rs index 2132e17..f6357ba 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -224,6 +224,19 @@ fn test_fuzzy() { - PENALTY_GAP_START - 23 * PENALTY_GAP_EXTENSION, ), + ( + "\nץ&`@ `---\0\0\0\0", + "`@ `--\0\0", + &[3, 4, 5, 6, 7, 8, 10, 11], + BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 3) + BONUS_BOUNDARY_WHITE * 4 + - PENALTY_GAP_START, + ), + ( + " 1111111u11111uuu111", + "11111uuu1", + &[9, 10, 11, 12, 13, 14, 15, 16, 17], + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 8), + ), ], ); } @@ -317,6 +330,7 @@ fn test_normalize() { &[1, 2, 3, 4, 5], BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), ), + ("ۂ(GCGɴCG", "n", &[5], 0), ], ) } @@ -399,7 +413,7 @@ fn test_optimal() { ( "Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}", "-!--!", - &[4, 5, 9, 10, 16], + &[4, 5, 13, 15, 16], BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4) - 2 * PENALTY_GAP_START - 6 * PENALTY_GAP_EXTENSION, @@ -413,6 +427,23 @@ fn test_optimal() { - 3 * PENALTY_GAP_EXTENSION + BONUS_CONSECUTIVE, ), + ( + "\nץ&`@ `;;;\0\0\0\0", + "`@ `;;\0\0", + &[3, 4, 5, 6, 7, 9, 10, 11], + BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + + BONUS_BOUNDARY_DELIMITER * 3 + + BONUS_BOUNDARY_WHITE * 3 + - PENALTY_GAP_START, + ), + ( + "dddddd\0\0\0ddddfdddddd", + "dddddfddddd", + &[0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 10 + - PENALTY_GAP_START + - 7 * PENALTY_GAP_EXTENSION, + ), ], ); }