fairly stable and consistent fuzzy matching

This commit is contained in:
Pascal Kuthe 2023-07-25 03:10:49 +02:00
parent becd35c5de
commit 2ce871b70c
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
7 changed files with 61 additions and 15 deletions

View File

@ -9,5 +9,5 @@ unzip UCD.zip
cd "${dir}"
cargo install ucd-generate
ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/case_fold.rs
ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/chars/case_fold.rs
rm -rf /tmp/ucd-15.0.0

View File

@ -109,14 +109,16 @@ impl Char for char {
return (c.0 as char, class);
}
let char_class = char_class_non_ascii(self);
if char_class == CharClass::Upper && config.ignore_case {
let mut case_fold = char_class == CharClass::Upper;
if config.normalize {
self = normalize::normalize(self);
case_fold = true
}
if case_fold && config.ignore_case {
self = CASE_FOLDING_SIMPLE
.binary_search_by_key(&self, |(upper, _)| *upper)
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
}
if config.normalize {
self = normalize::normalize(self);
}
(self, char_class)
}

View File

@ -7,7 +7,11 @@
// ucd-generate 0.3.0 is available on crates.io.
pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[
('µ', 'μ'), ('À', 'à'), ('Á', 'á'),
('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'),
('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'),
('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'),
('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'),
('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'),
('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'),
('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'),
('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'),

View File

@ -78,9 +78,10 @@ impl Matcher {
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.initial_char_class);
for (i, &c) in haystack.iter().enumerate() {
for (i, &c) in haystack[start..].iter().enumerate() {
let (c, char_class) = c.char_class_and_normalize(&self.config);
if c != needle {
println!("ups {c} {needle}");
continue;
}
let bonus = self.config.bonus_for(prev_class, char_class);
@ -100,7 +101,7 @@ impl Matcher {
if INDICES {
indices.clear();
indices.push(max_pos);
indices.push(max_pos + start as u32);
}
max_score
}

View File

@ -65,7 +65,7 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor
let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE));
let score_match = m_score + consecutive_bonus as i32;
let score_skip = p_score + next_bonus as i32;
if score_match > score_skip {
if score_match >= score_skip {
ScoreCell {
score: score_match + SCORE_MATCH as i32,
bonus: consecutive_bonus,
@ -74,16 +74,24 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor
} else {
ScoreCell {
score: score_skip + SCORE_MATCH as i32,
bonus: consecutive_bonus,
bonus: next_bonus,
matched: false,
}
}
}
fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) {
let score_match = prev_m_score - PENALTY_GAP_START as i32;
let score_skip = prev_p_score - PENALTY_GAP_EXTENSION as i32;
if score_match > score_skip {
let score_match = if prev_m_score >= 0 {
(prev_m_score - PENALTY_GAP_START as i32).max(0)
} else {
i32::MIN / 2
};
let score_skip = if prev_p_score >= 0 {
(prev_p_score - PENALTY_GAP_EXTENSION as i32).max(0)
} else {
i32::MIN / 2
};
if score_match >= score_skip {
(score_match, true)
} else {
(score_skip, false)

View File

@ -35,7 +35,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 1;
impl MatcherConfig {
#[inline]

View File

@ -224,6 +224,19 @@ fn test_fuzzy() {
- PENALTY_GAP_START
- 23 * PENALTY_GAP_EXTENSION,
),
(
"\nץ&`@ `---\0\0\0\0",
"`@ `--\0\0",
&[3, 4, 5, 6, 7, 8, 10, 11],
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 3) + BONUS_BOUNDARY_WHITE * 4
- PENALTY_GAP_START,
),
(
" 1111111u11111uuu111",
"11111uuu1",
&[9, 10, 11, 12, 13, 14, 15, 16, 17],
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 8),
),
],
);
}
@ -317,6 +330,7 @@ fn test_normalize() {
&[1, 2, 3, 4, 5],
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
("ۂ(GCGɴCG", "n", &[5], 0),
],
)
}
@ -399,7 +413,7 @@ fn test_optimal() {
(
"Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}",
"-!--!",
&[4, 5, 9, 10, 16],
&[4, 5, 13, 15, 16],
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4)
- 2 * PENALTY_GAP_START
- 6 * PENALTY_GAP_EXTENSION,
@ -413,6 +427,23 @@ fn test_optimal() {
- 3 * PENALTY_GAP_EXTENSION
+ BONUS_CONSECUTIVE,
),
(
"\nץ&`@ `;;;\0\0\0\0",
"`@ `;;\0\0",
&[3, 4, 5, 6, 7, 9, 10, 11],
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
+ BONUS_BOUNDARY_DELIMITER * 3
+ BONUS_BOUNDARY_WHITE * 3
- PENALTY_GAP_START,
),
(
"dddddd\0\0\0ddddfdddddd",
"dddddfddddd",
&[0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 10
- PENALTY_GAP_START
- 7 * PENALTY_GAP_EXTENSION,
),
],
);
}