fix scoring and case-sensitive matching

This commit is contained in:
Pascal Kuthe 2023-07-20 21:19:11 +02:00
parent 52f1712a78
commit 9ffa5e63c2
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
4 changed files with 88 additions and 21 deletions

View File

@ -120,7 +120,7 @@ impl Char for char {
return (c.0 as char, class);
}
let char_class = char_class_non_ascii(self);
if char_class == CharClass::Upper {
if char_class == CharClass::Upper && config.ignore_case {
self = CASE_FOLDING_SIMPLE
.binary_search_by_key(&self, |(upper, _)| *upper)
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
@ -136,7 +136,10 @@ impl Char for char {
if config.normalize {
self = normalize::normalize(self);
}
to_lower_case(self)
if config.ignore_case {
self = to_lower_case(self)
}
self
}
}

View File

@ -178,18 +178,26 @@ impl<H: Char> Matrix<'_, H> {
} else {
PENALTY_GAP_START
};
let mut score1 = 0;
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
// we calculate two scores:
// * one for traversing the matrix horizontally (no match at
// the current char)
// * one for traversing the matrix diagonally (match at the
// current char)
// the maximum of those two scores is used
let mut score_diag = 0;
let score_hory = prev_matrix_cell.score.saturating_sub(gap_penalty);
let mut consecutive = 0;
if haystack_char.char == needle_char {
score1 = diag_matrix_cell.score + SCORE_MATCH;
// we have a match at the current char
score_diag = diag_matrix_cell.score + SCORE_MATCH;
let mut bonus = haystack_char.bonus;
consecutive = diag_matrix_cell.consecutive_chars + 1;
if consecutive > 1 {
let first_bonus = self.bonus[col + 1 - consecutive as usize];
println!("xoxo {bonus} {first_bonus} {consecutive}");
if bonus > first_bonus {
if bonus > BONUS_BOUNDARY {
if bonus >= BONUS_BOUNDARY {
consecutive = 1
} else {
bonus = max(bonus, BONUS_CONSECUTIVE)
@ -198,15 +206,15 @@ impl<H: Char> Matrix<'_, H> {
bonus = max(first_bonus, BONUS_CONSECUTIVE)
}
}
if score1 + bonus < score2 {
score1 += haystack_char.bonus;
if score_diag + bonus < score_hory {
score_diag += haystack_char.bonus;
consecutive = 0;
} else {
score1 += bonus;
score_diag += bonus;
}
}
in_gap = score1 < score2;
let score = max(score1, score2);
in_gap = score_diag < score_hory;
let score = max(score_diag, score_hory);
if i == needle.len() - 1 && score > max_score {
max_score = score;
max_score_end = col as u16;
@ -238,25 +246,31 @@ impl<H: Char> Matrix<'_, H> {
loop {
let score = row[col].score;
let mut score1 = 0;
let mut score2 = 0;
// we calculate two scores:
// * one for traversing the matrix horizontally (no match at
// the current char)
// * one for traversing the matrix diagonally (match at the
// current char)
// the maximum of those two scores is used
let mut score_diag = 0;
let mut score_horz = 0;
if let Some(&(prev_row, _)) = row_iter.peek() {
if col >= prev_row.off {
score1 = prev_row[col].score;
score_diag = prev_row[col].score;
}
}
if col > row.off {
score2 = row[col - 1].score;
score_horz = row[col - 1].score;
}
let mut new_prefer_match = row[col].consecutive_chars > 1;
if !new_prefer_match && col + 1 < haystack_len {
if let Some(next_row) = next_row {
if col + 1 > next_row.off {
if col + 1 >= next_row.off {
new_prefer_match = next_row[col + 1].consecutive_chars > 0
}
}
}
if score > score1 && (score > score2 || score == score2 && prefer_match) {
if score > score_diag && (score > score_horz || score == score_horz && prefer_match) {
*matched_col_idx = col as u32 + start;
next_row = Some(row);
let Some(next) = row_iter.next() else {
@ -267,5 +281,6 @@ impl<H: Char> Matrix<'_, H> {
prefer_match = new_prefer_match;
col -= 1;
}
println!("{:#?}", self);
}
}

View File

@ -51,11 +51,14 @@ pub fn assert_matches(
Some(score),
"{needle:?} did not match {haystack:?}: {match_chars:?}"
);
assert_eq!(match_chars, needle_chars, "match indices are incorrect");
assert_eq!(
match_chars, needle_chars,
"match indices are incorrect {indices:?}"
);
assert_eq!(
indices.first().copied()..indices.last().map(|&i| i + 1),
Some(start)..Some(end),
"{needle:?} match {haystack:?}[{start}..{end}]"
"{needle:?} match {haystack:?}"
);
}
}
@ -63,7 +66,7 @@ const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
#[test]
fn test_v2_fuzzy() {
fn test_fuzzy() {
assert_matches(
false,
false,
@ -176,6 +179,52 @@ fn test_v2_fuzzy() {
);
}
// Table-driven test of the fuzzy matcher for needles that mix upper- and
// lower-case characters.
//
// NOTE(review): the four leading booleans are positional flags of
// `assert_matches`, whose signature is not visible from here. Presumably the
// third one (`true`) switches on case-sensitive matching; confirm against the
// helper before relying on this.
//
// Each table entry appears to be
// `(haystack, needle, start, end, expected score expression)`, where
// `start..end` is compared with `first matched index..last matched index + 1`
// (TODO confirm against `assert_matches`).
#[test]
fn test_fuzzy_case_sensitive() {
assert_matches(
false,
false,
true,
false,
&[
// `o`, `B` are adjacent and `B` sits on a camelCase boundary; the four
// characters skipped before `z` cost one gap start plus three extensions.
(
"fooBarbaz1",
"oBz",
2,
9,
BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
),
// Every needle char follows a boundary: start of string, then two `/`
// delimiters. Two gaps of three skipped chars each.
(
"Foo/Bar/Baz",
"FBB",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
// Same needle, but the later matches land on camelCase boundaries and
// each gap is only two chars long.
(
"FooBarBaz",
"FBB",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
- 2 * PENALTY_GAP_START
- 2 * PENALTY_GAP_EXTENSION,
),
// Four consecutive matches from the start of the haystack: no gap
// penalties, and the expected value applies the first char's boundary
// bonus to the three following chars as well.
(
"FooBar Baz",
"FooB",
0,
4,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
),
// Consecutive bonus updated
("foo-bar", "o-ba", 2, 6, BONUS_BOUNDARY * 2 + BONUS_NON_WORD),
],
);
}
#[test]
fn test_v1_fuzzy() {
assert_matches(

View File

@ -1,3 +1,3 @@
default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"]
[files]
extend-exclude = ["integration_tests", "verilogae/tests", "*.mir", "openvaf/lexer/src/tests.rs"]
extend-exclude = ["src/tests.rs"]