mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 09:57:49 +00:00
fairly stable and consistent fuzzy matching
This commit is contained in:
parent
becd35c5de
commit
2ce871b70c
@ -9,5 +9,5 @@ unzip UCD.zip
|
|||||||
|
|
||||||
cd "${dir}"
|
cd "${dir}"
|
||||||
cargo install ucd-generate
|
cargo install ucd-generate
|
||||||
ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/case_fold.rs
|
ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/chars/case_fold.rs
|
||||||
rm -rf /tmp/ucd-15.0.0
|
rm -rf /tmp/ucd-15.0.0
|
||||||
|
10
src/chars.rs
10
src/chars.rs
@ -109,14 +109,16 @@ impl Char for char {
|
|||||||
return (c.0 as char, class);
|
return (c.0 as char, class);
|
||||||
}
|
}
|
||||||
let char_class = char_class_non_ascii(self);
|
let char_class = char_class_non_ascii(self);
|
||||||
if char_class == CharClass::Upper && config.ignore_case {
|
let mut case_fold = char_class == CharClass::Upper;
|
||||||
|
if config.normalize {
|
||||||
|
self = normalize::normalize(self);
|
||||||
|
case_fold = true
|
||||||
|
}
|
||||||
|
if case_fold && config.ignore_case {
|
||||||
self = CASE_FOLDING_SIMPLE
|
self = CASE_FOLDING_SIMPLE
|
||||||
.binary_search_by_key(&self, |(upper, _)| *upper)
|
.binary_search_by_key(&self, |(upper, _)| *upper)
|
||||||
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
||||||
}
|
}
|
||||||
if config.normalize {
|
|
||||||
self = normalize::normalize(self);
|
|
||||||
}
|
|
||||||
(self, char_class)
|
(self, char_class)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,11 @@
|
|||||||
// ucd-generate 0.3.0 is available on crates.io.
|
// ucd-generate 0.3.0 is available on crates.io.
|
||||||
|
|
||||||
pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[
|
pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[
|
||||||
('µ', 'μ'), ('À', 'à'), ('Á', 'á'),
|
('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'),
|
||||||
|
('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'),
|
||||||
|
('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'),
|
||||||
|
('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'),
|
||||||
|
('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'),
|
||||||
('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'),
|
('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'),
|
||||||
('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'),
|
('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'),
|
||||||
('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'),
|
('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'),
|
||||||
|
@ -78,9 +78,10 @@ impl Matcher {
|
|||||||
.checked_sub(1)
|
.checked_sub(1)
|
||||||
.map(|i| haystack[i].char_class(&self.config))
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
.unwrap_or(self.config.initial_char_class);
|
.unwrap_or(self.config.initial_char_class);
|
||||||
for (i, &c) in haystack.iter().enumerate() {
|
for (i, &c) in haystack[start..].iter().enumerate() {
|
||||||
let (c, char_class) = c.char_class_and_normalize(&self.config);
|
let (c, char_class) = c.char_class_and_normalize(&self.config);
|
||||||
if c != needle {
|
if c != needle {
|
||||||
|
println!("ups {c} {needle}");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let bonus = self.config.bonus_for(prev_class, char_class);
|
let bonus = self.config.bonus_for(prev_class, char_class);
|
||||||
@ -100,7 +101,7 @@ impl Matcher {
|
|||||||
|
|
||||||
if INDICES {
|
if INDICES {
|
||||||
indices.clear();
|
indices.clear();
|
||||||
indices.push(max_pos);
|
indices.push(max_pos + start as u32);
|
||||||
}
|
}
|
||||||
max_score
|
max_score
|
||||||
}
|
}
|
||||||
|
@ -65,7 +65,7 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor
|
|||||||
let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE));
|
let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE));
|
||||||
let score_match = m_score + consecutive_bonus as i32;
|
let score_match = m_score + consecutive_bonus as i32;
|
||||||
let score_skip = p_score + next_bonus as i32;
|
let score_skip = p_score + next_bonus as i32;
|
||||||
if score_match > score_skip {
|
if score_match >= score_skip {
|
||||||
ScoreCell {
|
ScoreCell {
|
||||||
score: score_match + SCORE_MATCH as i32,
|
score: score_match + SCORE_MATCH as i32,
|
||||||
bonus: consecutive_bonus,
|
bonus: consecutive_bonus,
|
||||||
@ -74,16 +74,24 @@ fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> Scor
|
|||||||
} else {
|
} else {
|
||||||
ScoreCell {
|
ScoreCell {
|
||||||
score: score_skip + SCORE_MATCH as i32,
|
score: score_skip + SCORE_MATCH as i32,
|
||||||
bonus: consecutive_bonus,
|
bonus: next_bonus,
|
||||||
matched: false,
|
matched: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) {
|
fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) {
|
||||||
let score_match = prev_m_score - PENALTY_GAP_START as i32;
|
let score_match = if prev_m_score >= 0 {
|
||||||
let score_skip = prev_p_score - PENALTY_GAP_EXTENSION as i32;
|
(prev_m_score - PENALTY_GAP_START as i32).max(0)
|
||||||
if score_match > score_skip {
|
} else {
|
||||||
|
i32::MIN / 2
|
||||||
|
};
|
||||||
|
let score_skip = if prev_p_score >= 0 {
|
||||||
|
(prev_p_score - PENALTY_GAP_EXTENSION as i32).max(0)
|
||||||
|
} else {
|
||||||
|
i32::MIN / 2
|
||||||
|
};
|
||||||
|
if score_match >= score_skip {
|
||||||
(score_match, true)
|
(score_match, true)
|
||||||
} else {
|
} else {
|
||||||
(score_skip, false)
|
(score_skip, false)
|
||||||
|
@ -35,7 +35,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS
|
|||||||
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
||||||
// The amount of the extra bonus should be limited so that the gap penalty is
|
// The amount of the extra bonus should be limited so that the gap penalty is
|
||||||
// still respected.
|
// still respected.
|
||||||
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
|
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 1;
|
||||||
|
|
||||||
impl MatcherConfig {
|
impl MatcherConfig {
|
||||||
#[inline]
|
#[inline]
|
||||||
|
33
src/tests.rs
33
src/tests.rs
@ -224,6 +224,19 @@ fn test_fuzzy() {
|
|||||||
- PENALTY_GAP_START
|
- PENALTY_GAP_START
|
||||||
- 23 * PENALTY_GAP_EXTENSION,
|
- 23 * PENALTY_GAP_EXTENSION,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"\nץ&`@ `---\0\0\0\0",
|
||||||
|
"`@ `--\0\0",
|
||||||
|
&[3, 4, 5, 6, 7, 8, 10, 11],
|
||||||
|
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 3) + BONUS_BOUNDARY_WHITE * 4
|
||||||
|
- PENALTY_GAP_START,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
" 1111111u11111uuu111",
|
||||||
|
"11111uuu1",
|
||||||
|
&[9, 10, 11, 12, 13, 14, 15, 16, 17],
|
||||||
|
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 8),
|
||||||
|
),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -317,6 +330,7 @@ fn test_normalize() {
|
|||||||
&[1, 2, 3, 4, 5],
|
&[1, 2, 3, 4, 5],
|
||||||
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
|
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
|
||||||
),
|
),
|
||||||
|
("ۂ(GCGɴCG", "n", &[5], 0),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -399,7 +413,7 @@ fn test_optimal() {
|
|||||||
(
|
(
|
||||||
"Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}",
|
"Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}",
|
||||||
"-!--!",
|
"-!--!",
|
||||||
&[4, 5, 9, 10, 16],
|
&[4, 5, 13, 15, 16],
|
||||||
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4)
|
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4)
|
||||||
- 2 * PENALTY_GAP_START
|
- 2 * PENALTY_GAP_START
|
||||||
- 6 * PENALTY_GAP_EXTENSION,
|
- 6 * PENALTY_GAP_EXTENSION,
|
||||||
@ -413,6 +427,23 @@ fn test_optimal() {
|
|||||||
- 3 * PENALTY_GAP_EXTENSION
|
- 3 * PENALTY_GAP_EXTENSION
|
||||||
+ BONUS_CONSECUTIVE,
|
+ BONUS_CONSECUTIVE,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"\nץ&`@ `;;;\0\0\0\0",
|
||||||
|
"`@ `;;\0\0",
|
||||||
|
&[3, 4, 5, 6, 7, 9, 10, 11],
|
||||||
|
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
|
||||||
|
+ BONUS_BOUNDARY_DELIMITER * 3
|
||||||
|
+ BONUS_BOUNDARY_WHITE * 3
|
||||||
|
- PENALTY_GAP_START,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"dddddd\0\0\0ddddfdddddd",
|
||||||
|
"dddddfddddd",
|
||||||
|
&[0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
|
||||||
|
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 10
|
||||||
|
- PENALTY_GAP_START
|
||||||
|
- 7 * PENALTY_GAP_EXTENSION,
|
||||||
|
),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user