From 74e2b46f04dacce68dbaeedb20bce28c083ed5f1 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Fri, 21 Jul 2023 00:16:15 +0200 Subject: [PATCH] fix remaining bugs, achive high coverage --- src/chars.rs | 13 +- src/chars/normalize.rs | 2 +- src/debug.rs | 69 +++++++++ src/fuzzy_greedy.rs | 31 +++-- src/fuzzy_optimal.rs | 13 +- src/lib.rs | 30 +--- src/matrix.rs | 72 +--------- src/prefilter.rs | 2 +- src/tests.rs | 307 ++++++++++++++++++++++++++++++++++++++++- src/utf32_str.rs | 11 +- tarpulin.toml | 3 + 11 files changed, 419 insertions(+), 134 deletions(-) create mode 100644 src/debug.rs create mode 100644 tarpulin.toml diff --git a/src/chars.rs b/src/chars.rs index 731aca6..391df76 100644 --- a/src/chars.rs +++ b/src/chars.rs @@ -9,7 +9,7 @@ use crate::MatcherConfig; mod case_fold; mod normalize; -pub trait Char: Copy + Eq + Ord + fmt::Debug + fmt::Display { +pub trait Char: Copy + Eq + Ord + fmt::Display { const ASCII: bool; fn char_class(self, config: &MatcherConfig) -> CharClass; fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); @@ -27,23 +27,12 @@ impl AsciiChar { } } -impl fmt::Debug for AsciiChar { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Debug::fmt(&(self.0 as char), f) - } -} - impl fmt::Display for AsciiChar { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { Display::fmt(&(self.0 as char), f) } } -impl PartialEq for AsciiChar { - fn eq(&self, other: &char) -> bool { - self.0 as char == *other - } -} impl PartialEq for char { fn eq(&self, other: &AsciiChar) -> bool { other.0 as char == *self diff --git a/src/chars/normalize.rs b/src/chars/normalize.rs index 14b1236..772d768 100644 --- a/src/chars/normalize.rs +++ b/src/chars/normalize.rs @@ -495,7 +495,7 @@ static TABLE3: [char; LEN3] = generate_table(&DATA3); pub fn normalize(c: char) -> char { let i = c as u32; - if i < DATA1_START || DATA3_END >= i { + if i < DATA1_START || i >= DATA3_END { return c; } if i < DATA1_END { diff --git a/src/debug.rs b/src/debug.rs new file mode 100644 index 0000000..8f35d00 --- /dev/null +++ b/src/debug.rs @@ -0,0 +1,69 @@ +use crate::chars::Char; +use crate::matrix::{haystack, HaystackChar, Matrix, MatrixCell, MatrixRow, MatrixRowMut}; +use std::fmt::{Debug, Formatter, Result}; + +impl Matrix<'_, C> { + pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { + let mut cells = &*self.cells; + self.row_offs.iter().map(move |&off| { + let len = self.haystack.len() - off as usize; + let (row, tmp) = cells.split_at(len); + cells = tmp; + MatrixRow { off, cells: row } + }) + } + + pub fn haystack( + &self, + ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { + haystack(self.haystack, self.bonus, 0) + } +} + +impl Debug for MatrixCell { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "({}, {})", self.score, self.consecutive_chars) + } +} +impl Debug for HaystackChar { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "({}, {})", self.char, self.bonus) + } +} +impl Debug for MatrixRow<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut f = f.debug_list(); + f.entries((0..self.off).map(|_| &MatrixCell { + score: 0, + consecutive_chars: 0, + })); + f.entries(self.cells.iter()); + f.finish() + } +} +impl Debug for MatrixRowMut<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut f = f.debug_list(); + f.entries((0..self.off).map(|_| &(0, 0))); + f.entries(self.cells.iter()); + f.finish() + } +} +pub struct DebugList(I); +impl Debug for DebugList +where + I: Iterator + Clone, + I::Item: Debug, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + f.debug_list().entries(self.0.clone()).finish() + } +} +impl<'a, C: Char> Debug for Matrix<'a, C> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + f.debug_struct("Matrix") + .field("haystack", &DebugList(self.haystack())) + .field("matrix", &DebugList(self.rows())) + .finish() + } +} diff --git a/src/fuzzy_greedy.rs b/src/fuzzy_greedy.rs index 1eb0bc6..54fd340 100644 --- a/src/fuzzy_greedy.rs +++ b/src/fuzzy_greedy.rs @@ -12,22 +12,27 @@ impl Matcher { mut end: usize, indices: &mut Vec, ) -> Option { - let first_char_end = if H::ASCII { start + 1 } else { end }; - if !H::ASCII && needle.len() != 1 { - let mut needle_iter = needle[1..].iter().copied(); - if let Some(mut needle_char) = needle_iter.next() { - for (i, &c) in haystack[first_char_end..].iter().enumerate() { - if c.normalize(&self.config) == needle_char { - let Some(next_needle_char) = needle_iter.next() else { - end = i + 1; - break; - }; - needle_char = next_needle_char; + let first_char_end = if H::ASCII && N::ASCII { start + 1 } else { end }; + 'nonascii: { + if !H::ASCII || !N::ASCII { + let mut needle_iter = needle[1..].iter().copied(); + if let Some(mut needle_char) = needle_iter.next() { + for (i, &c) in haystack[first_char_end..].iter().enumerate() { + if c.normalize(&self.config) == needle_char { + let Some(next_needle_char) = needle_iter.next() else { + // we found a match so we are now in the same state + // as the prefilter would produce + end = first_char_end + i + 1; + break 'nonascii; + }; + needle_char = next_needle_char; + } } + // some needle chars were not matched bail out + return None; } } - } - // minimize the greedly match by greedy matching in reverse + } // minimize the greedly match by greedy matching in reverse let mut needle_iter = needle.iter().rev().copied(); let mut needle_char = needle_iter.next().unwrap(); diff --git a/src/fuzzy_optimal.rs b/src/fuzzy_optimal.rs index beff56b..25ee42d 100644 --- a/src/fuzzy_optimal.rs +++ b/src/fuzzy_optimal.rs @@ -19,6 +19,7 @@ impl Matcher { end: usize, indices: &mut Vec, ) -> Option { + println!("{start} {end}"); // construct a matrix (and copy the haystack), the matrix and haystack size are bounded // to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows // us to treat needle indices as u16 @@ -88,9 +89,9 @@ impl Matrix<'_, H> { let first_needle_char = needle[0]; let mut matrix_cells = 0; - for (i, ((c, matrix_cell), bonus_)) in col_iter { - let class = c.char_class(config); - *c = c.normalize(config); + for (i, ((c_, matrix_cell), bonus_)) in col_iter { + let (c, class) = c_.char_class_and_normalize(config); + *c_ = c; let bonus = config.bonus_for(prev_class, class); // save bonus for later so we don't have to recompute it each time @@ -98,7 +99,7 @@ impl Matrix<'_, H> { prev_class = class; let i = i as u16; - if *c == needle_char { + if c == needle_char { // save the first idx of each char if let Some(next) = row_iter.next() { matrix_cells += haystack_len - i; @@ -111,7 +112,7 @@ impl Matrix<'_, H> { matched = true; } } - if *c == first_needle_char { + if c == first_needle_char { let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; matrix_cell.consecutive_chars = 1; if needle.len() == 1 && score > max_score { @@ -195,7 +196,6 @@ impl Matrix<'_, H> { consecutive = diag_matrix_cell.consecutive_chars + 1; if consecutive > 1 { let first_bonus = self.bonus[col + 1 - consecutive as usize]; - println!("xoxo {bonus} {first_bonus} {consecutive}"); if bonus > first_bonus { if bonus >= BONUS_BOUNDARY { consecutive = 1 @@ -281,6 +281,5 @@ impl Matrix<'_, H> { prefer_match = new_prefer_match; col -= 1; } - println!("{:#?}", self); } } diff --git a/src/lib.rs b/src/lib.rs index 19d5042..637e79a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,8 @@ mod chars; mod config; +#[cfg(test)] +mod debug; mod fuzzy_greedy; mod fuzzy_optimal; mod matrix; @@ -24,32 +26,6 @@ pub struct Matcher { slab: MatrixSlab, } -// // impl Query { -// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) { -// // self.needle_chars.reserve(needle.len()); -// // self.needle_chars.extend(needle.chars().map(|mut c| { -// // if !c.is_ascii() { -// // self.is_ascii = false; -// // } -// // if smart_case { -// // if c.is_uppercase() { -// // self.ignore_case = false; -// // } -// // } else if self.ignore_case { -// // if self.is_ascii { -// // c = to_lower_case::(c) -// // } else { -// // c = to_lower_case::(c) -// // } -// // } -// // if normalize_ && !self.is_ascii { -// // c = normalize(c); -// // } -// // c -// // })) -// // } -// // } - impl Matcher { pub fn new(config: MatcherConfig) -> Self { Self { @@ -79,7 +55,7 @@ impl Matcher { needle_: Utf32Str<'_>, indidies: &mut Vec, ) -> Option { - if needle_.len() > haystack.len() { + if needle_.len() > haystack.len() || needle_.is_empty() { return None; } // if needle_.len() == haystack.len() { diff --git a/src/matrix.rs b/src/matrix.rs index 37eb6a9..3ee4e4c 100644 --- a/src/matrix.rs +++ b/src/matrix.rs @@ -1,5 +1,4 @@ use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; -use std::fmt::{Debug, Formatter, Result}; use std::marker::PhantomData; use std::mem::{size_of, take}; use std::ops::Index; @@ -74,30 +73,18 @@ impl MatrixLayout { } } -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy)] pub(crate) struct MatrixCell { pub score: u16, pub consecutive_chars: u16, } -impl Debug for MatrixCell { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {})", self.score, self.consecutive_chars) - } -} - -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy)] pub(crate) struct HaystackChar { pub char: C, pub bonus: u16, } -impl Debug for HaystackChar { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({:?}, {})", self.char, self.bonus) - } -} - #[derive(Clone, Copy)] pub(crate) struct MatrixRow<'a> { pub off: u16, @@ -116,43 +103,11 @@ impl Index for MatrixRow<'_> { } } -impl Debug for MatrixRow<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - let mut f = f.debug_list(); - f.entries((0..self.off).map(|_| &MatrixCell { - score: 0, - consecutive_chars: 0, - })); - f.entries(self.cells.iter()); - f.finish() - } -} - pub(crate) struct MatrixRowMut<'a> { pub off: u16, pub cells: &'a mut [MatrixCell], } -impl Debug for MatrixRowMut<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - let mut f = f.debug_list(); - f.entries((0..self.off).map(|_| &(0, 0))); - f.entries(self.cells.iter()); - f.finish() - } -} - -pub struct DebugList(I); -impl Debug for DebugList -where - I: Iterator + Clone, - I::Item: Debug, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - f.debug_list().entries(self.0.clone()).finish() - } -} - pub(crate) struct Matrix<'a, C: Char> { pub haystack: &'a mut [C], // stored as a separate array instead of struct @@ -163,16 +118,6 @@ pub(crate) struct Matrix<'a, C: Char> { } impl<'a, C: Char> Matrix<'a, C> { - pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { - let mut cells = &*self.cells; - self.row_offs.iter().map(move |&off| { - let len = self.haystack.len() - off as usize; - let (row, tmp) = cells.split_at(len); - cells = tmp; - MatrixRow { off, cells: row } - }) - } - pub fn rows_rev(&self) -> impl Iterator + ExactSizeIterator { let mut cells = &*self.cells; self.row_offs.iter().rev().map(move |&off| { @@ -182,21 +127,8 @@ impl<'a, C: Char> Matrix<'a, C> { MatrixRow { off, cells: row } }) } - pub fn haystack( - &self, - ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { - haystack(self.haystack, self.bonus, 0) - } } -impl<'a, C: Char> Debug for Matrix<'a, C> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - f.debug_struct("Matrix") - .field("haystack", &DebugList(self.haystack())) - .field("matrix", &DebugList(self.rows())) - .finish() - } -} pub(crate) fn haystack<'a, C: Char>( haystack: &'a [C], bonus: &'a [u16], diff --git a/src/prefilter.rs b/src/prefilter.rs index 634ea63..6b7c58e 100644 --- a/src/prefilter.rs +++ b/src/prefilter.rs @@ -79,7 +79,7 @@ impl Matcher { if only_greedy { Some((start, start + 1)) } else { - let end = start + haystack.len() + let end = haystack.len() - haystack[start..] .iter() .rev() diff --git a/src/tests.rs b/src/tests.rs index d51dffe..713fa2a 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -49,7 +49,7 @@ pub fn assert_matches( assert_eq!( res, Some(score), - "{needle:?} did not match {haystack:?}: {match_chars:?}" + "{needle:?} did not match {haystack:?}: matched {match_chars:?} {indices:?}" ); assert_eq!( match_chars, needle_chars, @@ -62,6 +62,42 @@ pub fn assert_matches( ); } } + +pub fn assert_not_matches( + normalize: bool, + case_sensitive: bool, + path: bool, + cases: &[(&str, &str)], +) { + let mut config = MatcherConfig { + normalize, + ignore_case: !case_sensitive, + ..MatcherConfig::DEFAULT + }; + if path { + config.set_match_paths(); + } + let mut matcher = Matcher::new(config); + let mut needle_buf = Vec::new(); + let mut haystack_buf = Vec::new(); + for &(haystack, needle) in cases { + let needle = if !case_sensitive { + needle.to_lowercase() + } else { + needle.to_owned() + }; + let needle = Utf32Str::new(&needle, &mut needle_buf); + let haystack = Utf32Str::new(haystack, &mut haystack_buf); + + let res = matcher.fuzzy_match(haystack, needle); + assert_eq!(res, None, "{needle:?} should not match {haystack:?}"); + let res = matcher.fuzzy_match_greedy(haystack, needle); + assert_eq!( + res, None, + "{needle:?} should not match {haystack:?} (greedy)" + ) + } +} const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; @@ -225,6 +261,52 @@ fn test_fuzzy_case_sensitive() { ); } +#[test] +fn test_fuzzy_case_sensitive_v1() { + assert_matches( + true, + false, + true, + false, + &[ + ( + "fooBarbaz1", + "oBz", + 2, + 9, + BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3, + ), + ( + "Foo/Bar/Baz", + "FBB", + 0, + 9, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "FooBarBaz", + "FBB", + 0, + 7, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 + - 2 * PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION, + ), + ( + "FooBar Baz", + "FooB", + 0, + 4, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3, + ), + // Consecutive bonus updated + ("foo-bar", "o-ba", 2, 6, BONUS_BOUNDARY * 2 + BONUS_NON_WORD), + ], + ); +} + #[test] fn test_v1_fuzzy() { assert_matches( @@ -338,3 +420,226 @@ fn test_v1_fuzzy() { ], ); } + +#[test] +fn test_normalize() { + assert_matches( + false, + true, + false, + false, + &[ + ( + "Só Danço Samba", + "So", + 0, + 2, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + ), + ( + "Só Danço Samba", + "sodc", + 0, + 7, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START + + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ( + "Danço", + "danco", + 0, + 5, + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "DanÇo", + "danco", + 0, + 5, + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "xÇando", + "cando", + 1, + 6, + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ], + ) +} + +#[test] +fn test_normalize_v1() { + assert_matches( + true, + true, + false, + false, + &[ + ( + "Só Danço Samba", + "So", + 0, + 2, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + ), + ( + "Só Danço Samba", + "sodc", + 0, + 7, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START + + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ( + "Danço", + "danco", + 0, + 5, + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "DanÇo", + "danco", + 0, + 5, + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "xÇando", + "cando", + 1, + 6, + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ], + ) +} + +#[test] +fn test_unicode_v1() { + assert_matches( + true, + true, + false, + false, + &[ + ( + "你好世界", + "你好", + 0, + 2, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + ), + ( + "你好世界", + "你世", + 0, + 3, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, + ), + ], + ) +} + +#[test] +fn test_unicode() { + assert_matches( + false, + true, + false, + false, + &[ + ( + "你好世界", + "你好", + 0, + 2, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + ), + ( + "你好世界", + "你世", + 0, + 3, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, + ), + ], + ) +} + +#[test] +fn test_long_str() { + assert_matches( + false, + false, + false, + false, + &[( + &"x".repeat(u16::MAX as usize + 1), + "xx", + 0, + 2, + (BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE, + )], + ); +} + +#[test] +fn test_optimal() { + assert_matches( + false, + false, + false, + false, + &[( + "axxx xx ", + "xx", + 5, + 7, + (BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE, + )], + ) +} + +#[test] +fn test_reject() { + assert_not_matches( + true, + false, + false, + &[ + ("你好界", "abc"), + ("你好世界", "富"), + ("Só Danço Samba", "sox"), + ("fooBarbaz", "fooBarbazz"), + ], + ); + assert_not_matches( + true, + true, + false, + &[ + ("你好界", "abc"), + ("abc", "你"), + ("你好世界", "富"), + ("Só Danço Samba", "sox"), + ("fooBarbaz", "oBZ"), + ("Foo Bar Baz", "fbb"), + ("fooBarbaz", "fooBarbazz"), + ], + ); + assert_not_matches( + false, + true, + false, + &[("Só Danço Samba", "sod"), ("Só Danço Samba", "soc")], + ) +} diff --git a/src/utf32_str.rs b/src/utf32_str.rs index 6f92042..cf66091 100644 --- a/src/utf32_str.rs +++ b/src/utf32_str.rs @@ -55,6 +55,13 @@ impl<'a> Utf32Str<'a> { Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), } } + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Utf32Str::Unicode(codepoints) => codepoints.is_empty(), + Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } #[inline] pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { @@ -105,8 +112,8 @@ impl<'a> Utf32Str<'a> { } pub fn last(&self) -> char { match self { - Utf32Str::Ascii(bytes) => bytes[bytes.len()] as char, - Utf32Str::Unicode(codepoints) => codepoints[codepoints.len()], + Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, + Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], } } pub fn chars(&self) -> Chars<'_> { diff --git a/tarpulin.toml b/tarpulin.toml new file mode 100644 index 0000000..3a54abf --- /dev/null +++ b/tarpulin.toml @@ -0,0 +1,3 @@ +exclude = ["src/tests.rs", "src/debug.rs", "src/chars/normalize.rs"] +[report] +out = ["Html", "Xml"] \ No newline at end of file