fix remaining bugs, achieve high coverage

This commit is contained in:
Pascal Kuthe 2023-07-21 00:16:15 +02:00
parent 9ffa5e63c2
commit 74e2b46f04
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
11 changed files with 419 additions and 134 deletions

View File

@ -9,7 +9,7 @@ use crate::MatcherConfig;
mod case_fold;
mod normalize;
pub trait Char: Copy + Eq + Ord + fmt::Debug + fmt::Display {
pub trait Char: Copy + Eq + Ord + fmt::Display {
const ASCII: bool;
fn char_class(self, config: &MatcherConfig) -> CharClass;
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass);
@ -27,23 +27,12 @@ impl AsciiChar {
}
}
impl fmt::Debug for AsciiChar {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Debug::fmt(&(self.0 as char), f)
}
}
impl fmt::Display for AsciiChar {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Display::fmt(&(self.0 as char), f)
}
}
impl PartialEq<char> for AsciiChar {
fn eq(&self, other: &char) -> bool {
self.0 as char == *other
}
}
impl PartialEq<AsciiChar> for char {
fn eq(&self, other: &AsciiChar) -> bool {
other.0 as char == *self

View File

@ -495,7 +495,7 @@ static TABLE3: [char; LEN3] = generate_table(&DATA3);
pub fn normalize(c: char) -> char {
let i = c as u32;
if i < DATA1_START || DATA3_END >= i {
if i < DATA1_START || i >= DATA3_END {
return c;
}
if i < DATA1_END {

69
src/debug.rs Normal file
View File

@ -0,0 +1,69 @@
use crate::chars::Char;
use crate::matrix::{haystack, HaystackChar, Matrix, MatrixCell, MatrixRow, MatrixRowMut};
use std::fmt::{Debug, Formatter, Result};
impl<C: Char> Matrix<'_, C> {
    /// Yields the matrix rows in `row_offs` order, carving each row off the
    /// front of the flat `cells` buffer.
    ///
    /// A row whose offset is `off` owns `haystack.len() - off` cells.
    pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
        let haystack_len = self.haystack.len();
        // Cells that have not been handed out to a row yet.
        let mut remaining = &*self.cells;
        self.row_offs.iter().map(move |&off| {
            let (row, rest) = remaining.split_at(haystack_len - off as usize);
            remaining = rest;
            MatrixRow { off, cells: row }
        })
    }

    /// Iterates over the haystack as `HaystackChar` items by delegating to the
    /// free `haystack` helper with the stored characters and bonuses.
    ///
    /// NOTE(review): the trailing `0` is presumably a start offset — confirm
    /// against the helper's signature.
    pub fn haystack(
        &self,
    ) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
        haystack(self.haystack, self.bonus, 0)
    }
}
impl Debug for MatrixCell {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
write!(f, "({}, {})", self.score, self.consecutive_chars)
}
}
impl<C: Char> Debug for HaystackChar<C> {
    /// Renders the entry as `(char, bonus)`, printing the character through its
    /// `Display` implementation.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let Self { char, bonus } = self;
        write!(f, "({}, {})", char, bonus)
    }
}
impl Debug for MatrixRow<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let mut f = f.debug_list();
f.entries((0..self.off).map(|_| &MatrixCell {
score: 0,
consecutive_chars: 0,
}));
f.entries(self.cells.iter());
f.finish()
}
}
impl Debug for MatrixRowMut<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let mut f = f.debug_list();
f.entries((0..self.off).map(|_| &(0, 0)));
f.entries(self.cells.iter());
f.finish()
}
}
/// Adapter that debug-prints a cloneable iterator as a list of its items.
pub struct DebugList<I>(I);

impl<I> Debug for DebugList<I>
where
    I: Iterator + Clone,
    I::Item: Debug,
{
    /// Clones the wrapped iterator (formatting only has `&self`) and emits one
    /// list entry per item.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut list = f.debug_list();
        for item in self.0.clone() {
            list.entry(&item);
        }
        list.finish()
    }
}
impl<'a, C: Char> Debug for Matrix<'a, C> {
    /// Prints a `Matrix` struct with two list fields: `haystack` (the items of
    /// `self.haystack()`) and `matrix` (the rows of `self.rows()`).
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let haystack = DebugList(self.haystack());
        let rows = DebugList(self.rows());
        let mut out = f.debug_struct("Matrix");
        out.field("haystack", &haystack);
        out.field("matrix", &rows);
        out.finish()
    }
}

View File

@ -12,22 +12,27 @@ impl Matcher {
mut end: usize,
indices: &mut Vec<u32>,
) -> Option<u16> {
let first_char_end = if H::ASCII { start + 1 } else { end };
if !H::ASCII && needle.len() != 1 {
let mut needle_iter = needle[1..].iter().copied();
if let Some(mut needle_char) = needle_iter.next() {
for (i, &c) in haystack[first_char_end..].iter().enumerate() {
if c.normalize(&self.config) == needle_char {
let Some(next_needle_char) = needle_iter.next() else {
end = i + 1;
break;
};
needle_char = next_needle_char;
let first_char_end = if H::ASCII && N::ASCII { start + 1 } else { end };
'nonascii: {
if !H::ASCII || !N::ASCII {
let mut needle_iter = needle[1..].iter().copied();
if let Some(mut needle_char) = needle_iter.next() {
for (i, &c) in haystack[first_char_end..].iter().enumerate() {
if c.normalize(&self.config) == needle_char {
let Some(next_needle_char) = needle_iter.next() else {
// we found a match so we are now in the same state
// as the prefilter would produce
end = first_char_end + i + 1;
break 'nonascii;
};
needle_char = next_needle_char;
}
}
// some needle chars were not matched bail out
return None;
}
}
}
// minimize the greedly match by greedy matching in reverse
} // minimize the greedly match by greedy matching in reverse
let mut needle_iter = needle.iter().rev().copied();
let mut needle_char = needle_iter.next().unwrap();

View File

@ -19,6 +19,7 @@ impl Matcher {
end: usize,
indices: &mut Vec<u32>,
) -> Option<u16> {
println!("{start} {end}");
// construct a matrix (and copy the haystack), the matrix and haystack size are bounded
// to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
// us to treat needle indices as u16
@ -88,9 +89,9 @@ impl<H: Char> Matrix<'_, H> {
let first_needle_char = needle[0];
let mut matrix_cells = 0;
for (i, ((c, matrix_cell), bonus_)) in col_iter {
let class = c.char_class(config);
*c = c.normalize(config);
for (i, ((c_, matrix_cell), bonus_)) in col_iter {
let (c, class) = c_.char_class_and_normalize(config);
*c_ = c;
let bonus = config.bonus_for(prev_class, class);
// save bonus for later so we don't have to recompute it each time
@ -98,7 +99,7 @@ impl<H: Char> Matrix<'_, H> {
prev_class = class;
let i = i as u16;
if *c == needle_char {
if c == needle_char {
// save the first idx of each char
if let Some(next) = row_iter.next() {
matrix_cells += haystack_len - i;
@ -111,7 +112,7 @@ impl<H: Char> Matrix<'_, H> {
matched = true;
}
}
if *c == first_needle_char {
if c == first_needle_char {
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
matrix_cell.consecutive_chars = 1;
if needle.len() == 1 && score > max_score {
@ -195,7 +196,6 @@ impl<H: Char> Matrix<'_, H> {
consecutive = diag_matrix_cell.consecutive_chars + 1;
if consecutive > 1 {
let first_bonus = self.bonus[col + 1 - consecutive as usize];
println!("xoxo {bonus} {first_bonus} {consecutive}");
if bonus > first_bonus {
if bonus >= BONUS_BOUNDARY {
consecutive = 1
@ -281,6 +281,5 @@ impl<H: Char> Matrix<'_, H> {
prefer_match = new_prefer_match;
col -= 1;
}
println!("{:#?}", self);
}
}

View File

@ -3,6 +3,8 @@
mod chars;
mod config;
#[cfg(test)]
mod debug;
mod fuzzy_greedy;
mod fuzzy_optimal;
mod matrix;
@ -24,32 +26,6 @@ pub struct Matcher {
slab: MatrixSlab,
}
// // impl Query {
// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) {
// // self.needle_chars.reserve(needle.len());
// // self.needle_chars.extend(needle.chars().map(|mut c| {
// // if !c.is_ascii() {
// // self.is_ascii = false;
// // }
// // if smart_case {
// // if c.is_uppercase() {
// // self.ignore_case = false;
// // }
// // } else if self.ignore_case {
// // if self.is_ascii {
// // c = to_lower_case::<true>(c)
// // } else {
// // c = to_lower_case::<false>(c)
// // }
// // }
// // if normalize_ && !self.is_ascii {
// // c = normalize(c);
// // }
// // c
// // }))
// // }
// // }
impl Matcher {
pub fn new(config: MatcherConfig) -> Self {
Self {
@ -79,7 +55,7 @@ impl Matcher {
needle_: Utf32Str<'_>,
indidies: &mut Vec<u32>,
) -> Option<u16> {
if needle_.len() > haystack.len() {
if needle_.len() > haystack.len() || needle_.is_empty() {
return None;
}
// if needle_.len() == haystack.len() {

View File

@ -1,5 +1,4 @@
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
use std::fmt::{Debug, Formatter, Result};
use std::marker::PhantomData;
use std::mem::{size_of, take};
use std::ops::Index;
@ -74,30 +73,18 @@ impl<C: Char> MatrixLayout<C> {
}
}
#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy)]
pub(crate) struct MatrixCell {
pub score: u16,
pub consecutive_chars: u16,
}
impl Debug for MatrixCell {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
write!(f, "({}, {})", self.score, self.consecutive_chars)
}
}
#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy)]
pub(crate) struct HaystackChar<C: Char> {
pub char: C,
pub bonus: u16,
}
impl<C: Char> Debug for HaystackChar<C> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
write!(f, "({:?}, {})", self.char, self.bonus)
}
}
#[derive(Clone, Copy)]
pub(crate) struct MatrixRow<'a> {
pub off: u16,
@ -116,43 +103,11 @@ impl Index<u16> for MatrixRow<'_> {
}
}
impl Debug for MatrixRow<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let mut f = f.debug_list();
f.entries((0..self.off).map(|_| &MatrixCell {
score: 0,
consecutive_chars: 0,
}));
f.entries(self.cells.iter());
f.finish()
}
}
pub(crate) struct MatrixRowMut<'a> {
pub off: u16,
pub cells: &'a mut [MatrixCell],
}
impl Debug for MatrixRowMut<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let mut f = f.debug_list();
f.entries((0..self.off).map(|_| &(0, 0)));
f.entries(self.cells.iter());
f.finish()
}
}
pub struct DebugList<I>(I);
impl<I> Debug for DebugList<I>
where
I: Iterator + Clone,
I::Item: Debug,
{
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
f.debug_list().entries(self.0.clone()).finish()
}
}
pub(crate) struct Matrix<'a, C: Char> {
pub haystack: &'a mut [C],
// stored as a separate array instead of struct
@ -163,16 +118,6 @@ pub(crate) struct Matrix<'a, C: Char> {
}
impl<'a, C: Char> Matrix<'a, C> {
pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
let mut cells = &*self.cells;
self.row_offs.iter().map(move |&off| {
let len = self.haystack.len() - off as usize;
let (row, tmp) = cells.split_at(len);
cells = tmp;
MatrixRow { off, cells: row }
})
}
pub fn rows_rev(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator {
let mut cells = &*self.cells;
self.row_offs.iter().rev().map(move |&off| {
@ -182,21 +127,8 @@ impl<'a, C: Char> Matrix<'a, C> {
MatrixRow { off, cells: row }
})
}
pub fn haystack(
&self,
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
haystack(self.haystack, self.bonus, 0)
}
}
impl<'a, C: Char> Debug for Matrix<'a, C> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
f.debug_struct("Matrix")
.field("haystack", &DebugList(self.haystack()))
.field("matrix", &DebugList(self.rows()))
.finish()
}
}
pub(crate) fn haystack<'a, C: Char>(
haystack: &'a [C],
bonus: &'a [u16],

View File

@ -79,7 +79,7 @@ impl Matcher {
if only_greedy {
Some((start, start + 1))
} else {
let end = start + haystack.len()
let end = haystack.len()
- haystack[start..]
.iter()
.rev()

View File

@ -49,7 +49,7 @@ pub fn assert_matches(
assert_eq!(
res,
Some(score),
"{needle:?} did not match {haystack:?}: {match_chars:?}"
"{needle:?} did not match {haystack:?}: matched {match_chars:?} {indices:?}"
);
assert_eq!(
match_chars, needle_chars,
@ -62,6 +62,42 @@ pub fn assert_matches(
);
}
}
/// Asserts that no `(haystack, needle)` pair in `cases` produces a match, for
/// both `Matcher::fuzzy_match` and `Matcher::fuzzy_match_greedy`.
///
/// The matcher is built from `MatcherConfig::DEFAULT` with `normalize` applied
/// and `ignore_case` set to `!case_sensitive`; `path` additionally calls
/// `set_match_paths`. When matching is case-insensitive the needle is
/// lowercased before it is handed to the matcher.
pub fn assert_not_matches(
    normalize: bool,
    case_sensitive: bool,
    path: bool,
    cases: &[(&str, &str)],
) {
    let mut config = MatcherConfig {
        normalize,
        ignore_case: !case_sensitive,
        ..MatcherConfig::DEFAULT
    };
    if path {
        config.set_match_paths();
    }
    let mut matcher = Matcher::new(config);
    // Scratch buffers reused across cases by `Utf32Str::new`.
    let (mut needle_buf, mut haystack_buf) = (Vec::new(), Vec::new());
    for &(haystack, needle) in cases {
        let needle_text = if case_sensitive {
            needle.to_owned()
        } else {
            needle.to_lowercase()
        };
        let needle = Utf32Str::new(&needle_text, &mut needle_buf);
        let haystack = Utf32Str::new(haystack, &mut haystack_buf);
        assert_eq!(
            matcher.fuzzy_match(haystack, needle),
            None,
            "{needle:?} should not match {haystack:?}"
        );
        assert_eq!(
            matcher.fuzzy_match_greedy(haystack, needle),
            None,
            "{needle:?} should not match {haystack:?} (greedy)"
        );
    }
}
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
@ -225,6 +261,52 @@ fn test_fuzzy_case_sensitive() {
);
}
// Runs `assert_matches` with the flags (true, false, true, false) on mixed-case
// needles such as "oBz" and "FBB".
// NOTE(review): judging by the test name, the first flag presumably selects the
// greedy "v1" matcher and the third enables case sensitivity — confirm against
// the `assert_matches` signature, which is not visible here.
// Each case appears to be (haystack, needle, start, end, expected score).
#[test]
fn test_fuzzy_case_sensitive_v1() {
assert_matches(
true,
false,
true,
false,
&[
(
"fooBarbaz1",
"oBz",
2,
9,
BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
),
(
"Foo/Bar/Baz",
"FBB",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
(
"FooBarBaz",
"FBB",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
- 2 * PENALTY_GAP_START
- 2 * PENALTY_GAP_EXTENSION,
),
(
"FooBar Baz",
"FooB",
0,
4,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
),
// Consecutive bonus updated
("foo-bar", "o-ba", 2, 6, BONUS_BOUNDARY * 2 + BONUS_NON_WORD),
],
);
}
#[test]
fn test_v1_fuzzy() {
assert_matches(
@ -338,3 +420,226 @@ fn test_v1_fuzzy() {
],
);
}
// Runs `assert_matches` with the flags (false, true, false, false): haystacks
// containing accented characters (ó, ç, Ç) are expected to match plain-ASCII
// needles.
// NOTE(review): the second flag is presumably `normalize` and the first the
// greedy "v1" switch — confirm against the `assert_matches` signature, which is
// not visible here.
// Each case appears to be (haystack, needle, start, end, expected score).
#[test]
fn test_normalize() {
assert_matches(
false,
true,
false,
false,
&[
(
"Só Danço Samba",
"So",
0,
2,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE,
),
(
"Só Danço Samba",
"sodc",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START
+ BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START
- PENALTY_GAP_EXTENSION,
),
(
"Danço",
"danco",
0,
5,
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
(
"DanÇo",
"danco",
0,
5,
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
(
"xÇando",
"cando",
1,
6,
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
],
)
}
// Same accented-haystack cases as `test_normalize`, but with the first
// `assert_matches` flag set to true: flags are (true, true, false, false).
// NOTE(review): the first flag presumably selects the greedy "v1" matcher —
// confirm against the `assert_matches` signature, which is not visible here.
#[test]
fn test_normalize_v1() {
assert_matches(
true,
true,
false,
false,
&[
(
"Só Danço Samba",
"So",
0,
2,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE,
),
(
"Só Danço Samba",
"sodc",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START
+ BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START
- PENALTY_GAP_EXTENSION,
),
(
"Danço",
"danco",
0,
5,
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
(
"DanÇo",
"danco",
0,
5,
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
(
"xÇando",
"cando",
1,
6,
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4),
),
],
)
}
// Matches CJK needles against a CJK haystack with the `assert_matches` flags
// (true, true, false, false): once for an adjacent pair of characters and once
// for a pair separated by one character.
// NOTE(review): the first flag presumably selects the greedy "v1" matcher —
// confirm against the `assert_matches` signature, which is not visible here.
#[test]
fn test_unicode_v1() {
assert_matches(
true,
true,
false,
false,
&[
(
"你好世界",
"你好",
0,
2,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE,
),
(
"你好世界",
"你世",
0,
3,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START,
),
],
)
}
// Same CJK cases as `test_unicode_v1`, but with the first `assert_matches` flag
// set to false: flags are (false, true, false, false).
// NOTE(review): the first flag presumably toggles the greedy "v1" matcher —
// confirm against the `assert_matches` signature, which is not visible here.
#[test]
fn test_unicode() {
assert_matches(
false,
true,
false,
false,
&[
(
"你好世界",
"你好",
0,
2,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE,
),
(
"你好世界",
"你世",
0,
3,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START,
),
],
)
}
// Matches the needle "xx" against a haystack of `u16::MAX + 1` copies of "x",
// i.e. one character more than a `u16` index can address, and expects the
// match at the very start (0, 2).
// NOTE(review): all four `assert_matches` flags are false here; their meaning
// is defined by the `assert_matches` signature, which is not visible here.
#[test]
fn test_long_str() {
assert_matches(
false,
false,
false,
false,
&[(
&"x".repeat(u16::MAX as usize + 1),
"xx",
0,
2,
(BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE,
)],
);
}
// The haystack contains two runs of "x"; the case expects the needle "xx" to be
// reported at (5, 7) — the run following the space — rather than at the earlier
// run, with a score built from `BONUS_BOUNDARY_WHITE`.
// NOTE(review): 5 and 7 are presumably the start/end of the match — confirm
// against the `assert_matches` signature, which is not visible here.
#[test]
fn test_optimal() {
assert_matches(
false,
false,
false,
false,
&[(
"axxx xx ",
"xx",
5,
7,
(BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE,
)],
)
}
// Checks via `assert_not_matches(normalize, case_sensitive, path, cases)` that
// neither matcher reports a match for these (haystack, needle) pairs.
// Three configurations are covered: normalized + case-insensitive,
// normalized + case-sensitive, and non-normalized + case-sensitive.
// The cases include empty needles and a needle longer than its haystack.
#[test]
fn test_reject() {
assert_not_matches(
true,
false,
false,
&[
("你好界", "abc"),
("你好世界", ""),
("Só Danço Samba", "sox"),
("fooBarbaz", "fooBarbazz"),
],
);
assert_not_matches(
true,
true,
false,
&[
("你好界", "abc"),
("abc", ""),
("你好世界", ""),
("Só Danço Samba", "sox"),
("fooBarbaz", "oBZ"),
("Foo Bar Baz", "fbb"),
("fooBarbaz", "fooBarbazz"),
],
);
assert_not_matches(
false,
true,
false,
&[("Só Danço Samba", "sod"), ("Só Danço Samba", "soc")],
)
}

View File

@ -55,6 +55,13 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
}
}
/// Returns `true` when the string holds no characters, whichever
/// representation (`Unicode` or `Ascii`) it uses.
#[inline]
pub fn is_empty(&self) -> bool {
    let len = match self {
        Utf32Str::Unicode(codepoints) => codepoints.len(),
        Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
    };
    len == 0
}
#[inline]
pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str {
@ -105,8 +112,8 @@ impl<'a> Utf32Str<'a> {
}
pub fn last(&self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len()] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len()],
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
}
}
pub fn chars(&self) -> Chars<'_> {

3
tarpulin.toml Normal file
View File

@ -0,0 +1,3 @@
exclude = ["src/tests.rs", "src/debug.rs", "src/chars/normalize.rs"]
[report]
out = ["Html", "Xml"]