diff --git a/src/chars.rs b/src/chars.rs index 530b7d0..a3d6f50 100644 --- a/src/chars.rs +++ b/src/chars.rs @@ -1,3 +1,5 @@ +use std::fmt::{self, Debug, Display}; + use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::MatcherConfig; @@ -7,18 +9,52 @@ use crate::MatcherConfig; mod case_fold; mod normalize; -pub trait Char: Copy + Eq + Ord + std::fmt::Debug { +pub trait Char: Copy + Eq + Ord + fmt::Debug + fmt::Display { const ASCII: bool; fn char_class(self, config: &MatcherConfig) -> CharClass; fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); fn normalize(self, config: &MatcherConfig) -> Self; } -impl Char for u8 { +/// repr tansparent wrapper around u8 with better formatting and PartialEq implementation +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub(crate) struct AsciiChar(u8); + +impl AsciiChar { + pub fn cast(bytes: &[u8]) -> &[AsciiChar] { + unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) } + } +} + +impl fmt::Debug for AsciiChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Debug::fmt(&(self.0 as char), f) + } +} + +impl fmt::Display for AsciiChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Display::fmt(&(self.0 as char), f) + } +} + +impl PartialEq for AsciiChar { + fn eq(&self, other: &char) -> bool { + self.0 as char == *other + } +} +impl PartialEq for char { + fn eq(&self, other: &AsciiChar) -> bool { + other.0 as char == *self + } +} + +impl Char for AsciiChar { const ASCII: bool = true; #[inline] fn char_class(self, config: &MatcherConfig) -> CharClass { - let c = self; + let c = self.0; // using manual if conditions instead optimizes better if c >= b'a' && c <= b'z' { CharClass::Lower @@ -36,23 +72,20 @@ impl Char for u8 { } #[inline(always)] - fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass) { + fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { let char_class = self.char_class(config); - let normalized = if config.ignore_case && char_class == CharClass::Upper { - self + 32 - } else { - self - }; - (normalized, char_class) + if config.ignore_case && char_class == CharClass::Upper { + self.0 += 32 + } + (self, char_class) } #[inline(always)] - fn normalize(self, config: &MatcherConfig) -> Self { - if config.ignore_case && self >= b'A' && self <= b'Z' { - self + 32 - } else { - self + fn normalize(mut self, config: &MatcherConfig) -> Self { + if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { + self.0 += 32 } + self } } fn char_class_non_ascii(c: char) -> CharClass { @@ -75,7 +108,7 @@ impl Char for char { #[inline(always)] fn char_class(self, config: &MatcherConfig) -> CharClass { if self.is_ascii() { - return (self as u8).char_class(config); + return AsciiChar(self as u8).char_class(config); } char_class_non_ascii(self) } @@ -83,8 +116,8 @@ impl Char for char { #[inline(always)] fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { if self.is_ascii() { - let (c, class) = (self as u8).char_class_and_normalize(config); - return (c as char, class); + let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); + return (c.0 as char, class); } let char_class = char_class_non_ascii(self); if char_class == CharClass::Upper { diff --git a/src/fuzzy_greedy.rs b/src/fuzzy_greedy.rs index 8a61052..3dcd6d1 100644 --- a/src/fuzzy_greedy.rs +++ b/src/fuzzy_greedy.rs @@ -32,7 +32,6 @@ impl Matcher { let mut needle_iter = needle.iter().rev().copied(); let mut needle_char = needle_iter.next().unwrap(); for (i, &c) in haystack[start..end].iter().enumerate().rev() { - println!("{c:?} {i} {needle_char:?}"); if c == needle_char { let Some(next_needle_char) = needle_iter.next() else { start += i; diff --git a/src/fuzzy_optimal.rs b/src/fuzzy_optimal.rs index f63aa3e..27d7840 100644 --- a/src/fuzzy_optimal.rs +++ b/src/fuzzy_optimal.rs @@ -1,4 +1,5 @@ use std::cmp::max; +use std::mem::take; use crate::chars::{Char, CharClass}; use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow}; @@ -54,8 +55,6 @@ impl Matcher { if INDICIES { matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end); } - println!("{indicies:?}"); - println!("{}", max_score); Some(max_score) } } @@ -70,6 +69,7 @@ impl Matrix<'_, H> { where H: PartialEq, { + let haystack_len = self.haystack.len() as u16; let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut()); let (mut needle_char, mut row_start) = row_iter.next().unwrap(); @@ -86,6 +86,7 @@ impl Matrix<'_, H> { let mut prev_score = 0u16; let mut matched = false; let first_needle_char = needle[0]; + let mut matrix_cells = 0; for (i, ((c, matrix_cell), bonus_)) in col_iter { let class = c.char_class(config); @@ -97,23 +98,21 @@ impl Matrix<'_, H> { prev_class = class; let i = i as u16; - println!("{i} {needle_char:?} {c:?}"); if *c == needle_char { // save the first idx of each char if let Some(next) = row_iter.next() { + matrix_cells += haystack_len - i; *row_start = i; (needle_char, row_start) = next; - } else { - if !matched { - *row_start = i; - } + } else if !matched { + matrix_cells += haystack_len - i; + *row_start = i; // we have atleast one match matched = true; } } if *c == first_needle_char { let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; - println!("start match {score}"); matrix_cell.consecutive_chars = 1; if needle.len() == 1 && score > max_score { max_score = score; @@ -137,7 +136,7 @@ impl Matrix<'_, H> { } prev_score = matrix_cell.score; } - + self.cells = &mut take(&mut self.cells)[..matrix_cells as usize]; (max_score_pos, max_score, matched) } @@ -208,7 +207,6 @@ impl Matrix<'_, H> { } in_gap = score1 < score2; let score = max(score1, score2); - println!("{score} {score1} {score2}"); if i == needle.len() - 1 && score > max_score { max_score = score; max_score_end = col as u16; @@ -231,7 +229,7 @@ impl Matrix<'_, H> { ) { indicies.resize(needle.len(), 0); - let mut row_iter = self.rows_rev().zip(indicies.iter_mut()).peekable(); + let mut row_iter = self.rows_rev().zip(indicies.iter_mut().rev()).peekable(); let (mut row, mut matched_col_idx) = row_iter.next().unwrap(); let mut next_row: Option = None; let mut col = best_match_end; @@ -239,7 +237,7 @@ impl Matrix<'_, H> { let haystack_len = self.haystack.len() as u16; loop { - let score = row.cells[col as usize].score; + let score = row[col].score; let mut score1 = 0; let mut score2 = 0; if let Some(&(prev_row, _)) = row_iter.peek() { @@ -250,19 +248,20 @@ impl Matrix<'_, H> { if col > row.off { score2 = row[col - 1].score; } - println!("{score} {score2} {score1} {prefer_match}"); let mut new_prefer_match = row[col].consecutive_chars > 1; if !new_prefer_match && col + 1 < haystack_len { if let Some(next_row) = next_row { - new_prefer_match = next_row[col + 1].consecutive_chars > 0 + if col + 1 > next_row.off { + new_prefer_match = next_row[col + 1].consecutive_chars > 0 + } } } if score > score1 && (score > score2 || score == score2 && prefer_match) { *matched_col_idx = col as u32 + start; next_row = Some(row); let Some(next) = row_iter.next() else { - break; - }; + break; + }; (row, matched_col_idx) = next } prefer_match = new_prefer_match; diff --git a/src/lib.rs b/src/lib.rs index e61dd7e..9ae70a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,11 +10,12 @@ mod prefilter; mod score; mod utf32_str; -// #[cfg(test)] -// mod tests; +#[cfg(test)] +mod tests; pub use config::MatcherConfig; +use crate::chars::AsciiChar; use crate::matrix::MatrixSlab; use crate::utf32_str::Utf32Str; @@ -61,12 +62,29 @@ impl Matcher { assert!(haystack.len() <= u32::MAX as usize); self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) } + + pub fn fuzzy_indicies( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indidies: &mut Vec, + ) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::(haystack, needle, indidies) + } + fn fuzzy_matcher_impl( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, indidies: &mut Vec, ) -> Option { + if needle_.len() > haystack.len() { + return None; + } + // if needle_.len() == haystack.len() { + // return self.exact_match(); + // } assert!( haystack.len() <= u32::MAX as usize, "fuzzy matching is only support for up to 2^32-1 codepoints" @@ -74,8 +92,13 @@ impl Matcher { match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?; - self.fuzzy_match_optimal::( - haystack, needle, start, greedy_end, end, indidies, + self.fuzzy_match_optimal::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + end, + indidies, ) } (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { @@ -84,16 +107,15 @@ impl Matcher { None } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - todo!() - // let (start, end) = self.prefilter_non_ascii(haystack, needle_)?; - // self.fuzzy_match_optimal::( - // haystack, - // needle, - // start, - // start + 1, - // end, - // indidies, - // ) + let (start, end) = self.prefilter_non_ascii(haystack, needle_)?; + self.fuzzy_match_optimal::( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + end, + indidies, + ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { let (start, end) = self.prefilter_non_ascii(haystack, needle_)?; diff --git a/src/matrix.rs b/src/matrix.rs index 47e1988..dadd526 100644 --- a/src/matrix.rs +++ b/src/matrix.rs @@ -82,7 +82,7 @@ pub(crate) struct MatrixCell { impl Debug for MatrixCell { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - (self.score, self.consecutive_chars).fmt(f) + write!(f, "({}, {})", self.score, self.consecutive_chars) } } @@ -94,7 +94,7 @@ pub(crate) struct HaystackChar { impl Debug for HaystackChar { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - (self.char, self.bonus).fmt(f) + write!(f, "({:?}, {})", self.char, self.bonus) } } @@ -103,18 +103,26 @@ pub(crate) struct MatrixRow<'a> { pub off: u16, pub cells: &'a [MatrixCell], } + +/// Intexing returns the cell that corresponds to colmun `col` in this row, +/// this is not the same as directly indexing the cells array because every row +/// starts at a column offset which needs to be accounted for impl Index for MatrixRow<'_> { type Output = MatrixCell; - fn index(&self, index: u16) -> &Self::Output { - &self.cells[index as usize] + #[inline(always)] + fn index(&self, col: u16) -> &Self::Output { + &self.cells[(col - self.off) as usize] } } impl Debug for MatrixRow<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> Result { let mut f = f.debug_list(); - f.entries((0..self.off).map(|_| &(0, 0))); + f.entries((0..self.off).map(|_| &MatrixCell { + score: 0, + consecutive_chars: 0, + })); f.entries(self.cells.iter()); f.finish() } @@ -250,7 +258,7 @@ impl MatrixSlab { let matrix_layout = MatrixLayout::::new( haystack_.len(), needle_len, - (haystack_.len() - needle_len / 2) * needle_len, + (haystack_.len() + 1 - needle_len / 2) * needle_len, ); if matrix_layout.layout.size() > size_of::() { return None; diff --git a/src/prefilter.rs b/src/prefilter.rs index 0d7c5da..cc73db1 100644 --- a/src/prefilter.rs +++ b/src/prefilter.rs @@ -38,7 +38,8 @@ impl Matcher { haystack = &haystack[idx..]; } let end = eager_end - + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack).unwrap_or(0); + + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack) + .map_or(0, |i| i + 1); Some((start, eager_end, end)) } else { let start = memchr(needle[0], haystack)?; @@ -49,7 +50,7 @@ impl Matcher { eager_end += idx; haystack = &haystack[idx..]; } - let end = eager_end + memrchr(*needle.last().unwrap(), haystack).unwrap_or(0); + let end = eager_end + memrchr(*needle.last().unwrap(), haystack).map_or(0, |i| i + 1); Some((start, eager_end, end)) } } @@ -64,9 +65,11 @@ impl Matcher { .iter() .position(|c| c.normalize(&self.config) == needle_char)?; let needle_char = needle.last(); - let end = haystack[start..] - .iter() - .position(|c| c.normalize(&self.config) == needle_char)?; + let end = start + haystack.len() + - haystack[start..] + .iter() + .rev() + .position(|c| c.normalize(&self.config) == needle_char)?; Some((start, end)) } diff --git a/src/tests.rs b/src/tests.rs index 5baf52b..6d8a903 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,8 +1,10 @@ -use crate::config::{ +use crate::chars::Char; +use crate::score::{ BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, }; -use crate::{CaseMatching, Matcher, MatcherConfig}; +use crate::utf32_str::Utf32Str; +use crate::{Matcher, MatcherConfig}; pub fn assert_matches( use_v1: bool, @@ -12,13 +14,8 @@ pub fn assert_matches( cases: &[(&str, &str, u32, u32, u16)], ) { let mut config = MatcherConfig { - use_v1, normalize, - case_matching: if case_sensitive { - CaseMatching::Respect - } else { - CaseMatching::Ignore - }, + ignore_case: !case_sensitive, ..MatcherConfig::DEFAULT }; if path { @@ -26,11 +23,31 @@ pub fn assert_matches( } let mut matcher = Matcher::new(config); let mut indicies = Vec::new(); + let mut needle_buf = Vec::new(); + let mut haystack_buf = Vec::new(); for &(haystack, needle, start, end, mut score) in cases { - score += needle.chars().count() as u16 * SCORE_MATCH; - let query = matcher.compile_query(needle); - let res = matcher.fuzzy_indicies(&query, haystack, &mut indicies); - assert_eq!(res, Some(score), "{needle:?} did not match {haystack:?}"); + let needle = if !case_sensitive { + needle.to_lowercase() + } else { + needle.to_owned() + }; + let needle = Utf32Str::new(&needle, &mut needle_buf); + let haystack = Utf32Str::new(haystack, &mut haystack_buf); + score += needle.len() as u16 * SCORE_MATCH; + + let res = matcher.fuzzy_indicies(haystack, needle, &mut indicies); + let match_chars: Vec<_> = indicies + .iter() + .map(|&i| haystack.get(i).normalize(&matcher.config)) + .collect(); + let needle_chars: Vec<_> = needle.chars().collect(); + + assert_eq!( + res, + Some(score), + "{needle:?} did not match {haystack:?}: {match_chars:?}" + ); + assert_eq!(match_chars, needle_chars, "match indicies are incorrect"); assert_eq!( indicies.first().copied()..indicies.last().map(|&i| i + 1), Some(start)..Some(end), diff --git a/src/utf32_str.rs b/src/utf32_str.rs index 982cf16..67e1aff 100644 --- a/src/utf32_str.rs +++ b/src/utf32_str.rs @@ -1,4 +1,5 @@ use std::ops::{Bound, RangeBounds}; +use std::slice; /// A UTF32 encoded (char array) String that can be used as an input to fuzzy matching. /// @@ -108,16 +109,25 @@ impl<'a> Utf32Str<'a> { Utf32Str::Unicode(codepoints) => codepoints[codepoints.len()], } } + pub fn chars(&self) -> Chars<'_> { + match self { + Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), + Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), + } + } } -// impl Str for &[char] { -// type Chars; +pub enum Chars<'a> { + Ascii(slice::Iter<'a, u8>), + Unicode(slice::Iter<'a, char>), +} +impl<'a> Iterator for Chars<'a> { + type Item = char; -// fn chars(&self) -> Self::Chars { -// todo!() -// } - -// fn slice(&self, range: impl RangeBounds) { -// todo!() -// } -// } + fn next(&mut self) -> Option { + match self { + Chars::Ascii(iter) => iter.next().map(|&c| c as char), + Chars::Unicode(iter) => iter.next().copied(), + } + } +}