diff --git a/src/chars.rs b/src/chars.rs index 5905f77..7764e33 100644 --- a/src/chars.rs +++ b/src/chars.rs @@ -19,7 +19,7 @@ pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { /// repr tansparent wrapper around u8 with better formatting and PartialEq implementation #[repr(transparent)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -pub(crate) struct AsciiChar(u8); +pub(crate) struct AsciiChar(pub u8); impl AsciiChar { pub fn cast(bytes: &[u8]) -> &[AsciiChar] { diff --git a/src/debug.rs b/src/debug.rs index 8f35d00..d167aeb 100644 --- a/src/debug.rs +++ b/src/debug.rs @@ -1,69 +1,32 @@ -use crate::chars::Char; -use crate::matrix::{haystack, HaystackChar, Matrix, MatrixCell, MatrixRow, MatrixRowMut}; +use crate::matrix::{MatrixCell, ScoreCell}; use std::fmt::{Debug, Formatter, Result}; -impl Matrix<'_, C> { - pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { - let mut cells = &*self.cells; - self.row_offs.iter().map(move |&off| { - let len = self.haystack.len() - off as usize; - let (row, tmp) = cells.split_at(len); - cells = tmp; - MatrixRow { off, cells: row } - }) - } +// impl MatcherData<'_, C> { +// pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { +// let mut cells = &*self.cells; +// self.row_offs.iter().map(move |&off| { +// let len = self.haystack.len() - off as usize; +// let (row, tmp) = cells.split_at(len); +// cells = tmp; +// MatrixRow { off, cells: row } +// }) +// } - pub fn haystack( - &self, - ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { - haystack(self.haystack, self.bonus, 0) +// pub fn haystack( +// &self, +// ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { +// haystack(self.haystack, self.bonus, 0) +// } +// } + +impl Debug for ScoreCell { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "({}, {}, {})", self.score, self.bonus, self.matched) } } impl Debug for MatrixCell { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {})", self.score, self.consecutive_chars) - } -} -impl Debug for HaystackChar { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {})", self.char, self.bonus) - } -} -impl Debug for MatrixRow<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - let mut f = f.debug_list(); - f.entries((0..self.off).map(|_| &MatrixCell { - score: 0, - consecutive_chars: 0, - })); - f.entries(self.cells.iter()); - f.finish() - } -} -impl Debug for MatrixRowMut<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - let mut f = f.debug_list(); - f.entries((0..self.off).map(|_| &(0, 0))); - f.entries(self.cells.iter()); - f.finish() - } -} -pub struct DebugList(I); -impl Debug for DebugList -where - I: Iterator + Clone, - I::Item: Debug, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - f.debug_list().entries(self.0.clone()).finish() - } -} -impl<'a, C: Char> Debug for Matrix<'a, C> { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - f.debug_struct("Matrix") - .field("haystack", &DebugList(self.haystack())) - .field("matrix", &DebugList(self.rows())) - .finish() + write!(f, "({}, {})", (self.0 & 1) != 0, (self.0 & 2) != 0) } } diff --git a/src/exact.rs b/src/exact.rs new file mode 100644 index 0000000..ec28763 --- /dev/null +++ b/src/exact.rs @@ -0,0 +1,107 @@ +use memchr::{Memchr, Memchr2}; + +use crate::chars::{AsciiChar, Char}; +use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH}; +use crate::Matcher; + +impl Matcher { + pub(crate) fn substring_match_1_ascii( + &mut self, + haystack: &[u8], + c: u8, + indices: &mut Vec, + ) -> Option { + let mut max_score = 0; + let mut max_pos = 0; + if self.config.ignore_case && c >= b'a' && c <= b'z' { + for i in Memchr2::new(c, c - 32, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let char_class = AsciiChar(haystack[i]).char_class(&self.config); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white + && score >= self.config.bonus_boundary_delimiter + { + break; + } + } + } + } else { + let char_class = AsciiChar(c).char_class(&self.config); + for i in Memchr::new(c, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white + && score >= self.config.bonus_boundary_delimiter + { + break; + } + } + } + } + if max_score == 0 { + return None; + } + + if INDICES { + indices.clear(); + indices.push(max_pos); + } + Some(max_score) + } + + pub(crate) fn substring_match_1_non_ascii( + &mut self, + haystack: &[char], + needle: char, + start: usize, + indices: &mut Vec, + ) -> u16 { + let mut max_score = 0; + let mut max_pos = 0; + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + for (i, &c) in haystack.iter().enumerate() { + let (c, char_class) = c.char_class_and_normalize(&self.config); + if c != needle { + continue; + } + let bonus = self.config.bonus_for(prev_class, char_class); + prev_class = char_class; + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white + && score >= self.config.bonus_boundary_delimiter + { + break; + } + } + } + + if INDICES { + indices.clear(); + indices.push(max_pos); + } + max_score + } +} diff --git a/src/fuzzy_optimal.rs b/src/fuzzy_optimal.rs index 6a9a247..04e570e 100644 --- a/src/fuzzy_optimal.rs +++ b/src/fuzzy_optimal.rs @@ -1,11 +1,10 @@ use std::cmp::max; -use std::mem::take; use crate::chars::{Char, CharClass}; -use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow}; +use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; use crate::score::{ - BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, - PENALTY_GAP_START, SCORE_MATCH, + BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, + SCORE_MATCH, }; use crate::{Matcher, MatcherConfig}; @@ -36,61 +35,82 @@ impl Matcher { .checked_sub(1) .map(|i| haystack[i].char_class(&self.config)) .unwrap_or(self.config.initial_char_class); - let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config); + let matched = matrix.setup::(needle, prev_class, &self.config); // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects if !matched { - debug_assert!(!(H::ASCII && N::ASCII)); + assert!( + !N::ASCII || !H::ASCII, + "should have been caught by prefilter" + ); return None; } - if needle.len() == 1 { - indices.clear(); - indices.push(max_score_pos as u32 + start as u32); - return Some(max_score); - } - debug_assert_eq!( - matrix.row_offs[0], 0, - "prefilter should have put us at the start of the match" - ); // populate the matrix and find the best score - let (max_score, best_match_end) = matrix.populate_matrix(needle); + let matrix_len = matrix.populate_matrix::(needle); + let last_row_off = matrix.row_offs[needle.len() - 1]; + let relative_last_row_off = last_row_off as usize + 1 - needle.len(); + let (match_end, match_score_cell) = matrix.current_row[relative_last_row_off..] + .iter() + .enumerate() + .max_by_key(|(_, cell)| cell.score) + .expect("there must be atleast one match"); if INDICES { - matrix.reconstruct_optimal_path(needle, start as u32, indices, best_match_end); + matrix.reconstruct_optimal_path(match_end as u16, indices, matrix_len, start as u32); } - Some(max_score) + Some(match_score_cell.score as u16) } } -impl Matrix<'_, H> { - fn setup( +fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> ScoreCell { + let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE)); + let score_match = m_score + consecutive_bonus as i32; + let score_skip = p_score + next_bonus as i32; + if score_match > score_skip { + ScoreCell { + score: score_match + SCORE_MATCH as i32, + bonus: consecutive_bonus, + matched: true, + } + } else { + ScoreCell { + score: score_skip + SCORE_MATCH as i32, + bonus: consecutive_bonus, + matched: false, + } + } +} + +fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) { + let score_match = prev_m_score - PENALTY_GAP_START as i32; + let score_skip = prev_p_score - PENALTY_GAP_EXTENSION as i32; + if score_match > score_skip { + (score_match, true) + } else { + (score_skip, false) + } +} + +impl MatcherDataView<'_, H> { + fn setup( &mut self, needle: &[N], mut prev_class: CharClass, config: &MatcherConfig, - ) -> (u16, u16, bool) + ) -> bool where H: PartialEq, { - let haystack_len = self.haystack.len() as u16; let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut()); let (mut needle_char, mut row_start) = row_iter.next().unwrap(); let col_iter = self .haystack .iter_mut() - .zip(self.cells.iter_mut()) .zip(self.bonus.iter_mut()) .enumerate(); - let mut max_score = 0; - let mut max_score_pos = 0; - let mut in_gap = false; - let mut prev_score = 0u16; let mut matched = false; - let first_needle_char = needle[0]; - let mut matrix_cells = 0; - - for (i, ((c_, matrix_cell), bonus_)) in col_iter { + for (i, (c_, bonus_)) in col_iter { let (c, class) = c_.char_class_and_normalize(config); *c_ = c; @@ -103,195 +123,197 @@ impl Matrix<'_, H> { if c == needle_char { // save the first idx of each char if let Some(next) = row_iter.next() { - matrix_cells += haystack_len - i; *row_start = i; (needle_char, row_start) = next; } else if !matched { - matrix_cells += haystack_len - i; *row_start = i; // we have atleast one match matched = true; } } + } + if !matched { + return false; + } + debug_assert_eq!(self.row_offs[0], 0); + Self::score_row::( + self.current_row, + self.matrix_cells, + self.haystack, + self.bonus, + 0, + self.row_offs[1], + 0, + needle[0], + needle[1], + ); + true + } - // we calculate two scores: - // * one for transversing the matrix horizontially (no match at - // the current char) - // * one for transversing the matrix diagonally (match at the - // current char) - // the maximum of those two scores is used - let gap_penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - let score_gap = prev_score.saturating_sub(gap_penalty); - let score_match = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; - if c == first_needle_char && score_match >= score_gap { - matrix_cell.consecutive_chars = 1; - matrix_cell.score = score_match; - in_gap = false; - if needle.len() == 1 && score_match > max_score { - max_score = score_match; - max_score_pos = i; - // can't get better than this - if bonus >= BONUS_BOUNDARY { - break; + fn score_row( + current_row: &mut [ScoreCell], + matrix_cells: &mut [MatrixCell], + haystack: &[H], + bonus: &[u16], + row_off: u16, + mut next_row_off: u16, + needle_idx: u16, + needle_char: N, + next_needle_char: N, + ) where + H: PartialEq, + { + next_row_off -= 1; + let relative_row_off = row_off - needle_idx; + let next_relative_row_off = next_row_off - needle_idx; + let skipped_col_iter = haystack[row_off as usize..next_row_off as usize] + .iter() + .zip(bonus[row_off as usize..next_row_off as usize].iter()) + .zip(current_row[relative_row_off as usize..next_relative_row_off as usize].iter_mut()) + .zip(matrix_cells.iter_mut()); + let mut prev_p_score = i32::MIN / 2; + let mut prev_m_score = i32::MIN / 2; + for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter { + let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); + let m_cell = if FIRST_ROW { + if c == needle_char { + // TODO: do we really want to start with a penalty here?? + let mut cell = + next_m_score(0, i32::MIN / 2, 0, bonus * BONUS_FIRST_CHAR_MULTIPLIER); + cell.bonus = *bonus; + cell + } else { + ScoreCell { + score: i32::MIN / 2, + bonus: 0, + matched: false, } } } else { - matrix_cell.consecutive_chars = 0; - matrix_cell.score = score_gap; - in_gap = true; + *score_cell + }; + if INDICES { + matrix_cell.set(p_matched, m_cell.matched); } - prev_score = matrix_cell.score; + prev_p_score = p_score; + prev_m_score = m_cell.score; + } + let col_iter = haystack[next_row_off as usize..] + .windows(2) + .zip(bonus[next_row_off as usize..].windows(2)) + .zip(current_row[next_relative_row_off as usize..].iter_mut()) + .zip(matrix_cells[(next_relative_row_off - relative_row_off) as usize..].iter_mut()); + for (((c, bonus), score_cell), matrix_cell) in col_iter { + let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); + let m_cell = if FIRST_ROW { + if c[0] == needle_char { + // TODO: do we really want to start with a penalty here?? + let mut cell = + next_m_score(0, i32::MIN / 2, 0, bonus[0] * BONUS_FIRST_CHAR_MULTIPLIER); + cell.bonus = bonus[0]; + cell + } else { + ScoreCell { + score: i32::MIN / 2, + bonus: 0, + matched: false, + } + } + } else { + *score_cell + }; + *score_cell = if c[1] == next_needle_char { + next_m_score(p_score, m_cell.score, m_cell.bonus, bonus[1]) + } else { + ScoreCell { + score: i32::MIN / 2, + bonus: 0, + matched: false, + } + }; + if INDICES { + matrix_cell.set(p_matched, m_cell.matched); + } + prev_p_score = p_score; + prev_m_score = m_cell.score; } - self.cells = &mut take(&mut self.cells)[..matrix_cells as usize]; - (max_score_pos, max_score, matched) } - fn populate_matrix(&mut self, needle: &[N]) -> (u16, u16) + fn populate_matrix(&mut self, needle: &[N]) -> usize where H: PartialEq, { - let mut max_score = 0; - let mut max_score_end = 0; - - let mut row_iter = needle + let mut matrix_cells = &mut self.matrix_cells[self.current_row.len()..]; + let mut row_iter = needle[1..] .iter() - .zip(rows_mut(self.row_offs, self.cells, self.haystack.len())) + .copied() + .zip(self.row_offs[1..].iter().copied()) .enumerate(); - // skip the first row we already calculated the in `setup` initial scores - let (_, mut prev_matrix_row) = row_iter.next().unwrap().1; - - for (i, (&needle_char, row)) in row_iter { - let haystack = haystack(self.haystack, self.bonus, row.off); - let mut in_gap = false; - let mut prev_matrix_cell = MatrixCell { - score: 0, - consecutive_chars: 0, - }; - // we are interested in the score of the previous character - // in the previous row. This represents the previous char - // for each possible pattern. This is equivalent to diagonal movement - let diagonal_start = row.off - prev_matrix_row.off - 1; - let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..]; - - for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack - .zip(row.cells.iter_mut()) - .zip(diagonal.iter()) - .enumerate() - { - let col = j + row.off as usize; - let gap_penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - // we calculate two scores: - // * one for transversing the matrix horizontially (no match at - // the current char) - // * one for transversing the matrix diagonally (match at the - // current char) - // the maximum of those two scores is used - let mut score_diag = 0; - let score_hor = prev_matrix_cell.score.saturating_sub(gap_penalty); - - let mut consecutive = 0; - if haystack_char.char == needle_char { - // we have a match at the current char - score_diag = diag_matrix_cell.score + SCORE_MATCH; - let mut bonus = haystack_char.bonus; - consecutive = diag_matrix_cell.consecutive_chars + 1; - if consecutive > 1 { - let first_bonus = self.bonus[col + 1 - consecutive as usize]; - if bonus > first_bonus { - if bonus >= BONUS_BOUNDARY { - consecutive = 1 - } else { - bonus = max(bonus, BONUS_CONSECUTIVE) - } - } else { - bonus = max(first_bonus, BONUS_CONSECUTIVE) - } - } - if score_diag + bonus < score_hor - || (consecutive == 1 && score_diag + bonus == score_hor) - { - score_diag += haystack_char.bonus; - consecutive = 0; - } else { - score_diag += bonus; - } - } - in_gap = consecutive == 0; - let score = max(score_diag, score_hor); - if i == needle.len() - 1 && score > max_score { - max_score = score; - max_score_end = col as u16; - } - matrix_cell.consecutive_chars = consecutive; - matrix_cell.score = score; - prev_matrix_cell = *matrix_cell; - } - prev_matrix_row = row; + let (mut needle_idx, (mut needle_char, mut row_off)) = row_iter.next().unwrap(); + for (next_needle_idx, (next_needle_char, next_row_off)) in row_iter { + Self::score_row::( + self.current_row, + matrix_cells, + self.haystack, + self.bonus, + row_off, + next_row_off, + needle_idx as u16 + 1, + needle_char, + next_needle_char, + ); + let len = self.current_row.len() + needle_idx + 1 - row_off as usize; + matrix_cells = &mut matrix_cells[len..]; + (needle_idx, needle_char, row_off) = (next_needle_idx, next_needle_char, next_row_off); } - (max_score, max_score_end) + matrix_cells.as_ptr() as usize - self.matrix_cells.as_ptr() as usize } - fn reconstruct_optimal_path( + fn reconstruct_optimal_path( &self, - needle: &[N], - start: u32, + max_score_end: u16, indices: &mut Vec, - best_match_end: u16, + matrix_len: usize, + start: u32, ) { indices.clear(); - indices.resize(needle.len(), 0); - - let mut row_iter = self.rows_rev().zip(indices.iter_mut().rev()).peekable(); - let (mut row, mut matched_col_idx) = row_iter.next().unwrap(); - let mut next_row: Option = None; - let mut col = best_match_end; - let mut prefer_match = true; - let haystack_len = self.haystack.len() as u16; + indices.resize(self.row_offs.len(), 0); + let last_row_off = *self.row_offs.last().unwrap(); + indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32; + let mut matrix_cells = &self.matrix_cells[..matrix_len]; + let width = self.current_row.len(); + let mut row_iter = self.row_offs[..self.row_offs.len() - 1] + .iter() + .copied() + .enumerate() + .rev() + .map(|(i, off)| { + let relative_off = off as usize - i; + let row; + (matrix_cells, row) = + matrix_cells.split_at(matrix_cells.len() - (width - relative_off)); + (i, off, row) + }); + let (mut row_idx, mut row_off, mut row) = row_iter.next().unwrap(); + let mut col = max_score_end; + let relative_last_row_off = last_row_off as usize + 1 - self.row_offs.len(); + let mut matched = self.current_row[col as usize + relative_last_row_off].matched; + col += last_row_off - row_off - 1; loop { - let score = row[col].score; - // we calculate two scores: - // * one for transversing the matrix horizontially (no match at - // the current char) - // * one for transversing the matrix diagonally (match at the - // current char) - // the maximum of those two scores is used - let mut score_diag = 0; - let mut score_horz = 0; - if let Some(&(prev_row, _)) = row_iter.peek() { - score_diag = prev_row[col - 1].score; + if matched { + indices[row_idx] = start + col as u32 + row_off as u32; } - if col > row.off { - score_horz = row[col - 1].score; - } - let mut in_block = row[col].consecutive_chars > 1; - if !in_block && col + 1 < haystack_len { - if let Some(next_row) = next_row { - if col + 1 >= next_row.off { - in_block = next_row[col + 1].consecutive_chars > 1 - } - } - } - if score > score_diag - && (score > score_horz || in_block || prefer_match && score == score_horz) - { - *matched_col_idx = col as u32 + start; - next_row = Some(row); - let Some(next) = row_iter.next() else { + let next_matched = row[col as usize].get(matched); + if matched { + let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else{ break; }; - (row, matched_col_idx) = next + col += row_off - next_row_off; + (row_idx, row_off, row) = (next_row_idx, next_row_off, next_row) } col -= 1; - prefer_match = row[col].consecutive_chars != 0; + matched = next_matched; } } } diff --git a/src/lib.rs b/src/lib.rs index e794bc0..d21bb0d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ pub mod chars; mod config; #[cfg(test)] mod debug; +mod exact; mod fuzzy_greedy; mod fuzzy_optimal; mod matrix; @@ -67,6 +68,9 @@ impl Matcher { ); match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::(haystack, needle, indidies); + } let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; self.fuzzy_match_optimal::( AsciiChar::cast(haystack), @@ -83,6 +87,16 @@ impl Matcher { None } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::( + haystack, + needle as char, + start, + indidies, + ); + return Some(res); + } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; self.fuzzy_match_optimal::( haystack, @@ -94,6 +108,12 @@ impl Matcher { ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::(haystack, needle, start, indidies); + return Some(res); + } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; self.fuzzy_match_optimal::( haystack, diff --git a/src/matrix.rs b/src/matrix.rs index 3ee4e4c..755b5d3 100644 --- a/src/matrix.rs +++ b/src/matrix.rs @@ -1,12 +1,11 @@ use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; use std::marker::PhantomData; -use std::mem::{size_of, take}; -use std::ops::Index; +use std::mem::size_of; use std::ptr::{slice_from_raw_parts_mut, NonNull}; use crate::chars::Char; -const MAX_MATRIX_SIZE: usize = 100 * 1024; // 4*60*1024 = 240KB +const MAX_MATRIX_SIZE: usize = 100 * 1024; // 100*1024 = 100KB // these two aren't hard maxima, instead we simply allow whatever will fit into memory const MAX_HAYSTACK_LEN: usize = 2048; // 64KB @@ -15,21 +14,23 @@ const MAX_NEEDLE_LEN: usize = 2048; // 64KB struct MatrixLayout { haystack_len: usize, needle_len: usize, - cell_count: usize, layout: Layout, haystack_off: usize, bonus_off: usize, rows_off: usize, - cells_off: usize, + score_off: usize, + matrix_off: usize, _phantom: PhantomData, } impl MatrixLayout { - fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout { + fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout { let mut layout = Layout::from_size_align(0, 1).unwrap(); let haystack_layout = Layout::array::(haystack_len).unwrap(); let bonus_layout = Layout::array::(haystack_len).unwrap(); let rows_layout = Layout::array::(needle_len).unwrap(); - let cells_layout = Layout::array::(cell_count).unwrap(); + let score_layout = Layout::array::(haystack_len + 1 - needle_len).unwrap(); + let matrix_layout = + Layout::array::((haystack_len + 1 - needle_len) * needle_len).unwrap(); let haystack_off; (layout, haystack_off) = layout.extend(haystack_layout).unwrap(); @@ -37,17 +38,19 @@ impl MatrixLayout { (layout, bonus_off) = layout.extend(bonus_layout).unwrap(); let rows_off; (layout, rows_off) = layout.extend(rows_layout).unwrap(); - let cells_off; - (layout, cells_off) = layout.extend(cells_layout).unwrap(); + let score_off; + (layout, score_off) = layout.extend(score_layout).unwrap(); + let matrix_off; + (layout, matrix_off) = layout.extend(matrix_layout).unwrap(); MatrixLayout { haystack_len, needle_len, - cell_count, layout, haystack_off, bonus_off, rows_off, - cells_off, + score_off, + matrix_off, _phantom: PhantomData, } } @@ -57,9 +60,13 @@ impl MatrixLayout { unsafe fn fieds_from_ptr( &self, ptr: NonNull, - ) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) { - // sanity checks, should not be necessary - + ) -> ( + *mut [C], + *mut [u16], + *mut [u16], + *mut [ScoreCell], + *mut [MatrixCell], + ) { let base = ptr.as_ptr(); let haystack = base.add(self.haystack_off) as *mut C; let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len); @@ -67,109 +74,62 @@ impl MatrixLayout { let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); let rows = base.add(self.rows_off) as *mut u16; let rows = slice_from_raw_parts_mut(rows, self.needle_len); - let cells = base.add(self.cells_off) as *mut MatrixCell; - let cells = slice_from_raw_parts_mut(cells, self.cell_count); - (haystack, bonus, rows, cells) + let cells = base.add(self.score_off) as *mut ScoreCell; + let cells = slice_from_raw_parts_mut(cells, self.haystack_len + 1 - self.needle_len); + let matrix = base.add(self.matrix_off) as *mut MatrixCell; + let matrix = slice_from_raw_parts_mut( + matrix, + (self.haystack_len + 1 - self.needle_len) * self.haystack_len, + ); + (haystack, bonus, rows, cells, matrix) } } #[derive(Clone, Copy)] -pub(crate) struct MatrixCell { - pub score: u16, - pub consecutive_chars: u16, -} - -#[derive(Clone, Copy)] -pub(crate) struct HaystackChar { - pub char: C, +pub(crate) struct ScoreCell { + pub score: i32, pub bonus: u16, + pub matched: bool, } -#[derive(Clone, Copy)] -pub(crate) struct MatrixRow<'a> { - pub off: u16, - pub cells: &'a [MatrixCell], -} - -/// Intexing returns the cell that corresponds to colmun `col` in this row, -/// this is not the same as directly indexing the cells array because every row -/// starts at a column offset which needs to be accounted for -impl Index for MatrixRow<'_> { - type Output = MatrixCell; - - #[inline(always)] - fn index(&self, col: u16) -> &Self::Output { - &self.cells[(col - self.off) as usize] - } -} - -pub(crate) struct MatrixRowMut<'a> { - pub off: u16, - pub cells: &'a mut [MatrixCell], -} - -pub(crate) struct Matrix<'a, C: Char> { +pub(crate) struct MatcherDataView<'a, C: Char> { pub haystack: &'a mut [C], // stored as a separate array instead of struct // to avoid padding sine char is too large and u8 too small :/ pub bonus: &'a mut [u16], + pub current_row: &'a mut [ScoreCell], pub row_offs: &'a mut [u16], - pub cells: &'a mut [MatrixCell], + pub matrix_cells: &'a mut [MatrixCell], } +#[repr(transparent)] +pub struct MatrixCell(pub(crate) u8); -impl<'a, C: Char> Matrix<'a, C> { - pub fn rows_rev(&self) -> impl Iterator + ExactSizeIterator { - let mut cells = &*self.cells; - self.row_offs.iter().rev().map(move |&off| { - let len = self.haystack.len() - off as usize; - let (tmp, row) = cells.split_at(cells.len() - len); - cells = tmp; - MatrixRow { off, cells: row } - }) +impl MatrixCell { + pub fn set(&mut self, p_match: bool, m_match: bool) { + self.0 = p_match as u8 | ((m_match as u8) << 1); } -} -pub(crate) fn haystack<'a, C: Char>( - haystack: &'a [C], - bonus: &'a [u16], - skip: u16, -) -> impl Iterator> + ExactSizeIterator + Clone + 'a { - haystack[skip as usize..] - .iter() - .zip(bonus[skip as usize..].iter()) - .map(|(&char, &bonus)| HaystackChar { char, bonus }) -} - -pub(crate) fn rows_mut<'a>( - row_offs: &'a [u16], - mut cells: &'a mut [MatrixCell], - haystack_len: usize, -) -> impl Iterator> + ExactSizeIterator + 'a { - row_offs.iter().map(move |&off| { - let len = haystack_len - off as usize; - let (row, tmp) = take(&mut cells).split_at_mut(len); - cells = tmp; - MatrixRowMut { off, cells: row } - }) + pub fn get(&self, m_matrix: bool) -> bool { + let mask = m_matrix as u8 + 1; + (self.0 & mask) != 0 + } } // we only use this to construct the layout for the slab allocation #[allow(unused)] -struct MatrixData { +struct MatcherData { haystack: [char; MAX_HAYSTACK_LEN], bonus: [u16; MAX_HAYSTACK_LEN], row_offs: [u16; MAX_NEEDLE_LEN], - cells: [MatrixCell; MAX_MATRIX_SIZE], + scratch_space: [ScoreCell; MAX_HAYSTACK_LEN], + matrix: [u8; MAX_MATRIX_SIZE], } -// const MATRIX_ALLOC_LAYOUT: Layout = -// MatrixLayout::::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout; - pub(crate) struct MatrixSlab(NonNull); impl MatrixSlab { pub fn new() -> Self { - let layout = Layout::new::(); + let layout = Layout::new::(); // safety: the matrix is never zero sized (hardcoded constants) let ptr = unsafe { alloc_zeroed(layout) }; let Some(ptr) = NonNull::new(ptr) else{ @@ -182,32 +142,30 @@ impl MatrixSlab { &mut self, haystack_: &[C], needle_len: usize, - ) -> Option> { + ) -> Option> { let cells = haystack_.len() * needle_len; if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize { return None; } - let matrix_layout = MatrixLayout::::new( - haystack_.len(), - needle_len, - (haystack_.len() + 1 - needle_len / 2) * needle_len, - ); - if matrix_layout.layout.size() > size_of::() { + let matrix_layout = MatrixLayout::::new(haystack_.len(), needle_len); + if matrix_layout.layout.size() > size_of::() { return None; } unsafe { // safely: this allocation is valid for MATRIX_ALLOC_LAYOUT - let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0); + let (haystack, bonus, rows, current_row, matrix_cells) = + matrix_layout.fieds_from_ptr(self.0); // copy haystack before creating references to ensure we donu't crate // references to invalid chars (which may or may not be UB) haystack_ .as_ptr() .copy_to_nonoverlapping(haystack as *mut _, haystack_.len()); - Some(Matrix { + Some(MatcherDataView { haystack: &mut *haystack, row_offs: &mut *rows, bonus: &mut *bonus, - cells: &mut *cells, + current_row: &mut *current_row, + matrix_cells: &mut *matrix_cells, }) } } @@ -215,6 +173,6 @@ impl MatrixSlab { impl Drop for MatrixSlab { fn drop(&mut self) { - unsafe { dealloc(self.0.as_ptr(), Layout::new::()) }; + unsafe { dealloc(self.0.as_ptr(), Layout::new::()) }; } } diff --git a/src/score.rs b/src/score.rs index ac487ac..fd5d61f 100644 --- a/src/score.rs +++ b/src/score.rs @@ -114,11 +114,8 @@ impl Matcher { } else { // Break consecutive chunk if bonus > first_bonus { - if bonus >= BONUS_BOUNDARY { - first_bonus = bonus; - } else { - bonus = max(bonus, BONUS_CONSECUTIVE); - } + first_bonus = bonus; + bonus = max(max(bonus, first_bonus), BONUS_CONSECUTIVE); } else { bonus = max(first_bonus, BONUS_CONSECUTIVE); } diff --git a/src/tests.rs b/src/tests.rs index 1dd2fdd..2132e17 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -48,9 +48,11 @@ fn assert_matches( println!("xx {matched_indices:?} {algo:?}"); let res = match algo { Algorithm::FuzzyOptimal => { + matched_indices.clear(); matcher.fuzzy_indices(haystack, needle, &mut matched_indices) } Algorithm::FuzzyGreedy => { + matched_indices.clear(); matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices) } }; @@ -142,7 +144,7 @@ fn test_fuzzy() { "/AutomatorDocument.icns", "rdoc", &[9, 10, 11, 12], - BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2, + BONUS_CAMEL123 * 3, ), ( "/man1/zshcompctl.1", @@ -395,13 +397,22 @@ fn test_optimal() { - PENALTY_GAP_EXTENSION, ), ( - "Hٷ!!\0!!!\n\0\0\u{4}\u{c}\0\u{8}\0!\0\0\u{c}", - "\0!\0\0!", + "Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}", + "-!--!", &[4, 5, 9, 10, 16], BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4) - 2 * PENALTY_GAP_START - 6 * PENALTY_GAP_EXTENSION, ), + ( + "C8Gۂ(GECGS", + "8GCG", + &[1, 2, 7, 8], + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + - PENALTY_GAP_START + - 3 * PENALTY_GAP_EXTENSION + + BONUS_CONSECUTIVE, + ), ], ); }