From e964d42849a4a4e8ced97a2589dea57c850efcf4 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Thu, 20 Jul 2023 02:09:51 +0200 Subject: [PATCH] better implementation --- src/chars.rs | 135 +++++++ src/{ => chars}/case_fold.rs | 0 src/{ => chars}/normalize.rs | 0 src/config.rs | 138 +------ src/fuzzy_greedy.rs | 46 +++ src/fuzzy_optimal.rs | 272 ++++++++++++++ src/lib.rs | 699 ++++++----------------------------- src/matrix.rs | 280 ++++++++++++++ src/multizip.rs | 0 src/prefilter.rs | 73 ++++ src/score.rs | 145 ++++++++ src/tests.rs | 270 ++++++++++++++ src/utf32_str.rs | 123 ++++++ 13 files changed, 1467 insertions(+), 714 deletions(-) create mode 100644 src/chars.rs rename src/{ => chars}/case_fold.rs (100%) rename src/{ => chars}/normalize.rs (100%) create mode 100644 src/fuzzy_greedy.rs create mode 100644 src/fuzzy_optimal.rs create mode 100644 src/matrix.rs create mode 100644 src/multizip.rs create mode 100644 src/prefilter.rs create mode 100644 src/score.rs create mode 100644 src/tests.rs create mode 100644 src/utf32_str.rs diff --git a/src/chars.rs b/src/chars.rs new file mode 100644 index 0000000..530b7d0 --- /dev/null +++ b/src/chars.rs @@ -0,0 +1,135 @@ +use crate::chars::case_fold::CASE_FOLDING_SIMPLE; +use crate::MatcherConfig; + +//autogenerated by generate-ucd +#[allow(warnings)] +#[rustfmt::skip] +mod case_fold; +mod normalize; + +pub trait Char: Copy + Eq + Ord + std::fmt::Debug { + const ASCII: bool; + fn char_class(self, config: &MatcherConfig) -> CharClass; + fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); + fn normalize(self, config: &MatcherConfig) -> Self; +} + +impl Char for u8 { + const ASCII: bool = true; + #[inline] + fn char_class(self, config: &MatcherConfig) -> CharClass { + let c = self; + // using manual if conditions instead optimizes better + if c >= b'a' && c <= b'z' { + CharClass::Lower + } else if c >= b'A' && c <= b'Z' { + CharClass::Upper + } else if c >= b'0' && c <= b'9' { + CharClass::Number + 
} else if c.is_ascii_whitespace() { + CharClass::Whitespace + } else if config.delimeter_chars.contains(&c) { + CharClass::Delimiter + } else { + CharClass::NonWord + } + } + + #[inline(always)] + fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass) { + let char_class = self.char_class(config); + let normalized = if config.ignore_case && char_class == CharClass::Upper { + self + 32 + } else { + self + }; + (normalized, char_class) + } + + #[inline(always)] + fn normalize(self, config: &MatcherConfig) -> Self { + if config.ignore_case && self >= b'A' && self <= b'Z' { + self + 32 + } else { + self + } + } +} +fn char_class_non_ascii(c: char) -> CharClass { + if c.is_lowercase() { + CharClass::Lower + } else if c.is_uppercase() { + CharClass::Upper + } else if c.is_numeric() { + CharClass::Number + } else if c.is_alphabetic() { + CharClass::Letter + } else if c.is_whitespace() { + CharClass::Whitespace + } else { + CharClass::NonWord + } +} +impl Char for char { + const ASCII: bool = false; + #[inline(always)] + fn char_class(self, config: &MatcherConfig) -> CharClass { + if self.is_ascii() { + return (self as u8).char_class(config); + } + char_class_non_ascii(self) + } + + #[inline(always)] + fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { + if self.is_ascii() { + let (c, class) = (self as u8).char_class_and_normalize(config); + return (c as char, class); + } + let char_class = char_class_non_ascii(self); + if char_class == CharClass::Upper { + self = CASE_FOLDING_SIMPLE + .binary_search_by_key(&self, |(upper, _)| *upper) + .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) + } + if config.normalize { + self = normalize::normalize(self); + } + (self, char_class) + } + + #[inline(always)] + fn normalize(mut self, config: &MatcherConfig) -> Self { + if config.normalize { + self = normalize::normalize(self); + } + to_lower_case(self) + } +} + +pub use normalize::normalize; + +#[inline(always)] +pub fn 
to_lower_case(c: char) -> char { + if c >= 'A' && c <= 'Z' { + char::from_u32(c as u32 + 32).unwrap() + } else if !c.is_ascii() { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) + } else { + c + } +} + +#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] +#[non_exhaustive] +pub enum CharClass { + Whitespace, + NonWord, + Delimiter, + Lower, + Upper, + Letter, + Number, +} diff --git a/src/case_fold.rs b/src/chars/case_fold.rs similarity index 100% rename from src/case_fold.rs rename to src/chars/case_fold.rs diff --git a/src/normalize.rs b/src/chars/normalize.rs similarity index 100% rename from src/normalize.rs rename to src/chars/normalize.rs diff --git a/src/config.rs b/src/config.rs index ca076ae..41d12b4 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,37 +1,7 @@ -pub(crate) const SCORE_MATCH: u16 = 16; -pub(crate) const PENALTY_GAP_START: u16 = 3; -pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; - -// We prefer matches at the beginning of a word, but the bonus should not be -// too great to prevent the longer acronym matches from always winning over -// shorter fuzzy matches. The bonus point here was specifically chosen that -// the bonus is cancelled when the gap between the acronyms grows over -// 8 characters, which is approximately the average length of the words found -// in web2 dictionary and my file system. -pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; - -// Although bonus point for non-word characters is non-contextual, we need it -// for computing bonus points for consecutive chunks starting with a non-word -// character. -pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2; - -// Edge-triggered bonus for matches in camelCase words. -// Compared to word-boundary case, they don't accompany single-character gaps -// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly. 
-pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION; - -// Minimum bonus point given to characters in consecutive chunks. -// Note that bonus points for consecutive matches shouldn't have needed if we -// used fixed match score as in the original algorithm. -pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; - -// The first character in the typed pattern usually has more significance -// than the rest so it's important that it appears at special positions where -// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". -// The amount of the extra bonus should be limited so that the gap penalty is -// still respected. -pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; +use crate::chars::CharClass; +use crate::score::BONUS_BOUNDARY; +#[non_exhaustive] pub struct MatcherConfig { pub delimeter_chars: &'static [u8], /// Extra bonus for word boundary after whitespace character or beginning of the string @@ -44,33 +14,17 @@ pub struct MatcherConfig { /// this significantly degrades performance so its not recommended /// to be truned on by default pub normalize: bool, - /// use faster/simpler algorithm at the cost of (potentially) much worse results - /// For long inputs this algorith is always used as a fallbach to avoid - /// blowups in time complexity - pub use_v1: bool, - /// The case matching to perform - pub case_matching: CaseMatching, + /// whether to ignore casing + pub ignore_case: bool, } -#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] -#[non_exhaustive] -pub enum CharClass { - Whitespace, - NonWord, - Delimiter, - Lower, - Upper, - Letter, - Number, -} - -#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] -#[non_exhaustive] -pub enum CaseMatching { - Respect, - Ignore, - Smart, -} +// #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] +// #[non_exhaustive] +// pub enum CaseMatching { +// Respect, +// Ignore, +// Smart, +// } 
impl MatcherConfig { pub const DEFAULT: Self = { @@ -80,8 +34,7 @@ impl MatcherConfig { bonus_boundary_delimiter: BONUS_BOUNDARY + 1, inital_char_class: CharClass::Whitespace, normalize: false, - use_v1: false, - case_matching: CaseMatching::Smart, + ignore_case: true, } }; } @@ -107,69 +60,4 @@ impl MatcherConfig { self.inital_char_class = CharClass::Delimiter; self } - - fn char_class_non_ascii(c: char) -> CharClass { - if c.is_lowercase() { - CharClass::Lower - } else if c.is_uppercase() { - CharClass::Upper - } else if c.is_numeric() { - CharClass::Number - } else if c.is_alphabetic() { - CharClass::Letter - } else if c.is_whitespace() { - CharClass::Whitespace - } else { - CharClass::NonWord - } - } - - fn char_class_ascii(&self, c: char) -> CharClass { - // using manual if conditions instead optimizes better - if c >= 'a' && c <= 'z' { - CharClass::Lower - } else if c >= 'A' && c <= 'Z' { - CharClass::Upper - } else if c >= '0' && c <= '9' { - CharClass::Number - } else if c.is_ascii_whitespace() { - CharClass::Whitespace - } else if self.delimeter_chars.contains(&(c as u8)) { - CharClass::Delimiter - } else { - CharClass::NonWord - } - } - - pub(crate) fn char_class(&self, c: char) -> CharClass { - if c.is_ascii() { - self.char_class_ascii(c) - } else { - Self::char_class_non_ascii(c) - } - } - - pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { - if class > CharClass::NonWord { - // transition from non word to word - match prev_class { - CharClass::Whitespace => return self.bonus_boundary_white, - CharClass::Delimiter => return self.bonus_boundary_delimiter, - CharClass::NonWord => return BONUS_BOUNDARY, - _ => (), - } - } - if prev_class == CharClass::Lower && class == CharClass::Upper - || prev_class != CharClass::Number && class == CharClass::Number - { - // camelCase letter123 - BONUS_CAMEL123 - } else if class == CharClass::NonWord { - BONUS_NON_WORD - } else if class == CharClass::Whitespace { - self.bonus_boundary_white 
- } else { - 0 - } - } } diff --git a/src/fuzzy_greedy.rs b/src/fuzzy_greedy.rs new file mode 100644 index 0000000..8a61052 --- /dev/null +++ b/src/fuzzy_greedy.rs @@ -0,0 +1,46 @@ +use crate::chars::Char; +use crate::Matcher; + +impl Matcher { + /// greedy fallback algoritm, much faster (linear time) but reported scores/indicies + /// might not be the best match + pub(crate) fn fuzzy_match_greedy, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + mut start: usize, + mut end: usize, + indicies: &mut Vec, + ) -> Option { + let first_char_end = if H::ASCII { start + 1 } else { end }; + if !H::ASCII && needle.len() != 1 { + let mut needle_iter = needle[1..].iter().copied(); + if let Some(mut needle_char) = needle_iter.next() { + for (i, &c) in haystack[first_char_end..].iter().enumerate() { + if c.normalize(&self.config) == needle_char { + let Some(next_needle_char) = needle_iter.next() else { + end = i + 1; + break; + }; + needle_char = next_needle_char; + } + } + } + } + // mimimize the greedly match by greedy matching in reverse + + let mut needle_iter = needle.iter().rev().copied(); + let mut needle_char = needle_iter.next().unwrap(); + for (i, &c) in haystack[start..end].iter().enumerate().rev() { + println!("{c:?} {i} {needle_char:?}"); + if c == needle_char { + let Some(next_needle_char) = needle_iter.next() else { + start += i; + break; + }; + needle_char = next_needle_char; + } + } + Some(self.calculate_score::(haystack, needle, start, end, indicies)) + } +} diff --git a/src/fuzzy_optimal.rs b/src/fuzzy_optimal.rs new file mode 100644 index 0000000..f63aa3e --- /dev/null +++ b/src/fuzzy_optimal.rs @@ -0,0 +1,272 @@ +use std::cmp::max; + +use crate::chars::{Char, CharClass}; +use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow}; +use crate::score::{ + BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, + PENALTY_GAP_START, SCORE_MATCH, +}; +use crate::{Matcher, MatcherConfig}; + +impl Matcher { + 
pub(crate) fn fuzzy_match_optimal, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + start: usize, + greedy_end: usize, + end: usize, + indicies: &mut Vec, + ) -> Option { + // construct a matrix (and copy the haystack), the matrix and haystack size are bounded + // to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows + // us to treat needle indecies as u16 + let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else { + return self.fuzzy_match_greedy::( + haystack, + needle, + start, + greedy_end, + indicies, + ); + }; + + let prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.inital_char_class); + let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config); + // this only happend with unicode haystacks, for ASCII the prefilter handles all rejects + if !matched { + return None; + } + if needle.len() == 1 { + indicies.push(max_score_pos as u32); + return Some(max_score); + } + debug_assert_eq!( + matrix.row_offs[0], 0, + "prefilter should have put us at the start of the match" + ); + + // populate the matrix and find the best score + let (max_score, best_match_end) = matrix.populate_matrix(needle); + if INDICIES { + matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end); + } + println!("{indicies:?}"); + println!("{}", max_score); + Some(max_score) + } +} + +impl Matrix<'_, H> { + fn setup( + &mut self, + needle: &[N], + mut prev_class: CharClass, + config: &MatcherConfig, + ) -> (u16, u16, bool) + where + H: PartialEq, + { + let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut()); + let (mut needle_char, mut row_start) = row_iter.next().unwrap(); + + let col_iter = self + .haystack + .iter_mut() + .zip(self.cells.iter_mut()) + .zip(self.bonus.iter_mut()) + .enumerate(); + + let mut max_score = 0; + let mut max_score_pos = 0; + let mut in_gap = false; + let mut prev_score = 0u16; 
+ let mut matched = false; + let first_needle_char = needle[0]; + + for (i, ((c, matrix_cell), bonus_)) in col_iter { + let class = c.char_class(config); + *c = c.normalize(config); + + let bonus = config.bonus_for(prev_class, class); + // save bonus for later so we don't have to recompute it each time + *bonus_ = bonus; + prev_class = class; + + let i = i as u16; + println!("{i} {needle_char:?} {c:?}"); + if *c == needle_char { + // save the first idx of each char + if let Some(next) = row_iter.next() { + *row_start = i; + (needle_char, row_start) = next; + } else { + if !matched { + *row_start = i; + } + // we have atleast one match + matched = true; + } + } + if *c == first_needle_char { + let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; + println!("start match {score}"); + matrix_cell.consecutive_chars = 1; + if needle.len() == 1 && score > max_score { + max_score = score; + max_score_pos = i; + // can't get better than this + if bonus >= BONUS_BOUNDARY { + break; + } + } + matrix_cell.score = score; + in_gap = false; + } else { + let gap_penalty = if in_gap { + PENALTY_GAP_EXTENSION + } else { + PENALTY_GAP_START + }; + matrix_cell.score = prev_score.saturating_sub(gap_penalty); + matrix_cell.consecutive_chars = 0; + in_gap = true; + } + prev_score = matrix_cell.score; + } + + (max_score_pos, max_score, matched) + } + + fn populate_matrix(&mut self, needle: &[N]) -> (u16, u16) + where + H: PartialEq, + { + let mut max_score = 0; + let mut max_score_end = 0; + + let mut row_iter = needle + .iter() + .zip(rows_mut(self.row_offs, self.cells, self.haystack.len())) + .enumerate(); + // skip the first row we already calculated the in `setup` initial scores + let (_, mut prev_matrix_row) = row_iter.next().unwrap().1; + + for (i, (&needle_char, row)) in row_iter { + let haystack = haystack(self.haystack, self.bonus, row.off); + let mut in_gap = false; + let mut prev_matrix_cell = MatrixCell { + score: 0, + consecutive_chars: 0, + }; + // we are 
interested in the score of the previous character + // in the previous row. This represents the previous char + // for each possible pattern. This is equivalent to diagonal movement + let diagonal_start = row.off - prev_matrix_row.off - 1; + let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..]; + + for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack + .zip(row.cells.iter_mut()) + .zip(diagonal.iter()) + .enumerate() + { + let col = j + row.off as usize; + let gap_penalty = if in_gap { + PENALTY_GAP_EXTENSION + } else { + PENALTY_GAP_START + }; + let mut score1 = 0; + let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty); + + let mut consecutive = 0; + if haystack_char.char == needle_char { + score1 = diag_matrix_cell.score + SCORE_MATCH; + let mut bonus = haystack_char.bonus; + consecutive = diag_matrix_cell.consecutive_chars + 1; + if consecutive > 1 { + let first_bonus = self.bonus[col + 1 - consecutive as usize]; + if bonus > first_bonus { + if bonus > BONUS_BOUNDARY { + consecutive = 1 + } else { + bonus = max(bonus, BONUS_CONSECUTIVE) + } + } else { + bonus = max(first_bonus, BONUS_CONSECUTIVE) + } + } + if score1 + bonus < score2 { + score1 += haystack_char.bonus; + consecutive = 0; + } else { + score1 += bonus; + } + } + in_gap = score1 < score2; + let score = max(score1, score2); + println!("{score} {score1} {score2}"); + if i == needle.len() - 1 && score > max_score { + max_score = score; + max_score_end = col as u16; + } + matrix_cell.consecutive_chars = consecutive; + matrix_cell.score = score; + prev_matrix_cell = *matrix_cell; + } + prev_matrix_row = row; + } + (max_score, max_score_end) + } + + fn reconstruct_optimal_path( + &self, + needle: &[N], + start: u32, + indicies: &mut Vec, + best_match_end: u16, + ) { + indicies.resize(needle.len(), 0); + + let mut row_iter = self.rows_rev().zip(indicies.iter_mut()).peekable(); + let (mut row, mut matched_col_idx) = row_iter.next().unwrap(); + let mut next_row: 
Option = None; + let mut col = best_match_end; + let mut prefer_match = true; + let haystack_len = self.haystack.len() as u16; + + loop { + let score = row.cells[col as usize].score; + let mut score1 = 0; + let mut score2 = 0; + if let Some(&(prev_row, _)) = row_iter.peek() { + if col >= prev_row.off { + score1 = prev_row[col].score; + } + } + if col > row.off { + score2 = row[col - 1].score; + } + println!("{score} {score2} {score1} {prefer_match}"); + let mut new_prefer_match = row[col].consecutive_chars > 1; + if !new_prefer_match && col + 1 < haystack_len { + if let Some(next_row) = next_row { + new_prefer_match = next_row[col + 1].consecutive_chars > 0 + } + } + if score > score1 && (score > score2 || score == score2 && prefer_match) { + *matched_col_idx = col as u32 + start; + next_row = Some(row); + let Some(next) = row_iter.next() else { + break; + }; + (row, matched_col_idx) = next + } + prefer_match = new_prefer_match; + col -= 1; + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 0ff4c2c..e61dd7e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,616 +1,137 @@ // sadly this doens't optmimzie well currently #![allow(clippy::manual_range_contains)] -use std::alloc::Layout; -use std::cmp::max; - -use memchr::{memchr, memchr2}; -use normalize::normalize; - -//autogenerated by generate-ucd -#[allow(warnings)] -#[rustfmt::skip] -mod case_fold; +mod chars; mod config; -mod normalize; +mod fuzzy_greedy; +mod fuzzy_optimal; +mod matrix; +mod prefilter; +mod score; +mod utf32_str; -pub use config::{CaseMatching, CharClass, MatcherConfig}; +// #[cfg(test)] +// mod tests; -use crate::config::{ - BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, - PENALTY_GAP_START, SCORE_MATCH, -}; +pub use config::MatcherConfig; -const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB -const MAX_HAYSTACK_LEN: usize = 8192; // 64KB - -#[derive(Clone, Copy, PartialEq, Eq)] -struct MatrixCell { - score: u16, - consecutive_chars: u16, -} - 
-#[derive(Clone, Copy, PartialEq, Eq)] -struct HaystackChar { - char: char, - bonus: u16, -} +use crate::matrix::MatrixSlab; +use crate::utf32_str::Utf32Str; pub struct Matcher { pub config: MatcherConfig, - matrix: Box<[MatrixCell; MAX_MATRIX_SIZE]>, - haystack: Box<[HaystackChar; MAX_HAYSTACK_LEN]>, - // needle can be at most as long as the haystack - first_needle_occurance: Box<[u16; MAX_HAYSTACK_LEN]>, + slab: MatrixSlab, } -pub struct Query { - needle_chars: Vec, - is_ascii: bool, - ignore_case: bool, -} - -impl Query { - fn push(&mut self, needle: &str, normalize_: bool, smart_case: bool) { - self.needle_chars.reserve(needle.len()); - self.needle_chars.extend(needle.chars().map(|mut c| { - if !c.is_ascii() { - self.is_ascii = false; - } - if smart_case { - if c.is_uppercase() { - self.ignore_case = false; - } - } else if self.ignore_case { - if self.is_ascii { - c = to_lower_case::(c) - } else { - c = to_lower_case::(c) - } - } - if normalize_ && !self.is_ascii { - c = normalize(c); - } - c - })) - } -} - -#[inline(always)] -fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option { - if c >= b'a' || c <= b'z' { - memchr2(c, c + 32, haystack) - } else { - memchr(c, haystack) - } -} -/// Safety: T must be vaind if initalized with zeros -unsafe fn zeroed_array_on_heap() -> Box<[T; LEN]> { - let layout = Layout::new::<[T; LEN]>(); - let res = std::alloc::alloc_zeroed(layout); - if res.is_null() { - std::alloc::handle_alloc_error(layout) - } - Box::from_raw(res as _) -} +// // impl Query { +// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) { +// // self.needle_chars.reserve(needle.len()); +// // self.needle_chars.extend(needle.chars().map(|mut c| { +// // if !c.is_ascii() { +// // self.is_ascii = false; +// // } +// // if smart_case { +// // if c.is_uppercase() { +// // self.ignore_case = false; +// // } +// // } else if self.ignore_case { +// // if self.is_ascii { +// // c = to_lower_case::(c) +// // } else { +// // c = 
to_lower_case::(c) +// // } +// // } +// // if normalize_ && !self.is_ascii { +// // c = normalize(c); +// // } +// // c +// // })) +// // } +// // } impl Matcher { pub fn new(config: MatcherConfig) -> Self { - // Safety: all data allocated here is just integers/structs that contain - // integers so zeroed values are legal - unsafe { - Self { - config, - matrix: zeroed_array_on_heap(), - haystack: zeroed_array_on_heap(), - first_needle_occurance: zeroed_array_on_heap(), - } + Self { + config, + slab: MatrixSlab::new(), } } - pub fn compile_query(&self, needle: &str) -> Query { - let mut query = Query { - needle_chars: Vec::new(), - is_ascii: true, - ignore_case: self.config.case_matching == CaseMatching::Ignore, - }; - query.push( - needle, - self.config.normalize, - self.config.case_matching == CaseMatching::Smart, - ); - query + pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) } - pub fn recompile_query(&self, query: &mut Query, needle: &str) { - query.needle_chars.clear(); - query.is_ascii = false; - query.ignore_case = self.config.case_matching == CaseMatching::Ignore; - query.push( - needle, - self.config.normalize, - self.config.case_matching == CaseMatching::Smart, - ); - } - pub fn append_query(&self, query: &mut Query, needle: &str) { - query.push( - needle, - self.config.normalize, - self.config.case_matching == CaseMatching::Smart, - ); - } - - pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option { - if haystack.len() > u32::MAX as usize { - haystack = &haystack[..u32::MAX as usize] - } - if self.config.use_v1 { - if query.is_ascii && !self.config.normalize { - self.fuzzy_matcher_v1::(query, haystack, &mut Vec::new()) - } else { - self.fuzzy_matcher_v1::(query, haystack, &mut Vec::new()) - } - } else if query.is_ascii && !self.config.normalize { - self.fuzzy_matcher_v2::(query, 
haystack, &mut Vec::new()) - } else { - self.fuzzy_matcher_v2::(query, haystack, &mut Vec::new()) - } - } - - pub fn fuzzy_indicies( + fn fuzzy_matcher_impl( &mut self, - query: &Query, - mut haystack: &str, - indicies: &mut Vec, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indidies: &mut Vec, ) -> Option { - if haystack.len() > u32::MAX as usize { - haystack = &haystack[..u32::MAX as usize] - } - if self.config.use_v1 { - if query.is_ascii && !self.config.normalize { - self.fuzzy_matcher_v1::(query, haystack, indicies) - } else { - self.fuzzy_matcher_v1::(query, haystack, indicies) - } - } else if query.is_ascii && !self.config.normalize { - self.fuzzy_matcher_v2::(query, haystack, indicies) - } else { - self.fuzzy_matcher_v2::(query, haystack, indicies) - } - } - - #[inline(always)] - fn normalize_char(&self, ignore_case: bool, mut c: char) -> char { - if ignore_case { - c = to_lower_case::(c) - } - if !ASCII_ONLY && self.config.normalize { - c = normalize(c) - } - c - } - - fn prefilter_ascii(&self, query: &Query, mut haystack: &[u8]) -> Option<(usize, usize)> { - let needle = &query.needle_chars; - if query.ignore_case { - let first_idx = find_ascii_ignore_case(needle[0] as u8, haystack)?; - let mut last_idx = first_idx + 1; - haystack = &haystack[last_idx..]; - for &c in &needle[1..] { - let idx = find_ascii_ignore_case(c as u8, haystack)? + 1; - last_idx += idx; - haystack = &haystack[idx..]; - } - Some((first_idx, last_idx)) - } else { - let first_idx = memchr(needle[0] as u8, haystack)?; - let mut last_idx = first_idx + 1; - haystack = &haystack[last_idx..]; - for &c in &needle[1..] { - let idx = memchr(c as u8, haystack)? 
+ 1; - last_idx += idx; - haystack = &haystack[idx..]; - } - Some((first_idx, last_idx)) - } - } - - fn prefilter_non_ascii(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> { - let needle_char = query.needle_chars[0]; - let mut text = haystack - .char_indices() - .map(|(i, c)| (i, self.normalize_char::(query.ignore_case, c))); - - let (match_start, c) = text.find(|&(_, c)| c == needle_char)?; - Some((match_start, match_start + c.len_utf8())) - } - - fn prefilter(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> { - // quickly reject small matches - if query.needle_chars.len() > haystack.len() { - return None; - } - if query.is_ascii { - self.prefilter_ascii(query, haystack.as_bytes()) - } else { - self.prefilter_non_ascii(query, haystack) - } - } - - fn fuzzy_matcher_v1( - &mut self, - query: &Query, - haystack: &str, - indicies: &mut Vec, - ) -> Option { - let (start, end) = self.prefilter(query, haystack)?; - self.fuzzy_matcher_v1_with_prefilter::( - query, haystack, start, end, indicies, - ) - } - - fn fuzzy_matcher_v1_with_prefilter( - &mut self, - query: &Query, - haystack: &str, - mut start: usize, - mut end: usize, - indicies: &mut Vec, - ) -> Option { - let first_char_end = if ASCII_ONLY { start + 1 } else { end }; - if !ASCII_ONLY && query.needle_chars.len() != 1 { - let mut needle_iter = query.needle_chars[1..].iter().copied(); - if let Some(mut needle_char) = needle_iter.next() { - let haystack = haystack[first_char_end..] 
- .char_indices() - .rev() - .map(|(i, c)| (i, self.normalize_char::(query.ignore_case, c))); - for (i, c) in haystack { - if c == needle_char { - let Some(next_needle_char) = needle_iter.next() else { - end = i + c.len_utf8(); - break; - }; - needle_char = next_needle_char; - } - } - } - } - // very simple, just mimimize from the back - let match_ = haystack[first_char_end..end] - .char_indices() - .rev() - .map(|(i, c)| (i, self.normalize_char::(query.ignore_case, c))); - - let mut needle_iter = query.needle_chars[..].iter().rev().copied(); - let mut needle_char = needle_iter.next().unwrap(); - for (i, c) in match_ { - if c == needle_char { - let Some(next_needle_char) = needle_iter.next() else { - start = i; - break; - }; - needle_char = next_needle_char; - } - } - Some(self.calculate_score::(query, haystack, start, end, indicies)) - } - - fn calculate_score( - &mut self, - query: &Query, - text: &str, - match_start: usize, - match_end: usize, - indicies: &mut Vec, - ) -> u16 { - if INDICIES { - indicies.reserve(query.needle_chars.len()); - } - let mut prev_class = text[..match_start] - .chars() - .next_back() - .map(|c| self.config.char_class(c)) - .unwrap_or(self.config.inital_char_class); - let mut needle_idx = 0; - let mut score = 0u16; - let mut in_gap = false; - let mut consecutive = 0; - let mut first_bonus = 0u16; - for (i, mut c) in text[match_start..match_end].char_indices() { - let class = self.config.char_class(c); - if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case { - c = to_lower_case::(c); - } - if self.config.normalize && !ASCII_ONLY { - c = normalize(c) - } - if c == query.needle_chars[needle_idx] { - if INDICIES { - indicies.push(i as u32) - } - score += SCORE_MATCH; - let mut bonus = self.config.bonus_for(prev_class, class); - if consecutive == 0 { - first_bonus = bonus - } else { - // Break consecutive chunk - if bonus > first_bonus { - if bonus >= BONUS_BOUNDARY { - first_bonus = bonus; - } else { - bonus = max(bonus, 
BONUS_CONSECUTIVE); - } - } else { - bonus = max(first_bonus, BONUS_CONSECUTIVE); - } - } - if needle_idx == 0 { - bonus *= BONUS_FIRST_CHAR_MULTIPLIER; - } - score += bonus; - needle_idx += 1; - in_gap = false; - consecutive += 1; - } else { - let penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - score = score.saturating_sub(penalty); - in_gap = true; - consecutive = 0; - first_bonus = 0; - } - prev_class = class; - } - - score - } - - fn fuzzy_matcher_v2( - &mut self, - query: &Query, - text: &str, - indicies: &mut Vec, - ) -> Option { - let (start, prefilter_end) = self.prefilter(query, text)?; - let text_len = text.len() - start; - // fallback to v1 algorithms for long haystacks - // technically we need to multiply by char len here - // but counting chars has a lot of unecessary overhead that we can avoid - // here in practice using bytelen should be a reasonable approximation - // we also differ from fzf here in that we never allocate and instead stringintly check here - if text_len > u16::MAX as usize || text_len * query.needle_chars.len() > MAX_HAYSTACK_LEN { - return self.fuzzy_matcher_v1_with_prefilter::( - query, - text, - start, - prefilter_end, - indicies, - ); - } - - let mut prev_class = text[..start] - .chars() - .next_back() - .map(|c| self.config.char_class(c)) - .unwrap_or(self.config.inital_char_class); - - let text = &text[start..]; - - let mut needle_iter = query.needle_chars[..] - .iter() - .copied() - .zip(self.first_needle_occurance.iter_mut()); - let (mut needle_char, mut needle_char_idx) = needle_iter.next().unwrap(); - - let iter = text[start..] 
- .chars() - .zip(self.matrix.iter_mut()) - .zip(self.haystack.iter_mut()) - .enumerate(); - - let mut last_matched_idx = 0; - let mut max_score = 0; - let mut max_score_pos = 0; - let mut in_gap = false; - let mut prev_score = 0u16; - let mut matched = false; - - let first_needle_char = query.needle_chars[0]; - for (i, ((mut c, matrix_cell), char_info)) in iter { - let class = self.config.char_class(c); - if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case { - c = to_lower_case::(c); - } - if self.config.normalize && !ASCII_ONLY { - c = normalize(c) - } - char_info.char = c; - let bonus = self.config.bonus_for(prev_class, class); - char_info.char = c; - prev_class = class; - - let i = i as u16; - if c == needle_char { - // save the first idx of each char - if let Some(next) = needle_iter.next() { - *needle_char_idx = i; - (needle_char, needle_char_idx) = next - } else { - // we have atleast one match - matched = true; - } - // and the last matched char - last_matched_idx = i; - } - if c == first_needle_char { - let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; - matrix_cell.consecutive_chars = 1; - if query.needle_chars.len() == 1 && score > max_score { - max_score = score; - max_score_pos = i; - // can't get better than this - if bonus >= BONUS_BOUNDARY { - break; - } - } - in_gap = false; - } else { - let gap_penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - matrix_cell.score = prev_score.saturating_sub(gap_penalty); - matrix_cell.consecutive_chars = 0; - in_gap = true; - } - prev_score = matrix_cell.score; - } - if !matched { - debug_assert!(!ASCII_ONLY, "prefilter should have rejected"); - return None; - } - if query.needle_chars.len() == 1 { - indicies.push(max_score_pos as u32); - return Some(max_score); - } - assert_eq!( - self.first_needle_occurance[0], 0, - "prefilter should have put us at the start of the match" + assert!( + haystack.len() <= u32::MAX as usize, + "fuzzy matching is only support 
for up to 2^32-1 codepoints" ); - let haystack_len = last_matched_idx as usize + 1; - let (max_score, best_match_end) = self.popultate_matrix(haystack_len, query); - if INDICIES { - indicies.reserve(query.needle_chars.len()); - let mut col = best_match_end; - let mut needle_iter = self.matrix[..haystack_len * query.needle_chars.len()] - .windows(haystack_len) - .zip(self.first_needle_occurance[..haystack_len].iter()) - .rev() - .peekable(); - let mut next_row = None; - let (mut row, mut first_needle_occurance) = needle_iter.next().unwrap(); - let mut prefer_match = true; - loop { - let score = row[col as usize].score; - let mut score1 = 0; - let mut score2 = 0; - if let Some((prev_row, _)) = needle_iter.peek() { - if col >= *first_needle_occurance { - score1 = prev_row[col as usize].score; - } - } - if col > *first_needle_occurance { - score2 = row[col as usize - 1].score; - } - if score > score1 && (score > score2 || score == score2 && prefer_match) { - indicies.push(col as u32 + start as u32); - next_row = Some(row); - let Some(next) = needle_iter.next() else { - break; - }; - (row, first_needle_occurance) = next - } - prefer_match = row[col as usize].consecutive_chars > 1; - if !prefer_match && col + 1 < query.needle_chars.len() as u16 { - if let Some(next_row) = next_row { - prefer_match = next_row[col as usize + 1].consecutive_chars > 0 - } - } - col -= 1; + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?; + self.fuzzy_match_optimal::( + haystack, needle, start, greedy_end, end, indidies, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + todo!() + // let (start, end) = self.prefilter_non_ascii(haystack, needle_)?; + // 
self.fuzzy_match_optimal::( + // haystack, + // needle, + // start, + // start + 1, + // end, + // indidies, + // ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let (start, end) = self.prefilter_non_ascii(haystack, needle_)?; + self.fuzzy_match_optimal::( + haystack, + needle, + start, + start + 1, + end, + indidies, + ) } } - - Some(max_score) } - fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) { - let mut max_score = 0; - let mut max_score_end = 0; - let mut iter = query - .needle_chars - .iter() - .zip(self.first_needle_occurance.iter()) - .zip(self.matrix.chunks_mut(haystack_len)) - .enumerate(); - // skip the first row we already calculated the initial scores - let (_, ((&_, &_), mut prev_matrix_row)) = iter.next().unwrap(); - for (i, ((&needle_char, &first_occurance), matrix_row)) in iter { - // help the optimizer out a little - assert!((first_occurance as usize) < matrix_row.len()); - assert!(first_occurance != 0); - let mut in_gap = false; - let haystack = &self.haystack[first_occurance as usize..haystack_len]; - let mut prev_matrix_cell = matrix_row[first_occurance as usize - 1]; - let matrix_row = &mut matrix_row[first_occurance as usize..haystack_len]; - let prev_matrix_diagonal = - &mut prev_matrix_row[first_occurance as usize - 1..haystack_len - 1]; - for (j, ((&haystack_char, matrix_cell), &diag_matrix_cell)) in haystack - .iter() - .zip(matrix_row.iter_mut()) - .zip(prev_matrix_diagonal.iter()) - .enumerate() - { - let col = j + first_occurance as usize; - let gap_penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - let mut score1 = 0; - let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty); - let mut consecutive = 0; - if haystack_char.char == needle_char { - score1 = diag_matrix_cell.score + SCORE_MATCH; - let mut bonus = haystack_char.bonus; - consecutive = diag_matrix_cell.consecutive_chars + 1; - if consecutive > 1 { - let first_bonus = 
self.haystack[col - consecutive as usize].bonus; - if bonus > first_bonus { - if bonus > BONUS_BOUNDARY { - consecutive = 1 - } else { - bonus = max(bonus, BONUS_CONSECUTIVE) - } - } else { - bonus = max(first_bonus, BONUS_CONSECUTIVE) - } - } - if score1 + bonus < score2 { - score1 += haystack_char.bonus; - consecutive = 0; - } else { - score1 += bonus; - } - } - in_gap = score1 < score2; - let score = max(max(score1, score2), 0); - prev_matrix_cell = *matrix_cell; - if i == query.needle_chars.len() - 1 && score > max_score { - max_score = score; - max_score_end = col as u16; - } - matrix_cell.consecutive_chars = consecutive; - matrix_cell.score = score; - } - prev_matrix_row = matrix_row; - } - (max_score, max_score_end) - } -} - -#[inline(always)] -fn to_lower_case(c: char) -> char { - if c >= 'A' && c <= 'Z' { - char::from_u32(c as u32 + 32).unwrap() - } else if !c.is_ascii() && !ASCII_ONLY { - case_fold::CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |(upper, _)| *upper) - .map_or(c, |idx| case_fold::CASE_FOLDING_SIMPLE[idx].1) - } else { - c - } + // pub fn fuzzy_indicies( + // &mut self, + // query: &Query, + // mut haystack: Utf32Str<'_>, + // indicies: &mut Vec, + // ) -> Option { + // if haystack.len() > u32::MAX as usize { + // haystack = &haystack[..u32::MAX as usize] + // } + // println!( + // "start {haystack:?}, {:?} {} {}", + // query.needle_chars, query.ignore_case, query.is_ascii + // ); + // if self.config.use_v1 { + // if query.is_ascii && !self.config.normalize { + // self.fuzzy_matcher_v1::(query, haystack, indicies) + // } else { + // self.fuzzy_matcher_v1::(query, haystack, indicies) + // } + // } else if query.is_ascii && !self.config.normalize { + // self.fuzzy_matcher_v2::(query, haystack, indicies) + // } else { + // self.fuzzy_matcher_v2::(query, haystack, indicies) + // } + // } } diff --git a/src/matrix.rs b/src/matrix.rs new file mode 100644 index 0000000..47e1988 --- /dev/null +++ b/src/matrix.rs @@ -0,0 +1,280 @@ +use 
std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; +use std::fmt::{Debug, Formatter, Result}; +use std::marker::PhantomData; +use std::mem::{size_of, take}; +use std::ops::Index; +use std::ptr::{slice_from_raw_parts_mut, NonNull}; + +use crate::chars::Char; + +const MAX_MATRIX_SIZE: usize = 100 * 1024; // 4*60*1024 = 240KB + +// these two aren't hard maxima, instead we simply allow whatever will fit into memory +const MAX_HAYSTACK_LEN: usize = 2048; // 64KB +const MAX_NEEDLE_LEN: usize = 2048; // 64KB + +struct MatrixLayout { + haystack_len: usize, + needle_len: usize, + cell_count: usize, + layout: Layout, + haystack_off: usize, + bonus_off: usize, + rows_off: usize, + cells_off: usize, + _phantom: PhantomData, +} +impl MatrixLayout { + fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout { + let mut layout = Layout::from_size_align(0, 1).unwrap(); + let haystack_layout = Layout::array::(haystack_len).unwrap(); + let bonus_layout = Layout::array::(haystack_len).unwrap(); + let rows_layout = Layout::array::(needle_len).unwrap(); + let cells_layout = Layout::array::(cell_count).unwrap(); + + let haystack_off; + (layout, haystack_off) = layout.extend(haystack_layout).unwrap(); + let bonus_off; + (layout, bonus_off) = layout.extend(bonus_layout).unwrap(); + let rows_off; + (layout, rows_off) = layout.extend(rows_layout).unwrap(); + let cells_off; + (layout, cells_off) = layout.extend(cells_layout).unwrap(); + MatrixLayout { + haystack_len, + needle_len, + cell_count, + layout, + haystack_off, + bonus_off, + rows_off, + cells_off, + _phantom: PhantomData, + } + } + /// # Safety + /// + /// `ptr` must point at an allocated with MARTIX_ALLOC_LAYOUT + unsafe fn fieds_from_ptr( + &self, + ptr: NonNull, + ) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) { + // sanity checks, should not be necessary + + let base = ptr.as_ptr(); + let haystack = base.add(self.haystack_off) as *mut C; + let haystack = 
slice_from_raw_parts_mut(haystack, self.haystack_len); + let bonus = base.add(self.bonus_off) as *mut u16; + let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); + let rows = base.add(self.rows_off) as *mut u16; + let rows = slice_from_raw_parts_mut(rows, self.needle_len); + let cells = base.add(self.cells_off) as *mut MatrixCell; + let cells = slice_from_raw_parts_mut(cells, self.cell_count); + (haystack, bonus, rows, cells) + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) struct MatrixCell { + pub score: u16, + pub consecutive_chars: u16, +} + +impl Debug for MatrixCell { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + (self.score, self.consecutive_chars).fmt(f) + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) struct HaystackChar { + pub char: C, + pub bonus: u16, +} + +impl Debug for HaystackChar { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + (self.char, self.bonus).fmt(f) + } +} + +#[derive(Clone, Copy)] +pub(crate) struct MatrixRow<'a> { + pub off: u16, + pub cells: &'a [MatrixCell], +} +impl Index for MatrixRow<'_> { + type Output = MatrixCell; + + fn index(&self, index: u16) -> &Self::Output { + &self.cells[index as usize] + } +} + +impl Debug for MatrixRow<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut f = f.debug_list(); + f.entries((0..self.off).map(|_| &(0, 0))); + f.entries(self.cells.iter()); + f.finish() + } +} + +pub(crate) struct MatrixRowMut<'a> { + pub off: u16, + pub cells: &'a mut [MatrixCell], +} + +impl Debug for MatrixRowMut<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut f = f.debug_list(); + f.entries((0..self.off).map(|_| &(0, 0))); + f.entries(self.cells.iter()); + f.finish() + } +} + +pub struct DebugList(I); +impl Debug for DebugList +where + I: Iterator + Clone, + I::Item: Debug, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + f.debug_list().entries(self.0.clone()).finish() + } +} + +pub(crate) struct Matrix<'a, C: Char> { + pub haystack: &'a 
mut [C], + // stored as a seperate array instead of struct + // to avoid padding sine char is too large and u8 too small :/ + pub bonus: &'a mut [u16], + pub row_offs: &'a mut [u16], + pub cells: &'a mut [MatrixCell], +} + +impl<'a, C: Char> Matrix<'a, C> { + pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { + let mut cells = &*self.cells; + self.row_offs.iter().map(move |&off| { + let len = self.haystack.len() - off as usize; + let (row, tmp) = cells.split_at(len); + cells = tmp; + MatrixRow { off, cells: row } + }) + } + + pub fn rows_rev(&self) -> impl Iterator + ExactSizeIterator { + let mut cells = &*self.cells; + self.row_offs.iter().rev().map(move |&off| { + let len = self.haystack.len() - off as usize; + let (tmp, row) = cells.split_at(cells.len() - len); + cells = tmp; + MatrixRow { off, cells: row } + }) + } + pub fn haystack( + &self, + ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { + haystack(self.haystack, self.bonus, 0) + } +} + +impl<'a, C: Char> Debug for Matrix<'a, C> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + f.debug_struct("Matrix") + .field("haystack", &DebugList(self.haystack())) + .field("matrix", &DebugList(self.rows())) + .finish() + } +} +pub(crate) fn haystack<'a, C: Char>( + haystack: &'a [C], + bonus: &'a [u16], + skip: u16, +) -> impl Iterator> + ExactSizeIterator + Clone + 'a { + haystack[skip as usize..] 
+ .iter() + .zip(bonus[skip as usize..].iter()) + .map(|(&char, &bonus)| HaystackChar { char, bonus }) +} + +pub(crate) fn rows_mut<'a>( + row_offs: &'a [u16], + mut cells: &'a mut [MatrixCell], + haystack_len: usize, +) -> impl Iterator> + ExactSizeIterator + 'a { + row_offs.iter().map(move |&off| { + let len = haystack_len - off as usize; + let (row, tmp) = take(&mut cells).split_at_mut(len); + cells = tmp; + MatrixRowMut { off, cells: row } + }) +} + +// we only use this to construct the layout for the slab allocation +#[allow(unused)] +struct MatrixData { + haystack: [char; MAX_HAYSTACK_LEN], + bonus: [u16; MAX_HAYSTACK_LEN], + row_offs: [u16; MAX_NEEDLE_LEN], + cells: [MatrixCell; MAX_MATRIX_SIZE], +} + +// const MATRIX_ALLOC_LAYOUT: Layout = +// MatrixLayout::::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout; + +pub(crate) struct MatrixSlab(NonNull); + +impl MatrixSlab { + pub fn new() -> Self { + let layout = Layout::new::(); + // safety: the matrix is never zero sized (hardcoded constants) + let ptr = unsafe { alloc_zeroed(layout) }; + let Some(ptr) = NonNull::new(ptr) else{ + handle_alloc_error(layout) + }; + MatrixSlab(ptr.cast()) + } + + pub(crate) fn alloc( + &mut self, + haystack_: &[C], + needle_len: usize, + ) -> Option> { + let cells = haystack_.len() * needle_len; + if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize { + return None; + } + let matrix_layout = MatrixLayout::::new( + haystack_.len(), + needle_len, + (haystack_.len() - needle_len / 2) * needle_len, + ); + if matrix_layout.layout.size() > size_of::() { + return None; + } + unsafe { + // safetly: this allocation is valid for MATRIX_ALLOC_LAYOUT + let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0); + // copy haystack before creating refernces to ensure we donu't crate + // refrences to invalid chars (which may or may not be UB) + haystack_ + .as_ptr() + .copy_to_nonoverlapping(haystack as *mut _, haystack_.len()); + Some(Matrix { + 
haystack: &mut *haystack, + row_offs: &mut *rows, + bonus: &mut *bonus, + cells: &mut *cells, + }) + } + } +} + +impl Drop for MatrixSlab { + fn drop(&mut self) { + unsafe { dealloc(self.0.as_ptr(), Layout::new::()) }; + } +} diff --git a/src/multizip.rs b/src/multizip.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/prefilter.rs b/src/prefilter.rs new file mode 100644 index 0000000..0d7c5da --- /dev/null +++ b/src/prefilter.rs @@ -0,0 +1,73 @@ +use ::memchr::{memchr, memchr2, memrchr, memrchr2}; + +use crate::chars::Char; +use crate::utf32_str::Utf32Str; +use crate::Matcher; + +#[inline(always)] +fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option { + if c >= b'a' && c <= b'z' { + memchr2(c, c - 32, haystack) + } else { + memchr(c, haystack) + } +} + +#[inline(always)] +fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option { + if c >= b'a' && c <= b'z' { + memrchr2(c, c - 32, haystack) + } else { + memrchr(c, haystack) + } +} + +impl Matcher { + pub(crate) fn prefilter_ascii( + &self, + mut haystack: &[u8], + needle: &[u8], + ) -> Option<(usize, usize, usize)> { + if self.config.ignore_case { + let start = find_ascii_ignore_case(needle[0], haystack)?; + let mut eager_end = start + 1; + haystack = &haystack[eager_end..]; + for &c in &needle[1..] { + let idx = find_ascii_ignore_case(c, haystack)? + 1; + eager_end += idx; + haystack = &haystack[idx..]; + } + let end = eager_end + + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack).unwrap_or(0); + Some((start, eager_end, end)) + } else { + let start = memchr(needle[0], haystack)?; + let mut eager_end = start + 1; + haystack = &haystack[eager_end..]; + for &c in &needle[1..] { + let idx = memchr(c, haystack)?
+ 1; + eager_end += idx; + haystack = &haystack[idx..]; + } + let end = eager_end + memrchr(*needle.last().unwrap(), haystack).unwrap_or(0); + Some((start, eager_end, end)) + } + } + + pub(crate) fn prefilter_non_ascii( + &self, + haystack: &[char], + needle: Utf32Str<'_>, + ) -> Option<(usize, usize)> { + let needle_char = needle.get(0); + let start = haystack + .iter() + .position(|c| c.normalize(&self.config) == needle_char)?; + let needle_char = needle.last(); + let end = haystack[start..] + .iter() + .position(|c| c.normalize(&self.config) == needle_char)?; + + Some((start, end)) + } +} diff --git a/src/score.rs b/src/score.rs new file mode 100644 index 0000000..fca3f7d --- /dev/null +++ b/src/score.rs @@ -0,0 +1,145 @@ +use std::cmp::max; + +use crate::chars::{Char, CharClass}; +use crate::{Matcher, MatcherConfig}; + +pub(crate) const SCORE_MATCH: u16 = 16; +pub(crate) const PENALTY_GAP_START: u16 = 3; +pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; + +// We prefer matches at the beginning of a word, but the bonus should not be +// too great to prevent the longer acronym matches from always winning over +// shorter fuzzy matches. The bonus point here was specifically chosen that +// the bonus is cancelled when the gap between the acronyms grows over +// 8 characters, which is approximately the average length of the words found +// in web2 dictionary and my file system. +pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; + +// Although bonus point for non-word characters is non-contextual, we need it +// for computing bonus points for consecutive chunks starting with a non-word +// character. +pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2; + +// Edge-triggered bonus for matches in camelCase words. +// Compared to word-boundary case, they don't accompany single-character gaps +// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly. 
+pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION; + +// Minimum bonus point given to characters in consecutive chunks. +// Note that bonus points for consecutive matches shouldn't have needed if we +// used fixed match score as in the original algorithm. +pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; + +// The first character in the typed pattern usually has more significance +// than the rest so it's important that it appears at special positions where +// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". +// The amount of the extra bonus should be limited so that the gap penalty is +// still respected. +pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; + +impl MatcherConfig { + #[inline] + pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { + if class > CharClass::NonWord { + // transition from non word to word + match prev_class { + CharClass::Whitespace => return self.bonus_boundary_white, + CharClass::Delimiter => return self.bonus_boundary_delimiter, + CharClass::NonWord => return BONUS_BOUNDARY, + _ => (), + } + } + if prev_class == CharClass::Lower && class == CharClass::Upper + || prev_class != CharClass::Number && class == CharClass::Number + { + // camelCase letter123 + BONUS_CAMEL123 + } else if class == CharClass::NonWord { + BONUS_NON_WORD + } else if class == CharClass::Whitespace { + self.bonus_boundary_white + } else { + 0 + } + } +} +impl Matcher { + #[inline(always)] + pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { + self.config.bonus_for(prev_class, class) + } + + pub(crate) fn calculate_score, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + start: usize, + end: usize, + indicies: &mut Vec, + ) -> u16 { + if INDICIES { + indicies.reserve(needle.len()); + } + + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + 
.unwrap_or(self.config.inital_char_class); + let mut needle_iter = needle[1..].iter(); + let mut needle_char = *needle_iter.next().unwrap_or(&needle[0]); + + let mut in_gap = false; + let mut consecutive = 1; + + // unrolled the first iteration to make applying the first char multiplier less awkward + if INDICIES { + indicies.push(start as u32) + } + let mut first_bonus = self.bonus_for(prev_class, haystack[start].char_class(&self.config)); + let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER; + prev_class = haystack[start].char_class(&self.config); + for (i, c) in haystack[start + 1..end].iter().enumerate() { + let class = c.char_class(&self.config); + let c = c.normalize(&self.config); + if c == needle_char { + if INDICIES { + indicies.push(i as u32 + start as u32 + 1) + } + let mut bonus = self.bonus_for(prev_class, class); + if consecutive == 0 { + first_bonus = bonus + } else { + // Break consecutive chunk + if bonus > first_bonus { + if bonus >= BONUS_BOUNDARY { + first_bonus = bonus; + } else { + bonus = max(bonus, BONUS_CONSECUTIVE); + } + } else { + bonus = max(first_bonus, BONUS_CONSECUTIVE); + } + } + score += SCORE_MATCH + bonus; + in_gap = false; + consecutive += 1; + if let Some(&next) = needle_iter.next() { + needle_char = next; + } + } else { + let penalty = if in_gap { + PENALTY_GAP_EXTENSION + } else { + PENALTY_GAP_START + }; + score = score.saturating_sub(penalty); + in_gap = true; + consecutive = 0; + first_bonus = 0; + } + prev_class = class; + } + + score + } +} diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000..5baf52b --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,270 @@ +use crate::config::{ + BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, + PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, +}; +use crate::{CaseMatching, Matcher, MatcherConfig}; + +pub fn assert_matches( + use_v1: bool, + normalize: bool, + case_sensitive: bool, + path: bool, + cases: &[(&str, &str, u32, u32, u16)], +) { + let mut config = MatcherConfig { +
use_v1, + normalize, + case_matching: if case_sensitive { + CaseMatching::Respect + } else { + CaseMatching::Ignore + }, + ..MatcherConfig::DEFAULT + }; + if path { + config.set_match_paths(); + } + let mut matcher = Matcher::new(config); + let mut indicies = Vec::new(); + for &(haystack, needle, start, end, mut score) in cases { + score += needle.chars().count() as u16 * SCORE_MATCH; + let query = matcher.compile_query(needle); + let res = matcher.fuzzy_indicies(&query, haystack, &mut indicies); + assert_eq!(res, Some(score), "{needle:?} did not match {haystack:?}"); + assert_eq!( + indicies.first().copied()..indicies.last().map(|&i| i + 1), + Some(start)..Some(end), + "{needle:?} match {haystack:?}[{start}..{end}]" + ); + } +} +const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; +const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; + +#[test] +fn test_v2_fuzzy() { + assert_matches( + false, + false, + false, + false, + &[ + ( + "fooBarbaz1", + "oBZ", + 2, + 9, + BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3, + ), + ( + "foo bar baz", + "fbb", + 0, + 9, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "/AutomatorDocument.icns", + "rdoc", + 9, + 13, + BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2, + ), + ( + "/man1/zshcompctl.1", + "zshc", + 6, + 10, + BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_BOUNDARY_DELIMITER * 3, + ), + ( + "/.oh-my-zsh/cache", + "zshc", + 8, + 13, + BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2 + - PENALTY_GAP_START + + BONUS_BOUNDARY_DELIMITER, + ), + ( + "ab0123 456", + "12356", + 3, + 10, + BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, + ), + ( + "abc123 456", + "12356", + 3, + 10, + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CAMEL123 * 2 + + BONUS_CONSECUTIVE + - PENALTY_GAP_START + - 
PENALTY_GAP_EXTENSION, + ), + ( + "foo/bar/baz", + "fbb", + 0, + 9, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "fooBarBaz", + "fbb", + 0, + 7, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 + - 2 * PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION, + ), + ( + "foo barbaz", + "fbb", + 0, + 8, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START * 2 + - PENALTY_GAP_EXTENSION * 3, + ), + ( + "fooBar Baz", + "foob", + 0, + 4, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3, + ), + ( + "xFoo-Bar Baz", + "foo-b", + 1, + 6, + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CAMEL123 * 2 + + BONUS_NON_WORD + + BONUS_BOUNDARY, + ), + ], + ); +} + +#[test] +fn test_v1_fuzzy() { + assert_matches( + true, + false, + false, + false, + &[ + ( + "fooBarbaz1", + "oBZ", + 2, + 9, + BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3, + ), + ( + "foo bar baz", + "fbb", + 0, + 9, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "/AutomatorDocument.icns", + "rdoc", + 9, + 13, + BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2, + ), + ( + "/man1/zshcompctl.1", + "zshc", + 6, + 10, + BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_BOUNDARY_DELIMITER * 3, + ), + ( + "/.oh-my-zsh/cache", + "zshc", + 8, + 13, + BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2 + - PENALTY_GAP_START + + BONUS_BOUNDARY_DELIMITER, + ), + ( + "ab0123 456", + "12356", + 3, + 10, + BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, + ), + ( + "abc123 456", + "12356", + 3, + 10, + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CAMEL123 * 2 + + BONUS_CONSECUTIVE + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ( + "foo/bar/baz", + "fbb", + 
0, + 9, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "fooBarBaz", + "fbb", + 0, + 7, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 + - 2 * PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION, + ), + ( + "foo barbaz", + "fbb", + 0, + 8, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START * 2 + - PENALTY_GAP_EXTENSION * 3, + ), + ( + "fooBar Baz", + "foob", + 0, + 4, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3, + ), + ( + "xFoo-Bar Baz", + "foo-b", + 1, + 6, + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CAMEL123 * 2 + + BONUS_NON_WORD + + BONUS_BOUNDARY, + ), + ], + ); +} diff --git a/src/utf32_str.rs b/src/utf32_str.rs new file mode 100644 index 0000000..982cf16 --- /dev/null +++ b/src/utf32_str.rs @@ -0,0 +1,123 @@ +use std::ops::{Bound, RangeBounds}; + +/// A UTF32 encoded (char array) String that can be used as an input to fuzzy matching. +/// +/// Usually rusts utf8 encoded strings are great. However during fuzzy matching +/// operates on codepoints (it should operate on graphemes but that's too much +/// hassle to deal with). We want to quickly iterate these codeboints between +/// (up to 5 times) during matching. +/// +/// Doing codepoint segmentation on the fly not only blows trough the cache +/// (lookuptables and Icache) but also has nontrivial runtime compared to the +/// matching itself. Furthermore there are a lot of exta optimizations available +/// for ascii only text (but checking during each match has too much overhead). +/// +/// Ofcourse this comes at exta memory cost as we usally still need the ut8 +/// encoded variant for rendenring. In the (dominant) case of ascii-only text +/// we don't require a copy. 
Furthermore fuzzy matching usually is applied while +/// the user is typing on the fly so the same item is potentially matched many +/// times (making the upfront cost more worth it). That means that it's +/// basically always worth it to presegment the string. +/// +/// For use cases that only match (a lot of) strings once it's possible to keep a +/// char buffer around that is filled with the presegmented chars. +/// +/// Another advantage of this approach is that the matcher will naturally +/// produce char indices (instead of utf8 offsets) anyway. With a +/// codepoint based representation like this the indices can be used +/// directly. +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Debug)] +pub enum Utf32Str<'a> { + /// A string represented as ASCII encoded bytes. + /// Correctness invariant: must only contain valid ASCII (<=127) + Ascii(&'a [u8]), + /// A string represented as an array of unicode codepoints (basically UTF-32). + Unicode(&'a [char]), +} + +impl<'a> Utf32Str<'a> { + /// Convenience method to construct a `Utf32Str` from a normal utf8 str + pub fn new(str: &'a str, buf: &'a mut Vec) -> Self { + if str.is_ascii() { + Utf32Str::Ascii(str.as_bytes()) + } else { + buf.clear(); + buf.extend(str.chars()); + Utf32Str::Unicode(&*buf) + } + } + + #[inline] + pub fn len(&self) -> usize { + match self { + Utf32Str::Unicode(codepoints) => codepoints.len(), + Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + + #[inline] + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len(), + }; + match self { + Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), + Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } +
+ /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher + #[inline] + pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start as usize, + Bound::Excluded(&start) => start as usize + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, + Bound::Unbounded => self.len(), + }; + match self { + Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), + Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + pub fn is_ascii(&self) -> bool { + matches!(self, Utf32Str::Ascii(_)) + } + + pub fn get(&self, idx: u32) -> char { + match self { + Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, + Utf32Str::Unicode(codepoints) => codepoints[idx as usize], + } + } + pub fn last(&self) -> char { + match self { + Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, + Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], + } + } +} + +// impl Str for &[char] { +// type Chars; + +// fn chars(&self) -> Self::Chars { +// todo!() +// } + +// fn slice(&self, range: impl RangeBounds) { +// todo!() +// } +// }