better implementation

This commit is contained in:
Pascal Kuthe 2023-07-20 02:09:51 +02:00
parent 6837b4e2cb
commit e964d42849
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
13 changed files with 1467 additions and 714 deletions

135
src/chars.rs Normal file
View File

@ -0,0 +1,135 @@
use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
use crate::MatcherConfig;
//autogenerated by generate-ucd
#[allow(warnings)]
#[rustfmt::skip]
mod case_fold;
mod normalize;
/// Common interface over the character representations used in this module
/// (`u8` and `char` both implement it), so callers can be generic over either.
pub trait Char: Copy + Eq + Ord + std::fmt::Debug {
/// `true` for the `u8` implementation, `false` for the `char` implementation.
const ASCII: bool;
/// Classifies the character into a [`CharClass`] according to `config`.
fn char_class(self, config: &MatcherConfig) -> CharClass;
/// Returns the normalized character together with the class of the
/// original (not yet normalized) character.
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass);
/// Returns the character after applying the normalization selected by `config`.
fn normalize(self, config: &MatcherConfig) -> Self;
}
/// [`Char`] implementation for ASCII bytes.
impl Char for u8 {
    const ASCII: bool = true;

    /// Classifies an ASCII byte.
    ///
    /// Letters, digits and whitespace are recognized first; any other byte is
    /// a `Delimiter` if it is listed in `config.delimeter_chars` and `NonWord`
    /// otherwise.
    #[inline]
    fn char_class(self, config: &MatcherConfig) -> CharClass {
        // plain comparisons are kept on purpose: they optimize better than the
        // range helpers
        if self >= b'a' && self <= b'z' {
            return CharClass::Lower;
        }
        if self >= b'A' && self <= b'Z' {
            return CharClass::Upper;
        }
        if self >= b'0' && self <= b'9' {
            return CharClass::Number;
        }
        if self.is_ascii_whitespace() {
            return CharClass::Whitespace;
        }
        if config.delimeter_chars.contains(&self) {
            return CharClass::Delimiter;
        }
        CharClass::NonWord
    }

    /// Returns the (optionally lower-cased) byte and the class of the
    /// original byte.
    #[inline(always)]
    fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass) {
        let class = self.char_class(config);
        let fold_case = config.ignore_case && class == CharClass::Upper;
        // 'a' - 'A' == 32, so adding 32 maps `A..=Z` onto `a..=z`
        let byte = if fold_case { self + 32 } else { self };
        (byte, class)
    }

    /// Lower-cases `A..=Z` when `config.ignore_case` is set, otherwise returns
    /// the byte unchanged.
    #[inline(always)]
    fn normalize(self, config: &MatcherConfig) -> Self {
        let is_upper = self >= b'A' && self <= b'Z';
        if !(config.ignore_case && is_upper) {
            return self;
        }
        self + 32
    }
}
/// Classifies a character using the Unicode predicates of `char`.
///
/// The checks run in a fixed order (lowercase, uppercase, numeric, alphabetic,
/// whitespace); the first predicate that holds decides the class and anything
/// else is `NonWord`. This function never yields `Delimiter`.
fn char_class_non_ascii(c: char) -> CharClass {
    if c.is_lowercase() {
        return CharClass::Lower;
    }
    if c.is_uppercase() {
        return CharClass::Upper;
    }
    if c.is_numeric() {
        return CharClass::Number;
    }
    if c.is_alphabetic() {
        // alphabetic but without a case
        return CharClass::Letter;
    }
    if c.is_whitespace() {
        return CharClass::Whitespace;
    }
    CharClass::NonWord
}
/// [`Char`] implementation for Unicode scalar values.
///
/// ASCII characters are forwarded to the `u8` implementation; everything else
/// is classified with `char_class_non_ascii` and case folded through the
/// `CASE_FOLDING_SIMPLE` table.
impl Char for char {
    const ASCII: bool = false;

    /// Classifies the character, delegating ASCII input to the `u8` impl.
    #[inline(always)]
    fn char_class(self, config: &MatcherConfig) -> CharClass {
        if self.is_ascii() {
            return (self as u8).char_class(config);
        }
        char_class_non_ascii(self)
    }

    /// Returns the normalized character and the class of the original
    /// character.
    ///
    /// Case folding is only applied when `config.ignore_case` is set (matching
    /// the `u8` implementation); `normalize::normalize` is only applied when
    /// `config.normalize` is set.
    #[inline(always)]
    fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) {
        if self.is_ascii() {
            let (c, class) = (self as u8).char_class_and_normalize(config);
            return (c as char, class);
        }
        let char_class = char_class_non_ascii(self);
        // Only upper case characters can have an entry in the folding table,
        // so the lookup is skipped for every other class.
        if config.ignore_case && char_class == CharClass::Upper {
            self = CASE_FOLDING_SIMPLE
                .binary_search_by_key(&self, |(upper, _)| *upper)
                .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1);
        }
        if config.normalize {
            self = normalize::normalize(self);
        }
        (self, char_class)
    }

    /// Applies `normalize::normalize` (if `config.normalize`) and then lower
    /// cases the result (if `config.ignore_case`).
    #[inline(always)]
    fn normalize(mut self, config: &MatcherConfig) -> Self {
        if config.normalize {
            self = normalize::normalize(self);
        }
        // respect case-sensitive configurations instead of always folding
        if config.ignore_case {
            self = to_lower_case(self);
        }
        self
    }
}
pub use normalize::normalize;
/// Converts `c` to lower case.
///
/// `A..=Z` is shifted onto `a..=z`, other ASCII characters are returned as is,
/// and non-ASCII characters are looked up in `CASE_FOLDING_SIMPLE`; characters
/// without a table entry are returned unchanged.
#[inline(always)]
pub fn to_lower_case(c: char) -> char {
    if c >= 'A' && c <= 'Z' {
        return char::from_u32(c as u32 + 32).unwrap();
    }
    if c.is_ascii() {
        return c;
    }
    match CASE_FOLDING_SIMPLE.binary_search_by_key(&c, |(upper, _)| *upper) {
        Ok(idx) => CASE_FOLDING_SIMPLE[idx].1,
        Err(_) => c,
    }
}
/// Coarse classification of a character, produced by `Char::char_class`.
///
/// `PartialOrd`/`Ord` are derived, so the declaration order of the variants
/// defines their ordering. Do not reorder the variants without checking code
/// that compares classes with `<`/`>`.
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CharClass {
/// Whitespace (`is_ascii_whitespace` for ASCII, `char::is_whitespace` otherwise).
Whitespace,
/// Anything that matched none of the other classes.
NonWord,
/// An ASCII byte listed in `MatcherConfig::delimeter_chars`.
Delimiter,
/// A lower case letter.
Lower,
/// An upper case letter.
Upper,
/// A non-ASCII alphabetic character that is neither lower nor upper case.
Letter,
/// A digit (`0..=9` for ASCII, `char::is_numeric` otherwise).
Number,
}

View File

@ -1,37 +1,7 @@
// Score awarded for every matched character.
pub(crate) const SCORE_MATCH: u16 = 16;
// Penalty subtracted for the first unmatched character of a gap.
pub(crate) const PENALTY_GAP_START: u16 = 3;
// Penalty subtracted for every further unmatched character of the same gap.
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
// We prefer matches at the beginning of a word, but the bonus should not be
// so large that longer acronym matches always win over shorter fuzzy
// matches. The bonus was specifically chosen so that it is cancelled out
// when the gap between the acronym letters grows beyond 8 characters, which
// is approximately the average length of the words found in the web2
// dictionary and my file system.
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
// Although the bonus point for non-word characters is non-contextual, we need
// it for computing bonus points for consecutive chunks starting with a
// non-word character.
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
// Edge-triggered bonus for matches in camelCase words.
// Compared to the word-boundary case, they are not accompanied by a
// single-character gap (e.g. FooBar vs. foo-bar), so we deduct the gap
// extension penalty from the bonus accordingly.
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches would not have been needed
// if we used a fixed match score as in the original algorithm.
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
// The first character in the typed pattern usually has more significance
// than the rest, so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
use crate::chars::CharClass;
use crate::score::BONUS_BOUNDARY;
#[non_exhaustive]
pub struct MatcherConfig {
pub delimeter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string
@ -44,33 +14,17 @@ pub struct MatcherConfig {
/// this significantly degrades performance so it's not recommended
/// to be turned on by default
pub normalize: bool,
/// use a faster/simpler algorithm at the cost of (potentially) much worse results.
/// For long inputs this algorithm is always used as a fallback to avoid
/// blowups in time complexity
pub use_v1: bool,
/// The case matching to perform
pub case_matching: CaseMatching,
/// whether to ignore casing
pub ignore_case: bool,
}
/// Coarse classification of a character, produced by `MatcherConfig::char_class`.
///
/// `PartialOrd`/`Ord` are derived, so the declaration order of the variants
/// defines their ordering (`bonus_for` relies on `class > CharClass::NonWord`).
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CharClass {
/// Whitespace (`is_ascii_whitespace` for ASCII, `char::is_whitespace` otherwise).
Whitespace,
/// Anything that matched none of the other classes.
NonWord,
/// An ASCII byte listed in `MatcherConfig::delimeter_chars`.
Delimiter,
/// A lower case letter.
Lower,
/// An upper case letter.
Upper,
/// A non-ASCII alphabetic character that is neither lower nor upper case.
Letter,
/// A digit (`0..=9` for ASCII, `char::is_numeric` otherwise).
Number,
}
/// Selects how letter case is treated while matching.
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CaseMatching {
/// NOTE(review): presumably matches case-sensitively — confirm against the matcher.
Respect,
/// Queries are compiled with `ignore_case` set when this variant is configured.
Ignore,
/// Queries are compiled in "smart case" mode; an upper case character in
/// the needle clears the query's `ignore_case` flag.
Smart,
}
// #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
// #[non_exhaustive]
// pub enum CaseMatching {
// Respect,
// Ignore,
// Smart,
// }
impl MatcherConfig {
pub const DEFAULT: Self = {
@ -80,8 +34,7 @@ impl MatcherConfig {
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
inital_char_class: CharClass::Whitespace,
normalize: false,
use_v1: false,
case_matching: CaseMatching::Smart,
ignore_case: true,
}
};
}
@ -107,69 +60,4 @@ impl MatcherConfig {
self.inital_char_class = CharClass::Delimiter;
self
}
/// Classifies a character using the Unicode predicates of `char`.
///
/// The first predicate that holds (lowercase, uppercase, numeric, alphabetic,
/// whitespace — in that order) decides the class; anything else is `NonWord`.
fn char_class_non_ascii(c: char) -> CharClass {
    if c.is_lowercase() {
        return CharClass::Lower;
    }
    if c.is_uppercase() {
        return CharClass::Upper;
    }
    if c.is_numeric() {
        return CharClass::Number;
    }
    if c.is_alphabetic() {
        return CharClass::Letter;
    }
    if c.is_whitespace() {
        return CharClass::Whitespace;
    }
    CharClass::NonWord
}
/// Classifies an ASCII character.
///
/// Letters, digits and whitespace are recognized first; any other character
/// is a `Delimiter` if its byte is listed in `self.delimeter_chars` and
/// `NonWord` otherwise.
fn char_class_ascii(&self, c: char) -> CharClass {
    // plain comparisons are kept on purpose: they optimize better
    if c >= 'a' && c <= 'z' {
        return CharClass::Lower;
    }
    if c >= 'A' && c <= 'Z' {
        return CharClass::Upper;
    }
    if c >= '0' && c <= '9' {
        return CharClass::Number;
    }
    if c.is_ascii_whitespace() {
        return CharClass::Whitespace;
    }
    let byte = c as u8;
    if self.delimeter_chars.contains(&byte) {
        return CharClass::Delimiter;
    }
    CharClass::NonWord
}
/// Classifies `c`, dispatching to the ASCII or the Unicode classifier.
pub(crate) fn char_class(&self, c: char) -> CharClass {
    match c.is_ascii() {
        true => self.char_class_ascii(c),
        false => Self::char_class_non_ascii(c),
    }
}
/// Computes the positional bonus of a character of class `class` that
/// directly follows a character of class `prev_class`.
///
/// A word character following whitespace, a delimiter or a non-word
/// character gets the respective boundary bonus. Otherwise camelCase and
/// letter-to-digit transitions get `BONUS_CAMEL123`, non-word characters get
/// `BONUS_NON_WORD`, whitespace gets `bonus_boundary_white`, and everything
/// else gets no bonus.
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
    // transition from non word to word
    let is_word_char = class > CharClass::NonWord;
    let boundary_bonus = match prev_class {
        CharClass::Whitespace if is_word_char => Some(self.bonus_boundary_white),
        CharClass::Delimiter if is_word_char => Some(self.bonus_boundary_delimiter),
        CharClass::NonWord if is_word_char => Some(BONUS_BOUNDARY),
        _ => None,
    };
    if let Some(bonus) = boundary_bonus {
        return bonus;
    }

    // camelCase letter123
    let camel_case = prev_class == CharClass::Lower && class == CharClass::Upper;
    let number_start = prev_class != CharClass::Number && class == CharClass::Number;
    if camel_case || number_start {
        return BONUS_CAMEL123;
    }
    match class {
        CharClass::NonWord => BONUS_NON_WORD,
        CharClass::Whitespace => self.bonus_boundary_white,
        _ => 0,
    }
}
}

46
src/fuzzy_greedy.rs Normal file
View File

@ -0,0 +1,46 @@
use crate::chars::Char;
use crate::Matcher;
impl Matcher {
    /// Greedy fallback algorithm: much faster (linear time), but the reported
    /// score/indices might not belong to the best possible match.
    ///
    /// `start` is the position of the first needle character in `haystack`.
    /// For ASCII haystacks `end` is the (exclusive) end of a match that the
    /// prefilter already found. For non-ASCII haystacks `end` only marks the
    /// end of the first matched character, and the rest of the needle is
    /// located here with a forward scan.
    ///
    /// Returns `None` if the forward scan cannot find every needle character.
    ///
    /// # Panics
    ///
    /// Panics if `needle` is empty.
    pub(crate) fn fuzzy_match_greedy<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
        &mut self,
        haystack: &[H],
        needle: &[N],
        mut start: usize,
        mut end: usize,
        indicies: &mut Vec<u32>,
    ) -> Option<u16> {
        let first_char_end = if H::ASCII { start + 1 } else { end };
        if !H::ASCII && needle.len() != 1 {
            let mut needle_iter = needle[1..].iter().copied();
            if let Some(mut needle_char) = needle_iter.next() {
                let mut matched = false;
                for (i, &c) in haystack[first_char_end..].iter().enumerate() {
                    if c.normalize(&self.config) == needle_char {
                        let Some(next_needle_char) = needle_iter.next() else {
                            // `i` is relative to `first_char_end`, so the
                            // offset has to be added back to obtain an
                            // index into `haystack`
                            end = first_char_end + i + 1;
                            matched = true;
                            break;
                        };
                        needle_char = next_needle_char;
                    }
                }
                if !matched {
                    // some needle chars do not occur (in order) in the haystack
                    return None;
                }
            }
        }
        // minimize the greedy match by greedy matching in reverse
        let mut needle_iter = needle.iter().rev().copied();
        let mut needle_char = needle_iter.next().unwrap();
        for (i, &c) in haystack[start..end].iter().enumerate().rev() {
            // the haystack is not normalized yet, so normalize each char
            // exactly like the forward pass does
            if c.normalize(&self.config) == needle_char {
                let Some(next_needle_char) = needle_iter.next() else {
                    start += i;
                    break;
                };
                needle_char = next_needle_char;
            }
        }
        Some(self.calculate_score::<INDICIES, H, N>(haystack, needle, start, end, indicies))
    }
}

272
src/fuzzy_optimal.rs Normal file
View File

@ -0,0 +1,272 @@
use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow};
use crate::score::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
};
use crate::{Matcher, MatcherConfig};
impl Matcher {
    /// Matches `needle` against `haystack[start..end]` by filling a score
    /// matrix and returns the best score found in it.
    ///
    /// If the matrix cannot be allocated from the slab, the match is delegated
    /// to `fuzzy_match_greedy` (using `greedy_end` as its end position).
    ///
    /// When `INDICIES` is `true` the haystack positions of the matched
    /// characters are written to `indicies` as indices into `haystack` (that
    /// is, already offset by `start`).
    ///
    /// Returns `None` if the needle does not match.
    pub(crate) fn fuzzy_match_optimal<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
        &mut self,
        haystack: &[H],
        needle: &[N],
        start: usize,
        greedy_end: usize,
        end: usize,
        indicies: &mut Vec<u32>,
    ) -> Option<u16> {
        // construct a matrix (and copy the haystack), the matrix and haystack size are bounded
        // to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
        // us to treat needle indices as u16
        let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
            return self.fuzzy_match_greedy::<INDICIES, H, N>(
                haystack,
                needle,
                start,
                greedy_end,
                indicies,
            );
        };

        // class of the character in front of the match window, needed to
        // compute the bonus of the first character in the window
        let prev_class = start
            .checked_sub(1)
            .map(|i| haystack[i].char_class(&self.config))
            .unwrap_or(self.config.inital_char_class);

        let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config);
        // this only happens with unicode haystacks, for ASCII the prefilter handles all rejects
        if !matched {
            return None;
        }
        if needle.len() == 1 {
            if INDICIES {
                // `max_score_pos` is relative to the match window, callers
                // expect an index into the full haystack
                indicies.push(max_score_pos as u32 + start as u32);
            }
            return Some(max_score);
        }
        debug_assert_eq!(
            matrix.row_offs[0], 0,
            "prefilter should have put us at the start of the match"
        );

        // populate the matrix and find the best score
        let (max_score, best_match_end) = matrix.populate_matrix(needle);
        if INDICIES {
            matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end);
        }
        Some(max_score)
    }
}
impl<H: Char> Matrix<'_, H> {
    /// Normalizes the haystack in place, stores the bonus of every haystack
    /// position and fills the first matrix row (scores for the first needle
    /// character). Also records in `row_offs` the first column at which each
    /// needle character can occur.
    ///
    /// `prev_class` is the class of the character preceding the haystack
    /// window.
    ///
    /// Returns `(max_score_pos, max_score, matched)`. `matched` is `true` once
    /// every needle character was found in order. `max_score`/`max_score_pos`
    /// are only tracked for needles of length one and stay `0` otherwise.
    ///
    /// # Panics
    ///
    /// Panics if `needle` is empty.
    fn setup<N: Char>(
        &mut self,
        needle: &[N],
        mut prev_class: CharClass,
        config: &MatcherConfig,
    ) -> (u16, u16, bool)
    where
        H: PartialEq<N>,
    {
        let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
        let (mut needle_char, mut row_start) = row_iter.next().unwrap();

        let col_iter = self
            .haystack
            .iter_mut()
            .zip(self.cells.iter_mut())
            .zip(self.bonus.iter_mut())
            .enumerate();

        let mut max_score = 0;
        let mut max_score_pos = 0;
        let mut in_gap = false;
        let mut prev_score = 0u16;
        let mut matched = false;
        let first_needle_char = needle[0];

        for (i, ((c, matrix_cell), bonus_)) in col_iter {
            let class = c.char_class(config);
            *c = c.normalize(config);

            let bonus = config.bonus_for(prev_class, class);
            // save bonus for later so we don't have to recompute it each time
            *bonus_ = bonus;
            prev_class = class;

            let i = i as u16;
            if *c == needle_char {
                // save the first idx of each char
                if let Some(next) = row_iter.next() {
                    *row_start = i;
                    (needle_char, row_start) = next;
                } else {
                    if !matched {
                        *row_start = i;
                    }
                    // we have at least one match
                    matched = true;
                }
            }
            if *c == first_needle_char {
                let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
                matrix_cell.consecutive_chars = 1;
                if needle.len() == 1 && score > max_score {
                    max_score = score;
                    max_score_pos = i;
                    // can't get better than this
                    if bonus >= BONUS_BOUNDARY {
                        break;
                    }
                }
                matrix_cell.score = score;
                in_gap = false;
            } else {
                let gap_penalty = if in_gap {
                    PENALTY_GAP_EXTENSION
                } else {
                    PENALTY_GAP_START
                };
                matrix_cell.score = prev_score.saturating_sub(gap_penalty);
                matrix_cell.consecutive_chars = 0;
                in_gap = true;
            }
            prev_score = matrix_cell.score;
        }

        (max_score_pos, max_score, matched)
    }

    /// Fills every matrix row after the first one (which `setup` computed).
    ///
    /// Returns `(max_score, max_score_end)`: the best score found in the row
    /// of the last needle character and the column at which it was found.
    fn populate_matrix<N: Char>(&mut self, needle: &[N]) -> (u16, u16)
    where
        H: PartialEq<N>,
    {
        let mut max_score = 0;
        let mut max_score_end = 0;

        let mut row_iter = needle
            .iter()
            .zip(rows_mut(self.row_offs, self.cells, self.haystack.len()))
            .enumerate();

        // skip the first row, `setup` already calculated its initial scores
        let (_, mut prev_matrix_row) = row_iter.next().unwrap().1;

        for (i, (&needle_char, row)) in row_iter {
            let haystack = haystack(self.haystack, self.bonus, row.off);
            let mut in_gap = false;
            let mut prev_matrix_cell = MatrixCell {
                score: 0,
                consecutive_chars: 0,
            };
            // we are interested in the score of the previous character
            // in the previous row. This represents the previous char
            // for each possible pattern. This is equivalent to diagonal movement
            let diagonal_start = row.off - prev_matrix_row.off - 1;
            let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..];

            for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
                .zip(row.cells.iter_mut())
                .zip(diagonal.iter())
                .enumerate()
            {
                let col = j + row.off as usize;
                let gap_penalty = if in_gap {
                    PENALTY_GAP_EXTENSION
                } else {
                    PENALTY_GAP_START
                };
                // score1: score when matching this char, score2: score when skipping it
                let mut score1 = 0;
                let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
                let mut consecutive = 0;
                if haystack_char.char == needle_char {
                    score1 = diag_matrix_cell.score + SCORE_MATCH;
                    let mut bonus = haystack_char.bonus;
                    consecutive = diag_matrix_cell.consecutive_chars + 1;
                    if consecutive > 1 {
                        // bonus of the first character of the current consecutive chunk
                        let first_bonus = self.bonus[col + 1 - consecutive as usize];
                        if bonus > first_bonus {
                            if bonus > BONUS_BOUNDARY {
                                consecutive = 1
                            } else {
                                bonus = max(bonus, BONUS_CONSECUTIVE)
                            }
                        } else {
                            bonus = max(first_bonus, BONUS_CONSECUTIVE)
                        }
                    }
                    if score1 + bonus < score2 {
                        score1 += haystack_char.bonus;
                        consecutive = 0;
                    } else {
                        score1 += bonus;
                    }
                }
                in_gap = score1 < score2;
                let score = max(score1, score2);
                if i == needle.len() - 1 && score > max_score {
                    max_score = score;
                    max_score_end = col as u16;
                }
                matrix_cell.consecutive_chars = consecutive;
                matrix_cell.score = score;
                prev_matrix_cell = *matrix_cell;
            }
            prev_matrix_row = row;
        }
        (max_score, max_score_end)
    }

    /// Walks the populated matrix backwards, starting at column
    /// `best_match_end` of the first row yielded by `rows_rev`, and stores one
    /// haystack index per needle character in `indicies`.
    ///
    /// `indicies` is resized to `needle.len()` (existing contents are
    /// overwritten) and `start` is added to every column so that the stored
    /// values index the original haystack.
    ///
    /// NOTE(review): the rows are visited via `rows_rev` while `indicies` is
    /// filled front to back, so the indices presumably end up in descending
    /// order — confirm that callers expect this.
    fn reconstruct_optimal_path<N: Char>(
        &self,
        needle: &[N],
        start: u32,
        indicies: &mut Vec<u32>,
        best_match_end: u16,
    ) {
        indicies.resize(needle.len(), 0);

        let mut row_iter = self.rows_rev().zip(indicies.iter_mut()).peekable();
        let (mut row, mut matched_col_idx) = row_iter.next().unwrap();
        let mut next_row: Option<MatrixRow> = None;
        let mut col = best_match_end;
        let mut prefer_match = true;
        let haystack_len = self.haystack.len() as u16;

        loop {
            let score = row.cells[col as usize].score;
            // score1: diagonal predecessor (match), score2: left neighbour (gap)
            let mut score1 = 0;
            let mut score2 = 0;
            if let Some(&(prev_row, _)) = row_iter.peek() {
                if col >= prev_row.off {
                    score1 = prev_row[col].score;
                }
            }
            if col > row.off {
                score2 = row[col - 1].score;
            }
            let mut new_prefer_match = row[col].consecutive_chars > 1;
            if !new_prefer_match && col + 1 < haystack_len {
                if let Some(next_row) = next_row {
                    new_prefer_match = next_row[col + 1].consecutive_chars > 0
                }
            }
            if score > score1 && (score > score2 || score == score2 && prefer_match) {
                *matched_col_idx = col as u32 + start;
                next_row = Some(row);
                let Some(next) = row_iter.next() else {
                    break;
                };
                (row, matched_col_idx) = next
            }
            prefer_match = new_prefer_match;
            col -= 1;
        }
    }
}

View File

@ -1,616 +1,137 @@
// sadly this doesn't optimize well currently
#![allow(clippy::manual_range_contains)]
use std::alloc::Layout;
use std::cmp::max;
use memchr::{memchr, memchr2};
use normalize::normalize;
//autogenerated by generate-ucd
#[allow(warnings)]
#[rustfmt::skip]
mod case_fold;
mod chars;
mod config;
mod normalize;
mod fuzzy_greedy;
mod fuzzy_optimal;
mod matrix;
mod prefilter;
mod score;
mod utf32_str;
pub use config::{CaseMatching, CharClass, MatcherConfig};
// #[cfg(test)]
// mod tests;
use crate::config::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
};
pub use config::MatcherConfig;
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
/// A single cell of the score matrix.
#[derive(Clone, Copy, PartialEq, Eq)]
struct MatrixCell {
// best score that can be reached at this cell
score: u16,
// length of the run of consecutively matched characters ending at this
// cell (0 if the cell was not reached through a match)
consecutive_chars: u16,
}
/// A haystack character together with its precomputed bonus.
#[derive(Clone, Copy, PartialEq, Eq)]
struct HaystackChar {
// the (normalized) haystack character
char: char,
// bonus awarded when a needle character matches at this position
bonus: u16,
}
use crate::matrix::MatrixSlab;
use crate::utf32_str::Utf32Str;
pub struct Matcher {
pub config: MatcherConfig,
matrix: Box<[MatrixCell; MAX_MATRIX_SIZE]>,
haystack: Box<[HaystackChar; MAX_HAYSTACK_LEN]>,
// needle can be at most as long as the haystack
first_needle_occurance: Box<[u16; MAX_HAYSTACK_LEN]>,
slab: MatrixSlab,
}
/// A needle that was preprocessed by `Matcher::compile_query`.
pub struct Query {
// the needle characters after the preprocessing done in `Query::push`
needle_chars: Vec<char>,
// set to `false` as soon as a non-ASCII character is pushed
is_ascii: bool,
// whether matching against this query ignores case
ignore_case: bool,
}
impl Query {
/// Appends the characters of `needle` to the query.
///
/// While doing so it updates `is_ascii`, applies smart-case detection (when
/// `smart_case` is set) or lower-casing (when `ignore_case` is set), and
/// applies `normalize` to characters pushed after the query became non-ASCII
/// if `normalize_` is set.
fn push(&mut self, needle: &str, normalize_: bool, smart_case: bool) {
// byte length is an upper bound for the number of chars
self.needle_chars.reserve(needle.len());
self.needle_chars.extend(needle.chars().map(|mut c| {
if !c.is_ascii() {
self.is_ascii = false;
}
if smart_case {
// an upper case char in the needle makes the query case sensitive
if c.is_uppercase() {
self.ignore_case = false;
}
} else if self.ignore_case {
if self.is_ascii {
c = to_lower_case::<true>(c)
} else {
c = to_lower_case::<false>(c)
}
}
if normalize_ && !self.is_ascii {
c = normalize(c);
}
c
}))
}
}
/// Finds the first occurrence of the byte `c` in `haystack`, ignoring ASCII
/// case.
///
/// `c` is expected to be already lower-cased: for `a..=z` both the lower and
/// the upper case variant are searched, any other byte is searched verbatim.
/// Returns the index of the first hit, or `None` if there is none.
#[inline(always)]
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
    // Both bounds have to hold (`&&`), and the upper case counterpart of a
    // lower case letter is 32 *below* it ('a' - 'A' == 32).
    if c >= b'a' && c <= b'z' {
        memchr2(c, c - 32, haystack)
    } else {
        memchr(c, haystack)
    }
}
/// Allocates a zero-initialized `[T; LEN]` directly on the heap, without
/// first building the array on the stack.
///
/// # Safety
///
/// `T` must be valid when all of its bytes are zero.
unsafe fn zeroed_array_on_heap<T: Copy, const LEN: usize>() -> Box<[T; LEN]> {
    let layout = Layout::new::<[T; LEN]>();
    if layout.size() == 0 {
        // Calling the global allocator with a zero-sized layout is undefined
        // behaviour. A `Box` of a zero-sized value only needs a non-null,
        // well-aligned pointer, which `NonNull::dangling` provides.
        return Box::from_raw(std::ptr::NonNull::<[T; LEN]>::dangling().as_ptr());
    }
    let res = std::alloc::alloc_zeroed(layout);
    if res.is_null() {
        std::alloc::handle_alloc_error(layout)
    }
    // SAFETY (relies on the caller's contract): `res` was allocated by the
    // global allocator with the layout of `[T; LEN]`, and the all-zero bit
    // pattern is a valid `T`.
    Box::from_raw(res.cast::<[T; LEN]>())
}
// // impl Query {
// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) {
// // self.needle_chars.reserve(needle.len());
// // self.needle_chars.extend(needle.chars().map(|mut c| {
// // if !c.is_ascii() {
// // self.is_ascii = false;
// // }
// // if smart_case {
// // if c.is_uppercase() {
// // self.ignore_case = false;
// // }
// // } else if self.ignore_case {
// // if self.is_ascii {
// // c = to_lower_case::<true>(c)
// // } else {
// // c = to_lower_case::<false>(c)
// // }
// // }
// // if normalize_ && !self.is_ascii {
// // c = normalize(c);
// // }
// // c
// // }))
// // }
// // }
impl Matcher {
pub fn new(config: MatcherConfig) -> Self {
// Safety: all data allocated here is just integers/structs that contain
// integers so zeroed values are legal
unsafe {
Self {
config,
matrix: zeroed_array_on_heap(),
haystack: zeroed_array_on_heap(),
first_needle_occurance: zeroed_array_on_heap(),
}
Self {
config,
slab: MatrixSlab::new(),
}
}
pub fn compile_query(&self, needle: &str) -> Query {
let mut query = Query {
needle_chars: Vec::new(),
is_ascii: true,
ignore_case: self.config.case_matching == CaseMatching::Ignore,
};
query.push(
needle,
self.config.normalize,
self.config.case_matching == CaseMatching::Smart,
);
query
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
assert!(haystack.len() <= u32::MAX as usize);
self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new())
}
pub fn recompile_query(&self, query: &mut Query, needle: &str) {
query.needle_chars.clear();
query.is_ascii = false;
query.ignore_case = self.config.case_matching == CaseMatching::Ignore;
query.push(
needle,
self.config.normalize,
self.config.case_matching == CaseMatching::Smart,
);
}
pub fn append_query(&self, query: &mut Query, needle: &str) {
query.push(
needle,
self.config.normalize,
self.config.case_matching == CaseMatching::Smart,
);
}
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<u16> {
if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize]
}
if self.config.use_v1 {
if query.is_ascii && !self.config.normalize {
self.fuzzy_matcher_v1::<false, true>(query, haystack, &mut Vec::new())
} else {
self.fuzzy_matcher_v1::<false, false>(query, haystack, &mut Vec::new())
}
} else if query.is_ascii && !self.config.normalize {
self.fuzzy_matcher_v2::<false, true>(query, haystack, &mut Vec::new())
} else {
self.fuzzy_matcher_v2::<false, false>(query, haystack, &mut Vec::new())
}
}
pub fn fuzzy_indicies(
fn fuzzy_matcher_impl<const INDICIES: bool>(
&mut self,
query: &Query,
mut haystack: &str,
indicies: &mut Vec<u32>,
haystack: Utf32Str<'_>,
needle_: Utf32Str<'_>,
indidies: &mut Vec<u32>,
) -> Option<u16> {
if haystack.len() > u32::MAX as usize {
haystack = &haystack[..u32::MAX as usize]
}
if self.config.use_v1 {
if query.is_ascii && !self.config.normalize {
self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
} else {
self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
}
} else if query.is_ascii && !self.config.normalize {
self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
} else {
self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
}
}
#[inline(always)]
fn normalize_char<const ASCII_ONLY: bool>(&self, ignore_case: bool, mut c: char) -> char {
if ignore_case {
c = to_lower_case::<ASCII_ONLY>(c)
}
if !ASCII_ONLY && self.config.normalize {
c = normalize(c)
}
c
}
fn prefilter_ascii(&self, query: &Query, mut haystack: &[u8]) -> Option<(usize, usize)> {
let needle = &query.needle_chars;
if query.ignore_case {
let first_idx = find_ascii_ignore_case(needle[0] as u8, haystack)?;
let mut last_idx = first_idx + 1;
haystack = &haystack[last_idx..];
for &c in &needle[1..] {
let idx = find_ascii_ignore_case(c as u8, haystack)? + 1;
last_idx += idx;
haystack = &haystack[idx..];
}
Some((first_idx, last_idx))
} else {
let first_idx = memchr(needle[0] as u8, haystack)?;
let mut last_idx = first_idx + 1;
haystack = &haystack[last_idx..];
for &c in &needle[1..] {
let idx = memchr(c as u8, haystack)? + 1;
last_idx += idx;
haystack = &haystack[idx..];
}
Some((first_idx, last_idx))
}
}
fn prefilter_non_ascii(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
let needle_char = query.needle_chars[0];
let mut text = haystack
.char_indices()
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
let (match_start, c) = text.find(|&(_, c)| c == needle_char)?;
Some((match_start, match_start + c.len_utf8()))
}
fn prefilter(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
// quickly reject small matches
if query.needle_chars.len() > haystack.len() {
return None;
}
if query.is_ascii {
self.prefilter_ascii(query, haystack.as_bytes())
} else {
self.prefilter_non_ascii(query, haystack)
}
}
fn fuzzy_matcher_v1<const INDICIES: bool, const ASCII_ONLY: bool>(
&mut self,
query: &Query,
haystack: &str,
indicies: &mut Vec<u32>,
) -> Option<u16> {
let (start, end) = self.prefilter(query, haystack)?;
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
query, haystack, start, end, indicies,
)
}
fn fuzzy_matcher_v1_with_prefilter<const INDICIES: bool, const ASCII_ONLY: bool>(
&mut self,
query: &Query,
haystack: &str,
mut start: usize,
mut end: usize,
indicies: &mut Vec<u32>,
) -> Option<u16> {
let first_char_end = if ASCII_ONLY { start + 1 } else { end };
if !ASCII_ONLY && query.needle_chars.len() != 1 {
let mut needle_iter = query.needle_chars[1..].iter().copied();
if let Some(mut needle_char) = needle_iter.next() {
let haystack = haystack[first_char_end..]
.char_indices()
.rev()
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
for (i, c) in haystack {
if c == needle_char {
let Some(next_needle_char) = needle_iter.next() else {
end = i + c.len_utf8();
break;
};
needle_char = next_needle_char;
}
}
}
}
// very simple, just mimimize from the back
let match_ = haystack[first_char_end..end]
.char_indices()
.rev()
.map(|(i, c)| (i, self.normalize_char::<ASCII_ONLY>(query.ignore_case, c)));
let mut needle_iter = query.needle_chars[..].iter().rev().copied();
let mut needle_char = needle_iter.next().unwrap();
for (i, c) in match_ {
if c == needle_char {
let Some(next_needle_char) = needle_iter.next() else {
start = i;
break;
};
needle_char = next_needle_char;
}
}
Some(self.calculate_score::<INDICIES, ASCII_ONLY>(query, haystack, start, end, indicies))
}
fn calculate_score<const INDICIES: bool, const ASCII_ONLY: bool>(
&mut self,
query: &Query,
text: &str,
match_start: usize,
match_end: usize,
indicies: &mut Vec<u32>,
) -> u16 {
if INDICIES {
indicies.reserve(query.needle_chars.len());
}
let mut prev_class = text[..match_start]
.chars()
.next_back()
.map(|c| self.config.char_class(c))
.unwrap_or(self.config.inital_char_class);
let mut needle_idx = 0;
let mut score = 0u16;
let mut in_gap = false;
let mut consecutive = 0;
let mut first_bonus = 0u16;
for (i, mut c) in text[match_start..match_end].char_indices() {
let class = self.config.char_class(c);
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
c = to_lower_case::<ASCII_ONLY>(c);
}
if self.config.normalize && !ASCII_ONLY {
c = normalize(c)
}
if c == query.needle_chars[needle_idx] {
if INDICIES {
indicies.push(i as u32)
}
score += SCORE_MATCH;
let mut bonus = self.config.bonus_for(prev_class, class);
if consecutive == 0 {
first_bonus = bonus
} else {
// Break consecutive chunk
if bonus > first_bonus {
if bonus >= BONUS_BOUNDARY {
first_bonus = bonus;
} else {
bonus = max(bonus, BONUS_CONSECUTIVE);
}
} else {
bonus = max(first_bonus, BONUS_CONSECUTIVE);
}
}
if needle_idx == 0 {
bonus *= BONUS_FIRST_CHAR_MULTIPLIER;
}
score += bonus;
needle_idx += 1;
in_gap = false;
consecutive += 1;
} else {
let penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
PENALTY_GAP_START
};
score = score.saturating_sub(penalty);
in_gap = true;
consecutive = 0;
first_bonus = 0;
}
prev_class = class;
}
score
}
fn fuzzy_matcher_v2<const INDICIES: bool, const ASCII_ONLY: bool>(
&mut self,
query: &Query,
text: &str,
indicies: &mut Vec<u32>,
) -> Option<u16> {
let (start, prefilter_end) = self.prefilter(query, text)?;
let text_len = text.len() - start;
// fallback to v1 algorithms for long haystacks
// technically we need to multiply by char len here
// but counting chars has a lot of unecessary overhead that we can avoid
// here in practice using bytelen should be a reasonable approximation
// we also differ from fzf here in that we never allocate and instead stringintly check here
if text_len > u16::MAX as usize || text_len * query.needle_chars.len() > MAX_HAYSTACK_LEN {
return self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
query,
text,
start,
prefilter_end,
indicies,
);
}
let mut prev_class = text[..start]
.chars()
.next_back()
.map(|c| self.config.char_class(c))
.unwrap_or(self.config.inital_char_class);
let text = &text[start..];
let mut needle_iter = query.needle_chars[..]
.iter()
.copied()
.zip(self.first_needle_occurance.iter_mut());
let (mut needle_char, mut needle_char_idx) = needle_iter.next().unwrap();
let iter = text[start..]
.chars()
.zip(self.matrix.iter_mut())
.zip(self.haystack.iter_mut())
.enumerate();
let mut last_matched_idx = 0;
let mut max_score = 0;
let mut max_score_pos = 0;
let mut in_gap = false;
let mut prev_score = 0u16;
let mut matched = false;
let first_needle_char = query.needle_chars[0];
for (i, ((mut c, matrix_cell), char_info)) in iter {
let class = self.config.char_class(c);
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
c = to_lower_case::<ASCII_ONLY>(c);
}
if self.config.normalize && !ASCII_ONLY {
c = normalize(c)
}
char_info.char = c;
let bonus = self.config.bonus_for(prev_class, class);
char_info.char = c;
prev_class = class;
let i = i as u16;
if c == needle_char {
// save the first idx of each char
if let Some(next) = needle_iter.next() {
*needle_char_idx = i;
(needle_char, needle_char_idx) = next
} else {
// we have atleast one match
matched = true;
}
// and the last matched char
last_matched_idx = i;
}
if c == first_needle_char {
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
matrix_cell.consecutive_chars = 1;
if query.needle_chars.len() == 1 && score > max_score {
max_score = score;
max_score_pos = i;
// can't get better than this
if bonus >= BONUS_BOUNDARY {
break;
}
}
in_gap = false;
} else {
let gap_penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
PENALTY_GAP_START
};
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
matrix_cell.consecutive_chars = 0;
in_gap = true;
}
prev_score = matrix_cell.score;
}
if !matched {
debug_assert!(!ASCII_ONLY, "prefilter should have rejected");
return None;
}
if query.needle_chars.len() == 1 {
indicies.push(max_score_pos as u32);
return Some(max_score);
}
assert_eq!(
self.first_needle_occurance[0], 0,
"prefilter should have put us at the start of the match"
assert!(
haystack.len() <= u32::MAX as usize,
"fuzzy matching is only support for up to 2^32-1 codepoints"
);
let haystack_len = last_matched_idx as usize + 1;
let (max_score, best_match_end) = self.popultate_matrix(haystack_len, query);
if INDICIES {
indicies.reserve(query.needle_chars.len());
let mut col = best_match_end;
let mut needle_iter = self.matrix[..haystack_len * query.needle_chars.len()]
.windows(haystack_len)
.zip(self.first_needle_occurance[..haystack_len].iter())
.rev()
.peekable();
let mut next_row = None;
let (mut row, mut first_needle_occurance) = needle_iter.next().unwrap();
let mut prefer_match = true;
loop {
let score = row[col as usize].score;
let mut score1 = 0;
let mut score2 = 0;
if let Some((prev_row, _)) = needle_iter.peek() {
if col >= *first_needle_occurance {
score1 = prev_row[col as usize].score;
}
}
if col > *first_needle_occurance {
score2 = row[col as usize - 1].score;
}
if score > score1 && (score > score2 || score == score2 && prefer_match) {
indicies.push(col as u32 + start as u32);
next_row = Some(row);
let Some(next) = needle_iter.next() else {
break;
};
(row, first_needle_occurance) = next
}
prefer_match = row[col as usize].consecutive_chars > 1;
if !prefer_match && col + 1 < query.needle_chars.len() as u16 {
if let Some(next_row) = next_row {
prefer_match = next_row[col as usize + 1].consecutive_chars > 0
}
}
col -= 1;
match (haystack, needle_) {
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?;
self.fuzzy_match_optimal::<INDICIES, u8, u8>(
haystack, needle, start, greedy_end, end, indidies,
)
}
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
// a purely ascii haystack can never be transformed to match
// a needle that contains non-ascii chars since we don't allow gaps
None
}
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
todo!()
// let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
// self.fuzzy_match_optimal::<INDICIES, char, u8>(
// haystack,
// needle,
// start,
// start + 1,
// end,
// indidies,
// )
}
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
self.fuzzy_match_optimal::<INDICIES, char, char>(
haystack,
needle,
start,
start + 1,
end,
indidies,
)
}
}
Some(max_score)
}
/// Fills in the rows of the score matrix for every needle char after the first
/// one and returns `(max_score, max_score_end)`: the highest score found in the
/// row of the last needle char and the haystack column where it was found.
///
/// `self.matrix` is treated as consecutive rows of `haystack_len` cells, one row
/// per needle char. The first row is not touched here.
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) {
let mut max_score = 0;
let mut max_score_end = 0;
let mut iter = query
.needle_chars
.iter()
.zip(self.first_needle_occurance.iter())
.zip(self.matrix.chunks_mut(haystack_len))
.enumerate();
// skip the first row: its initial scores were already calculated
let (_, ((&_, &_), mut prev_matrix_row)) = iter.next().unwrap();
for (i, ((&needle_char, &first_occurance), matrix_row)) in iter {
// help the optimizer out a little
assert!((first_occurance as usize) < matrix_row.len());
assert!(first_occurance != 0);
let mut in_gap = false;
// only the columns from `first_occurance` onwards are computed for this row
let haystack = &self.haystack[first_occurance as usize..haystack_len];
let mut prev_matrix_cell = matrix_row[first_occurance as usize - 1];
let matrix_row = &mut matrix_row[first_occurance as usize..haystack_len];
// the previous row shifted one column to the left, so that zipping it with
// the current row yields the diagonal predecessor of every cell
let prev_matrix_diagonal =
&mut prev_matrix_row[first_occurance as usize - 1..haystack_len - 1];
for (j, ((&haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
.iter()
.zip(matrix_row.iter_mut())
.zip(prev_matrix_diagonal.iter())
.enumerate()
{
// absolute haystack column of the current cell
let col = j + first_occurance as usize;
let gap_penalty = if in_gap {
PENALTY_GAP_EXTENSION
} else {
PENALTY_GAP_START
};
// score1: score if this haystack char is matched with the needle char (diagonal move)
// score2: score if this haystack char is skipped (move within the current row)
let mut score1 = 0;
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
let mut consecutive = 0;
if haystack_char.char == needle_char {
score1 = diag_matrix_cell.score + SCORE_MATCH;
let mut bonus = haystack_char.bonus;
consecutive = diag_matrix_cell.consecutive_chars + 1;
if consecutive > 1 {
// NOTE(review): `consecutive` already counts the current char, so
// `col - consecutive` is the column just before the consecutive run;
// confirm that this (and not `col - consecutive + 1`) is intended
let first_bonus = self.haystack[col - consecutive as usize].bonus;
if bonus > first_bonus {
if bonus > BONUS_BOUNDARY {
consecutive = 1
} else {
bonus = max(bonus, BONUS_CONSECUTIVE)
}
} else {
bonus = max(first_bonus, BONUS_CONSECUTIVE)
}
}
// if matching (including the consecutive bonus) is worse than skipping,
// only the plain bonus of the char is added and the run is reset
if score1 + bonus < score2 {
score1 += haystack_char.bonus;
consecutive = 0;
} else {
score1 += bonus;
}
}
in_gap = score1 < score2;
// NOTE(review): the scores appear to be unsigned (`saturating_sub`, u16
// constants), in which case the outer `max(.., 0)` has no effect — confirm
let score = max(max(score1, score2), 0);
// NOTE(review): this copies the cell *before* the new score is stored below,
// so the next column sees the previous contents of this cell rather than the
// score computed here — confirm this is intended
prev_matrix_cell = *matrix_cell;
// the overall best score is only tracked in the row of the last needle char
if i == query.needle_chars.len() - 1 && score > max_score {
max_score = score;
max_score_end = col as u16;
}
matrix_cell.consecutive_chars = consecutive;
matrix_cell.score = score;
}
// NOTE(review): `matrix_row` was shadowed above by the sub-slice starting at
// `first_occurance`, so the next iteration indexes into that sub-slice — confirm
prev_matrix_row = matrix_row;
}
(max_score, max_score_end)
}
}
/// Lowercases `c` for case-insensitive comparison.
///
/// ASCII uppercase letters are mapped to their lowercase form. Non-ASCII chars
/// are looked up in the simple case folding table unless `ASCII_ONLY` is set.
/// Every other char is returned unchanged.
#[inline(always)]
fn to_lower_case<const ASCII_ONLY: bool>(c: char) -> char {
    if c.is_ascii_uppercase() {
        return c.to_ascii_lowercase();
    }
    if ASCII_ONLY || c.is_ascii() {
        return c;
    }
    // the table is searched by its first (uppercase) entry
    match case_fold::CASE_FOLDING_SIMPLE.binary_search_by_key(&c, |&(upper, _)| upper) {
        Ok(idx) => case_fold::CASE_FOLDING_SIMPLE[idx].1,
        Err(_) => c,
    }
    // pub fn fuzzy_indicies(
    // &mut self,
    // query: &Query,
    // mut haystack: Utf32Str<'_>,
    // indicies: &mut Vec<u32>,
    // ) -> Option<u16> {
    // if haystack.len() > u32::MAX as usize {
    // haystack = &haystack[..u32::MAX as usize]
    // }
    // println!(
    // "start {haystack:?}, {:?} {} {}",
    // query.needle_chars, query.ignore_case, query.is_ascii
    // );
    // if self.config.use_v1 {
    // if query.is_ascii && !self.config.normalize {
    // self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
    // } else {
    // self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
    // }
    // } else if query.is_ascii && !self.config.normalize {
    // self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
    // } else {
    // self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
    // }
    // }
}

280
src/matrix.rs Normal file
View File

@ -0,0 +1,280 @@
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
use std::fmt::{Debug, Formatter, Result};
use std::marker::PhantomData;
use std::mem::{size_of, take};
use std::ops::Index;
use std::ptr::{slice_from_raw_parts_mut, NonNull};
use crate::chars::Char;
// Maximum number of matrix cells. A `MatrixCell` consists of two `u16`s (4 bytes),
// so the cells take up to 4 * 100 * 1024 bytes = 400 KiB.
// NOTE(review): the previous comment here read "4*60*1024 = 240KB", which does not
// match the value — confirm which of the two is intended.
const MAX_MATRIX_SIZE: usize = 100 * 1024;
// these two aren't hard maxima, instead we simply allow whatever will fit into memory
// (they only determine the size of the arrays in `MatrixData`)
const MAX_HAYSTACK_LEN: usize = 2048; // 2048 chars = 8 KiB, plus 2048 u16 bonuses = 4 KiB
const MAX_NEEDLE_LEN: usize = 2048; // 2048 u16 row offsets = 4 KiB
/// Describes where the haystack, bonus, row offset and cell arrays of a single
/// matrix are located inside one shared allocation.
struct MatrixLayout<C: Char> {
    haystack_len: usize,
    needle_len: usize,
    cell_count: usize,
    /// Combined layout of all four arrays.
    layout: Layout,
    /// Byte offsets of the individual arrays from the start of the allocation.
    haystack_off: usize,
    bonus_off: usize,
    rows_off: usize,
    cells_off: usize,
    _phantom: PhantomData<C>,
}

impl<C: Char> MatrixLayout<C> {
    /// Computes the combined layout of `haystack_len` chars, `haystack_len`
    /// bonuses, `needle_len` row offsets and `cell_count` cells (in that order).
    ///
    /// # Panics
    ///
    /// Panics if any of the layouts can not be represented.
    fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout<C> {
        let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
        let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();
        let rows_layout = Layout::array::<u16>(needle_len).unwrap();
        let cells_layout = Layout::array::<MatrixCell>(cell_count).unwrap();

        // append the arrays one after the other, remembering where each one starts
        let layout = Layout::from_size_align(0, 1).unwrap();
        let (layout, haystack_off) = layout.extend(haystack_layout).unwrap();
        let (layout, bonus_off) = layout.extend(bonus_layout).unwrap();
        let (layout, rows_off) = layout.extend(rows_layout).unwrap();
        let (layout, cells_off) = layout.extend(cells_layout).unwrap();

        Self {
            haystack_len,
            needle_len,
            cell_count,
            layout,
            haystack_off,
            bonus_off,
            rows_off,
            cells_off,
            _phantom: PhantomData,
        }
    }

    /// Splits the allocation at `ptr` into raw slices for the haystack, the
    /// bonuses, the row offsets and the cells.
    ///
    /// # Safety
    ///
    /// `ptr` must point to the start of an allocation that is large enough and
    /// sufficiently aligned for `self.layout`: all offsets are added to `ptr`
    /// without any further checks.
    unsafe fn fieds_from_ptr(
        &self,
        ptr: NonNull<u8>,
    ) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) {
        let base = ptr.as_ptr();
        let haystack =
            slice_from_raw_parts_mut(base.add(self.haystack_off).cast::<C>(), self.haystack_len);
        let bonus =
            slice_from_raw_parts_mut(base.add(self.bonus_off).cast::<u16>(), self.haystack_len);
        let rows = slice_from_raw_parts_mut(base.add(self.rows_off).cast::<u16>(), self.needle_len);
        let cells = slice_from_raw_parts_mut(
            base.add(self.cells_off).cast::<MatrixCell>(),
            self.cell_count,
        );
        (haystack, bonus, rows, cells)
    }
}
/// A single cell of the score matrix.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) struct MatrixCell {
    pub score: u16,
    pub consecutive_chars: u16,
}

impl Debug for MatrixCell {
    /// Formats the cell like the tuple `(score, consecutive_chars)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let MatrixCell {
            score,
            consecutive_chars,
        } = *self;
        Debug::fmt(&(score, consecutive_chars), f)
    }
}
/// A haystack char together with its bonus.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) struct HaystackChar<C: Char> {
    pub char: C,
    pub bonus: u16,
}

impl<C: Char> Debug for HaystackChar<C> {
    /// Formats the entry like the tuple `(char, bonus)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let HaystackChar { char, bonus } = *self;
        Debug::fmt(&(char, bonus), f)
    }
}
/// A shared view of one matrix row: the stored cells plus the number of
/// leading columns (`off`) that are not stored.
#[derive(Clone, Copy)]
pub(crate) struct MatrixRow<'a> {
    pub off: u16,
    pub cells: &'a [MatrixCell],
}

impl Index<u16> for MatrixRow<'_> {
    type Output = MatrixCell;

    /// Indexes directly into `cells`; `off` is not subtracted from `index`.
    fn index(&self, index: u16) -> &Self::Output {
        let idx = usize::from(index);
        &self.cells[idx]
    }
}

impl Debug for MatrixRow<'_> {
    /// Prints the row as a list, with one `(0, 0)` placeholder for each of the
    /// `off` leading columns.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut list = f.debug_list();
        for _ in 0..self.off {
            list.entry(&(0, 0));
        }
        for cell in self.cells {
            list.entry(cell);
        }
        list.finish()
    }
}
/// A mutable view of one matrix row: the stored cells plus the number of
/// leading columns (`off`) that are not stored.
pub(crate) struct MatrixRowMut<'a> {
    pub off: u16,
    pub cells: &'a mut [MatrixCell],
}

impl Debug for MatrixRowMut<'_> {
    /// Prints the row as a list, with one `(0, 0)` placeholder for each of the
    /// `off` leading columns.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut list = f.debug_list();
        for _ in 0..self.off {
            list.entry(&(0, 0));
        }
        for cell in self.cells.iter() {
            list.entry(cell);
        }
        list.finish()
    }
}
/// Adapter that prints the items of a cloneable iterator as a debug list.
pub struct DebugList<I>(I);

impl<I> Debug for DebugList<I>
where
    I: Iterator + Clone,
    I::Item: Debug,
{
    /// Clones the wrapped iterator and prints every item it yields.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut list = f.debug_list();
        for item in self.0.clone() {
            list.entry(&item);
        }
        list.finish()
    }
}
/// The score matrix together with the haystack it was built for.
pub(crate) struct Matrix<'a, C: Char> {
    pub haystack: &'a mut [C],
    // stored as a separate array instead of a struct
    // to avoid padding since char is too large and u8 too small :/
    pub bonus: &'a mut [u16],
    pub row_offs: &'a mut [u16],
    pub cells: &'a mut [MatrixCell],
}

impl<'a, C: Char> Matrix<'a, C> {
    /// Iterates over the rows from first to last. A row with offset `off`
    /// consists of the next `haystack.len() - off` cells.
    pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
        let mut remaining = &*self.cells;
        self.row_offs.iter().map(move |&off| {
            let row_len = self.haystack.len() - off as usize;
            let (cells, rest) = remaining.split_at(row_len);
            remaining = rest;
            MatrixRow { off, cells }
        })
    }

    /// Iterates over the rows from last to first, splitting each row off the
    /// end of the remaining cells.
    pub fn rows_rev(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator {
        let mut remaining = &*self.cells;
        self.row_offs.iter().rev().map(move |&off| {
            let row_len = self.haystack.len() - off as usize;
            let (rest, cells) = remaining.split_at(remaining.len() - row_len);
            remaining = rest;
            MatrixRow { off, cells }
        })
    }

    /// Iterates over all haystack chars paired with their bonus.
    pub fn haystack(
        &self,
    ) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
        haystack(self.haystack, self.bonus, 0)
    }
}

impl<'a, C: Char> Debug for Matrix<'a, C> {
    /// Prints the haystack (with bonuses) and all matrix rows.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut out = f.debug_struct("Matrix");
        out.field("haystack", &DebugList(self.haystack()));
        out.field("matrix", &DebugList(self.rows()));
        out.finish()
    }
}
/// Pairs every char of `haystack` with the entry of `bonus` at the same
/// position, leaving out the first `skip` positions of both slices.
///
/// # Panics
///
/// Panics if `skip` is larger than the length of either slice.
pub(crate) fn haystack<'a, C: Char>(
    haystack: &'a [C],
    bonus: &'a [u16],
    skip: u16,
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + Clone + 'a {
    let skip = usize::from(skip);
    let chars = haystack[skip..].iter().copied();
    let bonuses = bonus[skip..].iter().copied();
    chars
        .zip(bonuses)
        .map(|(char, bonus)| HaystackChar { char, bonus })
}
/// Splits `cells` into one mutable row per entry of `row_offs`. A row with
/// offset `off` receives the next `haystack_len - off` cells.
pub(crate) fn rows_mut<'a>(
    row_offs: &'a [u16],
    mut cells: &'a mut [MatrixCell],
    haystack_len: usize,
) -> impl Iterator<Item = MatrixRowMut<'a>> + ExactSizeIterator + 'a {
    row_offs.iter().map(move |&off| {
        let row_len = haystack_len - off as usize;
        // move the remaining cells out of the closure state so that the row can
        // borrow from them for the full lifetime `'a`
        let remaining = take(&mut cells);
        let (row_cells, rest) = remaining.split_at_mut(row_len);
        cells = rest;
        MatrixRowMut {
            off,
            cells: row_cells,
        }
    })
}
// This struct is never instantiated: it is only used to construct the layout
// (size and alignment) for the slab allocation. Its arrays mirror the four
// arrays handed out as a `Matrix`, each at its maximum size.
#[allow(unused)]
struct MatrixData {
haystack: [char; MAX_HAYSTACK_LEN],
bonus: [u16; MAX_HAYSTACK_LEN],
row_offs: [u16; MAX_NEEDLE_LEN],
cells: [MatrixCell; MAX_MATRIX_SIZE],
}
// const MATRIX_ALLOC_LAYOUT: Layout =
// MatrixLayout::<char>::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout;
/// Owns a single heap allocation with the layout of `MatrixData` from which
/// matrices are handed out by [`MatrixSlab::alloc`].
pub(crate) struct MatrixSlab(NonNull<u8>);
impl MatrixSlab {
/// Allocates a zeroed slab. Calls `handle_alloc_error` if the allocation fails.
pub fn new() -> Self {
let layout = Layout::new::<MatrixData>();
// SAFETY: the matrix is never zero sized (hardcoded constants)
let ptr = unsafe { alloc_zeroed(layout) };
let Some(ptr) = NonNull::new(ptr) else{
handle_alloc_error(layout)
};
MatrixSlab(ptr.cast())
}
/// Hands out a `Matrix` for `haystack_` and a needle of `needle_len` chars that
/// is backed by this slab. The haystack is copied into the slab.
///
/// Returns `None` if `haystack_.len() * needle_len` exceeds `MAX_MATRIX_SIZE`,
/// if the haystack is longer than `u16::MAX` or if the required layout is
/// larger than `MatrixData`.
pub(crate) fn alloc<C: Char>(
&mut self,
haystack_: &[C],
needle_len: usize,
) -> Option<Matrix<'_, C>> {
let cells = haystack_.len() * needle_len;
if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize {
return None;
}
// NOTE(review): `haystack_.len() - needle_len / 2` underflows if the needle is
// more than twice as long as the haystack (e.g. an empty haystack) — presumably
// callers never pass such inputs; confirm or guard against it
let matrix_layout = MatrixLayout::<C>::new(
haystack_.len(),
needle_len,
(haystack_.len() - needle_len / 2) * needle_len,
);
if matrix_layout.layout.size() > size_of::<MatrixData>() {
return None;
}
unsafe {
// SAFETY: the slab was allocated with the layout of `MatrixData` and the
// size of `matrix_layout` was checked against it above
// NOTE(review): only the size is checked here; alignment presumably holds
// because `MatrixData` contains a `char` array — confirm
let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0);
// copy the haystack before creating references to ensure we don't create
// references to invalid chars (which may or may not be UB)
haystack_
.as_ptr()
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());
Some(Matrix {
haystack: &mut *haystack,
row_offs: &mut *rows,
bonus: &mut *bonus,
cells: &mut *cells,
})
}
}
}
impl Drop for MatrixSlab {
// frees the slab with the same layout that was used to allocate it in `new`
fn drop(&mut self) {
unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatrixData>()) };
}
}

0
src/multizip.rs Normal file
View File

73
src/prefilter.rs Normal file
View File

@ -0,0 +1,73 @@
use ::memchr::{memchr, memchr2, memrchr, memrchr2};
use crate::chars::Char;
use crate::utf32_str::Utf32Str;
use crate::Matcher;
/// Returns the index of the first byte of `haystack` that equals `c`,
/// ignoring ASCII case.
///
/// Only a lowercase ASCII letter `c` is additionally matched against its
/// uppercase form; any other byte (including an uppercase letter) is matched
/// exactly.
#[inline(always)]
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
    // Both bounds must hold (`&&`): with `||` the condition was always true, so
    // non-letters were also matched against `c - 32` (and `c - 32` underflowed
    // for bytes below 32).
    if c.is_ascii_lowercase() {
        memchr2(c, c.to_ascii_uppercase(), haystack)
    } else {
        memchr(c, haystack)
    }
}
/// Returns the index of the last byte of `haystack` that equals `c`,
/// ignoring ASCII case.
///
/// Only a lowercase ASCII letter `c` is additionally matched against its
/// uppercase form; any other byte (including an uppercase letter) is matched
/// exactly.
#[inline(always)]
fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option<usize> {
    // Both bounds must hold (`&&`): with `||` the condition was always true, so
    // non-letters were also matched against `c - 32` (and `c - 32` underflowed
    // for bytes below 32).
    if c.is_ascii_lowercase() {
        memrchr2(c, c.to_ascii_uppercase(), haystack)
    } else {
        memrchr(c, haystack)
    }
}
impl Matcher {
    /// Quickly checks whether `needle` can match `haystack` at all and narrows
    /// down the part of the haystack that has to be looked at.
    ///
    /// Returns `(start, greedy_end, end)`:
    /// * `start`: index of the first occurrence of the first needle char
    /// * `greedy_end`: exclusive end of the match found by always taking the
    ///   next occurrence of each needle char
    /// * `end`: exclusive end of the last occurrence of the last needle char
    ///   (equal to `greedy_end` if it does not occur again)
    ///
    /// Returns `None` if the needle chars do not occur in order in `haystack`.
    ///
    /// # Panics
    ///
    /// Panics if `needle` is empty.
    pub(crate) fn prefilter_ascii(
        &self,
        mut haystack: &[u8],
        needle: &[u8],
    ) -> Option<(usize, usize, usize)> {
        if self.config.ignore_case {
            let start = find_ascii_ignore_case(needle[0], haystack)?;
            let mut eager_end = start + 1;
            haystack = &haystack[eager_end..];
            for &c in &needle[1..] {
                let idx = find_ascii_ignore_case(c, haystack)? + 1;
                eager_end += idx;
                haystack = &haystack[idx..];
            }
            // `+ 1` turns the index of the last occurrence into an exclusive end,
            // otherwise that occurrence would be cut off
            let end = eager_end
                + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack)
                    .map_or(0, |idx| idx + 1);
            Some((start, eager_end, end))
        } else {
            let start = memchr(needle[0], haystack)?;
            let mut eager_end = start + 1;
            haystack = &haystack[eager_end..];
            for &c in &needle[1..] {
                let idx = memchr(c, haystack)? + 1;
                eager_end += idx;
                haystack = &haystack[idx..];
            }
            // `+ 1` turns the index of the last occurrence into an exclusive end,
            // otherwise that occurrence would be cut off
            let end = eager_end
                + memrchr(*needle.last().unwrap(), haystack).map_or(0, |idx| idx + 1);
            Some((start, eager_end, end))
        }
    }

    /// Narrows down the part of a non-ASCII `haystack` that can contain a match.
    ///
    /// Returns the half-open range `(start, end)` that begins at the first
    /// occurrence of the (normalized) first needle char and ends right after the
    /// last occurrence of the last needle char at or after `start`. Returns
    /// `None` if either char does not occur.
    pub(crate) fn prefilter_non_ascii(
        &self,
        haystack: &[char],
        needle: Utf32Str<'_>,
    ) -> Option<(usize, usize)> {
        let first_char = needle.get(0);
        let start = haystack
            .iter()
            .position(|c| c.normalize(&self.config) == first_char)?;
        let last_char = needle.last();
        // search from the back and convert the position (which is relative to
        // `start`) into an absolute, exclusive end
        let end = start
            + haystack[start..]
                .iter()
                .rposition(|c| c.normalize(&self.config) == last_char)?
            + 1;
        Some((start, end))
    }
}

145
src/score.rs Normal file
View File

@ -0,0 +1,145 @@
use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::{Matcher, MatcherConfig};
// Score awarded for every matched char.
pub(crate) const SCORE_MATCH: u16 = 16;
// Penalty for the first skipped char of a gap.
pub(crate) const PENALTY_GAP_START: u16 = 3;
// Penalty for every further skipped char of a gap.
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
// shorter fuzzy matches. The bonus point here was specifically chosen that
// the bonus is cancelled when the gap between the acronyms grows over
// 8 characters, which is approximately the average length of the words found
// in web2 dictionary and my file system.
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
// Although bonus point for non-word characters is non-contextual, we need it
// for computing bonus points for consecutive chunks starting with a non-word
// character.
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
// Edge-triggered bonus for matches in camelCase words.
// Compared to word-boundary case, they don't accompany single-character gaps
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
// Minimum bonus point given to characters in consecutive chunks.
// Note that bonus points for consecutive matches wouldn't have been needed if we
// used a fixed match score as in the original algorithm.
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
// The first character in the typed pattern usually has more significance
// than the rest so it's important that it appears at special positions where
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
// The amount of the extra bonus should be limited so that the gap penalty is
// still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
impl MatcherConfig {
    /// Returns the bonus for a char of class `class` that is preceded by a char
    /// of class `prev_class`.
    #[inline]
    pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
        // transition from non word to word
        let entering_word = class > CharClass::NonWord;
        match prev_class {
            CharClass::Whitespace if entering_word => return self.bonus_boundary_white,
            CharClass::Delimiter if entering_word => return self.bonus_boundary_delimiter,
            CharClass::NonWord if entering_word => return BONUS_BOUNDARY,
            _ => {}
        }

        // camelCase letter123
        let camel_case = prev_class == CharClass::Lower && class == CharClass::Upper;
        let number_start = prev_class != CharClass::Number && class == CharClass::Number;
        if camel_case || number_start {
            return BONUS_CAMEL123;
        }

        match class {
            CharClass::NonWord => BONUS_NON_WORD,
            CharClass::Whitespace => self.bonus_boundary_white,
            _ => 0,
        }
    }
}
impl Matcher {
    /// Shorthand for [`MatcherConfig::bonus_for`] using the config of this matcher.
    #[inline(always)]
    pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
        self.config.bonus_for(prev_class, class)
    }

    /// Calculates the score of the greedy match of `needle` within
    /// `haystack[start..end]`.
    ///
    /// `haystack[start]` is treated as the match of the first needle char
    /// (assumes `start` is the match start found by the caller). Every following
    /// haystack char is either matched against the next needle char or counted
    /// as (part of) a gap.
    ///
    /// If `INDICIES` is set, the haystack index of every matched char is pushed
    /// to `indicies`.
    ///
    /// # Panics
    ///
    /// Panics if `needle` is empty or if `start`/`end` are out of bounds.
    pub(crate) fn calculate_score<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
        &mut self,
        haystack: &[H],
        needle: &[N],
        start: usize,
        end: usize,
        indicies: &mut Vec<u32>,
    ) -> u16 {
        if INDICIES {
            indicies.reserve(needle.len());
        }

        let mut prev_class = start
            .checked_sub(1)
            .map(|i| haystack[i].char_class(&self.config))
            .unwrap_or(self.config.inital_char_class);
        let mut needle_iter = needle.iter();
        let mut needle_char = *needle_iter.next().expect("needle must not be empty");

        let mut in_gap = false;
        let mut consecutive = 1;

        // unrolled the first iteration to make applying the first char multiplier less awkward
        if INDICIES {
            indicies.push(start as u32)
        }
        // the first matched char is at `start` (not at index 0), consistent with
        // `prev_class` which was taken from `start - 1`
        let class = haystack[start].char_class(&self.config);
        let mut first_bonus = self.bonus_for(prev_class, class);
        let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER;
        prev_class = class;
        // the first needle char has been consumed by the unrolled iteration,
        // the loop below continues with the second one
        if let Some(&next) = needle_iter.next() {
            needle_char = next;
        }

        for (i, c) in haystack[start + 1..end].iter().enumerate() {
            let class = c.char_class(&self.config);
            let c = c.normalize(&self.config);
            if c == needle_char {
                if INDICIES {
                    // `i` is relative to `start + 1`
                    indicies.push((start + 1 + i) as u32)
                }
                let mut bonus = self.bonus_for(prev_class, class);
                if consecutive == 0 {
                    // first char of a new consecutive chunk
                    first_bonus = bonus
                } else if bonus > first_bonus {
                    if bonus >= BONUS_BOUNDARY {
                        // a boundary inside the chunk becomes the new reference bonus
                        first_bonus = bonus;
                    } else {
                        bonus = max(bonus, BONUS_CONSECUTIVE);
                    }
                } else {
                    bonus = max(first_bonus, BONUS_CONSECUTIVE);
                }
                score += SCORE_MATCH + bonus;
                in_gap = false;
                consecutive += 1;
                if let Some(&next) = needle_iter.next() {
                    needle_char = next;
                }
            } else {
                let penalty = if in_gap {
                    PENALTY_GAP_EXTENSION
                } else {
                    PENALTY_GAP_START
                };
                score = score.saturating_sub(penalty);
                // a gap breaks the consecutive chunk
                in_gap = true;
                consecutive = 0;
                first_bonus = 0;
            }
            prev_class = class;
        }
        score
    }
}

270
src/tests.rs Normal file
View File

@ -0,0 +1,270 @@
use crate::config::{
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
};
use crate::{CaseMatching, Matcher, MatcherConfig};
/// Runs the matcher on every case and checks both the score and the matched range.
///
/// Each case is `(haystack, needle, start, end, bonus)`: `start..end` is the
/// expected range from the first to one past the last matched index, and `bonus`
/// is the expected score on top of `SCORE_MATCH` per needle char.
pub fn assert_matches(
    use_v1: bool,
    normalize: bool,
    case_sensitive: bool,
    path: bool,
    cases: &[(&str, &str, u32, u32, u16)],
) {
    let case_matching = if case_sensitive {
        CaseMatching::Respect
    } else {
        CaseMatching::Ignore
    };
    let mut config = MatcherConfig {
        use_v1,
        normalize,
        case_matching,
        ..MatcherConfig::DEFAULT
    };
    if path {
        config.set_match_paths();
    }
    let mut matcher = Matcher::new(config);
    let mut indicies = Vec::new();
    for &(haystack, needle, start, end, bonus) in cases {
        let expected_score = bonus + needle.chars().count() as u16 * SCORE_MATCH;
        let query = matcher.compile_query(needle);
        let res = matcher.fuzzy_indicies(&query, haystack, &mut indicies);
        assert_eq!(
            res,
            Some(expected_score),
            "{needle:?} did not match {haystack:?}"
        );
        let matched_range = indicies.first().copied()..indicies.last().map(|&i| i + 1);
        assert_eq!(
            matched_range,
            Some(start)..Some(end),
            "{needle:?} match {haystack:?}[{start}..{end}]"
        );
    }
}
// Boundary bonuses of the default matcher config, used to spell out the expected
// scores of the test cases.
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
#[test]
fn test_v2_fuzzy() {
assert_matches(
false,
false,
false,
false,
&[
(
"fooBarbaz1",
"oBZ",
2,
9,
BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
),
(
"foo bar baz",
"fbb",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
(
"/AutomatorDocument.icns",
"rdoc",
9,
13,
BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
),
(
"/man1/zshcompctl.1",
"zshc",
6,
10,
BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_BOUNDARY_DELIMITER * 3,
),
(
"/.oh-my-zsh/cache",
"zshc",
8,
13,
BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
- PENALTY_GAP_START
+ BONUS_BOUNDARY_DELIMITER,
),
(
"ab0123 456",
"12356",
3,
10,
BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
),
(
"abc123 456",
"12356",
3,
10,
BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_CAMEL123 * 2
+ BONUS_CONSECUTIVE
- PENALTY_GAP_START
- PENALTY_GAP_EXTENSION,
),
(
"foo/bar/baz",
"fbb",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
(
"fooBarBaz",
"fbb",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
- 2 * PENALTY_GAP_START
- 2 * PENALTY_GAP_EXTENSION,
),
(
"foo barbaz",
"fbb",
0,
8,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START * 2
- PENALTY_GAP_EXTENSION * 3,
),
(
"fooBar Baz",
"foob",
0,
4,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
),
(
"xFoo-Bar Baz",
"foo-b",
1,
6,
BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_CAMEL123 * 2
+ BONUS_NON_WORD
+ BONUS_BOUNDARY,
),
],
);
}
#[test]
fn test_v1_fuzzy() {
assert_matches(
true,
false,
false,
false,
&[
(
"fooBarbaz1",
"oBZ",
2,
9,
BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
),
(
"foo bar baz",
"fbb",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
(
"/AutomatorDocument.icns",
"rdoc",
9,
13,
BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
),
(
"/man1/zshcompctl.1",
"zshc",
6,
10,
BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_BOUNDARY_DELIMITER * 3,
),
(
"/.oh-my-zsh/cache",
"zshc",
8,
13,
BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
- PENALTY_GAP_START
+ BONUS_BOUNDARY_DELIMITER,
),
(
"ab0123 456",
"12356",
3,
10,
BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
),
(
"abc123 456",
"12356",
3,
10,
BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_CAMEL123 * 2
+ BONUS_CONSECUTIVE
- PENALTY_GAP_START
- PENALTY_GAP_EXTENSION,
),
(
"foo/bar/baz",
"fbb",
0,
9,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
- 2 * PENALTY_GAP_START
- 4 * PENALTY_GAP_EXTENSION,
),
(
"fooBarBaz",
"fbb",
0,
7,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
- 2 * PENALTY_GAP_START
- 2 * PENALTY_GAP_EXTENSION,
),
(
"foo barbaz",
"fbb",
0,
8,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
- PENALTY_GAP_START * 2
- PENALTY_GAP_EXTENSION * 3,
),
(
"fooBar Baz",
"foob",
0,
4,
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
),
(
"xFoo-Bar Baz",
"foo-b",
1,
6,
BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
+ BONUS_CAMEL123 * 2
+ BONUS_NON_WORD
+ BONUS_BOUNDARY,
),
],
);
}

123
src/utf32_str.rs Normal file
View File

@ -0,0 +1,123 @@
use std::ops::{Bound, RangeBounds};
/// A UTF-32 encoded (char array) string that can be used as an input to fuzzy matching.
///
/// Usually Rust's UTF-8 encoded strings are great. However, fuzzy matching
/// operates on codepoints (it should operate on graphemes but that's too much
/// hassle to deal with). We want to quickly iterate over these codepoints
/// (up to 5 times) during matching.
///
/// Doing codepoint segmentation on the fly not only blows through the cache
/// (lookup tables and icache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore there are a lot of extra optimizations available
/// for ASCII-only text (but checking during each match has too much overhead).
///
/// Of course this comes at an extra memory cost as we usually still need the UTF-8
/// encoded variant for rendering. In the (dominant) case of ASCII-only text
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
/// the user is typing on the fly so the same item is potentially matched many
/// times (making the upfront cost more worth it). That means that it's
/// basically always worth it to presegment the string.
///
/// For use cases that only match (a lot of) strings once it's possible to keep
/// a char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher will naturally
/// produce char indices (instead of UTF-8 offsets) anyway. With a
/// codepoint based representation like this the indices can be used
/// directly.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Debug)]
pub enum Utf32Str<'a> {
    /// A string represented as ASCII encoded bytes.
    /// Correctness invariant: must only contain valid ASCII (<=127)
    Ascii(&'a [u8]),
    /// A string represented as an array of unicode codepoints (basically UTF-32).
    Unicode(&'a [char]),
}

impl<'a> Utf32Str<'a> {
    /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str.
    ///
    /// ASCII-only strings are borrowed directly, `buf` is only used (cleared
    /// and refilled) for strings that contain non-ASCII chars.
    pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
        if str.is_ascii() {
            Utf32Str::Ascii(str.as_bytes())
        } else {
            buf.clear();
            buf.extend(str.chars());
            Utf32Str::Unicode(&*buf)
        }
    }

    /// Returns the number of codepoints in this string.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            Utf32Str::Unicode(codepoints) => codepoints.len(),
            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
        }
    }

    /// Returns whether this string contains no codepoints.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the substring covered by `range` (in codepoints).
    ///
    /// # Panics
    ///
    /// Panics if `range` is out of bounds.
    #[inline]
    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str<'_> {
        let start = match range.start_bound() {
            Bound::Included(&start) => start,
            Bound::Excluded(&start) => start + 1,
            Bound::Unbounded => 0,
        };
        // convert the end bound into an exclusive end: an included end needs
        // `+ 1`, an excluded end is used as is
        let end = match range.end_bound() {
            Bound::Included(&end) => end + 1,
            Bound::Excluded(&end) => end,
            Bound::Unbounded => self.len(),
        };
        match self {
            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
        }
    }

    /// Same as `slice` but accepts a u32 range for convenience since
    /// those are the indices returned by the matcher.
    ///
    /// # Panics
    ///
    /// Panics if `range` is out of bounds.
    #[inline]
    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str<'_> {
        let start = match range.start_bound() {
            Bound::Included(&start) => start as usize,
            Bound::Excluded(&start) => start as usize + 1,
            Bound::Unbounded => 0,
        };
        let end = match range.end_bound() {
            Bound::Included(&end) => end as usize + 1,
            Bound::Excluded(&end) => end as usize,
            Bound::Unbounded => self.len(),
        };
        self.slice(start..end)
    }

    /// Returns whether this string is stored as ASCII bytes.
    pub fn is_ascii(&self) -> bool {
        matches!(self, Utf32Str::Ascii(_))
    }

    /// Returns the codepoint at index `idx`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds.
    pub fn get(&self, idx: u32) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
            Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
        }
    }

    /// Returns the last codepoint of this string.
    ///
    /// # Panics
    ///
    /// Panics if the string is empty.
    pub fn last(&self) -> char {
        const EMPTY: &str = "Utf32Str::last called on an empty string";
        match self {
            Utf32Str::Ascii(bytes) => *bytes.last().expect(EMPTY) as char,
            Utf32Str::Unicode(codepoints) => *codepoints.last().expect(EMPTY),
        }
    }
}
// impl Str for &[char] {
// type Chars;
// fn chars(&self) -> Self::Chars {
// todo!()
// }
// fn slice(&self, range: impl RangeBounds<u32>) {
// todo!()
// }
// }