mirror of
https://github.com/solaeus/nucleo.git
synced 2024-11-10 01:27:10 +00:00
better implementation
This commit is contained in:
parent
6837b4e2cb
commit
e964d42849
135
src/chars.rs
Normal file
135
src/chars.rs
Normal file
@ -0,0 +1,135 @@
|
||||
use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
|
||||
use crate::MatcherConfig;
|
||||
|
||||
//autogenerated by generate-ucd
|
||||
#[allow(warnings)]
|
||||
#[rustfmt::skip]
|
||||
mod case_fold;
|
||||
mod normalize;
|
||||
|
||||
pub trait Char: Copy + Eq + Ord + std::fmt::Debug {
|
||||
const ASCII: bool;
|
||||
fn char_class(self, config: &MatcherConfig) -> CharClass;
|
||||
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass);
|
||||
fn normalize(self, config: &MatcherConfig) -> Self;
|
||||
}
|
||||
|
||||
impl Char for u8 {
|
||||
const ASCII: bool = true;
|
||||
#[inline]
|
||||
fn char_class(self, config: &MatcherConfig) -> CharClass {
|
||||
let c = self;
|
||||
// using manual if conditions instead optimizes better
|
||||
if c >= b'a' && c <= b'z' {
|
||||
CharClass::Lower
|
||||
} else if c >= b'A' && c <= b'Z' {
|
||||
CharClass::Upper
|
||||
} else if c >= b'0' && c <= b'9' {
|
||||
CharClass::Number
|
||||
} else if c.is_ascii_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else if config.delimeter_chars.contains(&c) {
|
||||
CharClass::Delimiter
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass) {
|
||||
let char_class = self.char_class(config);
|
||||
let normalized = if config.ignore_case && char_class == CharClass::Upper {
|
||||
self + 32
|
||||
} else {
|
||||
self
|
||||
};
|
||||
(normalized, char_class)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn normalize(self, config: &MatcherConfig) -> Self {
|
||||
if config.ignore_case && self >= b'A' && self <= b'Z' {
|
||||
self + 32
|
||||
} else {
|
||||
self
|
||||
}
|
||||
}
|
||||
}
|
||||
fn char_class_non_ascii(c: char) -> CharClass {
|
||||
if c.is_lowercase() {
|
||||
CharClass::Lower
|
||||
} else if c.is_uppercase() {
|
||||
CharClass::Upper
|
||||
} else if c.is_numeric() {
|
||||
CharClass::Number
|
||||
} else if c.is_alphabetic() {
|
||||
CharClass::Letter
|
||||
} else if c.is_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
impl Char for char {
|
||||
const ASCII: bool = false;
|
||||
#[inline(always)]
|
||||
fn char_class(self, config: &MatcherConfig) -> CharClass {
|
||||
if self.is_ascii() {
|
||||
return (self as u8).char_class(config);
|
||||
}
|
||||
char_class_non_ascii(self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) {
|
||||
if self.is_ascii() {
|
||||
let (c, class) = (self as u8).char_class_and_normalize(config);
|
||||
return (c as char, class);
|
||||
}
|
||||
let char_class = char_class_non_ascii(self);
|
||||
if char_class == CharClass::Upper {
|
||||
self = CASE_FOLDING_SIMPLE
|
||||
.binary_search_by_key(&self, |(upper, _)| *upper)
|
||||
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
||||
}
|
||||
if config.normalize {
|
||||
self = normalize::normalize(self);
|
||||
}
|
||||
(self, char_class)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn normalize(mut self, config: &MatcherConfig) -> Self {
|
||||
if config.normalize {
|
||||
self = normalize::normalize(self);
|
||||
}
|
||||
to_lower_case(self)
|
||||
}
|
||||
}
|
||||
|
||||
pub use normalize::normalize;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn to_lower_case(c: char) -> char {
|
||||
if c >= 'A' && c <= 'Z' {
|
||||
char::from_u32(c as u32 + 32).unwrap()
|
||||
} else if !c.is_ascii() {
|
||||
CASE_FOLDING_SIMPLE
|
||||
.binary_search_by_key(&c, |(upper, _)| *upper)
|
||||
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
||||
} else {
|
||||
c
|
||||
}
|
||||
}
|
||||
|
||||
/// Coarse category of a character.
///
/// The declaration order is significant: the derived `Ord` follows it and
/// classes are compared with `<` / `>` elsewhere, so variants must not be
/// reordered.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum CharClass {
    /// Whitespace characters.
    Whitespace,
    /// Anything that falls into none of the other classes.
    NonWord,
    /// ASCII bytes listed in the configured delimiter set.
    Delimiter,
    /// Lowercase letters.
    Lower,
    /// Uppercase letters.
    Upper,
    /// Alphabetic characters that are neither lowercase nor uppercase.
    Letter,
    /// Numeric characters.
    Number,
}
|
138
src/config.rs
138
src/config.rs
@ -1,37 +1,7 @@
|
||||
pub(crate) const SCORE_MATCH: u16 = 16;
|
||||
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
||||
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
||||
|
||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
||||
// too great to prevent the longer acronym matches from always winning over
|
||||
// shorter fuzzy matches. The bonus point here was specifically chosen that
|
||||
// the bonus is cancelled when the gap between the acronyms grows over
|
||||
// 8 characters, which is approximately the average length of the words found
|
||||
// in web2 dictionary and my file system.
|
||||
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
|
||||
|
||||
// Although bonus point for non-word characters is non-contextual, we need it
|
||||
// for computing bonus points for consecutive chunks starting with a non-word
|
||||
// character.
|
||||
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
|
||||
|
||||
// Edge-triggered bonus for matches in camelCase words.
|
||||
// Compared to word-boundary case, they don't accompany single-character gaps
|
||||
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
|
||||
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
|
||||
|
||||
// Minimum bonus point given to characters in consecutive chunks.
|
||||
// Note that bonus points for consecutive matches wouldn't have been needed if we
|
||||
// used fixed match score as in the original algorithm.
|
||||
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
|
||||
|
||||
// The first character in the typed pattern usually has more significance
|
||||
// than the rest so it's important that it appears at special positions where
|
||||
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
||||
// The amount of the extra bonus should be limited so that the gap penalty is
|
||||
// still respected.
|
||||
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
|
||||
use crate::chars::CharClass;
|
||||
use crate::score::BONUS_BOUNDARY;
|
||||
|
||||
#[non_exhaustive]
|
||||
pub struct MatcherConfig {
|
||||
pub delimeter_chars: &'static [u8],
|
||||
/// Extra bonus for word boundary after whitespace character or beginning of the string
|
||||
@ -44,33 +14,17 @@ pub struct MatcherConfig {
|
||||
/// this significantly degrades performance so it's not recommended
|
||||
/// to be turned on by default
|
||||
pub normalize: bool,
|
||||
/// use faster/simpler algorithm at the cost of (potentially) much worse results
|
||||
/// For long inputs this algorithm is always used as a fallback to avoid
|
||||
/// blowups in time complexity
|
||||
pub use_v1: bool,
|
||||
/// The case matching to perform
|
||||
pub case_matching: CaseMatching,
|
||||
/// whether to ignore casing
|
||||
pub ignore_case: bool,
|
||||
}
|
||||
|
||||
/// Coarse category of a character.
///
/// The declaration order is significant: the derived `Ord` follows it and
/// `bonus_for` compares classes with `>`, so variants must not be reordered.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum CharClass {
    /// Whitespace characters.
    Whitespace,
    /// Anything that falls into none of the other classes.
    NonWord,
    /// ASCII bytes listed in the configured delimiter set.
    Delimiter,
    /// Lowercase letters.
    Lower,
    /// Uppercase letters.
    Upper,
    /// Alphabetic characters that are neither lowercase nor uppercase.
    Letter,
    /// Numeric characters.
    Number,
}
|
||||
|
||||
/// Selects how letter case is treated while matching.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum CaseMatching {
    /// Case is significant.
    Respect,
    /// Case is ignored.
    Ignore,
    /// NOTE(review): presumably case is ignored unless the needle contains an
    /// uppercase character; confirm against the query compilation code.
    Smart,
}
|
||||
// #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
|
||||
// #[non_exhaustive]
|
||||
// pub enum CaseMatching {
|
||||
// Respect,
|
||||
// Ignore,
|
||||
// Smart,
|
||||
// }
|
||||
|
||||
impl MatcherConfig {
|
||||
pub const DEFAULT: Self = {
|
||||
@ -80,8 +34,7 @@ impl MatcherConfig {
|
||||
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
|
||||
inital_char_class: CharClass::Whitespace,
|
||||
normalize: false,
|
||||
use_v1: false,
|
||||
case_matching: CaseMatching::Smart,
|
||||
ignore_case: true,
|
||||
}
|
||||
};
|
||||
}
|
||||
@ -107,69 +60,4 @@ impl MatcherConfig {
|
||||
self.inital_char_class = CharClass::Delimiter;
|
||||
self
|
||||
}
|
||||
|
||||
fn char_class_non_ascii(c: char) -> CharClass {
|
||||
if c.is_lowercase() {
|
||||
CharClass::Lower
|
||||
} else if c.is_uppercase() {
|
||||
CharClass::Upper
|
||||
} else if c.is_numeric() {
|
||||
CharClass::Number
|
||||
} else if c.is_alphabetic() {
|
||||
CharClass::Letter
|
||||
} else if c.is_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
|
||||
fn char_class_ascii(&self, c: char) -> CharClass {
|
||||
// using manual if conditions instead optimizes better
|
||||
if c >= 'a' && c <= 'z' {
|
||||
CharClass::Lower
|
||||
} else if c >= 'A' && c <= 'Z' {
|
||||
CharClass::Upper
|
||||
} else if c >= '0' && c <= '9' {
|
||||
CharClass::Number
|
||||
} else if c.is_ascii_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else if self.delimeter_chars.contains(&(c as u8)) {
|
||||
CharClass::Delimiter
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn char_class(&self, c: char) -> CharClass {
|
||||
if c.is_ascii() {
|
||||
self.char_class_ascii(c)
|
||||
} else {
|
||||
Self::char_class_non_ascii(c)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
||||
if class > CharClass::NonWord {
|
||||
// transition from non word to word
|
||||
match prev_class {
|
||||
CharClass::Whitespace => return self.bonus_boundary_white,
|
||||
CharClass::Delimiter => return self.bonus_boundary_delimiter,
|
||||
CharClass::NonWord => return BONUS_BOUNDARY,
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
if prev_class == CharClass::Lower && class == CharClass::Upper
|
||||
|| prev_class != CharClass::Number && class == CharClass::Number
|
||||
{
|
||||
// camelCase letter123
|
||||
BONUS_CAMEL123
|
||||
} else if class == CharClass::NonWord {
|
||||
BONUS_NON_WORD
|
||||
} else if class == CharClass::Whitespace {
|
||||
self.bonus_boundary_white
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
46
src/fuzzy_greedy.rs
Normal file
46
src/fuzzy_greedy.rs
Normal file
@ -0,0 +1,46 @@
|
||||
use crate::chars::Char;
|
||||
use crate::Matcher;
|
||||
|
||||
impl Matcher {
|
||||
/// greedy fallback algoritm, much faster (linear time) but reported scores/indicies
|
||||
/// might not be the best match
|
||||
pub(crate) fn fuzzy_match_greedy<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||
&mut self,
|
||||
haystack: &[H],
|
||||
needle: &[N],
|
||||
mut start: usize,
|
||||
mut end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
let first_char_end = if H::ASCII { start + 1 } else { end };
|
||||
if !H::ASCII && needle.len() != 1 {
|
||||
let mut needle_iter = needle[1..].iter().copied();
|
||||
if let Some(mut needle_char) = needle_iter.next() {
|
||||
for (i, &c) in haystack[first_char_end..].iter().enumerate() {
|
||||
if c.normalize(&self.config) == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
end = i + 1;
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// mimimize the greedly match by greedy matching in reverse
|
||||
|
||||
let mut needle_iter = needle.iter().rev().copied();
|
||||
let mut needle_char = needle_iter.next().unwrap();
|
||||
for (i, &c) in haystack[start..end].iter().enumerate().rev() {
|
||||
println!("{c:?} {i} {needle_char:?}");
|
||||
if c == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
start += i;
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
}
|
||||
}
|
||||
Some(self.calculate_score::<INDICIES, H, N>(haystack, needle, start, end, indicies))
|
||||
}
|
||||
}
|
272
src/fuzzy_optimal.rs
Normal file
272
src/fuzzy_optimal.rs
Normal file
@ -0,0 +1,272 @@
|
||||
use std::cmp::max;
|
||||
|
||||
use crate::chars::{Char, CharClass};
|
||||
use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow};
|
||||
use crate::score::{
|
||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
||||
PENALTY_GAP_START, SCORE_MATCH,
|
||||
};
|
||||
use crate::{Matcher, MatcherConfig};
|
||||
|
||||
impl Matcher {
|
||||
pub(crate) fn fuzzy_match_optimal<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||
&mut self,
|
||||
haystack: &[H],
|
||||
needle: &[N],
|
||||
start: usize,
|
||||
greedy_end: usize,
|
||||
end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
// construct a matrix (and copy the haystack), the matrix and haystack size are bounded
|
||||
// to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
|
||||
// us to treat needle indecies as u16
|
||||
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
|
||||
return self.fuzzy_match_greedy::<INDICIES, H, N>(
|
||||
haystack,
|
||||
needle,
|
||||
start,
|
||||
greedy_end,
|
||||
indicies,
|
||||
);
|
||||
};
|
||||
|
||||
let prev_class = start
|
||||
.checked_sub(1)
|
||||
.map(|i| haystack[i].char_class(&self.config))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config);
|
||||
// this only happend with unicode haystacks, for ASCII the prefilter handles all rejects
|
||||
if !matched {
|
||||
return None;
|
||||
}
|
||||
if needle.len() == 1 {
|
||||
indicies.push(max_score_pos as u32);
|
||||
return Some(max_score);
|
||||
}
|
||||
debug_assert_eq!(
|
||||
matrix.row_offs[0], 0,
|
||||
"prefilter should have put us at the start of the match"
|
||||
);
|
||||
|
||||
// populate the matrix and find the best score
|
||||
let (max_score, best_match_end) = matrix.populate_matrix(needle);
|
||||
if INDICIES {
|
||||
matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end);
|
||||
}
|
||||
println!("{indicies:?}");
|
||||
println!("{}", max_score);
|
||||
Some(max_score)
|
||||
}
|
||||
}
|
||||
|
||||
impl<H: Char> Matrix<'_, H> {
|
||||
fn setup<N: Char>(
|
||||
&mut self,
|
||||
needle: &[N],
|
||||
mut prev_class: CharClass,
|
||||
config: &MatcherConfig,
|
||||
) -> (u16, u16, bool)
|
||||
where
|
||||
H: PartialEq<N>,
|
||||
{
|
||||
let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
|
||||
let (mut needle_char, mut row_start) = row_iter.next().unwrap();
|
||||
|
||||
let col_iter = self
|
||||
.haystack
|
||||
.iter_mut()
|
||||
.zip(self.cells.iter_mut())
|
||||
.zip(self.bonus.iter_mut())
|
||||
.enumerate();
|
||||
|
||||
let mut max_score = 0;
|
||||
let mut max_score_pos = 0;
|
||||
let mut in_gap = false;
|
||||
let mut prev_score = 0u16;
|
||||
let mut matched = false;
|
||||
let first_needle_char = needle[0];
|
||||
|
||||
for (i, ((c, matrix_cell), bonus_)) in col_iter {
|
||||
let class = c.char_class(config);
|
||||
*c = c.normalize(config);
|
||||
|
||||
let bonus = config.bonus_for(prev_class, class);
|
||||
// save bonus for later so we don't have to recompute it each time
|
||||
*bonus_ = bonus;
|
||||
prev_class = class;
|
||||
|
||||
let i = i as u16;
|
||||
println!("{i} {needle_char:?} {c:?}");
|
||||
if *c == needle_char {
|
||||
// save the first idx of each char
|
||||
if let Some(next) = row_iter.next() {
|
||||
*row_start = i;
|
||||
(needle_char, row_start) = next;
|
||||
} else {
|
||||
if !matched {
|
||||
*row_start = i;
|
||||
}
|
||||
// we have atleast one match
|
||||
matched = true;
|
||||
}
|
||||
}
|
||||
if *c == first_needle_char {
|
||||
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
||||
println!("start match {score}");
|
||||
matrix_cell.consecutive_chars = 1;
|
||||
if needle.len() == 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_pos = i;
|
||||
// can't get better than this
|
||||
if bonus >= BONUS_BOUNDARY {
|
||||
break;
|
||||
}
|
||||
}
|
||||
matrix_cell.score = score;
|
||||
in_gap = false;
|
||||
} else {
|
||||
let gap_penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
|
||||
matrix_cell.consecutive_chars = 0;
|
||||
in_gap = true;
|
||||
}
|
||||
prev_score = matrix_cell.score;
|
||||
}
|
||||
|
||||
(max_score_pos, max_score, matched)
|
||||
}
|
||||
|
||||
fn populate_matrix<N: Char>(&mut self, needle: &[N]) -> (u16, u16)
|
||||
where
|
||||
H: PartialEq<N>,
|
||||
{
|
||||
let mut max_score = 0;
|
||||
let mut max_score_end = 0;
|
||||
|
||||
let mut row_iter = needle
|
||||
.iter()
|
||||
.zip(rows_mut(self.row_offs, self.cells, self.haystack.len()))
|
||||
.enumerate();
|
||||
// skip the first row we already calculated the in `setup` initial scores
|
||||
let (_, mut prev_matrix_row) = row_iter.next().unwrap().1;
|
||||
|
||||
for (i, (&needle_char, row)) in row_iter {
|
||||
let haystack = haystack(self.haystack, self.bonus, row.off);
|
||||
let mut in_gap = false;
|
||||
let mut prev_matrix_cell = MatrixCell {
|
||||
score: 0,
|
||||
consecutive_chars: 0,
|
||||
};
|
||||
// we are interested in the score of the previous character
|
||||
// in the previous row. This represents the previous char
|
||||
// for each possible pattern. This is equivalent to diagonal movement
|
||||
let diagonal_start = row.off - prev_matrix_row.off - 1;
|
||||
let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..];
|
||||
|
||||
for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
||||
.zip(row.cells.iter_mut())
|
||||
.zip(diagonal.iter())
|
||||
.enumerate()
|
||||
{
|
||||
let col = j + row.off as usize;
|
||||
let gap_penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
let mut score1 = 0;
|
||||
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
|
||||
|
||||
let mut consecutive = 0;
|
||||
if haystack_char.char == needle_char {
|
||||
score1 = diag_matrix_cell.score + SCORE_MATCH;
|
||||
let mut bonus = haystack_char.bonus;
|
||||
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
||||
if consecutive > 1 {
|
||||
let first_bonus = self.bonus[col + 1 - consecutive as usize];
|
||||
if bonus > first_bonus {
|
||||
if bonus > BONUS_BOUNDARY {
|
||||
consecutive = 1
|
||||
} else {
|
||||
bonus = max(bonus, BONUS_CONSECUTIVE)
|
||||
}
|
||||
} else {
|
||||
bonus = max(first_bonus, BONUS_CONSECUTIVE)
|
||||
}
|
||||
}
|
||||
if score1 + bonus < score2 {
|
||||
score1 += haystack_char.bonus;
|
||||
consecutive = 0;
|
||||
} else {
|
||||
score1 += bonus;
|
||||
}
|
||||
}
|
||||
in_gap = score1 < score2;
|
||||
let score = max(score1, score2);
|
||||
println!("{score} {score1} {score2}");
|
||||
if i == needle.len() - 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_end = col as u16;
|
||||
}
|
||||
matrix_cell.consecutive_chars = consecutive;
|
||||
matrix_cell.score = score;
|
||||
prev_matrix_cell = *matrix_cell;
|
||||
}
|
||||
prev_matrix_row = row;
|
||||
}
|
||||
(max_score, max_score_end)
|
||||
}
|
||||
|
||||
fn reconstruct_optimal_path<N: Char>(
|
||||
&self,
|
||||
needle: &[N],
|
||||
start: u32,
|
||||
indicies: &mut Vec<u32>,
|
||||
best_match_end: u16,
|
||||
) {
|
||||
indicies.resize(needle.len(), 0);
|
||||
|
||||
let mut row_iter = self.rows_rev().zip(indicies.iter_mut()).peekable();
|
||||
let (mut row, mut matched_col_idx) = row_iter.next().unwrap();
|
||||
let mut next_row: Option<MatrixRow> = None;
|
||||
let mut col = best_match_end;
|
||||
let mut prefer_match = true;
|
||||
let haystack_len = self.haystack.len() as u16;
|
||||
|
||||
loop {
|
||||
let score = row.cells[col as usize].score;
|
||||
let mut score1 = 0;
|
||||
let mut score2 = 0;
|
||||
if let Some(&(prev_row, _)) = row_iter.peek() {
|
||||
if col >= prev_row.off {
|
||||
score1 = prev_row[col].score;
|
||||
}
|
||||
}
|
||||
if col > row.off {
|
||||
score2 = row[col - 1].score;
|
||||
}
|
||||
println!("{score} {score2} {score1} {prefer_match}");
|
||||
let mut new_prefer_match = row[col].consecutive_chars > 1;
|
||||
if !new_prefer_match && col + 1 < haystack_len {
|
||||
if let Some(next_row) = next_row {
|
||||
new_prefer_match = next_row[col + 1].consecutive_chars > 0
|
||||
}
|
||||
}
|
||||
if score > score1 && (score > score2 || score == score2 && prefer_match) {
|
||||
*matched_col_idx = col as u32 + start;
|
||||
next_row = Some(row);
|
||||
let Some(next) = row_iter.next() else {
|
||||
break;
|
||||
};
|
||||
(row, matched_col_idx) = next
|
||||
}
|
||||
prefer_match = new_prefer_match;
|
||||
col -= 1;
|
||||
}
|
||||
}
|
||||
}
|
689
src/lib.rs
689
src/lib.rs
@ -1,616 +1,137 @@
|
||||
// sadly this doesn't optimize well currently
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::cmp::max;
|
||||
|
||||
use memchr::{memchr, memchr2};
|
||||
use normalize::normalize;
|
||||
|
||||
//autogenerated by generate-ucd
|
||||
#[allow(warnings)]
|
||||
#[rustfmt::skip]
|
||||
mod case_fold;
|
||||
mod chars;
|
||||
mod config;
|
||||
mod normalize;
|
||||
mod fuzzy_greedy;
|
||||
mod fuzzy_optimal;
|
||||
mod matrix;
|
||||
mod prefilter;
|
||||
mod score;
|
||||
mod utf32_str;
|
||||
|
||||
pub use config::{CaseMatching, CharClass, MatcherConfig};
|
||||
// #[cfg(test)]
|
||||
// mod tests;
|
||||
|
||||
use crate::config::{
|
||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
||||
PENALTY_GAP_START, SCORE_MATCH,
|
||||
};
|
||||
pub use config::MatcherConfig;
|
||||
|
||||
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
|
||||
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct MatrixCell {
|
||||
score: u16,
|
||||
consecutive_chars: u16,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct HaystackChar {
|
||||
char: char,
|
||||
bonus: u16,
|
||||
}
|
||||
use crate::matrix::MatrixSlab;
|
||||
use crate::utf32_str::Utf32Str;
|
||||
|
||||
pub struct Matcher {
|
||||
pub config: MatcherConfig,
|
||||
matrix: Box<[MatrixCell; MAX_MATRIX_SIZE]>,
|
||||
haystack: Box<[HaystackChar; MAX_HAYSTACK_LEN]>,
|
||||
// needle can be at most as long as the haystack
|
||||
first_needle_occurance: Box<[u16; MAX_HAYSTACK_LEN]>,
|
||||
slab: MatrixSlab,
|
||||
}
|
||||
|
||||
pub struct Query {
|
||||
needle_chars: Vec<char>,
|
||||
is_ascii: bool,
|
||||
ignore_case: bool,
|
||||
}
|
||||
|
||||
impl Query {
|
||||
fn push(&mut self, needle: &str, normalize_: bool, smart_case: bool) {
|
||||
self.needle_chars.reserve(needle.len());
|
||||
self.needle_chars.extend(needle.chars().map(|mut c| {
|
||||
if !c.is_ascii() {
|
||||
self.is_ascii = false;
|
||||
}
|
||||
if smart_case {
|
||||
if c.is_uppercase() {
|
||||
self.ignore_case = false;
|
||||
}
|
||||
} else if self.ignore_case {
|
||||
if self.is_ascii {
|
||||
c = to_lower_case::<true>(c)
|
||||
} else {
|
||||
c = to_lower_case::<false>(c)
|
||||
}
|
||||
}
|
||||
if normalize_ && !self.is_ascii {
|
||||
c = normalize(c);
|
||||
}
|
||||
c
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||
if c >= b'a' || c <= b'z' {
|
||||
memchr2(c, c + 32, haystack)
|
||||
} else {
|
||||
memchr(c, haystack)
|
||||
}
|
||||
}
|
||||
/// Allocates a `[T; LEN]` directly on the heap with all bytes set to zero,
/// without first building the array on the stack.
///
/// Aborts through `handle_alloc_error` when the allocation fails.
///
/// # Safety
///
/// `T` must be valid when initialized with zeros.
unsafe fn zeroed_array_on_heap<T: Copy, const LEN: usize>() -> Box<[T; LEN]> {
    let layout = Layout::new::<[T; LEN]>();
    if layout.size() == 0 {
        // the allocator must not be called with a zero-sized layout; a box of
        // a zero-sized type only needs a non-null, well-aligned pointer
        return Box::from_raw(std::ptr::NonNull::<[T; LEN]>::dangling().as_ptr());
    }
    let res = std::alloc::alloc_zeroed(layout);
    if res.is_null() {
        std::alloc::handle_alloc_error(layout)
    }
    Box::from_raw(res as *mut [T; LEN])
}
|
||||
// // impl Query {
|
||||
// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) {
|
||||
// // self.needle_chars.reserve(needle.len());
|
||||
// // self.needle_chars.extend(needle.chars().map(|mut c| {
|
||||
// // if !c.is_ascii() {
|
||||
// // self.is_ascii = false;
|
||||
// // }
|
||||
// // if smart_case {
|
||||
// // if c.is_uppercase() {
|
||||
// // self.ignore_case = false;
|
||||
// // }
|
||||
// // } else if self.ignore_case {
|
||||
// // if self.is_ascii {
|
||||
// // c = to_lower_case::<true>(c)
|
||||
// // } else {
|
||||
// // c = to_lower_case::<false>(c)
|
||||
// // }
|
||||
// // }
|
||||
// // if normalize_ && !self.is_ascii {
|
||||
// // c = normalize(c);
|
||||
// // }
|
||||
// // c
|
||||
// // }))
|
||||
// // }
|
||||
// // }
|
||||
|
||||
impl Matcher {
|
||||
pub fn new(config: MatcherConfig) -> Self {
|
||||
// Safety: all data allocated here is just integers/structs that contain
|
||||
// integers so zeroed values are legal
|
||||
unsafe {
|
||||
Self {
|
||||
config,
|
||||
matrix: zeroed_array_on_heap(),
|
||||
haystack: zeroed_array_on_heap(),
|
||||
first_needle_occurance: zeroed_array_on_heap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compile_query(&self, needle: &str) -> Query {
|
||||
let mut query = Query {
|
||||
needle_chars: Vec::new(),
|
||||
is_ascii: true,
|
||||
ignore_case: self.config.case_matching == CaseMatching::Ignore,
|
||||
};
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
query
|
||||
}
|
||||
pub fn recompile_query(&self, query: &mut Query, needle: &str) {
|
||||
query.needle_chars.clear();
|
||||
query.is_ascii = false;
|
||||
query.ignore_case = self.config.case_matching == CaseMatching::Ignore;
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
}
|
||||
pub fn append_query(&self, query: &mut Query, needle: &str) {
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
}
|
||||
|
||||
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<u16> {
|
||||
if haystack.len() > u32::MAX as usize {
|
||||
haystack = &haystack[..u32::MAX as usize]
|
||||
}
|
||||
if self.config.use_v1 {
|
||||
if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v1::<false, true>(query, haystack, &mut Vec::new())
|
||||
} else {
|
||||
self.fuzzy_matcher_v1::<false, false>(query, haystack, &mut Vec::new())
|
||||
}
|
||||
} else if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v2::<false, true>(query, haystack, &mut Vec::new())
|
||||
} else {
|
||||
self.fuzzy_matcher_v2::<false, false>(query, haystack, &mut Vec::new())
|
||||
slab: MatrixSlab::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fuzzy_indicies(
|
||||
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
||||
assert!(haystack.len() <= u32::MAX as usize);
|
||||
self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new())
|
||||
}
|
||||
fn fuzzy_matcher_impl<const INDICIES: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
mut haystack: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
haystack: Utf32Str<'_>,
|
||||
needle_: Utf32Str<'_>,
|
||||
indidies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
if haystack.len() > u32::MAX as usize {
|
||||
haystack = &haystack[..u32::MAX as usize]
|
||||
}
|
||||
if self.config.use_v1 {
|
||||
if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
|
||||
} else {
|
||||
self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
|
||||
}
|
||||
} else if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
|
||||
} else {
|
||||
self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn normalize_char<const ASCII_ONLY: bool>(&self, ignore_case: bool, mut c: char) -> char {
|
||||
if ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c)
|
||||
}
|
||||
if !ASCII_ONLY && self.config.normalize {
|
||||
c = normalize(c)
|
||||
}
|
||||
c
|
||||
}
|
||||
|
||||
fn prefilter_ascii(&self, query: &Query, mut haystack: &[u8]) -> Option<(usize, usize)> {
|
||||
let needle = &query.needle_chars;
|
||||
if query.ignore_case {
|
||||
let first_idx = find_ascii_ignore_case(needle[0] as u8, haystack)?;
|
||||
let mut last_idx = first_idx + 1;
|
||||
haystack = &haystack[last_idx..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = find_ascii_ignore_case(c as u8, haystack)? + 1;
|
||||
last_idx += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
Some((first_idx, last_idx))
|
||||
} else {
|
||||
let first_idx = memchr(needle[0] as u8, haystack)?;
|
||||
let mut last_idx = first_idx + 1;
|
||||
haystack = &haystack[last_idx..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = memchr(c as u8, haystack)? + 1;
|
||||
last_idx += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
Some((first_idx, last_idx))
|
||||
}
|
||||
}
|
||||
|
||||
fn prefilter_non_ascii(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
|
||||
let needle_char = query.needle_chars[0];
|
||||
let mut text = haystack
|
||||
.char_indices()
|
||||
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
|
||||
|
||||
let (match_start, c) = text.find(|&(_, c)| c == needle_char)?;
|
||||
Some((match_start, match_start + c.len_utf8()))
|
||||
}
|
||||
|
||||
fn prefilter(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
|
||||
// quickly reject small matches
|
||||
if query.needle_chars.len() > haystack.len() {
|
||||
return None;
|
||||
}
|
||||
if query.is_ascii {
|
||||
self.prefilter_ascii(query, haystack.as_bytes())
|
||||
} else {
|
||||
self.prefilter_non_ascii(query, haystack)
|
||||
}
|
||||
}
|
||||
|
||||
fn fuzzy_matcher_v1<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
haystack: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
let (start, end) = self.prefilter(query, haystack)?;
|
||||
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
||||
query, haystack, start, end, indicies,
|
||||
assert!(
|
||||
haystack.len() <= u32::MAX as usize,
|
||||
"fuzzy matching is only support for up to 2^32-1 codepoints"
|
||||
);
|
||||
match (haystack, needle_) {
|
||||
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
||||
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?;
|
||||
self.fuzzy_match_optimal::<INDICIES, u8, u8>(
|
||||
haystack, needle, start, greedy_end, end, indidies,
|
||||
)
|
||||
}
|
||||
|
||||
fn fuzzy_matcher_v1_with_prefilter<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
haystack: &str,
|
||||
mut start: usize,
|
||||
mut end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
let first_char_end = if ASCII_ONLY { start + 1 } else { end };
|
||||
if !ASCII_ONLY && query.needle_chars.len() != 1 {
|
||||
let mut needle_iter = query.needle_chars[1..].iter().copied();
|
||||
if let Some(mut needle_char) = needle_iter.next() {
|
||||
let haystack = haystack[first_char_end..]
|
||||
.char_indices()
|
||||
.rev()
|
||||
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
|
||||
for (i, c) in haystack {
|
||||
if c == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
end = i + c.len_utf8();
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
||||
// a purely ascii haystack can never be transformed to match
|
||||
// a needle that contains non-ascii chars since we don't allow gaps
|
||||
None
|
||||
}
|
||||
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
||||
todo!()
|
||||
// let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
|
||||
// self.fuzzy_match_optimal::<INDICIES, char, u8>(
|
||||
// haystack,
|
||||
// needle,
|
||||
// start,
|
||||
// start + 1,
|
||||
// end,
|
||||
// indidies,
|
||||
// )
|
||||
}
|
||||
}
|
||||
}
|
||||
// very simple, just mimimize from the back
|
||||
let match_ = haystack[first_char_end..end]
|
||||
.char_indices()
|
||||
.rev()
|
||||
.map(|(i, c)| (i, self.normalize_char::<ASCII_ONLY>(query.ignore_case, c)));
|
||||
|
||||
let mut needle_iter = query.needle_chars[..].iter().rev().copied();
|
||||
let mut needle_char = needle_iter.next().unwrap();
|
||||
for (i, c) in match_ {
|
||||
if c == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
start = i;
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
}
|
||||
}
|
||||
Some(self.calculate_score::<INDICIES, ASCII_ONLY>(query, haystack, start, end, indicies))
|
||||
}
|
||||
|
||||
fn calculate_score<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
text: &str,
|
||||
match_start: usize,
|
||||
match_end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> u16 {
|
||||
if INDICIES {
|
||||
indicies.reserve(query.needle_chars.len());
|
||||
}
|
||||
let mut prev_class = text[..match_start]
|
||||
.chars()
|
||||
.next_back()
|
||||
.map(|c| self.config.char_class(c))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
let mut needle_idx = 0;
|
||||
let mut score = 0u16;
|
||||
let mut in_gap = false;
|
||||
let mut consecutive = 0;
|
||||
let mut first_bonus = 0u16;
|
||||
for (i, mut c) in text[match_start..match_end].char_indices() {
|
||||
let class = self.config.char_class(c);
|
||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c);
|
||||
}
|
||||
if self.config.normalize && !ASCII_ONLY {
|
||||
c = normalize(c)
|
||||
}
|
||||
if c == query.needle_chars[needle_idx] {
|
||||
if INDICIES {
|
||||
indicies.push(i as u32)
|
||||
}
|
||||
score += SCORE_MATCH;
|
||||
let mut bonus = self.config.bonus_for(prev_class, class);
|
||||
if consecutive == 0 {
|
||||
first_bonus = bonus
|
||||
} else {
|
||||
// Break consecutive chunk
|
||||
if bonus > first_bonus {
|
||||
if bonus >= BONUS_BOUNDARY {
|
||||
first_bonus = bonus;
|
||||
} else {
|
||||
bonus = max(bonus, BONUS_CONSECUTIVE);
|
||||
}
|
||||
} else {
|
||||
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
||||
}
|
||||
}
|
||||
if needle_idx == 0 {
|
||||
bonus *= BONUS_FIRST_CHAR_MULTIPLIER;
|
||||
}
|
||||
score += bonus;
|
||||
needle_idx += 1;
|
||||
in_gap = false;
|
||||
consecutive += 1;
|
||||
} else {
|
||||
let penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
score = score.saturating_sub(penalty);
|
||||
in_gap = true;
|
||||
consecutive = 0;
|
||||
first_bonus = 0;
|
||||
}
|
||||
prev_class = class;
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
fn fuzzy_matcher_v2<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
text: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<u16> {
|
||||
let (start, prefilter_end) = self.prefilter(query, text)?;
|
||||
let text_len = text.len() - start;
|
||||
// fallback to v1 algorithms for long haystacks
|
||||
// technically we need to multiply by char len here
|
||||
// but counting chars has a lot of unecessary overhead that we can avoid
|
||||
// here in practice using bytelen should be a reasonable approximation
|
||||
// we also differ from fzf here in that we never allocate and instead stringintly check here
|
||||
if text_len > u16::MAX as usize || text_len * query.needle_chars.len() > MAX_HAYSTACK_LEN {
|
||||
return self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
||||
query,
|
||||
text,
|
||||
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
||||
let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
|
||||
self.fuzzy_match_optimal::<INDICIES, char, char>(
|
||||
haystack,
|
||||
needle,
|
||||
start,
|
||||
prefilter_end,
|
||||
indicies,
|
||||
);
|
||||
start + 1,
|
||||
end,
|
||||
indidies,
|
||||
)
|
||||
}
|
||||
|
||||
let mut prev_class = text[..start]
|
||||
.chars()
|
||||
.next_back()
|
||||
.map(|c| self.config.char_class(c))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
|
||||
let text = &text[start..];
|
||||
|
||||
let mut needle_iter = query.needle_chars[..]
|
||||
.iter()
|
||||
.copied()
|
||||
.zip(self.first_needle_occurance.iter_mut());
|
||||
let (mut needle_char, mut needle_char_idx) = needle_iter.next().unwrap();
|
||||
|
||||
let iter = text[start..]
|
||||
.chars()
|
||||
.zip(self.matrix.iter_mut())
|
||||
.zip(self.haystack.iter_mut())
|
||||
.enumerate();
|
||||
|
||||
let mut last_matched_idx = 0;
|
||||
let mut max_score = 0;
|
||||
let mut max_score_pos = 0;
|
||||
let mut in_gap = false;
|
||||
let mut prev_score = 0u16;
|
||||
let mut matched = false;
|
||||
|
||||
let first_needle_char = query.needle_chars[0];
|
||||
for (i, ((mut c, matrix_cell), char_info)) in iter {
|
||||
let class = self.config.char_class(c);
|
||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c);
|
||||
}
|
||||
if self.config.normalize && !ASCII_ONLY {
|
||||
c = normalize(c)
|
||||
}
|
||||
char_info.char = c;
|
||||
let bonus = self.config.bonus_for(prev_class, class);
|
||||
char_info.char = c;
|
||||
prev_class = class;
|
||||
|
||||
let i = i as u16;
|
||||
if c == needle_char {
|
||||
// save the first idx of each char
|
||||
if let Some(next) = needle_iter.next() {
|
||||
*needle_char_idx = i;
|
||||
(needle_char, needle_char_idx) = next
|
||||
} else {
|
||||
// we have atleast one match
|
||||
matched = true;
|
||||
}
|
||||
// and the last matched char
|
||||
last_matched_idx = i;
|
||||
}
|
||||
if c == first_needle_char {
|
||||
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
||||
matrix_cell.consecutive_chars = 1;
|
||||
if query.needle_chars.len() == 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_pos = i;
|
||||
// can't get better than this
|
||||
if bonus >= BONUS_BOUNDARY {
|
||||
break;
|
||||
}
|
||||
}
|
||||
in_gap = false;
|
||||
} else {
|
||||
let gap_penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
|
||||
matrix_cell.consecutive_chars = 0;
|
||||
in_gap = true;
|
||||
}
|
||||
prev_score = matrix_cell.score;
|
||||
}
|
||||
if !matched {
|
||||
debug_assert!(!ASCII_ONLY, "prefilter should have rejected");
|
||||
return None;
|
||||
}
|
||||
if query.needle_chars.len() == 1 {
|
||||
indicies.push(max_score_pos as u32);
|
||||
return Some(max_score);
|
||||
}
|
||||
assert_eq!(
|
||||
self.first_needle_occurance[0], 0,
|
||||
"prefilter should have put us at the start of the match"
|
||||
);
|
||||
let haystack_len = last_matched_idx as usize + 1;
|
||||
let (max_score, best_match_end) = self.popultate_matrix(haystack_len, query);
|
||||
if INDICIES {
|
||||
indicies.reserve(query.needle_chars.len());
|
||||
let mut col = best_match_end;
|
||||
let mut needle_iter = self.matrix[..haystack_len * query.needle_chars.len()]
|
||||
.windows(haystack_len)
|
||||
.zip(self.first_needle_occurance[..haystack_len].iter())
|
||||
.rev()
|
||||
.peekable();
|
||||
let mut next_row = None;
|
||||
let (mut row, mut first_needle_occurance) = needle_iter.next().unwrap();
|
||||
let mut prefer_match = true;
|
||||
loop {
|
||||
let score = row[col as usize].score;
|
||||
let mut score1 = 0;
|
||||
let mut score2 = 0;
|
||||
if let Some((prev_row, _)) = needle_iter.peek() {
|
||||
if col >= *first_needle_occurance {
|
||||
score1 = prev_row[col as usize].score;
|
||||
}
|
||||
}
|
||||
if col > *first_needle_occurance {
|
||||
score2 = row[col as usize - 1].score;
|
||||
}
|
||||
if score > score1 && (score > score2 || score == score2 && prefer_match) {
|
||||
indicies.push(col as u32 + start as u32);
|
||||
next_row = Some(row);
|
||||
let Some(next) = needle_iter.next() else {
|
||||
break;
|
||||
};
|
||||
(row, first_needle_occurance) = next
|
||||
}
|
||||
prefer_match = row[col as usize].consecutive_chars > 1;
|
||||
if !prefer_match && col + 1 < query.needle_chars.len() as u16 {
|
||||
if let Some(next_row) = next_row {
|
||||
prefer_match = next_row[col as usize + 1].consecutive_chars > 0
|
||||
}
|
||||
}
|
||||
col -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
Some(max_score)
|
||||
}
|
||||
|
||||
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) {
|
||||
let mut max_score = 0;
|
||||
let mut max_score_end = 0;
|
||||
let mut iter = query
|
||||
.needle_chars
|
||||
.iter()
|
||||
.zip(self.first_needle_occurance.iter())
|
||||
.zip(self.matrix.chunks_mut(haystack_len))
|
||||
.enumerate();
|
||||
// skip the first row we already calculated the initial scores
|
||||
let (_, ((&_, &_), mut prev_matrix_row)) = iter.next().unwrap();
|
||||
for (i, ((&needle_char, &first_occurance), matrix_row)) in iter {
|
||||
// help the optimizer out a little
|
||||
assert!((first_occurance as usize) < matrix_row.len());
|
||||
assert!(first_occurance != 0);
|
||||
let mut in_gap = false;
|
||||
let haystack = &self.haystack[first_occurance as usize..haystack_len];
|
||||
let mut prev_matrix_cell = matrix_row[first_occurance as usize - 1];
|
||||
let matrix_row = &mut matrix_row[first_occurance as usize..haystack_len];
|
||||
let prev_matrix_diagonal =
|
||||
&mut prev_matrix_row[first_occurance as usize - 1..haystack_len - 1];
|
||||
for (j, ((&haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
||||
.iter()
|
||||
.zip(matrix_row.iter_mut())
|
||||
.zip(prev_matrix_diagonal.iter())
|
||||
.enumerate()
|
||||
{
|
||||
let col = j + first_occurance as usize;
|
||||
let gap_penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
let mut score1 = 0;
|
||||
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
|
||||
let mut consecutive = 0;
|
||||
if haystack_char.char == needle_char {
|
||||
score1 = diag_matrix_cell.score + SCORE_MATCH;
|
||||
let mut bonus = haystack_char.bonus;
|
||||
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
||||
if consecutive > 1 {
|
||||
let first_bonus = self.haystack[col - consecutive as usize].bonus;
|
||||
if bonus > first_bonus {
|
||||
if bonus > BONUS_BOUNDARY {
|
||||
consecutive = 1
|
||||
} else {
|
||||
bonus = max(bonus, BONUS_CONSECUTIVE)
|
||||
}
|
||||
} else {
|
||||
bonus = max(first_bonus, BONUS_CONSECUTIVE)
|
||||
}
|
||||
}
|
||||
if score1 + bonus < score2 {
|
||||
score1 += haystack_char.bonus;
|
||||
consecutive = 0;
|
||||
} else {
|
||||
score1 += bonus;
|
||||
}
|
||||
}
|
||||
in_gap = score1 < score2;
|
||||
let score = max(max(score1, score2), 0);
|
||||
prev_matrix_cell = *matrix_cell;
|
||||
if i == query.needle_chars.len() - 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_end = col as u16;
|
||||
}
|
||||
matrix_cell.consecutive_chars = consecutive;
|
||||
matrix_cell.score = score;
|
||||
}
|
||||
prev_matrix_row = matrix_row;
|
||||
}
|
||||
(max_score, max_score_end)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_lower_case<const ASCII_ONLY: bool>(c: char) -> char {
|
||||
if c >= 'A' && c <= 'Z' {
|
||||
char::from_u32(c as u32 + 32).unwrap()
|
||||
} else if !c.is_ascii() && !ASCII_ONLY {
|
||||
case_fold::CASE_FOLDING_SIMPLE
|
||||
.binary_search_by_key(&c, |(upper, _)| *upper)
|
||||
.map_or(c, |idx| case_fold::CASE_FOLDING_SIMPLE[idx].1)
|
||||
} else {
|
||||
c
|
||||
}
|
||||
// pub fn fuzzy_indicies(
|
||||
// &mut self,
|
||||
// query: &Query,
|
||||
// mut haystack: Utf32Str<'_>,
|
||||
// indicies: &mut Vec<u32>,
|
||||
// ) -> Option<u16> {
|
||||
// if haystack.len() > u32::MAX as usize {
|
||||
// haystack = &haystack[..u32::MAX as usize]
|
||||
// }
|
||||
// println!(
|
||||
// "start {haystack:?}, {:?} {} {}",
|
||||
// query.needle_chars, query.ignore_case, query.is_ascii
|
||||
// );
|
||||
// if self.config.use_v1 {
|
||||
// if query.is_ascii && !self.config.normalize {
|
||||
// self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
|
||||
// } else {
|
||||
// self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
|
||||
// }
|
||||
// } else if query.is_ascii && !self.config.normalize {
|
||||
// self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
|
||||
// } else {
|
||||
// self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
280
src/matrix.rs
Normal file
280
src/matrix.rs
Normal file
@ -0,0 +1,280 @@
|
||||
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
|
||||
use std::fmt::{Debug, Formatter, Result};
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::{size_of, take};
|
||||
use std::ops::Index;
|
||||
use std::ptr::{slice_from_raw_parts_mut, NonNull};
|
||||
|
||||
use crate::chars::Char;
|
||||
|
||||
const MAX_MATRIX_SIZE: usize = 100 * 1024; // 100Ki cells; at 4 bytes per MatrixCell (two u16) = 400KiB
|
||||
|
||||
// these two aren't hard maxima, instead we simply allow whatever will fit into memory
|
||||
const MAX_HAYSTACK_LEN: usize = 2048; // MatrixData reserves 2048 chars (8KiB) plus 2048 u16 bonuses (4KiB)
|
||||
const MAX_NEEDLE_LEN: usize = 2048; // MatrixData reserves 2048 u16 row offsets (4KiB)
|
||||
|
||||
struct MatrixLayout<C: Char> {
|
||||
haystack_len: usize,
|
||||
needle_len: usize,
|
||||
cell_count: usize,
|
||||
layout: Layout,
|
||||
haystack_off: usize,
|
||||
bonus_off: usize,
|
||||
rows_off: usize,
|
||||
cells_off: usize,
|
||||
_phantom: PhantomData<C>,
|
||||
}
|
||||
impl<C: Char> MatrixLayout<C> {
|
||||
fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout<C> {
|
||||
let mut layout = Layout::from_size_align(0, 1).unwrap();
|
||||
let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
|
||||
let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();
|
||||
let rows_layout = Layout::array::<u16>(needle_len).unwrap();
|
||||
let cells_layout = Layout::array::<MatrixCell>(cell_count).unwrap();
|
||||
|
||||
let haystack_off;
|
||||
(layout, haystack_off) = layout.extend(haystack_layout).unwrap();
|
||||
let bonus_off;
|
||||
(layout, bonus_off) = layout.extend(bonus_layout).unwrap();
|
||||
let rows_off;
|
||||
(layout, rows_off) = layout.extend(rows_layout).unwrap();
|
||||
let cells_off;
|
||||
(layout, cells_off) = layout.extend(cells_layout).unwrap();
|
||||
MatrixLayout {
|
||||
haystack_len,
|
||||
needle_len,
|
||||
cell_count,
|
||||
layout,
|
||||
haystack_off,
|
||||
bonus_off,
|
||||
rows_off,
|
||||
cells_off,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
/// # Safety
|
||||
///
|
||||
/// `ptr` must point at an allocated with MARTIX_ALLOC_LAYOUT
|
||||
unsafe fn fieds_from_ptr(
|
||||
&self,
|
||||
ptr: NonNull<u8>,
|
||||
) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) {
|
||||
// sanity checks, should not be necessary
|
||||
|
||||
let base = ptr.as_ptr();
|
||||
let haystack = base.add(self.haystack_off) as *mut C;
|
||||
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
|
||||
let bonus = base.add(self.bonus_off) as *mut u16;
|
||||
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
|
||||
let rows = base.add(self.rows_off) as *mut u16;
|
||||
let rows = slice_from_raw_parts_mut(rows, self.needle_len);
|
||||
let cells = base.add(self.cells_off) as *mut MatrixCell;
|
||||
let cells = slice_from_raw_parts_mut(cells, self.cell_count);
|
||||
(haystack, bonus, rows, cells)
|
||||
}
|
||||
}
|
||||
|
||||
/// One cell of the scoring matrix.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) struct MatrixCell {
    pub score: u16,
    pub consecutive_chars: u16,
}

impl Debug for MatrixCell {
    /// Formats the cell as the tuple `(score, consecutive_chars)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let MatrixCell {
            score,
            consecutive_chars,
        } = *self;
        Debug::fmt(&(score, consecutive_chars), f)
    }
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
pub(crate) struct HaystackChar<C: Char> {
|
||||
pub char: C,
|
||||
pub bonus: u16,
|
||||
}
|
||||
|
||||
impl<C: Char> Debug for HaystackChar<C> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
(self.char, self.bonus).fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct MatrixRow<'a> {
|
||||
pub off: u16,
|
||||
pub cells: &'a [MatrixCell],
|
||||
}
|
||||
impl Index<u16> for MatrixRow<'_> {
|
||||
type Output = MatrixCell;
|
||||
|
||||
fn index(&self, index: u16) -> &Self::Output {
|
||||
&self.cells[index as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for MatrixRow<'_> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
let mut f = f.debug_list();
|
||||
f.entries((0..self.off).map(|_| &(0, 0)));
|
||||
f.entries(self.cells.iter());
|
||||
f.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct MatrixRowMut<'a> {
|
||||
pub off: u16,
|
||||
pub cells: &'a mut [MatrixCell],
|
||||
}
|
||||
|
||||
impl Debug for MatrixRowMut<'_> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
let mut f = f.debug_list();
|
||||
f.entries((0..self.off).map(|_| &(0, 0)));
|
||||
f.entries(self.cells.iter());
|
||||
f.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Adapter that debug-prints the items of a cloneable iterator as a list.
pub struct DebugList<I>(I);

impl<I> Debug for DebugList<I>
where
    I: Iterator + Clone,
    I::Item: Debug,
{
    /// Walks a clone of the wrapped iterator so formatting does not consume it.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let mut list = f.debug_list();
        for item in self.0.clone() {
            list.entry(&item);
        }
        list.finish()
    }
}
|
||||
|
||||
pub(crate) struct Matrix<'a, C: Char> {
|
||||
pub haystack: &'a mut [C],
|
||||
// stored as a seperate array instead of struct
|
||||
// to avoid padding sine char is too large and u8 too small :/
|
||||
pub bonus: &'a mut [u16],
|
||||
pub row_offs: &'a mut [u16],
|
||||
pub cells: &'a mut [MatrixCell],
|
||||
}
|
||||
|
||||
impl<'a, C: Char> Matrix<'a, C> {
|
||||
pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
|
||||
let mut cells = &*self.cells;
|
||||
self.row_offs.iter().map(move |&off| {
|
||||
let len = self.haystack.len() - off as usize;
|
||||
let (row, tmp) = cells.split_at(len);
|
||||
cells = tmp;
|
||||
MatrixRow { off, cells: row }
|
||||
})
|
||||
}
|
||||
|
||||
pub fn rows_rev(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator {
|
||||
let mut cells = &*self.cells;
|
||||
self.row_offs.iter().rev().map(move |&off| {
|
||||
let len = self.haystack.len() - off as usize;
|
||||
let (tmp, row) = cells.split_at(cells.len() - len);
|
||||
cells = tmp;
|
||||
MatrixRow { off, cells: row }
|
||||
})
|
||||
}
|
||||
pub fn haystack(
|
||||
&self,
|
||||
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
|
||||
haystack(self.haystack, self.bonus, 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C: Char> Debug for Matrix<'a, C> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
f.debug_struct("Matrix")
|
||||
.field("haystack", &DebugList(self.haystack()))
|
||||
.field("matrix", &DebugList(self.rows()))
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
pub(crate) fn haystack<'a, C: Char>(
|
||||
haystack: &'a [C],
|
||||
bonus: &'a [u16],
|
||||
skip: u16,
|
||||
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + Clone + 'a {
|
||||
haystack[skip as usize..]
|
||||
.iter()
|
||||
.zip(bonus[skip as usize..].iter())
|
||||
.map(|(&char, &bonus)| HaystackChar { char, bonus })
|
||||
}
|
||||
|
||||
pub(crate) fn rows_mut<'a>(
|
||||
row_offs: &'a [u16],
|
||||
mut cells: &'a mut [MatrixCell],
|
||||
haystack_len: usize,
|
||||
) -> impl Iterator<Item = MatrixRowMut<'a>> + ExactSizeIterator + 'a {
|
||||
row_offs.iter().map(move |&off| {
|
||||
let len = haystack_len - off as usize;
|
||||
let (row, tmp) = take(&mut cells).split_at_mut(len);
|
||||
cells = tmp;
|
||||
MatrixRowMut { off, cells: row }
|
||||
})
|
||||
}
|
||||
|
||||
// we only use this to construct the layout for the slab allocation
|
||||
#[allow(unused)]
|
||||
struct MatrixData {
|
||||
haystack: [char; MAX_HAYSTACK_LEN],
|
||||
bonus: [u16; MAX_HAYSTACK_LEN],
|
||||
row_offs: [u16; MAX_NEEDLE_LEN],
|
||||
cells: [MatrixCell; MAX_MATRIX_SIZE],
|
||||
}
|
||||
|
||||
// const MATRIX_ALLOC_LAYOUT: Layout =
|
||||
// MatrixLayout::<char>::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout;
|
||||
|
||||
pub(crate) struct MatrixSlab(NonNull<u8>);
|
||||
|
||||
impl MatrixSlab {
|
||||
pub fn new() -> Self {
|
||||
let layout = Layout::new::<MatrixData>();
|
||||
// safety: the matrix is never zero sized (hardcoded constants)
|
||||
let ptr = unsafe { alloc_zeroed(layout) };
|
||||
let Some(ptr) = NonNull::new(ptr) else{
|
||||
handle_alloc_error(layout)
|
||||
};
|
||||
MatrixSlab(ptr.cast())
|
||||
}
|
||||
|
||||
pub(crate) fn alloc<C: Char>(
|
||||
&mut self,
|
||||
haystack_: &[C],
|
||||
needle_len: usize,
|
||||
) -> Option<Matrix<'_, C>> {
|
||||
let cells = haystack_.len() * needle_len;
|
||||
if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize {
|
||||
return None;
|
||||
}
|
||||
let matrix_layout = MatrixLayout::<C>::new(
|
||||
haystack_.len(),
|
||||
needle_len,
|
||||
(haystack_.len() - needle_len / 2) * needle_len,
|
||||
);
|
||||
if matrix_layout.layout.size() > size_of::<MatrixData>() {
|
||||
return None;
|
||||
}
|
||||
unsafe {
|
||||
// safetly: this allocation is valid for MATRIX_ALLOC_LAYOUT
|
||||
let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0);
|
||||
// copy haystack before creating refernces to ensure we donu't crate
|
||||
// refrences to invalid chars (which may or may not be UB)
|
||||
haystack_
|
||||
.as_ptr()
|
||||
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());
|
||||
Some(Matrix {
|
||||
haystack: &mut *haystack,
|
||||
row_offs: &mut *rows,
|
||||
bonus: &mut *bonus,
|
||||
cells: &mut *cells,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MatrixSlab {
|
||||
fn drop(&mut self) {
|
||||
unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatrixData>()) };
|
||||
}
|
||||
}
|
0
src/multizip.rs
Normal file
0
src/multizip.rs
Normal file
73
src/prefilter.rs
Normal file
73
src/prefilter.rs
Normal file
@ -0,0 +1,73 @@
|
||||
use ::memchr::{memchr, memchr2, memrchr, memrchr2};
|
||||
|
||||
use crate::chars::Char;
|
||||
use crate::utf32_str::Utf32Str;
|
||||
use crate::Matcher;
|
||||
|
||||
#[inline(always)]
|
||||
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||
if c >= b'a' || c <= b'z' {
|
||||
memchr2(c, c - 32, haystack)
|
||||
} else {
|
||||
memchr(c, haystack)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||
if c >= b'a' || c <= b'z' {
|
||||
memrchr2(c, c - 32, haystack)
|
||||
} else {
|
||||
memrchr(c, haystack)
|
||||
}
|
||||
}
|
||||
|
||||
impl Matcher {
|
||||
pub(crate) fn prefilter_ascii(
|
||||
&self,
|
||||
mut haystack: &[u8],
|
||||
needle: &[u8],
|
||||
) -> Option<(usize, usize, usize)> {
|
||||
if self.config.ignore_case {
|
||||
let start = find_ascii_ignore_case(needle[0], haystack)?;
|
||||
let mut eager_end = start + 1;
|
||||
haystack = &haystack[eager_end..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = find_ascii_ignore_case(c, haystack)? + 1;
|
||||
eager_end += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
let end = eager_end
|
||||
+ find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack).unwrap_or(0);
|
||||
Some((start, eager_end, end))
|
||||
} else {
|
||||
let start = memchr(needle[0], haystack)?;
|
||||
let mut eager_end = start + 1;
|
||||
haystack = &haystack[eager_end..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = memchr(c, haystack)? + 1;
|
||||
eager_end += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
let end = eager_end + memrchr(*needle.last().unwrap(), haystack).unwrap_or(0);
|
||||
Some((start, eager_end, end))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn prefilter_non_ascii(
|
||||
&self,
|
||||
haystack: &[char],
|
||||
needle: Utf32Str<'_>,
|
||||
) -> Option<(usize, usize)> {
|
||||
let needle_char = needle.get(0);
|
||||
let start = haystack
|
||||
.iter()
|
||||
.position(|c| c.normalize(&self.config) == needle_char)?;
|
||||
let needle_char = needle.last();
|
||||
let end = haystack[start..]
|
||||
.iter()
|
||||
.position(|c| c.normalize(&self.config) == needle_char)?;
|
||||
|
||||
Some((start, end))
|
||||
}
|
||||
}
|
145
src/score.rs
Normal file
145
src/score.rs
Normal file
@ -0,0 +1,145 @@
|
||||
use std::cmp::max;
|
||||
|
||||
use crate::chars::{Char, CharClass};
|
||||
use crate::{Matcher, MatcherConfig};
|
||||
|
||||
pub(crate) const SCORE_MATCH: u16 = 16;
|
||||
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
||||
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
||||
|
||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
||||
// too great to prevent the longer acronym matches from always winning over
|
||||
// shorter fuzzy matches. The bonus point here was specifically chosen that
|
||||
// the bonus is cancelled when the gap between the acronyms grows over
|
||||
// 8 characters, which is approximately the average length of the words found
|
||||
// in web2 dictionary and my file system.
|
||||
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
|
||||
|
||||
// Although bonus point for non-word characters is non-contextual, we need it
|
||||
// for computing bonus points for consecutive chunks starting with a non-word
|
||||
// character.
|
||||
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
|
||||
|
||||
// Edge-triggered bonus for matches in camelCase words.
|
||||
// Compared to word-boundary case, they don't accompany single-character gaps
|
||||
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
|
||||
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
|
||||
|
||||
// Minimum bonus point given to characters in consecutive chunks.
|
||||
// Note that bonus points for consecutive matches shouldn't have needed if we
|
||||
// used fixed match score as in the original algorithm.
|
||||
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
|
||||
|
||||
// The first character in the typed pattern usually has more significance
|
||||
// than the rest so it's important that it appears at special positions where
|
||||
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
||||
// The amount of the extra bonus should be limited so that the gap penalty is
|
||||
// still respected.
|
||||
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
|
||||
|
||||
impl MatcherConfig {
|
||||
#[inline]
|
||||
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
||||
if class > CharClass::NonWord {
|
||||
// transition from non word to word
|
||||
match prev_class {
|
||||
CharClass::Whitespace => return self.bonus_boundary_white,
|
||||
CharClass::Delimiter => return self.bonus_boundary_delimiter,
|
||||
CharClass::NonWord => return BONUS_BOUNDARY,
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
if prev_class == CharClass::Lower && class == CharClass::Upper
|
||||
|| prev_class != CharClass::Number && class == CharClass::Number
|
||||
{
|
||||
// camelCase letter123
|
||||
BONUS_CAMEL123
|
||||
} else if class == CharClass::NonWord {
|
||||
BONUS_NON_WORD
|
||||
} else if class == CharClass::Whitespace {
|
||||
self.bonus_boundary_white
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
impl Matcher {
|
||||
#[inline(always)]
|
||||
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
||||
self.config.bonus_for(prev_class, class)
|
||||
}
|
||||
|
||||
pub(crate) fn calculate_score<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||
&mut self,
|
||||
haystack: &[H],
|
||||
needle: &[N],
|
||||
start: usize,
|
||||
end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> u16 {
|
||||
if INDICIES {
|
||||
indicies.reserve(needle.len());
|
||||
}
|
||||
|
||||
let mut prev_class = start
|
||||
.checked_sub(1)
|
||||
.map(|i| haystack[i].char_class(&self.config))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
let mut needle_iter = needle.iter();
|
||||
let mut needle_char = *needle_iter.next().unwrap();
|
||||
|
||||
let mut in_gap = false;
|
||||
let mut consecutive = 1;
|
||||
|
||||
// unrolled the firs iteration to make applying the first char multiplier less akward
|
||||
if INDICIES {
|
||||
indicies.push(start as u32)
|
||||
}
|
||||
let mut first_bonus = self.bonus_for(prev_class, haystack[0].char_class(&self.config));
|
||||
let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
||||
|
||||
for (i, c) in haystack[start + 1..end].iter().enumerate() {
|
||||
let class = c.char_class(&self.config);
|
||||
let c = c.normalize(&self.config);
|
||||
if c == needle_char {
|
||||
if INDICIES {
|
||||
indicies.push(i as u32 + start as u32)
|
||||
}
|
||||
let mut bonus = self.bonus_for(prev_class, class);
|
||||
if consecutive == 0 {
|
||||
first_bonus = bonus
|
||||
} else {
|
||||
// Break consecutive chunk
|
||||
if bonus > first_bonus {
|
||||
if bonus >= BONUS_BOUNDARY {
|
||||
first_bonus = bonus;
|
||||
} else {
|
||||
bonus = max(bonus, BONUS_CONSECUTIVE);
|
||||
}
|
||||
} else {
|
||||
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
||||
}
|
||||
}
|
||||
score += SCORE_MATCH + bonus;
|
||||
in_gap = false;
|
||||
consecutive += 1;
|
||||
if let Some(&next) = needle_iter.next() {
|
||||
needle_char = next;
|
||||
}
|
||||
} else {
|
||||
let penalty = if in_gap {
|
||||
PENALTY_GAP_EXTENSION
|
||||
} else {
|
||||
PENALTY_GAP_START
|
||||
};
|
||||
score = score.saturating_sub(penalty);
|
||||
in_gap = true;
|
||||
consecutive = 0;
|
||||
first_bonus = 0;
|
||||
}
|
||||
prev_class = class;
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
}
|
270
src/tests.rs
Normal file
270
src/tests.rs
Normal file
@ -0,0 +1,270 @@
|
||||
use crate::config::{
|
||||
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
|
||||
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
||||
};
|
||||
use crate::{CaseMatching, Matcher, MatcherConfig};
|
||||
|
||||
pub fn assert_matches(
|
||||
use_v1: bool,
|
||||
normalize: bool,
|
||||
case_sensitive: bool,
|
||||
path: bool,
|
||||
cases: &[(&str, &str, u32, u32, u16)],
|
||||
) {
|
||||
let mut config = MatcherConfig {
|
||||
use_v1,
|
||||
normalize,
|
||||
case_matching: if case_sensitive {
|
||||
CaseMatching::Respect
|
||||
} else {
|
||||
CaseMatching::Ignore
|
||||
},
|
||||
..MatcherConfig::DEFAULT
|
||||
};
|
||||
if path {
|
||||
config.set_match_paths();
|
||||
}
|
||||
let mut matcher = Matcher::new(config);
|
||||
let mut indicies = Vec::new();
|
||||
for &(haystack, needle, start, end, mut score) in cases {
|
||||
score += needle.chars().count() as u16 * SCORE_MATCH;
|
||||
let query = matcher.compile_query(needle);
|
||||
let res = matcher.fuzzy_indicies(&query, haystack, &mut indicies);
|
||||
assert_eq!(res, Some(score), "{needle:?} did not match {haystack:?}");
|
||||
assert_eq!(
|
||||
indicies.first().copied()..indicies.last().map(|&i| i + 1),
|
||||
Some(start)..Some(end),
|
||||
"{needle:?} match {haystack:?}[{start}..{end}]"
|
||||
);
|
||||
}
|
||||
}
|
||||
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
|
||||
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
|
||||
|
||||
/// Fuzzy-match expectations checked with `use_v1` disabled.
///
/// Every entry is `(haystack, needle, start, end, score)` as consumed by
/// `assert_matches`; the score column lists only bonuses and penalties.
#[test]
fn test_v2_fuzzy() {
    let use_v1 = false;
    let normalize = false;
    let case_sensitive = false;
    let path = false;

    let cases: &[(&str, &str, u32, u32, u16)] = &[
        (
            "fooBarbaz1",
            "oBZ",
            2,
            9,
            BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "foo bar baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "/AutomatorDocument.icns",
            "rdoc",
            9,
            13,
            BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
        ),
        (
            "/man1/zshcompctl.1",
            "zshc",
            6,
            10,
            BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_BOUNDARY_DELIMITER * 3,
        ),
        (
            "/.oh-my-zsh/cache",
            "zshc",
            8,
            13,
            BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
                - PENALTY_GAP_START
                + BONUS_BOUNDARY_DELIMITER,
        ),
        (
            "ab0123 456",
            "12356",
            3,
            10,
            BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
        ),
        (
            "abc123 456",
            "12356",
            3,
            10,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_CONSECUTIVE
                - PENALTY_GAP_START
                - PENALTY_GAP_EXTENSION,
        ),
        (
            "foo/bar/baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "fooBarBaz",
            "fbb",
            0,
            7,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
                - 2 * PENALTY_GAP_START
                - 2 * PENALTY_GAP_EXTENSION,
        ),
        (
            "foo barbaz",
            "fbb",
            0,
            8,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
                - PENALTY_GAP_START * 2
                - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "fooBar Baz",
            "foob",
            0,
            4,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
        ),
        (
            "xFoo-Bar Baz",
            "foo-b",
            1,
            6,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_NON_WORD
                + BONUS_BOUNDARY,
        ),
    ];

    assert_matches(use_v1, normalize, case_sensitive, path, cases);
}
|
||||
|
||||
/// Fuzzy-match expectations checked with `use_v1` enabled.
///
/// Every entry is `(haystack, needle, start, end, score)` as consumed by
/// `assert_matches`; the score column lists only bonuses and penalties.
#[test]
fn test_v1_fuzzy() {
    let use_v1 = true;
    let normalize = false;
    let case_sensitive = false;
    let path = false;

    let cases: &[(&str, &str, u32, u32, u16)] = &[
        (
            "fooBarbaz1",
            "oBZ",
            2,
            9,
            BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "foo bar baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "/AutomatorDocument.icns",
            "rdoc",
            9,
            13,
            BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
        ),
        (
            "/man1/zshcompctl.1",
            "zshc",
            6,
            10,
            BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_BOUNDARY_DELIMITER * 3,
        ),
        (
            "/.oh-my-zsh/cache",
            "zshc",
            8,
            13,
            BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
                - PENALTY_GAP_START
                + BONUS_BOUNDARY_DELIMITER,
        ),
        (
            "ab0123 456",
            "12356",
            3,
            10,
            BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
        ),
        (
            "abc123 456",
            "12356",
            3,
            10,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_CONSECUTIVE
                - PENALTY_GAP_START
                - PENALTY_GAP_EXTENSION,
        ),
        (
            "foo/bar/baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "fooBarBaz",
            "fbb",
            0,
            7,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
                - 2 * PENALTY_GAP_START
                - 2 * PENALTY_GAP_EXTENSION,
        ),
        (
            "foo barbaz",
            "fbb",
            0,
            8,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
                - PENALTY_GAP_START * 2
                - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "fooBar Baz",
            "foob",
            0,
            4,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
        ),
        (
            "xFoo-Bar Baz",
            "foo-b",
            1,
            6,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_NON_WORD
                + BONUS_BOUNDARY,
        ),
    ];

    assert_matches(use_v1, normalize, case_sensitive, path, cases);
}
|
123
src/utf32_str.rs
Normal file
123
src/utf32_str.rs
Normal file
@ -0,0 +1,123 @@
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
|
||||
/// A UTF-32 encoded (char array) string that can be used as an input to fuzzy
/// matching.
///
/// Usually Rust's UTF-8 encoded strings are great. However, fuzzy matching
/// operates on codepoints (it should operate on graphemes, but that is too
/// much hassle to deal with). We want to iterate over these codepoints
/// quickly, several times (up to 5), during matching.
///
/// Doing codepoint segmentation on the fly not only blows through the cache
/// (lookup tables and instruction cache) but also has nontrivial runtime
/// compared to the matching itself. Furthermore, there are a lot of extra
/// optimizations available for ASCII-only text (but checking during each
/// match has too much overhead).
///
/// Of course this comes at an extra memory cost, as we usually still need the
/// UTF-8 encoded variant for rendering. In the (dominant) case of ASCII-only
/// text we don't require a copy. Furthermore, fuzzy matching is usually
/// applied while the user is typing, so the same item is potentially matched
/// many times (making the upfront cost more worthwhile). That means it is
/// basically always worth it to presegment the string.
///
/// For use cases that only match (a lot of) strings once, it is possible to
/// keep a char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher naturally produces
/// char indices (instead of UTF-8 offsets) anyway. With a codepoint-based
/// representation like this, the indices can be used directly.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Debug)]
pub enum Utf32Str<'a> {
    /// A string represented as ASCII encoded bytes.
    /// Correctness invariant: must only contain valid ASCII (<= 127).
    Ascii(&'a [u8]),
    /// A string represented as an array of Unicode codepoints (basically UTF-32).
    Unicode(&'a [char]),
}

impl<'a> Utf32Str<'a> {
    /// Convenience method to construct a `Utf32Str` from a normal UTF-8 `str`.
    ///
    /// ASCII-only input borrows the bytes of `str` directly and leaves `buf`
    /// untouched. Any other input clears `buf`, fills it with the chars of
    /// `str`, and borrows from `buf`.
    pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
        if str.is_ascii() {
            Utf32Str::Ascii(str.as_bytes())
        } else {
            buf.clear();
            buf.extend(str.chars());
            Utf32Str::Unicode(&*buf)
        }
    }

    /// Returns the number of codepoints in this string.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            Utf32Str::Unicode(codepoints) => codepoints.len(),
            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
        }
    }

    /// Returns `true` if this string contains no codepoints.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the sub-string covered by `range`, measured in codepoints.
    ///
    /// The range follows the usual Rust conventions: `a..b` excludes index
    /// `b`, `a..=b` includes it, and an unbounded side extends to the
    /// respective end of the string.
    ///
    /// # Panics
    ///
    /// Panics if the resolved range is out of bounds or its start is greater
    /// than its end.
    #[inline]
    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str<'_> {
        let start = match range.start_bound() {
            Bound::Included(&start) => start,
            Bound::Excluded(&start) => start + 1,
            Bound::Unbounded => 0,
        };
        // Convert to an exclusive end: an inclusive bound needs the `+ 1`,
        // an exclusive bound is already in the right form.
        let end = match range.end_bound() {
            Bound::Included(&end) => end + 1,
            Bound::Excluded(&end) => end,
            Bound::Unbounded => self.len(),
        };
        match self {
            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
        }
    }

    /// Same as `slice` but accepts a `u32` range for convenience, since
    /// those are the indices returned by the matcher.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as `slice`.
    #[inline]
    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str<'_> {
        // `(Bound<usize>, Bound<usize>)` is itself a `RangeBounds<usize>`, so
        // the bounds are widened once and the slicing logic lives in `slice`.
        self.slice((
            Self::widen_bound(range.start_bound()),
            Self::widen_bound(range.end_bound()),
        ))
    }

    /// Converts a borrowed `u32` bound into an owned `usize` bound.
    #[inline]
    fn widen_bound(bound: Bound<&u32>) -> Bound<usize> {
        match bound {
            Bound::Included(&idx) => Bound::Included(idx as usize),
            Bound::Excluded(&idx) => Bound::Excluded(idx as usize),
            Bound::Unbounded => Bound::Unbounded,
        }
    }

    /// Returns `true` if this string uses the `Ascii` representation.
    pub fn is_ascii(&self) -> bool {
        matches!(self, Utf32Str::Ascii(_))
    }

    /// Returns the codepoint at index `idx`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds.
    pub fn get(&self, idx: u32) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
            Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
        }
    }

    /// Returns the last codepoint of this string.
    ///
    /// # Panics
    ///
    /// Panics if the string is empty.
    pub fn last(&self) -> char {
        match self {
            Utf32Str::Ascii(bytes) => {
                *bytes.last().expect("Utf32Str::last called on an empty string") as char
            }
            Utf32Str::Unicode(codepoints) => *codepoints
                .last()
                .expect("Utf32Str::last called on an empty string"),
        }
    }
}
|
||||
|
||||
// impl Str for &[char] {
|
||||
// type Chars;
|
||||
|
||||
// fn chars(&self) -> Self::Chars {
|
||||
// todo!()
|
||||
// }
|
||||
|
||||
// fn slice(&self, range: impl RangeBounds<u32>) {
|
||||
// todo!()
|
||||
// }
|
||||
// }
|
Loading…
Reference in New Issue
Block a user