mirror of
https://github.com/solaeus/nucleo.git
synced 2025-01-05 07:27:49 +00:00
better algorithm
Switch to an algorithm with two matrices (optimized to use a single matrix while tracking indices, and just a single row when only scoring) that can be proven to always provide the optimal result (fzf's v2 algorithm does not, and can even produce worse results than its v1 algorithm). The algorithm is very similar to skim but still uses fzf's bonus system and is orders of magnitude faster (and falls back to fzf's greedy v1 algorithm, which is actually surprisingly close to the optimal algorithm).
This commit is contained in:
parent
8527340bc9
commit
becd35c5de
@ -19,7 +19,7 @@ pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
|
|||||||
/// repr tansparent wrapper around u8 with better formatting and PartialEq<char> implementation
|
/// repr tansparent wrapper around u8 with better formatting and PartialEq<char> implementation
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||||
pub(crate) struct AsciiChar(u8);
|
pub(crate) struct AsciiChar(pub u8);
|
||||||
|
|
||||||
impl AsciiChar {
|
impl AsciiChar {
|
||||||
pub fn cast(bytes: &[u8]) -> &[AsciiChar] {
|
pub fn cast(bytes: &[u8]) -> &[AsciiChar] {
|
||||||
|
81
src/debug.rs
81
src/debug.rs
@ -1,69 +1,32 @@
|
|||||||
use crate::chars::Char;
|
use crate::matrix::{MatrixCell, ScoreCell};
|
||||||
use crate::matrix::{haystack, HaystackChar, Matrix, MatrixCell, MatrixRow, MatrixRowMut};
|
|
||||||
use std::fmt::{Debug, Formatter, Result};
|
use std::fmt::{Debug, Formatter, Result};
|
||||||
|
|
||||||
impl<C: Char> Matrix<'_, C> {
|
// impl<C: Char> MatcherData<'_, C> {
|
||||||
pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
|
// pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
|
||||||
let mut cells = &*self.cells;
|
// let mut cells = &*self.cells;
|
||||||
self.row_offs.iter().map(move |&off| {
|
// self.row_offs.iter().map(move |&off| {
|
||||||
let len = self.haystack.len() - off as usize;
|
// let len = self.haystack.len() - off as usize;
|
||||||
let (row, tmp) = cells.split_at(len);
|
// let (row, tmp) = cells.split_at(len);
|
||||||
cells = tmp;
|
// cells = tmp;
|
||||||
MatrixRow { off, cells: row }
|
// MatrixRow { off, cells: row }
|
||||||
})
|
// })
|
||||||
}
|
// }
|
||||||
|
|
||||||
pub fn haystack(
|
// pub fn haystack(
|
||||||
&self,
|
// &self,
|
||||||
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
|
// ) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
|
||||||
haystack(self.haystack, self.bonus, 0)
|
// haystack(self.haystack, self.bonus, 0)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
impl Debug for ScoreCell {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
write!(f, "({}, {}, {})", self.score, self.bonus, self.matched)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Debug for MatrixCell {
|
impl Debug for MatrixCell {
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
write!(f, "({}, {})", self.score, self.consecutive_chars)
|
write!(f, "({}, {})", (self.0 & 1) != 0, (self.0 & 2) != 0)
|
||||||
}
|
|
||||||
}
|
|
||||||
impl<C: Char> Debug for HaystackChar<C> {
|
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
|
||||||
write!(f, "({}, {})", self.char, self.bonus)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl Debug for MatrixRow<'_> {
|
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
|
||||||
let mut f = f.debug_list();
|
|
||||||
f.entries((0..self.off).map(|_| &MatrixCell {
|
|
||||||
score: 0,
|
|
||||||
consecutive_chars: 0,
|
|
||||||
}));
|
|
||||||
f.entries(self.cells.iter());
|
|
||||||
f.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl Debug for MatrixRowMut<'_> {
|
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
|
||||||
let mut f = f.debug_list();
|
|
||||||
f.entries((0..self.off).map(|_| &(0, 0)));
|
|
||||||
f.entries(self.cells.iter());
|
|
||||||
f.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub struct DebugList<I>(I);
|
|
||||||
impl<I> Debug for DebugList<I>
|
|
||||||
where
|
|
||||||
I: Iterator + Clone,
|
|
||||||
I::Item: Debug,
|
|
||||||
{
|
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
|
||||||
f.debug_list().entries(self.0.clone()).finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl<'a, C: Char> Debug for Matrix<'a, C> {
|
|
||||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
|
||||||
f.debug_struct("Matrix")
|
|
||||||
.field("haystack", &DebugList(self.haystack()))
|
|
||||||
.field("matrix", &DebugList(self.rows()))
|
|
||||||
.finish()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
107
src/exact.rs
Normal file
107
src/exact.rs
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
use memchr::{Memchr, Memchr2};
|
||||||
|
|
||||||
|
use crate::chars::{AsciiChar, Char};
|
||||||
|
use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH};
|
||||||
|
use crate::Matcher;
|
||||||
|
|
||||||
|
impl Matcher {
|
||||||
|
pub(crate) fn substring_match_1_ascii<const INDICES: bool>(
|
||||||
|
&mut self,
|
||||||
|
haystack: &[u8],
|
||||||
|
c: u8,
|
||||||
|
indices: &mut Vec<u32>,
|
||||||
|
) -> Option<u16> {
|
||||||
|
let mut max_score = 0;
|
||||||
|
let mut max_pos = 0;
|
||||||
|
if self.config.ignore_case && c >= b'a' && c <= b'z' {
|
||||||
|
for i in Memchr2::new(c, c - 32, haystack) {
|
||||||
|
let prev_char_class = i
|
||||||
|
.checked_sub(1)
|
||||||
|
.map(|i| AsciiChar(haystack[i]).char_class(&self.config))
|
||||||
|
.unwrap_or(self.config.initial_char_class);
|
||||||
|
let char_class = AsciiChar(haystack[i]).char_class(&self.config);
|
||||||
|
let bonus = self.config.bonus_for(prev_char_class, char_class);
|
||||||
|
let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH;
|
||||||
|
if score > max_score {
|
||||||
|
max_pos = i as u32;
|
||||||
|
max_score = score;
|
||||||
|
// can't get better than this
|
||||||
|
if score >= self.config.bonus_boundary_white
|
||||||
|
&& score >= self.config.bonus_boundary_delimiter
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let char_class = AsciiChar(c).char_class(&self.config);
|
||||||
|
for i in Memchr::new(c, haystack) {
|
||||||
|
let prev_char_class = i
|
||||||
|
.checked_sub(1)
|
||||||
|
.map(|i| AsciiChar(haystack[i]).char_class(&self.config))
|
||||||
|
.unwrap_or(self.config.initial_char_class);
|
||||||
|
let bonus = self.config.bonus_for(prev_char_class, char_class);
|
||||||
|
let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH;
|
||||||
|
if score > max_score {
|
||||||
|
max_pos = i as u32;
|
||||||
|
max_score = score;
|
||||||
|
// can't get better than this
|
||||||
|
if score >= self.config.bonus_boundary_white
|
||||||
|
&& score >= self.config.bonus_boundary_delimiter
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if max_score == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
if INDICES {
|
||||||
|
indices.clear();
|
||||||
|
indices.push(max_pos);
|
||||||
|
}
|
||||||
|
Some(max_score)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn substring_match_1_non_ascii<const INDICES: bool>(
|
||||||
|
&mut self,
|
||||||
|
haystack: &[char],
|
||||||
|
needle: char,
|
||||||
|
start: usize,
|
||||||
|
indices: &mut Vec<u32>,
|
||||||
|
) -> u16 {
|
||||||
|
let mut max_score = 0;
|
||||||
|
let mut max_pos = 0;
|
||||||
|
let mut prev_class = start
|
||||||
|
.checked_sub(1)
|
||||||
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
|
.unwrap_or(self.config.initial_char_class);
|
||||||
|
for (i, &c) in haystack.iter().enumerate() {
|
||||||
|
let (c, char_class) = c.char_class_and_normalize(&self.config);
|
||||||
|
if c != needle {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let bonus = self.config.bonus_for(prev_class, char_class);
|
||||||
|
prev_class = char_class;
|
||||||
|
let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH;
|
||||||
|
if score > max_score {
|
||||||
|
max_pos = i as u32;
|
||||||
|
max_score = score;
|
||||||
|
// can't get better than this
|
||||||
|
if score >= self.config.bonus_boundary_white
|
||||||
|
&& score >= self.config.bonus_boundary_delimiter
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if INDICES {
|
||||||
|
indices.clear();
|
||||||
|
indices.push(max_pos);
|
||||||
|
}
|
||||||
|
max_score
|
||||||
|
}
|
||||||
|
}
|
@ -1,11 +1,10 @@
|
|||||||
use std::cmp::max;
|
use std::cmp::max;
|
||||||
use std::mem::take;
|
|
||||||
|
|
||||||
use crate::chars::{Char, CharClass};
|
use crate::chars::{Char, CharClass};
|
||||||
use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow};
|
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
|
||||||
use crate::score::{
|
use crate::score::{
|
||||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, PENALTY_GAP_START,
|
||||||
PENALTY_GAP_START, SCORE_MATCH,
|
SCORE_MATCH,
|
||||||
};
|
};
|
||||||
use crate::{Matcher, MatcherConfig};
|
use crate::{Matcher, MatcherConfig};
|
||||||
|
|
||||||
@ -36,61 +35,82 @@ impl Matcher {
|
|||||||
.checked_sub(1)
|
.checked_sub(1)
|
||||||
.map(|i| haystack[i].char_class(&self.config))
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
.unwrap_or(self.config.initial_char_class);
|
.unwrap_or(self.config.initial_char_class);
|
||||||
let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config);
|
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
|
||||||
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
|
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
|
||||||
if !matched {
|
if !matched {
|
||||||
debug_assert!(!(H::ASCII && N::ASCII));
|
assert!(
|
||||||
|
!N::ASCII || !H::ASCII,
|
||||||
|
"should have been caught by prefilter"
|
||||||
|
);
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
if needle.len() == 1 {
|
|
||||||
indices.clear();
|
|
||||||
indices.push(max_score_pos as u32 + start as u32);
|
|
||||||
return Some(max_score);
|
|
||||||
}
|
|
||||||
debug_assert_eq!(
|
|
||||||
matrix.row_offs[0], 0,
|
|
||||||
"prefilter should have put us at the start of the match"
|
|
||||||
);
|
|
||||||
|
|
||||||
// populate the matrix and find the best score
|
// populate the matrix and find the best score
|
||||||
let (max_score, best_match_end) = matrix.populate_matrix(needle);
|
let matrix_len = matrix.populate_matrix::<INDICES, _>(needle);
|
||||||
|
let last_row_off = matrix.row_offs[needle.len() - 1];
|
||||||
|
let relative_last_row_off = last_row_off as usize + 1 - needle.len();
|
||||||
|
let (match_end, match_score_cell) = matrix.current_row[relative_last_row_off..]
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.max_by_key(|(_, cell)| cell.score)
|
||||||
|
.expect("there must be atleast one match");
|
||||||
if INDICES {
|
if INDICES {
|
||||||
matrix.reconstruct_optimal_path(needle, start as u32, indices, best_match_end);
|
matrix.reconstruct_optimal_path(match_end as u16, indices, matrix_len, start as u32);
|
||||||
}
|
}
|
||||||
Some(max_score)
|
Some(match_score_cell.score as u16)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<H: Char> Matrix<'_, H> {
|
fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> ScoreCell {
|
||||||
fn setup<N: Char>(
|
let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE));
|
||||||
|
let score_match = m_score + consecutive_bonus as i32;
|
||||||
|
let score_skip = p_score + next_bonus as i32;
|
||||||
|
if score_match > score_skip {
|
||||||
|
ScoreCell {
|
||||||
|
score: score_match + SCORE_MATCH as i32,
|
||||||
|
bonus: consecutive_bonus,
|
||||||
|
matched: true,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ScoreCell {
|
||||||
|
score: score_skip + SCORE_MATCH as i32,
|
||||||
|
bonus: consecutive_bonus,
|
||||||
|
matched: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) {
|
||||||
|
let score_match = prev_m_score - PENALTY_GAP_START as i32;
|
||||||
|
let score_skip = prev_p_score - PENALTY_GAP_EXTENSION as i32;
|
||||||
|
if score_match > score_skip {
|
||||||
|
(score_match, true)
|
||||||
|
} else {
|
||||||
|
(score_skip, false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<H: Char> MatcherDataView<'_, H> {
|
||||||
|
fn setup<const INDICES: bool, N: Char>(
|
||||||
&mut self,
|
&mut self,
|
||||||
needle: &[N],
|
needle: &[N],
|
||||||
mut prev_class: CharClass,
|
mut prev_class: CharClass,
|
||||||
config: &MatcherConfig,
|
config: &MatcherConfig,
|
||||||
) -> (u16, u16, bool)
|
) -> bool
|
||||||
where
|
where
|
||||||
H: PartialEq<N>,
|
H: PartialEq<N>,
|
||||||
{
|
{
|
||||||
let haystack_len = self.haystack.len() as u16;
|
|
||||||
let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
|
let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
|
||||||
let (mut needle_char, mut row_start) = row_iter.next().unwrap();
|
let (mut needle_char, mut row_start) = row_iter.next().unwrap();
|
||||||
|
|
||||||
let col_iter = self
|
let col_iter = self
|
||||||
.haystack
|
.haystack
|
||||||
.iter_mut()
|
.iter_mut()
|
||||||
.zip(self.cells.iter_mut())
|
|
||||||
.zip(self.bonus.iter_mut())
|
.zip(self.bonus.iter_mut())
|
||||||
.enumerate();
|
.enumerate();
|
||||||
|
|
||||||
let mut max_score = 0;
|
|
||||||
let mut max_score_pos = 0;
|
|
||||||
let mut in_gap = false;
|
|
||||||
let mut prev_score = 0u16;
|
|
||||||
let mut matched = false;
|
let mut matched = false;
|
||||||
let first_needle_char = needle[0];
|
for (i, (c_, bonus_)) in col_iter {
|
||||||
let mut matrix_cells = 0;
|
|
||||||
|
|
||||||
for (i, ((c_, matrix_cell), bonus_)) in col_iter {
|
|
||||||
let (c, class) = c_.char_class_and_normalize(config);
|
let (c, class) = c_.char_class_and_normalize(config);
|
||||||
*c_ = c;
|
*c_ = c;
|
||||||
|
|
||||||
@ -103,195 +123,197 @@ impl<H: Char> Matrix<'_, H> {
|
|||||||
if c == needle_char {
|
if c == needle_char {
|
||||||
// save the first idx of each char
|
// save the first idx of each char
|
||||||
if let Some(next) = row_iter.next() {
|
if let Some(next) = row_iter.next() {
|
||||||
matrix_cells += haystack_len - i;
|
|
||||||
*row_start = i;
|
*row_start = i;
|
||||||
(needle_char, row_start) = next;
|
(needle_char, row_start) = next;
|
||||||
} else if !matched {
|
} else if !matched {
|
||||||
matrix_cells += haystack_len - i;
|
|
||||||
*row_start = i;
|
*row_start = i;
|
||||||
// we have atleast one match
|
// we have atleast one match
|
||||||
matched = true;
|
matched = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if !matched {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
debug_assert_eq!(self.row_offs[0], 0);
|
||||||
|
Self::score_row::<true, INDICES, _>(
|
||||||
|
self.current_row,
|
||||||
|
self.matrix_cells,
|
||||||
|
self.haystack,
|
||||||
|
self.bonus,
|
||||||
|
0,
|
||||||
|
self.row_offs[1],
|
||||||
|
0,
|
||||||
|
needle[0],
|
||||||
|
needle[1],
|
||||||
|
);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
// we calculate two scores:
|
fn score_row<const FIRST_ROW: bool, const INDICES: bool, N: Char>(
|
||||||
// * one for transversing the matrix horizontially (no match at
|
current_row: &mut [ScoreCell],
|
||||||
// the current char)
|
matrix_cells: &mut [MatrixCell],
|
||||||
// * one for transversing the matrix diagonally (match at the
|
haystack: &[H],
|
||||||
// current char)
|
bonus: &[u16],
|
||||||
// the maximum of those two scores is used
|
row_off: u16,
|
||||||
let gap_penalty = if in_gap {
|
mut next_row_off: u16,
|
||||||
PENALTY_GAP_EXTENSION
|
needle_idx: u16,
|
||||||
|
needle_char: N,
|
||||||
|
next_needle_char: N,
|
||||||
|
) where
|
||||||
|
H: PartialEq<N>,
|
||||||
|
{
|
||||||
|
next_row_off -= 1;
|
||||||
|
let relative_row_off = row_off - needle_idx;
|
||||||
|
let next_relative_row_off = next_row_off - needle_idx;
|
||||||
|
let skipped_col_iter = haystack[row_off as usize..next_row_off as usize]
|
||||||
|
.iter()
|
||||||
|
.zip(bonus[row_off as usize..next_row_off as usize].iter())
|
||||||
|
.zip(current_row[relative_row_off as usize..next_relative_row_off as usize].iter_mut())
|
||||||
|
.zip(matrix_cells.iter_mut());
|
||||||
|
let mut prev_p_score = i32::MIN / 2;
|
||||||
|
let mut prev_m_score = i32::MIN / 2;
|
||||||
|
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
|
||||||
|
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||||
|
let m_cell = if FIRST_ROW {
|
||||||
|
if c == needle_char {
|
||||||
|
// TODO: do we really want to start with a penalty here??
|
||||||
|
let mut cell =
|
||||||
|
next_m_score(0, i32::MIN / 2, 0, bonus * BONUS_FIRST_CHAR_MULTIPLIER);
|
||||||
|
cell.bonus = *bonus;
|
||||||
|
cell
|
||||||
} else {
|
} else {
|
||||||
PENALTY_GAP_START
|
ScoreCell {
|
||||||
|
score: i32::MIN / 2,
|
||||||
|
bonus: 0,
|
||||||
|
matched: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*score_cell
|
||||||
};
|
};
|
||||||
let score_gap = prev_score.saturating_sub(gap_penalty);
|
if INDICES {
|
||||||
let score_match = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
matrix_cell.set(p_matched, m_cell.matched);
|
||||||
if c == first_needle_char && score_match >= score_gap {
|
}
|
||||||
matrix_cell.consecutive_chars = 1;
|
prev_p_score = p_score;
|
||||||
matrix_cell.score = score_match;
|
prev_m_score = m_cell.score;
|
||||||
in_gap = false;
|
}
|
||||||
if needle.len() == 1 && score_match > max_score {
|
let col_iter = haystack[next_row_off as usize..]
|
||||||
max_score = score_match;
|
.windows(2)
|
||||||
max_score_pos = i;
|
.zip(bonus[next_row_off as usize..].windows(2))
|
||||||
// can't get better than this
|
.zip(current_row[next_relative_row_off as usize..].iter_mut())
|
||||||
if bonus >= BONUS_BOUNDARY {
|
.zip(matrix_cells[(next_relative_row_off - relative_row_off) as usize..].iter_mut());
|
||||||
break;
|
for (((c, bonus), score_cell), matrix_cell) in col_iter {
|
||||||
|
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
|
||||||
|
let m_cell = if FIRST_ROW {
|
||||||
|
if c[0] == needle_char {
|
||||||
|
// TODO: do we really want to start with a penalty here??
|
||||||
|
let mut cell =
|
||||||
|
next_m_score(0, i32::MIN / 2, 0, bonus[0] * BONUS_FIRST_CHAR_MULTIPLIER);
|
||||||
|
cell.bonus = bonus[0];
|
||||||
|
cell
|
||||||
|
} else {
|
||||||
|
ScoreCell {
|
||||||
|
score: i32::MIN / 2,
|
||||||
|
bonus: 0,
|
||||||
|
matched: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
matrix_cell.consecutive_chars = 0;
|
*score_cell
|
||||||
matrix_cell.score = score_gap;
|
};
|
||||||
in_gap = true;
|
*score_cell = if c[1] == next_needle_char {
|
||||||
|
next_m_score(p_score, m_cell.score, m_cell.bonus, bonus[1])
|
||||||
|
} else {
|
||||||
|
ScoreCell {
|
||||||
|
score: i32::MIN / 2,
|
||||||
|
bonus: 0,
|
||||||
|
matched: false,
|
||||||
}
|
}
|
||||||
prev_score = matrix_cell.score;
|
};
|
||||||
|
if INDICES {
|
||||||
|
matrix_cell.set(p_matched, m_cell.matched);
|
||||||
|
}
|
||||||
|
prev_p_score = p_score;
|
||||||
|
prev_m_score = m_cell.score;
|
||||||
}
|
}
|
||||||
self.cells = &mut take(&mut self.cells)[..matrix_cells as usize];
|
|
||||||
(max_score_pos, max_score, matched)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn populate_matrix<N: Char>(&mut self, needle: &[N]) -> (u16, u16)
|
fn populate_matrix<const INDICES: bool, N: Char>(&mut self, needle: &[N]) -> usize
|
||||||
where
|
where
|
||||||
H: PartialEq<N>,
|
H: PartialEq<N>,
|
||||||
{
|
{
|
||||||
let mut max_score = 0;
|
let mut matrix_cells = &mut self.matrix_cells[self.current_row.len()..];
|
||||||
let mut max_score_end = 0;
|
let mut row_iter = needle[1..]
|
||||||
|
|
||||||
let mut row_iter = needle
|
|
||||||
.iter()
|
.iter()
|
||||||
.zip(rows_mut(self.row_offs, self.cells, self.haystack.len()))
|
.copied()
|
||||||
|
.zip(self.row_offs[1..].iter().copied())
|
||||||
.enumerate();
|
.enumerate();
|
||||||
// skip the first row we already calculated the in `setup` initial scores
|
let (mut needle_idx, (mut needle_char, mut row_off)) = row_iter.next().unwrap();
|
||||||
let (_, mut prev_matrix_row) = row_iter.next().unwrap().1;
|
for (next_needle_idx, (next_needle_char, next_row_off)) in row_iter {
|
||||||
|
Self::score_row::<false, INDICES, _>(
|
||||||
for (i, (&needle_char, row)) in row_iter {
|
self.current_row,
|
||||||
let haystack = haystack(self.haystack, self.bonus, row.off);
|
matrix_cells,
|
||||||
let mut in_gap = false;
|
self.haystack,
|
||||||
let mut prev_matrix_cell = MatrixCell {
|
self.bonus,
|
||||||
score: 0,
|
row_off,
|
||||||
consecutive_chars: 0,
|
next_row_off,
|
||||||
};
|
needle_idx as u16 + 1,
|
||||||
// we are interested in the score of the previous character
|
needle_char,
|
||||||
// in the previous row. This represents the previous char
|
next_needle_char,
|
||||||
// for each possible pattern. This is equivalent to diagonal movement
|
);
|
||||||
let diagonal_start = row.off - prev_matrix_row.off - 1;
|
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
|
||||||
let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..];
|
matrix_cells = &mut matrix_cells[len..];
|
||||||
|
(needle_idx, needle_char, row_off) = (next_needle_idx, next_needle_char, next_row_off);
|
||||||
for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
|
||||||
.zip(row.cells.iter_mut())
|
|
||||||
.zip(diagonal.iter())
|
|
||||||
.enumerate()
|
|
||||||
{
|
|
||||||
let col = j + row.off as usize;
|
|
||||||
let gap_penalty = if in_gap {
|
|
||||||
PENALTY_GAP_EXTENSION
|
|
||||||
} else {
|
|
||||||
PENALTY_GAP_START
|
|
||||||
};
|
|
||||||
// we calculate two scores:
|
|
||||||
// * one for transversing the matrix horizontially (no match at
|
|
||||||
// the current char)
|
|
||||||
// * one for transversing the matrix diagonally (match at the
|
|
||||||
// current char)
|
|
||||||
// the maximum of those two scores is used
|
|
||||||
let mut score_diag = 0;
|
|
||||||
let score_hor = prev_matrix_cell.score.saturating_sub(gap_penalty);
|
|
||||||
|
|
||||||
let mut consecutive = 0;
|
|
||||||
if haystack_char.char == needle_char {
|
|
||||||
// we have a match at the current char
|
|
||||||
score_diag = diag_matrix_cell.score + SCORE_MATCH;
|
|
||||||
let mut bonus = haystack_char.bonus;
|
|
||||||
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
|
||||||
if consecutive > 1 {
|
|
||||||
let first_bonus = self.bonus[col + 1 - consecutive as usize];
|
|
||||||
if bonus > first_bonus {
|
|
||||||
if bonus >= BONUS_BOUNDARY {
|
|
||||||
consecutive = 1
|
|
||||||
} else {
|
|
||||||
bonus = max(bonus, BONUS_CONSECUTIVE)
|
|
||||||
}
|
}
|
||||||
} else {
|
matrix_cells.as_ptr() as usize - self.matrix_cells.as_ptr() as usize
|
||||||
bonus = max(first_bonus, BONUS_CONSECUTIVE)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if score_diag + bonus < score_hor
|
|
||||||
|| (consecutive == 1 && score_diag + bonus == score_hor)
|
|
||||||
{
|
|
||||||
score_diag += haystack_char.bonus;
|
|
||||||
consecutive = 0;
|
|
||||||
} else {
|
|
||||||
score_diag += bonus;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
in_gap = consecutive == 0;
|
|
||||||
let score = max(score_diag, score_hor);
|
|
||||||
if i == needle.len() - 1 && score > max_score {
|
|
||||||
max_score = score;
|
|
||||||
max_score_end = col as u16;
|
|
||||||
}
|
|
||||||
matrix_cell.consecutive_chars = consecutive;
|
|
||||||
matrix_cell.score = score;
|
|
||||||
prev_matrix_cell = *matrix_cell;
|
|
||||||
}
|
|
||||||
prev_matrix_row = row;
|
|
||||||
}
|
|
||||||
(max_score, max_score_end)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reconstruct_optimal_path<N: Char>(
|
fn reconstruct_optimal_path(
|
||||||
&self,
|
&self,
|
||||||
needle: &[N],
|
max_score_end: u16,
|
||||||
start: u32,
|
|
||||||
indices: &mut Vec<u32>,
|
indices: &mut Vec<u32>,
|
||||||
best_match_end: u16,
|
matrix_len: usize,
|
||||||
|
start: u32,
|
||||||
) {
|
) {
|
||||||
indices.clear();
|
indices.clear();
|
||||||
indices.resize(needle.len(), 0);
|
indices.resize(self.row_offs.len(), 0);
|
||||||
|
let last_row_off = *self.row_offs.last().unwrap();
|
||||||
let mut row_iter = self.rows_rev().zip(indices.iter_mut().rev()).peekable();
|
indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32;
|
||||||
let (mut row, mut matched_col_idx) = row_iter.next().unwrap();
|
|
||||||
let mut next_row: Option<MatrixRow> = None;
|
|
||||||
let mut col = best_match_end;
|
|
||||||
let mut prefer_match = true;
|
|
||||||
let haystack_len = self.haystack.len() as u16;
|
|
||||||
|
|
||||||
|
let mut matrix_cells = &self.matrix_cells[..matrix_len];
|
||||||
|
let width = self.current_row.len();
|
||||||
|
let mut row_iter = self.row_offs[..self.row_offs.len() - 1]
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.enumerate()
|
||||||
|
.rev()
|
||||||
|
.map(|(i, off)| {
|
||||||
|
let relative_off = off as usize - i;
|
||||||
|
let row;
|
||||||
|
(matrix_cells, row) =
|
||||||
|
matrix_cells.split_at(matrix_cells.len() - (width - relative_off));
|
||||||
|
(i, off, row)
|
||||||
|
});
|
||||||
|
let (mut row_idx, mut row_off, mut row) = row_iter.next().unwrap();
|
||||||
|
let mut col = max_score_end;
|
||||||
|
let relative_last_row_off = last_row_off as usize + 1 - self.row_offs.len();
|
||||||
|
let mut matched = self.current_row[col as usize + relative_last_row_off].matched;
|
||||||
|
col += last_row_off - row_off - 1;
|
||||||
loop {
|
loop {
|
||||||
let score = row[col].score;
|
if matched {
|
||||||
// we calculate two scores:
|
indices[row_idx] = start + col as u32 + row_off as u32;
|
||||||
// * one for transversing the matrix horizontially (no match at
|
|
||||||
// the current char)
|
|
||||||
// * one for transversing the matrix diagonally (match at the
|
|
||||||
// current char)
|
|
||||||
// the maximum of those two scores is used
|
|
||||||
let mut score_diag = 0;
|
|
||||||
let mut score_horz = 0;
|
|
||||||
if let Some(&(prev_row, _)) = row_iter.peek() {
|
|
||||||
score_diag = prev_row[col - 1].score;
|
|
||||||
}
|
}
|
||||||
if col > row.off {
|
let next_matched = row[col as usize].get(matched);
|
||||||
score_horz = row[col - 1].score;
|
if matched {
|
||||||
}
|
let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else{
|
||||||
let mut in_block = row[col].consecutive_chars > 1;
|
|
||||||
if !in_block && col + 1 < haystack_len {
|
|
||||||
if let Some(next_row) = next_row {
|
|
||||||
if col + 1 >= next_row.off {
|
|
||||||
in_block = next_row[col + 1].consecutive_chars > 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if score > score_diag
|
|
||||||
&& (score > score_horz || in_block || prefer_match && score == score_horz)
|
|
||||||
{
|
|
||||||
*matched_col_idx = col as u32 + start;
|
|
||||||
next_row = Some(row);
|
|
||||||
let Some(next) = row_iter.next() else {
|
|
||||||
break;
|
break;
|
||||||
};
|
};
|
||||||
(row, matched_col_idx) = next
|
col += row_off - next_row_off;
|
||||||
|
(row_idx, row_off, row) = (next_row_idx, next_row_off, next_row)
|
||||||
}
|
}
|
||||||
col -= 1;
|
col -= 1;
|
||||||
prefer_match = row[col].consecutive_chars != 0;
|
matched = next_matched;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
20
src/lib.rs
20
src/lib.rs
@ -5,6 +5,7 @@ pub mod chars;
|
|||||||
mod config;
|
mod config;
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod debug;
|
mod debug;
|
||||||
|
mod exact;
|
||||||
mod fuzzy_greedy;
|
mod fuzzy_greedy;
|
||||||
mod fuzzy_optimal;
|
mod fuzzy_optimal;
|
||||||
mod matrix;
|
mod matrix;
|
||||||
@ -67,6 +68,9 @@ impl Matcher {
|
|||||||
);
|
);
|
||||||
match (haystack, needle_) {
|
match (haystack, needle_) {
|
||||||
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
||||||
|
if let &[needle] = needle {
|
||||||
|
return self.substring_match_1_ascii::<INDICES>(haystack, needle, indidies);
|
||||||
|
}
|
||||||
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?;
|
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?;
|
||||||
self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
|
self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
|
||||||
AsciiChar::cast(haystack),
|
AsciiChar::cast(haystack),
|
||||||
@ -83,6 +87,16 @@ impl Matcher {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
||||||
|
if let &[needle] = needle {
|
||||||
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
||||||
|
let res = self.substring_match_1_non_ascii::<INDICES>(
|
||||||
|
haystack,
|
||||||
|
needle as char,
|
||||||
|
start,
|
||||||
|
indidies,
|
||||||
|
);
|
||||||
|
return Some(res);
|
||||||
|
}
|
||||||
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
||||||
self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
|
self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
|
||||||
haystack,
|
haystack,
|
||||||
@ -94,6 +108,12 @@ impl Matcher {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
||||||
|
if let &[needle] = needle {
|
||||||
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
||||||
|
let res = self
|
||||||
|
.substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indidies);
|
||||||
|
return Some(res);
|
||||||
|
}
|
||||||
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
||||||
self.fuzzy_match_optimal::<INDICES, char, char>(
|
self.fuzzy_match_optimal::<INDICES, char, char>(
|
||||||
haystack,
|
haystack,
|
||||||
|
156
src/matrix.rs
156
src/matrix.rs
@ -1,12 +1,11 @@
|
|||||||
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
|
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
use std::mem::{size_of, take};
|
use std::mem::size_of;
|
||||||
use std::ops::Index;
|
|
||||||
use std::ptr::{slice_from_raw_parts_mut, NonNull};
|
use std::ptr::{slice_from_raw_parts_mut, NonNull};
|
||||||
|
|
||||||
use crate::chars::Char;
|
use crate::chars::Char;
|
||||||
|
|
||||||
const MAX_MATRIX_SIZE: usize = 100 * 1024; // 4*60*1024 = 240KB
|
const MAX_MATRIX_SIZE: usize = 100 * 1024; // 100*1024 = 100KB
|
||||||
|
|
||||||
// these two aren't hard maxima, instead we simply allow whatever will fit into memory
|
// these two aren't hard maxima, instead we simply allow whatever will fit into memory
|
||||||
const MAX_HAYSTACK_LEN: usize = 2048; // 64KB
|
const MAX_HAYSTACK_LEN: usize = 2048; // 64KB
|
||||||
@ -15,21 +14,23 @@ const MAX_NEEDLE_LEN: usize = 2048; // 64KB
|
|||||||
struct MatrixLayout<C: Char> {
|
struct MatrixLayout<C: Char> {
|
||||||
haystack_len: usize,
|
haystack_len: usize,
|
||||||
needle_len: usize,
|
needle_len: usize,
|
||||||
cell_count: usize,
|
|
||||||
layout: Layout,
|
layout: Layout,
|
||||||
haystack_off: usize,
|
haystack_off: usize,
|
||||||
bonus_off: usize,
|
bonus_off: usize,
|
||||||
rows_off: usize,
|
rows_off: usize,
|
||||||
cells_off: usize,
|
score_off: usize,
|
||||||
|
matrix_off: usize,
|
||||||
_phantom: PhantomData<C>,
|
_phantom: PhantomData<C>,
|
||||||
}
|
}
|
||||||
impl<C: Char> MatrixLayout<C> {
|
impl<C: Char> MatrixLayout<C> {
|
||||||
fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout<C> {
|
fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout<C> {
|
||||||
let mut layout = Layout::from_size_align(0, 1).unwrap();
|
let mut layout = Layout::from_size_align(0, 1).unwrap();
|
||||||
let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
|
let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
|
||||||
let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();
|
let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();
|
||||||
let rows_layout = Layout::array::<u16>(needle_len).unwrap();
|
let rows_layout = Layout::array::<u16>(needle_len).unwrap();
|
||||||
let cells_layout = Layout::array::<MatrixCell>(cell_count).unwrap();
|
let score_layout = Layout::array::<ScoreCell>(haystack_len + 1 - needle_len).unwrap();
|
||||||
|
let matrix_layout =
|
||||||
|
Layout::array::<MatrixCell>((haystack_len + 1 - needle_len) * needle_len).unwrap();
|
||||||
|
|
||||||
let haystack_off;
|
let haystack_off;
|
||||||
(layout, haystack_off) = layout.extend(haystack_layout).unwrap();
|
(layout, haystack_off) = layout.extend(haystack_layout).unwrap();
|
||||||
@ -37,17 +38,19 @@ impl<C: Char> MatrixLayout<C> {
|
|||||||
(layout, bonus_off) = layout.extend(bonus_layout).unwrap();
|
(layout, bonus_off) = layout.extend(bonus_layout).unwrap();
|
||||||
let rows_off;
|
let rows_off;
|
||||||
(layout, rows_off) = layout.extend(rows_layout).unwrap();
|
(layout, rows_off) = layout.extend(rows_layout).unwrap();
|
||||||
let cells_off;
|
let score_off;
|
||||||
(layout, cells_off) = layout.extend(cells_layout).unwrap();
|
(layout, score_off) = layout.extend(score_layout).unwrap();
|
||||||
|
let matrix_off;
|
||||||
|
(layout, matrix_off) = layout.extend(matrix_layout).unwrap();
|
||||||
MatrixLayout {
|
MatrixLayout {
|
||||||
haystack_len,
|
haystack_len,
|
||||||
needle_len,
|
needle_len,
|
||||||
cell_count,
|
|
||||||
layout,
|
layout,
|
||||||
haystack_off,
|
haystack_off,
|
||||||
bonus_off,
|
bonus_off,
|
||||||
rows_off,
|
rows_off,
|
||||||
cells_off,
|
score_off,
|
||||||
|
matrix_off,
|
||||||
_phantom: PhantomData,
|
_phantom: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -57,9 +60,13 @@ impl<C: Char> MatrixLayout<C> {
|
|||||||
unsafe fn fieds_from_ptr(
|
unsafe fn fieds_from_ptr(
|
||||||
&self,
|
&self,
|
||||||
ptr: NonNull<u8>,
|
ptr: NonNull<u8>,
|
||||||
) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) {
|
) -> (
|
||||||
// sanity checks, should not be necessary
|
*mut [C],
|
||||||
|
*mut [u16],
|
||||||
|
*mut [u16],
|
||||||
|
*mut [ScoreCell],
|
||||||
|
*mut [MatrixCell],
|
||||||
|
) {
|
||||||
let base = ptr.as_ptr();
|
let base = ptr.as_ptr();
|
||||||
let haystack = base.add(self.haystack_off) as *mut C;
|
let haystack = base.add(self.haystack_off) as *mut C;
|
||||||
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
|
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
|
||||||
@ -67,109 +74,62 @@ impl<C: Char> MatrixLayout<C> {
|
|||||||
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
|
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
|
||||||
let rows = base.add(self.rows_off) as *mut u16;
|
let rows = base.add(self.rows_off) as *mut u16;
|
||||||
let rows = slice_from_raw_parts_mut(rows, self.needle_len);
|
let rows = slice_from_raw_parts_mut(rows, self.needle_len);
|
||||||
let cells = base.add(self.cells_off) as *mut MatrixCell;
|
let cells = base.add(self.score_off) as *mut ScoreCell;
|
||||||
let cells = slice_from_raw_parts_mut(cells, self.cell_count);
|
let cells = slice_from_raw_parts_mut(cells, self.haystack_len + 1 - self.needle_len);
|
||||||
(haystack, bonus, rows, cells)
|
let matrix = base.add(self.matrix_off) as *mut MatrixCell;
|
||||||
|
let matrix = slice_from_raw_parts_mut(
|
||||||
|
matrix,
|
||||||
|
(self.haystack_len + 1 - self.needle_len) * self.haystack_len,
|
||||||
|
);
|
||||||
|
(haystack, bonus, rows, cells, matrix)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
pub(crate) struct MatrixCell {
|
pub(crate) struct ScoreCell {
|
||||||
pub score: u16,
|
pub score: i32,
|
||||||
pub consecutive_chars: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
pub(crate) struct HaystackChar<C: Char> {
|
|
||||||
pub char: C,
|
|
||||||
pub bonus: u16,
|
pub bonus: u16,
|
||||||
|
pub matched: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
pub(crate) struct MatcherDataView<'a, C: Char> {
|
||||||
pub(crate) struct MatrixRow<'a> {
|
|
||||||
pub off: u16,
|
|
||||||
pub cells: &'a [MatrixCell],
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Intexing returns the cell that corresponds to colmun `col` in this row,
|
|
||||||
/// this is not the same as directly indexing the cells array because every row
|
|
||||||
/// starts at a column offset which needs to be accounted for
|
|
||||||
impl Index<u16> for MatrixRow<'_> {
|
|
||||||
type Output = MatrixCell;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn index(&self, col: u16) -> &Self::Output {
|
|
||||||
&self.cells[(col - self.off) as usize]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) struct MatrixRowMut<'a> {
|
|
||||||
pub off: u16,
|
|
||||||
pub cells: &'a mut [MatrixCell],
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) struct Matrix<'a, C: Char> {
|
|
||||||
pub haystack: &'a mut [C],
|
pub haystack: &'a mut [C],
|
||||||
// stored as a separate array instead of struct
|
// stored as a separate array instead of struct
|
||||||
// to avoid padding sine char is too large and u8 too small :/
|
// to avoid padding sine char is too large and u8 too small :/
|
||||||
pub bonus: &'a mut [u16],
|
pub bonus: &'a mut [u16],
|
||||||
|
pub current_row: &'a mut [ScoreCell],
|
||||||
pub row_offs: &'a mut [u16],
|
pub row_offs: &'a mut [u16],
|
||||||
pub cells: &'a mut [MatrixCell],
|
pub matrix_cells: &'a mut [MatrixCell],
|
||||||
}
|
}
|
||||||
|
#[repr(transparent)]
|
||||||
|
pub struct MatrixCell(pub(crate) u8);
|
||||||
|
|
||||||
impl<'a, C: Char> Matrix<'a, C> {
|
impl MatrixCell {
|
||||||
pub fn rows_rev(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator {
|
pub fn set(&mut self, p_match: bool, m_match: bool) {
|
||||||
let mut cells = &*self.cells;
|
self.0 = p_match as u8 | ((m_match as u8) << 1);
|
||||||
self.row_offs.iter().rev().map(move |&off| {
|
|
||||||
let len = self.haystack.len() - off as usize;
|
|
||||||
let (tmp, row) = cells.split_at(cells.len() - len);
|
|
||||||
cells = tmp;
|
|
||||||
MatrixRow { off, cells: row }
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn haystack<'a, C: Char>(
|
pub fn get(&self, m_matrix: bool) -> bool {
|
||||||
haystack: &'a [C],
|
let mask = m_matrix as u8 + 1;
|
||||||
bonus: &'a [u16],
|
(self.0 & mask) != 0
|
||||||
skip: u16,
|
}
|
||||||
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + Clone + 'a {
|
|
||||||
haystack[skip as usize..]
|
|
||||||
.iter()
|
|
||||||
.zip(bonus[skip as usize..].iter())
|
|
||||||
.map(|(&char, &bonus)| HaystackChar { char, bonus })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn rows_mut<'a>(
|
|
||||||
row_offs: &'a [u16],
|
|
||||||
mut cells: &'a mut [MatrixCell],
|
|
||||||
haystack_len: usize,
|
|
||||||
) -> impl Iterator<Item = MatrixRowMut<'a>> + ExactSizeIterator + 'a {
|
|
||||||
row_offs.iter().map(move |&off| {
|
|
||||||
let len = haystack_len - off as usize;
|
|
||||||
let (row, tmp) = take(&mut cells).split_at_mut(len);
|
|
||||||
cells = tmp;
|
|
||||||
MatrixRowMut { off, cells: row }
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// we only use this to construct the layout for the slab allocation
|
// we only use this to construct the layout for the slab allocation
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
struct MatrixData {
|
struct MatcherData {
|
||||||
haystack: [char; MAX_HAYSTACK_LEN],
|
haystack: [char; MAX_HAYSTACK_LEN],
|
||||||
bonus: [u16; MAX_HAYSTACK_LEN],
|
bonus: [u16; MAX_HAYSTACK_LEN],
|
||||||
row_offs: [u16; MAX_NEEDLE_LEN],
|
row_offs: [u16; MAX_NEEDLE_LEN],
|
||||||
cells: [MatrixCell; MAX_MATRIX_SIZE],
|
scratch_space: [ScoreCell; MAX_HAYSTACK_LEN],
|
||||||
|
matrix: [u8; MAX_MATRIX_SIZE],
|
||||||
}
|
}
|
||||||
|
|
||||||
// const MATRIX_ALLOC_LAYOUT: Layout =
|
|
||||||
// MatrixLayout::<char>::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout;
|
|
||||||
|
|
||||||
pub(crate) struct MatrixSlab(NonNull<u8>);
|
pub(crate) struct MatrixSlab(NonNull<u8>);
|
||||||
|
|
||||||
impl MatrixSlab {
|
impl MatrixSlab {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
let layout = Layout::new::<MatrixData>();
|
let layout = Layout::new::<MatcherData>();
|
||||||
// safety: the matrix is never zero sized (hardcoded constants)
|
// safety: the matrix is never zero sized (hardcoded constants)
|
||||||
let ptr = unsafe { alloc_zeroed(layout) };
|
let ptr = unsafe { alloc_zeroed(layout) };
|
||||||
let Some(ptr) = NonNull::new(ptr) else{
|
let Some(ptr) = NonNull::new(ptr) else{
|
||||||
@ -182,32 +142,30 @@ impl MatrixSlab {
|
|||||||
&mut self,
|
&mut self,
|
||||||
haystack_: &[C],
|
haystack_: &[C],
|
||||||
needle_len: usize,
|
needle_len: usize,
|
||||||
) -> Option<Matrix<'_, C>> {
|
) -> Option<MatcherDataView<'_, C>> {
|
||||||
let cells = haystack_.len() * needle_len;
|
let cells = haystack_.len() * needle_len;
|
||||||
if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize {
|
if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let matrix_layout = MatrixLayout::<C>::new(
|
let matrix_layout = MatrixLayout::<C>::new(haystack_.len(), needle_len);
|
||||||
haystack_.len(),
|
if matrix_layout.layout.size() > size_of::<MatcherData>() {
|
||||||
needle_len,
|
|
||||||
(haystack_.len() + 1 - needle_len / 2) * needle_len,
|
|
||||||
);
|
|
||||||
if matrix_layout.layout.size() > size_of::<MatrixData>() {
|
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
unsafe {
|
unsafe {
|
||||||
// safely: this allocation is valid for MATRIX_ALLOC_LAYOUT
|
// safely: this allocation is valid for MATRIX_ALLOC_LAYOUT
|
||||||
let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0);
|
let (haystack, bonus, rows, current_row, matrix_cells) =
|
||||||
|
matrix_layout.fieds_from_ptr(self.0);
|
||||||
// copy haystack before creating references to ensure we donu't crate
|
// copy haystack before creating references to ensure we donu't crate
|
||||||
// references to invalid chars (which may or may not be UB)
|
// references to invalid chars (which may or may not be UB)
|
||||||
haystack_
|
haystack_
|
||||||
.as_ptr()
|
.as_ptr()
|
||||||
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());
|
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());
|
||||||
Some(Matrix {
|
Some(MatcherDataView {
|
||||||
haystack: &mut *haystack,
|
haystack: &mut *haystack,
|
||||||
row_offs: &mut *rows,
|
row_offs: &mut *rows,
|
||||||
bonus: &mut *bonus,
|
bonus: &mut *bonus,
|
||||||
cells: &mut *cells,
|
current_row: &mut *current_row,
|
||||||
|
matrix_cells: &mut *matrix_cells,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -215,6 +173,6 @@ impl MatrixSlab {
|
|||||||
|
|
||||||
impl Drop for MatrixSlab {
|
impl Drop for MatrixSlab {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatrixData>()) };
|
unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatcherData>()) };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -114,11 +114,8 @@ impl Matcher {
|
|||||||
} else {
|
} else {
|
||||||
// Break consecutive chunk
|
// Break consecutive chunk
|
||||||
if bonus > first_bonus {
|
if bonus > first_bonus {
|
||||||
if bonus >= BONUS_BOUNDARY {
|
|
||||||
first_bonus = bonus;
|
first_bonus = bonus;
|
||||||
} else {
|
bonus = max(max(bonus, first_bonus), BONUS_CONSECUTIVE);
|
||||||
bonus = max(bonus, BONUS_CONSECUTIVE);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
||||||
}
|
}
|
||||||
|
17
src/tests.rs
17
src/tests.rs
@ -48,9 +48,11 @@ fn assert_matches(
|
|||||||
println!("xx {matched_indices:?} {algo:?}");
|
println!("xx {matched_indices:?} {algo:?}");
|
||||||
let res = match algo {
|
let res = match algo {
|
||||||
Algorithm::FuzzyOptimal => {
|
Algorithm::FuzzyOptimal => {
|
||||||
|
matched_indices.clear();
|
||||||
matcher.fuzzy_indices(haystack, needle, &mut matched_indices)
|
matcher.fuzzy_indices(haystack, needle, &mut matched_indices)
|
||||||
}
|
}
|
||||||
Algorithm::FuzzyGreedy => {
|
Algorithm::FuzzyGreedy => {
|
||||||
|
matched_indices.clear();
|
||||||
matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices)
|
matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -142,7 +144,7 @@ fn test_fuzzy() {
|
|||||||
"/AutomatorDocument.icns",
|
"/AutomatorDocument.icns",
|
||||||
"rdoc",
|
"rdoc",
|
||||||
&[9, 10, 11, 12],
|
&[9, 10, 11, 12],
|
||||||
BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
|
BONUS_CAMEL123 * 3,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"/man1/zshcompctl.1",
|
"/man1/zshcompctl.1",
|
||||||
@ -395,13 +397,22 @@ fn test_optimal() {
|
|||||||
- PENALTY_GAP_EXTENSION,
|
- PENALTY_GAP_EXTENSION,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"Hٷ!!\0!!!\n\0\0\u{4}\u{c}\0\u{8}\0!\0\0\u{c}",
|
"Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}",
|
||||||
"\0!\0\0!",
|
"-!--!",
|
||||||
&[4, 5, 9, 10, 16],
|
&[4, 5, 9, 10, 16],
|
||||||
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4)
|
BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4)
|
||||||
- 2 * PENALTY_GAP_START
|
- 2 * PENALTY_GAP_START
|
||||||
- 6 * PENALTY_GAP_EXTENSION,
|
- 6 * PENALTY_GAP_EXTENSION,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"C8Gۂ(GECGS",
|
||||||
|
"8GCG",
|
||||||
|
&[1, 2, 7, 8],
|
||||||
|
BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
|
||||||
|
- PENALTY_GAP_START
|
||||||
|
- 3 * PENALTY_GAP_EXTENSION
|
||||||
|
+ BONUS_CONSECUTIVE,
|
||||||
|
),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user