mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 09:57:49 +00:00
better implementation
This commit is contained in:
parent
6837b4e2cb
commit
e964d42849
135
src/chars.rs
Normal file
135
src/chars.rs
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
|
||||||
|
use crate::MatcherConfig;
|
||||||
|
|
||||||
|
//autogenerated by generate-ucd
|
||||||
|
#[allow(warnings)]
|
||||||
|
#[rustfmt::skip]
|
||||||
|
mod case_fold;
|
||||||
|
mod normalize;
|
||||||
|
|
||||||
|
pub trait Char: Copy + Eq + Ord + std::fmt::Debug {
|
||||||
|
const ASCII: bool;
|
||||||
|
fn char_class(self, config: &MatcherConfig) -> CharClass;
|
||||||
|
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass);
|
||||||
|
fn normalize(self, config: &MatcherConfig) -> Self;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Char for u8 {
|
||||||
|
const ASCII: bool = true;
|
||||||
|
#[inline]
|
||||||
|
fn char_class(self, config: &MatcherConfig) -> CharClass {
|
||||||
|
let c = self;
|
||||||
|
// using manual if conditions instead optimizes better
|
||||||
|
if c >= b'a' && c <= b'z' {
|
||||||
|
CharClass::Lower
|
||||||
|
} else if c >= b'A' && c <= b'Z' {
|
||||||
|
CharClass::Upper
|
||||||
|
} else if c >= b'0' && c <= b'9' {
|
||||||
|
CharClass::Number
|
||||||
|
} else if c.is_ascii_whitespace() {
|
||||||
|
CharClass::Whitespace
|
||||||
|
} else if config.delimeter_chars.contains(&c) {
|
||||||
|
CharClass::Delimiter
|
||||||
|
} else {
|
||||||
|
CharClass::NonWord
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass) {
|
||||||
|
let char_class = self.char_class(config);
|
||||||
|
let normalized = if config.ignore_case && char_class == CharClass::Upper {
|
||||||
|
self + 32
|
||||||
|
} else {
|
||||||
|
self
|
||||||
|
};
|
||||||
|
(normalized, char_class)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn normalize(self, config: &MatcherConfig) -> Self {
|
||||||
|
if config.ignore_case && self >= b'A' && self <= b'Z' {
|
||||||
|
self + 32
|
||||||
|
} else {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn char_class_non_ascii(c: char) -> CharClass {
|
||||||
|
if c.is_lowercase() {
|
||||||
|
CharClass::Lower
|
||||||
|
} else if c.is_uppercase() {
|
||||||
|
CharClass::Upper
|
||||||
|
} else if c.is_numeric() {
|
||||||
|
CharClass::Number
|
||||||
|
} else if c.is_alphabetic() {
|
||||||
|
CharClass::Letter
|
||||||
|
} else if c.is_whitespace() {
|
||||||
|
CharClass::Whitespace
|
||||||
|
} else {
|
||||||
|
CharClass::NonWord
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl Char for char {
|
||||||
|
const ASCII: bool = false;
|
||||||
|
#[inline(always)]
|
||||||
|
fn char_class(self, config: &MatcherConfig) -> CharClass {
|
||||||
|
if self.is_ascii() {
|
||||||
|
return (self as u8).char_class(config);
|
||||||
|
}
|
||||||
|
char_class_non_ascii(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) {
|
||||||
|
if self.is_ascii() {
|
||||||
|
let (c, class) = (self as u8).char_class_and_normalize(config);
|
||||||
|
return (c as char, class);
|
||||||
|
}
|
||||||
|
let char_class = char_class_non_ascii(self);
|
||||||
|
if char_class == CharClass::Upper {
|
||||||
|
self = CASE_FOLDING_SIMPLE
|
||||||
|
.binary_search_by_key(&self, |(upper, _)| *upper)
|
||||||
|
.map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
||||||
|
}
|
||||||
|
if config.normalize {
|
||||||
|
self = normalize::normalize(self);
|
||||||
|
}
|
||||||
|
(self, char_class)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn normalize(mut self, config: &MatcherConfig) -> Self {
|
||||||
|
if config.normalize {
|
||||||
|
self = normalize::normalize(self);
|
||||||
|
}
|
||||||
|
to_lower_case(self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub use normalize::normalize;
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn to_lower_case(c: char) -> char {
|
||||||
|
if c >= 'A' && c <= 'Z' {
|
||||||
|
char::from_u32(c as u32 + 32).unwrap()
|
||||||
|
} else if !c.is_ascii() {
|
||||||
|
CASE_FOLDING_SIMPLE
|
||||||
|
.binary_search_by_key(&c, |(upper, _)| *upper)
|
||||||
|
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
|
||||||
|
} else {
|
||||||
|
c
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Coarse category of a character as assigned by the `Char::char_class`
/// implementations in this file.
///
/// The declaration order of the variants is significant: `PartialOrd`/`Ord`
/// are derived, so comparisons between classes follow this order.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum CharClass {
    /// Whitespace (ASCII whitespace for bytes, Unicode whitespace otherwise).
    Whitespace,
    /// Anything that matched none of the other classes.
    NonWord,
    /// An ASCII byte listed in the configured delimiter characters.
    Delimiter,
    /// A lowercase letter.
    Lower,
    /// An uppercase letter.
    Upper,
    /// A non-ASCII alphabetic character that is neither lower- nor uppercase.
    Letter,
    /// A digit (ASCII `0`-`9`) or a non-ASCII numeric character.
    Number,
}
|
138
src/config.rs
138
src/config.rs
@ -1,37 +1,7 @@
|
|||||||
pub(crate) const SCORE_MATCH: u16 = 16;
|
use crate::chars::CharClass;
|
||||||
pub(crate) const PENALTY_GAP_START: u16 = 3;
|
use crate::score::BONUS_BOUNDARY;
|
||||||
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
|
|
||||||
|
|
||||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
|
||||||
// too great to prevent the longer acronym matches from always winning over
|
|
||||||
// shorter fuzzy matches. The bonus point here was specifically chosen that
|
|
||||||
// the bonus is cancelled when the gap between the acronyms grows over
|
|
||||||
// 8 characters, which is approximately the average length of the words found
|
|
||||||
// in web2 dictionary and my file system.
|
|
||||||
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
|
|
||||||
|
|
||||||
// Although bonus point for non-word characters is non-contextual, we need it
|
|
||||||
// for computing bonus points for consecutive chunks starting with a non-word
|
|
||||||
// character.
|
|
||||||
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;
|
|
||||||
|
|
||||||
// Edge-triggered bonus for matches in camelCase words.
|
|
||||||
// Compared to word-boundary case, they don't accompany single-character gaps
|
|
||||||
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
|
|
||||||
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;
|
|
||||||
|
|
||||||
// Minimum bonus point given to characters in consecutive chunks.
|
|
||||||
// Note that bonus points for consecutive matches shouldn't have needed if we
|
|
||||||
// used fixed match score as in the original algorithm.
|
|
||||||
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;
|
|
||||||
|
|
||||||
// The first character in the typed pattern usually has more significance
|
|
||||||
// than the rest so it's important that it appears at special positions where
|
|
||||||
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
|
||||||
// The amount of the extra bonus should be limited so that the gap penalty is
|
|
||||||
// still respected.
|
|
||||||
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
|
|
||||||
|
|
||||||
|
#[non_exhaustive]
|
||||||
pub struct MatcherConfig {
|
pub struct MatcherConfig {
|
||||||
pub delimeter_chars: &'static [u8],
|
pub delimeter_chars: &'static [u8],
|
||||||
/// Extra bonus for word boundary after whitespace character or beginning of the string
|
/// Extra bonus for word boundary after whitespace character or beginning of the string
|
||||||
@ -44,33 +14,17 @@ pub struct MatcherConfig {
|
|||||||
/// this significantly degrades performance so it's not recommended
|
/// this significantly degrades performance so it's not recommended
|
||||||
/// to be turned on by default
|
/// to be turned on by default
|
||||||
pub normalize: bool,
|
pub normalize: bool,
|
||||||
/// use faster/simpler algorithm at the cost of (potentially) much worse results
|
/// whether to ignore casing
|
||||||
/// For long inputs this algorith is always used as a fallbach to avoid
|
pub ignore_case: bool,
|
||||||
/// blowups in time complexity
|
|
||||||
pub use_v1: bool,
|
|
||||||
/// The case matching to perform
|
|
||||||
pub case_matching: CaseMatching,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
|
// #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
|
||||||
#[non_exhaustive]
|
// #[non_exhaustive]
|
||||||
pub enum CharClass {
|
// pub enum CaseMatching {
|
||||||
Whitespace,
|
// Respect,
|
||||||
NonWord,
|
// Ignore,
|
||||||
Delimiter,
|
// Smart,
|
||||||
Lower,
|
// }
|
||||||
Upper,
|
|
||||||
Letter,
|
|
||||||
Number,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
|
|
||||||
#[non_exhaustive]
|
|
||||||
pub enum CaseMatching {
|
|
||||||
Respect,
|
|
||||||
Ignore,
|
|
||||||
Smart,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MatcherConfig {
|
impl MatcherConfig {
|
||||||
pub const DEFAULT: Self = {
|
pub const DEFAULT: Self = {
|
||||||
@ -80,8 +34,7 @@ impl MatcherConfig {
|
|||||||
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
|
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
|
||||||
inital_char_class: CharClass::Whitespace,
|
inital_char_class: CharClass::Whitespace,
|
||||||
normalize: false,
|
normalize: false,
|
||||||
use_v1: false,
|
ignore_case: true,
|
||||||
case_matching: CaseMatching::Smart,
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -107,69 +60,4 @@ impl MatcherConfig {
|
|||||||
self.inital_char_class = CharClass::Delimiter;
|
self.inital_char_class = CharClass::Delimiter;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
fn char_class_non_ascii(c: char) -> CharClass {
|
|
||||||
if c.is_lowercase() {
|
|
||||||
CharClass::Lower
|
|
||||||
} else if c.is_uppercase() {
|
|
||||||
CharClass::Upper
|
|
||||||
} else if c.is_numeric() {
|
|
||||||
CharClass::Number
|
|
||||||
} else if c.is_alphabetic() {
|
|
||||||
CharClass::Letter
|
|
||||||
} else if c.is_whitespace() {
|
|
||||||
CharClass::Whitespace
|
|
||||||
} else {
|
|
||||||
CharClass::NonWord
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn char_class_ascii(&self, c: char) -> CharClass {
|
|
||||||
// using manual if conditions instead optimizes better
|
|
||||||
if c >= 'a' && c <= 'z' {
|
|
||||||
CharClass::Lower
|
|
||||||
} else if c >= 'A' && c <= 'Z' {
|
|
||||||
CharClass::Upper
|
|
||||||
} else if c >= '0' && c <= '9' {
|
|
||||||
CharClass::Number
|
|
||||||
} else if c.is_ascii_whitespace() {
|
|
||||||
CharClass::Whitespace
|
|
||||||
} else if self.delimeter_chars.contains(&(c as u8)) {
|
|
||||||
CharClass::Delimiter
|
|
||||||
} else {
|
|
||||||
CharClass::NonWord
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn char_class(&self, c: char) -> CharClass {
|
|
||||||
if c.is_ascii() {
|
|
||||||
self.char_class_ascii(c)
|
|
||||||
} else {
|
|
||||||
Self::char_class_non_ascii(c)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
|
||||||
if class > CharClass::NonWord {
|
|
||||||
// transition from non word to word
|
|
||||||
match prev_class {
|
|
||||||
CharClass::Whitespace => return self.bonus_boundary_white,
|
|
||||||
CharClass::Delimiter => return self.bonus_boundary_delimiter,
|
|
||||||
CharClass::NonWord => return BONUS_BOUNDARY,
|
|
||||||
_ => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if prev_class == CharClass::Lower && class == CharClass::Upper
|
|
||||||
|| prev_class != CharClass::Number && class == CharClass::Number
|
|
||||||
{
|
|
||||||
// camelCase letter123
|
|
||||||
BONUS_CAMEL123
|
|
||||||
} else if class == CharClass::NonWord {
|
|
||||||
BONUS_NON_WORD
|
|
||||||
} else if class == CharClass::Whitespace {
|
|
||||||
self.bonus_boundary_white
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
46
src/fuzzy_greedy.rs
Normal file
46
src/fuzzy_greedy.rs
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
use crate::chars::Char;
|
||||||
|
use crate::Matcher;
|
||||||
|
|
||||||
|
impl Matcher {
|
||||||
|
/// greedy fallback algoritm, much faster (linear time) but reported scores/indicies
|
||||||
|
/// might not be the best match
|
||||||
|
pub(crate) fn fuzzy_match_greedy<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||||
|
&mut self,
|
||||||
|
haystack: &[H],
|
||||||
|
needle: &[N],
|
||||||
|
mut start: usize,
|
||||||
|
mut end: usize,
|
||||||
|
indicies: &mut Vec<u32>,
|
||||||
|
) -> Option<u16> {
|
||||||
|
let first_char_end = if H::ASCII { start + 1 } else { end };
|
||||||
|
if !H::ASCII && needle.len() != 1 {
|
||||||
|
let mut needle_iter = needle[1..].iter().copied();
|
||||||
|
if let Some(mut needle_char) = needle_iter.next() {
|
||||||
|
for (i, &c) in haystack[first_char_end..].iter().enumerate() {
|
||||||
|
if c.normalize(&self.config) == needle_char {
|
||||||
|
let Some(next_needle_char) = needle_iter.next() else {
|
||||||
|
end = i + 1;
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
needle_char = next_needle_char;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// mimimize the greedly match by greedy matching in reverse
|
||||||
|
|
||||||
|
let mut needle_iter = needle.iter().rev().copied();
|
||||||
|
let mut needle_char = needle_iter.next().unwrap();
|
||||||
|
for (i, &c) in haystack[start..end].iter().enumerate().rev() {
|
||||||
|
println!("{c:?} {i} {needle_char:?}");
|
||||||
|
if c == needle_char {
|
||||||
|
let Some(next_needle_char) = needle_iter.next() else {
|
||||||
|
start += i;
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
needle_char = next_needle_char;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(self.calculate_score::<INDICIES, H, N>(haystack, needle, start, end, indicies))
|
||||||
|
}
|
||||||
|
}
|
272
src/fuzzy_optimal.rs
Normal file
272
src/fuzzy_optimal.rs
Normal file
@ -0,0 +1,272 @@
|
|||||||
|
use std::cmp::max;
|
||||||
|
|
||||||
|
use crate::chars::{Char, CharClass};
|
||||||
|
use crate::matrix::{haystack, rows_mut, Matrix, MatrixCell, MatrixRow};
|
||||||
|
use crate::score::{
|
||||||
|
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
||||||
|
PENALTY_GAP_START, SCORE_MATCH,
|
||||||
|
};
|
||||||
|
use crate::{Matcher, MatcherConfig};
|
||||||
|
|
||||||
|
impl Matcher {
|
||||||
|
pub(crate) fn fuzzy_match_optimal<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||||
|
&mut self,
|
||||||
|
haystack: &[H],
|
||||||
|
needle: &[N],
|
||||||
|
start: usize,
|
||||||
|
greedy_end: usize,
|
||||||
|
end: usize,
|
||||||
|
indicies: &mut Vec<u32>,
|
||||||
|
) -> Option<u16> {
|
||||||
|
// construct a matrix (and copy the haystack), the matrix and haystack size are bounded
|
||||||
|
// to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
|
||||||
|
// us to treat needle indecies as u16
|
||||||
|
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
|
||||||
|
return self.fuzzy_match_greedy::<INDICIES, H, N>(
|
||||||
|
haystack,
|
||||||
|
needle,
|
||||||
|
start,
|
||||||
|
greedy_end,
|
||||||
|
indicies,
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
let prev_class = start
|
||||||
|
.checked_sub(1)
|
||||||
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
|
.unwrap_or(self.config.inital_char_class);
|
||||||
|
let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config);
|
||||||
|
// this only happend with unicode haystacks, for ASCII the prefilter handles all rejects
|
||||||
|
if !matched {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if needle.len() == 1 {
|
||||||
|
indicies.push(max_score_pos as u32);
|
||||||
|
return Some(max_score);
|
||||||
|
}
|
||||||
|
debug_assert_eq!(
|
||||||
|
matrix.row_offs[0], 0,
|
||||||
|
"prefilter should have put us at the start of the match"
|
||||||
|
);
|
||||||
|
|
||||||
|
// populate the matrix and find the best score
|
||||||
|
let (max_score, best_match_end) = matrix.populate_matrix(needle);
|
||||||
|
if INDICIES {
|
||||||
|
matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end);
|
||||||
|
}
|
||||||
|
println!("{indicies:?}");
|
||||||
|
println!("{}", max_score);
|
||||||
|
Some(max_score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<H: Char> Matrix<'_, H> {
|
||||||
|
fn setup<N: Char>(
|
||||||
|
&mut self,
|
||||||
|
needle: &[N],
|
||||||
|
mut prev_class: CharClass,
|
||||||
|
config: &MatcherConfig,
|
||||||
|
) -> (u16, u16, bool)
|
||||||
|
where
|
||||||
|
H: PartialEq<N>,
|
||||||
|
{
|
||||||
|
let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
|
||||||
|
let (mut needle_char, mut row_start) = row_iter.next().unwrap();
|
||||||
|
|
||||||
|
let col_iter = self
|
||||||
|
.haystack
|
||||||
|
.iter_mut()
|
||||||
|
.zip(self.cells.iter_mut())
|
||||||
|
.zip(self.bonus.iter_mut())
|
||||||
|
.enumerate();
|
||||||
|
|
||||||
|
let mut max_score = 0;
|
||||||
|
let mut max_score_pos = 0;
|
||||||
|
let mut in_gap = false;
|
||||||
|
let mut prev_score = 0u16;
|
||||||
|
let mut matched = false;
|
||||||
|
let first_needle_char = needle[0];
|
||||||
|
|
||||||
|
for (i, ((c, matrix_cell), bonus_)) in col_iter {
|
||||||
|
let class = c.char_class(config);
|
||||||
|
*c = c.normalize(config);
|
||||||
|
|
||||||
|
let bonus = config.bonus_for(prev_class, class);
|
||||||
|
// save bonus for later so we don't have to recompute it each time
|
||||||
|
*bonus_ = bonus;
|
||||||
|
prev_class = class;
|
||||||
|
|
||||||
|
let i = i as u16;
|
||||||
|
println!("{i} {needle_char:?} {c:?}");
|
||||||
|
if *c == needle_char {
|
||||||
|
// save the first idx of each char
|
||||||
|
if let Some(next) = row_iter.next() {
|
||||||
|
*row_start = i;
|
||||||
|
(needle_char, row_start) = next;
|
||||||
|
} else {
|
||||||
|
if !matched {
|
||||||
|
*row_start = i;
|
||||||
|
}
|
||||||
|
// we have atleast one match
|
||||||
|
matched = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if *c == first_needle_char {
|
||||||
|
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
||||||
|
println!("start match {score}");
|
||||||
|
matrix_cell.consecutive_chars = 1;
|
||||||
|
if needle.len() == 1 && score > max_score {
|
||||||
|
max_score = score;
|
||||||
|
max_score_pos = i;
|
||||||
|
// can't get better than this
|
||||||
|
if bonus >= BONUS_BOUNDARY {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
matrix_cell.score = score;
|
||||||
|
in_gap = false;
|
||||||
|
} else {
|
||||||
|
let gap_penalty = if in_gap {
|
||||||
|
PENALTY_GAP_EXTENSION
|
||||||
|
} else {
|
||||||
|
PENALTY_GAP_START
|
||||||
|
};
|
||||||
|
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
|
||||||
|
matrix_cell.consecutive_chars = 0;
|
||||||
|
in_gap = true;
|
||||||
|
}
|
||||||
|
prev_score = matrix_cell.score;
|
||||||
|
}
|
||||||
|
|
||||||
|
(max_score_pos, max_score, matched)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn populate_matrix<N: Char>(&mut self, needle: &[N]) -> (u16, u16)
|
||||||
|
where
|
||||||
|
H: PartialEq<N>,
|
||||||
|
{
|
||||||
|
let mut max_score = 0;
|
||||||
|
let mut max_score_end = 0;
|
||||||
|
|
||||||
|
let mut row_iter = needle
|
||||||
|
.iter()
|
||||||
|
.zip(rows_mut(self.row_offs, self.cells, self.haystack.len()))
|
||||||
|
.enumerate();
|
||||||
|
// skip the first row we already calculated the in `setup` initial scores
|
||||||
|
let (_, mut prev_matrix_row) = row_iter.next().unwrap().1;
|
||||||
|
|
||||||
|
for (i, (&needle_char, row)) in row_iter {
|
||||||
|
let haystack = haystack(self.haystack, self.bonus, row.off);
|
||||||
|
let mut in_gap = false;
|
||||||
|
let mut prev_matrix_cell = MatrixCell {
|
||||||
|
score: 0,
|
||||||
|
consecutive_chars: 0,
|
||||||
|
};
|
||||||
|
// we are interested in the score of the previous character
|
||||||
|
// in the previous row. This represents the previous char
|
||||||
|
// for each possible pattern. This is equivalent to diagonal movement
|
||||||
|
let diagonal_start = row.off - prev_matrix_row.off - 1;
|
||||||
|
let diagonal = &mut prev_matrix_row.cells[diagonal_start as usize..];
|
||||||
|
|
||||||
|
for (j, ((haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
||||||
|
.zip(row.cells.iter_mut())
|
||||||
|
.zip(diagonal.iter())
|
||||||
|
.enumerate()
|
||||||
|
{
|
||||||
|
let col = j + row.off as usize;
|
||||||
|
let gap_penalty = if in_gap {
|
||||||
|
PENALTY_GAP_EXTENSION
|
||||||
|
} else {
|
||||||
|
PENALTY_GAP_START
|
||||||
|
};
|
||||||
|
let mut score1 = 0;
|
||||||
|
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
|
||||||
|
|
||||||
|
let mut consecutive = 0;
|
||||||
|
if haystack_char.char == needle_char {
|
||||||
|
score1 = diag_matrix_cell.score + SCORE_MATCH;
|
||||||
|
let mut bonus = haystack_char.bonus;
|
||||||
|
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
||||||
|
if consecutive > 1 {
|
||||||
|
let first_bonus = self.bonus[col + 1 - consecutive as usize];
|
||||||
|
if bonus > first_bonus {
|
||||||
|
if bonus > BONUS_BOUNDARY {
|
||||||
|
consecutive = 1
|
||||||
|
} else {
|
||||||
|
bonus = max(bonus, BONUS_CONSECUTIVE)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
bonus = max(first_bonus, BONUS_CONSECUTIVE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if score1 + bonus < score2 {
|
||||||
|
score1 += haystack_char.bonus;
|
||||||
|
consecutive = 0;
|
||||||
|
} else {
|
||||||
|
score1 += bonus;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
in_gap = score1 < score2;
|
||||||
|
let score = max(score1, score2);
|
||||||
|
println!("{score} {score1} {score2}");
|
||||||
|
if i == needle.len() - 1 && score > max_score {
|
||||||
|
max_score = score;
|
||||||
|
max_score_end = col as u16;
|
||||||
|
}
|
||||||
|
matrix_cell.consecutive_chars = consecutive;
|
||||||
|
matrix_cell.score = score;
|
||||||
|
prev_matrix_cell = *matrix_cell;
|
||||||
|
}
|
||||||
|
prev_matrix_row = row;
|
||||||
|
}
|
||||||
|
(max_score, max_score_end)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reconstruct_optimal_path<N: Char>(
|
||||||
|
&self,
|
||||||
|
needle: &[N],
|
||||||
|
start: u32,
|
||||||
|
indicies: &mut Vec<u32>,
|
||||||
|
best_match_end: u16,
|
||||||
|
) {
|
||||||
|
indicies.resize(needle.len(), 0);
|
||||||
|
|
||||||
|
let mut row_iter = self.rows_rev().zip(indicies.iter_mut()).peekable();
|
||||||
|
let (mut row, mut matched_col_idx) = row_iter.next().unwrap();
|
||||||
|
let mut next_row: Option<MatrixRow> = None;
|
||||||
|
let mut col = best_match_end;
|
||||||
|
let mut prefer_match = true;
|
||||||
|
let haystack_len = self.haystack.len() as u16;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let score = row.cells[col as usize].score;
|
||||||
|
let mut score1 = 0;
|
||||||
|
let mut score2 = 0;
|
||||||
|
if let Some(&(prev_row, _)) = row_iter.peek() {
|
||||||
|
if col >= prev_row.off {
|
||||||
|
score1 = prev_row[col].score;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if col > row.off {
|
||||||
|
score2 = row[col - 1].score;
|
||||||
|
}
|
||||||
|
println!("{score} {score2} {score1} {prefer_match}");
|
||||||
|
let mut new_prefer_match = row[col].consecutive_chars > 1;
|
||||||
|
if !new_prefer_match && col + 1 < haystack_len {
|
||||||
|
if let Some(next_row) = next_row {
|
||||||
|
new_prefer_match = next_row[col + 1].consecutive_chars > 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if score > score1 && (score > score2 || score == score2 && prefer_match) {
|
||||||
|
*matched_col_idx = col as u32 + start;
|
||||||
|
next_row = Some(row);
|
||||||
|
let Some(next) = row_iter.next() else {
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
(row, matched_col_idx) = next
|
||||||
|
}
|
||||||
|
prefer_match = new_prefer_match;
|
||||||
|
col -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
699
src/lib.rs
699
src/lib.rs
@ -1,616 +1,137 @@
|
|||||||
// sadly this doesn't optimize well currently
|
// sadly this doesn't optimize well currently
|
||||||
#![allow(clippy::manual_range_contains)]
|
#![allow(clippy::manual_range_contains)]
|
||||||
|
|
||||||
use std::alloc::Layout;
|
mod chars;
|
||||||
use std::cmp::max;
|
|
||||||
|
|
||||||
use memchr::{memchr, memchr2};
|
|
||||||
use normalize::normalize;
|
|
||||||
|
|
||||||
//autogenerated by generate-ucd
|
|
||||||
#[allow(warnings)]
|
|
||||||
#[rustfmt::skip]
|
|
||||||
mod case_fold;
|
|
||||||
mod config;
|
mod config;
|
||||||
mod normalize;
|
mod fuzzy_greedy;
|
||||||
|
mod fuzzy_optimal;
|
||||||
|
mod matrix;
|
||||||
|
mod prefilter;
|
||||||
|
mod score;
|
||||||
|
mod utf32_str;
|
||||||
|
|
||||||
pub use config::{CaseMatching, CharClass, MatcherConfig};
|
// #[cfg(test)]
|
||||||
|
// mod tests;
|
||||||
|
|
||||||
use crate::config::{
|
pub use config::MatcherConfig;
|
||||||
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
|
|
||||||
PENALTY_GAP_START, SCORE_MATCH,
|
|
||||||
};
|
|
||||||
|
|
||||||
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
|
use crate::matrix::MatrixSlab;
|
||||||
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
|
use crate::utf32_str::Utf32Str;
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
|
||||||
struct MatrixCell {
|
|
||||||
score: u16,
|
|
||||||
consecutive_chars: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
|
||||||
struct HaystackChar {
|
|
||||||
char: char,
|
|
||||||
bonus: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Matcher {
|
pub struct Matcher {
|
||||||
pub config: MatcherConfig,
|
pub config: MatcherConfig,
|
||||||
matrix: Box<[MatrixCell; MAX_MATRIX_SIZE]>,
|
slab: MatrixSlab,
|
||||||
haystack: Box<[HaystackChar; MAX_HAYSTACK_LEN]>,
|
|
||||||
// needle can be at most as long as the haystack
|
|
||||||
first_needle_occurance: Box<[u16; MAX_HAYSTACK_LEN]>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Query {
|
// // impl Query {
|
||||||
needle_chars: Vec<char>,
|
// // fn push(&mut self, needle: Utf32Str<'_>, normalize_: bool, smart_case: bool) {
|
||||||
is_ascii: bool,
|
// // self.needle_chars.reserve(needle.len());
|
||||||
ignore_case: bool,
|
// // self.needle_chars.extend(needle.chars().map(|mut c| {
|
||||||
}
|
// // if !c.is_ascii() {
|
||||||
|
// // self.is_ascii = false;
|
||||||
impl Query {
|
// // }
|
||||||
fn push(&mut self, needle: &str, normalize_: bool, smart_case: bool) {
|
// // if smart_case {
|
||||||
self.needle_chars.reserve(needle.len());
|
// // if c.is_uppercase() {
|
||||||
self.needle_chars.extend(needle.chars().map(|mut c| {
|
// // self.ignore_case = false;
|
||||||
if !c.is_ascii() {
|
// // }
|
||||||
self.is_ascii = false;
|
// // } else if self.ignore_case {
|
||||||
}
|
// // if self.is_ascii {
|
||||||
if smart_case {
|
// // c = to_lower_case::<true>(c)
|
||||||
if c.is_uppercase() {
|
// // } else {
|
||||||
self.ignore_case = false;
|
// // c = to_lower_case::<false>(c)
|
||||||
}
|
// // }
|
||||||
} else if self.ignore_case {
|
// // }
|
||||||
if self.is_ascii {
|
// // if normalize_ && !self.is_ascii {
|
||||||
c = to_lower_case::<true>(c)
|
// // c = normalize(c);
|
||||||
} else {
|
// // }
|
||||||
c = to_lower_case::<false>(c)
|
// // c
|
||||||
}
|
// // }))
|
||||||
}
|
// // }
|
||||||
if normalize_ && !self.is_ascii {
|
// // }
|
||||||
c = normalize(c);
|
|
||||||
}
|
|
||||||
c
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
|
|
||||||
if c >= b'a' || c <= b'z' {
|
|
||||||
memchr2(c, c + 32, haystack)
|
|
||||||
} else {
|
|
||||||
memchr(c, haystack)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Safety: T must be vaind if initalized with zeros
|
|
||||||
unsafe fn zeroed_array_on_heap<T: Copy, const LEN: usize>() -> Box<[T; LEN]> {
|
|
||||||
let layout = Layout::new::<[T; LEN]>();
|
|
||||||
let res = std::alloc::alloc_zeroed(layout);
|
|
||||||
if res.is_null() {
|
|
||||||
std::alloc::handle_alloc_error(layout)
|
|
||||||
}
|
|
||||||
Box::from_raw(res as _)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Matcher {
|
impl Matcher {
|
||||||
pub fn new(config: MatcherConfig) -> Self {
|
pub fn new(config: MatcherConfig) -> Self {
|
||||||
// Safety: all data allocated here is just integers/structs that contain
|
Self {
|
||||||
// integers so zeroed values are legal
|
config,
|
||||||
unsafe {
|
slab: MatrixSlab::new(),
|
||||||
Self {
|
|
||||||
config,
|
|
||||||
matrix: zeroed_array_on_heap(),
|
|
||||||
haystack: zeroed_array_on_heap(),
|
|
||||||
first_needle_occurance: zeroed_array_on_heap(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn compile_query(&self, needle: &str) -> Query {
|
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
||||||
let mut query = Query {
|
assert!(haystack.len() <= u32::MAX as usize);
|
||||||
needle_chars: Vec::new(),
|
self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new())
|
||||||
is_ascii: true,
|
|
||||||
ignore_case: self.config.case_matching == CaseMatching::Ignore,
|
|
||||||
};
|
|
||||||
query.push(
|
|
||||||
needle,
|
|
||||||
self.config.normalize,
|
|
||||||
self.config.case_matching == CaseMatching::Smart,
|
|
||||||
);
|
|
||||||
query
|
|
||||||
}
|
}
|
||||||
pub fn recompile_query(&self, query: &mut Query, needle: &str) {
|
fn fuzzy_matcher_impl<const INDICIES: bool>(
|
||||||
query.needle_chars.clear();
|
|
||||||
query.is_ascii = false;
|
|
||||||
query.ignore_case = self.config.case_matching == CaseMatching::Ignore;
|
|
||||||
query.push(
|
|
||||||
needle,
|
|
||||||
self.config.normalize,
|
|
||||||
self.config.case_matching == CaseMatching::Smart,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
pub fn append_query(&self, query: &mut Query, needle: &str) {
|
|
||||||
query.push(
|
|
||||||
needle,
|
|
||||||
self.config.normalize,
|
|
||||||
self.config.case_matching == CaseMatching::Smart,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<u16> {
|
|
||||||
if haystack.len() > u32::MAX as usize {
|
|
||||||
haystack = &haystack[..u32::MAX as usize]
|
|
||||||
}
|
|
||||||
if self.config.use_v1 {
|
|
||||||
if query.is_ascii && !self.config.normalize {
|
|
||||||
self.fuzzy_matcher_v1::<false, true>(query, haystack, &mut Vec::new())
|
|
||||||
} else {
|
|
||||||
self.fuzzy_matcher_v1::<false, false>(query, haystack, &mut Vec::new())
|
|
||||||
}
|
|
||||||
} else if query.is_ascii && !self.config.normalize {
|
|
||||||
self.fuzzy_matcher_v2::<false, true>(query, haystack, &mut Vec::new())
|
|
||||||
} else {
|
|
||||||
self.fuzzy_matcher_v2::<false, false>(query, haystack, &mut Vec::new())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fuzzy_indicies(
|
|
||||||
&mut self,
|
&mut self,
|
||||||
query: &Query,
|
haystack: Utf32Str<'_>,
|
||||||
mut haystack: &str,
|
needle_: Utf32Str<'_>,
|
||||||
indicies: &mut Vec<u32>,
|
indidies: &mut Vec<u32>,
|
||||||
) -> Option<u16> {
|
) -> Option<u16> {
|
||||||
if haystack.len() > u32::MAX as usize {
|
assert!(
|
||||||
haystack = &haystack[..u32::MAX as usize]
|
haystack.len() <= u32::MAX as usize,
|
||||||
}
|
"fuzzy matching is only support for up to 2^32-1 codepoints"
|
||||||
if self.config.use_v1 {
|
|
||||||
if query.is_ascii && !self.config.normalize {
|
|
||||||
self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
|
|
||||||
} else {
|
|
||||||
self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
|
|
||||||
}
|
|
||||||
} else if query.is_ascii && !self.config.normalize {
|
|
||||||
self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
|
|
||||||
} else {
|
|
||||||
self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn normalize_char<const ASCII_ONLY: bool>(&self, ignore_case: bool, mut c: char) -> char {
|
|
||||||
if ignore_case {
|
|
||||||
c = to_lower_case::<ASCII_ONLY>(c)
|
|
||||||
}
|
|
||||||
if !ASCII_ONLY && self.config.normalize {
|
|
||||||
c = normalize(c)
|
|
||||||
}
|
|
||||||
c
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prefilter_ascii(&self, query: &Query, mut haystack: &[u8]) -> Option<(usize, usize)> {
|
|
||||||
let needle = &query.needle_chars;
|
|
||||||
if query.ignore_case {
|
|
||||||
let first_idx = find_ascii_ignore_case(needle[0] as u8, haystack)?;
|
|
||||||
let mut last_idx = first_idx + 1;
|
|
||||||
haystack = &haystack[last_idx..];
|
|
||||||
for &c in &needle[1..] {
|
|
||||||
let idx = find_ascii_ignore_case(c as u8, haystack)? + 1;
|
|
||||||
last_idx += idx;
|
|
||||||
haystack = &haystack[idx..];
|
|
||||||
}
|
|
||||||
Some((first_idx, last_idx))
|
|
||||||
} else {
|
|
||||||
let first_idx = memchr(needle[0] as u8, haystack)?;
|
|
||||||
let mut last_idx = first_idx + 1;
|
|
||||||
haystack = &haystack[last_idx..];
|
|
||||||
for &c in &needle[1..] {
|
|
||||||
let idx = memchr(c as u8, haystack)? + 1;
|
|
||||||
last_idx += idx;
|
|
||||||
haystack = &haystack[idx..];
|
|
||||||
}
|
|
||||||
Some((first_idx, last_idx))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Prefilter for needles that contain non-ASCII chars.
///
/// Only the first needle char is searched. Returns the byte offset of the
/// first haystack char whose normalized form equals it, together with the
/// byte offset right after that char, or `None` if there is no such char.
fn prefilter_non_ascii(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
    let needle_char = query.needle_chars[0];
    haystack.char_indices().find_map(|(match_start, raw)| {
        let normalized = self.normalize_char::<false>(query.ignore_case, raw);
        // The end offset must be computed from the char that is actually stored
        // in the haystack: case folding/normalization can change the UTF-8
        // length (e.g. U+212A KELVIN SIGN folds to `k`), and an end offset
        // derived from the normalized char may not be a char boundary.
        (normalized == needle_char).then_some((match_start, match_start + raw.len_utf8()))
    })
}
|
|
||||||
|
|
||||||
/// Cheap rejection test that runs before the real matching algorithm.
///
/// Dispatches to the ASCII or non-ASCII prefilter depending on
/// `query.is_ascii` and forwards its `(start, end)` result.
fn prefilter(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
    // a haystack with fewer bytes than the needle has chars can never match
    if haystack.len() < query.needle_chars.len() {
        return None;
    }
    match query.is_ascii {
        true => self.prefilter_ascii(query, haystack.as_bytes()),
        false => self.prefilter_non_ascii(query, haystack),
    }
}
|
|
||||||
|
|
||||||
fn fuzzy_matcher_v1<const INDICIES: bool, const ASCII_ONLY: bool>(
|
|
||||||
&mut self,
|
|
||||||
query: &Query,
|
|
||||||
haystack: &str,
|
|
||||||
indicies: &mut Vec<u32>,
|
|
||||||
) -> Option<u16> {
|
|
||||||
let (start, end) = self.prefilter(query, haystack)?;
|
|
||||||
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
|
||||||
query, haystack, start, end, indicies,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fuzzy_matcher_v1_with_prefilter<const INDICIES: bool, const ASCII_ONLY: bool>(
|
|
||||||
&mut self,
|
|
||||||
query: &Query,
|
|
||||||
haystack: &str,
|
|
||||||
mut start: usize,
|
|
||||||
mut end: usize,
|
|
||||||
indicies: &mut Vec<u32>,
|
|
||||||
) -> Option<u16> {
|
|
||||||
let first_char_end = if ASCII_ONLY { start + 1 } else { end };
|
|
||||||
if !ASCII_ONLY && query.needle_chars.len() != 1 {
|
|
||||||
let mut needle_iter = query.needle_chars[1..].iter().copied();
|
|
||||||
if let Some(mut needle_char) = needle_iter.next() {
|
|
||||||
let haystack = haystack[first_char_end..]
|
|
||||||
.char_indices()
|
|
||||||
.rev()
|
|
||||||
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
|
|
||||||
for (i, c) in haystack {
|
|
||||||
if c == needle_char {
|
|
||||||
let Some(next_needle_char) = needle_iter.next() else {
|
|
||||||
end = i + c.len_utf8();
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
needle_char = next_needle_char;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// very simple, just mimimize from the back
|
|
||||||
let match_ = haystack[first_char_end..end]
|
|
||||||
.char_indices()
|
|
||||||
.rev()
|
|
||||||
.map(|(i, c)| (i, self.normalize_char::<ASCII_ONLY>(query.ignore_case, c)));
|
|
||||||
|
|
||||||
let mut needle_iter = query.needle_chars[..].iter().rev().copied();
|
|
||||||
let mut needle_char = needle_iter.next().unwrap();
|
|
||||||
for (i, c) in match_ {
|
|
||||||
if c == needle_char {
|
|
||||||
let Some(next_needle_char) = needle_iter.next() else {
|
|
||||||
start = i;
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
needle_char = next_needle_char;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(self.calculate_score::<INDICIES, ASCII_ONLY>(query, haystack, start, end, indicies))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn calculate_score<const INDICIES: bool, const ASCII_ONLY: bool>(
|
|
||||||
&mut self,
|
|
||||||
query: &Query,
|
|
||||||
text: &str,
|
|
||||||
match_start: usize,
|
|
||||||
match_end: usize,
|
|
||||||
indicies: &mut Vec<u32>,
|
|
||||||
) -> u16 {
|
|
||||||
if INDICIES {
|
|
||||||
indicies.reserve(query.needle_chars.len());
|
|
||||||
}
|
|
||||||
let mut prev_class = text[..match_start]
|
|
||||||
.chars()
|
|
||||||
.next_back()
|
|
||||||
.map(|c| self.config.char_class(c))
|
|
||||||
.unwrap_or(self.config.inital_char_class);
|
|
||||||
let mut needle_idx = 0;
|
|
||||||
let mut score = 0u16;
|
|
||||||
let mut in_gap = false;
|
|
||||||
let mut consecutive = 0;
|
|
||||||
let mut first_bonus = 0u16;
|
|
||||||
for (i, mut c) in text[match_start..match_end].char_indices() {
|
|
||||||
let class = self.config.char_class(c);
|
|
||||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
|
||||||
c = to_lower_case::<ASCII_ONLY>(c);
|
|
||||||
}
|
|
||||||
if self.config.normalize && !ASCII_ONLY {
|
|
||||||
c = normalize(c)
|
|
||||||
}
|
|
||||||
if c == query.needle_chars[needle_idx] {
|
|
||||||
if INDICIES {
|
|
||||||
indicies.push(i as u32)
|
|
||||||
}
|
|
||||||
score += SCORE_MATCH;
|
|
||||||
let mut bonus = self.config.bonus_for(prev_class, class);
|
|
||||||
if consecutive == 0 {
|
|
||||||
first_bonus = bonus
|
|
||||||
} else {
|
|
||||||
// Break consecutive chunk
|
|
||||||
if bonus > first_bonus {
|
|
||||||
if bonus >= BONUS_BOUNDARY {
|
|
||||||
first_bonus = bonus;
|
|
||||||
} else {
|
|
||||||
bonus = max(bonus, BONUS_CONSECUTIVE);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if needle_idx == 0 {
|
|
||||||
bonus *= BONUS_FIRST_CHAR_MULTIPLIER;
|
|
||||||
}
|
|
||||||
score += bonus;
|
|
||||||
needle_idx += 1;
|
|
||||||
in_gap = false;
|
|
||||||
consecutive += 1;
|
|
||||||
} else {
|
|
||||||
let penalty = if in_gap {
|
|
||||||
PENALTY_GAP_EXTENSION
|
|
||||||
} else {
|
|
||||||
PENALTY_GAP_START
|
|
||||||
};
|
|
||||||
score = score.saturating_sub(penalty);
|
|
||||||
in_gap = true;
|
|
||||||
consecutive = 0;
|
|
||||||
first_bonus = 0;
|
|
||||||
}
|
|
||||||
prev_class = class;
|
|
||||||
}
|
|
||||||
|
|
||||||
score
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fuzzy_matcher_v2<const INDICIES: bool, const ASCII_ONLY: bool>(
|
|
||||||
&mut self,
|
|
||||||
query: &Query,
|
|
||||||
text: &str,
|
|
||||||
indicies: &mut Vec<u32>,
|
|
||||||
) -> Option<u16> {
|
|
||||||
let (start, prefilter_end) = self.prefilter(query, text)?;
|
|
||||||
let text_len = text.len() - start;
|
|
||||||
// fallback to v1 algorithms for long haystacks
|
|
||||||
// technically we need to multiply by char len here
|
|
||||||
// but counting chars has a lot of unecessary overhead that we can avoid
|
|
||||||
// here in practice using bytelen should be a reasonable approximation
|
|
||||||
// we also differ from fzf here in that we never allocate and instead stringintly check here
|
|
||||||
if text_len > u16::MAX as usize || text_len * query.needle_chars.len() > MAX_HAYSTACK_LEN {
|
|
||||||
return self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
|
||||||
query,
|
|
||||||
text,
|
|
||||||
start,
|
|
||||||
prefilter_end,
|
|
||||||
indicies,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut prev_class = text[..start]
|
|
||||||
.chars()
|
|
||||||
.next_back()
|
|
||||||
.map(|c| self.config.char_class(c))
|
|
||||||
.unwrap_or(self.config.inital_char_class);
|
|
||||||
|
|
||||||
let text = &text[start..];
|
|
||||||
|
|
||||||
let mut needle_iter = query.needle_chars[..]
|
|
||||||
.iter()
|
|
||||||
.copied()
|
|
||||||
.zip(self.first_needle_occurance.iter_mut());
|
|
||||||
let (mut needle_char, mut needle_char_idx) = needle_iter.next().unwrap();
|
|
||||||
|
|
||||||
let iter = text[start..]
|
|
||||||
.chars()
|
|
||||||
.zip(self.matrix.iter_mut())
|
|
||||||
.zip(self.haystack.iter_mut())
|
|
||||||
.enumerate();
|
|
||||||
|
|
||||||
let mut last_matched_idx = 0;
|
|
||||||
let mut max_score = 0;
|
|
||||||
let mut max_score_pos = 0;
|
|
||||||
let mut in_gap = false;
|
|
||||||
let mut prev_score = 0u16;
|
|
||||||
let mut matched = false;
|
|
||||||
|
|
||||||
let first_needle_char = query.needle_chars[0];
|
|
||||||
for (i, ((mut c, matrix_cell), char_info)) in iter {
|
|
||||||
let class = self.config.char_class(c);
|
|
||||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
|
||||||
c = to_lower_case::<ASCII_ONLY>(c);
|
|
||||||
}
|
|
||||||
if self.config.normalize && !ASCII_ONLY {
|
|
||||||
c = normalize(c)
|
|
||||||
}
|
|
||||||
char_info.char = c;
|
|
||||||
let bonus = self.config.bonus_for(prev_class, class);
|
|
||||||
char_info.char = c;
|
|
||||||
prev_class = class;
|
|
||||||
|
|
||||||
let i = i as u16;
|
|
||||||
if c == needle_char {
|
|
||||||
// save the first idx of each char
|
|
||||||
if let Some(next) = needle_iter.next() {
|
|
||||||
*needle_char_idx = i;
|
|
||||||
(needle_char, needle_char_idx) = next
|
|
||||||
} else {
|
|
||||||
// we have atleast one match
|
|
||||||
matched = true;
|
|
||||||
}
|
|
||||||
// and the last matched char
|
|
||||||
last_matched_idx = i;
|
|
||||||
}
|
|
||||||
if c == first_needle_char {
|
|
||||||
let score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
|
||||||
matrix_cell.consecutive_chars = 1;
|
|
||||||
if query.needle_chars.len() == 1 && score > max_score {
|
|
||||||
max_score = score;
|
|
||||||
max_score_pos = i;
|
|
||||||
// can't get better than this
|
|
||||||
if bonus >= BONUS_BOUNDARY {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
in_gap = false;
|
|
||||||
} else {
|
|
||||||
let gap_penalty = if in_gap {
|
|
||||||
PENALTY_GAP_EXTENSION
|
|
||||||
} else {
|
|
||||||
PENALTY_GAP_START
|
|
||||||
};
|
|
||||||
matrix_cell.score = prev_score.saturating_sub(gap_penalty);
|
|
||||||
matrix_cell.consecutive_chars = 0;
|
|
||||||
in_gap = true;
|
|
||||||
}
|
|
||||||
prev_score = matrix_cell.score;
|
|
||||||
}
|
|
||||||
if !matched {
|
|
||||||
debug_assert!(!ASCII_ONLY, "prefilter should have rejected");
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
if query.needle_chars.len() == 1 {
|
|
||||||
indicies.push(max_score_pos as u32);
|
|
||||||
return Some(max_score);
|
|
||||||
}
|
|
||||||
assert_eq!(
|
|
||||||
self.first_needle_occurance[0], 0,
|
|
||||||
"prefilter should have put us at the start of the match"
|
|
||||||
);
|
);
|
||||||
let haystack_len = last_matched_idx as usize + 1;
|
match (haystack, needle_) {
|
||||||
let (max_score, best_match_end) = self.popultate_matrix(haystack_len, query);
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
||||||
if INDICIES {
|
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?;
|
||||||
indicies.reserve(query.needle_chars.len());
|
self.fuzzy_match_optimal::<INDICIES, u8, u8>(
|
||||||
let mut col = best_match_end;
|
haystack, needle, start, greedy_end, end, indidies,
|
||||||
let mut needle_iter = self.matrix[..haystack_len * query.needle_chars.len()]
|
)
|
||||||
.windows(haystack_len)
|
}
|
||||||
.zip(self.first_needle_occurance[..haystack_len].iter())
|
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
||||||
.rev()
|
// a purely ascii haystack can never be transformed to match
|
||||||
.peekable();
|
// a needle that contains non-ascii chars since we don't allow gaps
|
||||||
let mut next_row = None;
|
None
|
||||||
let (mut row, mut first_needle_occurance) = needle_iter.next().unwrap();
|
}
|
||||||
let mut prefer_match = true;
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
||||||
loop {
|
todo!()
|
||||||
let score = row[col as usize].score;
|
// let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
|
||||||
let mut score1 = 0;
|
// self.fuzzy_match_optimal::<INDICIES, char, u8>(
|
||||||
let mut score2 = 0;
|
// haystack,
|
||||||
if let Some((prev_row, _)) = needle_iter.peek() {
|
// needle,
|
||||||
if col >= *first_needle_occurance {
|
// start,
|
||||||
score1 = prev_row[col as usize].score;
|
// start + 1,
|
||||||
}
|
// end,
|
||||||
}
|
// indidies,
|
||||||
if col > *first_needle_occurance {
|
// )
|
||||||
score2 = row[col as usize - 1].score;
|
}
|
||||||
}
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
||||||
if score > score1 && (score > score2 || score == score2 && prefer_match) {
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
|
||||||
indicies.push(col as u32 + start as u32);
|
self.fuzzy_match_optimal::<INDICIES, char, char>(
|
||||||
next_row = Some(row);
|
haystack,
|
||||||
let Some(next) = needle_iter.next() else {
|
needle,
|
||||||
break;
|
start,
|
||||||
};
|
start + 1,
|
||||||
(row, first_needle_occurance) = next
|
end,
|
||||||
}
|
indidies,
|
||||||
prefer_match = row[col as usize].consecutive_chars > 1;
|
)
|
||||||
if !prefer_match && col + 1 < query.needle_chars.len() as u16 {
|
|
||||||
if let Some(next_row) = next_row {
|
|
||||||
prefer_match = next_row[col as usize + 1].consecutive_chars > 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
col -= 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(max_score)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (u16, u16) {
|
// pub fn fuzzy_indicies(
|
||||||
let mut max_score = 0;
|
// &mut self,
|
||||||
let mut max_score_end = 0;
|
// query: &Query,
|
||||||
let mut iter = query
|
// mut haystack: Utf32Str<'_>,
|
||||||
.needle_chars
|
// indicies: &mut Vec<u32>,
|
||||||
.iter()
|
// ) -> Option<u16> {
|
||||||
.zip(self.first_needle_occurance.iter())
|
// if haystack.len() > u32::MAX as usize {
|
||||||
.zip(self.matrix.chunks_mut(haystack_len))
|
// haystack = &haystack[..u32::MAX as usize]
|
||||||
.enumerate();
|
// }
|
||||||
// skip the first row we already calculated the initial scores
|
// println!(
|
||||||
let (_, ((&_, &_), mut prev_matrix_row)) = iter.next().unwrap();
|
// "start {haystack:?}, {:?} {} {}",
|
||||||
for (i, ((&needle_char, &first_occurance), matrix_row)) in iter {
|
// query.needle_chars, query.ignore_case, query.is_ascii
|
||||||
// help the optimizer out a little
|
// );
|
||||||
assert!((first_occurance as usize) < matrix_row.len());
|
// if self.config.use_v1 {
|
||||||
assert!(first_occurance != 0);
|
// if query.is_ascii && !self.config.normalize {
|
||||||
let mut in_gap = false;
|
// self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
|
||||||
let haystack = &self.haystack[first_occurance as usize..haystack_len];
|
// } else {
|
||||||
let mut prev_matrix_cell = matrix_row[first_occurance as usize - 1];
|
// self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
|
||||||
let matrix_row = &mut matrix_row[first_occurance as usize..haystack_len];
|
// }
|
||||||
let prev_matrix_diagonal =
|
// } else if query.is_ascii && !self.config.normalize {
|
||||||
&mut prev_matrix_row[first_occurance as usize - 1..haystack_len - 1];
|
// self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
|
||||||
for (j, ((&haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
// } else {
|
||||||
.iter()
|
// self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
|
||||||
.zip(matrix_row.iter_mut())
|
// }
|
||||||
.zip(prev_matrix_diagonal.iter())
|
// }
|
||||||
.enumerate()
|
|
||||||
{
|
|
||||||
let col = j + first_occurance as usize;
|
|
||||||
let gap_penalty = if in_gap {
|
|
||||||
PENALTY_GAP_EXTENSION
|
|
||||||
} else {
|
|
||||||
PENALTY_GAP_START
|
|
||||||
};
|
|
||||||
let mut score1 = 0;
|
|
||||||
let score2 = prev_matrix_cell.score.saturating_sub(gap_penalty);
|
|
||||||
let mut consecutive = 0;
|
|
||||||
if haystack_char.char == needle_char {
|
|
||||||
score1 = diag_matrix_cell.score + SCORE_MATCH;
|
|
||||||
let mut bonus = haystack_char.bonus;
|
|
||||||
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
|
||||||
if consecutive > 1 {
|
|
||||||
let first_bonus = self.haystack[col - consecutive as usize].bonus;
|
|
||||||
if bonus > first_bonus {
|
|
||||||
if bonus > BONUS_BOUNDARY {
|
|
||||||
consecutive = 1
|
|
||||||
} else {
|
|
||||||
bonus = max(bonus, BONUS_CONSECUTIVE)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
bonus = max(first_bonus, BONUS_CONSECUTIVE)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if score1 + bonus < score2 {
|
|
||||||
score1 += haystack_char.bonus;
|
|
||||||
consecutive = 0;
|
|
||||||
} else {
|
|
||||||
score1 += bonus;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
in_gap = score1 < score2;
|
|
||||||
let score = max(max(score1, score2), 0);
|
|
||||||
prev_matrix_cell = *matrix_cell;
|
|
||||||
if i == query.needle_chars.len() - 1 && score > max_score {
|
|
||||||
max_score = score;
|
|
||||||
max_score_end = col as u16;
|
|
||||||
}
|
|
||||||
matrix_cell.consecutive_chars = consecutive;
|
|
||||||
matrix_cell.score = score;
|
|
||||||
}
|
|
||||||
prev_matrix_row = matrix_row;
|
|
||||||
}
|
|
||||||
(max_score, max_score_end)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn to_lower_case<const ASCII_ONLY: bool>(c: char) -> char {
|
|
||||||
if c >= 'A' && c <= 'Z' {
|
|
||||||
char::from_u32(c as u32 + 32).unwrap()
|
|
||||||
} else if !c.is_ascii() && !ASCII_ONLY {
|
|
||||||
case_fold::CASE_FOLDING_SIMPLE
|
|
||||||
.binary_search_by_key(&c, |(upper, _)| *upper)
|
|
||||||
.map_or(c, |idx| case_fold::CASE_FOLDING_SIMPLE[idx].1)
|
|
||||||
} else {
|
|
||||||
c
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
280
src/matrix.rs
Normal file
280
src/matrix.rs
Normal file
@ -0,0 +1,280 @@
|
|||||||
|
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
|
||||||
|
use std::fmt::{Debug, Formatter, Result};
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
use std::mem::{size_of, take};
|
||||||
|
use std::ops::Index;
|
||||||
|
use std::ptr::{slice_from_raw_parts_mut, NonNull};
|
||||||
|
|
||||||
|
use crate::chars::Char;
|
||||||
|
|
||||||
|
// Maximum number of `MatrixCell`s a matrix may use. A cell consists of two
// `u16`s (4 bytes), so the cell array of the slab is 4 * 100 * 1024 bytes.
const MAX_MATRIX_SIZE: usize = 100 * 1024; // 4 * 100 * 1024 bytes = 400 KiB
|
||||||
|
|
||||||
|
// these two aren't hard maxima, they only size the arrays of `MatrixData`;
// `MatrixSlab::alloc` accepts whatever fits into the slab as a whole
|
||||||
|
const MAX_HAYSTACK_LEN: usize = 2048; // `[char; N]` (8 KiB) + `[u16; N]` (4 KiB)
|
||||||
|
const MAX_NEEDLE_LEN: usize = 2048; // `[u16; N]` (4 KiB)
|
||||||
|
|
||||||
|
/// Describes how the four arrays of a matrix (haystack chars, bonuses, row
/// offsets and cells) are packed into one contiguous allocation.
struct MatrixLayout<C: Char> {
|
||||||
|
/// Number of elements of the haystack array (the bonus array has the same length).
haystack_len: usize,
|
||||||
|
/// Number of `u16` entries of the row-offset array.
needle_len: usize,
|
||||||
|
/// Number of `MatrixCell`s of the cell array.
cell_count: usize,
|
||||||
|
/// Layout of the whole allocation (all four arrays including padding).
layout: Layout,
|
||||||
|
/// Byte offset of the haystack array from the start of the allocation.
haystack_off: usize,
|
||||||
|
/// Byte offset of the bonus array from the start of the allocation.
bonus_off: usize,
|
||||||
|
/// Byte offset of the row-offset array from the start of the allocation.
rows_off: usize,
|
||||||
|
/// Byte offset of the cell array from the start of the allocation.
cells_off: usize,
|
||||||
|
/// Ties the layout to the haystack element type without storing one.
_phantom: PhantomData<C>,
|
||||||
|
}
|
||||||
|
impl<C: Char> MatrixLayout<C> {
|
||||||
|
fn new(haystack_len: usize, needle_len: usize, cell_count: usize) -> MatrixLayout<C> {
|
||||||
|
let mut layout = Layout::from_size_align(0, 1).unwrap();
|
||||||
|
let haystack_layout = Layout::array::<C>(haystack_len).unwrap();
|
||||||
|
let bonus_layout = Layout::array::<u16>(haystack_len).unwrap();
|
||||||
|
let rows_layout = Layout::array::<u16>(needle_len).unwrap();
|
||||||
|
let cells_layout = Layout::array::<MatrixCell>(cell_count).unwrap();
|
||||||
|
|
||||||
|
let haystack_off;
|
||||||
|
(layout, haystack_off) = layout.extend(haystack_layout).unwrap();
|
||||||
|
let bonus_off;
|
||||||
|
(layout, bonus_off) = layout.extend(bonus_layout).unwrap();
|
||||||
|
let rows_off;
|
||||||
|
(layout, rows_off) = layout.extend(rows_layout).unwrap();
|
||||||
|
let cells_off;
|
||||||
|
(layout, cells_off) = layout.extend(cells_layout).unwrap();
|
||||||
|
MatrixLayout {
|
||||||
|
haystack_len,
|
||||||
|
needle_len,
|
||||||
|
cell_count,
|
||||||
|
layout,
|
||||||
|
haystack_off,
|
||||||
|
bonus_off,
|
||||||
|
rows_off,
|
||||||
|
cells_off,
|
||||||
|
_phantom: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// `ptr` must point at an allocated with MARTIX_ALLOC_LAYOUT
|
||||||
|
unsafe fn fieds_from_ptr(
|
||||||
|
&self,
|
||||||
|
ptr: NonNull<u8>,
|
||||||
|
) -> (*mut [C], *mut [u16], *mut [u16], *mut [MatrixCell]) {
|
||||||
|
// sanity checks, should not be necessary
|
||||||
|
|
||||||
|
let base = ptr.as_ptr();
|
||||||
|
let haystack = base.add(self.haystack_off) as *mut C;
|
||||||
|
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
|
||||||
|
let bonus = base.add(self.bonus_off) as *mut u16;
|
||||||
|
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
|
||||||
|
let rows = base.add(self.rows_off) as *mut u16;
|
||||||
|
let rows = slice_from_raw_parts_mut(rows, self.needle_len);
|
||||||
|
let cells = base.add(self.cells_off) as *mut MatrixCell;
|
||||||
|
let cells = slice_from_raw_parts_mut(cells, self.cell_count);
|
||||||
|
(haystack, bonus, rows, cells)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single entry of the score matrix.
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub(crate) struct MatrixCell {
|
||||||
|
/// Score stored for this cell.
pub score: u16,
|
||||||
|
/// Consecutive-match counter stored for this cell.
/// NOTE(review): presumably the length of the run of matched chars ending here - confirm against the matrix fill code.
pub consecutive_chars: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Debug for MatrixCell {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
(self.score, self.consecutive_chars).fmt(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A haystack char paired with the bonus stored for its position.
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub(crate) struct HaystackChar<C: Char> {
|
||||||
|
/// The haystack char.
pub char: C,
|
||||||
|
/// The bonus stored alongside the char.
pub bonus: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<C: Char> Debug for HaystackChar<C> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
(self.char, self.bonus).fmt(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read-only view of one matrix row.
#[derive(Clone, Copy)]
|
||||||
|
pub(crate) struct MatrixRow<'a> {
|
||||||
|
/// Number of leading columns that are not stored for this row.
pub off: u16,
|
||||||
|
/// The stored cells of this row (the columns from `off` onwards).
pub cells: &'a [MatrixCell],
|
||||||
|
}
|
||||||
|
impl Index<u16> for MatrixRow<'_> {
|
||||||
|
type Output = MatrixCell;
|
||||||
|
|
||||||
|
fn index(&self, index: u16) -> &Self::Output {
|
||||||
|
&self.cells[index as usize]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Debug for MatrixRow<'_> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
let mut f = f.debug_list();
|
||||||
|
f.entries((0..self.off).map(|_| &(0, 0)));
|
||||||
|
f.entries(self.cells.iter());
|
||||||
|
f.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mutable view of one matrix row.
pub(crate) struct MatrixRowMut<'a> {
|
||||||
|
/// Number of leading columns that are not stored for this row.
pub off: u16,
|
||||||
|
/// The stored cells of this row (the columns from `off` onwards).
pub cells: &'a mut [MatrixCell],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Debug for MatrixRowMut<'_> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
let mut f = f.debug_list();
|
||||||
|
f.entries((0..self.off).map(|_| &(0, 0)));
|
||||||
|
f.entries(self.cells.iter());
|
||||||
|
f.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wrapper that prints the items of a cloneable iterator as a debug list.
pub struct DebugList<I>(I);

impl<I> Debug for DebugList<I>
where
    I: Iterator + Clone,
    I::Item: Debug,
{
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        // formatting must not consume the wrapped iterator, so iterate a clone
        let mut list = f.debug_list();
        for item in self.0.clone() {
            list.entry(&item);
        }
        list.finish()
    }
}
|
||||||
|
|
||||||
|
/// Borrowed views into the arrays that make up one matrix.
pub(crate) struct Matrix<'a, C: Char> {
|
||||||
|
/// The haystack chars.
pub haystack: &'a mut [C],
|
||||||
|
// stored as a separate array instead of struct
|
||||||
|
// to avoid padding since char is too large and u8 too small :/
|
||||||
|
pub bonus: &'a mut [u16],
|
||||||
|
/// One offset per row: the number of leading columns the row does not store.
pub row_offs: &'a mut [u16],
|
||||||
|
/// The cells of all rows, stored back to back.
pub cells: &'a mut [MatrixCell],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, C: Char> Matrix<'a, C> {
|
||||||
|
pub fn rows(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator + Clone + Sized {
|
||||||
|
let mut cells = &*self.cells;
|
||||||
|
self.row_offs.iter().map(move |&off| {
|
||||||
|
let len = self.haystack.len() - off as usize;
|
||||||
|
let (row, tmp) = cells.split_at(len);
|
||||||
|
cells = tmp;
|
||||||
|
MatrixRow { off, cells: row }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn rows_rev(&self) -> impl Iterator<Item = MatrixRow> + ExactSizeIterator {
|
||||||
|
let mut cells = &*self.cells;
|
||||||
|
self.row_offs.iter().rev().map(move |&off| {
|
||||||
|
let len = self.haystack.len() - off as usize;
|
||||||
|
let (tmp, row) = cells.split_at(cells.len() - len);
|
||||||
|
cells = tmp;
|
||||||
|
MatrixRow { off, cells: row }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
pub fn haystack(
|
||||||
|
&self,
|
||||||
|
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + '_ + Clone {
|
||||||
|
haystack(self.haystack, self.bonus, 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, C: Char> Debug for Matrix<'a, C> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||||
|
f.debug_struct("Matrix")
|
||||||
|
.field("haystack", &DebugList(self.haystack()))
|
||||||
|
.field("matrix", &DebugList(self.rows()))
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub(crate) fn haystack<'a, C: Char>(
|
||||||
|
haystack: &'a [C],
|
||||||
|
bonus: &'a [u16],
|
||||||
|
skip: u16,
|
||||||
|
) -> impl Iterator<Item = HaystackChar<C>> + ExactSizeIterator + Clone + 'a {
|
||||||
|
haystack[skip as usize..]
|
||||||
|
.iter()
|
||||||
|
.zip(bonus[skip as usize..].iter())
|
||||||
|
.map(|(&char, &bonus)| HaystackChar { char, bonus })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn rows_mut<'a>(
|
||||||
|
row_offs: &'a [u16],
|
||||||
|
mut cells: &'a mut [MatrixCell],
|
||||||
|
haystack_len: usize,
|
||||||
|
) -> impl Iterator<Item = MatrixRowMut<'a>> + ExactSizeIterator + 'a {
|
||||||
|
row_offs.iter().map(move |&off| {
|
||||||
|
let len = haystack_len - off as usize;
|
||||||
|
let (row, tmp) = take(&mut cells).split_at_mut(len);
|
||||||
|
cells = tmp;
|
||||||
|
MatrixRowMut { off, cells: row }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// we only use this to construct the layout for the slab allocation
// (`MatrixSlab` allocates and frees its memory with `Layout::new::<MatrixData>()`)
|
||||||
|
// the fields are never read, only the size/alignment of the struct matters
#[allow(unused)]
|
||||||
|
struct MatrixData {
|
||||||
|
// room for the haystack chars (sized for the largest element type, `char`)
haystack: [char; MAX_HAYSTACK_LEN],
|
||||||
|
// room for one bonus per haystack char
bonus: [u16; MAX_HAYSTACK_LEN],
|
||||||
|
// room for one row offset per needle char
row_offs: [u16; MAX_NEEDLE_LEN],
|
||||||
|
// room for the matrix cells
cells: [MatrixCell; MAX_MATRIX_SIZE],
|
||||||
|
}
|
||||||
|
|
||||||
|
// const MATRIX_ALLOC_LAYOUT: Layout =
|
||||||
|
// MatrixLayout::<char>::new(MAX_HAYSTACK_LEN, MAX_NEEDLE_LEN, MAX_MATRIX_SIZE).layout;
|
||||||
|
|
||||||
|
/// Owns one heap allocation with the layout of `MatrixData`; matrices are
/// carved out of it by `alloc` and the memory is released on drop.
pub(crate) struct MatrixSlab(NonNull<u8>);
|
||||||
|
|
||||||
|
impl MatrixSlab {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let layout = Layout::new::<MatrixData>();
|
||||||
|
// safety: the matrix is never zero sized (hardcoded constants)
|
||||||
|
let ptr = unsafe { alloc_zeroed(layout) };
|
||||||
|
let Some(ptr) = NonNull::new(ptr) else{
|
||||||
|
handle_alloc_error(layout)
|
||||||
|
};
|
||||||
|
MatrixSlab(ptr.cast())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn alloc<C: Char>(
|
||||||
|
&mut self,
|
||||||
|
haystack_: &[C],
|
||||||
|
needle_len: usize,
|
||||||
|
) -> Option<Matrix<'_, C>> {
|
||||||
|
let cells = haystack_.len() * needle_len;
|
||||||
|
if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let matrix_layout = MatrixLayout::<C>::new(
|
||||||
|
haystack_.len(),
|
||||||
|
needle_len,
|
||||||
|
(haystack_.len() - needle_len / 2) * needle_len,
|
||||||
|
);
|
||||||
|
if matrix_layout.layout.size() > size_of::<MatrixData>() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
unsafe {
|
||||||
|
// safetly: this allocation is valid for MATRIX_ALLOC_LAYOUT
|
||||||
|
let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0);
|
||||||
|
// copy haystack before creating refernces to ensure we donu't crate
|
||||||
|
// refrences to invalid chars (which may or may not be UB)
|
||||||
|
haystack_
|
||||||
|
.as_ptr()
|
||||||
|
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());
|
||||||
|
Some(Matrix {
|
||||||
|
haystack: &mut *haystack,
|
||||||
|
row_offs: &mut *rows,
|
||||||
|
bonus: &mut *bonus,
|
||||||
|
cells: &mut *cells,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for MatrixSlab {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatrixData>()) };
|
||||||
|
}
|
||||||
|
}
|
0
src/multizip.rs
Normal file
0
src/multizip.rs
Normal file
73
src/prefilter.rs
Normal file
73
src/prefilter.rs
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
use ::memchr::{memchr, memchr2, memrchr, memrchr2};
|
||||||
|
|
||||||
|
use crate::chars::Char;
|
||||||
|
use crate::utf32_str::Utf32Str;
|
||||||
|
use crate::Matcher;
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||||
|
if c >= b'a' || c <= b'z' {
|
||||||
|
memchr2(c, c - 32, haystack)
|
||||||
|
} else {
|
||||||
|
memchr(c, haystack)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||||
|
if c >= b'a' || c <= b'z' {
|
||||||
|
memrchr2(c, c - 32, haystack)
|
||||||
|
} else {
|
||||||
|
memrchr(c, haystack)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher {
|
||||||
|
pub(crate) fn prefilter_ascii(
|
||||||
|
&self,
|
||||||
|
mut haystack: &[u8],
|
||||||
|
needle: &[u8],
|
||||||
|
) -> Option<(usize, usize, usize)> {
|
||||||
|
if self.config.ignore_case {
|
||||||
|
let start = find_ascii_ignore_case(needle[0], haystack)?;
|
||||||
|
let mut eager_end = start + 1;
|
||||||
|
haystack = &haystack[eager_end..];
|
||||||
|
for &c in &needle[1..] {
|
||||||
|
let idx = find_ascii_ignore_case(c, haystack)? + 1;
|
||||||
|
eager_end += idx;
|
||||||
|
haystack = &haystack[idx..];
|
||||||
|
}
|
||||||
|
let end = eager_end
|
||||||
|
+ find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack).unwrap_or(0);
|
||||||
|
Some((start, eager_end, end))
|
||||||
|
} else {
|
||||||
|
let start = memchr(needle[0], haystack)?;
|
||||||
|
let mut eager_end = start + 1;
|
||||||
|
haystack = &haystack[eager_end..];
|
||||||
|
for &c in &needle[1..] {
|
||||||
|
let idx = memchr(c, haystack)? + 1;
|
||||||
|
eager_end += idx;
|
||||||
|
haystack = &haystack[idx..];
|
||||||
|
}
|
||||||
|
let end = eager_end + memrchr(*needle.last().unwrap(), haystack).unwrap_or(0);
|
||||||
|
Some((start, eager_end, end))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn prefilter_non_ascii(
|
||||||
|
&self,
|
||||||
|
haystack: &[char],
|
||||||
|
needle: Utf32Str<'_>,
|
||||||
|
) -> Option<(usize, usize)> {
|
||||||
|
let needle_char = needle.get(0);
|
||||||
|
let start = haystack
|
||||||
|
.iter()
|
||||||
|
.position(|c| c.normalize(&self.config) == needle_char)?;
|
||||||
|
let needle_char = needle.last();
|
||||||
|
let end = haystack[start..]
|
||||||
|
.iter()
|
||||||
|
.position(|c| c.normalize(&self.config) == needle_char)?;
|
||||||
|
|
||||||
|
Some((start, end))
|
||||||
|
}
|
||||||
|
}
|
145
src/score.rs
Normal file
145
src/score.rs
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
use std::cmp::max;
|
||||||
|
|
||||||
|
use crate::chars::{Char, CharClass};
|
||||||
|
use crate::{Matcher, MatcherConfig};
|
||||||
|
|
||||||
|
/// Base score granted for every matched character.
pub(crate) const SCORE_MATCH: u16 = 16;
/// Penalty applied to the first unmatched character of a gap.
pub(crate) const PENALTY_GAP_START: u16 = 3;
/// Penalty applied to every further unmatched character of a gap.
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;

/// Bonus for a match at the beginning of a word.
///
/// Word-start matches are preferred, but the bonus must not be so large that
/// longer acronym matches always win over shorter fuzzy matches. The value was
/// chosen so that the bonus is cancelled once the gap between the acronym
/// letters grows beyond 8 characters, which is roughly the average word length
/// found in the web2 dictionary and the original author's file system.
pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;

/// Bonus for matching a non-word character.
///
/// This bonus is non-contextual, but it is still required to compute the bonus
/// of consecutive chunks that start with a non-word character.
pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2;

/// Edge-triggered bonus for matches inside camelCase words (and letter→digit
/// transitions).
///
/// Unlike a word boundary, a camelCase transition is not accompanied by a
/// single-character gap (e.g. `FooBar` vs. `foo-bar`), so the bonus is reduced
/// accordingly.
pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION;

/// Minimum bonus given to characters in consecutive chunks.
///
/// Bonus points for consecutive matches would not have been needed if a fixed
/// match score were used, as in the original algorithm.
pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION;

/// Multiplier applied to the bonus of the first pattern character.
///
/// The first character of the typed pattern usually carries more significance
/// than the rest, so it matters that it lands on a position that earns a bonus
/// (e.g. "to-go" vs. "ongoing" for "og" or "ogo"). The extra bonus is kept
/// small so that the gap penalty is still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
|
||||||
|
|
||||||
|
impl MatcherConfig {
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
||||||
|
if class > CharClass::NonWord {
|
||||||
|
// transition from non word to word
|
||||||
|
match prev_class {
|
||||||
|
CharClass::Whitespace => return self.bonus_boundary_white,
|
||||||
|
CharClass::Delimiter => return self.bonus_boundary_delimiter,
|
||||||
|
CharClass::NonWord => return BONUS_BOUNDARY,
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if prev_class == CharClass::Lower && class == CharClass::Upper
|
||||||
|
|| prev_class != CharClass::Number && class == CharClass::Number
|
||||||
|
{
|
||||||
|
// camelCase letter123
|
||||||
|
BONUS_CAMEL123
|
||||||
|
} else if class == CharClass::NonWord {
|
||||||
|
BONUS_NON_WORD
|
||||||
|
} else if class == CharClass::Whitespace {
|
||||||
|
self.bonus_boundary_white
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl Matcher {
|
||||||
|
#[inline(always)]
|
||||||
|
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
|
||||||
|
self.config.bonus_for(prev_class, class)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn calculate_score<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
|
||||||
|
&mut self,
|
||||||
|
haystack: &[H],
|
||||||
|
needle: &[N],
|
||||||
|
start: usize,
|
||||||
|
end: usize,
|
||||||
|
indicies: &mut Vec<u32>,
|
||||||
|
) -> u16 {
|
||||||
|
if INDICIES {
|
||||||
|
indicies.reserve(needle.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut prev_class = start
|
||||||
|
.checked_sub(1)
|
||||||
|
.map(|i| haystack[i].char_class(&self.config))
|
||||||
|
.unwrap_or(self.config.inital_char_class);
|
||||||
|
let mut needle_iter = needle.iter();
|
||||||
|
let mut needle_char = *needle_iter.next().unwrap();
|
||||||
|
|
||||||
|
let mut in_gap = false;
|
||||||
|
let mut consecutive = 1;
|
||||||
|
|
||||||
|
// unrolled the firs iteration to make applying the first char multiplier less akward
|
||||||
|
if INDICIES {
|
||||||
|
indicies.push(start as u32)
|
||||||
|
}
|
||||||
|
let mut first_bonus = self.bonus_for(prev_class, haystack[0].char_class(&self.config));
|
||||||
|
let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER;
|
||||||
|
|
||||||
|
for (i, c) in haystack[start + 1..end].iter().enumerate() {
|
||||||
|
let class = c.char_class(&self.config);
|
||||||
|
let c = c.normalize(&self.config);
|
||||||
|
if c == needle_char {
|
||||||
|
if INDICIES {
|
||||||
|
indicies.push(i as u32 + start as u32)
|
||||||
|
}
|
||||||
|
let mut bonus = self.bonus_for(prev_class, class);
|
||||||
|
if consecutive == 0 {
|
||||||
|
first_bonus = bonus
|
||||||
|
} else {
|
||||||
|
// Break consecutive chunk
|
||||||
|
if bonus > first_bonus {
|
||||||
|
if bonus >= BONUS_BOUNDARY {
|
||||||
|
first_bonus = bonus;
|
||||||
|
} else {
|
||||||
|
bonus = max(bonus, BONUS_CONSECUTIVE);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
bonus = max(first_bonus, BONUS_CONSECUTIVE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
score += SCORE_MATCH + bonus;
|
||||||
|
in_gap = false;
|
||||||
|
consecutive += 1;
|
||||||
|
if let Some(&next) = needle_iter.next() {
|
||||||
|
needle_char = next;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let penalty = if in_gap {
|
||||||
|
PENALTY_GAP_EXTENSION
|
||||||
|
} else {
|
||||||
|
PENALTY_GAP_START
|
||||||
|
};
|
||||||
|
score = score.saturating_sub(penalty);
|
||||||
|
in_gap = true;
|
||||||
|
consecutive = 0;
|
||||||
|
first_bonus = 0;
|
||||||
|
}
|
||||||
|
prev_class = class;
|
||||||
|
}
|
||||||
|
|
||||||
|
score
|
||||||
|
}
|
||||||
|
}
|
270
src/tests.rs
Normal file
270
src/tests.rs
Normal file
@ -0,0 +1,270 @@
|
|||||||
|
use crate::config::{
|
||||||
|
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
|
||||||
|
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
|
||||||
|
};
|
||||||
|
use crate::{CaseMatching, Matcher, MatcherConfig};
|
||||||
|
|
||||||
|
pub fn assert_matches(
|
||||||
|
use_v1: bool,
|
||||||
|
normalize: bool,
|
||||||
|
case_sensitive: bool,
|
||||||
|
path: bool,
|
||||||
|
cases: &[(&str, &str, u32, u32, u16)],
|
||||||
|
) {
|
||||||
|
let mut config = MatcherConfig {
|
||||||
|
use_v1,
|
||||||
|
normalize,
|
||||||
|
case_matching: if case_sensitive {
|
||||||
|
CaseMatching::Respect
|
||||||
|
} else {
|
||||||
|
CaseMatching::Ignore
|
||||||
|
},
|
||||||
|
..MatcherConfig::DEFAULT
|
||||||
|
};
|
||||||
|
if path {
|
||||||
|
config.set_match_paths();
|
||||||
|
}
|
||||||
|
let mut matcher = Matcher::new(config);
|
||||||
|
let mut indicies = Vec::new();
|
||||||
|
for &(haystack, needle, start, end, mut score) in cases {
|
||||||
|
score += needle.chars().count() as u16 * SCORE_MATCH;
|
||||||
|
let query = matcher.compile_query(needle);
|
||||||
|
let res = matcher.fuzzy_indicies(&query, haystack, &mut indicies);
|
||||||
|
assert_eq!(res, Some(score), "{needle:?} did not match {haystack:?}");
|
||||||
|
assert_eq!(
|
||||||
|
indicies.first().copied()..indicies.last().map(|&i| i + 1),
|
||||||
|
Some(start)..Some(end),
|
||||||
|
"{needle:?} match {haystack:?}[{start}..{end}]"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Whitespace-boundary bonus taken from the default matcher configuration.
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
|
||||||
|
// Delimiter-boundary bonus taken from the default matcher configuration.
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
|
||||||
|
|
||||||
|
/// Fuzzy matching with `use_v1 = false`: case-insensitive, no normalization,
/// no path mode.
#[test]
fn test_v2_fuzzy() {
    let (use_v1, normalize, case_sensitive, path) = (false, false, false, false);
    // (haystack, needle, start, end, expected score minus SCORE_MATCH per needle char)
    let cases: &[(&str, &str, u32, u32, u16)] = &[
        (
            "fooBarbaz1",
            "oBZ",
            2,
            9,
            BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "foo bar baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "/AutomatorDocument.icns",
            "rdoc",
            9,
            13,
            BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
        ),
        (
            "/man1/zshcompctl.1",
            "zshc",
            6,
            10,
            BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_BOUNDARY_DELIMITER * 3,
        ),
        (
            "/.oh-my-zsh/cache",
            "zshc",
            8,
            13,
            BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
                - PENALTY_GAP_START
                + BONUS_BOUNDARY_DELIMITER,
        ),
        (
            "ab0123 456",
            "12356",
            3,
            10,
            BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
        ),
        (
            "abc123 456",
            "12356",
            3,
            10,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_CONSECUTIVE
                - PENALTY_GAP_START
                - PENALTY_GAP_EXTENSION,
        ),
        (
            "foo/bar/baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "fooBarBaz",
            "fbb",
            0,
            7,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
                - 2 * PENALTY_GAP_START
                - 2 * PENALTY_GAP_EXTENSION,
        ),
        (
            "foo barbaz",
            "fbb",
            0,
            8,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
                - PENALTY_GAP_START * 2
                - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "fooBar Baz",
            "foob",
            0,
            4,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
        ),
        (
            "xFoo-Bar Baz",
            "foo-b",
            1,
            6,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_NON_WORD
                + BONUS_BOUNDARY,
        ),
    ];
    assert_matches(use_v1, normalize, case_sensitive, path, cases);
}
|
||||||
|
|
||||||
|
/// Fuzzy matching with `use_v1 = true`: case-insensitive, no normalization,
/// no path mode.
#[test]
fn test_v1_fuzzy() {
    let (use_v1, normalize, case_sensitive, path) = (true, false, false, false);
    // (haystack, needle, start, end, expected score minus SCORE_MATCH per needle char)
    let cases: &[(&str, &str, u32, u32, u16)] = &[
        (
            "fooBarbaz1",
            "oBZ",
            2,
            9,
            BONUS_CAMEL123 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "foo bar baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "/AutomatorDocument.icns",
            "rdoc",
            9,
            13,
            BONUS_CAMEL123 + BONUS_CONSECUTIVE * 2,
        ),
        (
            "/man1/zshcompctl.1",
            "zshc",
            6,
            10,
            BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_BOUNDARY_DELIMITER * 3,
        ),
        (
            "/.oh-my-zsh/cache",
            "zshc",
            8,
            13,
            BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2
                - PENALTY_GAP_START
                + BONUS_BOUNDARY_DELIMITER,
        ),
        (
            "ab0123 456",
            "12356",
            3,
            10,
            BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION,
        ),
        (
            "abc123 456",
            "12356",
            3,
            10,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_CONSECUTIVE
                - PENALTY_GAP_START
                - PENALTY_GAP_EXTENSION,
        ),
        (
            "foo/bar/baz",
            "fbb",
            0,
            9,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2
                - 2 * PENALTY_GAP_START
                - 4 * PENALTY_GAP_EXTENSION,
        ),
        (
            "fooBarBaz",
            "fbb",
            0,
            7,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2
                - 2 * PENALTY_GAP_START
                - 2 * PENALTY_GAP_EXTENSION,
        ),
        (
            "foo barbaz",
            "fbb",
            0,
            8,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE
                - PENALTY_GAP_START * 2
                - PENALTY_GAP_EXTENSION * 3,
        ),
        (
            "fooBar Baz",
            "foob",
            0,
            4,
            BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3,
        ),
        (
            "xFoo-Bar Baz",
            "foo-b",
            1,
            6,
            BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER
                + BONUS_CAMEL123 * 2
                + BONUS_NON_WORD
                + BONUS_BOUNDARY,
        ),
    ];
    assert_matches(use_v1, normalize, case_sensitive, path, cases);
}
|
123
src/utf32_str.rs
Normal file
123
src/utf32_str.rs
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
use std::ops::{Bound, RangeBounds};
|
||||||
|
|
||||||
|
/// A UTF-32 encoded (char array) string that can be used as an input to fuzzy matching.
///
/// Usually Rust's UTF-8 encoded strings are great. However, fuzzy matching
/// operates on codepoints (it should operate on graphemes, but that's too much
/// hassle to deal with). We want to quickly iterate over these codepoints
/// (up to 5 times) during matching.
///
/// Doing codepoint segmentation on the fly not only blows through the cache
/// (lookup tables and icache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore, there are a lot of extra optimizations available
/// for ASCII-only text (but checking during each match has too much overhead).
///
/// Of course this comes at an extra memory cost, as we usually still need the
/// UTF-8 encoded variant for rendering. In the (dominant) case of ASCII-only text
/// we don't require a copy. Furthermore, fuzzy matching is usually applied while
/// the user is typing, so the same item is potentially matched many times
/// (making the upfront cost more worth it). That means that it's basically
/// always worth it to presegment the string.
///
/// For use cases that only match (a lot of) strings once, it's possible to keep
/// a char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher will naturally
/// produce char indices (instead of UTF-8 offsets) anyway. With a
/// codepoint-based representation like this the indices can be used
/// directly.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Debug)]
pub enum Utf32Str<'a> {
    /// A string represented as ASCII encoded bytes.
    /// Correctness invariant: must only contain valid ASCII (<=127)
    Ascii(&'a [u8]),
    /// A string represented as an array of unicode codepoints (basically UTF-32).
    Unicode(&'a [char]),
}

impl<'a> Utf32Str<'a> {
    /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str.
    ///
    /// ASCII-only input borrows `str` directly and leaves `buf` untouched;
    /// otherwise `buf` is cleared, filled with the chars of `str`, and borrowed.
    pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
        if str.is_ascii() {
            Utf32Str::Ascii(str.as_bytes())
        } else {
            buf.clear();
            buf.extend(str.chars());
            Utf32Str::Unicode(&*buf)
        }
    }

    /// Number of characters (codepoints) in the string.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            Utf32Str::Unicode(codepoints) => codepoints.len(),
            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
        }
    }

    /// Returns `true` if the string contains no characters.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the sub-string covered by `range` (in char indices).
    ///
    /// Panics if the range is out of bounds or its start exceeds its end.
    #[inline]
    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str<'_> {
        self.slice_bounds(range.start_bound().cloned(), range.end_bound().cloned())
    }

    /// Same as `slice` but accepts a u32 range for convenience, since
    /// those are the indices returned by the matcher.
    #[inline]
    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str<'_> {
        // Widen a borrowed u32 bound to an owned usize bound.
        fn widen(bound: Bound<&u32>) -> Bound<usize> {
            match bound {
                Bound::Included(&idx) => Bound::Included(idx as usize),
                Bound::Excluded(&idx) => Bound::Excluded(idx as usize),
                Bound::Unbounded => Bound::Unbounded,
            }
        }
        self.slice_bounds(widen(range.start_bound()), widen(range.end_bound()))
    }

    /// Resolves a pair of bounds into a half-open `start..end` range and slices
    /// the underlying storage with it.
    fn slice_bounds(&self, start: Bound<usize>, end: Bound<usize>) -> Utf32Str<'_> {
        let start = match start {
            Bound::Included(start) => start,
            Bound::Excluded(start) => start + 1,
            Bound::Unbounded => 0,
        };
        // An inclusive end must be bumped by one to become exclusive; an
        // exclusive end is already in the right form.
        let end = match end {
            Bound::Included(end) => end + 1,
            Bound::Excluded(end) => end,
            Bound::Unbounded => self.len(),
        };
        match self {
            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
        }
    }

    /// Returns `true` if the string is stored in the ASCII representation.
    pub fn is_ascii(&self) -> bool {
        matches!(self, Utf32Str::Ascii(_))
    }

    /// Returns the character at index `idx`.
    ///
    /// Panics if `idx` is out of bounds.
    pub fn get(&self, idx: u32) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
            Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
        }
    }

    /// Returns the last character of the string.
    ///
    /// Panics if the string is empty.
    pub fn last(&self) -> char {
        match self {
            Utf32Str::Ascii(bytes) => {
                *bytes.last().expect("Utf32Str::last called on an empty string") as char
            }
            Utf32Str::Unicode(codepoints) => *codepoints
                .last()
                .expect("Utf32Str::last called on an empty string"),
        }
    }
}
|
||||||
|
|
||||||
|
// impl Str for &[char] {
|
||||||
|
// type Chars;
|
||||||
|
|
||||||
|
// fn chars(&self) -> Self::Chars {
|
||||||
|
// todo!()
|
||||||
|
// }
|
||||||
|
|
||||||
|
// fn slice(&self, range: impl RangeBounds<u32>) {
|
||||||
|
// todo!()
|
||||||
|
// }
|
||||||
|
// }
|
Loading…
Reference in New Issue
Block a user