fix typos

This commit is contained in:
Pascal Kuthe 2023-07-20 16:03:31 +02:00
parent 33822be2ab
commit d844ab7f3b
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
11 changed files with 69 additions and 311 deletions

245
foo.c
View File

@ -1,245 +0,0 @@
// Fuzzy-matches `pattern` against `text` and returns {start, end, score}.
//
// The function fills two score matrices (`h` for scores, `c` for the length
// of the consecutive run ending at each cell) with M rows, one per pattern
// character, and then optionally walks them backwards to recover the matched
// positions.
//
// Parameters:
//   case_sensitive - when false, characters classified as CharUpper are
//                    lowered before comparison.
//   normalize      - when true, every text character is passed through
//                    normalize_rune() before comparison.
//   text, pattern  - haystack and needle; only their `data`/`size` fields are
//                    read here.
//   pos            - receives matched positions; the backtrace is only run
//                    when this is non-NULL.
//   slab           - scratch memory handed to alloc16()/alloc32(); may be NULL.
//
// Returns:
//   {0, 0, 0}    if the pattern is empty.
//   {-1, -1, 0}  if ascii_fuzzy_index() reports no match or not every pattern
//                character was found in order.
//   Otherwise the match start, one past the best-scoring end position, and the
//   best score.
//
// If a slab is supplied and N * M exceeds slab->I16.cap, the work is delegated
// to fzf_fuzzy_match_v1() instead.
fzf_result_t fzf_fuzzy_match_v2(bool case_sensitive, bool normalize,
fzf_string_t *text, fzf_string_t *pattern,
fzf_position_t *pos, fzf_slab_t *slab) {
const size_t M = pattern->size;
const size_t N = text->size;
// An empty pattern yields an all-zero result.
if (M == 0) {
return (fzf_result_t){0, 0, 0};
}
// The matrices need N * M cells; fall back to v1 when the slab is too small.
if (slab != NULL && N * M > slab->I16.cap) {
return fzf_fuzzy_match_v1(case_sensitive, normalize, text, pattern, pos,
slab);
}
// Phase 1. Quick rejection: a negative index from ascii_fuzzy_index() is
// treated as "no match"; otherwise scanning starts at `idx`.
size_t idx;
{
int32_t tmp_idx = ascii_fuzzy_index(text, pattern->data, M, case_sensitive);
if (tmp_idx < 0) {
return (fzf_result_t){-1, -1, 0};
}
idx = (size_t)tmp_idx;
}
// Running offsets passed to the allocators (presumably cursors into the
// slab's 16-bit and 32-bit areas - confirm against alloc16/alloc32).
size_t offset16 = 0;
size_t offset32 = 0;
// h0 / c0: score and consecutive-run values for the first pattern character.
fzf_i16_t h0 = alloc16(&offset16, slab, N);
fzf_i16_t c0 = alloc16(&offset16, slab, N);
// Bonus points for each position
fzf_i16_t bo = alloc16(&offset16, slab, N);
// The first occurrence of each character in the pattern
fzf_i32_t f = alloc32(&offset32, slab, M);
// Rune array
fzf_i32_t t = alloc32(&offset32, slab, N);
copy_runes(text, &t); // input.CopyRunes(T)
// Phase 2. Calculate bonus for each point
int16_t max_score = 0;
size_t max_score_pos = 0;
// pidx: index of the next pattern character still to be located.
size_t pidx = 0;
// last_idx: text index of the most recent hit on the character being sought.
size_t last_idx = 0;
char pchar0 = pattern->data[0];
char pchar = pattern->data[0];
int16_t prev_h0 = 0;
int32_t prev_class = CharNonWord;
bool in_gap = false;
// Views over text, scores, run lengths and bonuses, all starting at `idx`.
i32_slice_t t_sub = slice_i32(t.data, idx, t.size); // T[idx:];
i16_slice_t h0_sub =
slice_i16_right(slice_i16(h0.data, idx, h0.size).data, t_sub.size);
i16_slice_t c0_sub =
slice_i16_right(slice_i16(c0.data, idx, c0.size).data, t_sub.size);
i16_slice_t b_sub =
slice_i16_right(slice_i16(bo.data, idx, bo.size).data, t_sub.size);
for (size_t off = 0; off < t_sub.size; off++) {
char_class class;
char c = (char)t_sub.data[off];
class = char_class_of_ascii(c);
if (!case_sensitive && class == CharUpper) {
/* TODO(conni2461): unicode support */
c = (char)tolower((uint8_t)c);
}
if (normalize) {
c = normalize_rune(c);
}
// Store the folded/normalized character back so later phases compare
// against the same value.
t_sub.data[off] = (uint8_t)c;
int16_t bonus = bonus_for(prev_class, class);
b_sub.data[off] = bonus;
prev_class = class;
if (c == pchar) {
if (pidx < M) {
// Record where this pattern character is first found and advance to the
// next one; the index is clamped with min64u so it never exceeds M - 1.
f.data[pidx] = (int32_t)(idx + off);
pidx++;
pchar = pattern->data[min64u(pidx, M - 1)];
}
last_idx = idx + off;
}
if (c == pchar0) {
// A hit on the first pattern character starts a run of length 1 and gets
// the first-character bonus multiplier.
int16_t score = ScoreMatch + bonus * BonusFirstCharMultiplier;
h0_sub.data[off] = score;
c0_sub.data[off] = 1;
if (M == 1 && (score > max_score)) {
max_score = score;
max_score_pos = idx + off;
// For a one-character pattern, stop scanning once a hit carries exactly
// the BonusBoundary bonus.
if (bonus == BonusBoundary) {
break;
}
}
in_gap = false;
} else {
// No hit: decay the previous score by the gap penalty, clamped at zero.
if (in_gap) {
h0_sub.data[off] = max16(prev_h0 + ScoreGapExtention, 0);
} else {
h0_sub.data[off] = max16(prev_h0 + ScoreGapStart, 0);
}
c0_sub.data[off] = 0;
in_gap = true;
}
prev_h0 = h0_sub.data[off];
}
// Not every pattern character was found in order: no match.
if (pidx != M) {
free_alloc(t);
free_alloc(f);
free_alloc(bo);
free_alloc(c0);
free_alloc(h0);
return (fzf_result_t){-1, -1, 0};
}
// One-character pattern: the best position found above is the whole match.
if (M == 1) {
free_alloc(t);
free_alloc(f);
free_alloc(bo);
free_alloc(c0);
free_alloc(h0);
fzf_result_t res = {(int32_t)max_score_pos, (int32_t)max_score_pos + 1,
max_score};
append_pos(pos, max_score_pos);
return res;
}
// Phase 3. Fill in the score matrix. Only the columns from the first
// occurrence of pattern[0] (f0) through last_idx are stored, so each row is
// `width` cells wide.
size_t f0 = (size_t)f.data[0];
size_t width = last_idx - f0 + 1;
fzf_i16_t h = alloc16(&offset16, slab, width * M);
{
// Row 0 of the score matrix is seeded from h0[f0 .. last_idx].
i16_slice_t h0_tmp_slice = slice_i16(h0.data, f0, last_idx + 1);
copy_into_i16(&h0_tmp_slice, &h);
}
fzf_i16_t c = alloc16(&offset16, slab, width * M);
{
// Row 0 of the consecutive-run matrix is seeded from c0[f0 .. last_idx].
i16_slice_t c0_tmp_slice = slice_i16(c0.data, f0, last_idx + 1);
copy_into_i16(&c0_tmp_slice, &c);
}
// Remaining rows: pattern characters 1 .. M-1 and their first occurrences.
i32_slice_t f_sub = slice_i32(f.data, 1, f.size);
str_slice_t p_sub =
slice_str_right(slice_str(pattern->data, 1, M).data, f_sub.size);
for (size_t off = 0; off < f_sub.size; off++) {
// NOTE: this `f` shadows the outer first-occurrence array inside the loop
// body; here it is the first text index for the current pattern character.
size_t f = (size_t)f_sub.data[off];
pchar = p_sub.data[off];
pidx = off + 1;
size_t row = pidx * width;
in_gap = false;
t_sub = slice_i32(t.data, f, last_idx + 1);
b_sub = slice_i16_right(slice_i16(bo.data, f, bo.size).data, t_sub.size);
// *_sub views address the current row starting at column f; *_diag views
// address the previous row one column to the left; h_left addresses the
// current row one column to the left.
i16_slice_t c_sub = slice_i16_right(
slice_i16(c.data, row + f - f0, c.size).data, t_sub.size);
i16_slice_t c_diag = slice_i16_right(
slice_i16(c.data, row + f - f0 - 1 - width, c.size).data, t_sub.size);
i16_slice_t h_sub = slice_i16_right(
slice_i16(h.data, row + f - f0, h.size).data, t_sub.size);
i16_slice_t h_diag = slice_i16_right(
slice_i16(h.data, row + f - f0 - 1 - width, h.size).data, t_sub.size);
i16_slice_t h_left = slice_i16_right(
slice_i16(h.data, row + f - f0 - 1, h.size).data, t_sub.size);
// Zero the cell immediately left of this row's first column so the first
// "left" lookup starts from a score of 0.
h_left.data[0] = 0;
for (size_t j = 0; j < t_sub.size; j++) {
char ch = (char)t_sub.data[j];
size_t col = j + f;
// s1: score when text[col] is matched to the current pattern character.
// s2: score when text[col] is skipped (gap).
int16_t s1 = 0;
int16_t s2 = 0;
int16_t consecutive = 0;
if (in_gap) {
s2 = h_left.data[j] + ScoreGapExtention;
} else {
s2 = h_left.data[j] + ScoreGapStart;
}
if (pchar == ch) {
s1 = h_diag.data[j] + ScoreMatch;
int16_t b = b_sub.data[j];
consecutive = c_diag.data[j] + 1;
if (b == BonusBoundary) {
// A boundary bonus restarts the consecutive run.
consecutive = 1;
} else if (consecutive > 1) {
// Inside a run, use the largest of: this position's bonus, the
// consecutive bonus, and the bonus at the position where the run began.
b = max16(b, max16(BonusConsecutive,
bo.data[col - ((size_t)consecutive) + 1]));
}
if (s1 + b < s2) {
// Skipping would still score higher: add only the plain positional
// bonus and reset the run length.
s1 += b_sub.data[j];
consecutive = 0;
} else {
s1 += b;
}
}
c_sub.data[j] = consecutive;
in_gap = s1 < s2;
int16_t score = max16(max16(s1, s2), 0);
// Only the last pattern row contributes to the overall best score.
if (pidx == M - 1 && (score > max_score)) {
max_score = score;
max_score_pos = col;
}
h_sub.data[j] = score;
}
}
// Phase 4. (Optional) Backtrace to find the matched character positions.
resize_pos(pos, M, M);
size_t j = max_score_pos;
if (pos) {
size_t i = M - 1;
bool prefer_match = true;
for (;;) {
size_t ii = i * width;
size_t j0 = j - f0;
// s: score at (i, j); s1: diagonal neighbour (i-1, j-1); s2: left
// neighbour (i, j-1). Neighbours outside the filled region stay 0.
int16_t s = h.data[ii + j0];
int16_t s1 = 0;
int16_t s2 = 0;
if (i > 0 && j >= f.data[i]) {
s1 = h.data[ii - width + j0 - 1];
}
if (j > f.data[i]) {
s2 = h.data[ii + j0 - 1];
}
if (s > s1 && (s > s2 || (s == s2 && prefer_match))) {
// This cell is taken as a match for pattern character i.
unsafe_append_pos(pos, j);
if (i == 0) {
// The break happens before `j--`, so `j` is left at the position
// recorded for pattern index 0 and becomes the returned start.
break;
}
i--;
}
// On a tie with the left neighbour, prefer a match when this cell is part
// of a run longer than 1 or the cell diagonally below-right continues one.
prefer_match = c.data[ii + j0] > 1 || (ii + width + j0 + 1 < c.size &&
c.data[ii + width + j0 + 1] > 0);
j--;
}
}
// Hand all scratch buffers back via free_alloc() before returning.
free_alloc(h);
free_alloc(c);
free_alloc(t);
free_alloc(f);
free_alloc(bo);
free_alloc(c0);
free_alloc(h0);
// When `pos` is NULL no backtrace ran, so `j` still equals max_score_pos.
return (fzf_result_t){(int32_t)j, (int32_t)max_score_pos + 1,
(int32_t)max_score};
}

View File

@ -64,7 +64,7 @@ impl Char for AsciiChar {
CharClass::Number
} else if c.is_ascii_whitespace() {
CharClass::Whitespace
} else if config.delimeter_chars.contains(&c) {
} else if config.delimiter_chars.contains(&c) {
CharClass::Delimiter
} else {
CharClass::NonWord

View File

@ -3,16 +3,16 @@ use crate::score::BONUS_BOUNDARY;
#[non_exhaustive]
pub struct MatcherConfig {
pub delimeter_chars: &'static [u8],
pub delimiter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string
pub bonus_boundary_white: u16,
// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub bonus_boundary_delimiter: u16,
pub inital_char_class: CharClass,
/// Whether to normalize latin script charaters to ASCII
pub initial_char_class: CharClass,
/// Whether to normalize latin script characters to ASCII
/// this significantly degrades performance so its not recommended
/// to be truned on by default
/// to be turned on by default
pub normalize: bool,
/// whether to ignore casing
pub ignore_case: bool,
@ -29,10 +29,10 @@ pub struct MatcherConfig {
impl MatcherConfig {
pub const DEFAULT: Self = {
MatcherConfig {
delimeter_chars: b"/,:;|",
delimiter_chars: b"/,:;|",
bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
inital_char_class: CharClass::Whitespace,
initial_char_class: CharClass::Whitespace,
normalize: false,
ignore_case: true,
}
@ -42,22 +42,22 @@ impl MatcherConfig {
impl MatcherConfig {
pub fn set_match_paths(&mut self) {
if cfg!(windows) {
self.delimeter_chars = b"/\\";
self.delimiter_chars = b"/\\";
} else {
self.delimeter_chars = b"/";
self.delimiter_chars = b"/";
}
self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter;
self.initial_char_class = CharClass::Delimiter;
}
pub const fn match_paths(mut self) -> Self {
if cfg!(windows) {
self.delimeter_chars = b"/\\";
self.delimiter_chars = b"/\\";
} else {
self.delimeter_chars = b"/";
self.delimiter_chars = b"/";
}
self.bonus_boundary_white = BONUS_BOUNDARY;
self.inital_char_class = CharClass::Delimiter;
self.initial_char_class = CharClass::Delimiter;
self
}
}

View File

@ -2,15 +2,15 @@ use crate::chars::Char;
use crate::Matcher;
impl Matcher {
/// greedy fallback algoritm, much faster (linear time) but reported scores/indicies
/// greedy fallback algorithm, much faster (linear time) but reported scores/indices
/// might not be the best match
pub(crate) fn fuzzy_match_greedy<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
pub(crate) fn fuzzy_match_greedy<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
&mut self,
haystack: &[H],
needle: &[N],
mut start: usize,
mut end: usize,
indicies: &mut Vec<u32>,
indices: &mut Vec<u32>,
) -> Option<u16> {
let first_char_end = if H::ASCII { start + 1 } else { end };
if !H::ASCII && needle.len() != 1 {
@ -27,7 +27,7 @@ impl Matcher {
}
}
}
// mimimize the greedly match by greedy matching in reverse
// minimize the greedy match by greedy matching in reverse
let mut needle_iter = needle.iter().rev().copied();
let mut needle_char = needle_iter.next().unwrap();
@ -40,6 +40,6 @@ impl Matcher {
needle_char = next_needle_char;
}
}
Some(self.calculate_score::<INDICIES, H, N>(haystack, needle, start, end, indicies))
Some(self.calculate_score::<INDICES, H, N>(haystack, needle, start, end, indices))
}
}

View File

@ -10,39 +10,39 @@ use crate::score::{
use crate::{Matcher, MatcherConfig};
impl Matcher {
pub(crate) fn fuzzy_match_optimal<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
&mut self,
haystack: &[H],
needle: &[N],
start: usize,
greedy_end: usize,
end: usize,
indicies: &mut Vec<u32>,
indices: &mut Vec<u32>,
) -> Option<u16> {
// construct a matrix (and copy the haystack), the matrix and haystack size are bounded
// to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
// us to treat needle indecies as u16
// us to treat needle indices as u16
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
return self.fuzzy_match_greedy::<INDICIES, H, N>(
return self.fuzzy_match_greedy::<INDICES, H, N>(
haystack,
needle,
start,
greedy_end,
indicies,
indices,
);
};
let prev_class = start
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.inital_char_class);
.unwrap_or(self.config.initial_char_class);
let (max_score_pos, max_score, matched) = matrix.setup(needle, prev_class, &self.config);
// this only happend with unicode haystacks, for ASCII the prefilter handles all rejects
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
if !matched {
return None;
}
if needle.len() == 1 {
indicies.push(max_score_pos as u32);
indices.push(max_score_pos as u32);
return Some(max_score);
}
debug_assert_eq!(
@ -52,8 +52,8 @@ impl Matcher {
// populate the matrix and find the best score
let (max_score, best_match_end) = matrix.populate_matrix(needle);
if INDICIES {
matrix.reconstruct_optimal_path(needle, start as u32, indicies, best_match_end);
if INDICES {
matrix.reconstruct_optimal_path(needle, start as u32, indices, best_match_end);
}
Some(max_score)
}
@ -224,12 +224,12 @@ impl<H: Char> Matrix<'_, H> {
&self,
needle: &[N],
start: u32,
indicies: &mut Vec<u32>,
indices: &mut Vec<u32>,
best_match_end: u16,
) {
indicies.resize(needle.len(), 0);
indices.resize(needle.len(), 0);
let mut row_iter = self.rows_rev().zip(indicies.iter_mut().rev()).peekable();
let mut row_iter = self.rows_rev().zip(indices.iter_mut().rev()).peekable();
let (mut row, mut matched_col_idx) = row_iter.next().unwrap();
let mut next_row: Option<MatrixRow> = None;
let mut col = best_match_end;

View File

@ -1,4 +1,4 @@
// sadly this doens't optmimzie well currently
// sadly ranges don't optimize well
#![allow(clippy::manual_range_contains)]
mod chars;
@ -63,7 +63,7 @@ impl Matcher {
self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new())
}
pub fn fuzzy_indicies(
pub fn fuzzy_indices(
&mut self,
haystack: Utf32Str<'_>,
needle: Utf32Str<'_>,
@ -73,7 +73,7 @@ impl Matcher {
self.fuzzy_matcher_impl::<true>(haystack, needle, indidies)
}
fn fuzzy_matcher_impl<const INDICIES: bool>(
fn fuzzy_matcher_impl<const INDICES: bool>(
&mut self,
haystack: Utf32Str<'_>,
needle_: Utf32Str<'_>,
@ -92,7 +92,7 @@ impl Matcher {
match (haystack, needle_) {
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle)?;
self.fuzzy_match_optimal::<INDICIES, AsciiChar, AsciiChar>(
self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
AsciiChar::cast(haystack),
AsciiChar::cast(needle),
start,
@ -108,7 +108,7 @@ impl Matcher {
}
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
self.fuzzy_match_optimal::<INDICIES, char, AsciiChar>(
self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
haystack,
AsciiChar::cast(needle),
start,
@ -119,7 +119,7 @@ impl Matcher {
}
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
let (start, end) = self.prefilter_non_ascii(haystack, needle_)?;
self.fuzzy_match_optimal::<INDICIES, char, char>(
self.fuzzy_match_optimal::<INDICES, char, char>(
haystack,
needle,
start,
@ -131,11 +131,11 @@ impl Matcher {
}
}
// pub fn fuzzy_indicies(
// pub fn fuzzy_indices(
// &mut self,
// query: &Query,
// mut haystack: Utf32Str<'_>,
// indicies: &mut Vec<u32>,
// indices: &mut Vec<u32>,
// ) -> Option<u16> {
// if haystack.len() > u32::MAX as usize {
// haystack = &haystack[..u32::MAX as usize]
@ -146,14 +146,14 @@ impl Matcher {
// );
// if self.config.use_v1 {
// if query.is_ascii && !self.config.normalize {
// self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
// self.fuzzy_matcher_v1::<true, true>(query, haystack, indices)
// } else {
// self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
// self.fuzzy_matcher_v1::<true, false>(query, haystack, indices)
// }
// } else if query.is_ascii && !self.config.normalize {
// self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
// self.fuzzy_matcher_v2::<true, true>(query, haystack, indices)
// } else {
// self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
// self.fuzzy_matcher_v2::<true, false>(query, haystack, indices)
// }
// }
}

View File

@ -155,7 +155,7 @@ where
pub(crate) struct Matrix<'a, C: Char> {
pub haystack: &'a mut [C],
// stored as a seperate array instead of struct
// stored as a separate array instead of struct
// to avoid padding since char is too large and u8 too small :/
pub bonus: &'a mut [u16],
pub row_offs: &'a mut [u16],
@ -264,10 +264,10 @@ impl MatrixSlab {
return None;
}
unsafe {
// safetly: this allocation is valid for MATRIX_ALLOC_LAYOUT
// safety: this allocation is valid for MATRIX_ALLOC_LAYOUT
let (haystack, bonus, rows, cells) = matrix_layout.fieds_from_ptr(self.0);
// copy haystack before creating refernces to ensure we donu't crate
// refrences to invalid chars (which may or may not be UB)
// copy haystack before creating references to ensure we don't create
// references to invalid chars (which may or may not be UB)
haystack_
.as_ptr()
.copy_to_nonoverlapping(haystack as *mut _, haystack_.len());

View File

@ -69,22 +69,22 @@ impl Matcher {
self.config.bonus_for(prev_class, class)
}
pub(crate) fn calculate_score<const INDICIES: bool, H: Char + PartialEq<N>, N: Char>(
pub(crate) fn calculate_score<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
&mut self,
haystack: &[H],
needle: &[N],
start: usize,
end: usize,
indicies: &mut Vec<u32>,
indices: &mut Vec<u32>,
) -> u16 {
if INDICIES {
indicies.reserve(needle.len());
if INDICES {
indices.reserve(needle.len());
}
let mut prev_class = start
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.inital_char_class);
.unwrap_or(self.config.initial_char_class);
let mut needle_iter = needle.iter();
let mut needle_char = *needle_iter.next().unwrap();
@ -92,8 +92,8 @@ impl Matcher {
let mut consecutive = 1;
// unrolled the first iteration to make applying the first char multiplier less awkward
if INDICIES {
indicies.push(start as u32)
if INDICES {
indices.push(start as u32)
}
let mut first_bonus = self.bonus_for(prev_class, haystack[0].char_class(&self.config));
let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER;
@ -102,8 +102,8 @@ impl Matcher {
let class = c.char_class(&self.config);
let c = c.normalize(&self.config);
if c == needle_char {
if INDICIES {
indicies.push(i as u32 + start as u32)
if INDICES {
indices.push(i as u32 + start as u32)
}
let mut bonus = self.bonus_for(prev_class, class);
if consecutive == 0 {

View File

@ -22,7 +22,7 @@ pub fn assert_matches(
config.set_match_paths();
}
let mut matcher = Matcher::new(config);
let mut indicies = Vec::new();
let mut indices = Vec::new();
let mut needle_buf = Vec::new();
let mut haystack_buf = Vec::new();
for &(haystack, needle, start, end, mut score) in cases {
@ -35,8 +35,8 @@ pub fn assert_matches(
let haystack = Utf32Str::new(haystack, &mut haystack_buf);
score += needle.len() as u16 * SCORE_MATCH;
let res = matcher.fuzzy_indicies(haystack, needle, &mut indicies);
let match_chars: Vec<_> = indicies
let res = matcher.fuzzy_indices(haystack, needle, &mut indices);
let match_chars: Vec<_> = indices
.iter()
.map(|&i| haystack.get(i).normalize(&matcher.config))
.collect();
@ -47,9 +47,9 @@ pub fn assert_matches(
Some(score),
"{needle:?} did not match {haystack:?}: {match_chars:?}"
);
assert_eq!(match_chars, needle_chars, "match indicies are incorrect");
assert_eq!(match_chars, needle_chars, "match indices are incorrect");
assert_eq!(
indicies.first().copied()..indicies.last().map(|&i| i + 1),
indices.first().copied()..indices.last().map(|&i| i + 1),
Some(start)..Some(end),
"{needle:?} match {haystack:?}[{start}..{end}]"
);

View File

@ -13,7 +13,7 @@ use std::slice;
/// matching itself. Furthermore there are a lot of extra optimizations available
/// for ascii only text (but checking during each match has too much overhead).
///
/// Ofcourse this comes at exta memory cost as we usally still need the ut8
/// Of course this comes at extra memory cost as we usually still need the utf8
/// encoded variant for rendering. In the (dominant) case of ascii-only text
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
/// the user is typing on the fly so the same item is potentially matched many
@ -24,13 +24,13 @@ use std::slice;
/// char buffer around that is filled with the presegmented chars
///
/// Another advantage of this approach is that the matcher will naturally
/// produce char indecies (instead of utf8 offsets) annyway. With a
/// codepoint basec representation like this the indecies can be used
/// produce char indices (instead of utf8 offsets) anyway. With a
/// codepoint based representation like this the indices can be used
/// directly
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Debug)]
pub enum Utf32Str<'a> {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain vaild ASCII (<=127)
/// Correctness invariant: must only contain valid ASCII (<=127)
Ascii(&'a [u8]),
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(&'a [char]),
@ -75,7 +75,7 @@ impl<'a> Utf32Str<'a> {
}
/// Same as `slice` but accepts a u32 range for convenience since
/// those are the indecies returned by the matcher
/// those are the indices returned by the matcher
#[inline]
pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
let start = match range.start_bound() {

3
typos.toml Normal file
View File

@ -0,0 +1,3 @@
default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"]
[files]
extend-exclude = ["integration_tests", "verilogae/tests", "*.mir", "openvaf/lexer/src/tests.rs"]