mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 01:47:49 +00:00
Merge pull request #4 from helix-editor/typos
Fix typos and removed unused files
This commit is contained in:
commit
093ecafb01
@ -4,7 +4,7 @@
|
||||
|
||||
Nucleo uses the exact **same scoring system as fzf**. That means you should get the same ranking quality (or better) as you are used to from fzf. However, `nucleo` has a more faithful implementation of the Smith-Waterman algorithm which is normally used in DNA sequence alignment (see https://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/gaps.pdf) with two separate matrices (instead of one like fzf). This means that `nucleo` finds the optimal match more often. For example, if you match `foo` in `xf foo`, `nucleo` will match `x__foo` but `fzf` will match `xf_oo` (you can increase the word length and the result will stay the same). The former is the more intuitive match and has a higher score according to the ranking system that both `nucleo` and fzf use.
|
||||
|
||||
**Compared to `skim`** (and the `fuzzy-matcher` crate) `nucleo` has an even larger performance advantage and is often around **six times faster** (see benchmarks below). Furthermore, the bonus system used by nucleo and fzf is (in my opinion)more consistent/superior. `nulceo` also handles non-ascii text much better. (`skim`s bonus system and even case insensitivity only work for ASCII).
|
||||
**Compared to `skim`** (and the `fuzzy-matcher` crate) `nucleo` has an even larger performance advantage and is often around **six times faster** (see benchmarks below). Furthermore, the bonus system used by nucleo and fzf is (in my opinion) more consistent/superior. `nucleo` also handles non-ascii text much better. (`skim`'s bonus system and even case insensitivity only work for ASCII).
|
||||
|
||||
Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly).
|
||||
|
||||
|
@ -156,7 +156,7 @@ pub enum CharClass {
|
||||
Number,
|
||||
}
|
||||
|
||||
/// nucleo can not match graphemes as single units to work around
|
||||
/// nucleo cannot match graphemes as single units to work around
|
||||
/// that we only use the first codepoint of each grapheme
|
||||
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
|
||||
text.graphemes(true).map(|grapheme| {
|
||||
|
@ -145,7 +145,7 @@ impl Matcher {
|
||||
}
|
||||
}
|
||||
// in case we don't have any letter in the needle
|
||||
// we can treat the search as case sensitive and use memmem dircedly which is way faster
|
||||
// we can treat the search as case sensitive and use memmem directly which is way faster
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ use crate::matrix::MatrixSlab;
|
||||
/// A matcher engine that can execute (fuzzy) matches.
|
||||
///
|
||||
/// A matcher contains **heap allocated** scratch memory that is reused during
|
||||
/// matching. This scratch memory allows the matcher to garunte that it will
|
||||
/// matching. This scratch memory allows the matcher to guarantee that it will
|
||||
/// **never allocate** during matching (with the exception of pushing to the
|
||||
/// `indices` vector if there isn't enough capacity). However this scratch
|
||||
/// memory is fairly large (around 135KB) so creating a matcher is expensive and
|
||||
@ -58,7 +58,7 @@ pub struct Matcher {
|
||||
slab: MatrixSlab,
|
||||
}
|
||||
|
||||
// this is just here for convenience not ruse if we should implement this
|
||||
// this is just here for convenience not sure if we should implement this
|
||||
impl Clone for Matcher {
|
||||
fn clone(&self) -> Self {
|
||||
Matcher {
|
||||
@ -93,7 +93,7 @@ impl Matcher {
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the fuzzy match with the higehest score in the `haystack`.
|
||||
/// Find the fuzzy match with the highest score in the `haystack`.
|
||||
///
|
||||
/// This function has `O(mn)` time complexity for short inputs. To
|
||||
/// avoid slowdowns it automatically falls back to [greedy matching]
|
||||
@ -241,7 +241,7 @@ impl Matcher {
|
||||
|
||||
/// Greedily find a fuzzy match in the `haystack` and compute its indices.
|
||||
///
|
||||
/// This functions has `O(n)` time complexity but may provide unintutive (non-optimal)
|
||||
/// This function has `O(n)` time complexity but may provide unintuitive (non-optimal)
|
||||
/// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should
|
||||
/// be preferred.
|
||||
///
|
||||
|
@ -107,7 +107,7 @@ pub(crate) struct ScoreCell {
|
||||
pub(crate) struct MatcherDataView<'a, C: Char> {
|
||||
pub haystack: &'a mut [C],
|
||||
// stored as a separate array instead of struct
|
||||
// to avoid padding sine char is too large and u8 too small :/
|
||||
// to avoid padding since char is too large and u8 too small :/
|
||||
pub bonus: &'a mut [u8],
|
||||
pub current_row: &'a mut [ScoreCell],
|
||||
pub row_offs: &'a mut [u16],
|
||||
@ -162,7 +162,7 @@ impl MatrixSlab {
|
||||
let cells = haystack_.len() * needle_len;
|
||||
if cells > MAX_MATRIX_SIZE
|
||||
|| haystack_.len() > u16::MAX as usize
|
||||
// ensures that socres never overflow
|
||||
// ensures that scores never overflow
|
||||
|| needle_len > MAX_NEEDLE_LEN
|
||||
{
|
||||
return None;
|
||||
@ -175,7 +175,7 @@ impl MatrixSlab {
|
||||
// safety: this allocation is valid for MATRIX_ALLOC_LAYOUT
|
||||
let (haystack, bonus, rows, current_row, matrix_cells) =
|
||||
matrix_layout.fieds_from_ptr(self.0);
|
||||
// copy haystack before creating references to ensure we donu't crate
|
||||
// copy haystack before creating references to ensure we don't create
|
||||
// references to invalid chars (which may or may not be UB)
|
||||
haystack_
|
||||
.as_ptr()
|
||||
|
@ -20,7 +20,7 @@ pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2;
|
||||
// However, this prioritizes camel case over non-camel case.
|
||||
// In fzf/skim this is not a problem since they score off the max
|
||||
// consecutive bonus. However, we don't do that (because it's incorrect)
|
||||
// so to avoids prioritzing camel we use a lower bonus. I think that's fine
|
||||
// so to avoid prioritizing camel case we use a lower bonus. I think that's fine
|
||||
// usually camel case is a weaker boundary than actual word boundaries anyway
|
||||
// This also has the nice side effect of perfectly balancing out
|
||||
// camel case, snake case and the consecutive version of the word
|
||||
@ -97,7 +97,7 @@ impl Matcher {
|
||||
let mut in_gap = false;
|
||||
let mut consecutive = 1;
|
||||
|
||||
// unrolled the firs iteration to make applying the first char multiplier less akward
|
||||
// unrolled the first iteration to make applying the first char multiplier less awkward
|
||||
if INDICES {
|
||||
indices.push(start as u32)
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ use std::{fmt, slice};
|
||||
///
|
||||
/// Usually Rust's utf8 encoded strings are great. However, fuzzy matching
|
||||
/// operates on codepoints (it should operate on graphemes but that's too much
|
||||
/// hassle to deal with). We want to quickly iterate these codeboints between
|
||||
/// hassle to deal with). We want to quickly iterate these codepoints between
|
||||
/// (up to 5 times) during matching.
|
||||
///
|
||||
/// Doing codepoint segmentation on the fly not only blows through the cache
|
||||
@ -14,7 +14,7 @@ use std::{fmt, slice};
|
||||
/// for ascii only text (but checking during each match has too much overhead).
|
||||
///
|
||||
/// Of course this comes at extra memory cost as we usually still need the utf8
|
||||
/// encoded variant for rendenring. In the (dominant) case of ascii-only text
|
||||
/// encoded variant for rendering. In the (dominant) case of ascii-only text
|
||||
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
|
||||
/// the user is typing on the fly so the same item is potentially matched many
|
||||
/// times (making the upfront cost more worth it). That means that its
|
||||
@ -24,8 +24,8 @@ use std::{fmt, slice};
|
||||
/// char buffer around that is filled with the presegmented chars
|
||||
///
|
||||
/// Another advantage of this approach is that the matcher will naturally
|
||||
/// produce char indices (instead of utf8 offsets) annyway. With a
|
||||
/// codepoint basec representation like this the indices can be used
|
||||
/// produce char indices (instead of utf8 offsets) anyway. With a
|
||||
/// codepoint based representation like this the indices can be used
|
||||
/// directly
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub enum Utf32Str<'a> {
|
||||
@ -84,7 +84,7 @@ impl<'a> Utf32Str<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as `slice` but accepts a u32 range for convenicene sine
|
||||
/// Same as `slice` but accepts a u32 range for convenience since
|
||||
/// those are the indices returned by the matcher
|
||||
#[inline]
|
||||
pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
|
||||
|
@ -43,9 +43,9 @@ pub(crate) struct Vec<T> {
|
||||
buckets: [Bucket<T>; BUCKETS as usize],
|
||||
/// the number of initialized elements in this vector
|
||||
count: AtomicU32,
|
||||
/// the number of matcher columns in this vector, its absoletly critical that
|
||||
/// the number of matcher columns in this vector, it's absolutely critical that
|
||||
/// this remains constant after initialization (safety invariant) since
|
||||
/// it is used to calculate the Entry layou
|
||||
/// it is used to calculate the Entry layout
|
||||
columns: u32,
|
||||
}
|
||||
|
||||
|
@ -1 +0,0 @@
|
||||
|
14
src/lib.rs
14
src/lib.rs
@ -8,7 +8,7 @@ use rayon::ThreadPool;
|
||||
|
||||
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
|
||||
pub use crate::utf32_string::Utf32String;
|
||||
use crate::worker::Woker;
|
||||
use crate::worker::Worker;
|
||||
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
|
||||
|
||||
mod boxcar;
|
||||
@ -85,7 +85,7 @@ pub struct Nucleo<T: Sync + Send + 'static> {
|
||||
// but this lets us avoid some unsafe
|
||||
canceled: Arc<AtomicBool>,
|
||||
should_notify: Arc<AtomicBool>,
|
||||
worker: Arc<Mutex<Woker<T>>>,
|
||||
worker: Arc<Mutex<Worker<T>>>,
|
||||
pool: ThreadPool,
|
||||
cleared: bool,
|
||||
item_count: u32,
|
||||
@ -104,7 +104,7 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
|
||||
case_matching: CaseMatching,
|
||||
columns: u32,
|
||||
) -> Self {
|
||||
let (pool, worker) = Woker::new(num_threads, config, notify.clone(), columns);
|
||||
let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns);
|
||||
Self {
|
||||
canceled: worker.canceled.clone(),
|
||||
should_notify: worker.should_notify.clone(),
|
||||
@ -137,7 +137,7 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
|
||||
/// # Safety
|
||||
///
|
||||
/// Item at `index` must be initialized. That means you must have observed
|
||||
/// `push` returning this value or `get` retunring `Some` for this value.
|
||||
/// `push` returning this value or `get` returning `Some` for this value.
|
||||
/// Just because a later index is initialized doesn't mean that this index
|
||||
/// is initialized
|
||||
pub unsafe fn get_unchecked(&self, index: u32) -> Item<'_, T> {
|
||||
@ -219,7 +219,7 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
|
||||
impl<T: Sync + Send> Drop for Nucleo<T> {
|
||||
fn drop(&mut self) {
|
||||
// we ensure the worker quits before dropping items to ensure that
|
||||
// the worker can always assume the items outlife it
|
||||
// the worker can always assume the items outlive it
|
||||
self.canceled.store(true, atomic::Ordering::Relaxed);
|
||||
let lock = self.worker.try_lock_for(Duration::from_secs(1));
|
||||
if lock.is_none() {
|
||||
@ -228,8 +228,8 @@ impl<T: Sync + Send> Drop for Nucleo<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// convenicne function to easily fuzzy match
|
||||
/// on a (relatively small list of inputs). This is not recommended for building a full tui
|
||||
/// convenience function to easily fuzzy match
|
||||
/// on a (relatively small) list of inputs. This is not recommended for building a full tui
|
||||
/// application that can match large numbers of matches as all matching is done on the current
|
||||
/// thread, effectively blocking the UI
|
||||
pub fn fuzzy_match<T: AsRef<str>>(
|
||||
|
@ -2,7 +2,7 @@
|
||||
//!
|
||||
//! This implementation is copied verbatim from `std::slice::sort_unstable` and then parallelized.
|
||||
//! The only difference from the original is that calls to `recurse` are executed in parallel using
|
||||
//! `rayon_core::join`a.
|
||||
//! `rayon_core::join`.
|
||||
//! Further modified for nucleo to allow canceling the sort
|
||||
|
||||
// Copyright (c) 2010 The Rust Project Developers
|
||||
|
@ -1,9 +0,0 @@
|
||||
pub struct MatchSnapshot {
|
||||
chunks: Vec<Match>,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
struct Match {
|
||||
score: u32,
|
||||
idx: u32,
|
||||
}
|
@ -36,7 +36,7 @@ impl Utf32String {
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as `slice` but accepts a u32 range for convenicene sine
|
||||
/// Same as `slice` but accepts a u32 range for convenience since
|
||||
/// those are the indices returned by the matcher
|
||||
#[inline]
|
||||
pub fn slice(&self, range: impl RangeBounds<u32>) -> Utf32Str {
|
||||
|
@ -14,7 +14,7 @@ use crate::{boxcar, Match};
|
||||
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
|
||||
|
||||
impl Matchers {
|
||||
// thiss is not a true mut from ref, we use a cell here
|
||||
// this is not a true mut from ref, we use a cell here
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
unsafe fn get(&self) -> &mut nucleo_matcher::Matcher {
|
||||
&mut *self.0[rayon::current_thread_index().unwrap()].get()
|
||||
@ -24,7 +24,7 @@ impl Matchers {
|
||||
unsafe impl Sync for Matchers {}
|
||||
unsafe impl Send for Matchers {}
|
||||
|
||||
pub(crate) struct Woker<T: Sync + Send + 'static> {
|
||||
pub(crate) struct Worker<T: Sync + Send + 'static> {
|
||||
pub(crate) running: bool,
|
||||
matchers: Matchers,
|
||||
pub(crate) matches: Vec<Match>,
|
||||
@ -38,7 +38,7 @@ pub(crate) struct Woker<T: Sync + Send + 'static> {
|
||||
in_flight: Vec<u32>,
|
||||
}
|
||||
|
||||
impl<T: Sync + Send + 'static> Woker<T> {
|
||||
impl<T: Sync + Send + 'static> Worker<T> {
|
||||
pub(crate) fn item_count(&self) -> u32 {
|
||||
self.last_snapshot - self.in_flight.len() as u32
|
||||
}
|
||||
@ -64,7 +64,7 @@ impl<T: Sync + Send + 'static> Woker<T> {
|
||||
let matchers = (0..worker_threads)
|
||||
.map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config)))
|
||||
.collect();
|
||||
let worker = Woker {
|
||||
let worker = Worker {
|
||||
running: false,
|
||||
matchers: Matchers(matchers),
|
||||
last_snapshot: 0,
|
||||
@ -211,7 +211,7 @@ impl<T: Sync + Send + 'static> Woker<T> {
|
||||
if match2.idx == u32::MAX {
|
||||
return true;
|
||||
}
|
||||
// the tie breaker is comparitevly rarely needed so we keep it
|
||||
// the tie breaker is comparatively rarely needed so we keep it
|
||||
// in a branch especially because we need to access the items
|
||||
// array here which involves some pointer chasing
|
||||
let item1 = self.items.get_unchecked(match1.idx);
|
||||
|
Loading…
Reference in New Issue
Block a user