From b4f547b0f41981560d1e3cf3cc731318305b77d0 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Sat, 5 Aug 2023 10:25:01 -0500 Subject: [PATCH 1/2] Fix typos --- README.md | 2 +- matcher/src/chars.rs | 2 +- matcher/src/exact.rs | 2 +- matcher/src/lib.rs | 8 ++++---- matcher/src/matrix.rs | 6 +++--- matcher/src/score.rs | 4 ++-- matcher/src/utf32_str.rs | 10 +++++----- src/boxcar.rs | 4 ++-- src/lib.rs | 14 +++++++------- src/par_sort.rs | 2 +- src/utf32_string.rs | 2 +- src/worker.rs | 10 +++++----- 12 files changed, 33 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index a6baa09..50fc660 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Nucleo uses the exact **same scoring system as fzf**. That means you should get the same ranking quality (or better) as you are used to from fzf. However, `nucleo` has a more faithful implementation of the Smith-Waterman algorithm which is normally used in DNA sequence alignment (see https://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/gaps.pdf) with two separate matrices (instead of one like fzf). This means that `nucleo` finds the optimal match more often. For example if you match `foo` in `xf foo` `nucleo` will match `x__foo` but `fzf` will match `xf_oo` (you can increase the word length the result will stay the same). The former is the more intuitive match and has a higher score according to the ranking system that both `nucleo` and fzf. -**Compared to `skim`** (and the `fuzzy-matcher` crate) `nucleo` has an even larger performance advantage and is often around **six times faster** (see benchmarks below). Furthermore, the bonus system used by nucleo and fzf is (in my opinion)more consistent/superior. `nulceo` also handles non-ascii text much better. (`skim`s bonus system and even case insensitivity only work for ASCII). +**Compared to `skim`** (and the `fuzzy-matcher` crate) `nucleo` has an even larger performance advantage and is often around **six times faster** (see benchmarks below). 
Furthermore, the bonus system used by nucleo and fzf is (in my opinion) more consistent/superior. `nucleo` also handles non-ascii text much better. (`skim`s bonus system and even case insensitivity only work for ASCII). Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly). diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index 503cc9a..7f89577 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -156,7 +156,7 @@ pub enum CharClass { Number, } -/// nucleo can not match graphemes as single units to work around +/// nucleo cannot match graphemes as single units to work around /// that we only use the first codepoint of each grapheme pub fn graphemes(text: &str) -> impl Iterator + '_ { text.graphemes(true).map(|grapheme| { diff --git a/matcher/src/exact.rs b/matcher/src/exact.rs index 20ff954..4fc17d4 100644 --- a/matcher/src/exact.rs +++ b/matcher/src/exact.rs @@ -145,7 +145,7 @@ impl Matcher { } } // in case we don't have any letter in the needle - // we can treat the search as case sensitive and use memmem dircedly which is way faster + // we can treat the search as case sensitive and use memmem directly which is way faster None => (), } } diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 6e6efcc..448d1c8 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -36,7 +36,7 @@ use crate::matrix::MatrixSlab; /// A matcher engine that can execute (fuzzy) matches. /// /// A matches contains **heap allocated** scratch memory that is reused during -/// matching. This scratch memory allows the matcher to garunte that it will +/// matching. 
This scratch memory allows the matcher to guarantee that it will /// **never allocate** during matching (with the exception of pushing to the /// `indices` vector if there isn't enough capacity). However this scratch /// memory is fairly large (around 135KB) so creating a matcher is expensive and @@ -58,7 +58,7 @@ pub struct Matcher { slab: MatrixSlab, } -// this is just here for convenience not ruse if we should implement this +// this is just here for convenience not sure if we should implement this impl Clone for Matcher { fn clone(&self) -> Self { Matcher { @@ -93,7 +93,7 @@ impl Matcher { } } - /// Find the fuzzy match with the higehest score in the `haystack`. + /// Find the fuzzy match with the highest score in the `haystack`. /// /// This functions has `O(mn)` time complexity for short inputs. To /// avoid slowdowns it automatically falls back to [greedy matching] @@ -241,7 +241,7 @@ impl Matcher { /// Greedly find a fuzzy match in the `haystack` and compute its indices. /// - /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) + /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should /// be preferred. 
/// diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs index eff3dae..d60e2a6 100644 --- a/matcher/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -107,7 +107,7 @@ pub(crate) struct ScoreCell { pub(crate) struct MatcherDataView<'a, C: Char> { pub haystack: &'a mut [C], // stored as a separate array instead of struct - // to avoid padding sine char is too large and u8 too small :/ + // to avoid padding since char is too large and u8 too small :/ pub bonus: &'a mut [u8], pub current_row: &'a mut [ScoreCell], pub row_offs: &'a mut [u16], @@ -162,7 +162,7 @@ impl MatrixSlab { let cells = haystack_.len() * needle_len; if cells > MAX_MATRIX_SIZE || haystack_.len() > u16::MAX as usize - // ensures that socres never overflow + // ensures that scores never overflow || needle_len > MAX_NEEDLE_LEN { return None; @@ -175,7 +175,7 @@ impl MatrixSlab { // safely: this allocation is valid for MATRIX_ALLOC_LAYOUT let (haystack, bonus, rows, current_row, matrix_cells) = matrix_layout.fieds_from_ptr(self.0); - // copy haystack before creating references to ensure we donu't crate + // copy haystack before creating references to ensure we don't create // references to invalid chars (which may or may not be UB) haystack_ .as_ptr() diff --git a/matcher/src/score.rs b/matcher/src/score.rs index b371daa..7a7c0c3 100644 --- a/matcher/src/score.rs +++ b/matcher/src/score.rs @@ -20,7 +20,7 @@ pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; // However, this priporitzes camel case over non-camel case. // In fzf/skim this is not a problem since they score off the max // consecutive bonus. However, we don't do that (because its incorrect) -// so to avoids prioritzing camel we use a lower bonus. I think that's fine +// so to avoids prioritizing camel we use a lower bonus. 
I think that's fine // usually camel case is wekaer boundary than actual wourd boundaries anyway // This also has the nice sideeffect of perfectly balancing out // camel case, snake case and the consecutive version of the word @@ -97,7 +97,7 @@ impl Matcher { let mut in_gap = false; let mut consecutive = 1; - // unrolled the firs iteration to make applying the first char multiplier less akward + // unrolled the first iteration to make applying the first char multiplier less awkward if INDICES { indices.push(start as u32) } diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index cfd73db..768c724 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -5,7 +5,7 @@ use std::{fmt, slice}; /// /// Usually rusts' utf8 encoded strings are great. However during fuzzy matching /// operates on codepoints (it should operate on graphemes but that's too much -/// hassle to deal with). We want to quickly iterate these codeboints between +/// hassle to deal with). We want to quickly iterate these codepoints between /// (up to 5 times) during matching. /// /// Doing codepoint segmentation on the fly not only blows trough the cache @@ -14,7 +14,7 @@ use std::{fmt, slice}; /// for ascii only text (but checking during each match has too much overhead). /// /// Ofcourse this comes at exta memory cost as we usually still need the ut8 -/// encoded variant for rendenring. In the (dominant) case of ascii-only text +/// encoded variant for rendering. In the (dominant) case of ascii-only text /// we don't require a copy. Furthermore fuzzy matching usually is applied while /// the user is typing on the fly so the same item is potentially matched many /// times (making the the upfront cost more worth it). 
That means that its @@ -24,8 +24,8 @@ use std::{fmt, slice}; /// char buffer around that is filled with the presegmented chars /// /// Another advantage of this approach is that the matcher will naturally -/// produce char indices (instead of utf8 offsets) annyway. With a -/// codepoint basec representation like this the indices can be used +/// produce char indices (instead of utf8 offsets) anyway. With a +/// codepoint based representation like this the indices can be used /// directly #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] pub enum Utf32Str<'a> { @@ -84,7 +84,7 @@ impl<'a> Utf32Str<'a> { } } - /// Same as `slice` but accepts a u32 range for convenicene sine + /// Same as `slice` but accepts a u32 range for convenience since /// those are the indices returned by the matcher #[inline] pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { diff --git a/src/boxcar.rs b/src/boxcar.rs index 67b1cb0..1d8ae1d 100644 --- a/src/boxcar.rs +++ b/src/boxcar.rs @@ -43,9 +43,9 @@ pub(crate) struct Vec { buckets: [Bucket; BUCKETS as usize], /// the number of initialized elements in this vector count: AtomicU32, - /// the number of matcher columns in this vector, its absoletly critical that + /// the number of matcher columns in this vector, its absolutely critical that /// this remains constant and after initilaziaton (safety invariant) since - /// it is used to calculate the Entry layou + /// it is used to calculate the Entry layout columns: u32, } diff --git a/src/lib.rs b/src/lib.rs index 8a7be6b..eaab53c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ use rayon::ThreadPool; pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; pub use crate::utf32_string::Utf32String; -use crate::worker::Woker; +use crate::worker::Worker; pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; mod boxcar; @@ -85,7 +85,7 @@ pub struct Nucleo { // but this lets us avoid some unsafe canceled: Arc, should_notify: Arc, -
worker: Arc>>, + worker: Arc>>, pool: ThreadPool, cleared: bool, item_count: u32, @@ -104,7 +104,7 @@ impl Nucleo { case_matching: CaseMatching, columns: u32, ) -> Self { - let (pool, worker) = Woker::new(num_threads, config, notify.clone(), columns); + let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns); Self { canceled: worker.canceled.clone(), should_notify: worker.should_notify.clone(), @@ -137,7 +137,7 @@ impl Nucleo { /// # Safety /// /// Item at `index` must be initialized. That means you must have observed - /// `push` returning this value or `get` retunring `Some` for this value. + /// `push` returning this value or `get` returning `Some` for this value. /// Just because a later index is initialized doesn't mean that this index /// is initialized pub unsafe fn get_unchecked(&self, index: u32) -> Item<'_, T> { @@ -219,7 +219,7 @@ impl Nucleo { impl Drop for Nucleo { fn drop(&mut self) { // we ensure the worker quits before dropping items to ensure that - // the worker can always assume the items outlife it + // the worker can always assume the items outlive it self.canceled.store(true, atomic::Ordering::Relaxed); let lock = self.worker.try_lock_for(Duration::from_secs(1)); if lock.is_none() { @@ -228,8 +228,8 @@ impl Drop for Nucleo { } } -/// convenicne function to easily fuzzy match -/// on a (relatively small list of inputs). This is not recommended for building a full tui +/// convenience function to easily fuzzy match +/// on a (relatively small) list of inputs. This is not recommended for building a full tui /// application that can match large numbers of matches as all matching is done on the current /// thread, effectively blocking the UI pub fn fuzzy_match>( diff --git a/src/par_sort.rs b/src/par_sort.rs index 5dfbd11..92f716c 100644 --- a/src/par_sort.rs +++ b/src/par_sort.rs @@ -2,7 +2,7 @@ //! //! This implementation is copied verbatim from `std::slice::sort_unstable` and then parallelized. //! 
The only difference from the original is that calls to `recurse` are executed in parallel using -//! `rayon_core::join`a. +//! `rayon_core::join`. //! Further modified for nucleo to allow canceling the sort // Copyright (c) 2010 The Rust Project Developers diff --git a/src/utf32_string.rs b/src/utf32_string.rs index d7e9935..f8410a0 100644 --- a/src/utf32_string.rs +++ b/src/utf32_string.rs @@ -36,7 +36,7 @@ impl Utf32String { } } - /// Same as `slice` but accepts a u32 range for convenicene sine + /// Same as `slice` but accepts a u32 range for convenience since /// those are the indices returned by the matcher #[inline] pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { diff --git a/src/worker.rs b/src/worker.rs index ab30f95..20136c1 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -14,7 +14,7 @@ use crate::{boxcar, Match}; struct Matchers(Box<[UnsafeCell]>); impl Matchers { - // thiss is not a true mut from ref, we use a cell here + // this is not a true mut from ref, we use a cell here #[allow(clippy::mut_from_ref)] unsafe fn get(&self) -> &mut nucleo_matcher::Matcher { &mut *self.0[rayon::current_thread_index().unwrap()].get() @@ -24,7 +24,7 @@ impl Matchers { unsafe impl Sync for Matchers {} unsafe impl Send for Matchers {} -pub(crate) struct Woker { +pub(crate) struct Worker { pub(crate) running: bool, matchers: Matchers, pub(crate) matches: Vec, @@ -38,7 +38,7 @@ pub(crate) struct Woker { in_flight: Vec, } -impl Woker { +impl Worker { pub(crate) fn item_count(&self) -> u32 { self.last_snapshot - self.in_flight.len() as u32 } @@ -64,7 +64,7 @@ impl Woker { let matchers = (0..worker_threads) .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config))) .collect(); - let worker = Woker { + let worker = Worker { running: false, matchers: Matchers(matchers), last_snapshot: 0, @@ -211,7 +211,7 @@ impl Woker { if match2.idx == u32::MAX { return true; } - // the tie breaker is comparitevly rarely needed so we keep it + // the tie breaker is 
comparatively rarely needed so we keep it // in a branch especially because we need to access the items // array here which involves some pointer chasing let item1 = self.items.get_unchecked(match1.idx); From f73a1988f285ab4d6dc89cf98eb8c076b41f854a Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Sat, 5 Aug 2023 10:25:12 -0500 Subject: [PATCH 2/2] Remove unused modules/files --- matcher/src/multizip.rs | 0 src/items.rs | 1 - src/results.rs | 9 --------- 3 files changed, 10 deletions(-) delete mode 100644 matcher/src/multizip.rs delete mode 100644 src/items.rs delete mode 100644 src/results.rs diff --git a/matcher/src/multizip.rs b/matcher/src/multizip.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/items.rs b/src/items.rs deleted file mode 100644 index 8b13789..0000000 --- a/src/items.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/results.rs b/src/results.rs deleted file mode 100644 index dea0b4b..0000000 --- a/src/results.rs +++ /dev/null @@ -1,9 +0,0 @@ -pub struct MatchSnapshot { - chunks: Vec, -} - -#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] -struct Match { - score: u32, - idx: u32, -}