Merge pull request #16 from helix-editor/release_nucleo_matcher

Co-authored-by: Michael Davis <mcarsondavis@gmail.com>
This commit is contained in:
Pascal Kuthe 2023-08-29 00:21:07 +02:00 committed by GitHub
commit 2de732889f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 1194 additions and 667 deletions

10
CHANGELOG.md Normal file
View File

@ -0,0 +1,10 @@
# Changelog
## nucleo-matcher
# [0.2.0] - 2023-09-01
*initial public release*
[0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0

4
Cargo.lock generated
View File

@ -152,7 +152,7 @@ dependencies = [
[[package]]
name = "nucleo"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"nucleo-matcher",
"parking_lot",
@ -161,7 +161,7 @@ dependencies = [
[[package]]
name = "nucleo-matcher"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"cov-mark",
"memchr",

View File

@ -2,7 +2,7 @@
name = "nucleo"
description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.1"
version = "0.2.0"
edition = "2021"
license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo"
@ -11,7 +11,7 @@ readme = "README.md"
[lib]
[dependencies]
nucleo-matcher = { version = "0.1", path = "matcher" }
nucleo-matcher = { version = "0.2.0", path = "matcher" }
parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]}
rayon = "1.7.0"

View File

@ -1,10 +1,5 @@
# Nucleo
> Disclaimer: An 0.1 version has been published to crates.io.
> This allows us to merge the `nucleo` integration into helix.
> However, the public API is not yet final and will likely
> change quite a bit in the next release. The documentation
> is also not yet complete
`nucleo` is a highly performant fuzzy matcher written in rust. It aims to fill the same use case as `fzf` and `skim`. Compared to `fzf` `nucleo` has a significantly faster matching algorithm. This mainly makes a difference when matching patterns with low selectivity on many items. An (unscientific) comparison is shown in the benchmark section below.
@ -14,6 +9,12 @@
Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly).
## Status
Nucleo is used in the helix-editor and therefore has a large user base with lots of real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use, breaking changes should be very rare (a 1.0 release should not be far away).
While the high level `nucleo` crate also works well (and is also used in helix), there are still additional features that will be added in the future. The high level crate also needs better documentation and will likely see a few API changes in the future.
## Benchmarks
> WIP currently more of a demonstration than a comprehensive benchmark suite

View File

@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
nucleo = { version = "0.1", path = "../" }
nucleo = { version = "0.2", path = "../" }
brunch = "0.5.0"
fuzzy-matcher = "0.3.7"
walkdir = "2"

View File

@ -43,7 +43,7 @@ fn main() {
Some((path.as_str().into(), path))
})
.unzip();
let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths());
let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths());
let skim = fuzzy_matcher::skim::SkimMatcherV2::default();
// TODO: unicode?

View File

@ -2,7 +2,7 @@
name = "nucleo-matcher"
description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.0"
version = "0.2.0"
edition = "2021"
license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo"

View File

@ -1,7 +1,9 @@
//! Utilities for working with (unicode) characters/codepoints
use std::fmt::{self, Debug, Display};
use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
use crate::MatcherConfig;
use crate::Config;
//autogenerated by generate-ucd
#[allow(warnings)]
@ -11,9 +13,9 @@ mod normalize;
pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
const ASCII: bool;
fn char_class(self, config: &MatcherConfig) -> CharClass;
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass);
fn normalize(self, config: &MatcherConfig) -> Self;
fn char_class(self, config: &Config) -> CharClass;
fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass);
fn normalize(self, config: &Config) -> Self;
}
/// repr transparent wrapper around u8 with better formatting and `PartialEq<char>` implementation
@ -42,7 +44,7 @@ impl PartialEq<AsciiChar> for char {
impl Char for AsciiChar {
const ASCII: bool = true;
#[inline]
fn char_class(self, config: &MatcherConfig) -> CharClass {
fn char_class(self, config: &Config) -> CharClass {
let c = self.0;
// using manual if conditions instead optimizes better
if c >= b'a' && c <= b'z' {
@ -61,7 +63,7 @@ impl Char for AsciiChar {
}
#[inline(always)]
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) {
fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
let char_class = self.char_class(config);
if config.ignore_case && char_class == CharClass::Upper {
self.0 += 32
@ -70,7 +72,7 @@ impl Char for AsciiChar {
}
#[inline(always)]
fn normalize(mut self, config: &MatcherConfig) -> Self {
fn normalize(mut self, config: &Config) -> Self {
if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' {
self.0 += 32
}
@ -95,7 +97,7 @@ fn char_class_non_ascii(c: char) -> CharClass {
impl Char for char {
const ASCII: bool = false;
#[inline(always)]
fn char_class(self, config: &MatcherConfig) -> CharClass {
fn char_class(self, config: &Config) -> CharClass {
if self.is_ascii() {
return AsciiChar(self as u8).char_class(config);
}
@ -103,7 +105,7 @@ impl Char for char {
}
#[inline(always)]
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) {
fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
if self.is_ascii() {
let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config);
return (c.0 as char, class);
@ -123,7 +125,7 @@ impl Char for char {
}
#[inline(always)]
fn normalize(mut self, config: &MatcherConfig) -> Self {
fn normalize(mut self, config: &Config) -> Self {
if config.normalize {
self = normalize::normalize(self);
}
@ -138,12 +140,14 @@ pub use normalize::normalize;
use unicode_segmentation::UnicodeSegmentation;
#[inline(always)]
/// Converts a character to lower case using simple unicode case folding
pub fn to_lower_case(c: char) -> char {
CASE_FOLDING_SIMPLE
.binary_search_by_key(&c, |(upper, _)| *upper)
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
}
/// Checks if a character is upper case according to simple unicode case folding
#[inline(always)]
pub fn is_upper_case(c: char) -> bool {
CASE_FOLDING_SIMPLE
@ -152,8 +156,7 @@ pub fn is_upper_case(c: char) -> bool {
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CharClass {
pub(crate) enum CharClass {
Whitespace,
NonWord,
Delimiter,
@ -163,8 +166,10 @@ pub enum CharClass {
Number,
}
/// nucleo cannot match graphemes as single units to work around
/// that we only use the first codepoint of each grapheme
/// Nucleo cannot match graphemes as single units. To work around
/// that we only use the first codepoint of each grapheme. This
/// iterator returns the first character of each unicode grapheme
/// in a string and is used for constructing `Utf32Str(ing)`.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
text.graphemes(true).map(|grapheme| {
grapheme

View File

@ -495,6 +495,16 @@ const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1;
const LEN3: usize = (DATA3_END - DATA3_START) as usize;
static TABLE3: [char; LEN3] = generate_table(&DATA3);
/// Normalizes a unicode character by converting latin characters
/// which are variants of ASCII characters to their latin equivalent.
///
/// # Example
///
/// ``` rust
/// # use nucleo_matcher::chars::normalize;
///
/// assert_eq!(normalize('ä'), 'a');
/// ```
pub fn normalize(c: char) -> char {
let i = c as u32;
if i < DATA1_START || i >= DATA3_END {

View File

@ -1,38 +1,52 @@
use crate::chars::CharClass;
use crate::score::BONUS_BOUNDARY;
/// Configuration data that controls how a matcher behaves
#[non_exhaustive]
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct MatcherConfig {
pub delimiter_chars: &'static [u8],
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct Config {
/// Characters that act as delimiters and provide bonus
/// for matching the following char
pub(crate) delimiter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string
pub(crate) bonus_boundary_white: u16,
/// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub(crate) bonus_boundary_delimiter: u16,
pub initial_char_class: CharClass,
pub(crate) initial_char_class: CharClass,
/// Whether to normalize latin script characters to ASCII (enabled by default)
pub normalize: bool,
/// whether to ignore casing
pub ignore_case: bool,
/// Whether to provide a bonus to matches by their distance from the start
/// of the haystack. The bonus is fairly small compared to the normal gap
/// penalty to avoid messing with the normal score heuristic. This setting
/// is not turned on by default and only recommended for autocompletion
/// usecases where the expectation is that the user is typing the entire
/// match. For a full fzf-like fuzzy matcher/picker word segmentation and
/// explicit prefix literals should be used instead.
pub prefer_prefix: bool,
}
impl MatcherConfig {
impl Config {
/// The default config for nucleo, implemented as a constant since
/// Default::default can not be called in a const context
pub const DEFAULT: Self = {
MatcherConfig {
Config {
delimiter_chars: b"/,:;|",
bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
initial_char_class: CharClass::Whitespace,
normalize: true,
ignore_case: true,
prefer_prefix: false,
}
};
}
impl MatcherConfig {
impl Config {
/// Configures the matcher with bonuses appropriate for matching file paths.
pub fn set_match_paths(&mut self) {
// compared to fzf we include
if cfg!(windows) {
self.delimiter_chars = b"/:\\";
} else {
@ -42,6 +56,7 @@ impl MatcherConfig {
self.initial_char_class = CharClass::Delimiter;
}
/// Configures the matcher with bonuses appropriate for matching file paths.
pub const fn match_paths(mut self) -> Self {
if cfg!(windows) {
self.delimiter_chars = b"/\\";

View File

@ -3,10 +3,10 @@ use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
use crate::score::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION,
PENALTY_GAP_START, SCORE_MATCH,
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
};
use crate::{Matcher, MatcherConfig};
use crate::{Config, Matcher};
impl Matcher {
pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
@ -23,11 +23,7 @@ impl Matcher {
// us to treat needle indices as u16
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
return self.fuzzy_match_greedy_::<INDICES, H, N>(
haystack,
needle,
start,
greedy_end,
indices,
haystack, needle, start, greedy_end, indices,
);
};
@ -35,7 +31,7 @@ impl Matcher {
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.initial_char_class);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
if !matched {
assert!(
@ -116,7 +112,8 @@ impl<H: Char> MatcherDataView<'_, H> {
&mut self,
needle: &[N],
mut prev_class: CharClass,
config: &MatcherConfig,
config: &Config,
start: u32,
) -> bool
where
H: PartialEq<N>,
@ -167,6 +164,17 @@ impl<H: Char> MatcherDataView<'_, H> {
0,
needle[0],
needle[1],
if config.prefer_prefix {
if start == 0 {
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
} else {
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
)
}
} else {
0
},
);
true
}
@ -182,6 +190,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx: u16,
needle_char: N,
next_needle_char: N,
mut prefix_bonus: u16,
) where
H: PartialEq<N>,
{
@ -198,15 +207,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
if c == needle_char {
let cell = if c == needle_char {
ScoreCell {
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: *bonus,
}
} else {
UNMATCHED
}
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
@ -224,15 +237,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((c, bonus), score_cell), matrix_cell) in col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
if c[0] == needle_char {
let cell = if c[0] == needle_char {
ScoreCell {
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH,
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: bonus[0],
}
} else {
UNMATCHED
}
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
@ -271,6 +288,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx as u16 + 1,
needle_char,
next_needle_char,
0,
);
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
matrix_cells = &mut matrix_cells[len..];
@ -317,7 +335,7 @@ impl<H: Char> MatcherDataView<'_, H> {
}
let next_matched = row[col as usize].get(matched);
if matched {
let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else{
let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else {
break;
};
col += row_off - next_row_off;

View File

@ -1,16 +1,66 @@
/*!
`nucleo_matcher` is a low level crate that contains the matcher implementation
used by the other nucleo crates.
used by the high level `nucleo` crate.
The matcher is highly optimized and can significantly outperform `fzf` and
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
a slightly less convenient API. Particularly, `nucleo_matcher` requires that
needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead
of rusts normal utf32 strings.
a slightly less convenient API. Be sure to carefully read the documentation of
the [`Matcher`] to avoid unexpected behaviour.
# Examples
For almost all usecases the [`pattern`] API should be used instead of calling
the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will
construct a single Atom (a single match operation) for each word. The pattern
can contain special characters to control what kind of match is performed (see
[`AtomKind`](crate::pattern::AtomKind)).
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]);
```
If the pattern should be matched literally (without this special parsing)
[`Pattern::new`](pattern::Pattern::new) can be used instead.
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let paths = ["^foo/bar", "bar/^foo", "foobar"];
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]);
```
If word segmentation is also not desired, a single `Atom` can be constructed directly.
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind};
let paths = ["foobar", "foo bar"];
let mut matcher = Matcher::new(Config::DEFAULT);
let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo bar", 192)]);
```
# Status
Nucleo is used in the helix-editor and therefore has a large user base with lots of real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use, breaking changes should be very rare (a 1.0 release should not be far away).
*/
// sadly ranges don't optimize well
#![allow(clippy::manual_range_contains)]
#![warn(missing_docs)]
pub mod chars;
mod config;
@ -20,6 +70,7 @@ mod exact;
mod fuzzy_greedy;
mod fuzzy_optimal;
mod matrix;
pub mod pattern;
mod prefilter;
mod score;
mod utf32_str;
@ -27,8 +78,8 @@ mod utf32_str;
#[cfg(test)]
mod tests;
pub use crate::config::MatcherConfig;
pub use crate::utf32_str::Utf32Str;
pub use crate::config::Config;
pub use crate::utf32_str::{Utf32Str, Utf32String};
use crate::chars::{AsciiChar, Char};
use crate::matrix::MatrixSlab;
@ -39,22 +90,48 @@ use crate::matrix::MatrixSlab;
/// matching. This scratch memory allows the matcher to guarantee that it will
/// **never allocate** during matching (with the exception of pushing to the
/// `indices` vector if there isn't enough capacity). However this scratch
/// memory is fairly large (around 135KB) so creating a matcher is expensive and
/// should be reused.
/// memory is fairly large (around 135KB) so creating a matcher is expensive.
///
/// All `.._match` functions will not compute the indices of the matched chars
/// and are therefore significantly faster. These should be used to prefitler
/// and sort all matches. All `.._indices` functions will compute the indices of
/// the computed chars. These should be used when rendering the best N matches.
/// Note that the `indices` argument is **never cleared**. This allows running
/// multiple different matches on the same haystack and merging the indices by
/// sorting and deduplicating the vector.
/// All `.._match` functions will not compute the indices of the matched
/// characters. These should be used to prefilter and rank all
/// matches. All `.._indices` functions will also compute the indices of the
/// matched characters but are slower compared to the `..match` variant. These
/// should be used when rendering the best N matches. Note that the `indices`
/// argument is **never cleared**. This allows running multiple different
/// matches on the same haystack and merging the indices by sorting and
/// deduplicating the vector.
///
/// The `needle` argument for each function must always be normalized by the
/// caller (unicode normalization and case folding). Otherwise, the matcher
/// may fail to produce a match. The [`pattern`] module provides utilities
/// to preprocess needles and **should usually be preferred over invoking the
/// matcher directly**. Additionally it's recommended to perform separate matches
/// for each word in the needle. Consider the following example:
///
/// If `foo bar` is used as the needle it matches both `foo test baaar` and
/// `foo hello-world bar`. However, `foo test baaar` will receive a higher
/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which
/// will receive a penalty and therefore the user will likely expect it to rank
/// lower. However, if `foo bar` is matched as a single query `hello-world` and
/// `test` are both considered gaps too. As `hello-world` is a much longer gap
/// than `test` the extra penalty for `baaar` is canceled out. If both words
/// are matched individually the interspersed words do not receive a penalty and
/// `foo hello-world bar` ranks higher.
///
/// In general nucleo is a **substring matching tool** (except for the prefix/
/// postfix matching modes) with no penalty assigned to matches that start
/// later within the same pattern (which enables matching words individually
/// as shown above). If patterns show a large variety in length and the syntax
/// described above is not used it may be preferable to give preference to
/// matches closer to the start of a haystack. To accommodate that usecase the
/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true.
///
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
/// that the matcher *will panic*. The caller must decide whether it wants to
/// that the matcher **will panic**. The caller must decide whether it wants to
/// filter out long haystacks or truncate them.
pub struct Matcher {
pub config: MatcherConfig,
#[allow(missing_docs)]
pub config: Config,
slab: MatrixSlab,
}
@ -62,7 +139,7 @@ pub struct Matcher {
impl Clone for Matcher {
fn clone(&self) -> Self {
Matcher {
config: self.config,
config: self.config.clone(),
slab: MatrixSlab::new(),
}
}
@ -79,14 +156,17 @@ impl std::fmt::Debug for Matcher {
impl Default for Matcher {
fn default() -> Self {
Matcher {
config: MatcherConfig::DEFAULT,
config: Config::DEFAULT,
slab: MatrixSlab::new(),
}
}
}
impl Matcher {
pub fn new(config: MatcherConfig) -> Self {
/// Creates a new matcher instance, note that this will eagerly allocate a
/// fairly large chunk of heap memory (around 135KB currently but subject to
/// change) so matchers should be reused if called often (like in a loop).
pub fn new(config: Config) -> Self {
Self {
config,
slab: MatrixSlab::new(),
@ -95,9 +175,10 @@ impl Matcher {
/// Find the fuzzy match with the highest score in the `haystack`.
///
/// This functions has `O(mn)` time complexity for short inputs. To
/// avoid slowdowns it automatically falls back to [greedy matching]
/// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks
/// This function has `O(mn)` time complexity for short inputs.
/// To avoid slowdowns it automatically falls back to
/// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large
/// needles and haystacks.
///
/// See the [matcher documentation](crate::Matcher) for more details.
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
@ -229,7 +310,7 @@ impl Matcher {
/// Greedily find a fuzzy match in the `haystack`.
///
/// This function has `O(n)` time complexity but may provide unintuitive (non-optimal)
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should
/// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should
/// be preferred.
///
/// See the [matcher documentation](crate::Matcher) for more details.
@ -245,7 +326,7 @@ impl Matcher {
/// Greedily find a fuzzy match in the `haystack` and compute its indices.
///
/// This function has `O(n)` time complexity but may provide unintuitive (non-optimal)
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should
/// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should
/// be preferred.
///
/// See the [matcher documentation](crate::Matcher) for more details.
@ -329,7 +410,7 @@ impl Matcher {
/// Finds the substring match with the highest score in the `haystack`.
///
/// This function has `O(nm)` time complexity. However many cases can
/// be significantly accelerated using prefilters so it's usually fast
/// be significantly accelerated using prefilters so it's usually very fast
/// in practice.
///
/// See the [matcher documentation](crate::Matcher) for more details.

View File

@ -74,7 +74,7 @@ impl<C: Char> MatrixLayout<C> {
let base = ptr.as_ptr();
let haystack = base.add(self.haystack_off) as *mut C;
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
let bonus = base.add(self.bonus_off) as *mut u8;
let bonus = base.add(self.bonus_off);
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
let rows = base.add(self.rows_off) as *mut u16;
let rows = slice_from_raw_parts_mut(rows, self.needle_len);
@ -148,7 +148,7 @@ impl MatrixSlab {
let layout = Layout::new::<MatcherData>();
// safety: the matrix is never zero sized (hardcoded constants)
let ptr = unsafe { alloc_zeroed(layout) };
let Some(ptr) = NonNull::new(ptr) else{
let Some(ptr) = NonNull::new(ptr) else {
handle_alloc_error(layout)
};
MatrixSlab(ptr.cast())

488
matcher/src/pattern.rs Normal file
View File

@ -0,0 +1,488 @@
//! This module provides a slightly higher level API for matching strings.
use std::cmp::Reverse;
use crate::{chars, Matcher, Utf32Str};
#[cfg(test)]
mod tests;
use crate::Utf32String;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to treat a case mismatch between a needle character and a haystack
/// character. The default is [`Smart`](CaseMatching::Smart).
pub enum CaseMatching {
/// Characters always match their case folded version (`a == A`).
Ignore,
/// Characters never match their case folded version (`a != A`).
Respect,
/// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
/// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
#[default]
Smart,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// The kind of matching algorithm to run for an atom.
pub enum AtomKind {
/// Fuzzy matching where the needle characters may be matched with gaps
/// between them in the haystack. This atom kind is used by default if no
/// special syntax is used. There is no negated fuzzy matching (too
/// many false positives); a negated atom without any other special syntax
/// (`!foo`) is parsed as a negated [`Substring`](AtomKind::Substring).
///
/// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
Fuzzy,
/// The needle must match a contiguous sequence of haystack characters
/// without gaps. This atom kind is parsed from the following syntax:
/// `'foo` and `!foo` (negated).
///
/// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
Substring,
/// The needle must match the leading haystack characters without gaps and
/// without any characters in front of the match. This atom kind is parsed
/// from the following syntax: `^foo` and `!^foo` (negated).
///
/// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
Prefix,
/// The needle must match the trailing haystack characters without gaps and
/// without any characters after the match. This atom kind is parsed from
/// the following syntax: `foo$` and `!foo$` (negated).
///
/// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
Postfix,
/// The needle must match all haystack characters without gaps, prefix or
/// postfix. This atom kind is parsed from the following syntax: `^foo$`
/// and `!^foo$` (negated).
///
/// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
Exact,
}
/// A single pattern component that is matched with a single [`Matcher`](crate::Matcher) function
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Atom {
/// Whether this pattern atom is a negative match.
/// A negative pattern atom will prevent haystacks matching it from
/// being matched. It does not contribute to scoring/indices.
pub negative: bool,
/// The kind of match that this pattern performs
pub kind: AtomKind,
/// The preprocessed needle that is handed to the matcher.
needle: Utf32String,
/// Value written to the matcher's `ignore_case` setting before this atom
/// is matched.
ignore_case: bool,
}
impl Atom {
/// Creates a single [`Atom`] from a string by performing unicode
/// normalization and case folding (if necessary). Optionally `\ ` can
/// be escaped to ` `.
pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom {
Atom::new_inner(needle, case, kind, escape_whitespace, false)
}
/// Shared constructor behind [`Atom::new`] and [`Atom::parse`].
///
/// * `case` decides whether the needle is lowercased and whether the
///   resulting atom ignores case during matching.
/// * `escape_whitespace` replaces every `\ ` sequence with a plain space;
///   a backslash that is not followed by a space is kept as is.
/// * `append_dollar` pushes a literal `$` onto the end of the processed
///   needle.
///
/// The returned atom always has `negative == false`; callers that need a
/// negative atom set the field afterwards.
fn new_inner(
    needle: &str,
    case: CaseMatching,
    kind: AtomKind,
    escape_whitespace: bool,
    append_dollar: bool,
) -> Atom {
    let mut ignore_case;
    let needle = if needle.is_ascii() {
        // ASCII fast path: operate on bytes/`String` directly.
        let mut needle = if escape_whitespace {
            if let Some((start, rem)) = needle.split_once("\\ ") {
                let mut needle = start.to_owned();
                for rem in rem.split("\\ ") {
                    needle.push(' ');
                    needle.push_str(rem);
                }
                needle
            } else {
                needle.to_owned()
            }
        } else {
            needle.to_owned()
        };

        match case {
            CaseMatching::Ignore => {
                ignore_case = true;
                needle.make_ascii_lowercase()
            }
            CaseMatching::Smart => {
                ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
            }
            CaseMatching::Respect => ignore_case = false,
        }
        if append_dollar {
            needle.push('$');
        }
        Utf32String::Ascii(needle.into_boxed_str())
    } else {
        let mut needle_ = Vec::with_capacity(needle.len());
        // For `Smart` this starts out as `true` and is cleared as soon as
        // an upper case character is seen.
        ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
        if escape_whitespace {
            let mut saw_backslash = false;
            for mut c in chars::graphemes(needle) {
                if saw_backslash {
                    if c == ' ' {
                        // `\ ` collapses to a single space, the pending
                        // backslash is dropped.
                        needle_.push(' ');
                        saw_backslash = false;
                        continue;
                    } else {
                        // The pending backslash did not escape anything,
                        // emit it exactly once.
                        needle_.push('\\');
                    }
                }
                saw_backslash = c == '\\';
                if saw_backslash {
                    // Hold the backslash back until the next character is
                    // known. Pushing it right away would leave `\ ` in the
                    // needle and duplicate every other backslash.
                    continue;
                }
                match case {
                    CaseMatching::Ignore => c = chars::to_lower_case(c),
                    CaseMatching::Smart => {
                        ignore_case = ignore_case && !chars::is_upper_case(c)
                    }
                    CaseMatching::Respect => (),
                }
                needle_.push(c);
            }
            if saw_backslash {
                // A trailing backslash has nothing left to escape.
                needle_.push('\\');
            }
        } else {
            let chars = chars::graphemes(needle).map(|mut c| {
                match case {
                    CaseMatching::Ignore => c = chars::to_lower_case(c),
                    CaseMatching::Smart => {
                        ignore_case = ignore_case && !chars::is_upper_case(c);
                    }
                    CaseMatching::Respect => (),
                }
                c
            });
            needle_.extend(chars);
        };
        if append_dollar {
            needle_.push('$');
        }
        Utf32String::Unicode(needle_.into_boxed_slice())
    };
    Atom {
        kind,
        needle,
        negative: false,
        ignore_case,
    }
}
/// Parses a pattern atom from `raw`. Leading and trailing special
/// characters select the atom kind and negation, see [`AtomKind`] for the
/// syntax. Escaped whitespace (`\ `) is always unescaped.
///
/// * A leading `!` negates the atom; `\!` keeps a literal `!`.
/// * A leading `^` or `'` selects prefix or substring matching; `\^` and
///   `\'` keep the character literally.
/// * A trailing `$` turns a fuzzy atom into a postfix atom and any other
///   kind into an exact atom; `\$` keeps a literal `$`.
/// * A negated atom that would otherwise be fuzzy becomes a substring atom.
pub fn parse(raw: &str, case: CaseMatching) -> Atom {
    // Negation marker (or its escaped form).
    let (invert, rest) = if let Some(stripped) = raw.strip_prefix('!') {
        (true, stripped)
    } else if raw.starts_with("\\!") {
        // Drop only the escaping backslash, the `!` stays in the needle.
        (false, &raw[1..])
    } else {
        (false, raw)
    };

    // Leading kind marker (or its escaped form).
    let (mut kind, rest) = if let Some(stripped) = rest.strip_prefix('^') {
        (AtomKind::Prefix, stripped)
    } else if let Some(stripped) = rest.strip_prefix('\'') {
        (AtomKind::Substring, stripped)
    } else if rest.starts_with("\\^") || rest.starts_with("\\'") {
        (AtomKind::Fuzzy, &rest[1..])
    } else {
        (AtomKind::Fuzzy, rest)
    };

    // Trailing `$` marker (or its escaped form).
    let (append_dollar, rest) = if let Some(stripped) = rest.strip_suffix("\\$") {
        (true, stripped)
    } else if let Some(stripped) = rest.strip_suffix('$') {
        kind = match kind {
            AtomKind::Fuzzy => AtomKind::Postfix,
            _ => AtomKind::Exact,
        };
        (false, stripped)
    } else {
        (false, rest)
    };

    // There is no negated fuzzy matching.
    if invert && kind == AtomKind::Fuzzy {
        kind = AtomKind::Substring;
    }

    let mut parsed = Atom::new_inner(rest, case, kind, true, append_dollar);
    parsed.negative = invert;
    parsed
}
/// Matches this atom against `haystack` (using the allocation and
/// configuration from `matcher`) and calculates a ranking score. See the
/// [`Matcher`](crate::Matcher) documentation for more details.
///
/// For a negative atom the result is `Some(0)` if the needle does not
/// match and `None` if it does.
///
/// *Note:* The `ignore_case` setting of `matcher` is overwritten to match
/// the casing of this atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
    matcher.config.ignore_case = self.ignore_case;
    let needle = self.needle.slice(..);
    let raw_score = match self.kind {
        AtomKind::Exact => matcher.exact_match(haystack, needle),
        AtomKind::Fuzzy => matcher.fuzzy_match(haystack, needle),
        AtomKind::Substring => matcher.substring_match(haystack, needle),
        AtomKind::Prefix => matcher.prefix_match(haystack, needle),
        AtomKind::Postfix => matcher.postfix_match(haystack, needle),
    };
    match (self.negative, raw_score) {
        // Positive atoms simply forward the matcher's verdict.
        (false, score) => score,
        // Negative atoms reject haystacks that match...
        (true, Some(_)) => None,
        // ...and accept the rest without contributing to the score.
        (true, None) => Some(0),
    }
}
/// Matches this atom against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score and the match
/// indices. See the [`Matcher`](crate::Matcher) documentation for more
/// details.
///
/// Negative atoms never push any indices; they only report whether the
/// haystack is accepted (`Some(0)`) or rejected (`None`).
///
/// *Note:* The `ignore_case` setting of `matcher` is overwritten to match the
/// casing of this atom.
///
/// *Note:* The `indices` vector is not cleared by this function.
pub fn indices(
    &self,
    haystack: Utf32Str<'_>,
    matcher: &mut Matcher,
    indices: &mut Vec<u32>,
) -> Option<u16> {
    if self.negative {
        // the plain scoring path already implements the inverted logic
        // (and sets `ignore_case`) without touching `indices`
        return self.score(haystack, matcher);
    }
    matcher.config.ignore_case = self.ignore_case;
    let needle = self.needle.slice(..);
    match self.kind {
        AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, needle, indices),
        AtomKind::Substring => matcher.substring_indices(haystack, needle, indices),
        AtomKind::Prefix => matcher.prefix_indices(haystack, needle, indices),
        AtomKind::Postfix => matcher.postfix_indices(haystack, needle, indices),
        AtomKind::Exact => matcher.exact_indices(haystack, needle, indices),
    }
}
/// Returns the needle text that is passed to the matcher, i.e. the needle
/// stored in this atom (borrowed as a [`Utf32Str`]).
///
/// NOTE(review): the previous comment stated that the char indices produced
/// by the `indices` functions are used to index this text; confirm whether
/// they refer to this needle or to the haystack.
pub fn needle_text(&self) -> Utf32Str<'_> {
self.needle.slice(..)
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
&self,
items: impl IntoIterator<Item = T>,
matcher: &mut Matcher,
) -> Vec<(T, u16)> {
if self.needle.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}
}
/// Splits `pattern` into its atoms at every space that is not directly
/// preceded by a backslash.
///
/// The backslashes are left in the yielded slices and consecutive unescaped
/// spaces produce empty slices.
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
    // whether the previously visited char was a backslash
    let mut escaped = false;
    pattern.split(move |c: char| {
        let is_separator = c == ' ' && !escaped;
        if !is_separator {
            escaped = c == '\\';
        }
        is_separator
    })
}
/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
///
/// The default value is a pattern without any atoms.
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct Pattern {
/// The individual atoms (words) that make up this pattern.
pub atoms: Vec<Atom>,
}
impl Pattern {
/// Creates a pattern where each word is matched individually (whitespaces
/// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
/// and ^ don't receive special treatment). If you want to match the entire
/// pattern as a single needle use a single [`Atom`] instead.
pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::new(pat, case_matching, kind, true);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Pattern { atoms }
}
/// Creates a pattern where each word is matched individually (whitespaces
/// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
/// cause different matching behaviour (see [`AtomKind`]). These can be
/// escaped with backslash.
pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::parse(pat, case_matching);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Pattern { atoms }
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
&self,
items: impl IntoIterator<Item = T>,
matcher: &mut Matcher,
) -> Vec<(T, u32)> {
if self.atoms.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}
/// Matches this pattern against `haystack` (using the allocation and configuration
/// from `matcher`) and calculates a ranking score. See the [`Matcher`](crate::Matcher).
/// Documentation for more details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
score += pattern.score(haystack, matcher)? as u32;
}
Some(score)
}
/// Matches this pattern against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score and the match
/// indices. See the [`Matcher`](crate::Matcher). Documentation for more
/// details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
///
/// *Note:* The indices for each pattern are calculated individually
/// and simply appended to the `indices` vector and not deduplicated/sorted.
/// This allows associating the match indices to their source pattern. If
/// required (like for highlighting) unique/sorted indices can be obtained
/// as follows:
///
/// ```
/// # let mut indices: Vec<u32> = Vec::new();
/// indices.sort_unstable();
/// indices.dedup();
/// ```
pub fn indices(
&self,
haystack: Utf32Str<'_>,
matcher: &mut Matcher,
indices: &mut Vec<u32>,
) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
score += pattern.indices(haystack, matcher, indices)? as u32;
}
Some(score)
}
/// Refreshes this pattern by reparsing it from a string. This is mostly
/// equivalent to just constructing a new pattern using [`Pattern::parse`]
/// but is slightly more efficient by reusing some allocations
pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) {
self.atoms.clear();
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = Atom::parse(atom, case_matching);
if atom.needle.is_empty() {
return None;
}
Some(atom)
});
self.atoms.extend(atoms);
}
}
impl Clone for Pattern {
fn clone(&self) -> Self {
Self {
atoms: self.atoms.clone(),
}
}
fn clone_from(&mut self, source: &Self) {
self.atoms.clone_from(&source.atoms);
}
}

View File

@ -0,0 +1,114 @@
use crate::pattern::{Atom, AtomKind, CaseMatching};
/// A leading `!` negates the atom; a negated atom without any other marker
/// is matched as a substring.
#[test]
fn negative() {
    let cases = [
        ("!foo", AtomKind::Substring),
        ("!^foo", AtomKind::Prefix),
        ("!foo$", AtomKind::Postfix),
        ("!^foo$", AtomKind::Exact),
    ];
    for (input, expected_kind) in cases {
        let pat = Atom::parse(input, CaseMatching::Smart);
        assert!(pat.negative);
        assert_eq!(pat.kind, expected_kind);
        assert_eq!(pat.needle.to_string(), "foo");
    }
}
/// The leading/trailing marker characters select the atom kind and are
/// removed from the needle.
#[test]
fn pattern_kinds() {
    let cases = [
        ("foo", AtomKind::Fuzzy),
        ("'foo", AtomKind::Substring),
        ("^foo", AtomKind::Prefix),
        ("foo$", AtomKind::Postfix),
        ("^foo$", AtomKind::Exact),
    ];
    for (input, expected_kind) in cases {
        let pat = Atom::parse(input, CaseMatching::Smart);
        assert!(!pat.negative);
        assert_eq!(pat.kind, expected_kind);
        assert_eq!(pat.needle.to_string(), "foo");
    }
}
/// Checks the `ignore_case` flag and the stored needle for every
/// [`CaseMatching`] mode, including non-ASCII needles.
///
/// Each row is `(input, mode, expected ignore_case, expected needle)`.
#[test]
fn case_matching() {
    let cases = [
        // smart case: only ignore case if the needle has no uppercase char
        ("foo", CaseMatching::Smart, true, "foo"),
        ("Foo", CaseMatching::Smart, false, "Foo"),
        // ignore: the needle is lowercased
        ("Foo", CaseMatching::Ignore, true, "foo"),
        // respect: the needle is kept as is
        ("Foo", CaseMatching::Respect, false, "Foo"),
        // non-ASCII needles
        ("Äxx", CaseMatching::Ignore, true, "äxx"),
        ("Äxx", CaseMatching::Respect, false, "Äxx"),
        ("Axx", CaseMatching::Smart, false, "Axx"),
        // chars without case never disable case insensitive matching
        ("你xx", CaseMatching::Smart, true, "你xx"),
        ("你xx", CaseMatching::Ignore, true, "你xx"),
        ("Ⲽxx", CaseMatching::Smart, false, "Ⲽxx"),
        ("Ⲽxx", CaseMatching::Ignore, true, "ⲽxx"),
    ];
    for (input, mode, ignore_case, needle) in cases {
        let pat = Atom::parse(input, mode);
        assert_eq!(pat.ignore_case, ignore_case, "ignore_case of {:?}", input);
        assert_eq!(pat.needle.to_string(), needle, "needle of {:?}", input);
    }
}
/// Escaped spaces and escaped marker characters end up verbatim in the
/// needle and do not influence the atom kind.
///
/// Each row is `(input, expected needle, expected kind)`; `None` means the
/// kind is not checked for that input.
#[test]
fn escape() {
    let cases = [
        ("foo\\ bar", "foo bar", None),
        ("\\!foo", "!foo", Some(AtomKind::Fuzzy)),
        ("\\'foo", "'foo", Some(AtomKind::Fuzzy)),
        ("\\^foo", "^foo", Some(AtomKind::Fuzzy)),
        ("foo\\$", "foo$", Some(AtomKind::Fuzzy)),
        ("^foo\\$", "foo$", Some(AtomKind::Prefix)),
        ("\\^foo\\$", "^foo$", Some(AtomKind::Fuzzy)),
        ("\\!^foo\\$", "!^foo$", Some(AtomKind::Fuzzy)),
        ("!\\^foo\\$", "^foo$", Some(AtomKind::Substring)),
    ];
    for (input, expected_needle, expected_kind) in cases {
        let pat = Atom::parse(input, CaseMatching::Smart);
        assert_eq!(pat.needle.to_string(), expected_needle);
        if let Some(kind) = expected_kind {
            assert_eq!(pat.kind, kind);
        }
    }
}

View File

@ -1,11 +1,15 @@
use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::{Matcher, MatcherConfig};
use crate::{Config, Matcher};
pub(crate) const SCORE_MATCH: u16 = 16;
pub(crate) const PENALTY_GAP_START: u16 = 3;
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
/// If the prefer_prefix option is enabled we want to penalize
/// the initial gap. The prefix should not be too much
pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
// We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over
@ -43,7 +47,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS
// still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
impl MatcherConfig {
impl Config {
#[inline]
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
if class > CharClass::Delimiter {
@ -140,7 +144,15 @@ impl Matcher {
}
prev_class = class;
}
if self.config.prefer_prefix {
    // Matches that begin at the very start of the haystack receive the full
    // prefix bonus; the bonus shrinks as the match starts further in.
    if start != 0 {
        // The offset is clamped to `u16::MAX`, so the multiplication and the
        // addition must saturate: plain `u16` arithmetic would overflow for
        // large offsets (panic in debug builds, wrap-around in release builds).
        let gap = (start - 1).min(u16::MAX as usize) as u16;
        let penalty =
            PENALTY_GAP_START.saturating_add(PENALTY_GAP_START.saturating_mul(gap));
        score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
    } else {
        score += MAX_PREFIX_BONUS;
    }
}
score
}
}

View File

@ -1,10 +1,10 @@
use crate::chars::Char;
use crate::score::{
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
};
use crate::utf32_str::Utf32Str;
use crate::{Matcher, MatcherConfig};
use crate::{Config, Matcher};
use Algorithm::*;
@ -23,12 +23,14 @@ fn assert_matches(
normalize: bool,
case_sensitive: bool,
path: bool,
prefer_prefix: bool,
cases: &[(&str, &str, &[u32], u16)],
) {
let mut config = MatcherConfig {
let mut config = Config {
normalize,
ignore_case: !case_sensitive,
..MatcherConfig::DEFAULT
prefer_prefix,
..Config::DEFAULT
};
if path {
config.set_match_paths();
@ -87,10 +89,10 @@ pub fn assert_not_matches(
path: bool,
cases: &[(&str, &str)],
) {
let mut config = MatcherConfig {
let mut config = Config {
normalize,
ignore_case: !case_sensitive,
..MatcherConfig::DEFAULT
..Config::DEFAULT
};
if path {
config.set_match_paths();
@ -132,8 +134,8 @@ pub fn assert_not_matches(
}
}
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter;
const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter;
#[test]
fn test_fuzzy() {
@ -142,6 +144,7 @@ fn test_fuzzy() {
false,
false,
false,
false,
&[
(
"fooBarbaz1",
@ -250,6 +253,7 @@ fn empty_needle() {
false,
false,
false,
false,
&[("foo bar baz", "", &[], 0)],
);
}
@ -261,6 +265,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo bar baz",
@ -287,6 +292,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo bar baz",
@ -313,6 +319,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"foo",
@ -339,6 +346,7 @@ fn test_substring() {
false,
false,
false,
false,
&[
(
"fooBarbaz1",
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
false,
true,
false,
false,
&[
(
"fooBarbaz1",
@ -418,6 +427,7 @@ fn test_normalize() {
true,
false,
false,
false,
&[
(
"Só Danço Samba",
@ -464,6 +474,7 @@ fn test_unicode() {
true,
false,
false,
false,
&[
(
"你好世界",
@ -488,6 +499,7 @@ fn test_long_str() {
false,
false,
false,
false,
&[(
&"x".repeat(u16::MAX as usize + 1),
"xx",
@ -504,6 +516,7 @@ fn test_casing() {
false,
false,
false,
false,
&[
// these two have the same score
(
@ -536,6 +549,7 @@ fn test_casing() {
],
)
}
#[test]
fn test_optimal() {
assert_matches(
@ -543,6 +557,7 @@ fn test_optimal() {
false,
false,
false,
false,
&[
(
"axxx xx ",
@ -624,3 +639,32 @@ fn test_reject() {
);
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
}
/// With `prefer_prefix` enabled a match starting at the beginning of the
/// haystack receives `MAX_PREFIX_BONUS`, while the expected score for a match
/// far into the haystack contains no prefix bonus at all.
#[test]
fn test_prefer_prefix() {
    let score_at_start = BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
        + MAX_PREFIX_BONUS
        - PENALTY_GAP_START
        - 3 * PENALTY_GAP_EXTENSION;
    let score_far_in = BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1)
        - PENALTY_GAP_START
        - PENALTY_GAP_EXTENSION;
    let cases: [(&str, &str, &[u32], u16); 2] = [
        ("Moby Dick", "md", &[0, 5], score_at_start),
        (
            "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
            "md",
            &[82, 85],
            score_far_in,
        ),
    ];
    assert_matches(
        &[FuzzyOptimal, FuzzyGreedy],
        false,
        false,
        false,
        true,
        &cases,
    );
}

View File

@ -1,6 +1,9 @@
use std::borrow::Cow;
use std::ops::{Bound, RangeBounds};
use std::{fmt, slice};
use crate::chars;
/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
///
/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
@ -51,6 +54,7 @@ impl<'a> Utf32Str<'a> {
}
}
/// Returns the number of characters in this string.
#[inline]
pub fn len(self) -> usize {
match self {
@ -58,6 +62,8 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
}
}
/// Returns whether this string is empty.
#[inline]
pub fn is_empty(self) -> bool {
match self {
@ -66,6 +72,8 @@ impl<'a> Utf32Str<'a> {
}
}
/// Creates a slice with a string that contains the characters in
/// the specified **character range**.
#[inline]
pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
let start = match range.start_bound() {
@ -86,7 +94,7 @@ impl<'a> Utf32Str<'a> {
/// Returns the number of leading whitespaces in this string
#[inline]
pub fn leading_white_space(self) -> usize {
pub(crate) fn leading_white_space(self) -> usize {
match self {
Utf32Str::Ascii(bytes) => bytes
.iter()
@ -101,7 +109,7 @@ impl<'a> Utf32Str<'a> {
/// Returns the number of trailing whitespaces in this string
#[inline]
pub fn trailing_white_space(self) -> usize {
pub(crate) fn trailing_white_space(self) -> usize {
match self {
Utf32Str::Ascii(bytes) => bytes
.iter()
@ -117,7 +125,7 @@ impl<'a> Utf32Str<'a> {
}
/// Same as `slice` but accepts a u32 range for convenience since
/// those are the indices returned by the matcher
/// those are the indices returned by the matcher.
#[inline]
pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
let start = match range.start_bound() {
@ -135,29 +143,34 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
}
}
/// Returns whether this string only contains ascii text.
pub fn is_ascii(self) -> bool {
matches!(self, Utf32Str::Ascii(_))
}
pub fn get(self, idx: u32) -> char {
/// Returns the `n`th character in this string.
pub fn get(self, n: u32) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[n as usize],
}
}
pub fn last(self) -> char {
pub(crate) fn last(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
}
}
pub fn first(self) -> char {
pub(crate) fn first(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[0] as char,
Utf32Str::Unicode(codepoints) => codepoints[0],
}
}
/// Returns an iterator over the characters in this string
pub fn chars(self) -> Chars<'a> {
match self {
Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
@ -165,6 +178,7 @@ impl<'a> Utf32Str<'a> {
}
}
}
impl fmt::Debug for Utf32Str<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
@ -209,3 +223,133 @@ impl DoubleEndedIterator for Chars<'_> {
}
}
}
/// An owned version of [`Utf32Str`].
///
/// A string is stored in one of two representations, see the variants below.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
pub enum Utf32String {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
Ascii(Box<str>),
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(Box<[char]>),
}
impl Default for Utf32String {
fn default() -> Self {
Self::Ascii(String::new().into_boxed_str())
}
}
impl Utf32String {
    /// Returns the number of characters in this string.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            Utf32String::Ascii(text) => text.len(),
            Utf32String::Unicode(chars) => chars.len(),
        }
    }

    /// Returns whether this string is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Creates a borrowed [`Utf32Str`] that contains the characters in the
    /// specified **character range**.
    ///
    /// # Panics
    ///
    /// Panics if the range is out of bounds.
    #[inline]
    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str {
        let from = match range.start_bound() {
            Bound::Unbounded => 0,
            Bound::Included(&first) => first,
            Bound::Excluded(&before) => before + 1,
        };
        let to = match range.end_bound() {
            Bound::Unbounded => self.len(),
            Bound::Included(&last) => last + 1,
            Bound::Excluded(&past) => past,
        };
        match self {
            Utf32String::Unicode(chars) => Utf32Str::Unicode(&chars[from..to]),
            Utf32String::Ascii(text) => Utf32Str::Ascii(&text.as_bytes()[from..to]),
        }
    }

    /// Same as `slice` but accepts a u32 range for convenience since
    /// those are the indices returned by the matcher.
    #[inline]
    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
        // resolve the bounds in `u32` first, then reuse the `usize` slicing
        let from = match range.start_bound() {
            Bound::Unbounded => 0,
            Bound::Included(&first) => first,
            Bound::Excluded(&before) => before + 1,
        };
        let to = match range.end_bound() {
            Bound::Unbounded => self.len() as u32,
            Bound::Included(&last) => last + 1,
            Bound::Excluded(&past) => past,
        };
        self.slice(from as usize..to as usize)
    }
}
/// Copies `value` into an owned string: pure ASCII text is stored as is,
/// anything else is stored as the chars yielded by `chars::graphemes`.
impl From<&str> for Utf32String {
    #[inline]
    fn from(value: &str) -> Self {
        if !value.is_ascii() {
            return Self::Unicode(chars::graphemes(value).collect());
        }
        Self::Ascii(Box::from(value))
    }
}
/// Converts an owned boxed string; pure ASCII text reuses the existing
/// allocation, anything else is stored as the chars yielded by
/// `chars::graphemes`.
impl From<Box<str>> for Utf32String {
    fn from(value: Box<str>) -> Self {
        if !value.is_ascii() {
            return Self::Unicode(chars::graphemes(&value).collect());
        }
        Self::Ascii(value)
    }
}
/// Converts an owned `String` by going through the `Box<str>` conversion.
impl From<String> for Utf32String {
    #[inline]
    fn from(value: String) -> Self {
        Self::from(value.into_boxed_str())
    }
}
/// Converts a `Cow<str>` by dispatching to the borrowed (`&str`) or owned
/// (`String`) conversion.
impl<'a> From<Cow<'a, str>> for Utf32String {
    #[inline]
    fn from(value: Cow<'a, str>) -> Self {
        match value {
            Cow::Owned(text) => Self::from(text),
            Cow::Borrowed(text) => Self::from(text),
        }
    }
}
/// Debug output is the debug output of the borrowed [`Utf32Str`] view.
impl fmt::Debug for Utf32String {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = self.slice(..);
        write!(f, "{text:?}")
    }
}
/// Display output is the display output of the borrowed [`Utf32Str`] view.
impl fmt::Display for Utf32String {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = self.slice(..);
        write!(f, "{text}")
    }
}

View File

@ -1,4 +1,31 @@
use std::cmp::Reverse;
/*!
`nucleo` is a high level crate that provides a high level matcher API built
around a highly effective (parallel) matcher worker. It's designed to allow
quickly plugging a fully featured (and faster) fzf/skim like fuzzy matcher into
your TUI application.
It's designed to run matching on a background threadpool while providing a
snapshot of the last complete match. That means the matcher can update the
results live while the user is typing while never blocking the main UI thread
(beyond a user provided timeout). Nucleo also supports fully concurrent lock-free
(and wait-free) streaming of input items.
The [`Nucleo`] struct serves as the main API entrypoint for this crate.
# Status
Nucleo is used in the helix-editor and therefore has a large user base with lots
of real world testing. The core matcher implementation is considered complete
and is unlikely to see major changes. The `nucleo-matcher` crate is finished and
ready for widespread use, breaking changes should be very rare (a 1.0 release
should not be far away).
While the high level `nucleo` crate also works well (and is also used in helix),
there are still additional features that will be added in the future. The high
level crate also needs better documentation and will likely see a few API
changes in the future.
*/
use std::ops::{Bound, RangeBounds};
use std::sync::atomic::{self, AtomicBool, Ordering};
use std::sync::Arc;
@ -7,22 +34,25 @@ use std::time::Duration;
use parking_lot::Mutex;
use rayon::ThreadPool;
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::utf32_string::Utf32String;
use crate::pattern::MultiPattern;
use crate::worker::Worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
pub use nucleo_matcher::{chars, Config, Matcher, Utf32Str, Utf32String};
mod boxcar;
mod par_sort;
mod pattern;
mod utf32_string;
pub mod pattern;
mod worker;
/// A match candidate stored in a [`Nucleo`] worker.
pub struct Item<'a, T> {
/// Borrowed user data of this candidate (presumably the value that was
/// handed to `Injector::push` — confirm).
pub data: &'a T,
/// The text columns of this candidate (presumably the columns filled by the
/// `fill_columns` callback of `Injector::push` — confirm).
pub matcher_columns: &'a [Utf32String],
}
/// A handle that allows adding new items to a [`Nucleo`] worker.
///
/// It's internally reference counted and can be cheaply cloned
/// and sent across threads.
pub struct Injector<T> {
items: Arc<boxcar::Vec<T>>,
notify: Arc<(dyn Fn() + Sync + Send)>,
@ -38,15 +68,17 @@ impl<T> Clone for Injector<T> {
}
impl<T> Injector<T> {
/// Appends an element to the back of the vector.
/// Appends an element to the list of matched items.
/// This function is lock-free and wait-free.
pub fn push(&self, value: T, fill_columns: impl FnOnce(&mut [Utf32String])) -> u32 {
let idx = self.items.push(value, fill_columns);
(self.notify)();
idx
}
/// Returns the total number of items in the current
/// queue
/// Returns the total number of items injected in the matcher. This might
/// not match the number of items in the match snapshot (if the matcher
/// is still running)
pub fn injected_items(&self) -> u32 {
self.items.count()
}
@ -69,18 +101,24 @@ impl<T> Injector<T> {
}
}
/// An [item](crate::Item) that was successfully matched by a [`Nucleo`] worker.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match {
/// The score of this match.
pub score: u32,
/// Index of the matched item (presumably the index returned by
/// `Injector::push` — confirm).
pub idx: u32,
}
/// The status of a [`Nucleo`] worker after a match.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Status {
/// Whether the current snapshot has changed.
pub changed: bool,
/// Whether the matcher is still processing in the background.
pub running: bool,
}
/// A snapshot represents the results of a [`Nucleo`] worker after
/// finishing a [`tick`](Nucleo::tick).
pub struct Snapshot<T: Sync + Send + 'static> {
item_count: u32,
matches: Vec<Match>,
@ -181,6 +219,8 @@ impl<T: Sync + Send + 'static> Snapshot<T> {
}
}
/// A high level matcher worker that quickly computes matches in a background
/// threadpool.
pub struct Nucleo<T: Sync + Send + 'static> {
// the way the API is build we totally don't actually need these to be Arcs
// but this lets us avoid some unsafe
@ -192,15 +232,31 @@ pub struct Nucleo<T: Sync + Send + 'static> {
items: Arc<boxcar::Vec<T>>,
notify: Arc<(dyn Fn() + Sync + Send)>,
snapshot: Snapshot<T>,
/// The pattern matched by this matcher. To update the match pattern
/// [`MultiPattern::reparse`](`pattern::MultiPattern::reparse`) should be used.
/// Note that the matcher worker will only become aware of the new pattern
/// after a call to [`tick`](Nucleo::tick).
pub pattern: MultiPattern,
}
impl<T: Sync + Send + 'static> Nucleo<T> {
/// Constructs a new `nucleo` worker threadpool with the provided `config`.
///
/// `notify` is called every time new information is available and
/// [`tick`](Nucleo::tick) should be called. Note that `notify` is not
/// debounced, that should be handled by the downstream crate (for example
/// debouncing to only redraw at most every 1/60 seconds).
///
/// If `None` is passed for the number of worker threads, nucleo will use
/// one thread per hardware thread.
///
/// Nucleo can match items with multiple orthogonal properties. `columns`
/// indicates how many matching columns each item (and the pattern) has. The
/// number of columns can not be changed after construction.
pub fn new(
config: MatcherConfig,
config: Config,
notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>,
case_matching: CaseMatching,
columns: u32,
) -> Self {
let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns);
@ -209,10 +265,10 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
should_notify: worker.should_notify.clone(),
items: worker.items.clone(),
pool,
pattern: MultiPattern::new(&config, case_matching, columns as usize),
pattern: MultiPattern::new(columns as usize),
snapshot: Snapshot {
matches: Vec::with_capacity(2 * 1024),
pattern: MultiPattern::new(&config, case_matching, columns as usize),
pattern: MultiPattern::new(columns as usize),
item_count: 0,
items: worker.items.clone(),
},
@ -222,11 +278,12 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
}
}
/// Returns a snapshot of all items
/// Returns a snapshot of the current matcher state.
pub fn snapshot(&self) -> &Snapshot<T> {
&self.snapshot
}
/// Returns an injector that can be used for adding candidates to the matcher.
pub fn injector(&self) -> Injector<T> {
Injector {
items: self.items.clone(),
@ -234,11 +291,11 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
}
}
/// Restart the the item stream. Removes all items disconnects all
/// previously created injectors from this instance. If `clear_snapshot` is
/// `true` then all items and matched are removed from the
/// [`Snapshot`](crate::Snapshot) immediately. Otherwise the snapshot will
/// keep the current matches until the matcher has run again.
/// Restart the item stream. Removes all items and disconnects all
/// previously created injectors from this instance. If `clear_snapshot`
/// is `true` then all items and matches are removed from the
/// [`Snapshot`](crate::Snapshot) immediately. Otherwise the snapshot will
/// keep the current matches until the matcher has run again.
///
/// # Note
///
@ -254,10 +311,14 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
}
}
pub fn update_config(&mut self, config: MatcherConfig) {
pub fn update_config(&mut self, config: Config) {
self.worker.lock().update_config(config)
}
/// The main way to interact with the matcher, this should be called
/// regularly (for example each time a frame is rendered). To avoid
/// excessive redraws this method will wait `timeout` milliseconds for the
/// worker thread to finish. It is recommended to set the timeout to 10ms.
pub fn tick(&mut self, timeout: u64) -> Status {
self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status();
@ -278,7 +339,10 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
} else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
self.should_notify.store(true, Ordering::Release);
return Status{ changed: false, running: true };
return Status {
changed: false,
running: true,
};
};
worker
};
@ -320,31 +384,3 @@ impl<T: Sync + Send> Drop for Nucleo<T> {
}
}
}
/// convenience function to easily fuzzy match
/// on a (relatively small) list of inputs. This is not recommended for building a full tui
/// application that can match large numbers of matches as all matching is done on the current
/// thread, effectively blocking the UI
pub fn fuzzy_match<T: AsRef<str>>(
matcher: &mut Matcher,
pattern: &str,
items: impl IntoIterator<Item = T>,
case_matching: CaseMatching,
) -> Vec<(T, u32)> {
let mut pattern_ = Pattern::new(&matcher.config, case_matching);
pattern_.set_literal(pattern, PatternKind::Fuzzy, false);
if pattern_.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
pattern_
.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}

View File

@ -1,188 +1,12 @@
use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern};
use nucleo_matcher::{Matcher, Utf32String};
#[cfg(test)]
mod tests;
use crate::Utf32String;
/// How the case of a pattern is treated when an atom is built from it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum CaseMatching {
/// Always ignore case; the stored needle is lowercased.
Ignore,
/// Ignore case unless the needle contains an uppercase character.
Smart,
/// Never ignore case; the needle is matched as written.
Respect,
}
/// The matching strategy used for a single pattern atom.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
pub enum PatternKind {
/// Scored with the matcher's `exact_match` (parsed from e.g. `^foo$`).
Exact,
/// Scored with the matcher's `fuzzy_match` (the default when no operator is given).
Fuzzy,
/// Scored with the matcher's `substring_match` (parsed from e.g. `'foo`).
Substring,
/// Scored with the matcher's `prefix_match` (parsed from e.g. `^foo`).
Prefix,
/// Scored with the matcher's `postfix_match` (parsed from e.g. `foo$`).
Postfix,
}
/// A single parsed piece of a pattern (one needle plus how to match it).
#[derive(Debug, PartialEq, Eq, Clone)]
struct PatternAtom {
// Which matcher routine is used for this atom.
kind: PatternKind,
// The text to search for, with operators and escapes already removed.
needle: Utf32String,
// If set, a haystack is rejected when this atom matches it.
invert: bool,
// Copied into the matcher config before this atom is matched.
ignore_case: bool,
}
impl PatternAtom {
/// Builds a non-inverted atom of the given `kind` that matches `needle`
/// literally.
///
/// * `normalize`: pass every character of a non-ASCII needle through
///   `chars::normalize` (ASCII needles are stored unchanged).
/// * `case`: decides the `ignore_case` flag of the atom. `Ignore`
///   additionally lowercases the stored needle, `Smart` ignores case only if
///   the needle contains no uppercase character.
/// * `escape_whitespace`: if set, the sequence `\ ` is stored as a plain
///   space. Every other backslash is kept verbatim.
fn literal(
    needle: &str,
    normalize: bool,
    case: CaseMatching,
    kind: PatternKind,
    escape_whitespace: bool,
) -> PatternAtom {
    let mut ignore_case;
    let needle = if needle.is_ascii() {
        let mut needle = if escape_whitespace {
            needle.replace("\\ ", " ")
        } else {
            needle.to_owned()
        };
        match case {
            CaseMatching::Ignore => {
                ignore_case = true;
                needle.make_ascii_lowercase()
            }
            CaseMatching::Smart => {
                ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
            }
            CaseMatching::Respect => ignore_case = false,
        }
        Utf32String::Ascii(needle.into_boxed_str())
    } else {
        let mut needle_ = Vec::with_capacity(needle.len());
        ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
        // Normalizes / case-folds one char and keeps the smart-case flag up to date.
        let mut fold = |mut c: char| {
            if normalize {
                c = chars::normalize(c);
            }
            match case {
                CaseMatching::Ignore => c = chars::to_lower_case(c),
                CaseMatching::Smart => {
                    ignore_case = ignore_case && !chars::is_upper_case(c);
                }
                CaseMatching::Respect => (),
            }
            c
        };
        if escape_whitespace {
            // A backslash is held back until the next char is known, so that
            // `\ ` collapses to a single space while any other backslash is
            // emitted exactly once (matching the ASCII branch above).
            let mut saw_backslash = false;
            for c in chars::graphemes(needle) {
                if saw_backslash {
                    saw_backslash = false;
                    if c == ' ' {
                        needle_.push(' ');
                        continue;
                    }
                    needle_.push('\\');
                }
                if c == '\\' {
                    saw_backslash = true;
                } else {
                    needle_.push(fold(c));
                }
            }
            // A trailing backslash escapes nothing and is kept as is.
            if saw_backslash {
                needle_.push('\\');
            }
        } else {
            needle_.extend(chars::graphemes(needle).map(&mut fold));
        }
        Utf32String::Unicode(needle_.into_boxed_slice())
    };
    PatternAtom {
        kind,
        needle,
        invert: false,
        ignore_case,
    }
}
/// Parses a single whitespace-delimited atom of a pattern.
///
/// Recognized operators: a leading `!` inverts the atom, a leading `^`
/// requests a prefix match, a leading `'` a substring match and a trailing
/// `$` a postfix match (or an exact match when combined with `^` or `'`).
/// Each operator can be escaped with a backslash to be matched literally.
/// An inverted atom without any other operator is matched as a substring.
fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom {
    // Inversion operator (or its escaped form, where only the backslash is dropped).
    let (invert, mut atom) = if let Some(rest) = raw.strip_prefix('!') {
        (true, rest)
    } else if raw.starts_with("\\!") {
        (false, &raw[1..])
    } else {
        (false, raw)
    };

    // Leading kind operator.
    let mut kind = if let Some(rest) = atom.strip_prefix('^') {
        atom = rest;
        PatternKind::Prefix
    } else if let Some(rest) = atom.strip_prefix('\'') {
        atom = rest;
        PatternKind::Substring
    } else {
        if atom.starts_with("\\^") || atom.starts_with("\\'") {
            atom = &atom[1..];
        }
        PatternKind::Fuzzy
    };

    // Trailing `$`: an escaped one is re-appended to the needle afterwards.
    let append_dollar = atom.ends_with("\\$");
    if append_dollar {
        atom = &atom[..atom.len() - 2];
    } else if let Some(rest) = atom.strip_suffix('$') {
        kind = if kind == PatternKind::Fuzzy {
            PatternKind::Postfix
        } else {
            PatternKind::Exact
        };
        atom = rest;
    }

    if invert && kind == PatternKind::Fuzzy {
        kind = PatternKind::Substring;
    }

    let mut parsed = PatternAtom::literal(atom, normalize, case, kind, true);
    parsed.invert = invert;
    if append_dollar {
        parsed.needle.push('$');
    }
    parsed
}
}
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
pub enum Status {
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Default)]
pub(crate) enum Status {
#[default]
Unchanged,
Update,
Rescore,
@ -190,7 +14,7 @@ pub enum Status {
#[derive(Debug)]
pub struct MultiPattern {
pub cols: Vec<Pattern>,
cols: Vec<(Pattern, Status)>,
}
impl Clone for MultiPattern {
@ -206,214 +30,64 @@ impl Clone for MultiPattern {
}
impl MultiPattern {
pub fn new(
matcher_config: &MatcherConfig,
case_matching: CaseMatching,
columns: usize,
) -> MultiPattern {
MultiPattern {
cols: vec![Pattern::new(matcher_config, case_matching); columns],
/// Creates a multi pattern with `columns` empty column patterns.
pub fn new(columns: usize) -> Self {
Self {
cols: vec![Default::default(); columns],
}
}
/// Reparses a column. By specifying `append` the caller promises that text passed
/// to the previous `reparse` invocation is a prefix of `new_text`. This enables
/// additional optimizations but can lead to missing matches if an incorrect value
/// is passed.
pub fn reparse(
&mut self,
column: usize,
new_text: &str,
case_matching: CaseMatching,
append: bool,
) {
let old_status = self.cols[column].1;
if append
&& old_status != Status::Rescore
&& self.cols[column]
.0
.atoms
.last()
.map_or(true, |last| !last.negative)
{
self.cols[column].1 = Status::Update;
} else {
self.cols[column].1 = Status::Rescore;
}
self.cols[column].0.reparse(new_text, case_matching);
}
pub(crate) fn status(&self) -> Status {
self.cols
.iter()
.map(|col| col.status)
.map(|&(_, status)| status)
.max()
.unwrap_or(Status::Unchanged)
}
pub(crate) fn reset_status(&mut self) {
for col in &mut self.cols {
col.status = Status::Unchanged
for (_, status) in &mut self.cols {
*status = Status::Unchanged
}
}
pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option<u32> {
// TODO: weight columns?
let mut score = 0;
for (pattern, haystack) in self.cols.iter().zip(haystack) {
for ((pattern, _), haystack) in self.cols.iter().zip(haystack) {
score += pattern.score(haystack.slice(..), matcher)?
}
Some(score)
}
}
/// A parsed pattern: a list of atoms that a haystack is scored against.
#[derive(Debug)]
pub struct Pattern {
// The parsed atoms; with no atoms every haystack scores 0.
atoms: Vec<PatternAtom>,
// Case handling applied whenever the pattern text is (re)parsed.
case_matching: CaseMatching,
// Taken from the matcher config; forwarded to atom parsing.
normalize: bool,
// Set by `parse_from`/`set_literal`: `Update` after an append, `Rescore` otherwise.
status: Status,
}
impl Pattern {
/// Creates a pattern without any atoms. The `normalize` setting is taken
/// from `matcher_config` and the status starts out as `Unchanged`.
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
    let normalize = matcher_config.normalize;
    Pattern {
        atoms: Vec::new(),
        case_matching,
        normalize,
        status: Status::Unchanged,
    }
}
pub fn new_fuzzy_literal(
matcher_config: &MatcherConfig,
case_matching: CaseMatching,
pattern: &str,
) -> Pattern {
let mut res = Pattern {
atoms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
};
res.set_literal(pattern, PatternKind::Fuzzy, false);
res
}
/// Scores `haystack` against every atom of this pattern.
///
/// Returns the sum of the scores of all non-inverted atoms, or `None` if a
/// non-inverted atom does not match or an inverted atom does match. A
/// pattern without atoms yields `Some(0)`.
///
/// Note that `matcher.config.ignore_case` is overwritten for each atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
    let mut total = 0;
    for atom in &self.atoms {
        matcher.config.ignore_case = atom.ignore_case;
        let needle = atom.needle.slice(..);
        let atom_score = match atom.kind {
            PatternKind::Exact => matcher.exact_match(haystack, needle),
            PatternKind::Fuzzy => matcher.fuzzy_match(haystack, needle),
            PatternKind::Substring => matcher.substring_match(haystack, needle),
            PatternKind::Prefix => matcher.prefix_match(haystack, needle),
            PatternKind::Postfix => matcher.postfix_match(haystack, needle),
        };
        match (atom.invert, atom_score) {
            // An inverted atom that matches rejects the haystack.
            (true, Some(_)) => return None,
            (true, None) => {}
            (false, matched) => total += matched? as u32,
        }
    }
    Some(total)
}
/// Like `score`, but non-inverted atoms are matched with the matcher's
/// `*_indices` routines, which are handed `indices` as output buffer.
///
/// Returns the summed score of all non-inverted atoms, or `None` if a
/// non-inverted atom does not match or an inverted atom does match.
/// Inverted atoms are only checked and never passed `indices`.
///
/// Note that `matcher.config.ignore_case` is overwritten for each atom.
pub fn indices(
    &self,
    haystack: Utf32Str<'_>,
    matcher: &mut Matcher,
    indices: &mut Vec<u32>,
) -> Option<u32> {
    let mut total = 0;
    for atom in &self.atoms {
        matcher.config.ignore_case = atom.ignore_case;
        let needle = atom.needle.slice(..);

        if atom.invert {
            let rejected = match atom.kind {
                PatternKind::Exact => matcher.exact_match(haystack, needle),
                PatternKind::Fuzzy => matcher.fuzzy_match(haystack, needle),
                PatternKind::Substring => matcher.substring_match(haystack, needle),
                PatternKind::Prefix => matcher.prefix_match(haystack, needle),
                PatternKind::Postfix => matcher.postfix_match(haystack, needle),
            }
            .is_some();
            if rejected {
                return None;
            }
            continue;
        }

        let atom_score = match atom.kind {
            PatternKind::Exact => matcher.exact_indices(haystack, needle, indices),
            PatternKind::Fuzzy => matcher.fuzzy_indices(haystack, needle, indices),
            PatternKind::Substring => matcher.substring_indices(haystack, needle, indices),
            PatternKind::Prefix => matcher.prefix_indices(haystack, needle, indices),
            PatternKind::Postfix => matcher.postfix_indices(haystack, needle, indices),
        };
        total += atom_score? as u32;
    }
    Some(total)
}
/// Replaces the atoms of this pattern with the ones parsed from `pattern`.
/// Atoms whose needle is empty are discarded.
///
/// The status becomes `Update` only if `append` is set, the previously last
/// atom was not inverted and the status is not already `Rescore`; in every
/// other case it becomes `Rescore`.
pub fn parse_from(&mut self, pattern: &str, append: bool) {
    // Must be checked before the old atoms are thrown away.
    let last_was_inverted = matches!(self.atoms.last(), Some(prev) if prev.invert);
    let (normalize, case_matching) = (self.normalize, self.case_matching);

    self.atoms.clear();
    for raw in pattern_atoms(pattern) {
        let atom = PatternAtom::parse(raw, normalize, case_matching);
        if !atom.needle.is_empty() {
            self.atoms.push(atom);
        }
    }

    let incremental = append && !last_was_inverted && self.status != Status::Rescore;
    self.status = if incremental {
        Status::Update
    } else {
        Status::Rescore
    };
}
/// Replaces the atoms of this pattern with a single atom that matches
/// `pattern` literally as `kind` (no operators, no escape sequences). An
/// empty `pattern` leaves the pattern without atoms.
///
/// The status becomes `Update` if `append` is set and the status is not
/// already `Rescore`; otherwise it becomes `Rescore`.
pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) {
    let atom = PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false);
    self.atoms.clear();
    if !atom.needle.is_empty() {
        self.atoms.push(atom);
    }

    let incremental = append && self.status != Status::Rescore;
    self.status = if incremental {
        Status::Update
    } else {
        Status::Rescore
    };
}
pub fn is_empty(&self) -> bool {
self.atoms.is_empty()
self.cols.iter().all(|(pat, _)| pat.atoms.is_empty())
}
}
// Implemented by hand (instead of derived) so that `clone_from` can forward
// to `Vec::clone_from` for the atom list.
impl Clone for Pattern {
    fn clone(&self) -> Self {
        Self {
            status: self.status,
            normalize: self.normalize,
            case_matching: self.case_matching,
            atoms: self.atoms.clone(),
        }
    }

    fn clone_from(&mut self, source: &Self) {
        self.status = source.status;
        self.normalize = source.normalize;
        self.case_matching = source.case_matching;
        self.atoms.clone_from(&source.atoms);
    }
}
/// Splits `pattern` into its raw atoms at every space that is not directly
/// preceded by a backslash. Escape sequences are left untouched and
/// consecutive separators yield empty atoms.
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
    // Whether the previously inspected char was a backslash.
    let mut escaped = false;
    pattern.split(move |ch: char| {
        let is_separator = ch == ' ' && !escaped;
        if !is_separator {
            escaped = ch == '\\';
        }
        is_separator
    })
}

View File

@ -1,145 +1,14 @@
use crate::pattern::{PatternAtom, Status};
use crate::{CaseMatching, Pattern, PatternKind};
use nucleo_matcher::pattern::CaseMatching;
/// Parses `pat` with smart case matching and returns its only atom.
fn parse_atom(pat: &str) -> PatternAtom {
    let case_matching = CaseMatching::Smart;
    parse_atom_with(pat, case_matching)
}
/// Parses `pat` with the given case matching, asserts that this produced
/// exactly one atom and returns that atom.
fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom {
    let mut parsed = parse_with(pat, case_matching, false);
    assert_eq!(parsed.atoms.len(), 1);
    parsed.atoms.swap_remove(0)
}
/// Parses `pat` into a fresh pattern that uses the default matcher config.
fn parse_with(pat: &str, case_matching: CaseMatching, append: bool) -> Pattern {
    let mut parsed = Pattern::new(&nucleo_matcher::MatcherConfig::DEFAULT, case_matching);
    parsed.parse_from(pat, append);
    parsed
}
/// A leading `!` inverts an atom; an inverted atom without any other
/// operator is matched as a substring.
#[test]
fn negative() {
    let cases = [
        ("!foo", PatternKind::Substring),
        ("!^foo", PatternKind::Prefix),
        ("!foo$", PatternKind::Postfix),
        ("!^foo$", PatternKind::Exact),
    ];
    for (raw, expected_kind) in cases {
        let atom = parse_atom(raw);
        assert!(atom.invert);
        assert_eq!(atom.kind, expected_kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }
}
/// The `'`, `^` and `$` operators select the kind of a non-inverted atom.
#[test]
fn pattern_kinds() {
    let cases = [
        ("foo", PatternKind::Fuzzy),
        ("'foo", PatternKind::Substring),
        ("^foo", PatternKind::Prefix),
        ("foo$", PatternKind::Postfix),
        ("^foo$", PatternKind::Exact),
    ];
    for (raw, expected_kind) in cases {
        let atom = parse_atom(raw);
        assert!(!atom.invert);
        assert_eq!(atom.kind, expected_kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }
}
/// Checks the `ignore_case` flag and the stored needle for ASCII and
/// non-ASCII patterns under every case matching mode.
#[test]
fn case_matching() {
    // (pattern, case matching, expected `ignore_case`, expected needle)
    let cases = [
        ("foo", CaseMatching::Smart, true, "foo"),
        ("Foo", CaseMatching::Smart, false, "Foo"),
        ("Foo", CaseMatching::Ignore, true, "foo"),
        ("Foo", CaseMatching::Respect, false, "Foo"),
        ("Foo", CaseMatching::Respect, false, "Foo"),
        ("Äxx", CaseMatching::Ignore, true, "axx"),
        ("Äxx", CaseMatching::Respect, false, "Axx"),
        ("Äxx", CaseMatching::Smart, false, "Axx"),
        ("Äxx", CaseMatching::Smart, false, "Axx"),
        ("你xx", CaseMatching::Smart, true, "你xx"),
        ("你xx", CaseMatching::Ignore, true, "你xx"),
        ("Ⲽxx", CaseMatching::Smart, false, "Ⲽxx"),
        ("Ⲽxx", CaseMatching::Ignore, true, "ⲽxx"),
    ];
    for (raw, mode, expected_ignore_case, expected_needle) in cases {
        let atom = parse_atom_with(raw, mode);
        assert_eq!(atom.ignore_case, expected_ignore_case);
        assert_eq!(atom.needle.to_string(), expected_needle);
    }
}
/// Backslash-escaped spaces and operators end up literally in the needle
/// and do not affect the kind of the atom.
#[test]
fn escape() {
    // (pattern, expected needle, expected kind if it is checked)
    let cases = [
        ("foo\\ bar", "foo bar", None),
        ("\\!foo", "!foo", Some(PatternKind::Fuzzy)),
        ("\\'foo", "'foo", Some(PatternKind::Fuzzy)),
        ("\\^foo", "^foo", Some(PatternKind::Fuzzy)),
        ("foo\\$", "foo$", Some(PatternKind::Fuzzy)),
        ("^foo\\$", "foo$", Some(PatternKind::Prefix)),
        ("\\^foo\\$", "^foo$", Some(PatternKind::Fuzzy)),
        ("\\!^foo\\$", "!^foo$", Some(PatternKind::Fuzzy)),
        ("!\\^foo\\$", "^foo$", Some(PatternKind::Substring)),
    ];
    for (raw, expected_needle, expected_kind) in cases {
        let atom = parse_atom(raw);
        assert_eq!(atom.needle.to_string(), expected_needle);
        if let Some(kind) = expected_kind {
            assert_eq!(atom.kind, kind);
        }
    }
}
use crate::pattern::{MultiPattern, Status};
#[test]
fn append() {
let mut pat = parse_with("!", CaseMatching::Smart, true);
assert_eq!(pat.status, Status::Update);
pat.parse_from("!f", true);
assert_eq!(pat.status, Status::Update);
pat.parse_from("!fo", true);
assert_eq!(pat.status, Status::Rescore);
let mut pat = MultiPattern::new(1);
pat.reparse(0, "!", CaseMatching::Smart, true);
assert_eq!(pat.status(), Status::Update);
pat.reparse(0, "!f", CaseMatching::Smart, true);
assert_eq!(pat.status(), Status::Update);
pat.reparse(0, "!fo", CaseMatching::Smart, true);
assert_eq!(pat.status(), Status::Rescore);
}

View File

@ -3,7 +3,7 @@ use std::mem::take;
use std::sync::atomic::{self, AtomicBool, AtomicU32};
use std::sync::Arc;
use nucleo_matcher::MatcherConfig;
use nucleo_matcher::Config;
use parking_lot::Mutex;
use rayon::{prelude::*, ThreadPool};
@ -42,15 +42,15 @@ impl<T: Sync + Send + 'static> Worker<T> {
/// Returns `last_snapshot` minus the number of entries currently held in
/// `in_flight`.
// NOTE(review): presumably the number of items already available to the
// matcher; confirm against the snapshot logic.
pub(crate) fn item_count(&self) -> u32 {
    let pending = self.in_flight.len() as u32;
    self.last_snapshot - pending
}
pub(crate) fn update_config(&mut self, config: MatcherConfig) {
pub(crate) fn update_config(&mut self, config: Config) {
for matcher in self.matchers.0.iter_mut() {
matcher.get_mut().config = config;
matcher.get_mut().config = config.clone();
}
}
pub(crate) fn new(
worker_threads: Option<usize>,
config: MatcherConfig,
config: Config,
notify: Arc<(dyn Fn() + Sync + Send)>,
cols: u32,
) -> (ThreadPool, Self) {
@ -62,7 +62,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
.build()
.expect("creating threadpool failed");
let matchers = (0..worker_threads)
.map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config)))
.map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config.clone())))
.collect();
let worker = Worker {
running: false,
@ -70,7 +70,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
last_snapshot: 0,
matches: Vec::new(),
// just a placeholder
pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0),
pattern: MultiPattern::new(cols as usize),
canceled: Arc::new(AtomicBool::new(false)),
should_notify: Arc::new(AtomicBool::new(false)),
was_canceled: false,
@ -102,14 +102,20 @@ impl<T: Sync + Send + 'static> Worker<T> {
let Some(item) = item else {
in_flight.lock().push(idx);
unmatched.fetch_add(1, atomic::Ordering::Relaxed);
return Match { score: 0, idx: u32::MAX };
return Match {
score: 0,
idx: u32::MAX,
};
};
if self.canceled.load(atomic::Ordering::Relaxed) {
return Match { score: 0, idx };
}
let Some(score) = pattern.score(item.matcher_columns, matchers.get()) else {
unmatched.fetch_add(1, atomic::Ordering::Relaxed);
return Match { score: 0, idx: u32::MAX };
return Match {
score: 0,
idx: u32::MAX,
};
};
Match { score, idx }
});
@ -156,7 +162,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
}
// TODO: be smarter around reusing past results for rescoring
if self.pattern.cols.iter().all(|pat| pat.is_empty()) {
if self.pattern.is_empty() {
self.reset_matches();
self.process_new_items_trivial();
if self.should_notify.load(atomic::Ordering::Relaxed) {

View File

@ -1,3 +1,3 @@
default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"]
[files]
extend-exclude = ["matcher/src/tests.rs", "*.html"]
extend-exclude = ["matcher/src/tests.rs","src/pattern/tests.rs", "*.html"]