Merge pull request #16 from helix-editor/release_nucleo_matcher

Co-authored-by: Michael Davis <mcarsondavis@gmail.com>
This commit is contained in:
Pascal Kuthe 2023-08-29 00:21:07 +02:00 committed by GitHub
commit 2de732889f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 1194 additions and 667 deletions

10
CHANGELOG.md Normal file
View File

@ -0,0 +1,10 @@
# Changelog
## nucleo-matcher
### [0.2.0] - 2023-09-01
*initial public release*
[0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0

4
Cargo.lock generated
View File

@ -152,7 +152,7 @@ dependencies = [
[[package]] [[package]]
name = "nucleo" name = "nucleo"
version = "0.1.0" version = "0.2.0"
dependencies = [ dependencies = [
"nucleo-matcher", "nucleo-matcher",
"parking_lot", "parking_lot",
@ -161,7 +161,7 @@ dependencies = [
[[package]] [[package]]
name = "nucleo-matcher" name = "nucleo-matcher"
version = "0.1.0" version = "0.2.0"
dependencies = [ dependencies = [
"cov-mark", "cov-mark",
"memchr", "memchr",

View File

@ -2,7 +2,7 @@
name = "nucleo" name = "nucleo"
description = "plug and play high performance fuzzy matcher" description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"] authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.1" version = "0.2.0"
edition = "2021" edition = "2021"
license = "MPL-2.0" license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo" repository = "https://github.com/helix-editor/nucleo"
@ -11,7 +11,7 @@ readme = "README.md"
[lib] [lib]
[dependencies] [dependencies]
nucleo-matcher = { version = "0.1", path = "matcher" } nucleo-matcher = { version = "0.2.0", path = "matcher" }
parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]} parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]}
rayon = "1.7.0" rayon = "1.7.0"

View File

@ -1,10 +1,5 @@
# Nucleo # Nucleo
> Disclaimer: An 0.1 version has been published to crates.io.
> This allows us to merge the `nucleo` integration into helix.
> However, the public API is not yet final and will likely
> change quite a bit in the next release. The documentation
> is also not yet complete
`nucleo` is a highly performant fuzzy matcher written in rust. It aims to fill the same use case as `fzf` and `skim`. Compared to `fzf` `nucleo` has a significantly faster matching algorithm. This mainly makes a difference when matching patterns with low selectivity on many items. An (unscientific) comparison is shown in the benchmark section below. `nucleo` is a highly performant fuzzy matcher written in rust. It aims to fill the same use case as `fzf` and `skim`. Compared to `fzf` `nucleo` has a significantly faster matching algorithm. This mainly makes a difference when matching patterns with low selectivity on many items. An (unscientific) comparison is shown in the benchmark section below.
@ -14,6 +9,12 @@
Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly). Nucleo also handles Unicode graphemes more correctly. `Fzf` and `skim` both operate on Unicode code points (chars). That means that multi codepoint graphemes can have weird effects (match multiple times, weirdly change the score, ...). `nucleo` will always use the first codepoint of the grapheme for matching instead (and reports grapheme indices, so they can be highlighted correctly).
## Status
Nucleo is used in the helix-editor and therefore has a large user base with lots of real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use; breaking changes should be very rare (a 1.0 release should not be far away).
While the high level `nucleo` crate also works well (and is also used in helix), there are still additional features that will be added in the future. The high level crate also needs better documentation and will likely see a few API changes in the future.
## Benchmarks ## Benchmarks
> WIP currently more of a demonstration than a comprehensive benchmark suite > WIP currently more of a demonstration than a comprehensive benchmark suite

View File

@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
nucleo = { version = "0.1", path = "../" } nucleo = { version = "0.2", path = "../" }
brunch = "0.5.0" brunch = "0.5.0"
fuzzy-matcher = "0.3.7" fuzzy-matcher = "0.3.7"
walkdir = "2" walkdir = "2"

View File

@ -43,7 +43,7 @@ fn main() {
Some((path.as_str().into(), path)) Some((path.as_str().into(), path))
}) })
.unzip(); .unzip();
let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths()); let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths());
let skim = fuzzy_matcher::skim::SkimMatcherV2::default(); let skim = fuzzy_matcher::skim::SkimMatcherV2::default();
// TODO: unicode? // TODO: unicode?

View File

@ -2,7 +2,7 @@
name = "nucleo-matcher" name = "nucleo-matcher"
description = "plug and play high performance fuzzy matcher" description = "plug and play high performance fuzzy matcher"
authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"] authors = ["Pascal Kuthe <pascal.kuthe@semimod.de>"]
version = "0.1.0" version = "0.2.0"
edition = "2021" edition = "2021"
license = "MPL-2.0" license = "MPL-2.0"
repository = "https://github.com/helix-editor/nucleo" repository = "https://github.com/helix-editor/nucleo"

View File

@ -1,7 +1,9 @@
//! Utilities for working with (unicode) characters/codepoints
use std::fmt::{self, Debug, Display}; use std::fmt::{self, Debug, Display};
use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
use crate::MatcherConfig; use crate::Config;
//autogenerated by generate-ucd //autogenerated by generate-ucd
#[allow(warnings)] #[allow(warnings)]
@ -11,9 +13,9 @@ mod normalize;
pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
const ASCII: bool; const ASCII: bool;
fn char_class(self, config: &MatcherConfig) -> CharClass; fn char_class(self, config: &Config) -> CharClass;
fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass);
fn normalize(self, config: &MatcherConfig) -> Self; fn normalize(self, config: &Config) -> Self;
} }
/// repr transparent wrapper around u8 with better formatting and `PartialEq<char>` implementation /// repr transparent wrapper around u8 with better formatting and `PartialEq<char>` implementation
@ -42,7 +44,7 @@ impl PartialEq<AsciiChar> for char {
impl Char for AsciiChar { impl Char for AsciiChar {
const ASCII: bool = true; const ASCII: bool = true;
#[inline] #[inline]
fn char_class(self, config: &MatcherConfig) -> CharClass { fn char_class(self, config: &Config) -> CharClass {
let c = self.0; let c = self.0;
// using manual if conditions instead optimizes better // using manual if conditions instead optimizes better
if c >= b'a' && c <= b'z' { if c >= b'a' && c <= b'z' {
@ -61,7 +63,7 @@ impl Char for AsciiChar {
} }
#[inline(always)] #[inline(always)]
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
let char_class = self.char_class(config); let char_class = self.char_class(config);
if config.ignore_case && char_class == CharClass::Upper { if config.ignore_case && char_class == CharClass::Upper {
self.0 += 32 self.0 += 32
@ -70,7 +72,7 @@ impl Char for AsciiChar {
} }
#[inline(always)] #[inline(always)]
fn normalize(mut self, config: &MatcherConfig) -> Self { fn normalize(mut self, config: &Config) -> Self {
if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' {
self.0 += 32 self.0 += 32
} }
@ -95,7 +97,7 @@ fn char_class_non_ascii(c: char) -> CharClass {
impl Char for char { impl Char for char {
const ASCII: bool = false; const ASCII: bool = false;
#[inline(always)] #[inline(always)]
fn char_class(self, config: &MatcherConfig) -> CharClass { fn char_class(self, config: &Config) -> CharClass {
if self.is_ascii() { if self.is_ascii() {
return AsciiChar(self as u8).char_class(config); return AsciiChar(self as u8).char_class(config);
} }
@ -103,7 +105,7 @@ impl Char for char {
} }
#[inline(always)] #[inline(always)]
fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
if self.is_ascii() { if self.is_ascii() {
let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config);
return (c.0 as char, class); return (c.0 as char, class);
@ -123,7 +125,7 @@ impl Char for char {
} }
#[inline(always)] #[inline(always)]
fn normalize(mut self, config: &MatcherConfig) -> Self { fn normalize(mut self, config: &Config) -> Self {
if config.normalize { if config.normalize {
self = normalize::normalize(self); self = normalize::normalize(self);
} }
@ -138,12 +140,14 @@ pub use normalize::normalize;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
#[inline(always)] #[inline(always)]
/// Converts a character to lower case using simple unicode case folding
pub fn to_lower_case(c: char) -> char { pub fn to_lower_case(c: char) -> char {
CASE_FOLDING_SIMPLE CASE_FOLDING_SIMPLE
.binary_search_by_key(&c, |(upper, _)| *upper) .binary_search_by_key(&c, |(upper, _)| *upper)
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
} }
/// Checks whether a character is upper case according to simple unicode case folding
#[inline(always)] #[inline(always)]
pub fn is_upper_case(c: char) -> bool { pub fn is_upper_case(c: char) -> bool {
CASE_FOLDING_SIMPLE CASE_FOLDING_SIMPLE
@ -152,8 +156,7 @@ pub fn is_upper_case(c: char) -> bool {
} }
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive] pub(crate) enum CharClass {
pub enum CharClass {
Whitespace, Whitespace,
NonWord, NonWord,
Delimiter, Delimiter,
@ -163,8 +166,10 @@ pub enum CharClass {
Number, Number,
} }
/// nucleo cannot match graphemes as single units to work around /// Nucleo cannot match graphemes as single units. To work around
/// that we only use the first codepoint of each grapheme /// that we only use the first codepoint of each grapheme. This
/// iterator returns the first character of each unicode grapheme
/// in a string and is used for constructing `Utf32Str(ing)`.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ { pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
text.graphemes(true).map(|grapheme| { text.graphemes(true).map(|grapheme| {
grapheme grapheme

View File

@ -495,6 +495,16 @@ const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1;
const LEN3: usize = (DATA3_END - DATA3_START) as usize; const LEN3: usize = (DATA3_END - DATA3_START) as usize;
static TABLE3: [char; LEN3] = generate_table(&DATA3); static TABLE3: [char; LEN3] = generate_table(&DATA3);
/// Normalizes a unicode character by converting latin characters
/// which are variants of ASCII characters to their ASCII equivalent.
///
/// # Example
///
/// ``` rust
/// # use nucleo_matcher::chars::normalize;
///
/// assert_eq!(normalize('ä'), 'a');
/// ```
pub fn normalize(c: char) -> char { pub fn normalize(c: char) -> char {
let i = c as u32; let i = c as u32;
if i < DATA1_START || i >= DATA3_END { if i < DATA1_START || i >= DATA3_END {

View File

@ -1,38 +1,52 @@
use crate::chars::CharClass; use crate::chars::CharClass;
use crate::score::BONUS_BOUNDARY; use crate::score::BONUS_BOUNDARY;
/// Configuration data that controls how a matcher behaves
#[non_exhaustive] #[non_exhaustive]
#[derive(PartialEq, Eq, Debug, Clone, Copy)] #[derive(PartialEq, Eq, Debug, Clone)]
pub struct MatcherConfig { pub struct Config {
pub delimiter_chars: &'static [u8], /// Characters that act as delimiters and provide bonus
/// for matching the following char
pub(crate) delimiter_chars: &'static [u8],
/// Extra bonus for word boundary after whitespace character or beginning of the string /// Extra bonus for word boundary after whitespace character or beginning of the string
pub(crate) bonus_boundary_white: u16, pub(crate) bonus_boundary_white: u16,
/// Extra bonus for word boundary after slash, colon, semi-colon, and comma /// Extra bonus for word boundary after slash, colon, semi-colon, and comma
pub(crate) bonus_boundary_delimiter: u16, pub(crate) bonus_boundary_delimiter: u16,
pub initial_char_class: CharClass, pub(crate) initial_char_class: CharClass,
/// Whether to normalize latin script characters to ASCII (enabled by default) /// Whether to normalize latin script characters to ASCII (enabled by default)
pub normalize: bool, pub normalize: bool,
/// whether to ignore casing /// whether to ignore casing
pub ignore_case: bool, pub ignore_case: bool,
/// Whether to provide a bonus to matches by their distance from the start
/// of the haystack. The bonus is fairly small compared to the normal gap
/// penalty to avoid messing with the normal score heuristic. This setting
/// is not turned on by default and only recommended for autocompletion
/// usecases where the expectation is that the user is typing the entire
/// match. For a full fzf-like fuzzy matcher/picker word segmentation and
/// explicit prefix literals should be used instead.
pub prefer_prefix: bool,
} }
impl MatcherConfig { impl Config {
/// The default config for nucleo, implemented as a constant since
/// Default::default can not be called in a const context
pub const DEFAULT: Self = { pub const DEFAULT: Self = {
MatcherConfig { Config {
delimiter_chars: b"/,:;|", delimiter_chars: b"/,:;|",
bonus_boundary_white: BONUS_BOUNDARY + 2, bonus_boundary_white: BONUS_BOUNDARY + 2,
bonus_boundary_delimiter: BONUS_BOUNDARY + 1, bonus_boundary_delimiter: BONUS_BOUNDARY + 1,
initial_char_class: CharClass::Whitespace, initial_char_class: CharClass::Whitespace,
normalize: true, normalize: true,
ignore_case: true, ignore_case: true,
prefer_prefix: false,
} }
}; };
} }
impl MatcherConfig { impl Config {
/// Configures the matcher with bonuses appropriate for matching file paths.
pub fn set_match_paths(&mut self) { pub fn set_match_paths(&mut self) {
// compared to fzf we include
if cfg!(windows) { if cfg!(windows) {
self.delimiter_chars = b"/:\\"; self.delimiter_chars = b"/:\\";
} else { } else {
@ -42,6 +56,7 @@ impl MatcherConfig {
self.initial_char_class = CharClass::Delimiter; self.initial_char_class = CharClass::Delimiter;
} }
/// Configures the matcher with bonuses appropriate for matching file paths.
pub const fn match_paths(mut self) -> Self { pub const fn match_paths(mut self) -> Self {
if cfg!(windows) { if cfg!(windows) {
self.delimiter_chars = b"/\\"; self.delimiter_chars = b"/\\";

View File

@ -3,10 +3,10 @@ use std::cmp::max;
use crate::chars::{Char, CharClass}; use crate::chars::{Char, CharClass};
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
use crate::score::{ use crate::score::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
PENALTY_GAP_START, SCORE_MATCH, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
}; };
use crate::{Matcher, MatcherConfig}; use crate::{Config, Matcher};
impl Matcher { impl Matcher {
pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>( pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
@ -23,11 +23,7 @@ impl Matcher {
// us to treat needle indices as u16 // us to treat needle indices as u16
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else { let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
return self.fuzzy_match_greedy_::<INDICES, H, N>( return self.fuzzy_match_greedy_::<INDICES, H, N>(
haystack, haystack, needle, start, greedy_end, indices,
needle,
start,
greedy_end,
indices,
); );
}; };
@ -35,7 +31,7 @@ impl Matcher {
.checked_sub(1) .checked_sub(1)
.map(|i| haystack[i].char_class(&self.config)) .map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.initial_char_class); .unwrap_or(self.config.initial_char_class);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config); let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
if !matched { if !matched {
assert!( assert!(
@ -116,7 +112,8 @@ impl<H: Char> MatcherDataView<'_, H> {
&mut self, &mut self,
needle: &[N], needle: &[N],
mut prev_class: CharClass, mut prev_class: CharClass,
config: &MatcherConfig, config: &Config,
start: u32,
) -> bool ) -> bool
where where
H: PartialEq<N>, H: PartialEq<N>,
@ -167,6 +164,17 @@ impl<H: Char> MatcherDataView<'_, H> {
0, 0,
needle[0], needle[0],
needle[1], needle[1],
if config.prefer_prefix {
if start == 0 {
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
} else {
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
)
}
} else {
0
},
); );
true true
} }
@ -182,6 +190,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx: u16, needle_idx: u16,
needle_char: N, needle_char: N,
next_needle_char: N, next_needle_char: N,
mut prefix_bonus: u16,
) where ) where
H: PartialEq<N>, H: PartialEq<N>,
{ {
@ -198,15 +207,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter { for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW { let m_cell = if FIRST_ROW {
if c == needle_char { let cell = if c == needle_char {
ScoreCell { ScoreCell {
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false, matched: false,
consecutive_bonus: *bonus, consecutive_bonus: *bonus,
} }
} else { } else {
UNMATCHED UNMATCHED
} };
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else { } else {
*score_cell *score_cell
}; };
@ -224,15 +237,19 @@ impl<H: Char> MatcherDataView<'_, H> {
for (((c, bonus), score_cell), matrix_cell) in col_iter { for (((c, bonus), score_cell), matrix_cell) in col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW { let m_cell = if FIRST_ROW {
if c[0] == needle_char { let cell = if c[0] == needle_char {
ScoreCell { ScoreCell {
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH, score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false, matched: false,
consecutive_bonus: bonus[0], consecutive_bonus: bonus[0],
} }
} else { } else {
UNMATCHED UNMATCHED
} };
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else { } else {
*score_cell *score_cell
}; };
@ -271,6 +288,7 @@ impl<H: Char> MatcherDataView<'_, H> {
needle_idx as u16 + 1, needle_idx as u16 + 1,
needle_char, needle_char,
next_needle_char, next_needle_char,
0,
); );
let len = self.current_row.len() + needle_idx + 1 - row_off as usize; let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
matrix_cells = &mut matrix_cells[len..]; matrix_cells = &mut matrix_cells[len..];

View File

@ -1,16 +1,66 @@
/*! /*!
`nucleo_matcher` is a low level crate that contains the matcher implementation `nucleo_matcher` is a low level crate that contains the matcher implementation
used by the other nucleo crates. used by the high level `nucleo` crate.
The matcher is highly optimized and can significantly outperform `fzf` and The matcher is highly optimized and can significantly outperform `fzf` and
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require `skim` (the `fuzzy-matcher` crate). However some of these optimizations require
a slightly less convenient API. Particularly, `nucleo_matcher` requires that a slightly less convenient API. Be sure to carefully read the documentation of
needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead the [`Matcher`] to avoid unexpected behaviour.
of rusts normal utf32 strings. # Examples
For almost all usecases the [`pattern`] API should be used instead of calling
the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will
construct a single Atom (a single match operation) for each word. The pattern
can contain special characters to control what kind of match is performed (see
[`AtomKind`](crate::pattern::AtomKind)).
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]);
```
If the pattern should be matched literally (without this special parsing)
[`Pattern::new`](pattern::Pattern::new) can be used instead.
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let paths = ["^foo/bar", "bar/^foo", "foobar"];
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]);
```
If word segmentation is also not desired, a single `Atom` can be constructed directly.
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind};
let paths = ["foobar", "foo bar"];
let mut matcher = Matcher::new(Config::DEFAULT);
let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo bar", 192)]);
```
# Status
Nucleo is used in the helix-editor and therefore has a large user base with lots of real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use; breaking changes should be very rare (a 1.0 release should not be far away).
*/ */
// sadly ranges don't optimize well // sadly ranges don't optimize well
#![allow(clippy::manual_range_contains)] #![allow(clippy::manual_range_contains)]
#![warn(missing_docs)]
pub mod chars; pub mod chars;
mod config; mod config;
@ -20,6 +70,7 @@ mod exact;
mod fuzzy_greedy; mod fuzzy_greedy;
mod fuzzy_optimal; mod fuzzy_optimal;
mod matrix; mod matrix;
pub mod pattern;
mod prefilter; mod prefilter;
mod score; mod score;
mod utf32_str; mod utf32_str;
@ -27,8 +78,8 @@ mod utf32_str;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
pub use crate::config::MatcherConfig; pub use crate::config::Config;
pub use crate::utf32_str::Utf32Str; pub use crate::utf32_str::{Utf32Str, Utf32String};
use crate::chars::{AsciiChar, Char}; use crate::chars::{AsciiChar, Char};
use crate::matrix::MatrixSlab; use crate::matrix::MatrixSlab;
@ -39,22 +90,48 @@ use crate::matrix::MatrixSlab;
/// matching. This scratch memory allows the matcher to guarantee that it will /// matching. This scratch memory allows the matcher to guarantee that it will
/// **never allocate** during matching (with the exception of pushing to the /// **never allocate** during matching (with the exception of pushing to the
/// `indices` vector if there isn't enough capacity). However this scratch /// `indices` vector if there isn't enough capacity). However this scratch
/// memory is fairly large (around 135KB) so creating a matcher is expensive and /// memory is fairly large (around 135KB) so creating a matcher is expensive.
/// should be reused.
/// ///
/// All `.._match` functions will not compute the indices of the matched chars /// All `.._match` functions will not compute the indices of the matched
/// and are therefore significantly faster. These should be used to prefitler /// characters. These should be used to prefitler to filter and rank all
/// and sort all matches. All `.._indices` functions will compute the indices of /// matches. All `.._indices` functions will also compute the indices of the
/// the computed chars. These should be used when rendering the best N matches. /// matched characters but are slower compared to the `..match` variant. These
/// Note that the `indices` argument is **never cleared**. This allows running /// should be used when rendering the best N matches. Note that the `indices`
/// multiple different matches on the same haystack and merging the indices by /// argument is **never cleared**. This allows running multiple different
/// sorting and deduplicating the vector. /// matches on the same haystack and merging the indices by sorting and
/// deduplicating the vector.
///
/// The `needle` argument for each function must always be normalized by the
/// caller (unicode normalization and case folding). Otherwise, the matcher
/// may fail to produce a match. The [`pattern`] module provides utilities
/// to preprocess needles and **should usually be preferred over invoking the
/// matcher directly**. Additionally it's recommended to perform separate matches
/// for each word in the needle. Consider the following example:
///
/// If `foo bar` is used as the needle it matches both `foo test baaar` and
/// `foo hello-world bar`. However, `foo test baaar` will receive a higher
/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which
/// will receive a penalty and therefore the user will likely expect it to rank
/// lower. However, if `foo bar` is matched as a single query `hello-world` and
/// `test` are both considered gaps too. As `hello-world` is a much longer gap
/// than `test` the extra penalty for `baaar` is canceled out. If both words
/// are matched individually the interspersed words do not receive a penalty and
/// `foo hello-world bar` ranks higher.
///
/// In general nucleo is a **substring matching tool** (except for the prefix/
/// postfix matching modes) with no penalty assigned to matches that start
/// later within the same pattern (which enables matching words individually
/// as shown above). If patterns show a large variety in length and the syntax
/// described above is not used it may be preferable to give preference to
/// matches closer to the start of a haystack. To accommodate that usecase the
/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true.
/// ///
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than /// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
/// that the matcher *will panic*. The caller must decide whether it wants to /// that the matcher **will panic**. The caller must decide whether it wants to
/// filter out long haystacks or truncate them. /// filter out long haystacks or truncate them.
pub struct Matcher { pub struct Matcher {
pub config: MatcherConfig, #[allow(missing_docs)]
pub config: Config,
slab: MatrixSlab, slab: MatrixSlab,
} }
@ -62,7 +139,7 @@ pub struct Matcher {
impl Clone for Matcher { impl Clone for Matcher {
fn clone(&self) -> Self { fn clone(&self) -> Self {
Matcher { Matcher {
config: self.config, config: self.config.clone(),
slab: MatrixSlab::new(), slab: MatrixSlab::new(),
} }
} }
@ -79,14 +156,17 @@ impl std::fmt::Debug for Matcher {
impl Default for Matcher { impl Default for Matcher {
fn default() -> Self { fn default() -> Self {
Matcher { Matcher {
config: MatcherConfig::DEFAULT, config: Config::DEFAULT,
slab: MatrixSlab::new(), slab: MatrixSlab::new(),
} }
} }
} }
impl Matcher { impl Matcher {
pub fn new(config: MatcherConfig) -> Self { /// Creates a new matcher instance, note that this will eagerly allocate a
/// fairly large chunk of heap memory (around 135KB currently but subject to
/// change) so matchers should be reused if called often (like in a loop).
pub fn new(config: Config) -> Self {
Self { Self {
config, config,
slab: MatrixSlab::new(), slab: MatrixSlab::new(),
@ -95,9 +175,10 @@ impl Matcher {
/// Find the fuzzy match with the highest score in the `haystack`. /// Find the fuzzy match with the highest score in the `haystack`.
/// ///
/// This functions has `O(mn)` time complexity for short inputs. To /// This functions has `O(mn)` time complexity for short inputs.
/// avoid slowdowns it automatically falls back to [greedy matching] /// To avoid slowdowns it automatically falls back to
/// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large
/// needles and haystacks.
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
@ -229,7 +310,7 @@ impl Matcher {
/// Greedily find a fuzzy match in the `haystack`. /// Greedily find a fuzzy match in the `haystack`.
/// ///
/// This function has `O(n)` time complexity but may provide unintuitive (non-optimal) /// This function has `O(n)` time complexity but may provide unintuitive (non-optimal)
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should /// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should
/// be preferred. /// be preferred.
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
@ -245,7 +326,7 @@ impl Matcher {
/// Greedily find a fuzzy match in the `haystack` and compute its indices. /// Greedily find a fuzzy match in the `haystack` and compute its indices.
/// ///
/// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal)
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should /// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should
/// be preferred. /// be preferred.
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
@ -329,7 +410,7 @@ impl Matcher {
/// Finds the substring match with the highest score in the `haystack`. /// Finds the substring match with the highest score in the `haystack`.
/// ///
/// This functions has `O(nm)` time complexity. However many cases can /// This functions has `O(nm)` time complexity. However many cases can
/// be significantly accelerated using prefilters so it's usually fast /// be significantly accelerated using prefilters so it's usually very fast
/// in practice. /// in practice.
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.

View File

@ -74,7 +74,7 @@ impl<C: Char> MatrixLayout<C> {
let base = ptr.as_ptr(); let base = ptr.as_ptr();
let haystack = base.add(self.haystack_off) as *mut C; let haystack = base.add(self.haystack_off) as *mut C;
let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len); let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len);
let bonus = base.add(self.bonus_off) as *mut u8; let bonus = base.add(self.bonus_off);
let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len);
let rows = base.add(self.rows_off) as *mut u16; let rows = base.add(self.rows_off) as *mut u16;
let rows = slice_from_raw_parts_mut(rows, self.needle_len); let rows = slice_from_raw_parts_mut(rows, self.needle_len);

488
matcher/src/pattern.rs Normal file
View File

@ -0,0 +1,488 @@
//! This module provides a slightly higher level API for matching strings.
use std::cmp::Reverse;
use crate::{chars, Matcher, Utf32Str};
#[cfg(test)]
mod tests;
use crate::Utf32String;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to treat a case mismatch between two characters.
///
/// The default is [`Smart`](CaseMatching::Smart).
pub enum CaseMatching {
/// Characters always match their case folded version (`a == A`).
Ignore,
/// Characters never match their case folded version (`a != A`).
Respect,
/// Acts like [`Ignore`](CaseMatching::Ignore) if a pattern atom contains no
/// uppercase characters and like [`Respect`](CaseMatching::Respect) otherwise.
#[default]
Smart,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// The kind of matching algorithm to run for an atom.
pub enum AtomKind {
/// Fuzzy matching where the needle must match any haystack characters
/// (match can contain gaps). This atom kind is used by default if no
/// special syntax is used. There is no negated fuzzy matching (too
/// many false positives): [`Atom::parse`] turns a negated atom without
/// any other special syntax (`!foo`) into a [`Substring`](AtomKind::Substring)
/// atom instead.
///
/// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
Fuzzy,
/// The needle must match a contiguous sequence of haystack characters
/// without gaps. This atom kind is parsed from the following syntax:
/// `'foo` and `!foo` (negated).
///
/// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
Substring,
/// The needle must match all leading haystack characters without gaps or
/// prefix. This atom kind is parsed from the following syntax: `^foo` and
/// `!^foo` (negated).
///
/// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
Prefix,
/// The needle must match all trailing haystack characters without gaps or
/// postfix. This atom kind is parsed from the following syntax: `foo$` and
/// `!foo$` (negated).
///
/// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
Postfix,
/// The needle must match all haystack characters without gaps or prefix.
/// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
/// (negated).
///
/// Note that [`Atom::parse`] produces this kind for any atom that has both
/// a leading kind marker and a trailing `$`, so `'foo$` is parsed as
/// `Exact` as well.
///
/// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
Exact,
}
/// A single pattern component that is matched with a single [`Matcher`](crate::Matcher) function.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Atom {
/// Whether this pattern atom is a negative match.
/// A negative pattern atom will prevent haystacks matching it from
/// being matched. It does not contribute to scoring/indices.
pub negative: bool,
/// The kind of match that this pattern performs.
pub kind: AtomKind,
// The needle handed to the matcher, as produced by `Atom::new_inner`.
needle: Utf32String,
// Copied into `matcher.config.ignore_case` before every match of this atom.
ignore_case: bool,
}
impl Atom {
/// Creates a single, non-negated [`Atom`] from `needle`. None of the special
/// pattern characters (`!`, `^`, `'`, `$`) are interpreted; the atom is matched
/// with the algorithm selected by `kind`.
///
/// Case folding is performed according to `case` (if necessary). If
/// `escape_whitespace` is set, `\ ` in `needle` can be used to produce a
/// literal ` `.
///
/// NOTE(review): earlier documentation also mentioned unicode normalization;
/// presumably that happens inside `chars::graphemes` - confirm.
pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom {
Atom::new_inner(needle, case, kind, escape_whitespace, false)
}
/// Builds a non-negated [`Atom`] from `needle`.
///
/// * `case` selects the case folding: `Ignore` lowercases the needle and
///   ignores case, `Respect` keeps the needle untouched and respects case,
///   `Smart` ignores case only if the needle contains no uppercase character.
/// * `kind` is stored unchanged in the resulting atom.
/// * `escape_whitespace`: when set, every `\ ` sequence is replaced by a
///   plain space. A backslash that is not followed by a space is kept
///   verbatim (exactly once).
/// * `append_dollar`: when set, a literal `$` is appended to the needle
///   (used by [`Atom::parse`] for an escaped trailing `\$`).
///
/// ASCII-only needles are stored as `Utf32String::Ascii`, everything else as
/// `Utf32String::Unicode`. Both representations apply the same escaping rules.
fn new_inner(
    needle: &str,
    case: CaseMatching,
    kind: AtomKind,
    escape_whitespace: bool,
    append_dollar: bool,
) -> Atom {
    let mut ignore_case;
    let needle = if needle.is_ascii() {
        // `str::replace` scans left to right without overlapping matches, so
        // `\\ ` (backslash, backslash, space) becomes `\ `, which is also what
        // the unicode branch below produces.
        let mut needle = if escape_whitespace {
            needle.replace("\\ ", " ")
        } else {
            needle.to_owned()
        };
        match case {
            CaseMatching::Ignore => {
                ignore_case = true;
                needle.make_ascii_lowercase()
            }
            CaseMatching::Smart => {
                ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
            }
            CaseMatching::Respect => ignore_case = false,
        }
        if append_dollar {
            needle.push('$');
        }
        Utf32String::Ascii(needle.into_boxed_str())
    } else {
        let mut needle_ = Vec::with_capacity(needle.len());
        // For `Smart` this starts out as `true` and is cleared as soon as an
        // uppercase character is encountered.
        ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
        let mut fold_case = |c: char| -> char {
            match case {
                CaseMatching::Ignore => chars::to_lower_case(c),
                CaseMatching::Smart => {
                    ignore_case = ignore_case && !chars::is_upper_case(c);
                    c
                }
                CaseMatching::Respect => c,
            }
        };
        if escape_whitespace {
            // A backslash is held back until the following character is
            // known: `\ ` collapses into a single space, any other character
            // means the backslash was literal and is emitted once.
            let mut saw_backslash = false;
            for c in chars::graphemes(needle) {
                if saw_backslash {
                    if c == ' ' {
                        needle_.push(' ');
                        saw_backslash = false;
                        continue;
                    }
                    needle_.push('\\');
                }
                saw_backslash = c == '\\';
                if saw_backslash {
                    continue;
                }
                needle_.push(fold_case(c));
            }
            // A trailing backslash has nothing to escape; keep it literally.
            if saw_backslash {
                needle_.push('\\');
            }
        } else {
            needle_.extend(chars::graphemes(needle).map(|c| fold_case(c)));
        }
        if append_dollar {
            needle_.push('$');
        }
        Utf32String::Unicode(needle_.into_boxed_slice())
    };
    Atom {
        kind,
        needle,
        negative: false,
        ignore_case,
    }
}
/// Parses a pattern atom from `raw`.
///
/// Special leading and trailing characters select the atom kind (see
/// [`AtomKind`]): a leading `!` negates the atom, a leading `^` or `'`
/// selects prefix or substring matching and a trailing `$` anchors the match
/// at the end of the haystack. Each of these characters can be escaped with
/// a backslash to be treated literally, and `\ ` is unescaped to a space.
pub fn parse(raw: &str, case: CaseMatching) -> Atom {
    // Negation marker: `!foo` negates, `\!foo` is a literal `!`.
    let (negated, mut text) = if let Some(rest) = raw.strip_prefix('!') {
        (true, rest)
    } else if raw.starts_with("\\!") {
        (false, &raw[1..])
    } else {
        (false, raw)
    };

    // Leading kind marker; an escaped marker only drops the backslash.
    let mut kind = AtomKind::Fuzzy;
    if let Some(rest) = text.strip_prefix('^') {
        kind = AtomKind::Prefix;
        text = rest;
    } else if let Some(rest) = text.strip_prefix('\'') {
        kind = AtomKind::Substring;
        text = rest;
    } else if text.starts_with("\\^") || text.starts_with("\\'") {
        text = &text[1..];
    }

    // Trailing `$`: an escaped one is re-appended literally by `new_inner`,
    // an unescaped one upgrades the kind.
    let literal_dollar = text.ends_with("\\$");
    if literal_dollar {
        text = &text[..text.len() - 2];
    } else if let Some(rest) = text.strip_suffix('$') {
        kind = if kind == AtomKind::Fuzzy {
            AtomKind::Postfix
        } else {
            AtomKind::Exact
        };
        text = rest;
    }

    // Negated fuzzy atoms are matched as substrings instead.
    if negated && kind == AtomKind::Fuzzy {
        kind = AtomKind::Substring;
    }

    let mut parsed = Atom::new_inner(text, case, kind, true, literal_dollar);
    parsed.negative = negated;
    parsed
}
/// Matches this atom against `haystack` (using the allocation and
/// configuration from `matcher`) and calculates a ranking score. See the
/// [`Matcher`](crate::Matcher) documentation for more details.
///
/// A negative atom yields `Some(0)` if the needle does *not* match and
/// `None` if it does.
///
/// *Note:* The `ignore_case` setting of `matcher` is overwritten to match the
/// casing of this atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
    matcher.config.ignore_case = self.ignore_case;
    let needle = self.needle.slice(..);
    let matched = match self.kind {
        AtomKind::Fuzzy => matcher.fuzzy_match(haystack, needle),
        AtomKind::Substring => matcher.substring_match(haystack, needle),
        AtomKind::Prefix => matcher.prefix_match(haystack, needle),
        AtomKind::Postfix => matcher.postfix_match(haystack, needle),
        AtomKind::Exact => matcher.exact_match(haystack, needle),
    };
    if !self.negative {
        return matched;
    }
    match matched {
        Some(_) => None,
        None => Some(0),
    }
}
/// Matches this atom against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score and appends the
/// match indices to `indices`. See the [`Matcher`](crate::Matcher)
/// documentation for more details.
///
/// A negative atom never adds indices: it yields `Some(0)` if the needle does
/// *not* match and `None` if it does.
///
/// *Note:* The `ignore_case` setting of `matcher` is overwritten to match the
/// casing of this atom.
///
/// *Note:* The `indices` vector is not cleared by this function.
pub fn indices(
    &self,
    haystack: Utf32Str<'_>,
    matcher: &mut Matcher,
    indices: &mut Vec<u32>,
) -> Option<u16> {
    matcher.config.ignore_case = self.ignore_case;
    let needle = self.needle.slice(..);
    if !self.negative {
        return match self.kind {
            AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, needle, indices),
            AtomKind::Substring => matcher.substring_indices(haystack, needle, indices),
            AtomKind::Prefix => matcher.prefix_indices(haystack, needle, indices),
            AtomKind::Postfix => matcher.postfix_indices(haystack, needle, indices),
            AtomKind::Exact => matcher.exact_indices(haystack, needle, indices),
        };
    }
    // Negative atoms only need to know whether the needle matches at all.
    let matched = match self.kind {
        AtomKind::Fuzzy => matcher.fuzzy_match(haystack, needle),
        AtomKind::Substring => matcher.substring_match(haystack, needle),
        AtomKind::Prefix => matcher.prefix_match(haystack, needle),
        AtomKind::Postfix => matcher.postfix_match(haystack, needle),
        AtomKind::Exact => matcher.exact_match(haystack, needle),
    };
    if matched.is_some() {
        None
    } else {
        Some(0)
    }
}
/// Returns the needle text that is passed to the matcher (the full stored
/// needle, after any case folding/unescaping done when the atom was built).
///
/// NOTE(review): the previous wording said that the indices produced by the
/// `indices` functions index this text; presumably they refer to haystack
/// characters instead - confirm against the `Matcher` documentation.
pub fn needle_text(&self) -> Utf32Str<'_> {
self.needle.slice(..)
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// Returns every item this atom matches together with its score, ordered by
/// descending score. If the needle is empty, all items are returned in their
/// original order with a score of `0`.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
    &self,
    items: impl IntoIterator<Item = T>,
    matcher: &mut Matcher,
) -> Vec<(T, u16)> {
    if self.needle.is_empty() {
        return items.into_iter().map(|item| (item, 0)).collect();
    }
    // Scratch buffer reused for the UTF-32 conversion of every item.
    let mut buf = Vec::new();
    let mut scored = Vec::new();
    for item in items {
        let score = self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher);
        if let Some(score) = score {
            scored.push((item, score));
        }
    }
    // Stable sort: items with equal scores keep their input order.
    scored.sort_by_key(|entry| Reverse(entry.1));
    scored
}
}
/// Splits `pattern` into its atoms at every space that is not directly
/// preceded by a backslash. The backslashes are left in the yielded slices,
/// and consecutive unescaped spaces yield empty slices.
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
    // Whether the previously inspected character was a backslash.
    let mut escaped = false;
    pattern.split(move |ch: char| {
        if ch == ' ' && !escaped {
            return true;
        }
        escaped = ch == '\\';
        false
    })
}
#[derive(Debug, Default)]
/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
///
/// The default pattern has no atoms.
#[non_exhaustive]
pub struct Pattern {
/// The individual atoms (words) in this pattern.
pub atoms: Vec<Atom>,
}
impl Pattern {
/// Creates a pattern where each word is matched individually with the
/// algorithm selected by `kind` (whitespace can be escaped with `\`).
/// Otherwise no parsing is performed (so $, !, ' and ^ don't receive special
/// treatment). Words that end up empty are skipped. If you want to match the
/// entire pattern as a single needle use a single [`Atom`] instead.
pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern {
    let atoms = pattern_atoms(pattern)
        .map(|word| Atom::new(word, case_matching, kind, true))
        .filter(|atom| !atom.needle.is_empty())
        .collect();
    Pattern { atoms }
}
/// Creates a pattern where each word is matched individually (whitespace
/// can be escaped with `\`). $, !, ' and ^ at word boundaries cause
/// different matching behaviour (see [`AtomKind`]); they can be escaped with
/// a backslash. Words whose needle ends up empty are skipped.
pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern {
    let atoms = pattern_atoms(pattern)
        .map(|word| Atom::parse(word, case_matching))
        .filter(|atom| !atom.needle.is_empty())
        .collect();
    Pattern { atoms }
}
/// Convenience function to easily match (and sort) a (relatively small)
/// list of inputs.
///
/// Returns every item this pattern matches together with its score, ordered
/// by descending score. If the pattern has no atoms, all items are returned
/// in their original order with a score of `0`.
///
/// *Note* This function is not recommended for building a full fuzzy
/// matching application that can match large numbers of matches (like all
/// files in a directory) as all matching is done on the current thread,
/// effectively blocking the UI. For such applications the high level
/// `nucleo` crate can be used instead.
pub fn match_list<T: AsRef<str>>(
    &self,
    items: impl IntoIterator<Item = T>,
    matcher: &mut Matcher,
) -> Vec<(T, u32)> {
    if self.atoms.is_empty() {
        return items.into_iter().map(|item| (item, 0)).collect();
    }
    // Scratch buffer reused for the UTF-32 conversion of every item.
    let mut buf = Vec::new();
    let mut scored = Vec::new();
    for item in items {
        let score = self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher);
        if let Some(score) = score {
            scored.push((item, score));
        }
    }
    // Stable sort: items with equal scores keep their input order.
    scored.sort_by_key(|entry| Reverse(entry.1));
    scored
}
/// Matches this pattern against `haystack` (using the allocation and
/// configuration from `matcher`) and calculates a ranking score: the sum of
/// the scores of all atoms. Returns `None` as soon as one atom fails to
/// match; a pattern without atoms scores `Some(0)`. See the
/// [`Matcher`](crate::Matcher) documentation for more details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
    self.atoms.iter().try_fold(0u32, |total, atom| {
        let atom_score = atom.score(haystack, matcher)?;
        Some(total + atom_score as u32)
    })
}
/// Matches this pattern against `haystack` (using the allocation and
/// configuration from `matcher`), calculates a ranking score (the sum of the
/// scores of all atoms) and collects the match indices. Returns `None` as
/// soon as one atom fails to match; a pattern without atoms scores
/// `Some(0)`. See the [`Matcher`](crate::Matcher) documentation for more
/// details.
///
/// *Note:* The `ignore_case` setting is overwritten to match the casing of
/// each pattern atom.
///
/// *Note:* The indices for each atom are calculated individually and simply
/// appended to the `indices` vector; they are neither deduplicated nor
/// sorted. This allows associating the match indices with their source
/// atom. If required (like for highlighting) unique/sorted indices can be
/// obtained as follows:
///
/// ```
/// # let mut indices: Vec<u32> = Vec::new();
/// indices.sort_unstable();
/// indices.dedup();
/// ```
pub fn indices(
    &self,
    haystack: Utf32Str<'_>,
    matcher: &mut Matcher,
    indices: &mut Vec<u32>,
) -> Option<u32> {
    self.atoms.iter().try_fold(0u32, |total, atom| {
        let atom_score = atom.indices(haystack, matcher, indices)?;
        Some(total + atom_score as u32)
    })
}
/// Refreshes this pattern by reparsing it from a string. This is mostly
/// equivalent to just constructing a new pattern using [`Pattern::parse`]
/// but is slightly more efficient because the existing `atoms` vector is
/// cleared and refilled instead of being replaced.
pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) {
    self.atoms.clear();
    self.atoms.extend(
        pattern_atoms(pattern)
            .map(|word| Atom::parse(word, case_matching))
            .filter(|atom| !atom.needle.is_empty()),
    );
}
}
impl Clone for Pattern {
fn clone(&self) -> Self {
Self {
atoms: self.atoms.clone(),
}
}
fn clone_from(&mut self, source: &Self) {
self.atoms.clone_from(&source.atoms);
}
}

View File

@ -0,0 +1,114 @@
use crate::pattern::{Atom, AtomKind, CaseMatching};
// Every negated form must parse as a negative atom with needle `foo`; a plain
// negated atom is demoted from fuzzy to substring matching.
#[test]
fn negative() {
    let expectations = [
        ("!foo", AtomKind::Substring),
        ("!^foo", AtomKind::Prefix),
        ("!foo$", AtomKind::Postfix),
        ("!^foo$", AtomKind::Exact),
    ];
    for (raw, expected_kind) in expectations {
        let atom = Atom::parse(raw, CaseMatching::Smart);
        assert!(atom.negative);
        assert_eq!(atom.kind, expected_kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }
}
// Each kind marker must select the matching `AtomKind` and be stripped from
// the needle.
#[test]
fn pattern_kinds() {
    let expectations = [
        ("foo", AtomKind::Fuzzy),
        ("'foo", AtomKind::Substring),
        ("^foo", AtomKind::Prefix),
        ("foo$", AtomKind::Postfix),
        ("^foo$", AtomKind::Exact),
    ];
    for (raw, expected_kind) in expectations {
        let atom = Atom::parse(raw, CaseMatching::Smart);
        assert!(!atom.negative);
        assert_eq!(atom.kind, expected_kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }
}
// Checks the `ignore_case` flag and the (possibly lowercased) needle for
// ASCII and non-ASCII input under every `CaseMatching` mode.
#[test]
fn case_matching() {
    // (raw pattern, case mode, expected ignore_case, expected needle if checked)
    let expectations = [
        ("foo", CaseMatching::Smart, true, Some("foo")),
        ("Foo", CaseMatching::Smart, false, Some("Foo")),
        ("Foo", CaseMatching::Ignore, true, Some("foo")),
        ("Foo", CaseMatching::Respect, false, Some("Foo")),
        ("Äxx", CaseMatching::Ignore, true, Some("äxx")),
        ("Äxx", CaseMatching::Respect, false, None),
        ("Axx", CaseMatching::Smart, false, Some("Axx")),
        ("你xx", CaseMatching::Smart, true, Some("你xx")),
        ("你xx", CaseMatching::Ignore, true, Some("你xx")),
        ("Ⲽxx", CaseMatching::Smart, false, Some("Ⲽxx")),
        ("Ⲽxx", CaseMatching::Ignore, true, Some("ⲽxx")),
    ];
    for (raw, case, expected_ignore_case, expected_needle) in expectations {
        let atom = Atom::parse(raw, case);
        assert_eq!(atom.ignore_case, expected_ignore_case);
        if let Some(expected_needle) = expected_needle {
            assert_eq!(atom.needle.to_string(), expected_needle);
        }
    }
}
// Escaped special characters must end up literally in the needle and must
// not influence the atom kind.
#[test]
fn escape() {
    // (raw pattern, expected needle, expected kind if checked)
    let expectations = [
        ("foo\\ bar", "foo bar", None),
        ("\\!foo", "!foo", Some(AtomKind::Fuzzy)),
        ("\\'foo", "'foo", Some(AtomKind::Fuzzy)),
        ("\\^foo", "^foo", Some(AtomKind::Fuzzy)),
        ("foo\\$", "foo$", Some(AtomKind::Fuzzy)),
        ("^foo\\$", "foo$", Some(AtomKind::Prefix)),
        ("\\^foo\\$", "^foo$", Some(AtomKind::Fuzzy)),
        ("\\!^foo\\$", "!^foo$", Some(AtomKind::Fuzzy)),
        ("!\\^foo\\$", "^foo$", Some(AtomKind::Substring)),
    ];
    for (raw, expected_needle, expected_kind) in expectations {
        let atom = Atom::parse(raw, CaseMatching::Smart);
        assert_eq!(atom.needle.to_string(), expected_needle);
        if let Some(expected_kind) = expected_kind {
            assert_eq!(atom.kind, expected_kind);
        }
    }
}

View File

@ -1,11 +1,15 @@
use std::cmp::max; use std::cmp::max;
use crate::chars::{Char, CharClass}; use crate::chars::{Char, CharClass};
use crate::{Matcher, MatcherConfig}; use crate::{Config, Matcher};
pub(crate) const SCORE_MATCH: u16 = 16; pub(crate) const SCORE_MATCH: u16 = 16;
pub(crate) const PENALTY_GAP_START: u16 = 3; pub(crate) const PENALTY_GAP_START: u16 = 3;
pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1;
/// If the prefer_prefix option is enabled we want to penalize
/// the initial gap. The prefix should not be too much
pub(crate) const PREFIX_BONUS_SCALE: u16 = 2;
pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY;
// We prefer matches at the beginning of a word, but the bonus should not be // We prefer matches at the beginning of a word, but the bonus should not be
// too great to prevent the longer acronym matches from always winning over // too great to prevent the longer acronym matches from always winning over
@ -43,7 +47,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS
// still respected. // still respected.
pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2;
impl MatcherConfig { impl Config {
#[inline] #[inline]
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 {
if class > CharClass::Delimiter { if class > CharClass::Delimiter {
@ -140,7 +144,15 @@ impl Matcher {
} }
prev_class = class; prev_class = class;
} }
if self.config.prefer_prefix {
if start != 0 {
let penalty = PENALTY_GAP_START
+ PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16;
score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE);
} else {
score += MAX_PREFIX_BONUS;
}
}
score score
} }
} }

View File

@ -1,10 +1,10 @@
use crate::chars::Char; use crate::chars::Char;
use crate::score::{ use crate::score::{
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
}; };
use crate::utf32_str::Utf32Str; use crate::utf32_str::Utf32Str;
use crate::{Matcher, MatcherConfig}; use crate::{Config, Matcher};
use Algorithm::*; use Algorithm::*;
@ -23,12 +23,14 @@ fn assert_matches(
normalize: bool, normalize: bool,
case_sensitive: bool, case_sensitive: bool,
path: bool, path: bool,
prefer_prefix: bool,
cases: &[(&str, &str, &[u32], u16)], cases: &[(&str, &str, &[u32], u16)],
) { ) {
let mut config = MatcherConfig { let mut config = Config {
normalize, normalize,
ignore_case: !case_sensitive, ignore_case: !case_sensitive,
..MatcherConfig::DEFAULT prefer_prefix,
..Config::DEFAULT
}; };
if path { if path {
config.set_match_paths(); config.set_match_paths();
@ -87,10 +89,10 @@ pub fn assert_not_matches(
path: bool, path: bool,
cases: &[(&str, &str)], cases: &[(&str, &str)],
) { ) {
let mut config = MatcherConfig { let mut config = Config {
normalize, normalize,
ignore_case: !case_sensitive, ignore_case: !case_sensitive,
..MatcherConfig::DEFAULT ..Config::DEFAULT
}; };
if path { if path {
config.set_match_paths(); config.set_match_paths();
@ -132,8 +134,8 @@ pub fn assert_not_matches(
} }
} }
const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white;
const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter;
#[test] #[test]
fn test_fuzzy() { fn test_fuzzy() {
@ -142,6 +144,7 @@ fn test_fuzzy() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"fooBarbaz1", "fooBarbaz1",
@ -250,6 +253,7 @@ fn empty_needle() {
false, false,
false, false,
false, false,
false,
&[("foo bar baz", "", &[], 0)], &[("foo bar baz", "", &[], 0)],
); );
} }
@ -261,6 +265,7 @@ fn test_substring() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"foo bar baz", "foo bar baz",
@ -287,6 +292,7 @@ fn test_substring() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"foo bar baz", "foo bar baz",
@ -313,6 +319,7 @@ fn test_substring() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"foo", "foo",
@ -339,6 +346,7 @@ fn test_substring() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"fooBarbaz1", "fooBarbaz1",
@ -377,6 +385,7 @@ fn test_fuzzy_case_sensitive() {
false, false,
true, true,
false, false,
false,
&[ &[
( (
"fooBarbaz1", "fooBarbaz1",
@ -418,6 +427,7 @@ fn test_normalize() {
true, true,
false, false,
false, false,
false,
&[ &[
( (
"Só Danço Samba", "Só Danço Samba",
@ -464,6 +474,7 @@ fn test_unicode() {
true, true,
false, false,
false, false,
false,
&[ &[
( (
"你好世界", "你好世界",
@ -488,6 +499,7 @@ fn test_long_str() {
false, false,
false, false,
false, false,
false,
&[( &[(
&"x".repeat(u16::MAX as usize + 1), &"x".repeat(u16::MAX as usize + 1),
"xx", "xx",
@ -504,6 +516,7 @@ fn test_casing() {
false, false,
false, false,
false, false,
false,
&[ &[
// these two have the same score // these two have the same score
( (
@ -536,6 +549,7 @@ fn test_casing() {
], ],
) )
} }
#[test] #[test]
fn test_optimal() { fn test_optimal() {
assert_matches( assert_matches(
@ -543,6 +557,7 @@ fn test_optimal() {
false, false,
false, false,
false, false,
false,
&[ &[
( (
"axxx xx ", "axxx xx ",
@ -624,3 +639,32 @@ fn test_reject() {
); );
assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]); assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]);
} }
// With `prefer_prefix` enabled, the match starting at index 0 is expected to
// receive the full `MAX_PREFIX_BONUS`, while the match starting at index 82
// is expected to receive no prefix bonus at all.
#[test]
fn test_prefer_prefix() {
    let boundary_bonus = BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1);
    let score_at_start =
        boundary_bonus + MAX_PREFIX_BONUS - PENALTY_GAP_START - 3 * PENALTY_GAP_EXTENSION;
    let score_far_from_start = boundary_bonus - PENALTY_GAP_START - PENALTY_GAP_EXTENSION;
    assert_matches(
        &[FuzzyOptimal, FuzzyGreedy],
        false,
        false,
        false,
        true,
        &[
            ("Moby Dick", "md", &[0, 5], score_at_start),
            (
                "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage",
                "md",
                &[82, 85],
                score_far_from_start,
            ),
        ],
    );
}

View File

@ -1,6 +1,9 @@
use std::borrow::Cow;
use std::ops::{Bound, RangeBounds}; use std::ops::{Bound, RangeBounds};
use std::{fmt, slice}; use std::{fmt, slice};
use crate::chars;
/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. /// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
/// ///
/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching /// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
@ -51,6 +54,7 @@ impl<'a> Utf32Str<'a> {
} }
} }
/// Returns the number of characters in this string.
#[inline] #[inline]
pub fn len(self) -> usize { pub fn len(self) -> usize {
match self { match self {
@ -58,6 +62,8 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
} }
} }
/// Returns whether this string is empty.
#[inline] #[inline]
pub fn is_empty(self) -> bool { pub fn is_empty(self) -> bool {
match self { match self {
@ -66,6 +72,8 @@ impl<'a> Utf32Str<'a> {
} }
} }
/// Creates a slice with a string that contains the characters in
/// the specified **character range**.
#[inline] #[inline]
pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> { pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
let start = match range.start_bound() { let start = match range.start_bound() {
@ -86,7 +94,7 @@ impl<'a> Utf32Str<'a> {
/// Returns the number of leading whitespaces in this string /// Returns the number of leading whitespaces in this string
#[inline] #[inline]
pub fn leading_white_space(self) -> usize { pub(crate) fn leading_white_space(self) -> usize {
match self { match self {
Utf32Str::Ascii(bytes) => bytes Utf32Str::Ascii(bytes) => bytes
.iter() .iter()
@ -101,7 +109,7 @@ impl<'a> Utf32Str<'a> {
/// Returns the number of leading whitespaces in this string /// Returns the number of leading whitespaces in this string
#[inline] #[inline]
pub fn trailing_white_space(self) -> usize { pub(crate) fn trailing_white_space(self) -> usize {
match self { match self {
Utf32Str::Ascii(bytes) => bytes Utf32Str::Ascii(bytes) => bytes
.iter() .iter()
@ -117,7 +125,7 @@ impl<'a> Utf32Str<'a> {
} }
/// Same as `slice` but accepts a u32 range for convenience since /// Same as `slice` but accepts a u32 range for convenience since
/// those are the indices returned by the matcher /// those are the indices returned by the matcher.
#[inline] #[inline]
pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> { pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
let start = match range.start_bound() { let start = match range.start_bound() {
@ -135,29 +143,34 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
} }
} }
/// Returns whether this string only contains ascii text.
pub fn is_ascii(self) -> bool { pub fn is_ascii(self) -> bool {
matches!(self, Utf32Str::Ascii(_)) matches!(self, Utf32Str::Ascii(_))
} }
pub fn get(self, idx: u32) -> char { /// Returns the `n`th character in this string.
pub fn get(self, n: u32) -> char {
match self { match self {
Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[idx as usize], Utf32Str::Unicode(codepoints) => codepoints[n as usize],
} }
} }
pub fn last(self) -> char { pub(crate) fn last(self) -> char {
match self { match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
} }
} }
pub fn first(self) -> char {
pub(crate) fn first(self) -> char {
match self { match self {
Utf32Str::Ascii(bytes) => bytes[0] as char, Utf32Str::Ascii(bytes) => bytes[0] as char,
Utf32Str::Unicode(codepoints) => codepoints[0], Utf32Str::Unicode(codepoints) => codepoints[0],
} }
} }
/// Returns an iterator over the characters in this string
pub fn chars(self) -> Chars<'a> { pub fn chars(self) -> Chars<'a> {
match self { match self {
Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
@ -165,6 +178,7 @@ impl<'a> Utf32Str<'a> {
} }
} }
} }
impl fmt::Debug for Utf32Str<'_> { impl fmt::Debug for Utf32Str<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?; write!(f, "\"")?;
@ -209,3 +223,133 @@ impl DoubleEndedIterator for Chars<'_> {
} }
} }
} }
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
/// An owned version of [`Utf32Str`].
///
/// The `From` conversions in this module pick the `Ascii` variant for
/// ASCII-only input and the `Unicode` variant otherwise.
pub enum Utf32String {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
Ascii(Box<str>),
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(Box<[char]>),
}
impl Default for Utf32String {
fn default() -> Self {
Self::Ascii(String::new().into_boxed_str())
}
}
impl Utf32String {
/// Returns the number of characters in this string (bytes for the ASCII
/// representation, codepoints for the unicode representation).
#[inline]
pub fn len(&self) -> usize {
    match self {
        Utf32String::Ascii(text) => text.len(),
        Utf32String::Unicode(scalars) => scalars.len(),
    }
}
/// Returns whether this string contains no characters.
#[inline]
pub fn is_empty(&self) -> bool {
    let char_count = match self {
        Utf32String::Ascii(text) => text.len(),
        Utf32String::Unicode(scalars) => scalars.len(),
    };
    char_count == 0
}
/// Creates a borrowed [`Utf32Str`] that contains the characters in the
/// specified **character range**.
///
/// Panics if the range is out of bounds (regular slice indexing is used).
#[inline]
pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str {
    let first = match range.start_bound() {
        Bound::Unbounded => 0,
        Bound::Included(&idx) => idx,
        Bound::Excluded(&idx) => idx + 1,
    };
    let past_last = match range.end_bound() {
        Bound::Unbounded => self.len(),
        Bound::Included(&idx) => idx + 1,
        Bound::Excluded(&idx) => idx,
    };
    let span = first..past_last;
    match self {
        Utf32String::Unicode(scalars) => Utf32Str::Unicode(&scalars[span]),
        Utf32String::Ascii(text) => Utf32Str::Ascii(&text.as_bytes()[span]),
    }
}
/// Same as `slice` but accepts a u32 range for convenience since
/// those are the indices returned by the matcher.
#[inline]
pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
let start = match range.start_bound() {
Bound::Included(&start) => start,
Bound::Excluded(&start) => start + 1,
Bound::Unbounded => 0,
};
let end = match range.end_bound() {
Bound::Included(&end) => end + 1,
Bound::Excluded(&end) => end,
Bound::Unbounded => self.len() as u32,
};
match self {
Utf32String::Ascii(bytes) => {
Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize])
}
Utf32String::Unicode(codepoints) => {
Utf32Str::Unicode(&codepoints[start as usize..end as usize])
}
}
}
}
impl From<&str> for Utf32String {
    /// Copies `value` into an owned string, choosing the `Ascii`
    /// representation when the input is pure ASCII and the `Unicode`
    /// representation (filled from `chars::graphemes`) otherwise.
    #[inline]
    fn from(value: &str) -> Self {
        if !value.is_ascii() {
            return Self::Unicode(chars::graphemes(value).collect());
        }
        Self::Ascii(Box::from(value))
    }
}
impl From<Box<str>> for Utf32String {
    /// Converts an owned boxed string. A pure ASCII input is stored as-is in
    /// the `Ascii` variant; any other input is converted into the `Unicode`
    /// variant via `chars::graphemes`.
    fn from(value: Box<str>) -> Self {
        if !value.is_ascii() {
            return Self::Unicode(chars::graphemes(&value).collect());
        }
        // Reuse the existing allocation for the ASCII case.
        Self::Ascii(value)
    }
}
impl From<String> for Utf32String {
    /// Converts an owned `String` by boxing it and delegating to the
    /// `From<Box<str>>` conversion.
    #[inline]
    fn from(value: String) -> Self {
        let boxed: Box<str> = value.into_boxed_str();
        Self::from(boxed)
    }
}
impl<'a> From<Cow<'a, str>> for Utf32String {
    /// Converts a `Cow<str>` by dispatching to the `From<&str>` conversion
    /// for borrowed data and to the `From<String>` conversion for owned data.
    #[inline]
    fn from(value: Cow<'a, str>) -> Self {
        match value {
            Cow::Borrowed(borrowed) => Self::from(borrowed),
            Cow::Owned(owned) => Self::from(owned),
        }
    }
}
impl fmt::Debug for Utf32String {
    /// Formats the string using the `Debug` output of its borrowed
    /// [`Utf32Str`] view.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let view = self.slice(..);
        write!(f, "{view:?}")
    }
}
impl fmt::Display for Utf32String {
    /// Formats the string using the `Display` output of its borrowed
    /// [`Utf32Str`] view.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let view = self.slice(..);
        write!(f, "{view}")
    }
}

View File

@ -1,4 +1,31 @@
use std::cmp::Reverse; /*!
`nucleo` is a high level crate that provides a high level matcher API built
around a highly effective (parallel) matcher worker. It's designed to allow
quickly plugging a fully featured (and faster) fzf/skim like fuzzy matcher into
your TUI application.
It's designed to run matching on a background threadpool while providing a
snapshot of the last complete match. That means the matcher can update the
results live while the user is typing while never blocking the main UI thread
(beyond a user provided timeout). Nucleo also supports fully concurrent lock-free
(and wait-free) streaming of input items.
The [`Nucleo`] struct serves as the main API entrypoint for this crate.
# Status
Nucleo is used in the helix-editor and therefore has a large user base with lots
of real world testing. The core matcher implementation is considered complete
and is unlikely to see major changes. The `nucleo-matcher` crate is finished and
ready for widespread use, breaking changes should be very rare (a 1.0 release
should not be far away).
While the high level `nucleo` crate also works well (and is also used in helix),
there are still additional features that will be added in the future. The high
level crate also needs better documentation and will likely see a few API
changes in the future.
*/
use std::ops::{Bound, RangeBounds}; use std::ops::{Bound, RangeBounds};
use std::sync::atomic::{self, AtomicBool, Ordering}; use std::sync::atomic::{self, AtomicBool, Ordering};
use std::sync::Arc; use std::sync::Arc;
@ -7,22 +34,25 @@ use std::time::Duration;
use parking_lot::Mutex; use parking_lot::Mutex;
use rayon::ThreadPool; use rayon::ThreadPool;
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; use crate::pattern::MultiPattern;
pub use crate::utf32_string::Utf32String;
use crate::worker::Worker; use crate::worker::Worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; pub use nucleo_matcher::{chars, Config, Matcher, Utf32Str, Utf32String};
mod boxcar; mod boxcar;
mod par_sort; mod par_sort;
mod pattern; pub mod pattern;
mod utf32_string;
mod worker; mod worker;
/// A match candidate stored in a [`Nucleo`] worker.
pub struct Item<'a, T> { pub struct Item<'a, T> {
pub data: &'a T, pub data: &'a T,
pub matcher_columns: &'a [Utf32String], pub matcher_columns: &'a [Utf32String],
} }
/// A handle that allows adding new items to a [`Nucleo`] worker.
///
/// It's internally reference counted and can be cheaply cloned
/// and sent across threads.
pub struct Injector<T> { pub struct Injector<T> {
items: Arc<boxcar::Vec<T>>, items: Arc<boxcar::Vec<T>>,
notify: Arc<(dyn Fn() + Sync + Send)>, notify: Arc<(dyn Fn() + Sync + Send)>,
@ -38,15 +68,17 @@ impl<T> Clone for Injector<T> {
} }
impl<T> Injector<T> { impl<T> Injector<T> {
/// Appends an element to the back of the vector. /// Appends an element to the list of matched items.
/// This function is lock-free and wait-free.
pub fn push(&self, value: T, fill_columns: impl FnOnce(&mut [Utf32String])) -> u32 { pub fn push(&self, value: T, fill_columns: impl FnOnce(&mut [Utf32String])) -> u32 {
let idx = self.items.push(value, fill_columns); let idx = self.items.push(value, fill_columns);
(self.notify)(); (self.notify)();
idx idx
} }
/// Returns the total number of items in the current /// Returns the total number of items injected in the matcher. This might
/// queue /// not match the number of items in the match snapshot (if the matcher
/// is still running)
pub fn injected_items(&self) -> u32 { pub fn injected_items(&self) -> u32 {
self.items.count() self.items.count()
} }
@ -69,18 +101,24 @@ impl<T> Injector<T> {
} }
} }
/// An [item](crate::Item) that was successfully matched by a [`Nucleo`] worker.
#[derive(PartialEq, Eq, Debug, Clone, Copy)] #[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Match { pub struct Match {
pub score: u32, pub score: u32,
pub idx: u32, pub idx: u32,
} }
/// The status of a [`Nucleo`] worker after a match.
#[derive(PartialEq, Eq, Debug, Clone, Copy)] #[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct Status { pub struct Status {
/// Whether the current snapshot has changed.
pub changed: bool, pub changed: bool,
/// Whether the matcher is still processing in the background.
pub running: bool, pub running: bool,
} }
/// A snapshot represents the results of a [`Nucleo`] worker after
/// finishing a [`tick`](Nucleo::tick).
pub struct Snapshot<T: Sync + Send + 'static> { pub struct Snapshot<T: Sync + Send + 'static> {
item_count: u32, item_count: u32,
matches: Vec<Match>, matches: Vec<Match>,
@ -181,6 +219,8 @@ impl<T: Sync + Send + 'static> Snapshot<T> {
} }
} }
/// A high level matcher worker that quickly computes matches in a background
/// threadpool.
pub struct Nucleo<T: Sync + Send + 'static> { pub struct Nucleo<T: Sync + Send + 'static> {
// the way the API is build we totally don't actually need these to be Arcs // the way the API is build we totally don't actually need these to be Arcs
// but this lets us avoid some unsafe // but this lets us avoid some unsafe
@ -192,15 +232,31 @@ pub struct Nucleo<T: Sync + Send + 'static> {
items: Arc<boxcar::Vec<T>>, items: Arc<boxcar::Vec<T>>,
notify: Arc<(dyn Fn() + Sync + Send)>, notify: Arc<(dyn Fn() + Sync + Send)>,
snapshot: Snapshot<T>, snapshot: Snapshot<T>,
/// The pattern matched by this matcher. To update the match pattern
/// [`MultiPattern::reparse`](`pattern::MultiPattern::reparse`) should be used.
/// Note that the matcher worker will only become aware of the new pattern
/// after a call to [`tick`](Nucleo::tick).
pub pattern: MultiPattern, pub pattern: MultiPattern,
} }
impl<T: Sync + Send + 'static> Nucleo<T> { impl<T: Sync + Send + 'static> Nucleo<T> {
/// Constructs a new `nucleo` worker threadpool with the provided `config`.
///
/// `notify` is called every time new information is available and
/// [`tick`](Nucleo::tick) should be called. Note that `notify` is not
/// debounced, that should be handled by the downstream crate (for example
/// debouncing to only redraw at most every 1/60 seconds).
///
/// If `None` is passed for the number of worker threads, nucleo will use
/// one thread per hardware thread.
///
/// Nucleo can match items with multiple orthogonal properties. `columns`
/// indicates how many matching columns each item (and the pattern) has. The
/// number of columns can not be changed after construction.
pub fn new( pub fn new(
config: MatcherConfig, config: Config,
notify: Arc<(dyn Fn() + Sync + Send)>, notify: Arc<(dyn Fn() + Sync + Send)>,
num_threads: Option<usize>, num_threads: Option<usize>,
case_matching: CaseMatching,
columns: u32, columns: u32,
) -> Self { ) -> Self {
let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns); let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns);
@ -209,10 +265,10 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
should_notify: worker.should_notify.clone(), should_notify: worker.should_notify.clone(),
items: worker.items.clone(), items: worker.items.clone(),
pool, pool,
pattern: MultiPattern::new(&config, case_matching, columns as usize), pattern: MultiPattern::new(columns as usize),
snapshot: Snapshot { snapshot: Snapshot {
matches: Vec::with_capacity(2 * 1024), matches: Vec::with_capacity(2 * 1024),
pattern: MultiPattern::new(&config, case_matching, columns as usize), pattern: MultiPattern::new(columns as usize),
item_count: 0, item_count: 0,
items: worker.items.clone(), items: worker.items.clone(),
}, },
@ -222,11 +278,12 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
} }
} }
/// Returns a snapshot of all items /// Returns a snapshot of the current matcher state.
pub fn snapshot(&self) -> &Snapshot<T> { pub fn snapshot(&self) -> &Snapshot<T> {
&self.snapshot &self.snapshot
} }
/// Returns an injector that can be used for adding candidates to the matcher.
pub fn injector(&self) -> Injector<T> { pub fn injector(&self) -> Injector<T> {
Injector { Injector {
items: self.items.clone(), items: self.items.clone(),
@ -234,11 +291,11 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
} }
} }
/// Restart the the item stream. Removes all items disconnects all /// Restart the the item stream. Removes all items and disconnects all
/// previously created injectors from this instance. If `clear_snapshot` is /// previously created injectors from this instance. If `clear_snapshot`
/// `true` then all items and matched are removed from the /// is `true` then all items and matched are removed from the [`Snapshot`]
/// [`Snapshot`](crate::Snapshot) immediately. Otherwise the snapshot will /// (crate::Snapshot) immediately. Otherwise the snapshot will keep the
/// keep the current matches until the matcher has run again. /// current matches until the matcher has run again.
/// ///
/// # Note /// # Note
/// ///
@ -254,10 +311,14 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
} }
} }
pub fn update_config(&mut self, config: MatcherConfig) { pub fn update_config(&mut self, config: Config) {
self.worker.lock().update_config(config) self.worker.lock().update_config(config)
} }
/// The main way to interact with the matcher; this should be called
/// regularly (for example each time a frame is rendered). To avoid
/// excessive redraws this method will wait up to `timeout` milliseconds for
/// the worker thread to finish. It is recommended to set the timeout to 10ms.
pub fn tick(&mut self, timeout: u64) -> Status { pub fn tick(&mut self, timeout: u64) -> Status {
self.should_notify.store(false, atomic::Ordering::Relaxed); self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status(); let status = self.pattern.status();
@ -278,7 +339,10 @@ impl<T: Sync + Send + 'static> Nucleo<T> {
} else { } else {
let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else { let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else {
self.should_notify.store(true, Ordering::Release); self.should_notify.store(true, Ordering::Release);
return Status{ changed: false, running: true }; return Status {
changed: false,
running: true,
};
}; };
worker worker
}; };
@ -320,31 +384,3 @@ impl<T: Sync + Send> Drop for Nucleo<T> {
} }
} }
} }
/// convenience function to easily fuzzy match
/// on a (relatively small) list of inputs. This is not recommended for building a full tui
/// application that can match large numbers of matches as all matching is done on the current
/// thread, effectively blocking the UI
pub fn fuzzy_match<T: AsRef<str>>(
matcher: &mut Matcher,
pattern: &str,
items: impl IntoIterator<Item = T>,
case_matching: CaseMatching,
) -> Vec<(T, u32)> {
let mut pattern_ = Pattern::new(&matcher.config, case_matching);
pattern_.set_literal(pattern, PatternKind::Fuzzy, false);
if pattern_.is_empty() {
return items.into_iter().map(|item| (item, 0)).collect();
}
let mut buf = Vec::new();
let mut items: Vec<_> = items
.into_iter()
.filter_map(|item| {
pattern_
.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
.map(|score| (item, score))
})
.collect();
items.sort_by_key(|(_, score)| Reverse(*score));
items
}

View File

@ -1,188 +1,12 @@
use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern};
use nucleo_matcher::{Matcher, Utf32String};
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
use crate::Utf32String; #[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Default)]
pub(crate) enum Status {
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[default]
#[non_exhaustive]
pub enum CaseMatching {
Ignore,
Smart,
Respect,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
pub enum PatternKind {
Exact,
Fuzzy,
Substring,
Prefix,
Postfix,
}
#[derive(Debug, PartialEq, Eq, Clone)]
struct PatternAtom {
kind: PatternKind,
needle: Utf32String,
invert: bool,
ignore_case: bool,
}
impl PatternAtom {
fn literal(
needle: &str,
normalize: bool,
case: CaseMatching,
kind: PatternKind,
escape_whitespace: bool,
) -> PatternAtom {
let mut ignore_case;
let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") {
let mut needle = start.to_owned();
for rem in rem.split("\\ ") {
needle.push(' ');
needle.push_str(rem);
}
needle
} else {
needle.to_owned()
}
} else {
needle.to_owned()
};
match case {
CaseMatching::Ignore => {
ignore_case = true;
needle.make_ascii_lowercase()
}
CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
}
CaseMatching::Respect => ignore_case = false,
}
Utf32String::Ascii(needle.into_boxed_str())
} else {
let mut needle_ = Vec::with_capacity(needle.len());
ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
if escape_whitespace {
let mut saw_backslash = false;
for mut c in chars::graphemes(needle) {
if saw_backslash {
if c == ' ' {
needle_.push(' ');
saw_backslash = false;
continue;
} else {
needle_.push('\\');
}
}
saw_backslash = c == '\\';
if normalize {
c = chars::normalize(c);
}
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c)
}
CaseMatching::Respect => (),
}
needle_.push(c);
}
} else {
let chars = chars::graphemes(needle).map(|mut c| {
if normalize {
c = chars::normalize(c);
}
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c);
}
CaseMatching::Respect => (),
}
c
});
needle_.extend(chars);
};
Utf32String::Unicode(needle_.into_boxed_slice())
};
PatternAtom {
kind,
needle,
invert: false,
ignore_case,
}
}
fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom {
let mut atom = raw;
let invert = match atom.as_bytes() {
[b'!', ..] => {
atom = &atom[1..];
true
}
[b'\\', b'!', ..] => {
atom = &atom[1..];
false
}
_ => false,
};
let mut kind = match atom.as_bytes() {
[b'^', ..] => {
atom = &atom[1..];
PatternKind::Prefix
}
[b'\'', ..] => {
atom = &atom[1..];
PatternKind::Substring
}
[b'\\', b'^' | b'\'', ..] => {
atom = &atom[1..];
PatternKind::Fuzzy
}
_ => PatternKind::Fuzzy,
};
let mut append_dollar = false;
match atom.as_bytes() {
[.., b'\\', b'$'] => {
append_dollar = true;
atom = &atom[..atom.len() - 2]
}
[.., b'$'] => {
kind = if kind == PatternKind::Fuzzy {
PatternKind::Postfix
} else {
PatternKind::Exact
};
atom = &atom[..atom.len() - 1]
}
_ => (),
}
if invert && kind == PatternKind::Fuzzy {
kind = PatternKind::Substring
}
let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true);
pattern.invert = invert;
if append_dollar {
pattern.needle.push('$');
}
pattern
}
}
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
pub enum Status {
Unchanged, Unchanged,
Update, Update,
Rescore, Rescore,
@ -190,7 +14,7 @@ pub enum Status {
#[derive(Debug)] #[derive(Debug)]
pub struct MultiPattern { pub struct MultiPattern {
pub cols: Vec<Pattern>, cols: Vec<(Pattern, Status)>,
} }
impl Clone for MultiPattern { impl Clone for MultiPattern {
@ -206,214 +30,64 @@ impl Clone for MultiPattern {
} }
impl MultiPattern { impl MultiPattern {
pub fn new( /// Creates a multi pattern with `columns` empty column patterns.
matcher_config: &MatcherConfig, pub fn new(columns: usize) -> Self {
case_matching: CaseMatching, Self {
columns: usize, cols: vec![Default::default(); columns],
) -> MultiPattern {
MultiPattern {
cols: vec![Pattern::new(matcher_config, case_matching); columns],
} }
} }
/// Re-parses the pattern of a single `column` from `new_text`.
///
/// By passing `append = true` the caller promises that the text given to the
/// previous `reparse` invocation is a prefix of `new_text`. This enables
/// additional optimizations but can lead to missing matches if an incorrect
/// value is passed.
///
/// The column is marked `Status::Update` only if `append` is set, the column
/// is not already marked `Status::Rescore`, and the last atom of the
/// previous pattern (if there is one) is not negative. In every other case
/// it is marked `Status::Rescore`.
///
/// # Panics
///
/// Panics if `column` is out of range for this pattern.
pub fn reparse(
    &mut self,
    column: usize,
    new_text: &str,
    case_matching: CaseMatching,
    append: bool,
) {
    let (pattern, status) = &mut self.cols[column];
    // Inspect the *old* atoms here, before they are replaced further down.
    let ends_positive = pattern.atoms.last().map_or(true, |atom| !atom.negative);
    *status = if append && *status != Status::Rescore && ends_positive {
        Status::Update
    } else {
        Status::Rescore
    };
    pattern.reparse(new_text, case_matching);
}
pub(crate) fn status(&self) -> Status { pub(crate) fn status(&self) -> Status {
self.cols self.cols
.iter() .iter()
.map(|col| col.status) .map(|&(_, status)| status)
.max() .max()
.unwrap_or(Status::Unchanged) .unwrap_or(Status::Unchanged)
} }
pub(crate) fn reset_status(&mut self) { pub(crate) fn reset_status(&mut self) {
for col in &mut self.cols { for (_, status) in &mut self.cols {
col.status = Status::Unchanged *status = Status::Unchanged
} }
} }
pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option<u32> { pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option<u32> {
// TODO: wheight columns? // TODO: wheight columns?
let mut score = 0; let mut score = 0;
for (pattern, haystack) in self.cols.iter().zip(haystack) { for ((pattern, _), haystack) in self.cols.iter().zip(haystack) {
score += pattern.score(haystack.slice(..), matcher)? score += pattern.score(haystack.slice(..), matcher)?
} }
Some(score) Some(score)
} }
}
#[derive(Debug)]
pub struct Pattern {
atoms: Vec<PatternAtom>,
case_matching: CaseMatching,
normalize: bool,
status: Status,
}
impl Pattern {
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
Pattern {
atoms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
}
}
pub fn new_fuzzy_literal(
matcher_config: &MatcherConfig,
case_matching: CaseMatching,
pattern: &str,
) -> Pattern {
let mut res = Pattern {
atoms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
};
res.set_literal(pattern, PatternKind::Fuzzy, false);
res
}
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case;
let pattern_score = match pattern.kind {
PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)),
PatternKind::Substring => {
matcher.substring_match(haystack, pattern.needle.slice(..))
}
PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)),
};
if pattern.invert {
if pattern_score.is_some() {
return None;
}
} else {
score += pattern_score? as u32
}
}
Some(score)
}
pub fn indices(
&self,
haystack: Utf32Str<'_>,
matcher: &mut Matcher,
indices: &mut Vec<u32>,
) -> Option<u32> {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case;
if pattern.invert {
let pattern_score = match pattern.kind {
PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)),
PatternKind::Substring => {
matcher.substring_match(haystack, pattern.needle.slice(..))
}
PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)),
PatternKind::Postfix => {
matcher.postfix_match(haystack, pattern.needle.slice(..))
}
};
if pattern_score.is_some() {
return None;
}
continue;
}
let pattern_score = match pattern.kind {
PatternKind::Exact => {
matcher.exact_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Fuzzy => {
matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Substring => {
matcher.substring_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Prefix => {
matcher.prefix_indices(haystack, pattern.needle.slice(..), indices)
}
PatternKind::Postfix => {
matcher.postfix_indices(haystack, pattern.needle.slice(..), indices)
}
};
score += pattern_score? as u32
}
Some(score)
}
pub fn parse_from(&mut self, pattern: &str, append: bool) {
let invert = self.atoms.last().map_or(false, |pat| pat.invert);
self.atoms.clear();
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = PatternAtom::parse(atom, self.normalize, self.case_matching);
if atom.needle.is_empty() {
return None;
}
Some(atom)
});
self.atoms.extend(atoms);
self.status = if append && !invert && self.status != Status::Rescore {
Status::Update
} else {
Status::Rescore
};
}
pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) {
self.atoms.clear();
let pattern =
PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false);
if !pattern.needle.is_empty() {
self.atoms.push(pattern);
}
self.status = if append && self.status != Status::Rescore {
Status::Update
} else {
Status::Rescore
};
}
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.atoms.is_empty() self.cols.iter().all(|(pat, _)| pat.atoms.is_empty())
} }
} }
impl Clone for Pattern {
fn clone(&self) -> Self {
Self {
atoms: self.atoms.clone(),
case_matching: self.case_matching,
normalize: self.normalize,
status: self.status,
}
}
fn clone_from(&mut self, source: &Self) {
self.atoms.clone_from(&source.atoms);
self.case_matching = source.case_matching;
self.normalize = source.normalize;
self.status = source.status;
}
}
fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
let mut saw_backslash = false;
pattern.split(move |c| {
saw_backslash = match c {
' ' if !saw_backslash => return true,
'\\' => true,
_ => false,
};
false
})
}

View File

@ -1,145 +1,14 @@
use crate::pattern::{PatternAtom, Status}; use nucleo_matcher::pattern::CaseMatching;
use crate::{CaseMatching, Pattern, PatternKind};
fn parse_atom(pat: &str) -> PatternAtom { use crate::pattern::{MultiPattern, Status};
parse_atom_with(pat, CaseMatching::Smart)
}
fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom {
let mut pat = parse_with(pat, case_matching, false);
assert_eq!(pat.atoms.len(), 1);
pat.atoms.remove(0)
}
fn parse_with(pat: &str, case_matching: CaseMatching, append: bool) -> Pattern {
let mut res = Pattern::new(&nucleo_matcher::MatcherConfig::DEFAULT, case_matching);
res.parse_from(pat, append);
res
}
#[test]
fn negative() {
let pat = parse_atom("!foo");
assert!(pat.invert);
assert_eq!(pat.kind, PatternKind::Substring);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("!^foo");
assert!(pat.invert);
assert_eq!(pat.kind, PatternKind::Prefix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("!foo$");
assert!(pat.invert);
assert_eq!(pat.kind, PatternKind::Postfix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("!^foo$");
assert!(pat.invert);
assert_eq!(pat.kind, PatternKind::Exact);
assert_eq!(pat.needle.to_string(), "foo");
}
#[test]
fn pattern_kinds() {
let pat = parse_atom("foo");
assert!(!pat.invert);
assert_eq!(pat.kind, PatternKind::Fuzzy);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("'foo");
assert!(!pat.invert);
assert_eq!(pat.kind, PatternKind::Substring);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("^foo");
assert!(!pat.invert);
assert_eq!(pat.kind, PatternKind::Prefix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("foo$");
assert!(!pat.invert);
assert_eq!(pat.kind, PatternKind::Postfix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom("^foo$");
assert!(!pat.invert);
assert_eq!(pat.kind, PatternKind::Exact);
assert_eq!(pat.needle.to_string(), "foo");
}
#[test]
fn case_matching() {
let pat = parse_atom_with("foo", CaseMatching::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom_with("Foo", CaseMatching::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = parse_atom_with("Foo", CaseMatching::Ignore);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "foo");
let pat = parse_atom_with("Foo", CaseMatching::Respect);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = parse_atom_with("Foo", CaseMatching::Respect);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = parse_atom_with("Äxx", CaseMatching::Ignore);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "axx");
let pat = parse_atom_with("Äxx", CaseMatching::Respect);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Axx");
let pat = parse_atom_with("Äxx", CaseMatching::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Axx");
let pat = parse_atom_with("Äxx", CaseMatching::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Axx");
let pat = parse_atom_with("你xx", CaseMatching::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "你xx");
let pat = parse_atom_with("你xx", CaseMatching::Ignore);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "你xx");
let pat = parse_atom_with("Ⲽxx", CaseMatching::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Ⲽxx");
let pat = parse_atom_with("Ⲽxx", CaseMatching::Ignore);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "ⲽxx");
}
#[test]
fn escape() {
let pat = parse_atom("foo\\ bar");
assert_eq!(pat.needle.to_string(), "foo bar");
let pat = parse_atom("\\!foo");
assert_eq!(pat.needle.to_string(), "!foo");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("\\'foo");
assert_eq!(pat.needle.to_string(), "'foo");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("\\^foo");
assert_eq!(pat.needle.to_string(), "^foo");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("foo\\$");
assert_eq!(pat.needle.to_string(), "foo$");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("^foo\\$");
assert_eq!(pat.needle.to_string(), "foo$");
assert_eq!(pat.kind, PatternKind::Prefix);
let pat = parse_atom("\\^foo\\$");
assert_eq!(pat.needle.to_string(), "^foo$");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("\\!^foo\\$");
assert_eq!(pat.needle.to_string(), "!^foo$");
assert_eq!(pat.kind, PatternKind::Fuzzy);
let pat = parse_atom("!\\^foo\\$");
assert_eq!(pat.needle.to_string(), "^foo$");
assert_eq!(pat.kind, PatternKind::Substring);
}
#[test] #[test]
fn append() { fn append() {
let mut pat = parse_with("!", CaseMatching::Smart, true); let mut pat = MultiPattern::new(1);
assert_eq!(pat.status, Status::Update); pat.reparse(0, "!", CaseMatching::Smart, true);
pat.parse_from("!f", true); assert_eq!(pat.status(), Status::Update);
assert_eq!(pat.status, Status::Update); pat.reparse(0, "!f", CaseMatching::Smart, true);
pat.parse_from("!fo", true); assert_eq!(pat.status(), Status::Update);
assert_eq!(pat.status, Status::Rescore); pat.reparse(0, "!fo", CaseMatching::Smart, true);
assert_eq!(pat.status(), Status::Rescore);
} }

View File

@ -3,7 +3,7 @@ use std::mem::take;
use std::sync::atomic::{self, AtomicBool, AtomicU32}; use std::sync::atomic::{self, AtomicBool, AtomicU32};
use std::sync::Arc; use std::sync::Arc;
use nucleo_matcher::MatcherConfig; use nucleo_matcher::Config;
use parking_lot::Mutex; use parking_lot::Mutex;
use rayon::{prelude::*, ThreadPool}; use rayon::{prelude::*, ThreadPool};
@ -42,15 +42,15 @@ impl<T: Sync + Send + 'static> Worker<T> {
pub(crate) fn item_count(&self) -> u32 { pub(crate) fn item_count(&self) -> u32 {
self.last_snapshot - self.in_flight.len() as u32 self.last_snapshot - self.in_flight.len() as u32
} }
pub(crate) fn update_config(&mut self, config: MatcherConfig) { pub(crate) fn update_config(&mut self, config: Config) {
for matcher in self.matchers.0.iter_mut() { for matcher in self.matchers.0.iter_mut() {
matcher.get_mut().config = config; matcher.get_mut().config = config.clone();
} }
} }
pub(crate) fn new( pub(crate) fn new(
worker_threads: Option<usize>, worker_threads: Option<usize>,
config: MatcherConfig, config: Config,
notify: Arc<(dyn Fn() + Sync + Send)>, notify: Arc<(dyn Fn() + Sync + Send)>,
cols: u32, cols: u32,
) -> (ThreadPool, Self) { ) -> (ThreadPool, Self) {
@ -62,7 +62,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
.build() .build()
.expect("creating threadpool failed"); .expect("creating threadpool failed");
let matchers = (0..worker_threads) let matchers = (0..worker_threads)
.map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config))) .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config.clone())))
.collect(); .collect();
let worker = Worker { let worker = Worker {
running: false, running: false,
@ -70,7 +70,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
last_snapshot: 0, last_snapshot: 0,
matches: Vec::new(), matches: Vec::new(),
// just a placeholder // just a placeholder
pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0), pattern: MultiPattern::new(cols as usize),
canceled: Arc::new(AtomicBool::new(false)), canceled: Arc::new(AtomicBool::new(false)),
should_notify: Arc::new(AtomicBool::new(false)), should_notify: Arc::new(AtomicBool::new(false)),
was_canceled: false, was_canceled: false,
@ -102,14 +102,20 @@ impl<T: Sync + Send + 'static> Worker<T> {
let Some(item) = item else { let Some(item) = item else {
in_flight.lock().push(idx); in_flight.lock().push(idx);
unmatched.fetch_add(1, atomic::Ordering::Relaxed); unmatched.fetch_add(1, atomic::Ordering::Relaxed);
return Match { score: 0, idx: u32::MAX }; return Match {
score: 0,
idx: u32::MAX,
};
}; };
if self.canceled.load(atomic::Ordering::Relaxed) { if self.canceled.load(atomic::Ordering::Relaxed) {
return Match { score: 0, idx }; return Match { score: 0, idx };
} }
let Some(score) = pattern.score(item.matcher_columns, matchers.get()) else { let Some(score) = pattern.score(item.matcher_columns, matchers.get()) else {
unmatched.fetch_add(1, atomic::Ordering::Relaxed); unmatched.fetch_add(1, atomic::Ordering::Relaxed);
return Match { score: 0, idx: u32::MAX }; return Match {
score: 0,
idx: u32::MAX,
};
}; };
Match { score, idx } Match { score, idx }
}); });
@ -156,7 +162,7 @@ impl<T: Sync + Send + 'static> Worker<T> {
} }
// TODO: be smarter around reusing past results for rescoring // TODO: be smarter around reusing past results for rescoring
if self.pattern.cols.iter().all(|pat| pat.is_empty()) { if self.pattern.is_empty() {
self.reset_matches(); self.reset_matches();
self.process_new_items_trivial(); self.process_new_items_trivial();
if self.should_notify.load(atomic::Ordering::Relaxed) { if self.should_notify.load(atomic::Ordering::Relaxed) {

View File

@ -1,3 +1,3 @@
default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"] default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"]
[files] [files]
extend-exclude = ["matcher/src/tests.rs", "*.html"] extend-exclude = ["matcher/src/tests.rs","src/pattern/tests.rs", "*.html"]