From de844d6acec4c3e4435c8f9595796db9470ea1a8 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Mon, 28 Aug 2023 01:33:47 +0200 Subject: [PATCH] move pattern API to nucleo-matcher --- bench/src/main.rs | 2 +- matcher/src/chars.rs | 33 ++- matcher/src/chars/normalize.rs | 10 + matcher/src/config.rs | 24 +- matcher/src/fuzzy_optimal.rs | 4 +- matcher/src/lib.rs | 16 +- matcher/src/pattern.rs | 469 +++++++++++++++++++++++++++++++++ matcher/src/pattern/tests.rs | 114 ++++++++ matcher/src/score.rs | 4 +- matcher/src/tests.rs | 14 +- matcher/src/utf32_str.rs | 134 ++++------ src/lib.rs | 44 +--- src/pattern.rs | 408 +++------------------------- src/pattern/tests.rs | 149 +---------- src/worker.rs | 14 +- typos.toml | 2 +- 16 files changed, 766 insertions(+), 675 deletions(-) create mode 100644 matcher/src/pattern.rs create mode 100644 matcher/src/pattern/tests.rs diff --git a/bench/src/main.rs b/bench/src/main.rs index 148d353..bc77b03 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -43,7 +43,7 @@ fn main() { Some((path.as_str().into(), path)) }) .unzip(); - let mut nucleo = nucleo::Matcher::new(nucleo::MatcherConfig::DEFAULT.match_paths()); + let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths()); let skim = fuzzy_matcher::skim::SkimMatcherV2::default(); // TODO: unicode? diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index a469fc1..710c212 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -1,7 +1,9 @@ +//! 
Utilities for working with (unicode) characters/codepoints + use std::fmt::{self, Debug, Display}; use crate::chars::case_fold::CASE_FOLDING_SIMPLE; -use crate::MatcherConfig; +use crate::Config; //autogenerated by generate-ucd #[allow(warnings)] @@ -11,9 +13,9 @@ mod normalize; pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { const ASCII: bool; - fn char_class(self, config: &MatcherConfig) -> CharClass; - fn char_class_and_normalize(self, config: &MatcherConfig) -> (Self, CharClass); - fn normalize(self, config: &MatcherConfig) -> Self; + fn char_class(self, config: &Config) -> CharClass; + fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); + fn normalize(self, config: &Config) -> Self; } /// repr tansparent wrapper around u8 with better formatting and `PartialEq` implementation @@ -42,7 +44,7 @@ impl PartialEq for char { impl Char for AsciiChar { const ASCII: bool = true; #[inline] - fn char_class(self, config: &MatcherConfig) -> CharClass { + fn char_class(self, config: &Config) -> CharClass { let c = self.0; // using manual if conditions instead optimizes better if c >= b'a' && c <= b'z' { @@ -61,7 +63,7 @@ impl Char for AsciiChar { } #[inline(always)] - fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { let char_class = self.char_class(config); if config.ignore_case && char_class == CharClass::Upper { self.0 += 32 @@ -70,7 +72,7 @@ impl Char for AsciiChar { } #[inline(always)] - fn normalize(mut self, config: &MatcherConfig) -> Self { + fn normalize(mut self, config: &Config) -> Self { if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { self.0 += 32 } @@ -95,7 +97,7 @@ fn char_class_non_ascii(c: char) -> CharClass { impl Char for char { const ASCII: bool = false; #[inline(always)] - fn char_class(self, config: &MatcherConfig) -> CharClass { + fn char_class(self, config: &Config) -> CharClass { if self.is_ascii() 
{ return AsciiChar(self as u8).char_class(config); } @@ -103,7 +105,7 @@ impl Char for char { } #[inline(always)] - fn char_class_and_normalize(mut self, config: &MatcherConfig) -> (Self, CharClass) { + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { if self.is_ascii() { let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); return (c.0 as char, class); @@ -123,7 +125,7 @@ impl Char for char { } #[inline(always)] - fn normalize(mut self, config: &MatcherConfig) -> Self { + fn normalize(mut self, config: &Config) -> Self { if config.normalize { self = normalize::normalize(self); } @@ -138,12 +140,14 @@ pub use normalize::normalize; use unicode_segmentation::UnicodeSegmentation; #[inline(always)] +/// Converts a character to lower case using simple unicode case folding pub fn to_lower_case(c: char) -> char { CASE_FOLDING_SIMPLE .binary_search_by_key(&c, |(upper, _)| *upper) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } +/// Checks whether a character is upper case according to simple unicode case folding #[inline(always)] pub fn is_upper_case(c: char) -> bool { CASE_FOLDING_SIMPLE @@ -152,8 +156,7 @@ pub fn is_upper_case(c: char) -> bool { } #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] -#[non_exhaustive] -pub enum CharClass { +pub(crate) enum CharClass { Whitespace, NonWord, Delimiter, @@ -163,8 +166,10 @@ pub enum CharClass { Number, } -/// nucleo cannot match graphemes as single units to work around -/// that we only use the first codepoint of each grapheme +/// Nucleo cannot match graphemes as single units. To work around +/// that we only use the first codepoint of each grapheme. This +/// iterator returns the first character of each unicode grapheme +/// in a string and is used for constructing `Utf32Str(ing)`. 
pub fn graphemes(text: &str) -> impl Iterator + '_ { text.graphemes(true).map(|grapheme| { grapheme diff --git a/matcher/src/chars/normalize.rs b/matcher/src/chars/normalize.rs index 66a4db1..d3df40e 100644 --- a/matcher/src/chars/normalize.rs +++ b/matcher/src/chars/normalize.rs @@ -495,6 +495,16 @@ const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1; const LEN3: usize = (DATA3_END - DATA3_START) as usize; static TABLE3: [char; LEN3] = generate_table(&DATA3); +/// Normalizes a unicode character by converting latin characters +/// which are variants of ASCII characters to their latin equivant. +/// +/// # Example +/// +/// ``` rust +/// # use nucleo_matcher::chars::normalize; +/// +/// assert_eq!(normalize('ä'), 'a'); +/// ``` pub fn normalize(c: char) -> char { let i = c as u32; if i < DATA1_START || i >= DATA3_END { diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 67e07b7..eca7ae3 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -1,16 +1,19 @@ use crate::chars::CharClass; use crate::score::BONUS_BOUNDARY; +/// Configuration data that controls how a matcher behaves #[non_exhaustive] -#[derive(PartialEq, Eq, Debug, Clone, Copy)] -pub struct MatcherConfig { - pub delimiter_chars: &'static [u8], +#[derive(PartialEq, Eq, Debug, Clone)] +pub struct Config { + /// Characters that act as delimiters and provide bonus + /// for matching the following char + pub(crate) delimiter_chars: &'static [u8], /// Extra bonus for word boundary after whitespace character or beginning of the string pub(crate) bonus_boundary_white: u16, - /// Extra bonus for word boundary after slash, colon, semi-colon, and comma pub(crate) bonus_boundary_delimiter: u16, - pub initial_char_class: CharClass, + pub(crate) initial_char_class: CharClass, + /// Whether to normalize latin script characters to ASCII (enabled by default) pub normalize: bool, /// whether to ignore casing @@ -25,9 +28,11 @@ pub struct MatcherConfig { pub prefer_prefix: bool, } -impl 
MatcherConfig { +impl Config { + /// The default config for nucleo, implemented as a constant since + /// Default::default can not be called in a const context pub const DEFAULT: Self = { - MatcherConfig { + Config { delimiter_chars: b"/,:;|", bonus_boundary_white: BONUS_BOUNDARY + 2, bonus_boundary_delimiter: BONUS_BOUNDARY + 1, @@ -39,9 +44,9 @@ impl MatcherConfig { }; } -impl MatcherConfig { +impl Config { + /// Configures the matcher with bonuses appropriate for matching file paths. pub fn set_match_paths(&mut self) { - // compared to fzf we include if cfg!(windows) { self.delimiter_chars = b"/:\\"; } else { @@ -51,6 +56,7 @@ impl MatcherConfig { self.initial_char_class = CharClass::Delimiter; } + /// Configures the matcher with bonuses appropriate for matching file paths. pub const fn match_paths(mut self) -> Self { if cfg!(windows) { self.delimiter_chars = b"/\\"; diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs index 10c7bcd..aba7bbe 100644 --- a/matcher/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -6,7 +6,7 @@ use crate::score::{ BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, }; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; impl Matcher { pub(crate) fn fuzzy_match_optimal, N: Char>( @@ -112,7 +112,7 @@ impl MatcherDataView<'_, H> { &mut self, needle: &[N], mut prev_class: CharClass, - config: &MatcherConfig, + config: &Config, start: u32, ) -> bool where diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index efae388..7feff93 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -10,6 +10,7 @@ a slightly less convenient API. 
Be sure to carefully read the documentation of t // sadly ranges don't optmimzie well #![allow(clippy::manual_range_contains)] +#![warn(missing_docs)] pub mod chars; mod config; @@ -19,6 +20,7 @@ mod exact; mod fuzzy_greedy; mod fuzzy_optimal; mod matrix; +pub mod pattern; mod prefilter; mod score; mod utf32_str; @@ -26,7 +28,7 @@ mod utf32_str; #[cfg(test)] mod tests; -pub use crate::config::MatcherConfig; +pub use crate::config::Config; pub use crate::utf32_str::{Utf32Str, Utf32String}; use crate::chars::{AsciiChar, Char}; @@ -80,7 +82,8 @@ use crate::matrix::MatrixSlab; /// that the matcher *will panic*. The caller must decide whether it wants to /// filter out long haystacks or truncate them. pub struct Matcher { - pub config: MatcherConfig, + #[allow(missing_docs)] + pub config: Config, slab: MatrixSlab, } @@ -88,7 +91,7 @@ pub struct Matcher { impl Clone for Matcher { fn clone(&self) -> Self { Matcher { - config: self.config, + config: self.config.clone(), slab: MatrixSlab::new(), } } @@ -105,14 +108,17 @@ impl std::fmt::Debug for Matcher { impl Default for Matcher { fn default() -> Self { Matcher { - config: MatcherConfig::DEFAULT, + config: Config::DEFAULT, slab: MatrixSlab::new(), } } } impl Matcher { - pub fn new(config: MatcherConfig) -> Self { + /// Creates a new matcher instance, note that this will eagerly allocate + /// a fairly large chunk of heap memory (135KB currently but subject to + /// change) so matchers should be reused if used in a loop. + pub fn new(config: Config) -> Self { Self { config, slab: MatrixSlab::new(), diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs new file mode 100644 index 0000000..3583ebe --- /dev/null +++ b/matcher/src/pattern.rs @@ -0,0 +1,469 @@ +//! This module provides a slightly higher level API for matching strings. 
+ +use std::cmp::Reverse; + +use crate::{chars, Matcher, Utf32Str}; + +#[cfg(test)] +mod tests; + +use crate::Utf32String; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[non_exhaustive] +/// How nucleo will treat case mismatch +pub enum CaseMatching { + /// Characters always match their case folded version (`a == A`) + Ignore, + /// Characters never match their case folded version (`a != A`) + Respect, + /// Acts like `Ignore` if all characters in a pattern atom are + /// lowercase and like `Respect` otherwise + #[default] + Smart, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[non_exhaustive] +/// The kind of matching algorithm to run for this atom +pub enum AtomKind { + /// Fuzzy matching where the needle must match any haystack characters + /// (match can contain gaps). This atom kind is used by default if no + /// special syntax is used. There is no negated fuzzy matching (too + /// many false positives). + /// + /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match). + Fuzzy, + /// The needle must match a contiguous sequence of haystack characters + /// without gaps. This atom kind is parsed from the following syntax: + /// `'foo` and `!foo` (negated). + /// + /// See also [`Matcher::substring_match`](crate::Matcher::substring_match). + Substring, + /// The needle must match all leading haystack characters without gaps or + /// prefix. This atom kind is parsed from the following syntax: `^foo` and + /// `!^foo` (negated). + /// + /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match). + Prefix, + /// The needle must match all trailing haystack characters without gaps or + /// postfix. This atom kind is parsed from the following syntax: `foo$` and + /// `!foo$` (negated). + /// + /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match). + Postfix, + /// The needle must match all haystack characters without gaps or prefix. 
+ /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$` + /// (negated). + /// + /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). + Exact, +} + +/// A single pattern component that is matched with a single [`Matcher`](crate::Matcher) function +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Atom { + /// Whether this pattern atom is a negative match. + /// A negative pattern atom will prevent haystacks matching it from + /// being matched. It does not contribute to scoring/indices + pub negative: bool, + /// The kind of match that this pattern performs + pub kind: AtomKind, + needle: Utf32String, + ignore_case: bool, +} + +impl Atom { + /// Creates a single [`Atom`] from a string by performing unicode + /// normalization + pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom { + Atom::new_inner(needle, case, kind, escape_whitespace, false) + } + + fn new_inner( + needle: &str, + case: CaseMatching, + kind: AtomKind, + escape_whitespace: bool, + append_dollar: bool, + ) -> Atom { + let mut ignore_case; + let needle = if needle.is_ascii() { + let mut needle = if escape_whitespace { + if let Some((start, rem)) = needle.split_once("\\ ") { + let mut needle = start.to_owned(); + for rem in rem.split("\\ ") { + needle.push(' '); + needle.push_str(rem); + } + needle + } else { + needle.to_owned() + } + } else { + needle.to_owned() + }; + + match case { + CaseMatching::Ignore => { + ignore_case = true; + needle.make_ascii_lowercase() + } + CaseMatching::Smart => { + ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) + } + CaseMatching::Respect => ignore_case = false, + } + if append_dollar { + needle.push('$'); + } + Utf32String::Ascii(needle.into_boxed_str()) + } else { + let mut needle_ = Vec::with_capacity(needle.len()); + ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); + if escape_whitespace { + let mut saw_backslash = false; + for mut c in 
chars::graphemes(needle) { + if saw_backslash { + if c == ' ' { + needle_.push(' '); + saw_backslash = false; + continue; + } else { + needle_.push('\\'); + } + } + saw_backslash = c == '\\'; + match case { + CaseMatching::Ignore => c = chars::to_lower_case(c), + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c) + } + CaseMatching::Respect => (), + } + needle_.push(c); + } + } else { + let chars = chars::graphemes(needle).map(|mut c| { + match case { + CaseMatching::Ignore => c = chars::to_lower_case(c), + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c); + } + CaseMatching::Respect => (), + } + c + }); + needle_.extend(chars); + }; + if append_dollar { + needle_.push('$'); + } + Utf32String::Unicode(needle_.into_boxed_slice()) + }; + Atom { + kind, + needle, + negative: false, + ignore_case, + } + } + + /// Parse a pattern atom from a string. Some special trailing and leading + /// characters can be used to control the atom kind. See [`AtomKind`] for + /// details. + pub fn parse(raw: &str, case: CaseMatching) -> Atom { + let mut atom = raw; + let invert = match atom.as_bytes() { + [b'!', ..] => { + atom = &atom[1..]; + true + } + [b'\\', b'!', ..] => { + atom = &atom[1..]; + false + } + _ => false, + }; + + let mut kind = match atom.as_bytes() { + [b'^', ..] => { + atom = &atom[1..]; + AtomKind::Prefix + } + [b'\'', ..] => { + atom = &atom[1..]; + AtomKind::Substring + } + [b'\\', b'^' | b'\'', ..] 
=> { + atom = &atom[1..]; + AtomKind::Fuzzy + } + _ => AtomKind::Fuzzy, + }; + + let mut append_dollar = false; + match atom.as_bytes() { + [.., b'\\', b'$'] => { + append_dollar = true; + atom = &atom[..atom.len() - 2] + } + [.., b'$'] => { + kind = if kind == AtomKind::Fuzzy { + AtomKind::Postfix + } else { + AtomKind::Exact + }; + atom = &atom[..atom.len() - 1] + } + _ => (), + } + + if invert && kind == AtomKind::Fuzzy { + kind = AtomKind::Substring + } + + let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar); + pattern.negative = invert; + pattern + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`](crate::Matcher). + /// Documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + matcher.config.ignore_case = self.ignore_case; + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + if self.negative { + if pattern_score.is_some() { + return None; + } + Some(0) + } else { + pattern_score + } + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the matche + /// indices. See the [`Matcher`](crate::Matcher). Documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// this pattern atom. 
+ pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec, + ) -> Option { + matcher.config.ignore_case = self.ignore_case; + if self.negative { + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + pattern_score.is_none().then_some(0) + } else { + match self.kind { + AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices), + AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices), + AtomKind::Substring => { + matcher.substring_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Prefix => { + matcher.prefix_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Postfix => { + matcher.postfix_indices(haystack, self.needle.slice(..), indices) + } + } + } + } + + /// Returns the needle text that is passed to the matcher. All indices + /// produced by the `indices` functions produce char indices used to index + /// this text + pub fn needle_text(&self) -> Utf32Str<'_> { + self.needle.slice(..) + } + /// Convenience function to easily match on a (relatively small) list of + /// inputs. This is not recommended for building a full fuzzy matching + /// application that can match large numbers of matches (like all files in + /// a directory) as all matching is done on the current thread, effectively + /// blocking the UI. 
+ pub fn match_list>( + &self, + matcher: &mut Matcher, + items: impl IntoIterator, + ) -> Vec<(T, u16)> { + if self.needle.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } +} + +fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { + let mut saw_backslash = false; + pattern.split(move |c| { + saw_backslash = match c { + ' ' if !saw_backslash => return true, + '\\' => true, + _ => false, + }; + false + }) +} + +#[derive(Debug, Default)] +/// A fuzzy match pattern +#[non_exhaustive] +pub struct Pattern { + /// The individual pattern (words) in this pattern + pub atoms: Vec, +} + +impl Pattern { + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, ' + /// and ^ don't receive special treatment). If you want to match the entire + /// pattern as a single needle use a single [`Atom`] instead + pub fn new(case_matching: CaseMatching, kind: AtomKind, pattern: &str) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::new(pat, case_matching, kind, true); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will + /// cause different matching behaviour (see [`AtomKind`]). These can be + /// escaped with backslash. 
+ pub fn parse(case_matching: CaseMatching, pattern: &str) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::parse(pat, case_matching); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + + /// Convenience function to easily match on a (relatively small) list of + /// inputs. This is not recommended for building a full fuzzy matching + /// application that can match large numbers of matches (like all files in + /// a directory) as all matching is done on the current thread, effectively + /// blocking the UI. + pub fn match_list>( + &self, + matcher: &mut Matcher, + items: impl IntoIterator, + ) -> Vec<(T, u32)> { + if self.atoms.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`](crate::Matcher). + /// Documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.score(haystack, matcher)? as u32; + } + Some(score) + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the matche + /// indices. See the [`Matcher`](crate::Matcher). Documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. 
+ /// + /// *Note:* The indices for each pattern are calculated individually + /// and simply appended to the `indices` vector. This allows + /// + pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec, + ) -> Option { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.indices(haystack, matcher, indices)? as u32; + } + Some(score) + } + + /// Refreshes this pattern by reparsing a + pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) { + self.atoms.clear(); + let atoms = pattern_atoms(pattern).filter_map(|atom| { + let atom = Atom::parse(atom, case_matching); + if atom.needle.is_empty() { + return None; + } + Some(atom) + }); + self.atoms.extend(atoms); + } +} + +impl Clone for Pattern { + fn clone(&self) -> Self { + Self { + atoms: self.atoms.clone(), + } + } + + fn clone_from(&mut self, source: &Self) { + self.atoms.clone_from(&source.atoms); + } +} diff --git a/matcher/src/pattern/tests.rs b/matcher/src/pattern/tests.rs new file mode 100644 index 0000000..8fcd0a9 --- /dev/null +++ b/matcher/src/pattern/tests.rs @@ -0,0 +1,114 @@ +use crate::pattern::{Atom, AtomKind, CaseMatching}; + +#[test] +fn negative() { + let pat = Atom::parse("!foo", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!foo$", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo$", CaseMatching::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn pattern_kinds() { + let pat = Atom::parse("foo", 
CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Fuzzy); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("'foo", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("foo$", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo$", CaseMatching::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn case_matching() { + let pat = Atom::parse("foo", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Äxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "äxx"); + let pat = Atom::parse("Äxx", CaseMatching::Respect); + assert!(!pat.ignore_case); + let pat = Atom::parse("Axx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = Atom::parse("你xx", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("你xx", CaseMatching::Ignore); + 
assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Ⲽxx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "ⲽxx"); +} + +#[test] +fn escape() { + let pat = Atom::parse("foo\\ bar", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo bar"); + let pat = Atom::parse("\\!foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "!foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\'foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "'foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\^foo", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Prefix); + let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "!^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Substring); +} diff --git a/matcher/src/score.rs b/matcher/src/score.rs index eba054b..c934a8e 100644 --- a/matcher/src/score.rs +++ b/matcher/src/score.rs @@ -1,7 +1,7 @@ use std::cmp::max; use crate::chars::{Char, CharClass}; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; pub(crate) const SCORE_MATCH: u16 = 16; pub(crate) const PENALTY_GAP_START: u16 = 3; @@ 
-47,7 +47,7 @@ pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENS // still respected. pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; -impl MatcherConfig { +impl Config { #[inline] pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { if class > CharClass::Delimiter { diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index 691230c..058b497 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -4,7 +4,7 @@ use crate::score::{ MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, }; use crate::utf32_str::Utf32Str; -use crate::{Matcher, MatcherConfig}; +use crate::{Config, Matcher}; use Algorithm::*; @@ -26,11 +26,11 @@ fn assert_matches( prefer_prefix: bool, cases: &[(&str, &str, &[u32], u16)], ) { - let mut config = MatcherConfig { + let mut config = Config { normalize, ignore_case: !case_sensitive, prefer_prefix, - ..MatcherConfig::DEFAULT + ..Config::DEFAULT }; if path { config.set_match_paths(); @@ -89,10 +89,10 @@ pub fn assert_not_matches( path: bool, cases: &[(&str, &str)], ) { - let mut config = MatcherConfig { + let mut config = Config { normalize, ignore_case: !case_sensitive, - ..MatcherConfig::DEFAULT + ..Config::DEFAULT }; if path { config.set_match_paths(); @@ -134,8 +134,8 @@ pub fn assert_not_matches( } } -const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; -const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; +const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white; +const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter; #[test] fn test_fuzzy() { diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 9602b27..1821b46 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::mem::take; use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; @@ -55,6 +54,7 @@ impl<'a> 
Utf32Str<'a> { } } + /// Returns the number of characters in this string. #[inline] pub fn len(self) -> usize { match self { @@ -62,6 +62,8 @@ impl<'a> Utf32Str<'a> { Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), } } + + /// Returns whether this string is empty. #[inline] pub fn is_empty(self) -> bool { match self { @@ -70,6 +72,8 @@ impl<'a> Utf32Str<'a> { } } + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. #[inline] pub fn slice(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { @@ -90,7 +94,7 @@ impl<'a> Utf32Str<'a> { /// Returns the number of leading whitespaces in this string #[inline] - pub fn leading_white_space(self) -> usize { + pub(crate) fn leading_white_space(self) -> usize { match self { Utf32Str::Ascii(bytes) => bytes .iter() @@ -105,7 +109,7 @@ impl<'a> Utf32Str<'a> { /// Returns the number of leading whitespaces in this string #[inline] - pub fn trailing_white_space(self) -> usize { + pub(crate) fn trailing_white_space(self) -> usize { match self { Utf32Str::Ascii(bytes) => bytes .iter() @@ -121,7 +125,7 @@ impl<'a> Utf32Str<'a> { } /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher + /// those are the indices returned by the matcher. #[inline] pub fn slice_u32(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { @@ -139,29 +143,34 @@ impl<'a> Utf32Str<'a> { Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), } } + + /// Returns whether this string only contains ascii text. pub fn is_ascii(self) -> bool { matches!(self, Utf32Str::Ascii(_)) } - pub fn get(self, idx: u32) -> char { + /// Returns the `n`th character in this string. 
+ pub fn get(self, n: u32) -> char { match self { - Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, - Utf32Str::Unicode(codepoints) => codepoints[idx as usize], + Utf32Str::Ascii(bytes) => bytes[n as usize] as char, + Utf32Str::Unicode(codepoints) => codepoints[n as usize], } } - pub fn last(self) -> char { + pub(crate) fn last(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], } } - pub fn first(self) -> char { + + pub(crate) fn first(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[0] as char, Utf32Str::Unicode(codepoints) => codepoints[0], } } + /// Returns an iterator over the characters in this string pub fn chars(self) -> Chars<'a> { match self { Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), @@ -169,6 +178,7 @@ impl<'a> Utf32Str<'a> { } } } + impl fmt::Debug for Utf32Str<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "\"")?; @@ -215,6 +225,7 @@ impl DoubleEndedIterator for Chars<'_> { } #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +/// An owned version of [`Utf32Str`]. pub enum Utf32String { /// A string represented as ASCII encoded bytes. /// Correctness invariant: must only contain valid ASCII (<=127) @@ -230,6 +241,7 @@ impl Default for Utf32String { } impl Utf32String { + /// Returns the number of characters in this string. #[inline] pub fn len(&self) -> usize { match self { @@ -237,6 +249,8 @@ impl Utf32String { Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), } } + + /// Returns whether this string is empty. #[inline] pub fn is_empty(&self) -> bool { match self { @@ -245,18 +259,18 @@ impl Utf32String { } } - /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. 
#[inline] - pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { let start = match range.start_bound() { - Bound::Included(&start) => start as usize, - Bound::Excluded(&start) => start as usize + 1, + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, Bound::Unbounded => 0, }; let end = match range.end_bound() { - Bound::Included(&end) => end as usize + 1, - Bound::Excluded(&end) => end as usize, + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, Bound::Unbounded => self.len(), }; match self { @@ -265,65 +279,28 @@ impl Utf32String { } } + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher. #[inline] - pub fn is_ascii(&self) -> bool { - matches!(self, Utf32String::Ascii(_)) - } - - #[inline] - pub fn get(&self, idx: u32) -> char { - match self { - Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, - Utf32String::Unicode(codepoints) => codepoints[idx as usize], - } - } - - #[inline] - pub fn last(&self) -> char { - match self { - Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, - Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], - } - } - - #[inline] - pub fn chars(&self) -> Chars<'_> { - match self { - Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), - Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), - } - } - - #[inline] - pub fn push_str(&mut self, text: &str) { - let mut codeboints = match take(self) { - Utf32String::Ascii(bytes) if text.is_ascii() => { - let mut bytes = bytes.into_string(); - bytes.push_str(text); - *self = Self::Ascii(bytes.into_boxed_str()); - return; - } - Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), - Utf32String::Unicode(codepoints) => Vec::from(codepoints), + pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { + let start = match 
range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, }; - codeboints.extend(chars::graphemes(text)); - *self = Utf32String::Unicode(codeboints.into_boxed_slice()); - } - - #[inline] - pub fn push(&mut self, c: char) { - let mut codeboints = match take(self) { - Utf32String::Ascii(bytes) if c.is_ascii() => { - let mut bytes = bytes.into_string(); - bytes.push(c); - *self = Self::Ascii(bytes.into_boxed_str()); - return; - } - Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(), - Utf32String::Unicode(codepoints) => Vec::from(codepoints), + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len() as u32, }; - codeboints.push(c); - *self = Utf32String::Unicode(codeboints.into_boxed_slice()); + match self { + Utf32String::Ascii(bytes) => { + Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize]) + } + Utf32String::Unicode(codepoints) => { + Utf32Str::Unicode(&codepoints[start as usize..end as usize]) + } + } } } @@ -367,21 +344,12 @@ impl<'a> From> for Utf32String { impl fmt::Debug for Utf32String { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "\"")?; - for c in self.chars() { - for c in c.escape_debug() { - write!(f, "{c}")? - } - } - write!(f, "\"") + write!(f, "{:?}", self.slice(..)) } } impl fmt::Display for Utf32String { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for c in self.chars() { - write!(f, "{c}")? 
- } - Ok(()) + write!(f, "{}", self.slice(..)) } } diff --git a/src/lib.rs b/src/lib.rs index cb87352..61f18db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,3 @@ -use std::cmp::Reverse; use std::ops::{Bound, RangeBounds}; use std::sync::atomic::{self, AtomicBool, Ordering}; use std::sync::Arc; @@ -7,13 +6,13 @@ use std::time::Duration; use parking_lot::Mutex; use rayon::ThreadPool; -pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind}; +use crate::pattern::MultiPattern; use crate::worker::Worker; -pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str, Utf32String}; +pub use nucleo_matcher::{chars, Config, Matcher, Utf32Str, Utf32String}; mod boxcar; mod par_sort; -mod pattern; +pub mod pattern; mod worker; pub struct Item<'a, T> { @@ -195,10 +194,9 @@ pub struct Nucleo { impl Nucleo { pub fn new( - config: MatcherConfig, + config: Config, notify: Arc<(dyn Fn() + Sync + Send)>, num_threads: Option, - case_matching: CaseMatching, columns: u32, ) -> Self { let (pool, worker) = Worker::new(num_threads, config, notify.clone(), columns); @@ -207,10 +205,10 @@ impl Nucleo { should_notify: worker.should_notify.clone(), items: worker.items.clone(), pool, - pattern: MultiPattern::new(&config, case_matching, columns as usize), + pattern: MultiPattern::new(columns as usize), snapshot: Snapshot { matches: Vec::with_capacity(2 * 1024), - pattern: MultiPattern::new(&config, case_matching, columns as usize), + pattern: MultiPattern::new(columns as usize), item_count: 0, items: worker.items.clone(), }, @@ -252,7 +250,7 @@ impl Nucleo { } } - pub fn update_config(&mut self, config: MatcherConfig) { + pub fn update_config(&mut self, config: Config) { self.worker.lock().update_config(config) } @@ -321,31 +319,3 @@ impl Drop for Nucleo { } } } - -/// convenience function to easily fuzzy match -/// on a (relatively small) list of inputs. 
This is not recommended for building a full tui -/// application that can match large numbers of matches as all matching is done on the current -/// thread, effectively blocking the UI -pub fn fuzzy_match>( - matcher: &mut Matcher, - pattern: &str, - items: impl IntoIterator, - case_matching: CaseMatching, -) -> Vec<(T, u32)> { - let mut pattern_ = Pattern::new(&matcher.config, case_matching); - pattern_.set_literal(pattern, PatternKind::Fuzzy, false); - if pattern_.is_empty() { - return items.into_iter().map(|item| (item, 0)).collect(); - } - let mut buf = Vec::new(); - let mut items: Vec<_> = items - .into_iter() - .filter_map(|item| { - pattern_ - .score(Utf32Str::new(item.as_ref(), &mut buf), matcher) - .map(|score| (item, score)) - }) - .collect(); - items.sort_by_key(|(_, score)| Reverse(*score)); - items -} diff --git a/src/pattern.rs b/src/pattern.rs index f9939c6..07620e9 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,188 +1,12 @@ -use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; +pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern}; +use nucleo_matcher::{Matcher, Utf32String}; #[cfg(test)] mod tests; -use crate::Utf32String; - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -#[non_exhaustive] -pub enum CaseMatching { - Ignore, - Smart, - Respect, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -#[non_exhaustive] -pub enum PatternKind { - Exact, - Fuzzy, - Substring, - Prefix, - Postfix, -} - -#[derive(Debug, PartialEq, Eq, Clone)] -struct PatternAtom { - kind: PatternKind, - needle: Utf32String, - invert: bool, - ignore_case: bool, -} -impl PatternAtom { - fn literal( - needle: &str, - normalize: bool, - case: CaseMatching, - kind: PatternKind, - escape_whitespace: bool, - ) -> PatternAtom { - let mut ignore_case; - let needle = if needle.is_ascii() { - let mut needle = if escape_whitespace { - if let Some((start, rem)) = needle.split_once("\\ ") { - let mut needle = start.to_owned(); - for rem in rem.split("\\ 
") { - needle.push(' '); - needle.push_str(rem); - } - needle - } else { - needle.to_owned() - } - } else { - needle.to_owned() - }; - - match case { - CaseMatching::Ignore => { - ignore_case = true; - needle.make_ascii_lowercase() - } - CaseMatching::Smart => { - ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) - } - CaseMatching::Respect => ignore_case = false, - } - - Utf32String::Ascii(needle.into_boxed_str()) - } else { - let mut needle_ = Vec::with_capacity(needle.len()); - ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); - if escape_whitespace { - let mut saw_backslash = false; - for mut c in chars::graphemes(needle) { - if saw_backslash { - if c == ' ' { - needle_.push(' '); - saw_backslash = false; - continue; - } else { - needle_.push('\\'); - } - } - saw_backslash = c == '\\'; - if normalize { - c = chars::normalize(c); - } - match case { - CaseMatching::Ignore => c = chars::to_lower_case(c), - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c) - } - CaseMatching::Respect => (), - } - needle_.push(c); - } - } else { - let chars = chars::graphemes(needle).map(|mut c| { - if normalize { - c = chars::normalize(c); - } - match case { - CaseMatching::Ignore => c = chars::to_lower_case(c), - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c); - } - CaseMatching::Respect => (), - } - c - }); - needle_.extend(chars); - }; - Utf32String::Unicode(needle_.into_boxed_slice()) - }; - PatternAtom { - kind, - needle, - invert: false, - ignore_case, - } - } - - fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom { - let mut atom = raw; - let invert = match atom.as_bytes() { - [b'!', ..] => { - atom = &atom[1..]; - true - } - [b'\\', b'!', ..] => { - atom = &atom[1..]; - false - } - _ => false, - }; - - let mut kind = match atom.as_bytes() { - [b'^', ..] => { - atom = &atom[1..]; - PatternKind::Prefix - } - [b'\'', ..] 
=> { - atom = &atom[1..]; - PatternKind::Substring - } - [b'\\', b'^' | b'\'', ..] => { - atom = &atom[1..]; - PatternKind::Fuzzy - } - _ => PatternKind::Fuzzy, - }; - - let mut append_dollar = false; - match atom.as_bytes() { - [.., b'\\', b'$'] => { - append_dollar = true; - atom = &atom[..atom.len() - 2] - } - [.., b'$'] => { - kind = if kind == PatternKind::Fuzzy { - PatternKind::Postfix - } else { - PatternKind::Exact - }; - atom = &atom[..atom.len() - 1] - } - _ => (), - } - - if invert && kind == PatternKind::Fuzzy { - kind = PatternKind::Substring - } - - let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true); - pattern.invert = invert; - if append_dollar { - pattern.needle.push('$'); - } - pattern - } -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Default)] pub enum Status { + #[default] Unchanged, Update, Rescore, @@ -190,7 +14,7 @@ pub enum Status { #[derive(Debug)] pub struct MultiPattern { - pub cols: Vec, + cols: Vec<(Pattern, Status)>, } impl Clone for MultiPattern { @@ -206,214 +30,64 @@ impl Clone for MultiPattern { } impl MultiPattern { - pub fn new( - matcher_config: &MatcherConfig, - case_matching: CaseMatching, - columns: usize, - ) -> MultiPattern { - MultiPattern { - cols: vec![Pattern::new(matcher_config, case_matching); columns], + /// Creates a multi pattern with `columns` empty column patterns. + pub fn new(columns: usize) -> Self { + Self { + cols: vec![Default::default(); columns], } } + /// Reparses a column. By specifying `append` the caller promises that text passed + /// to the previous `reparse` invocation is a prefix of `new_text`. This enables + /// additional optimizations but can lead to missing matches if an incorrect value + /// is passed. 
+ pub fn reparse( + &mut self, + column: usize, + new_text: &str, + case_matching: CaseMatching, + append: bool, + ) { + let old_status = self.cols[column].1; + if append + && old_status != Status::Rescore + && self.cols[column] + .0 + .atoms + .last() + .map_or(true, |last| !last.negative) + { + self.cols[column].1 = Status::Update; + } else { + self.cols[column].1 = Status::Rescore; + } + self.cols[column].0.reparse(new_text, case_matching); + } + pub(crate) fn status(&self) -> Status { self.cols .iter() - .map(|col| col.status) + .map(|&(_, status)| status) .max() .unwrap_or(Status::Unchanged) } pub(crate) fn reset_status(&mut self) { - for col in &mut self.cols { - col.status = Status::Unchanged + for (_, status) in &mut self.cols { + *status = Status::Unchanged } } pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { // TODO: wheight columns? let mut score = 0; - for (pattern, haystack) in self.cols.iter().zip(haystack) { + for ((pattern, _), haystack) in self.cols.iter().zip(haystack) { score += pattern.score(haystack.slice(..), matcher)? 
} Some(score) } -} - -#[derive(Debug)] -pub struct Pattern { - atoms: Vec, - case_matching: CaseMatching, - normalize: bool, - status: Status, -} - -impl Pattern { - pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern { - Pattern { - atoms: Vec::new(), - case_matching, - normalize: matcher_config.normalize, - status: Status::Unchanged, - } - } - pub fn new_fuzzy_literal( - matcher_config: &MatcherConfig, - case_matching: CaseMatching, - pattern: &str, - ) -> Pattern { - let mut res = Pattern { - atoms: Vec::new(), - case_matching, - normalize: matcher_config.normalize, - status: Status::Unchanged, - }; - res.set_literal(pattern, PatternKind::Fuzzy, false); - res - } - - pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - matcher.config.ignore_case = pattern.ignore_case; - let pattern_score = match pattern.kind { - PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), - PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), - PatternKind::Substring => { - matcher.substring_match(haystack, pattern.needle.slice(..)) - } - PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), - PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)), - }; - if pattern.invert { - if pattern_score.is_some() { - return None; - } - } else { - score += pattern_score? 
as u32 - } - } - Some(score) - } - - pub fn indices( - &self, - haystack: Utf32Str<'_>, - matcher: &mut Matcher, - indices: &mut Vec, - ) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - matcher.config.ignore_case = pattern.ignore_case; - if pattern.invert { - let pattern_score = match pattern.kind { - PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), - PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), - PatternKind::Substring => { - matcher.substring_match(haystack, pattern.needle.slice(..)) - } - PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), - PatternKind::Postfix => { - matcher.postfix_match(haystack, pattern.needle.slice(..)) - } - }; - if pattern_score.is_some() { - return None; - } - continue; - } - let pattern_score = match pattern.kind { - PatternKind::Exact => { - matcher.exact_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Fuzzy => { - matcher.fuzzy_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Substring => { - matcher.substring_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Prefix => { - matcher.prefix_indices(haystack, pattern.needle.slice(..), indices) - } - PatternKind::Postfix => { - matcher.postfix_indices(haystack, pattern.needle.slice(..), indices) - } - }; - score += pattern_score? 
as u32 - } - Some(score) - } - - pub fn parse_from(&mut self, pattern: &str, append: bool) { - let invert = self.atoms.last().map_or(false, |pat| pat.invert); - self.atoms.clear(); - let atoms = pattern_atoms(pattern).filter_map(|atom| { - let atom = PatternAtom::parse(atom, self.normalize, self.case_matching); - if atom.needle.is_empty() { - return None; - } - Some(atom) - }); - self.atoms.extend(atoms); - - self.status = if append && !invert && self.status != Status::Rescore { - Status::Update - } else { - Status::Rescore - }; - } - - pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) { - self.atoms.clear(); - let pattern = - PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false); - if !pattern.needle.is_empty() { - self.atoms.push(pattern); - } - self.status = if append && self.status != Status::Rescore { - Status::Update - } else { - Status::Rescore - }; - } pub fn is_empty(&self) -> bool { - self.atoms.is_empty() + self.cols.iter().all(|(pat, _)| pat.atoms.is_empty()) } } - -impl Clone for Pattern { - fn clone(&self) -> Self { - Self { - atoms: self.atoms.clone(), - case_matching: self.case_matching, - normalize: self.normalize, - status: self.status, - } - } - - fn clone_from(&mut self, source: &Self) { - self.atoms.clone_from(&source.atoms); - self.case_matching = source.case_matching; - self.normalize = source.normalize; - self.status = source.status; - } -} - -fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { - let mut saw_backslash = false; - pattern.split(move |c| { - saw_backslash = match c { - ' ' if !saw_backslash => return true, - '\\' => true, - _ => false, - }; - false - }) -} diff --git a/src/pattern/tests.rs b/src/pattern/tests.rs index 4eaeb40..3854e15 100644 --- a/src/pattern/tests.rs +++ b/src/pattern/tests.rs @@ -1,145 +1,14 @@ -use crate::pattern::{PatternAtom, Status}; -use crate::{CaseMatching, Pattern, PatternKind}; +use nucleo_matcher::pattern::CaseMatching; -fn parse_atom(pat: 
&str) -> PatternAtom { - parse_atom_with(pat, CaseMatching::Smart) -} - -fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom { - let mut pat = parse_with(pat, case_matching, false); - assert_eq!(pat.atoms.len(), 1); - pat.atoms.remove(0) -} - -fn parse_with(pat: &str, case_matching: CaseMatching, append: bool) -> Pattern { - let mut res = Pattern::new(&nucleo_matcher::MatcherConfig::DEFAULT, case_matching); - res.parse_from(pat, append); - res -} - -#[test] -fn negative() { - let pat = parse_atom("!foo"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!^foo"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!foo$"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("!^foo$"); - assert!(pat.invert); - assert_eq!(pat.kind, PatternKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn pattern_kinds() { - let pat = parse_atom("foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Fuzzy); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("'foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("^foo"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("foo$"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom("^foo$"); - assert!(!pat.invert); - assert_eq!(pat.kind, PatternKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn case_matching() { - let pat = parse_atom_with("foo", CaseMatching::Smart); - assert!(pat.ignore_case); - 
assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom_with("Foo", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Foo", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = parse_atom_with("Foo", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Foo", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = parse_atom_with("Äxx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Respect); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("Äxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = parse_atom_with("你xx", CaseMatching::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = parse_atom_with("你xx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = parse_atom_with("Ⲽxx", CaseMatching::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Ⲽxx"); - let pat = parse_atom_with("Ⲽxx", CaseMatching::Ignore); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "ⲽxx"); -} - -#[test] -fn escape() { - let pat = parse_atom("foo\\ bar"); - assert_eq!(pat.needle.to_string(), "foo bar"); - let pat = parse_atom("\\!foo"); - assert_eq!(pat.needle.to_string(), "!foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("\\'foo"); - assert_eq!(pat.needle.to_string(), "'foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = 
parse_atom("\\^foo"); - assert_eq!(pat.needle.to_string(), "^foo"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("foo\\$"); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("^foo\\$"); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, PatternKind::Prefix); - let pat = parse_atom("\\^foo\\$"); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("\\!^foo\\$"); - assert_eq!(pat.needle.to_string(), "!^foo$"); - assert_eq!(pat.kind, PatternKind::Fuzzy); - let pat = parse_atom("!\\^foo\\$"); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, PatternKind::Substring); -} +use crate::pattern::{MultiPattern, Status}; #[test] fn append() { - let mut pat = parse_with("!", CaseMatching::Smart, true); - assert_eq!(pat.status, Status::Update); - pat.parse_from("!f", true); - assert_eq!(pat.status, Status::Update); - pat.parse_from("!fo", true); - assert_eq!(pat.status, Status::Rescore); + let mut pat = MultiPattern::new(1); + pat.reparse(0, "!", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Update); + pat.reparse(0, "!f", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Update); + pat.reparse(0, "!fo", CaseMatching::Smart, true); + assert_eq!(pat.status(), Status::Rescore); } diff --git a/src/worker.rs b/src/worker.rs index e343be1..478dce7 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -3,7 +3,7 @@ use std::mem::take; use std::sync::atomic::{self, AtomicBool, AtomicU32}; use std::sync::Arc; -use nucleo_matcher::MatcherConfig; +use nucleo_matcher::Config; use parking_lot::Mutex; use rayon::{prelude::*, ThreadPool}; @@ -42,15 +42,15 @@ impl Worker { pub(crate) fn item_count(&self) -> u32 { self.last_snapshot - self.in_flight.len() as u32 } - pub(crate) fn update_config(&mut self, config: MatcherConfig) { + pub(crate) fn update_config(&mut self, config: Config) { 
for matcher in self.matchers.0.iter_mut() { - matcher.get_mut().config = config; + matcher.get_mut().config = config.clone(); } } pub(crate) fn new( worker_threads: Option, - config: MatcherConfig, + config: Config, notify: Arc<(dyn Fn() + Sync + Send)>, cols: u32, ) -> (ThreadPool, Self) { @@ -62,7 +62,7 @@ impl Worker { .build() .expect("creating threadpool failed"); let matchers = (0..worker_threads) - .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config))) + .map(|_| UnsafeCell::new(nucleo_matcher::Matcher::new(config.clone()))) .collect(); let worker = Worker { running: false, @@ -70,7 +70,7 @@ impl Worker { last_snapshot: 0, matches: Vec::new(), // just a placeholder - pattern: MultiPattern::new(&config, crate::CaseMatching::Ignore, 0), + pattern: MultiPattern::new(cols as usize), canceled: Arc::new(AtomicBool::new(false)), should_notify: Arc::new(AtomicBool::new(false)), was_canceled: false, @@ -162,7 +162,7 @@ impl Worker { } // TODO: be smarter around reusing past results for rescoring - if self.pattern.cols.iter().all(|pat| pat.is_empty()) { + if self.pattern.is_empty() { self.reset_matches(); self.process_new_items_trivial(); if self.should_notify.load(atomic::Ordering::Relaxed) { diff --git a/typos.toml b/typos.toml index 900e3df..14fc504 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,3 @@ default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"] [files] -extend-exclude = ["matcher/src/tests.rs", "*.html"] +extend-exclude = ["matcher/src/tests.rs","src/pattern/tests.rs", "*.html"]