/*!
`nucleo_matcher` is a low level crate that contains the matcher implementation
used by the high level `nucleo` crate.
The matcher is highly optimized and can significantly outperform `fzf` and
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
a slightly less convenient API. Be sure to carefully read the documentation of
the [`Matcher`] to avoid unexpected behaviour.
*/

// sadly ranges don't optimize well
#![allow(clippy::manual_range_contains)]
#![warn(missing_docs)]

pub mod chars;
mod config;
#[cfg(test)]
mod debug;
mod exact;
mod fuzzy_greedy;
mod fuzzy_optimal;
mod matrix;
pub mod pattern;
mod prefilter;
mod score;
mod utf32_str;

#[cfg(test)]
mod tests;

pub use crate::config::Config;
pub use crate::utf32_str::{Utf32Str, Utf32String};

use crate::chars::{AsciiChar, Char};
use crate::matrix::MatrixSlab;

/// A matcher engine that can execute (fuzzy) matches.
///
/// A matcher contains **heap allocated** scratch memory that is reused during
/// matching. This scratch memory allows the matcher to guarantee that it will
/// **never allocate** during matching (with the exception of pushing to the
/// `indices` vector if there isn't enough capacity). However this scratch
/// memory is fairly large (around 135KB) so creating a matcher is expensive and
/// matchers should be reused.
///
/// All `.._match` functions will not compute the indices of the matched chars
/// and are therefore significantly faster. These should be used to prefilter
/// and sort all matches. All `.._indices` functions will compute the indices of
/// the matched chars. These should be used when rendering the best N matches.
/// Note that the `indices` argument is **never cleared**. This allows running
/// multiple different matches on the same haystack and merging the indices by
/// sorting and deduplicating the vector.
///
/// The `needle` argument for each function must always be normalized by the caller
/// (unicode normalization and case folding if a case insensitive match is produced).
/// Otherwise, the matcher may fail to produce a match. The [`pattern`] module
/// provides utilities to preprocess needles.
///
/// Additionally it's recommended to perform separate matches for each word in
/// the needle. Consider the following example: If `foo bar` is used as the
/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
/// `foo test baaar` should receive a lower score/rank lower: `baaar` contains a
/// 2 character gap which will receive a penalty and therefore the user will
/// likely expect it to rank lower. However, if `foo bar` is matched as a single
/// query, `hello-world` and `test` are both considered gaps too. As
/// `hello-world` is a much longer gap than `test`, the extra penalty for `baaar`
/// is outweighed. If both words are matched individually the interspersed words
/// do not receive a penalty and `foo hello-world bar` ranks higher.
///
/// In general nucleo is a **substring matching tool** with no penalty assigned
/// to matches that start later within the same pattern (which enables the
/// use case shown above). This may be undesirable in one very particular use case:
/// automatic suggestions for commands (like a shell). In this case the
/// assumption is that the user is actually typing the full haystack. In other words:
/// The matcher should prefer a prefix match. To accommodate that use case the
/// [`prefer_prefix`](Config::prefer_prefix) option can be set
/// to true. Note that the penalty given is quite small (and capped to a maximum)
/// to avoid overwriting the normal scoring heuristic.
///
///
/// Matching is limited to 2^32-1 codepoints; if the haystack is longer than
/// that the matcher *will panic*. The caller must decide whether it wants to
/// filter out long haystacks or truncate them.
pub struct Matcher { #[allow(missing_docs)] pub config: Config, slab: MatrixSlab, } // this is just here for convenience not sure if we should implement this impl Clone for Matcher { fn clone(&self) -> Self { Matcher { config: self.config.clone(), slab: MatrixSlab::new(), } } } impl std::fmt::Debug for Matcher { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Matcher") .field("config", &self.config) .finish_non_exhaustive() } } impl Default for Matcher { fn default() -> Self { Matcher { config: Config::DEFAULT, slab: MatrixSlab::new(), } } } impl Matcher { /// Creates a new matcher instance, note that this will eagerly allocate /// a fairly large chunk of heap memory (135KB currently but subject to /// change) so matchers should be reused if used in a loop. pub fn new(config: Config) -> Self { Self { config, slab: MatrixSlab::new(), } } /// Find the fuzzy match with the highest score in the `haystack`. /// /// This functions has `O(mn)` time complexity for short inputs. To /// avoid slowdowns it automatically falls back to [greedy matching] /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { assert!(haystack.len() <= u32::MAX as usize); self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) } /// Find the fuzzy match with the higehest score in the `haystack` and /// compute its indices. /// /// This functions has `O(mn)` time complexity for short inputs. To /// avoid slowdowns it automatically falls back to [greedy matching] /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn fuzzy_indices( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { assert!(haystack.len() <= u32::MAX as usize); self.fuzzy_matcher_impl::(haystack, needle, indices) } fn fuzzy_matcher_impl( &mut self, haystack_: Utf32Str<'_>, needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle_.len() > haystack_.len() { return None; } if needle_.is_empty() { return Some(0); } if needle_.len() == haystack_.len() { return self.exact_match_impl::( haystack_, needle_, 0, haystack_.len(), indices, ); } assert!( haystack_.len() <= u32::MAX as usize, "fuzzy matching is only support for up to 2^32-1 codepoints" ); match (haystack_, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { if let &[needle] = needle { return self.substring_match_1_ascii::(haystack, needle, indices); } let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; if needle_.len() == end - start { return Some(self.calculate_score::( AsciiChar::cast(haystack), AsciiChar::cast(needle), start, greedy_end, indices, )); } self.fuzzy_match_optimal::( AsciiChar::cast(haystack), AsciiChar::cast(needle), start, greedy_end, end, indices, ) } (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { // a purely ascii haystack can never be transformed to match // a needle that contains non-ascii chars since we don't allow gaps None } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { if let &[needle] = needle { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; let res = self.substring_match_1_non_ascii::( haystack, needle as char, start, indices, ); return Some(res); } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; if needle_.len() == end - start { return self .exact_match_impl::(haystack_, needle_, start, end, indices); } self.fuzzy_match_optimal::( haystack, AsciiChar::cast(needle), start, start + 1, end, indices, ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { if let 
&[needle] = needle { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; let res = self .substring_match_1_non_ascii::(haystack, needle, start, indices); return Some(res); } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; if needle_.len() == end - start { return self .exact_match_impl::(haystack_, needle_, start, end, indices); } self.fuzzy_match_optimal::( haystack, needle, start, start + 1, end, indices, ) } } } /// Greedly find a fuzzy match in the `haystack`. /// /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should /// be preferred. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn fuzzy_match_greedy( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, ) -> Option { assert!(haystack.len() <= u32::MAX as usize); self.fuzzy_match_greedy_impl::(haystack, needle, &mut Vec::new()) } /// Greedly find a fuzzy match in the `haystack` and compute its indices. /// /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should /// be preferred. /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn fuzzy_indices_greedy( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { assert!(haystack.len() <= u32::MAX as usize); self.fuzzy_match_greedy_impl::(haystack, needle, indices) } fn fuzzy_match_greedy_impl( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle_.len() > haystack.len() { return None; } if needle_.is_empty() { return Some(0); } if needle_.len() == haystack.len() { return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } assert!( haystack.len() <= u32::MAX as usize, "matching is only support for up to 2^32-1 codepoints" ); match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; if needle_.len() == greedy_end - start { return Some(self.calculate_score::( AsciiChar::cast(haystack), AsciiChar::cast(needle), start, greedy_end, indices, )); } self.fuzzy_match_greedy_::( AsciiChar::cast(haystack), AsciiChar::cast(needle), start, greedy_end, indices, ) } (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { // a purely ascii haystack can never be transformed to match // a needle that contains non-ascii chars since we don't allow gaps None } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; self.fuzzy_match_greedy_::( haystack, AsciiChar::cast(needle), start, start + 1, indices, ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; self.fuzzy_match_greedy_::( haystack, needle, start, start + 1, indices, ) } } } /// Finds the substring match with the highest score in the `haystack`. /// /// This functions has `O(nm)` time complexity. However many cases can /// be significantly accelerated using prefilters so it's usually fast /// in practice. 
/// /// See the [matcher documentation](crate::Matcher) for more details. pub fn substring_match( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, ) -> Option { self.substring_match_impl::(haystack, needle_, &mut Vec::new()) } /// Finds the substring match with the highest score in the `haystack` and /// compute its indices. /// /// This functions has `O(nm)` time complexity. However many cases can /// be significantly accelerated using prefilters so it's usually fast /// in practice. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn substring_indices( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { self.substring_match_impl::(haystack, needle_, indices) } fn substring_match_impl( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle_.len() > haystack.len() { return None; } if needle_.is_empty() { return Some(0); } if needle_.len() == haystack.len() { return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } assert!( haystack.len() <= u32::MAX as usize, "matching is only support for up to 2^32-1 codepoints" ); match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { if let &[needle] = needle { return self.substring_match_1_ascii::(haystack, needle, indices); } self.substring_match_ascii::(haystack, needle, indices) } (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { // a purely ascii haystack can never be transformed to match // a needle that contains non-ascii chars since we don't allow gaps None } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { if let &[needle] = needle { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; let res = self.substring_match_1_non_ascii::( haystack, needle as char, start, indices, ); return Some(res); } let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; self.substring_match_non_ascii::( haystack, 
AsciiChar::cast(needle), start, indices, ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { if let &[needle] = needle { let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; let res = self .substring_match_1_non_ascii::(haystack, needle, start, indices); return Some(res); } let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; self.fuzzy_match_optimal::( haystack, needle, start, start + 1, end, indices, ) } } } /// Checks whether needle and haystack match exactly. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { if needle.is_empty() { return Some(0); } let mut leading_space = 0; let mut trailing_space = 0; if !needle.first().is_whitespace() { leading_space = haystack.leading_white_space() } if !needle.last().is_whitespace() { trailing_space = haystack.trailing_white_space() } // avoid wraparound in size check if trailing_space == haystack.len() { return None; } self.exact_match_impl::( haystack, needle, leading_space, haystack.len() - trailing_space, &mut Vec::new(), ) } /// Checks whether needle and haystack match exactly and compute the matches indices. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn exact_indices( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle.is_empty() { return Some(0); } let mut leading_space = 0; let mut trailing_space = 0; if !needle.first().is_whitespace() { leading_space = haystack.leading_white_space() } if !needle.last().is_whitespace() { trailing_space = haystack.trailing_white_space() } // avoid wraparound in size check if trailing_space == haystack.len() { return None; } self.exact_match_impl::( haystack, needle, leading_space, haystack.len() - trailing_space, indices, ) } /// Checks whether needle is a prefix of the haystack. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { if needle.is_empty() { return Some(0); } let mut leading_space = 0; if !needle.first().is_whitespace() { leading_space = haystack.leading_white_space() } if haystack.len() - leading_space < needle.len() { None } else { self.exact_match_impl::( haystack, needle, leading_space, needle.len() + leading_space, &mut Vec::new(), ) } } /// Checks whether needle is a prefix of the haystack and compute the matches indices. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn prefix_indices( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle.is_empty() { return Some(0); } let mut leading_space = 0; if !needle.first().is_whitespace() { leading_space = haystack.leading_white_space() } if haystack.len() - leading_space < needle.len() { None } else { self.exact_match_impl::( haystack, needle, leading_space, needle.len() + leading_space, indices, ) } } /// Checks whether needle is a postfix of the haystack. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { if needle.is_empty() { return Some(0); } let mut trailing_spaces = 0; if !needle.last().is_whitespace() { trailing_spaces = haystack.trailing_white_space() } if haystack.len() - trailing_spaces < needle.len() { None } else { self.exact_match_impl::( haystack, needle, haystack.len() - needle.len() - trailing_spaces, haystack.len() - trailing_spaces, &mut Vec::new(), ) } } /// Checks whether needle is a postfix of the haystack and compute the matches indices. /// /// This functions has `O(n)` time complexity. /// /// See the [matcher documentation](crate::Matcher) for more details. pub fn postfix_indices( &mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { if needle.is_empty() { return Some(0); } let mut trailing_spaces = 0; if !needle.last().is_whitespace() { trailing_spaces = haystack.trailing_white_space() } if haystack.len() - trailing_spaces < needle.len() { None } else { self.exact_match_impl::( haystack, needle, haystack.len() - needle.len() - trailing_spaces, haystack.len() - trailing_spaces, indices, ) } } fn exact_match_impl( &mut self, haystack: Utf32Str<'_>, needle_: Utf32Str<'_>, start: usize, end: usize, indices: &mut Vec, ) -> Option { if needle_.len() != end - start { return None; } assert!( haystack.len() <= u32::MAX as usize, "matching is only support for up to 2^32-1 codepoints" ); let score = match (haystack, needle_) { (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { let matched = if self.config.ignore_case { AsciiChar::cast(haystack)[start..end] .iter() .map(|c| c.normalize(&self.config)) .eq(AsciiChar::cast(needle) .iter() .map(|c| c.normalize(&self.config))) } else { haystack == needle }; if !matched { return None; } self.calculate_score::( AsciiChar::cast(haystack), AsciiChar::cast(needle), start, end, indices, ) } (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { // a purely ascii haystack can never be 
transformed to match // a needle that contains non-ascii chars since we don't allow gaps return None; } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { let matched = haystack[start..end] .iter() .map(|c| c.normalize(&self.config)) .eq(AsciiChar::cast(needle) .iter() .map(|c| c.normalize(&self.config))); if !matched { return None; } self.calculate_score::( haystack, AsciiChar::cast(needle), start, end, indices, ) } (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { let matched = haystack[start..end] .iter() .map(|c| c.normalize(&self.config)) .eq(needle.iter().map(|c| c.normalize(&self.config))); if !matched { return None; } self.calculate_score::(haystack, needle, start, end, indices) } }; Some(score) } }