2023-07-26 13:32:04 +00:00
|
|
|
/*!
|
|
|
|
`nucleo_matcher` is a low level crate that contains the matcher implementation
|
2023-08-27 14:21:51 +00:00
|
|
|
used by the high level `nucleo` crate.
|
2023-07-26 13:32:04 +00:00
|
|
|
|
|
|
|
The matcher is highly optimized and can significantly outperform `fzf` and
|
|
|
|
`skim` (the `fuzzy-matcher` crate). However some of these optimizations require
|
2023-08-27 14:21:51 +00:00
|
|
|
a slightly less convenient API. Be sure to carefully read the documentation of the
|
|
|
|
[`Matcher`] to avoid unexpected behaviour.
|
2023-07-26 13:32:04 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
// sadly ranges don't optimize well
|
|
|
|
#![allow(clippy::manual_range_contains)]
|
2023-08-27 23:33:47 +00:00
|
|
|
#![warn(missing_docs)]
|
2023-07-26 13:32:04 +00:00
|
|
|
|
|
|
|
pub mod chars;
|
|
|
|
mod config;
|
|
|
|
#[cfg(test)]
|
|
|
|
mod debug;
|
|
|
|
mod exact;
|
|
|
|
mod fuzzy_greedy;
|
|
|
|
mod fuzzy_optimal;
|
|
|
|
mod matrix;
|
2023-08-27 23:33:47 +00:00
|
|
|
pub mod pattern;
|
2023-07-26 13:32:04 +00:00
|
|
|
mod prefilter;
|
|
|
|
mod score;
|
|
|
|
mod utf32_str;
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
mod tests;
|
|
|
|
|
2023-08-27 23:33:47 +00:00
|
|
|
pub use crate::config::Config;
|
2023-08-27 15:29:17 +00:00
|
|
|
pub use crate::utf32_str::{Utf32Str, Utf32String};
|
2023-07-26 13:32:04 +00:00
|
|
|
|
|
|
|
use crate::chars::{AsciiChar, Char};
|
|
|
|
use crate::matrix::MatrixSlab;
|
|
|
|
|
|
|
|
/// A matcher engine that can execute (fuzzy) matches.
|
|
|
|
///
|
|
|
|
/// A matches contains **heap allocated** scratch memory that is reused during
|
2023-08-05 15:25:01 +00:00
|
|
|
/// matching. This scratch memory allows the matcher to guarantee that it will
|
2023-07-26 13:32:04 +00:00
|
|
|
/// **never allocate** during matching (with the exception of pushing to the
|
|
|
|
/// `indices` vector if there isn't enough capacity). However this scratch
|
|
|
|
/// memory is fairly large (around 135KB) so creating a matcher is expensive and
|
|
|
|
/// should be reused.
|
|
|
|
///
|
|
|
|
/// All `.._match` functions will not compute the indices of the matched chars
|
|
|
|
/// and are therefore significantly faster. These should be used to prefitler
|
|
|
|
/// and sort all matches. All `.._indices` functions will compute the indices of
|
|
|
|
/// the computed chars. These should be used when rendering the best N matches.
|
|
|
|
/// Note that the `indices` argument is **never cleared**. This allows running
|
|
|
|
/// multiple different matches on the same haystack and merging the indices by
|
|
|
|
/// sorting and deduplicating the vector.
|
|
|
|
///
|
2023-08-27 14:21:51 +00:00
|
|
|
/// The `needle` argument for each function must always be normalized by the caller
|
|
|
|
/// (unicode normalization and case folding if a case insesnitive match is produced).
|
|
|
|
/// Otherwise, the matcher may fail to produce a match. The [`pattern`] modules
|
|
|
|
/// provides utilities to preprocess needles.
|
|
|
|
///
|
|
|
|
/// Additionally it's recommend to perform separate matches for each word in
|
|
|
|
/// the needle. Consider the folloling example: If `foo bar` as used at the
|
|
|
|
/// needle it matches both `foo test baaar` and `foo hello-world bar`. However,
|
|
|
|
/// `foo test baaar` will receive a lower score/rank lower. `baaar` contains a
|
|
|
|
/// 2 character gap which will receive a penalty and therefore the user will
|
|
|
|
/// likely expect it to rank lower. However, if `foo bar` is matched as a single
|
|
|
|
/// query `hello-world` and `test` are both considered gaps too. As `hello-
|
|
|
|
/// world` is a much longer gap then `test` the extra penalty for `baaar` is
|
|
|
|
/// outweigh. If both words are matched individually the interspersed words
|
|
|
|
/// do not receive a penalty and `foo hello-world bar` ranks higher.
|
|
|
|
///
|
|
|
|
/// In general nucleo is a **substring matching tool** with no penalty assigned
|
|
|
|
/// to matches that start later within the same pattern (which enables the
|
|
|
|
/// usecase shown above). This may be undesirable in one very particular usecase:
|
|
|
|
/// For automatic suggestions for commands (like a shell). In these case the
|
|
|
|
/// assumption is that the user is actually typing the full haystack. In other words:
|
|
|
|
/// The matcher should prefer a prefix match. To accomedate that usecase the
|
|
|
|
/// [`prefer_prefix`](MatcherConfig::prefer_prefix) option can be set
|
|
|
|
/// to true. Note that the penalty given is quite small (and capped to a maximum)
|
|
|
|
/// to avoid overwriting the normal scoring heuristic.
|
|
|
|
///
|
|
|
|
///
|
2023-07-26 13:32:04 +00:00
|
|
|
/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than
|
|
|
|
/// that the matcher *will panic*. The caller must decide whether it wants to
|
|
|
|
/// filter out long haystacks or truncate them.
|
|
|
|
pub struct Matcher {
|
2023-08-27 23:33:47 +00:00
|
|
|
#[allow(missing_docs)]
|
|
|
|
pub config: Config,
|
2023-07-26 13:32:04 +00:00
|
|
|
slab: MatrixSlab,
|
|
|
|
}
|
|
|
|
|
2023-08-05 15:25:01 +00:00
|
|
|
// this is just here for convenience not sure if we should implement this
|
2023-07-27 20:08:06 +00:00
|
|
|
impl Clone for Matcher {
|
|
|
|
fn clone(&self) -> Self {
|
|
|
|
Matcher {
|
2023-08-27 23:33:47 +00:00
|
|
|
config: self.config.clone(),
|
2023-07-27 20:08:06 +00:00
|
|
|
slab: MatrixSlab::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl std::fmt::Debug for Matcher {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
f.debug_struct("Matcher")
|
|
|
|
.field("config", &self.config)
|
|
|
|
.finish_non_exhaustive()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-26 13:32:04 +00:00
|
|
|
impl Default for Matcher {
|
|
|
|
fn default() -> Self {
|
|
|
|
Matcher {
|
2023-08-27 23:33:47 +00:00
|
|
|
config: Config::DEFAULT,
|
2023-07-26 13:32:04 +00:00
|
|
|
slab: MatrixSlab::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Matcher {
|
2023-08-27 23:33:47 +00:00
|
|
|
/// Creates a new matcher instance, note that this will eagerly allocate
|
|
|
|
/// a fairly large chunk of heap memory (135KB currently but subject to
|
|
|
|
/// change) so matchers should be reused if used in a loop.
|
|
|
|
pub fn new(config: Config) -> Self {
|
2023-07-26 13:32:04 +00:00
|
|
|
Self {
|
|
|
|
config,
|
|
|
|
slab: MatrixSlab::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-05 15:25:01 +00:00
|
|
|
/// Find the fuzzy match with the highest score in the `haystack`.
|
2023-07-26 13:32:04 +00:00
|
|
|
///
|
|
|
|
/// This functions has `O(mn)` time complexity for short inputs. To
|
|
|
|
/// avoid slowdowns it automatically falls back to [greedy matching]
|
|
|
|
/// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
|
|
|
assert!(haystack.len() <= u32::MAX as usize);
|
|
|
|
self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Find the fuzzy match with the higehest score in the `haystack` and
|
|
|
|
/// compute its indices.
|
|
|
|
///
|
|
|
|
/// This functions has `O(mn)` time complexity for short inputs. To
|
|
|
|
/// avoid slowdowns it automatically falls back to [greedy matching]
|
|
|
|
/// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn fuzzy_indices(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
|
|
|
assert!(haystack.len() <= u32::MAX as usize);
|
|
|
|
self.fuzzy_matcher_impl::<true>(haystack, needle, indices)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn fuzzy_matcher_impl<const INDICES: bool>(
|
|
|
|
&mut self,
|
2023-07-30 02:52:44 +00:00
|
|
|
haystack_: Utf32Str<'_>,
|
2023-07-26 13:32:04 +00:00
|
|
|
needle_: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.len() > haystack_.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
return None;
|
|
|
|
}
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
2023-07-30 02:52:44 +00:00
|
|
|
if needle_.len() == haystack_.len() {
|
|
|
|
return self.exact_match_impl::<INDICES>(
|
|
|
|
haystack_,
|
|
|
|
needle_,
|
|
|
|
0,
|
|
|
|
haystack_.len(),
|
|
|
|
indices,
|
|
|
|
);
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
assert!(
|
2023-07-30 02:52:44 +00:00
|
|
|
haystack_.len() <= u32::MAX as usize,
|
2023-07-26 13:32:04 +00:00
|
|
|
"fuzzy matching is only support for up to 2^32-1 codepoints"
|
|
|
|
);
|
2023-07-30 02:52:44 +00:00
|
|
|
match (haystack_, needle_) {
|
2023-07-26 13:32:04 +00:00
|
|
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices);
|
|
|
|
}
|
|
|
|
let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?;
|
2023-07-30 02:52:44 +00:00
|
|
|
if needle_.len() == end - start {
|
|
|
|
return Some(self.calculate_score::<INDICES, _, _>(
|
|
|
|
AsciiChar::cast(haystack),
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
greedy_end,
|
|
|
|
indices,
|
|
|
|
));
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>(
|
|
|
|
AsciiChar::cast(haystack),
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
greedy_end,
|
|
|
|
end,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
|
|
|
// a purely ascii haystack can never be transformed to match
|
|
|
|
// a needle that contains non-ascii chars since we don't allow gaps
|
|
|
|
None
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
let res = self.substring_match_1_non_ascii::<INDICES>(
|
|
|
|
haystack,
|
|
|
|
needle as char,
|
|
|
|
start,
|
|
|
|
indices,
|
|
|
|
);
|
|
|
|
return Some(res);
|
|
|
|
}
|
|
|
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
2023-07-30 02:52:44 +00:00
|
|
|
if needle_.len() == end - start {
|
|
|
|
return self
|
|
|
|
.exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
self.fuzzy_match_optimal::<INDICES, char, AsciiChar>(
|
|
|
|
haystack,
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
start + 1,
|
|
|
|
end,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
let res = self
|
|
|
|
.substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices);
|
|
|
|
return Some(res);
|
|
|
|
}
|
|
|
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
2023-07-30 02:52:44 +00:00
|
|
|
if needle_.len() == end - start {
|
|
|
|
return self
|
|
|
|
.exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices);
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
self.fuzzy_match_optimal::<INDICES, char, char>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
start,
|
|
|
|
start + 1,
|
|
|
|
end,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Greedly find a fuzzy match in the `haystack`.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity but may provide unintutive (non-optimal)
|
|
|
|
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should
|
|
|
|
/// be preferred.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn fuzzy_match_greedy(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
) -> Option<u16> {
|
|
|
|
assert!(haystack.len() <= u32::MAX as usize);
|
|
|
|
self.fuzzy_match_greedy_impl::<false>(haystack, needle, &mut Vec::new())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Greedly find a fuzzy match in the `haystack` and compute its indices.
|
|
|
|
///
|
2023-08-05 15:25:01 +00:00
|
|
|
/// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal)
|
2023-07-26 13:32:04 +00:00
|
|
|
/// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should
|
|
|
|
/// be preferred.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn fuzzy_indices_greedy(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
|
|
|
assert!(haystack.len() <= u32::MAX as usize);
|
|
|
|
self.fuzzy_match_greedy_impl::<true>(haystack, needle, indices)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn fuzzy_match_greedy_impl<const INDICES: bool>(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle_: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.len() > haystack.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
return None;
|
|
|
|
}
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
if needle_.len() == haystack.len() {
|
2023-07-30 02:52:44 +00:00
|
|
|
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
assert!(
|
|
|
|
haystack.len() <= u32::MAX as usize,
|
|
|
|
"matching is only support for up to 2^32-1 codepoints"
|
|
|
|
);
|
|
|
|
match (haystack, needle_) {
|
|
|
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?;
|
2023-07-30 02:52:44 +00:00
|
|
|
if needle_.len() == greedy_end - start {
|
|
|
|
return Some(self.calculate_score::<INDICES, _, _>(
|
|
|
|
AsciiChar::cast(haystack),
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
greedy_end,
|
|
|
|
indices,
|
|
|
|
));
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
self.fuzzy_match_greedy_::<INDICES, AsciiChar, AsciiChar>(
|
|
|
|
AsciiChar::cast(haystack),
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
greedy_end,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
|
|
|
// a purely ascii haystack can never be transformed to match
|
|
|
|
// a needle that contains non-ascii chars since we don't allow gaps
|
|
|
|
None
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
self.fuzzy_match_greedy_::<INDICES, char, AsciiChar>(
|
|
|
|
haystack,
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
start + 1,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
self.fuzzy_match_greedy_::<INDICES, char, char>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
start,
|
|
|
|
start + 1,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Finds the substring match with the highest score in the `haystack`.
|
|
|
|
///
|
|
|
|
/// This functions has `O(nm)` time complexity. However many cases can
|
|
|
|
/// be significantly accelerated using prefilters so it's usually fast
|
|
|
|
/// in practice.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn substring_match(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle_: Utf32Str<'_>,
|
|
|
|
) -> Option<u16> {
|
|
|
|
self.substring_match_impl::<false>(haystack, needle_, &mut Vec::new())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Finds the substring match with the highest score in the `haystack` and
|
|
|
|
/// compute its indices.
|
|
|
|
///
|
|
|
|
/// This functions has `O(nm)` time complexity. However many cases can
|
|
|
|
/// be significantly accelerated using prefilters so it's usually fast
|
|
|
|
/// in practice.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn substring_indices(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle_: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
|
|
|
self.substring_match_impl::<true>(haystack, needle_, indices)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn substring_match_impl<const INDICES: bool>(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle_: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.len() > haystack.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
return None;
|
|
|
|
}
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
if needle_.len() == haystack.len() {
|
2023-07-30 02:52:44 +00:00
|
|
|
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
assert!(
|
|
|
|
haystack.len() <= u32::MAX as usize,
|
|
|
|
"matching is only support for up to 2^32-1 codepoints"
|
|
|
|
);
|
|
|
|
match (haystack, needle_) {
|
|
|
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices);
|
|
|
|
}
|
|
|
|
self.substring_match_ascii::<INDICES>(haystack, needle, indices)
|
|
|
|
}
|
|
|
|
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
|
|
|
// a purely ascii haystack can never be transformed to match
|
|
|
|
// a needle that contains non-ascii chars since we don't allow gaps
|
|
|
|
None
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
let res = self.substring_match_1_non_ascii::<INDICES>(
|
|
|
|
haystack,
|
|
|
|
needle as char,
|
|
|
|
start,
|
|
|
|
indices,
|
|
|
|
);
|
|
|
|
return Some(res);
|
|
|
|
}
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
|
|
|
self.substring_match_non_ascii::<INDICES, _>(
|
|
|
|
haystack,
|
|
|
|
AsciiChar::cast(needle),
|
|
|
|
start,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
|
|
|
if let &[needle] = needle {
|
|
|
|
let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?;
|
|
|
|
let res = self
|
|
|
|
.substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices);
|
|
|
|
return Some(res);
|
|
|
|
}
|
|
|
|
let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
|
|
|
|
self.fuzzy_match_optimal::<INDICES, char, char>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
start,
|
|
|
|
start + 1,
|
|
|
|
end,
|
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle and haystack match exactly.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut leading_space = 0;
|
|
|
|
let mut trailing_space = 0;
|
|
|
|
if !needle.first().is_whitespace() {
|
|
|
|
leading_space = haystack.leading_white_space()
|
|
|
|
}
|
|
|
|
if !needle.last().is_whitespace() {
|
|
|
|
trailing_space = haystack.trailing_white_space()
|
|
|
|
}
|
|
|
|
// avoid wraparound in size check
|
|
|
|
if trailing_space == haystack.len() {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
self.exact_match_impl::<false>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
leading_space,
|
|
|
|
haystack.len() - trailing_space,
|
|
|
|
&mut Vec::new(),
|
|
|
|
)
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle and haystack match exactly and compute the matches indices.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn exact_indices(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut leading_space = 0;
|
|
|
|
let mut trailing_space = 0;
|
|
|
|
if !needle.first().is_whitespace() {
|
|
|
|
leading_space = haystack.leading_white_space()
|
|
|
|
}
|
|
|
|
if !needle.last().is_whitespace() {
|
|
|
|
trailing_space = haystack.trailing_white_space()
|
|
|
|
}
|
|
|
|
// avoid wraparound in size check
|
|
|
|
if trailing_space == haystack.len() {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
self.exact_match_impl::<true>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
leading_space,
|
|
|
|
haystack.len() - trailing_space,
|
|
|
|
indices,
|
|
|
|
)
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle is a prefix of the haystack.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut leading_space = 0;
|
|
|
|
if !needle.first().is_whitespace() {
|
|
|
|
leading_space = haystack.leading_white_space()
|
|
|
|
}
|
|
|
|
if haystack.len() - leading_space < needle.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
None
|
|
|
|
} else {
|
2023-08-06 17:55:31 +00:00
|
|
|
self.exact_match_impl::<false>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
leading_space,
|
|
|
|
needle.len() + leading_space,
|
|
|
|
&mut Vec::new(),
|
|
|
|
)
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle is a prefix of the haystack and compute the matches indices.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn prefix_indices(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut leading_space = 0;
|
|
|
|
if !needle.first().is_whitespace() {
|
|
|
|
leading_space = haystack.leading_white_space()
|
|
|
|
}
|
|
|
|
if haystack.len() - leading_space < needle.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
None
|
|
|
|
} else {
|
2023-08-06 17:55:31 +00:00
|
|
|
self.exact_match_impl::<true>(
|
|
|
|
haystack,
|
|
|
|
needle,
|
|
|
|
leading_space,
|
|
|
|
needle.len() + leading_space,
|
|
|
|
indices,
|
|
|
|
)
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle is a postfix of the haystack.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut trailing_spaces = 0;
|
|
|
|
if !needle.last().is_whitespace() {
|
|
|
|
trailing_spaces = haystack.trailing_white_space()
|
|
|
|
}
|
|
|
|
if haystack.len() - trailing_spaces < needle.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
None
|
|
|
|
} else {
|
|
|
|
self.exact_match_impl::<false>(
|
2023-07-30 02:52:44 +00:00
|
|
|
haystack,
|
2023-07-26 13:32:04 +00:00
|
|
|
needle,
|
2023-08-06 17:55:31 +00:00
|
|
|
haystack.len() - needle.len() - trailing_spaces,
|
|
|
|
haystack.len() - trailing_spaces,
|
2023-07-26 13:32:04 +00:00
|
|
|
&mut Vec::new(),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks whether needle is a postfix of the haystack and compute the matches indices.
|
|
|
|
///
|
|
|
|
/// This functions has `O(n)` time complexity.
|
|
|
|
///
|
|
|
|
/// See the [matcher documentation](crate::Matcher) for more details.
|
|
|
|
pub fn postfix_indices(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle: Utf32Str<'_>,
|
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle.is_empty() {
|
|
|
|
return Some(0);
|
|
|
|
}
|
|
|
|
let mut trailing_spaces = 0;
|
|
|
|
if !needle.last().is_whitespace() {
|
|
|
|
trailing_spaces = haystack.trailing_white_space()
|
|
|
|
}
|
|
|
|
if haystack.len() - trailing_spaces < needle.len() {
|
2023-07-26 13:32:04 +00:00
|
|
|
None
|
|
|
|
} else {
|
|
|
|
self.exact_match_impl::<true>(
|
2023-07-30 02:52:44 +00:00
|
|
|
haystack,
|
2023-07-26 13:32:04 +00:00
|
|
|
needle,
|
2023-08-06 17:55:31 +00:00
|
|
|
haystack.len() - needle.len() - trailing_spaces,
|
|
|
|
haystack.len() - trailing_spaces,
|
2023-07-26 13:32:04 +00:00
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn exact_match_impl<const INDICES: bool>(
|
|
|
|
&mut self,
|
|
|
|
haystack: Utf32Str<'_>,
|
|
|
|
needle_: Utf32Str<'_>,
|
2023-07-30 02:52:44 +00:00
|
|
|
start: usize,
|
|
|
|
end: usize,
|
2023-07-26 13:32:04 +00:00
|
|
|
indices: &mut Vec<u32>,
|
|
|
|
) -> Option<u16> {
|
2023-08-06 17:55:31 +00:00
|
|
|
if needle_.len() != end - start {
|
2023-07-26 13:32:04 +00:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
assert!(
|
|
|
|
haystack.len() <= u32::MAX as usize,
|
|
|
|
"matching is only support for up to 2^32-1 codepoints"
|
|
|
|
);
|
|
|
|
let score = match (haystack, needle_) {
|
|
|
|
(Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => {
|
|
|
|
let matched = if self.config.ignore_case {
|
2023-07-30 02:52:44 +00:00
|
|
|
AsciiChar::cast(haystack)[start..end]
|
2023-07-26 13:32:04 +00:00
|
|
|
.iter()
|
|
|
|
.map(|c| c.normalize(&self.config))
|
|
|
|
.eq(AsciiChar::cast(needle)
|
|
|
|
.iter()
|
|
|
|
.map(|c| c.normalize(&self.config)))
|
|
|
|
} else {
|
|
|
|
haystack == needle
|
|
|
|
};
|
|
|
|
if !matched {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
self.calculate_score::<INDICES, _, _>(
|
|
|
|
AsciiChar::cast(haystack),
|
|
|
|
AsciiChar::cast(needle),
|
2023-07-30 02:52:44 +00:00
|
|
|
start,
|
|
|
|
end,
|
2023-07-26 13:32:04 +00:00
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => {
|
|
|
|
// a purely ascii haystack can never be transformed to match
|
|
|
|
// a needle that contains non-ascii chars since we don't allow gaps
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => {
|
2023-07-30 02:52:44 +00:00
|
|
|
let matched = haystack[start..end]
|
|
|
|
.iter()
|
|
|
|
.map(|c| c.normalize(&self.config))
|
|
|
|
.eq(AsciiChar::cast(needle)
|
2023-07-26 13:32:04 +00:00
|
|
|
.iter()
|
2023-07-30 02:52:44 +00:00
|
|
|
.map(|c| c.normalize(&self.config)));
|
2023-07-27 20:08:06 +00:00
|
|
|
if !matched {
|
|
|
|
return None;
|
|
|
|
}
|
2023-07-26 13:32:04 +00:00
|
|
|
|
|
|
|
self.calculate_score::<INDICES, _, _>(
|
|
|
|
haystack,
|
|
|
|
AsciiChar::cast(needle),
|
2023-07-30 02:52:44 +00:00
|
|
|
start,
|
|
|
|
end,
|
2023-07-26 13:32:04 +00:00
|
|
|
indices,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
(Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => {
|
2023-07-30 02:52:44 +00:00
|
|
|
let matched = haystack[start..end]
|
2023-07-26 13:32:04 +00:00
|
|
|
.iter()
|
|
|
|
.map(|c| c.normalize(&self.config))
|
|
|
|
.eq(needle.iter().map(|c| c.normalize(&self.config)));
|
|
|
|
if !matched {
|
|
|
|
return None;
|
|
|
|
}
|
2023-07-30 02:52:44 +00:00
|
|
|
self.calculate_score::<INDICES, _, _>(haystack, needle, start, end, indices)
|
2023-07-26 13:32:04 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
Some(score)
|
|
|
|
}
|
|
|
|
}
|