Correctly handle normalization in pattern API

This commit is contained in:
Pascal Kuthe 2023-12-17 18:36:07 +01:00
parent b41c989daf
commit 1edf451192
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
7 changed files with 146 additions and 59 deletions

View File

@ -1,9 +1,14 @@
# Changelog
# Unreleased
# [0.3.0] - 2023-12-20
## **Breaking Changes**
* Pattern API method now requires a Unicode `Normalization` strategy in addition to a `CaseMatching` strategy.
## Bugfixes
* correctly handle Unicode normalization when there are normalizable characters in the pattern, for example characters with umlauts
* when the needle is composed of a single char, return the score and index
of the best position instead of always returning the first matched character
in the haystack
@ -19,5 +24,6 @@
*initial public release*
[0.3.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.3.0
[0.2.1]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.1
[0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0

View File

@ -16,12 +16,12 @@ can contain special characters to control what kind of match is performed (see
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching};
# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]);
```
@ -30,13 +30,13 @@ If the pattern should be matched literally (without this special parsing)
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let paths = ["^foo/bar", "bar/^foo", "foobar"];
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]);
```
@ -44,10 +44,10 @@ If word segmentation is also not desired, a single `Atom` can be constructed dir
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind};
let paths = ["foobar", "foo bar"];
let mut matcher = Matcher::new(Config::DEFAULT);
let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo bar", 192)]);
```

View File

@ -25,6 +25,19 @@ pub enum CaseMatching {
Smart,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to handle unicode normalization,
pub enum Normalization {
/// Characters never match their normalized version (`a != ä`).
Never,
/// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
/// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
#[default]
#[cfg(feature = "unicode-normalization")]
Smart,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// The kind of matching algorithm to run for an atom.
@ -73,24 +86,41 @@ pub struct Atom {
pub kind: AtomKind,
needle: Utf32String,
ignore_case: bool,
normalize: bool,
}
impl Atom {
/// Creates a single [`Atom`] from a string by performing unicode
/// normalization and case folding (if necessary). Optionally `\ ` can
/// be escaped to ` `.
pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom {
Atom::new_inner(needle, case, kind, escape_whitespace, false)
pub fn new(
needle: &str,
case: CaseMatching,
normalize: Normalization,
kind: AtomKind,
escape_whitespace: bool,
) -> Atom {
Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
}
fn new_inner(
needle: &str,
case: CaseMatching,
normalization: Normalization,
kind: AtomKind,
escape_whitespace: bool,
append_dollar: bool,
) -> Atom {
let mut ignore_case;
let mut normalize;
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
#[cfg(not(feature = "unicode-normalization"))]
{
normalize = false;
}
let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") {
@ -133,6 +163,10 @@ impl Atom {
{
ignore_case = false;
}
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
if escape_whitespace {
let mut saw_backslash = false;
for mut c in chars::graphemes(needle) {
@ -155,6 +189,13 @@ impl Atom {
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
needle_.push(c);
}
} else {
@ -168,6 +209,13 @@ impl Atom {
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
c
});
needle_.extend(chars);
@ -182,13 +230,14 @@ impl Atom {
needle,
negative: false,
ignore_case,
normalize,
}
}
/// Parse a pattern atom from a string. Some special trailing and leading
/// characters can be used to control the atom kind. See [`AtomKind`] for
/// details.
pub fn parse(raw: &str, case: CaseMatching) -> Atom {
pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
let mut atom = raw;
let invert = match atom.as_bytes() {
[b'!', ..] => {
@ -239,7 +288,7 @@ impl Atom {
kind = AtomKind::Substring
}
let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar);
let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
pattern.negative = invert;
pattern
}
@ -252,6 +301,7 @@ impl Atom {
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
@ -285,6 +335,7 @@ impl Atom {
indices: &mut Vec<u32>,
) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
if self.negative {
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
@ -371,10 +422,15 @@ impl Pattern {
/// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
/// and ^ don't receive special treatment). If you want to match the entire
/// pattern as a single needle use a single [`Atom`] instead.
pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern {
pub fn new(
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
kind: AtomKind,
) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::new(pat, case_matching, kind, true);
let pat = Atom::new(pat, case_matching, normalize, kind, true);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
@ -384,10 +440,10 @@ impl Pattern {
/// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
/// cause different matching behaviour (see [`AtomKind`]). These can be
/// escaped with backslash.
pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern {
pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::parse(pat, case_matching);
let pat = Atom::parse(pat, case_matching, normalize);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
@ -477,10 +533,15 @@ impl Pattern {
/// Refreshes this pattern by reparsing it from a string. This is mostly
/// equivalent to just constructing a new pattern using [`Pattern::parse`]
/// but is slightly more efficient by reusing some allocations
pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) {
pub fn reparse(
&mut self,
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
) {
self.atoms.clear();
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = Atom::parse(atom, case_matching);
let atom = Atom::parse(atom, case_matching, normalize);
if atom.needle.is_empty() {
return None;
}

View File

@ -1,20 +1,20 @@
use crate::pattern::{Atom, AtomKind, CaseMatching};
use crate::pattern::{Atom, AtomKind, CaseMatching, Normalization};
#[test]
fn negative() {
let pat = Atom::parse("!foo", CaseMatching::Smart);
let pat = Atom::parse("!foo", CaseMatching::Smart, Normalization::Smart);
assert!(pat.negative);
assert_eq!(pat.kind, AtomKind::Substring);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("!^foo", CaseMatching::Smart);
let pat = Atom::parse("!^foo", CaseMatching::Smart, Normalization::Smart);
assert!(pat.negative);
assert_eq!(pat.kind, AtomKind::Prefix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("!foo$", CaseMatching::Smart);
let pat = Atom::parse("!foo$", CaseMatching::Smart, Normalization::Smart);
assert!(pat.negative);
assert_eq!(pat.kind, AtomKind::Postfix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("!^foo$", CaseMatching::Smart);
let pat = Atom::parse("!^foo$", CaseMatching::Smart, Normalization::Smart);
assert!(pat.negative);
assert_eq!(pat.kind, AtomKind::Exact);
assert_eq!(pat.needle.to_string(), "foo");
@ -22,23 +22,23 @@ fn negative() {
#[test]
fn pattern_kinds() {
let pat = Atom::parse("foo", CaseMatching::Smart);
let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.negative);
assert_eq!(pat.kind, AtomKind::Fuzzy);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("'foo", CaseMatching::Smart);
let pat = Atom::parse("'foo", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.negative);
assert_eq!(pat.kind, AtomKind::Substring);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("^foo", CaseMatching::Smart);
let pat = Atom::parse("^foo", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.negative);
assert_eq!(pat.kind, AtomKind::Prefix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("foo$", CaseMatching::Smart);
let pat = Atom::parse("foo$", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.negative);
assert_eq!(pat.kind, AtomKind::Postfix);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("^foo$", CaseMatching::Smart);
let pat = Atom::parse("^foo$", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.negative);
assert_eq!(pat.kind, AtomKind::Exact);
assert_eq!(pat.needle.to_string(), "foo");
@ -46,69 +46,69 @@ fn pattern_kinds() {
#[test]
fn case_matching() {
let pat = Atom::parse("foo", CaseMatching::Smart);
let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("Foo", CaseMatching::Smart);
let pat = Atom::parse("Foo", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = Atom::parse("Foo", CaseMatching::Ignore);
let pat = Atom::parse("Foo", CaseMatching::Ignore, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "foo");
let pat = Atom::parse("Foo", CaseMatching::Respect);
let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = Atom::parse("Foo", CaseMatching::Respect);
let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Foo");
let pat = Atom::parse("Äxx", CaseMatching::Ignore);
let pat = Atom::parse("Äxx", CaseMatching::Ignore, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "äxx");
let pat = Atom::parse("Äxx", CaseMatching::Respect);
let pat = Atom::parse("Äxx", CaseMatching::Respect, Normalization::Smart);
assert!(!pat.ignore_case);
let pat = Atom::parse("Axx", CaseMatching::Smart);
let pat = Atom::parse("Axx", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Axx");
let pat = Atom::parse("你xx", CaseMatching::Smart);
let pat = Atom::parse("你xx", CaseMatching::Smart, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "你xx");
let pat = Atom::parse("你xx", CaseMatching::Ignore);
let pat = Atom::parse("你xx", CaseMatching::Ignore, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "你xx");
let pat = Atom::parse("Ⲽxx", CaseMatching::Smart);
let pat = Atom::parse("Ⲽxx", CaseMatching::Smart, Normalization::Smart);
assert!(!pat.ignore_case);
assert_eq!(pat.needle.to_string(), "Ⲽxx");
let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore);
let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore, Normalization::Smart);
assert!(pat.ignore_case);
assert_eq!(pat.needle.to_string(), "ⲽxx");
}
#[test]
fn escape() {
let pat = Atom::parse("foo\\ bar", CaseMatching::Smart);
let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "foo bar");
let pat = Atom::parse("\\!foo", CaseMatching::Smart);
let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "!foo");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("\\'foo", CaseMatching::Smart);
let pat = Atom::parse("\\'foo", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "'foo");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("\\^foo", CaseMatching::Smart);
let pat = Atom::parse("\\^foo", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "^foo");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("foo\\$", CaseMatching::Smart);
let pat = Atom::parse("foo\\$", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "foo$");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("^foo\\$", CaseMatching::Smart);
let pat = Atom::parse("^foo\\$", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "foo$");
assert_eq!(pat.kind, AtomKind::Prefix);
let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart);
let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "^foo$");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart);
let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "!^foo$");
assert_eq!(pat.kind, AtomKind::Fuzzy);
let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart);
let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart, Normalization::Smart);
assert_eq!(pat.needle.to_string(), "^foo$");
assert_eq!(pat.kind, AtomKind::Substring);
}

View File

@ -1,4 +1,5 @@
use crate::chars::Char;
use crate::pattern::{CaseMatching, Normalization, Pattern};
use crate::score::{
BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD,
MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH,
@ -577,12 +578,13 @@ fn test_optimal() {
// the second f instead of the third yielding a higher score
// (despite using the same scoring function!)
(
"xf foo",
"xf.foo",
"xfoo",
&[0, 3, 4, 5],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3)
BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER
- PENALTY_GAP_START
- PENALTY_GAP_EXTENSION,
- PENALTY_GAP_EXTENSION
+ BONUS_BOUNDARY * 3,
),
(
"xf fo",
@ -698,3 +700,18 @@ fn test_single_char_needle() {
)],
);
}
#[test]
fn umlaut() {
let paths = ["be", ""];
let mut matcher = Matcher::new(Config::DEFAULT);
let matches = Pattern::parse("ë", CaseMatching::Ignore, Normalization::Smart)
.match_list(paths, &mut matcher);
assert_eq!(matches.len(), 1);
let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Never)
.match_list(paths, &mut matcher);
assert_eq!(matches.len(), 1);
let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Smart)
.match_list(paths, &mut matcher);
assert_eq!(matches.len(), 2);
}

View File

@ -1,4 +1,4 @@
pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern};
pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Normalization, Pattern};
use nucleo_matcher::{Matcher, Utf32String};
#[cfg(test)]
@ -46,6 +46,7 @@ impl MultiPattern {
column: usize,
new_text: &str,
case_matching: CaseMatching,
normalization: Normalization,
append: bool,
) {
let old_status = self.cols[column].1;
@ -61,7 +62,9 @@ impl MultiPattern {
} else {
self.cols[column].1 = Status::Rescore;
}
self.cols[column].0.reparse(new_text, case_matching);
self.cols[column]
.0
.reparse(new_text, case_matching, normalization);
}
pub fn column_pattern(&self, column: usize) -> &Pattern {

View File

@ -1,14 +1,14 @@
use nucleo_matcher::pattern::CaseMatching;
use nucleo_matcher::pattern::{CaseMatching, Normalization};
use crate::pattern::{MultiPattern, Status};
#[test]
fn append() {
let mut pat = MultiPattern::new(1);
pat.reparse(0, "!", CaseMatching::Smart, true);
pat.reparse(0, "!", CaseMatching::Smart, Normalization::Smart, true);
assert_eq!(pat.status(), Status::Update);
pat.reparse(0, "!f", CaseMatching::Smart, true);
pat.reparse(0, "!f", CaseMatching::Smart, Normalization::Smart, true);
assert_eq!(pat.status(), Status::Update);
pat.reparse(0, "!fo", CaseMatching::Smart, true);
pat.reparse(0, "!fo", CaseMatching::Smart, Normalization::Smart, true);
assert_eq!(pat.status(), Status::Rescore);
}