From 1edf4511920b798fed42a2f9093c1ddcbec76946 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 17 Dec 2023 18:36:07 +0100 Subject: [PATCH] Correctly handle normalization in pattern API --- CHANGELOG.md | 8 +++- matcher/src/lib.rs | 16 +++---- matcher/src/pattern.rs | 81 +++++++++++++++++++++++++++++++----- matcher/src/pattern/tests.rs | 62 +++++++++++++-------------- matcher/src/tests.rs | 23 ++++++++-- src/pattern.rs | 7 +++- src/pattern/tests.rs | 8 ++-- 7 files changed, 146 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b20882d..75b9725 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,14 @@ # Changelog -# Unreleased +# [0.3.0] - 2023-12-20 + +## **Breaking Changes** + +* Pattern API method now requires a Unicode `Normalization` strategy in addition to a `CaseMatching` strategy. ## Bugfixes +* correctly handle Unicode normalization when there are normalizable characters in the pattern, for example characters with umlauts * when the needle is composed of a single char, return the score and index of the best position instead of always returning the first matched character in the haystack @@ -19,5 +24,6 @@ *initial public release* +[0.3.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.3.0 [0.2.1]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.1 [0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0 diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 4812fc4..2cd4b67 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -16,12 +16,12 @@ can contain special characters to control what kind of match is performed (see ``` # use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, CaseMatching}; +# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching}; let paths = ["foo/bar", "bar/foo", "foobar"]; let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); -let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher); +let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); -let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher); +let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]); ``` @@ -30,13 +30,13 @@ If the pattern should be matched literally (without this special parsing) ``` # use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind}; +# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization}; let paths = ["foo/bar", "bar/foo", "foobar"]; let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); -let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher); +let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); let paths = ["^foo/bar", "bar/^foo", "foobar"]; -let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher); +let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]); ``` @@ -44,10 +44,10 @@ If word segmentation is also not desired, a single `Atom` can be constructed dir ``` # use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind}; +# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind}; let paths = ["foobar", "foo bar"]; let mut matcher = Matcher::new(Config::DEFAULT); -let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); +let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); assert_eq!(matches, vec![("foo bar", 192)]); ``` diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs index 5aa16cd..4801017 100644 --- a/matcher/src/pattern.rs +++ b/matcher/src/pattern.rs @@ -25,6 +25,19 @@ pub enum CaseMatching { Smart, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[non_exhaustive] +/// How to handle unicode normalization, +pub enum Normalization { + /// Characters never match their normalized version (`a != ä`). + Never, + /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom + /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`). + #[default] + #[cfg(feature = "unicode-normalization")] + Smart, +} + #[derive(Debug, PartialEq, Eq, Clone, Copy)] #[non_exhaustive] /// The kind of matching algorithm to run for an atom. @@ -73,24 +86,41 @@ pub struct Atom { pub kind: AtomKind, needle: Utf32String, ignore_case: bool, + normalize: bool, } impl Atom { /// Creates a single [`Atom`] from a string by performing unicode /// normalization and case folding (if necessary). Optionally `\ ` can /// be escaped to ` `. - pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom { - Atom::new_inner(needle, case, kind, escape_whitespace, false) + pub fn new( + needle: &str, + case: CaseMatching, + normalize: Normalization, + kind: AtomKind, + escape_whitespace: bool, + ) -> Atom { + Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false) } fn new_inner( needle: &str, case: CaseMatching, + normalization: Normalization, kind: AtomKind, escape_whitespace: bool, append_dollar: bool, ) -> Atom { let mut ignore_case; + let mut normalize; + #[cfg(feature = "unicode-normalization")] + { + normalize = matches!(normalization, Normalization::Smart); + } + #[cfg(not(feature = "unicode-normalization"))] + { + normalize = false; + } let needle = if needle.is_ascii() { let mut needle = if escape_whitespace { if let Some((start, rem)) = needle.split_once("\\ ") { @@ -133,6 +163,10 @@ impl Atom { { ignore_case = false; } + #[cfg(feature = "unicode-normalization")] + { + normalize = matches!(normalization, Normalization::Smart); + } if escape_whitespace { let mut saw_backslash = false; for mut c in chars::graphemes(needle) { @@ -155,6 +189,13 @@ impl Atom { } CaseMatching::Respect => (), } + match normalization { + #[cfg(feature = "unicode-normalization")] + Normalization::Smart => { + normalize = normalize && chars::normalize(c) == c; + } + Normalization::Never => (), + } needle_.push(c); } } else { @@ -168,6 +209,13 @@ impl Atom { } CaseMatching::Respect => (), } + match normalization { + #[cfg(feature = "unicode-normalization")] + Normalization::Smart => { + normalize = normalize && chars::normalize(c) == c; + } + Normalization::Never => (), + } c }); needle_.extend(chars); @@ -182,13 +230,14 @@ impl Atom { needle, negative: false, ignore_case, + normalize, } } /// Parse a pattern atom from a string. Some special trailing and leading /// characters can be used to control the atom kind. See [`AtomKind`] for /// details. - pub fn parse(raw: &str, case: CaseMatching) -> Atom { + pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom { let mut atom = raw; let invert = match atom.as_bytes() { [b'!', ..] => { @@ -239,7 +288,7 @@ impl Atom { kind = AtomKind::Substring } - let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar); + let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar); pattern.negative = invert; pattern } @@ -252,6 +301,7 @@ impl Atom { /// each pattern atom. pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { matcher.config.ignore_case = self.ignore_case; + matcher.config.normalize = self.normalize; let pattern_score = match self.kind { AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), @@ -285,6 +335,7 @@ impl Atom { indices: &mut Vec, ) -> Option { matcher.config.ignore_case = self.ignore_case; + matcher.config.normalize = self.normalize; if self.negative { let pattern_score = match self.kind { AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), @@ -371,10 +422,15 @@ impl Pattern { /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, ' /// and ^ don't receive special treatment). If you want to match the entire /// pattern as a single needle use a single [`Atom`] instead. - pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern { + pub fn new( + pattern: &str, + case_matching: CaseMatching, + normalize: Normalization, + kind: AtomKind, + ) -> Pattern { let atoms = pattern_atoms(pattern) .filter_map(|pat| { - let pat = Atom::new(pat, case_matching, kind, true); + let pat = Atom::new(pat, case_matching, normalize, kind, true); (!pat.needle.is_empty()).then_some(pat) }) .collect(); @@ -384,10 +440,10 @@ impl Pattern { /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will /// cause different matching behaviour (see [`AtomKind`]). These can be /// escaped with backslash. - pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern { + pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern { let atoms = pattern_atoms(pattern) .filter_map(|pat| { - let pat = Atom::parse(pat, case_matching); + let pat = Atom::parse(pat, case_matching, normalize); (!pat.needle.is_empty()).then_some(pat) }) .collect(); @@ -477,10 +533,15 @@ impl Pattern { /// Refreshes this pattern by reparsing it from a string. This is mostly /// equivalent to just constructing a new pattern using [`Pattern::parse`] /// but is slightly more efficient by reusing some allocations - pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) { + pub fn reparse( + &mut self, + pattern: &str, + case_matching: CaseMatching, + normalize: Normalization, + ) { self.atoms.clear(); let atoms = pattern_atoms(pattern).filter_map(|atom| { - let atom = Atom::parse(atom, case_matching); + let atom = Atom::parse(atom, case_matching, normalize); if atom.needle.is_empty() { return None; } diff --git a/matcher/src/pattern/tests.rs b/matcher/src/pattern/tests.rs index 8fcd0a9..246c2db 100644 --- a/matcher/src/pattern/tests.rs +++ b/matcher/src/pattern/tests.rs @@ -1,20 +1,20 @@ -use crate::pattern::{Atom, AtomKind, CaseMatching}; +use crate::pattern::{Atom, AtomKind, CaseMatching, Normalization}; #[test] fn negative() { - let pat = Atom::parse("!foo", CaseMatching::Smart); + let pat = Atom::parse("!foo", CaseMatching::Smart, Normalization::Smart); assert!(pat.negative); assert_eq!(pat.kind, AtomKind::Substring); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!^foo", CaseMatching::Smart); + let pat = Atom::parse("!^foo", CaseMatching::Smart, Normalization::Smart); assert!(pat.negative); assert_eq!(pat.kind, AtomKind::Prefix); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!foo$", CaseMatching::Smart); + let pat = Atom::parse("!foo$", CaseMatching::Smart, Normalization::Smart); assert!(pat.negative); assert_eq!(pat.kind, AtomKind::Postfix); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!^foo$", CaseMatching::Smart); + let pat = Atom::parse("!^foo$", CaseMatching::Smart, Normalization::Smart); assert!(pat.negative); assert_eq!(pat.kind, AtomKind::Exact); assert_eq!(pat.needle.to_string(), "foo"); @@ -22,23 +22,23 @@ fn negative() { #[test] fn pattern_kinds() { - let pat = Atom::parse("foo", CaseMatching::Smart); + let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); assert!(!pat.negative); assert_eq!(pat.kind, AtomKind::Fuzzy); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("'foo", CaseMatching::Smart); + let pat = Atom::parse("'foo", CaseMatching::Smart, Normalization::Smart); assert!(!pat.negative); assert_eq!(pat.kind, AtomKind::Substring); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("^foo", CaseMatching::Smart); + let pat = Atom::parse("^foo", CaseMatching::Smart, Normalization::Smart); assert!(!pat.negative); assert_eq!(pat.kind, AtomKind::Prefix); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("foo$", CaseMatching::Smart); + let pat = Atom::parse("foo$", CaseMatching::Smart, Normalization::Smart); assert!(!pat.negative); assert_eq!(pat.kind, AtomKind::Postfix); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("^foo$", CaseMatching::Smart); + let pat = Atom::parse("^foo$", CaseMatching::Smart, Normalization::Smart); assert!(!pat.negative); assert_eq!(pat.kind, AtomKind::Exact); assert_eq!(pat.needle.to_string(), "foo"); @@ -46,69 +46,69 @@ fn pattern_kinds() { #[test] fn case_matching() { - let pat = Atom::parse("foo", CaseMatching::Smart); + let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("Foo", CaseMatching::Smart); + let pat = Atom::parse("Foo", CaseMatching::Smart, Normalization::Smart); assert!(!pat.ignore_case); assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Foo", CaseMatching::Ignore); + let pat = Atom::parse("Foo", CaseMatching::Ignore, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("Foo", CaseMatching::Respect); + let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); assert!(!pat.ignore_case); assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Foo", CaseMatching::Respect); + let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); assert!(!pat.ignore_case); assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Äxx", CaseMatching::Ignore); + let pat = Atom::parse("Äxx", CaseMatching::Ignore, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "äxx"); - let pat = Atom::parse("Äxx", CaseMatching::Respect); + let pat = Atom::parse("Äxx", CaseMatching::Respect, Normalization::Smart); assert!(!pat.ignore_case); - let pat = Atom::parse("Axx", CaseMatching::Smart); + let pat = Atom::parse("Axx", CaseMatching::Smart, Normalization::Smart); assert!(!pat.ignore_case); assert_eq!(pat.needle.to_string(), "Axx"); - let pat = Atom::parse("你xx", CaseMatching::Smart); + let pat = Atom::parse("你xx", CaseMatching::Smart, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "你xx"); - let pat = Atom::parse("你xx", CaseMatching::Ignore); + let pat = Atom::parse("你xx", CaseMatching::Ignore, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "你xx"); - let pat = Atom::parse("Ⲽxx", CaseMatching::Smart); + let pat = Atom::parse("Ⲽxx", CaseMatching::Smart, Normalization::Smart); assert!(!pat.ignore_case); assert_eq!(pat.needle.to_string(), "Ⲽxx"); - let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore); + let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore, Normalization::Smart); assert!(pat.ignore_case); assert_eq!(pat.needle.to_string(), "ⲽxx"); } #[test] fn escape() { - let pat = Atom::parse("foo\\ bar", CaseMatching::Smart); + let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "foo bar"); - let pat = Atom::parse("\\!foo", CaseMatching::Smart); + let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "!foo"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\'foo", CaseMatching::Smart); + let pat = Atom::parse("\\'foo", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "'foo"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\^foo", CaseMatching::Smart); + let pat = Atom::parse("\\^foo", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "^foo"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("foo\\$", CaseMatching::Smart); + let pat = Atom::parse("foo\\$", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "foo$"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("^foo\\$", CaseMatching::Smart); + let pat = Atom::parse("^foo\\$", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "foo$"); assert_eq!(pat.kind, AtomKind::Prefix); - let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart); + let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "^foo$"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart); + let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "!^foo$"); assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart); + let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart, Normalization::Smart); assert_eq!(pat.needle.to_string(), "^foo$"); assert_eq!(pat.kind, AtomKind::Substring); } diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index 67f69c1..64fcc3c 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -1,4 +1,5 @@ use crate::chars::Char; +use crate::pattern::{CaseMatching, Normalization, Pattern}; use crate::score::{ BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, @@ -577,12 +578,13 @@ fn test_optimal() { // the second f instead of the third yielding a higher score // (despite using the same scoring function!) ( - "xf foo", + "xf.foo", "xfoo", &[0, 3, 4, 5], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3) + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION, + - PENALTY_GAP_EXTENSION + + BONUS_BOUNDARY * 3, ), ( "xf fo", @@ -698,3 +700,18 @@ fn test_single_char_needle() { )], ); } + +#[test] +fn umlaut() { + let paths = ["be", "bë"]; + let mut matcher = Matcher::new(Config::DEFAULT); + let matches = Pattern::parse("ë", CaseMatching::Ignore, Normalization::Smart) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 1); + let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Never) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 1); + let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Smart) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 2); +} diff --git a/src/pattern.rs b/src/pattern.rs index fb17d6a..816b0a3 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,4 +1,4 @@ -pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Pattern}; +pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Normalization, Pattern}; use nucleo_matcher::{Matcher, Utf32String}; #[cfg(test)] @@ -46,6 +46,7 @@ impl MultiPattern { column: usize, new_text: &str, case_matching: CaseMatching, + normalization: Normalization, append: bool, ) { let old_status = self.cols[column].1; @@ -61,7 +62,9 @@ impl MultiPattern { } else { self.cols[column].1 = Status::Rescore; } - self.cols[column].0.reparse(new_text, case_matching); + self.cols[column] + .0 + .reparse(new_text, case_matching, normalization); } pub fn column_pattern(&self, column: usize) -> &Pattern { diff --git a/src/pattern/tests.rs b/src/pattern/tests.rs index 3854e15..40e8e32 100644 --- a/src/pattern/tests.rs +++ b/src/pattern/tests.rs @@ -1,14 +1,14 @@ -use nucleo_matcher::pattern::CaseMatching; +use nucleo_matcher::pattern::{CaseMatching, Normalization}; use crate::pattern::{MultiPattern, Status}; #[test] fn append() { let mut pat = MultiPattern::new(1); - pat.reparse(0, "!", CaseMatching::Smart, true); + pat.reparse(0, "!", CaseMatching::Smart, Normalization::Smart, true); assert_eq!(pat.status(), Status::Update); - pat.reparse(0, "!f", CaseMatching::Smart, true); + pat.reparse(0, "!f", CaseMatching::Smart, Normalization::Smart, true); assert_eq!(pat.status(), Status::Update); - pat.reparse(0, "!fo", CaseMatching::Smart, true); + pat.reparse(0, "!fo", CaseMatching::Smart, Normalization::Smart, true); assert_eq!(pat.status(), Status::Rescore); }