From 6b08991faca5e5fbfc728e06beaa042770afa2ec Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 6 Aug 2023 14:42:47 +0200 Subject: [PATCH] test pattern parsing and fix edge cases --- matcher/src/chars.rs | 9 ++- matcher/src/utf32_str.rs | 3 +- src/pattern.rs | 71 ++++++++++++-------- src/pattern/tests.rs | 135 +++++++++++++++++++++++++++++++++++++++ src/utf32_string.rs | 3 +- 5 files changed, 191 insertions(+), 30 deletions(-) create mode 100644 src/pattern/tests.rs diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index 7f89577..a469fc1 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -80,7 +80,7 @@ impl Char for AsciiChar { fn char_class_non_ascii(c: char) -> CharClass { if c.is_lowercase() { CharClass::Lower - } else if c.is_uppercase() { + } else if is_upper_case(c) { CharClass::Upper } else if c.is_numeric() { CharClass::Number @@ -144,6 +144,13 @@ pub fn to_lower_case(c: char) -> char { .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } +#[inline(always)] +pub fn is_upper_case(c: char) -> bool { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .is_ok() +} + #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] #[non_exhaustive] pub enum CharClass { diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 768c724..554f72c 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -140,11 +140,10 @@ impl fmt::Debug for Utf32Str<'_> { impl fmt::Display for Utf32Str<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "\"")?; for c in self.chars() { write!(f, "{c}")? 
} - write!(f, "\"") + Ok(()) } } diff --git a/src/pattern.rs b/src/pattern.rs index 1efb7a9..58a0530 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,5 +1,8 @@ use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; +#[cfg(test)] +mod tests; + use crate::Utf32String; #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -35,7 +38,7 @@ impl PatternAtom { kind: PatternKind, escape_whitespace: bool, ) -> PatternAtom { - let mut ignore_case = case == CaseMatching::Ignore; + let mut ignore_case; let needle = if needle.is_ascii() { let mut needle = if escape_whitespace { if let Some((start, rem)) = needle.split_once("\\ ") { @@ -53,16 +56,20 @@ impl PatternAtom { }; match case { - CaseMatching::Ignore => needle.make_ascii_lowercase(), + CaseMatching::Ignore => { + ignore_case = true; + needle.make_ascii_lowercase() + } CaseMatching::Smart => { ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) } - CaseMatching::Respect => (), + CaseMatching::Respect => ignore_case = false, } Utf32String::Ascii(needle.into_boxed_str()) } else { let mut needle_ = Vec::with_capacity(needle.len()); + ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); if escape_whitespace { let mut saw_backslash = false; for mut c in chars::graphemes(needle) { @@ -82,7 +89,7 @@ impl PatternAtom { match case { CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Smart => { - ignore_case = ignore_case && !c.is_uppercase(); + ignore_case = ignore_case && !chars::is_upper_case(c) } CaseMatching::Respect => (), } @@ -96,7 +103,7 @@ impl PatternAtom { match case { CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Smart => { - ignore_case = ignore_case && !c.is_uppercase(); + ignore_case = ignore_case && !chars::is_upper_case(c); } CaseMatching::Respect => (), } @@ -116,10 +123,17 @@ impl PatternAtom { fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom { let mut atom = raw; - let invert = atom.starts_with('!'); - if invert { 
- atom = &atom[1..]; - } + let invert = match atom.as_bytes() { + [b'!', ..] => { + atom = &atom[1..]; + true + } + [b'\\', b'!', ..] => { + atom = &atom[1..]; + false + } + _ => false, + }; let mut kind = match atom.as_bytes() { [b'^', ..] => { @@ -137,8 +151,12 @@ impl PatternAtom { _ => PatternKind::Fuzzy, }; + let mut append_dollar = false; match atom.as_bytes() { - [.., b'\\', b'$'] => (), + [.., b'\\', b'$'] => { + append_dollar = true; + atom = &atom[..atom.len() - 2] + } [.., b'$'] => { kind = if kind == PatternKind::Fuzzy { PatternKind::Postfix @@ -156,6 +174,9 @@ impl PatternAtom { let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true); pattern.invert = invert; + if append_dollar { + pattern.needle.push('$'); + } pattern } } @@ -221,7 +242,7 @@ impl MultiPattern { #[derive(Debug)] pub struct Pattern { - terms: Vec, + atoms: Vec, case_matching: CaseMatching, normalize: bool, status: Status, @@ -230,7 +251,7 @@ pub struct Pattern { impl Pattern { pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern { Pattern { - terms: Vec::new(), + atoms: Vec::new(), case_matching, normalize: matcher_config.normalize, status: Status::Unchanged, @@ -242,7 +263,7 @@ impl Pattern { pattern: &str, ) -> Pattern { let mut res = Pattern { - terms: Vec::new(), + atoms: Vec::new(), case_matching, normalize: matcher_config.normalize, status: Status::Unchanged, @@ -252,11 +273,11 @@ impl Pattern { } pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { - if self.terms.is_empty() { + if self.atoms.is_empty() { return Some(0); } let mut score = 0; - for pattern in &self.terms { + for pattern in &self.atoms { matcher.config.ignore_case = pattern.ignore_case; let pattern_score = match pattern.kind { PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), @@ -284,11 +305,11 @@ impl Pattern { matcher: &mut Matcher, indices: &mut Vec, ) -> Option { - if self.terms.is_empty() { + if 
self.atoms.is_empty() { return Some(0); } let mut score = 0; - for pattern in &self.terms { + for pattern in &self.atoms { matcher.config.ignore_case = pattern.ignore_case; if pattern.invert { let pattern_score = match pattern.kind { @@ -330,8 +351,8 @@ impl Pattern { } pub fn parse_from(&mut self, pattern: &str, append: bool) { - self.terms.clear(); - let invert = self.terms.last().map_or(false, |pat| pat.invert); + self.atoms.clear(); + let invert = self.atoms.last().map_or(false, |pat| pat.invert); let atoms = pattern_atoms(pattern).filter_map(|atom| { let atom = PatternAtom::parse(atom, self.normalize, self.case_matching); if atom.needle.is_empty() { @@ -339,7 +360,7 @@ impl Pattern { } Some(atom) }); - self.terms.extend(atoms); + self.atoms.extend(atoms); self.status = if append && !invert && self.status != Status::Rescore { Status::Update @@ -349,10 +370,10 @@ impl Pattern { } pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) { - self.terms.clear(); + self.atoms.clear(); let pattern = PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false); - self.terms.push(pattern); + self.atoms.push(pattern); self.status = if append && self.status != Status::Rescore { Status::Update } else { @@ -361,14 +382,14 @@ impl Pattern { } pub fn is_empty(&self) -> bool { - self.terms.is_empty() + self.atoms.is_empty() } } impl Clone for Pattern { fn clone(&self) -> Self { Self { - terms: self.terms.clone(), + atoms: self.atoms.clone(), case_matching: self.case_matching, normalize: self.normalize, status: self.status, @@ -376,7 +397,7 @@ impl Clone for Pattern { } fn clone_from(&mut self, source: &Self) { - self.terms.clone_from(&source.terms); + self.atoms.clone_from(&source.atoms); self.case_matching = source.case_matching; self.normalize = source.normalize; self.status = source.status; diff --git a/src/pattern/tests.rs b/src/pattern/tests.rs new file mode 100644 index 0000000..a822f1c --- /dev/null +++ b/src/pattern/tests.rs @@ 
-0,0 +1,135 @@ +use crate::pattern::PatternAtom; +use crate::{CaseMatching, Pattern, PatternKind}; + +fn parse_atom(pat: &str) -> PatternAtom { + parse_atom_with(pat, CaseMatching::Smart) +} + +fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom { + let mut pat = parse_with(pat, case_matching); + assert_eq!(pat.atoms.len(), 1); + pat.atoms.remove(0) +} + +fn parse_with(pat: &str, case_matching: CaseMatching) -> Pattern { + let mut res = Pattern::new(&nucleo_matcher::MatcherConfig::DEFAULT, case_matching); + res.parse_from(pat, false); + res +} + +#[test] +fn negative() { + let pat = parse_atom("!foo"); + assert!(pat.invert); + assert_eq!(pat.kind, PatternKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("!^foo"); + assert!(pat.invert); + assert_eq!(pat.kind, PatternKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("!foo$"); + assert!(pat.invert); + assert_eq!(pat.kind, PatternKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("!^foo$"); + assert!(pat.invert); + assert_eq!(pat.kind, PatternKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn pattern_kinds() { + let pat = parse_atom("foo"); + assert!(!pat.invert); + assert_eq!(pat.kind, PatternKind::Fuzzy); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("'foo"); + assert!(!pat.invert); + assert_eq!(pat.kind, PatternKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("^foo"); + assert!(!pat.invert); + assert_eq!(pat.kind, PatternKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("foo$"); + assert!(!pat.invert); + assert_eq!(pat.kind, PatternKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom("^foo$"); + assert!(!pat.invert); + assert_eq!(pat.kind, PatternKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn case_matching() { + let 
pat = parse_atom_with("foo", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom_with("Foo", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = parse_atom_with("Foo", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = parse_atom_with("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = parse_atom_with("Foo", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = parse_atom_with("Äxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "axx"); + let pat = parse_atom_with("Äxx", CaseMatching::Respect); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = parse_atom_with("Äxx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = parse_atom_with("Äxx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = parse_atom_with("你xx", CaseMatching::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = parse_atom_with("你xx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = parse_atom_with("Ⲽxx", CaseMatching::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Ⲽxx"); + let pat = parse_atom_with("Ⲽxx", CaseMatching::Ignore); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "ⲽxx"); +} + +#[test] +fn escape() { + let pat = parse_atom("foo\\ bar"); + assert_eq!(pat.needle.to_string(), "foo bar"); + let pat = parse_atom("\\!foo"); + assert_eq!(pat.needle.to_string(), "!foo"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("\\'foo"); + 
assert_eq!(pat.needle.to_string(), "'foo"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("\\^foo"); + assert_eq!(pat.needle.to_string(), "^foo"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("foo\\$"); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("^foo\\$"); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, PatternKind::Prefix); + let pat = parse_atom("\\^foo\\$"); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("\\!^foo\\$"); + assert_eq!(pat.needle.to_string(), "!^foo$"); + assert_eq!(pat.kind, PatternKind::Fuzzy); + let pat = parse_atom("!\\^foo\\$"); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, PatternKind::Substring); +} diff --git a/src/utf32_string.rs b/src/utf32_string.rs index f8410a0..da69cf3 100644 --- a/src/utf32_string.rs +++ b/src/utf32_string.rs @@ -186,10 +186,9 @@ impl fmt::Debug for Utf32String { impl fmt::Display for Utf32String { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "\"")?; for c in self.chars() { write!(f, "{c}")? } - write!(f, "\"") + Ok(()) } }