test pattern parsing and fix edge cases

This commit is contained in:
Pascal Kuthe 2023-08-06 14:42:47 +02:00
parent bb0b5f8726
commit 6b08991fac
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
5 changed files with 191 additions and 30 deletions

View File

@ -80,7 +80,7 @@ impl Char for AsciiChar {
fn char_class_non_ascii(c: char) -> CharClass { fn char_class_non_ascii(c: char) -> CharClass {
if c.is_lowercase() { if c.is_lowercase() {
CharClass::Lower CharClass::Lower
} else if c.is_uppercase() { } else if is_upper_case(c) {
CharClass::Upper CharClass::Upper
} else if c.is_numeric() { } else if c.is_numeric() {
CharClass::Number CharClass::Number
@ -144,6 +144,13 @@ pub fn to_lower_case(c: char) -> char {
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
} }
/// Returns `true` if `c` appears as the first ("upper") element of an entry
/// in `CASE_FOLDING_SIMPLE`.
///
/// The lookup is a binary search keyed on that first element, so it relies
/// on the table being sorted by it.
#[inline(always)]
pub fn is_upper_case(c: char) -> bool {
    let lookup = CASE_FOLDING_SIMPLE.binary_search_by_key(&c, |&(upper, _)| upper);
    lookup.is_ok()
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive] #[non_exhaustive]
pub enum CharClass { pub enum CharClass {

View File

@ -140,11 +140,10 @@ impl fmt::Debug for Utf32Str<'_> {
impl fmt::Display for Utf32Str<'_> { impl fmt::Display for Utf32Str<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
for c in self.chars() { for c in self.chars() {
write!(f, "{c}")? write!(f, "{c}")?
} }
write!(f, "\"") Ok(())
} }
} }

View File

@ -1,5 +1,8 @@
use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
#[cfg(test)]
mod tests;
use crate::Utf32String; use crate::Utf32String;
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@ -35,7 +38,7 @@ impl PatternAtom {
kind: PatternKind, kind: PatternKind,
escape_whitespace: bool, escape_whitespace: bool,
) -> PatternAtom { ) -> PatternAtom {
let mut ignore_case = case == CaseMatching::Ignore; let mut ignore_case;
let needle = if needle.is_ascii() { let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace { let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") { if let Some((start, rem)) = needle.split_once("\\ ") {
@ -53,16 +56,20 @@ impl PatternAtom {
}; };
match case { match case {
CaseMatching::Ignore => needle.make_ascii_lowercase(), CaseMatching::Ignore => {
ignore_case = true;
needle.make_ascii_lowercase()
}
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
} }
CaseMatching::Respect => (), CaseMatching::Respect => ignore_case = false,
} }
Utf32String::Ascii(needle.into_boxed_str()) Utf32String::Ascii(needle.into_boxed_str())
} else { } else {
let mut needle_ = Vec::with_capacity(needle.len()); let mut needle_ = Vec::with_capacity(needle.len());
ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
if escape_whitespace { if escape_whitespace {
let mut saw_backslash = false; let mut saw_backslash = false;
for mut c in chars::graphemes(needle) { for mut c in chars::graphemes(needle) {
@ -82,7 +89,7 @@ impl PatternAtom {
match case { match case {
CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase(); ignore_case = ignore_case && !chars::is_upper_case(c)
} }
CaseMatching::Respect => (), CaseMatching::Respect => (),
} }
@ -96,7 +103,7 @@ impl PatternAtom {
match case { match case {
CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase(); ignore_case = ignore_case && !chars::is_upper_case(c);
} }
CaseMatching::Respect => (), CaseMatching::Respect => (),
} }
@ -116,10 +123,17 @@ impl PatternAtom {
fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom { fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom {
let mut atom = raw; let mut atom = raw;
let invert = atom.starts_with('!'); let invert = match atom.as_bytes() {
if invert { [b'!', ..] => {
atom = &atom[1..]; atom = &atom[1..];
true
} }
[b'\\', b'!', ..] => {
atom = &atom[1..];
false
}
_ => false,
};
let mut kind = match atom.as_bytes() { let mut kind = match atom.as_bytes() {
[b'^', ..] => { [b'^', ..] => {
@ -137,8 +151,12 @@ impl PatternAtom {
_ => PatternKind::Fuzzy, _ => PatternKind::Fuzzy,
}; };
let mut append_dollar = false;
match atom.as_bytes() { match atom.as_bytes() {
[.., b'\\', b'$'] => (), [.., b'\\', b'$'] => {
append_dollar = true;
atom = &atom[..atom.len() - 2]
}
[.., b'$'] => { [.., b'$'] => {
kind = if kind == PatternKind::Fuzzy { kind = if kind == PatternKind::Fuzzy {
PatternKind::Postfix PatternKind::Postfix
@ -156,6 +174,9 @@ impl PatternAtom {
let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true); let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true);
pattern.invert = invert; pattern.invert = invert;
if append_dollar {
pattern.needle.push('$');
}
pattern pattern
} }
} }
@ -221,7 +242,7 @@ impl MultiPattern {
#[derive(Debug)] #[derive(Debug)]
pub struct Pattern { pub struct Pattern {
terms: Vec<PatternAtom>, atoms: Vec<PatternAtom>,
case_matching: CaseMatching, case_matching: CaseMatching,
normalize: bool, normalize: bool,
status: Status, status: Status,
@ -230,7 +251,7 @@ pub struct Pattern {
impl Pattern { impl Pattern {
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern { pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
Pattern { Pattern {
terms: Vec::new(), atoms: Vec::new(),
case_matching, case_matching,
normalize: matcher_config.normalize, normalize: matcher_config.normalize,
status: Status::Unchanged, status: Status::Unchanged,
@ -242,7 +263,7 @@ impl Pattern {
pattern: &str, pattern: &str,
) -> Pattern { ) -> Pattern {
let mut res = Pattern { let mut res = Pattern {
terms: Vec::new(), atoms: Vec::new(),
case_matching, case_matching,
normalize: matcher_config.normalize, normalize: matcher_config.normalize,
status: Status::Unchanged, status: Status::Unchanged,
@ -252,11 +273,11 @@ impl Pattern {
} }
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> { pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.terms.is_empty() { if self.atoms.is_empty() {
return Some(0); return Some(0);
} }
let mut score = 0; let mut score = 0;
for pattern in &self.terms { for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case; matcher.config.ignore_case = pattern.ignore_case;
let pattern_score = match pattern.kind { let pattern_score = match pattern.kind {
PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
@ -284,11 +305,11 @@ impl Pattern {
matcher: &mut Matcher, matcher: &mut Matcher,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u32> { ) -> Option<u32> {
if self.terms.is_empty() { if self.atoms.is_empty() {
return Some(0); return Some(0);
} }
let mut score = 0; let mut score = 0;
for pattern in &self.terms { for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case; matcher.config.ignore_case = pattern.ignore_case;
if pattern.invert { if pattern.invert {
let pattern_score = match pattern.kind { let pattern_score = match pattern.kind {
@ -330,8 +351,8 @@ impl Pattern {
} }
pub fn parse_from(&mut self, pattern: &str, append: bool) { pub fn parse_from(&mut self, pattern: &str, append: bool) {
self.terms.clear(); self.atoms.clear();
let invert = self.terms.last().map_or(false, |pat| pat.invert); let invert = self.atoms.last().map_or(false, |pat| pat.invert);
let atoms = pattern_atoms(pattern).filter_map(|atom| { let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = PatternAtom::parse(atom, self.normalize, self.case_matching); let atom = PatternAtom::parse(atom, self.normalize, self.case_matching);
if atom.needle.is_empty() { if atom.needle.is_empty() {
@ -339,7 +360,7 @@ impl Pattern {
} }
Some(atom) Some(atom)
}); });
self.terms.extend(atoms); self.atoms.extend(atoms);
self.status = if append && !invert && self.status != Status::Rescore { self.status = if append && !invert && self.status != Status::Rescore {
Status::Update Status::Update
@ -349,10 +370,10 @@ impl Pattern {
} }
pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) { pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) {
self.terms.clear(); self.atoms.clear();
let pattern = let pattern =
PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false); PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false);
self.terms.push(pattern); self.atoms.push(pattern);
self.status = if append && self.status != Status::Rescore { self.status = if append && self.status != Status::Rescore {
Status::Update Status::Update
} else { } else {
@ -361,14 +382,14 @@ impl Pattern {
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.terms.is_empty() self.atoms.is_empty()
} }
} }
impl Clone for Pattern { impl Clone for Pattern {
fn clone(&self) -> Self { fn clone(&self) -> Self {
Self { Self {
terms: self.terms.clone(), atoms: self.atoms.clone(),
case_matching: self.case_matching, case_matching: self.case_matching,
normalize: self.normalize, normalize: self.normalize,
status: self.status, status: self.status,
@ -376,7 +397,7 @@ impl Clone for Pattern {
} }
fn clone_from(&mut self, source: &Self) { fn clone_from(&mut self, source: &Self) {
self.terms.clone_from(&source.terms); self.atoms.clone_from(&source.atoms);
self.case_matching = source.case_matching; self.case_matching = source.case_matching;
self.normalize = source.normalize; self.normalize = source.normalize;
self.status = source.status; self.status = source.status;

135
src/pattern/tests.rs Normal file
View File

@ -0,0 +1,135 @@
use crate::pattern::PatternAtom;
use crate::{CaseMatching, Pattern, PatternKind};
/// Parses `pat` into a single atom using smart case matching.
fn parse_atom(pat: &str) -> PatternAtom {
    let case_matching = CaseMatching::Smart;
    parse_atom_with(pat, case_matching)
}
/// Parses `pat` with the given case matching and returns its only atom.
///
/// Panics (failing the calling test) if the pattern does not parse into
/// exactly one atom.
fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom {
    let mut parsed = parse_with(pat, case_matching);
    assert_eq!(parsed.atoms.len(), 1);
    // Exactly one atom is present, so taking index 0 yields that atom.
    parsed.atoms.swap_remove(0)
}
/// Builds a `Pattern` from the default matcher configuration and parses
/// `pat` into it (non-append mode).
fn parse_with(pat: &str, case_matching: CaseMatching) -> Pattern {
    let config = nucleo_matcher::MatcherConfig::DEFAULT;
    let mut pattern = Pattern::new(&config, case_matching);
    pattern.parse_from(pat, false);
    pattern
}
/// A leading `!` marks the atom as inverted; the remaining anchors still
/// select the pattern kind, and an unanchored inverted atom is a substring
/// match.
#[test]
fn negative() {
    let cases = [
        ("!foo", PatternKind::Substring),
        ("!^foo", PatternKind::Prefix),
        ("!foo$", PatternKind::Postfix),
        ("!^foo$", PatternKind::Exact),
    ];
    for (input, expected_kind) in cases {
        let atom = parse_atom(input);
        assert!(atom.invert, "input: {input:?}");
        assert_eq!(atom.kind, expected_kind, "input: {input:?}");
        assert_eq!(atom.needle.to_string(), "foo", "input: {input:?}");
    }
}
/// Each anchor syntax (`'`, `^`, `$`, `^…$`) maps to its pattern kind, the
/// anchor characters are stripped from the needle, and the atom is not
/// inverted.
#[test]
fn pattern_kinds() {
    let cases = [
        ("foo", PatternKind::Fuzzy),
        ("'foo", PatternKind::Substring),
        ("^foo", PatternKind::Prefix),
        ("foo$", PatternKind::Postfix),
        ("^foo$", PatternKind::Exact),
    ];
    for (input, expected_kind) in cases {
        let atom = parse_atom(input);
        assert!(!atom.invert, "input: {input:?}");
        assert_eq!(atom.kind, expected_kind, "input: {input:?}");
        assert_eq!(atom.needle.to_string(), "foo", "input: {input:?}");
    }
}
/// Checks the resulting `ignore_case` flag and needle text for each
/// `CaseMatching` mode, covering ASCII input, non-ASCII input with an
/// uppercase letter, and non-ASCII input without any case.
///
/// Each row is `(input, mode, expected ignore_case, expected needle)`.
/// Every distinct case is listed exactly once.
#[test]
fn case_matching() {
    let cases = [
        // ASCII needles.
        ("foo", CaseMatching::Smart, true, "foo"),
        ("Foo", CaseMatching::Smart, false, "Foo"),
        ("Foo", CaseMatching::Ignore, true, "foo"),
        ("Foo", CaseMatching::Respect, false, "Foo"),
        // Non-ASCII needle starting with an uppercase letter.
        ("Äxx", CaseMatching::Ignore, true, "axx"),
        ("Äxx", CaseMatching::Respect, false, "Axx"),
        ("Äxx", CaseMatching::Smart, false, "Axx"),
        // Non-ASCII needle without cased characters.
        ("你xx", CaseMatching::Smart, true, "你xx"),
        ("你xx", CaseMatching::Ignore, true, "你xx"),
        // Uppercase letter outside the Latin range.
        ("Ⲽxx", CaseMatching::Smart, false, "Ⲽxx"),
        ("Ⲽxx", CaseMatching::Ignore, true, "ⲽxx"),
    ];
    for (input, case, expected_ignore_case, expected_needle) in cases {
        let atom = parse_atom_with(input, case);
        assert_eq!(
            atom.ignore_case, expected_ignore_case,
            "ignore_case mismatch for {input:?} with {case:?}"
        );
        assert_eq!(
            atom.needle.to_string(),
            expected_needle,
            "needle mismatch for {input:?} with {case:?}"
        );
    }
}
/// Backslash escapes turn whitespace and the special characters `!`, `'`,
/// `^` and `$` into literal needle text instead of pattern syntax.
///
/// Each row is `(input, expected needle, expected kind)`; a kind of `None`
/// means the kind is not checked for that input.
#[test]
fn escape() {
    let cases = [
        ("foo\\ bar", "foo bar", None),
        ("\\!foo", "!foo", Some(PatternKind::Fuzzy)),
        ("\\'foo", "'foo", Some(PatternKind::Fuzzy)),
        ("\\^foo", "^foo", Some(PatternKind::Fuzzy)),
        ("foo\\$", "foo$", Some(PatternKind::Fuzzy)),
        ("^foo\\$", "foo$", Some(PatternKind::Prefix)),
        ("\\^foo\\$", "^foo$", Some(PatternKind::Fuzzy)),
        ("\\!^foo\\$", "!^foo$", Some(PatternKind::Fuzzy)),
        ("!\\^foo\\$", "^foo$", Some(PatternKind::Substring)),
    ];
    for (input, expected_needle, expected_kind) in cases {
        let atom = parse_atom(input);
        assert_eq!(atom.needle.to_string(), expected_needle, "input: {input:?}");
        if let Some(kind) = expected_kind {
            assert_eq!(atom.kind, kind, "input: {input:?}");
        }
    }
}

View File

@ -186,10 +186,9 @@ impl fmt::Debug for Utf32String {
impl fmt::Display for Utf32String { impl fmt::Display for Utf32String {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
for c in self.chars() { for c in self.chars() {
write!(f, "{c}")? write!(f, "{c}")?
} }
write!(f, "\"") Ok(())
} }
} }