test pattern parsing and fix edge cases

This commit is contained in:
Pascal Kuthe 2023-08-06 14:42:47 +02:00
parent bb0b5f8726
commit 6b08991fac
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
5 changed files with 191 additions and 30 deletions

View File

@ -80,7 +80,7 @@ impl Char for AsciiChar {
fn char_class_non_ascii(c: char) -> CharClass {
if c.is_lowercase() {
CharClass::Lower
} else if c.is_uppercase() {
} else if is_upper_case(c) {
CharClass::Upper
} else if c.is_numeric() {
CharClass::Number
@ -144,6 +144,13 @@ pub fn to_lower_case(c: char) -> char {
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
}
/// Reports whether `c` appears as the first ("upper") element of an entry in
/// `CASE_FOLDING_SIMPLE`.
///
/// The lookup is a binary search keyed on that first element, so it presumably
/// relies on the table being sorted by it (NOTE(review): confirm against the
/// table definition).
#[inline(always)]
pub fn is_upper_case(c: char) -> bool {
    CASE_FOLDING_SIMPLE
        .binary_search_by(|&(upper, _)| upper.cmp(&c))
        .is_ok()
}
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
#[non_exhaustive]
pub enum CharClass {

View File

@ -140,11 +140,10 @@ impl fmt::Debug for Utf32Str<'_> {
impl fmt::Display for Utf32Str<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
for c in self.chars() {
write!(f, "{c}")?
}
write!(f, "\"")
Ok(())
}
}

View File

@ -1,5 +1,8 @@
use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
#[cfg(test)]
mod tests;
use crate::Utf32String;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@ -35,7 +38,7 @@ impl PatternAtom {
kind: PatternKind,
escape_whitespace: bool,
) -> PatternAtom {
let mut ignore_case = case == CaseMatching::Ignore;
let mut ignore_case;
let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") {
@ -53,16 +56,20 @@ impl PatternAtom {
};
match case {
CaseMatching::Ignore => needle.make_ascii_lowercase(),
CaseMatching::Ignore => {
ignore_case = true;
needle.make_ascii_lowercase()
}
CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
}
CaseMatching::Respect => (),
CaseMatching::Respect => ignore_case = false,
}
Utf32String::Ascii(needle.into_boxed_str())
} else {
let mut needle_ = Vec::with_capacity(needle.len());
ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
if escape_whitespace {
let mut saw_backslash = false;
for mut c in chars::graphemes(needle) {
@ -82,7 +89,7 @@ impl PatternAtom {
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase();
ignore_case = ignore_case && !chars::is_upper_case(c)
}
CaseMatching::Respect => (),
}
@ -96,7 +103,7 @@ impl PatternAtom {
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase();
ignore_case = ignore_case && !chars::is_upper_case(c);
}
CaseMatching::Respect => (),
}
@ -116,10 +123,17 @@ impl PatternAtom {
fn parse(raw: &str, normalize: bool, case: CaseMatching) -> PatternAtom {
let mut atom = raw;
let invert = atom.starts_with('!');
if invert {
atom = &atom[1..];
}
let invert = match atom.as_bytes() {
[b'!', ..] => {
atom = &atom[1..];
true
}
[b'\\', b'!', ..] => {
atom = &atom[1..];
false
}
_ => false,
};
let mut kind = match atom.as_bytes() {
[b'^', ..] => {
@ -137,8 +151,12 @@ impl PatternAtom {
_ => PatternKind::Fuzzy,
};
let mut append_dollar = false;
match atom.as_bytes() {
[.., b'\\', b'$'] => (),
[.., b'\\', b'$'] => {
append_dollar = true;
atom = &atom[..atom.len() - 2]
}
[.., b'$'] => {
kind = if kind == PatternKind::Fuzzy {
PatternKind::Postfix
@ -156,6 +174,9 @@ impl PatternAtom {
let mut pattern = PatternAtom::literal(atom, normalize, case, kind, true);
pattern.invert = invert;
if append_dollar {
pattern.needle.push('$');
}
pattern
}
}
@ -221,7 +242,7 @@ impl MultiPattern {
#[derive(Debug)]
pub struct Pattern {
terms: Vec<PatternAtom>,
atoms: Vec<PatternAtom>,
case_matching: CaseMatching,
normalize: bool,
status: Status,
@ -230,7 +251,7 @@ pub struct Pattern {
impl Pattern {
pub fn new(matcher_config: &MatcherConfig, case_matching: CaseMatching) -> Pattern {
Pattern {
terms: Vec::new(),
atoms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
@ -242,7 +263,7 @@ impl Pattern {
pattern: &str,
) -> Pattern {
let mut res = Pattern {
terms: Vec::new(),
atoms: Vec::new(),
case_matching,
normalize: matcher_config.normalize,
status: Status::Unchanged,
@ -252,11 +273,11 @@ impl Pattern {
}
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
if self.terms.is_empty() {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.terms {
for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case;
let pattern_score = match pattern.kind {
PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)),
@ -284,11 +305,11 @@ impl Pattern {
matcher: &mut Matcher,
indices: &mut Vec<u32>,
) -> Option<u32> {
if self.terms.is_empty() {
if self.atoms.is_empty() {
return Some(0);
}
let mut score = 0;
for pattern in &self.terms {
for pattern in &self.atoms {
matcher.config.ignore_case = pattern.ignore_case;
if pattern.invert {
let pattern_score = match pattern.kind {
@ -330,8 +351,8 @@ impl Pattern {
}
pub fn parse_from(&mut self, pattern: &str, append: bool) {
self.terms.clear();
let invert = self.terms.last().map_or(false, |pat| pat.invert);
self.atoms.clear();
let invert = self.atoms.last().map_or(false, |pat| pat.invert);
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = PatternAtom::parse(atom, self.normalize, self.case_matching);
if atom.needle.is_empty() {
@ -339,7 +360,7 @@ impl Pattern {
}
Some(atom)
});
self.terms.extend(atoms);
self.atoms.extend(atoms);
self.status = if append && !invert && self.status != Status::Rescore {
Status::Update
@ -349,10 +370,10 @@ impl Pattern {
}
pub fn set_literal(&mut self, pattern: &str, kind: PatternKind, append: bool) {
self.terms.clear();
self.atoms.clear();
let pattern =
PatternAtom::literal(pattern, self.normalize, self.case_matching, kind, false);
self.terms.push(pattern);
self.atoms.push(pattern);
self.status = if append && self.status != Status::Rescore {
Status::Update
} else {
@ -361,14 +382,14 @@ impl Pattern {
}
pub fn is_empty(&self) -> bool {
self.terms.is_empty()
self.atoms.is_empty()
}
}
impl Clone for Pattern {
fn clone(&self) -> Self {
Self {
terms: self.terms.clone(),
atoms: self.atoms.clone(),
case_matching: self.case_matching,
normalize: self.normalize,
status: self.status,
@ -376,7 +397,7 @@ impl Clone for Pattern {
}
fn clone_from(&mut self, source: &Self) {
self.terms.clone_from(&source.terms);
self.atoms.clone_from(&source.atoms);
self.case_matching = source.case_matching;
self.normalize = source.normalize;
self.status = source.status;

135
src/pattern/tests.rs Normal file
View File

@ -0,0 +1,135 @@
use crate::pattern::PatternAtom;
use crate::{CaseMatching, Pattern, PatternKind};
/// Parses `pat` with [`CaseMatching::Smart`] and returns the single resulting atom.
///
/// Thin wrapper around `parse_atom_with`.
fn parse_atom(pat: &str) -> PatternAtom {
    let case_matching = CaseMatching::Smart;
    parse_atom_with(pat, case_matching)
}
/// Parses `pat` with the given case-matching mode and returns its only atom.
///
/// # Panics
///
/// Panics if parsing does not yield exactly one atom.
fn parse_atom_with(pat: &str, case_matching: CaseMatching) -> PatternAtom {
    let mut parsed = parse_with(pat, case_matching);
    assert_eq!(parsed.atoms.len(), 1);
    // With exactly one atom present, the last element is also the first.
    parsed
        .atoms
        .pop()
        .expect("length was asserted to be 1 above")
}
/// Creates a [`Pattern`] from the default matcher configuration and parses `pat` into it.
///
/// `parse_from` is called with `append = false`.
fn parse_with(pat: &str, case_matching: CaseMatching) -> Pattern {
    let config = nucleo_matcher::MatcherConfig::DEFAULT;
    let mut pattern = Pattern::new(&config, case_matching);
    pattern.parse_from(pat, false);
    pattern
}
/// A leading `!` must mark the atom as inverted while the `^`/`$` anchors still
/// select the match kind and are stripped from the needle.
#[test]
fn negative() {
    /// Parses `input` and checks for an inverted atom of `kind` with needle `foo`.
    ///
    /// `#[track_caller]` makes a failing assertion report the calling line.
    #[track_caller]
    fn check(input: &str, kind: PatternKind) {
        let atom = parse_atom(input);
        assert!(atom.invert);
        assert_eq!(atom.kind, kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }

    // A negated atom without anchors is expected to be a substring match.
    check("!foo", PatternKind::Substring);
    check("!^foo", PatternKind::Prefix);
    check("!foo$", PatternKind::Postfix);
    check("!^foo$", PatternKind::Exact);
}
/// Each operator prefix/suffix (`'`, `^`, `$`) must select the matching
/// [`PatternKind`] and be stripped from the needle; a plain atom is fuzzy.
#[test]
fn pattern_kinds() {
    /// Parses `input` and checks for a non-inverted atom of `kind` with needle `foo`.
    ///
    /// `#[track_caller]` makes a failing assertion report the calling line.
    #[track_caller]
    fn check(input: &str, kind: PatternKind) {
        let atom = parse_atom(input);
        assert!(!atom.invert);
        assert_eq!(atom.kind, kind);
        assert_eq!(atom.needle.to_string(), "foo");
    }

    check("foo", PatternKind::Fuzzy);
    check("'foo", PatternKind::Substring);
    check("^foo", PatternKind::Prefix);
    check("foo$", PatternKind::Postfix);
    check("^foo$", PatternKind::Exact);
}
/// Checks the `ignore_case` flag and the stored needle for every
/// [`CaseMatching`] mode, on ASCII and non-ASCII input.
#[test]
fn case_matching() {
    /// Parses `input` under `case` and checks the resulting flag and needle text.
    ///
    /// `#[track_caller]` makes a failing assertion report the calling line.
    #[track_caller]
    fn check(input: &str, case: CaseMatching, ignore_case: bool, needle: &str) {
        let atom = parse_atom_with(input, case);
        assert_eq!(atom.ignore_case, ignore_case);
        assert_eq!(atom.needle.to_string(), needle);
    }

    // ASCII: smart case ignores case only when the needle has no upper-case letter.
    check("foo", CaseMatching::Smart, true, "foo");
    check("Foo", CaseMatching::Smart, false, "Foo");
    check("Foo", CaseMatching::Ignore, true, "foo");
    check("Foo", CaseMatching::Respect, false, "Foo");

    // Non-ASCII upper-case letter. The expected needles hold `A`/`a` instead of `Ä`,
    // presumably because the default matcher config normalizes (TODO confirm).
    check("Äxx", CaseMatching::Ignore, true, "axx");
    check("Äxx", CaseMatching::Respect, false, "Axx");
    check("Äxx", CaseMatching::Smart, false, "Axx");

    // Non-ASCII character that does not make smart case respect case.
    check("你xx", CaseMatching::Smart, true, "你xx");
    check("你xx", CaseMatching::Ignore, true, "你xx");

    // Non-ASCII upper-case letter that `Ignore` is expected to lower-case.
    check("Ⲽxx", CaseMatching::Smart, false, "Ⲽxx");
    check("Ⲽxx", CaseMatching::Ignore, true, "ⲽxx");
}
/// A backslash before an operator character (`!`, `'`, `^`, `$`) or a space must
/// make it part of the needle instead of being interpreted as syntax.
#[test]
fn escape() {
    /// Parses `input` and checks the needle text, then the match kind.
    ///
    /// `#[track_caller]` makes a failing assertion report the calling line.
    #[track_caller]
    fn check(input: &str, needle: &str, kind: PatternKind) {
        let atom = parse_atom(input);
        assert_eq!(atom.needle.to_string(), needle);
        assert_eq!(atom.kind, kind);
    }

    // Escaped whitespace keeps the text in a single atom; only the needle is checked.
    let atom = parse_atom("foo\\ bar");
    assert_eq!(atom.needle.to_string(), "foo bar");

    check("\\!foo", "!foo", PatternKind::Fuzzy);
    check("\\'foo", "'foo", PatternKind::Fuzzy);
    check("\\^foo", "^foo", PatternKind::Fuzzy);
    check("foo\\$", "foo$", PatternKind::Fuzzy);
    check("^foo\\$", "foo$", PatternKind::Prefix);
    check("\\^foo\\$", "^foo$", PatternKind::Fuzzy);
    check("\\!^foo\\$", "!^foo$", PatternKind::Fuzzy);
    // An unescaped `!` still inverts; the escaped `^` after it stays in the needle.
    check("!\\^foo\\$", "^foo$", PatternKind::Substring);
}

View File

@ -186,10 +186,9 @@ impl fmt::Debug for Utf32String {
impl fmt::Display for Utf32String {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\"")?;
for c in self.chars() {
write!(f, "{c}")?
}
write!(f, "\"")
Ok(())
}
}