correctly handle empty strings and strip leading and trailing whitespace

This commit is contained in:
Pascal Kuthe 2023-08-06 19:55:31 +02:00
parent 6b08991fac
commit 4fc0281dd7
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
4 changed files with 280 additions and 43 deletions

View File

@ -129,9 +129,12 @@ impl Matcher {
needle_: Utf32Str<'_>, needle_: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if needle_.len() > haystack_.len() || needle_.is_empty() { if needle_.len() > haystack_.len() {
return None; return None;
} }
if needle_.is_empty() {
return Some(0);
}
if needle_.len() == haystack_.len() { if needle_.len() == haystack_.len() {
return self.exact_match_impl::<INDICES>( return self.exact_match_impl::<INDICES>(
haystack_, haystack_,
@ -262,9 +265,12 @@ impl Matcher {
needle_: Utf32Str<'_>, needle_: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if needle_.len() > haystack.len() || needle_.is_empty() { if needle_.len() > haystack.len() {
return None; return None;
} }
if needle_.is_empty() {
return Some(0);
}
if needle_.len() == haystack.len() { if needle_.len() == haystack.len() {
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
} }
@ -358,9 +364,12 @@ impl Matcher {
needle_: Utf32Str<'_>, needle_: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if needle_.len() > haystack.len() || needle_.is_empty() { if needle_.len() > haystack.len() {
return None; return None;
} }
if needle_.is_empty() {
return Some(0);
}
if needle_.len() == haystack.len() { if needle_.len() == haystack.len() {
return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
} }
@ -425,7 +434,28 @@ impl Matcher {
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
self.exact_match_impl::<false>(haystack, needle, 0, haystack.len(), &mut Vec::new()) if needle.is_empty() {
return Some(0);
}
let mut leading_space = 0;
let mut trailing_space = 0;
if !needle.first().is_whitespace() {
leading_space = haystack.leading_white_space()
}
if !needle.last().is_whitespace() {
trailing_space = haystack.trailing_white_space()
}
// avoid wraparound in size check
if trailing_space == haystack.len() {
return None;
}
self.exact_match_impl::<false>(
haystack,
needle,
leading_space,
haystack.len() - trailing_space,
&mut Vec::new(),
)
} }
/// Checks whether needle and haystack match exactly and compute the matches indices. /// Checks whether needle and haystack match exactly and compute the matches indices.
@ -439,7 +469,28 @@ impl Matcher {
needle: Utf32Str<'_>, needle: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
self.exact_match_impl::<true>(haystack, needle, 0, haystack.len(), indices) if needle.is_empty() {
return Some(0);
}
let mut leading_space = 0;
let mut trailing_space = 0;
if !needle.first().is_whitespace() {
leading_space = haystack.leading_white_space()
}
if !needle.last().is_whitespace() {
trailing_space = haystack.trailing_white_space()
}
// avoid wraparound in size check
if trailing_space == haystack.len() {
return None;
}
self.exact_match_impl::<true>(
haystack,
needle,
leading_space,
haystack.len() - trailing_space,
indices,
)
} }
/// Checks whether needle is a prefix of the haystack. /// Checks whether needle is a prefix of the haystack.
@ -448,10 +499,23 @@ impl Matcher {
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
if haystack.len() < needle.len() { if needle.is_empty() {
return Some(0);
}
let mut leading_space = 0;
if !needle.first().is_whitespace() {
leading_space = haystack.leading_white_space()
}
if haystack.len() - leading_space < needle.len() {
None None
} else { } else {
self.exact_match_impl::<false>(haystack, needle, 0, needle.len(), &mut Vec::new()) self.exact_match_impl::<false>(
haystack,
needle,
leading_space,
needle.len() + leading_space,
&mut Vec::new(),
)
} }
} }
@ -466,10 +530,23 @@ impl Matcher {
needle: Utf32Str<'_>, needle: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if haystack.len() < needle.len() { if needle.is_empty() {
return Some(0);
}
let mut leading_space = 0;
if !needle.first().is_whitespace() {
leading_space = haystack.leading_white_space()
}
if haystack.len() - leading_space < needle.len() {
None None
} else { } else {
self.exact_match_impl::<true>(haystack, needle, 0, needle.len(), indices) self.exact_match_impl::<true>(
haystack,
needle,
leading_space,
needle.len() + leading_space,
indices,
)
} }
} }
@ -479,14 +556,21 @@ impl Matcher {
/// ///
/// See the [matcher documentation](crate::Matcher) for more details. /// See the [matcher documentation](crate::Matcher) for more details.
pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
if haystack.len() < needle.len() { if needle.is_empty() {
return Some(0);
}
let mut trailing_spaces = 0;
if !needle.last().is_whitespace() {
trailing_spaces = haystack.trailing_white_space()
}
if haystack.len() - trailing_spaces < needle.len() {
None None
} else { } else {
self.exact_match_impl::<false>( self.exact_match_impl::<false>(
haystack, haystack,
needle, needle,
haystack.len() - needle.len(), haystack.len() - needle.len() - trailing_spaces,
haystack.len(), haystack.len() - trailing_spaces,
&mut Vec::new(), &mut Vec::new(),
) )
} }
@ -503,14 +587,21 @@ impl Matcher {
needle: Utf32Str<'_>, needle: Utf32Str<'_>,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if haystack.len() < needle.len() { if needle.is_empty() {
return Some(0);
}
let mut trailing_spaces = 0;
if !needle.last().is_whitespace() {
trailing_spaces = haystack.trailing_white_space()
}
if haystack.len() - trailing_spaces < needle.len() {
None None
} else { } else {
self.exact_match_impl::<true>( self.exact_match_impl::<true>(
haystack, haystack,
needle, needle,
haystack.len() - needle.len(), haystack.len() - needle.len() - trailing_spaces,
haystack.len(), haystack.len() - trailing_spaces,
indices, indices,
) )
} }
@ -524,7 +615,7 @@ impl Matcher {
end: usize, end: usize,
indices: &mut Vec<u32>, indices: &mut Vec<u32>,
) -> Option<u16> { ) -> Option<u16> {
if needle_.len() != end - start || needle_.is_empty() { if needle_.len() != end - start {
return None; return None;
} }
assert!( assert!(

View File

@ -13,6 +13,9 @@ enum Algorithm {
FuzzyOptimal, FuzzyOptimal,
FuzzyGreedy, FuzzyGreedy,
Substring, Substring,
Prefix,
Postfix,
Exact,
} }
fn assert_matches( fn assert_matches(
@ -50,6 +53,9 @@ fn assert_matches(
FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices), FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices),
FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices), FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices),
Substring => matcher.substring_indices(haystack, needle, &mut matched_indices), Substring => matcher.substring_indices(haystack, needle, &mut matched_indices),
Prefix => matcher.prefix_indices(haystack, needle, &mut matched_indices),
Postfix => matcher.postfix_indices(haystack, needle, &mut matched_indices),
Exact => matcher.exact_indices(haystack, needle, &mut matched_indices),
}; };
println!("{matched_indices:?}"); println!("{matched_indices:?}");
let match_chars: Vec<_> = matched_indices let match_chars: Vec<_> = matched_indices
@ -107,7 +113,22 @@ pub fn assert_not_matches(
assert_eq!( assert_eq!(
res, None, res, None,
"{needle:?} should not match {haystack:?} (greedy)" "{needle:?} should not match {haystack:?} (greedy)"
) );
let res = matcher.substring_match(haystack, needle);
assert_eq!(
res, None,
"{needle:?} should not match {haystack:?} (substring)"
);
let res = matcher.prefix_match(haystack, needle);
assert_eq!(
res, None,
"{needle:?} should not match {haystack:?} (prefix)"
);
let res = matcher.postfix_match(haystack, needle);
assert_eq!(
res, None,
"{needle:?} should not match {haystack:?} (postfix)"
);
} }
} }
@ -222,8 +243,97 @@ fn test_fuzzy() {
); );
} }
// Checks how every matching algorithm treats an empty needle: each algorithm
// in the list below is run against the haystack "foo bar baz" with "" as the
// needle.
#[test]
fn empty_needle() {
assert_matches(
// All algorithm variants are exercised by this single case.
&[Substring, Prefix, Postfix, FuzzyGreedy, FuzzyOptimal, Exact],
// NOTE(review): the meaning of these three flags is defined by
// `assert_matches`, whose parameter list is not visible here.
false,
false,
false,
// Presumably (haystack, needle, expected matched indices, expected score):
// an empty needle is expected to match with no indices and a score of 0.
// TODO confirm the tuple layout against `assert_matches`.
&[("foo bar baz", "", &[], 0)],
);
}
#[test] #[test]
fn test_substring() { fn test_substring() {
assert_matches(
&[Substring, Prefix],
false,
false,
false,
&[
(
"foo bar baz",
"foo",
&[0, 1, 2],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
" foo bar baz",
"FOO",
&[1, 2, 3],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
" foo bar baz",
" FOO",
&[0, 1, 2, 3],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
),
],
);
assert_matches(
&[Substring, Postfix],
false,
false,
false,
&[
(
"foo bar baz",
"baz",
&[8, 9, 10],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
"foo bar baz ",
"baz",
&[8, 9, 10],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
"foo bar baz ",
"baz ",
&[8, 9, 10, 11],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
),
],
);
assert_matches(
&[Substring, Prefix, Postfix, Exact, FuzzyGreedy, FuzzyOptimal],
false,
false,
false,
&[
(
"foo",
"foo",
&[0, 1, 2],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
" foo",
"foo",
&[1, 2, 3],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
" foo",
" foo",
&[0, 1, 2, 3],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
),
],
);
assert_matches( assert_matches(
&[Substring], &[Substring],
false, false,
@ -236,18 +346,6 @@ fn test_substring() {
&[2, 3, 4], &[2, 3, 4],
BONUS_CAMEL123 + BONUS_CONSECUTIVE, BONUS_CAMEL123 + BONUS_CONSECUTIVE,
), ),
(
"foo bar baz",
"foo",
&[0, 1, 2],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
(
"foo bar baz",
"FOO",
&[0, 1, 2],
BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
),
( (
"/AutomatorDocument.icns", "/AutomatorDocument.icns",
"rdoc", "rdoc",

View File

@ -52,14 +52,14 @@ impl<'a> Utf32Str<'a> {
} }
#[inline] #[inline]
pub fn len(&self) -> usize { pub fn len(self) -> usize {
match self { match self {
Utf32Str::Unicode(codepoints) => codepoints.len(), Utf32Str::Unicode(codepoints) => codepoints.len(),
Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
} }
} }
#[inline] #[inline]
pub fn is_empty(&self) -> bool { pub fn is_empty(self) -> bool {
match self { match self {
Utf32Str::Unicode(codepoints) => codepoints.is_empty(), Utf32Str::Unicode(codepoints) => codepoints.is_empty(),
Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(), Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
@ -67,15 +67,15 @@ impl<'a> Utf32Str<'a> {
} }
#[inline] #[inline]
pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str { pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
let start = match range.start_bound() { let start = match range.start_bound() {
Bound::Included(&start) => start, Bound::Included(&start) => start,
Bound::Excluded(&start) => start + 1, Bound::Excluded(&start) => start + 1,
Bound::Unbounded => 0, Bound::Unbounded => 0,
}; };
let end = match range.end_bound() { let end = match range.end_bound() {
Bound::Included(&end) => end, Bound::Included(&end) => end + 1,
Bound::Excluded(&end) => end + 1, Bound::Excluded(&end) => end,
Bound::Unbounded => self.len(), Bound::Unbounded => self.len(),
}; };
match self { match self {
@ -84,18 +84,50 @@ impl<'a> Utf32Str<'a> {
} }
} }
/// Counts the whitespace characters at the start of this string.
///
/// ASCII strings are checked with `u8::is_ascii_whitespace`, Unicode strings
/// with `char::is_whitespace`.
///
/// Returns `0` when no non-whitespace character is found, i.e. for an empty
/// string and also for a string that consists only of whitespace.
#[inline]
pub fn leading_white_space(self) -> usize {
    // Index of the first non-whitespace character, if there is one.
    let first_non_space = match self {
        Utf32Str::Ascii(bytes) => bytes.iter().position(|byte| !byte.is_ascii_whitespace()),
        Utf32Str::Unicode(codepoints) => codepoints.iter().position(|ch| !ch.is_whitespace()),
    };
    first_non_space.unwrap_or(0)
}
/// Counts the whitespace characters at the end of this string.
///
/// ASCII strings are checked with `u8::is_ascii_whitespace`, Unicode strings
/// with `char::is_whitespace`.
///
/// Returns `0` when no non-whitespace character is found, i.e. for an empty
/// string and also for a string that consists only of whitespace.
#[inline]
pub fn trailing_white_space(self) -> usize {
    // Scanning back to front, the position of the first non-whitespace
    // character equals the number of whitespace characters after it.
    let distance_from_end = match self {
        Utf32Str::Ascii(bytes) => bytes
            .iter()
            .rev()
            .position(|byte| !byte.is_ascii_whitespace()),
        Utf32Str::Unicode(codepoints) => codepoints
            .iter()
            .rev()
            .position(|ch| !ch.is_whitespace()),
    };
    distance_from_end.unwrap_or(0)
}
/// Same as `slice` but accepts a u32 range for convenience since /// Same as `slice` but accepts a u32 range for convenience since
/// those are the indices returned by the matcher /// those are the indices returned by the matcher
#[inline] #[inline]
pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str { pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
let start = match range.start_bound() { let start = match range.start_bound() {
Bound::Included(&start) => start as usize, Bound::Included(&start) => start as usize,
Bound::Excluded(&start) => start as usize + 1, Bound::Excluded(&start) => start as usize + 1,
Bound::Unbounded => 0, Bound::Unbounded => 0,
}; };
let end = match range.end_bound() { let end = match range.end_bound() {
Bound::Included(&end) => end as usize, Bound::Included(&end) => end as usize + 1,
Bound::Excluded(&end) => end as usize + 1, Bound::Excluded(&end) => end as usize,
Bound::Unbounded => self.len(), Bound::Unbounded => self.len(),
}; };
match self { match self {
@ -103,23 +135,30 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
} }
} }
pub fn is_ascii(&self) -> bool { pub fn is_ascii(self) -> bool {
matches!(self, Utf32Str::Ascii(_)) matches!(self, Utf32Str::Ascii(_))
} }
pub fn get(&self, idx: u32) -> char { pub fn get(self, idx: u32) -> char {
match self { match self {
Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[idx as usize], Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
} }
} }
pub fn last(&self) -> char { pub fn last(self) -> char {
match self { match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
} }
} }
pub fn chars(&self) -> Chars<'_> { pub fn first(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[0] as char,
Utf32Str::Unicode(codepoints) => codepoints[0],
}
}
pub fn chars(self) -> Chars<'a> {
match self { match self {
Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
@ -161,3 +200,12 @@ impl<'a> Iterator for Chars<'a> {
} }
} }
} }
/// Allows `Chars` to be consumed from the back as well as the front.
impl DoubleEndedIterator for Chars<'_> {
    /// Yields the last remaining character, or `None` once the iterator is
    /// exhausted.
    fn next_back(&mut self) -> Option<Self::Item> {
        match self {
            // ASCII text is stored as bytes; each byte is widened to a `char`.
            Chars::Ascii(bytes) => {
                let byte = *bytes.next_back()?;
                Some(byte as char)
            }
            // Unicode text already stores `char`s, so the value is just copied out.
            Chars::Unicode(codepoints) => codepoints.next_back().copied(),
        }
    }
}

View File

@ -46,8 +46,8 @@ impl Utf32String {
Bound::Unbounded => 0, Bound::Unbounded => 0,
}; };
let end = match range.end_bound() { let end = match range.end_bound() {
Bound::Included(&end) => end as usize, Bound::Included(&end) => end as usize + 1,
Bound::Excluded(&end) => end as usize + 1, Bound::Excluded(&end) => end as usize,
Bound::Unbounded => self.len(), Bound::Unbounded => self.len(),
}; };
match self { match self {