From 4fc0281dd708e65fa726f1f3d8ae73de2ad31799 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 6 Aug 2023 19:55:31 +0200 Subject: [PATCH] correctly handle empty strings and strip leading and trailing whitespace --- matcher/src/lib.rs | 123 +++++++++++++++++++++++++++++++++----- matcher/src/tests.rs | 124 +++++++++++++++++++++++++++++++++++---- matcher/src/utf32_str.rs | 72 +++++++++++++++++++---- src/utf32_string.rs | 4 +- 4 files changed, 280 insertions(+), 43 deletions(-) diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 448d1c8..f966b1a 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -129,9 +129,12 @@ impl Matcher { needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if needle_.len() > haystack_.len() || needle_.is_empty() { + if needle_.len() > haystack_.len() { return None; } + if needle_.is_empty() { + return Some(0); + } if needle_.len() == haystack_.len() { return self.exact_match_impl::( haystack_, @@ -262,9 +265,12 @@ impl Matcher { needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if needle_.len() > haystack.len() || needle_.is_empty() { + if needle_.len() > haystack.len() { return None; } + if needle_.is_empty() { + return Some(0); + } if needle_.len() == haystack.len() { return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } @@ -358,9 +364,12 @@ impl Matcher { needle_: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if needle_.len() > haystack.len() || needle_.is_empty() { + if needle_.len() > haystack.len() { return None; } + if needle_.is_empty() { + return Some(0); + } if needle_.len() == haystack.len() { return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); } @@ -425,7 +434,28 @@ impl Matcher { /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - self.exact_match_impl::(haystack, needle, 0, haystack.len(), &mut Vec::new()) + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + &mut Vec::new(), + ) } /// Checks whether needle and haystack match exactly and compute the matches indices. @@ -439,7 +469,28 @@ impl Matcher { needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - self.exact_match_impl::(haystack, needle, 0, haystack.len(), indices) + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + indices, + ) } /// Checks whether needle is a prefix of the haystack. @@ -448,10 +499,23 @@ impl Matcher { /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - if haystack.len() < needle.len() { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { None } else { - self.exact_match_impl::(haystack, needle, 0, needle.len(), &mut Vec::new()) + self.exact_match_impl::( + haystack, + needle, + leading_space, + needle.len() + leading_space, + &mut Vec::new(), + ) } } @@ -466,10 +530,23 @@ impl Matcher { needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if haystack.len() < needle.len() { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { None } else { - self.exact_match_impl::(haystack, needle, 0, needle.len(), indices) + self.exact_match_impl::( + haystack, + needle, + leading_space, + needle.len() + leading_space, + indices, + ) } } @@ -479,14 +556,21 @@ impl Matcher { /// /// See the [matcher documentation](crate::Matcher) for more details. 
pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - if haystack.len() < needle.len() { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { None } else { self.exact_match_impl::( haystack, needle, - haystack.len() - needle.len(), - haystack.len(), + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, &mut Vec::new(), ) } @@ -503,14 +587,21 @@ impl Matcher { needle: Utf32Str<'_>, indices: &mut Vec, ) -> Option { - if haystack.len() < needle.len() { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { None } else { self.exact_match_impl::( haystack, needle, - haystack.len() - needle.len(), - haystack.len(), + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, indices, ) } @@ -524,7 +615,7 @@ impl Matcher { end: usize, indices: &mut Vec, ) -> Option { - if needle_.len() != end - start || needle_.is_empty() { + if needle_.len() != end - start { return None; } assert!( diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs index 8cfb3c0..d2bfaea 100644 --- a/matcher/src/tests.rs +++ b/matcher/src/tests.rs @@ -13,6 +13,9 @@ enum Algorithm { FuzzyOptimal, FuzzyGreedy, Substring, + Prefix, + Postfix, + Exact, } fn assert_matches( @@ -50,6 +53,9 @@ fn assert_matches( FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices), FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices), Substring => matcher.substring_indices(haystack, needle, &mut matched_indices), + Prefix => matcher.prefix_indices(haystack, needle, &mut matched_indices), + Postfix => 
matcher.postfix_indices(haystack, needle, &mut matched_indices), + Exact => matcher.exact_indices(haystack, needle, &mut matched_indices), }; println!("{matched_indices:?}"); let match_chars: Vec<_> = matched_indices @@ -107,7 +113,22 @@ pub fn assert_not_matches( assert_eq!( res, None, "{needle:?} should not match {haystack:?} (greedy)" - ) + ); + let res = matcher.substring_match(haystack, needle); + assert_eq!( + res, None, + "{needle:?} should not match {haystack:?} (substring)" + ); + let res = matcher.prefix_match(haystack, needle); + assert_eq!( + res, None, + "{needle:?} should not match {haystack:?} (prefix)" + ); + let res = matcher.postfix_match(haystack, needle); + assert_eq!( + res, None, + "{needle:?} should not match {haystack:?} (postfix)" + ); } } @@ -222,8 +243,97 @@ fn test_fuzzy() { ); } +#[test] +fn empty_needle() { + assert_matches( + &[Substring, Prefix, Postfix, FuzzyGreedy, FuzzyOptimal, Exact], + false, + false, + false, + &[("foo bar baz", "", &[], 0)], + ); +} + #[test] fn test_substring() { + assert_matches( + &[Substring, Prefix], + false, + false, + false, + &[ + ( + "foo bar baz", + "foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo bar baz", + "FOO", + &[1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo bar baz", + " FOO", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); + assert_matches( + &[Substring, Postfix], + false, + false, + false, + &[ + ( + "foo bar baz", + "baz", + &[8, 9, 10], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "foo bar baz ", + "baz", + &[8, 9, 10], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "foo bar baz ", + "baz ", + &[8, 9, 10, 11], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); + assert_matches( + &[Substring, Prefix, Postfix, Exact, FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + &[ + ( + 
"foo", + "foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo", + "foo", + &[1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo", + " foo", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); assert_matches( &[Substring], false, @@ -236,18 +346,6 @@ fn test_substring() { &[2, 3, 4], BONUS_CAMEL123 + BONUS_CONSECUTIVE, ), - ( - "foo bar baz", - "foo", - &[0, 1, 2], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - "foo bar baz", - "FOO", - &[0, 1, 2], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), ( "/AutomatorDocument.icns", "rdoc", diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 554f72c..fe4f44e 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -52,14 +52,14 @@ impl<'a> Utf32Str<'a> { } #[inline] - pub fn len(&self) -> usize { + pub fn len(self) -> usize { match self { Utf32Str::Unicode(codepoints) => codepoints.len(), Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), } } #[inline] - pub fn is_empty(&self) -> bool { + pub fn is_empty(self) -> bool { match self { Utf32Str::Unicode(codepoints) => codepoints.is_empty(), Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(), @@ -67,15 +67,15 @@ impl<'a> Utf32Str<'a> { } #[inline] - pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + pub fn slice(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { Bound::Included(&start) => start, Bound::Excluded(&start) => start + 1, Bound::Unbounded => 0, }; let end = match range.end_bound() { - Bound::Included(&end) => end, - Bound::Excluded(&end) => end + 1, + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, Bound::Unbounded => self.len(), }; match self { @@ -84,18 +84,50 @@ impl<'a> Utf32Str<'a> { } } + /// Returns the number of leading whitespaces in this string + #[inline] + pub fn leading_white_space(self) -> 
usize { + match self { + Utf32Str::Ascii(bytes) => bytes + .iter() + .position(|b| !b.is_ascii_whitespace()) + .unwrap_or(0), + Utf32Str::Unicode(codepoints) => codepoints + .iter() + .position(|c| !c.is_whitespace()) + .unwrap_or(0), + } + } + + /// Returns the number of trailing whitespaces in this string + #[inline] + pub fn trailing_white_space(self) -> usize { + match self { + Utf32Str::Ascii(bytes) => bytes + .iter() + .rev() + .position(|b| !b.is_ascii_whitespace()) + .unwrap_or(0), + Utf32Str::Unicode(codepoints) => codepoints + .iter() + .rev() + .position(|c| !c.is_whitespace()) + .unwrap_or(0), + } + } + /// Same as `slice` but accepts a u32 range for convenience since /// those are the indices returned by the matcher #[inline] - pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { + pub fn slice_u32(self, range: impl RangeBounds) -> Utf32Str<'a> { let start = match range.start_bound() { Bound::Included(&start) => start as usize, Bound::Excluded(&start) => start as usize + 1, Bound::Unbounded => 0, }; let end = match range.end_bound() { - Bound::Included(&end) => end as usize, - Bound::Excluded(&end) => end as usize + 1, + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, Bound::Unbounded => self.len(), }; match self { @@ -103,23 +135,30 @@ impl<'a> Utf32Str<'a> { Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), } } - pub fn is_ascii(&self) -> bool { + pub fn is_ascii(self) -> bool { matches!(self, Utf32Str::Ascii(_)) } - pub fn get(&self, idx: u32) -> char { + pub fn get(self, idx: u32) -> char { match self { Utf32Str::Ascii(bytes) => bytes[idx as usize] as char, Utf32Str::Unicode(codepoints) => codepoints[idx as usize], } } - pub fn last(&self) -> char { + pub fn last(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], } } - pub fn chars(&self) -> Chars<'_> { + pub fn first(self) -> 
char { + match self { + Utf32Str::Ascii(bytes) => bytes[0] as char, + Utf32Str::Unicode(codepoints) => codepoints[0], + } + } + + pub fn chars(self) -> Chars<'a> { match self { Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), @@ -161,3 +200,12 @@ impl<'a> Iterator for Chars<'a> { } } } + +impl DoubleEndedIterator for Chars<'_> { + fn next_back(&mut self) -> Option { + match self { + Chars::Ascii(iter) => iter.next_back().map(|&c| c as char), + Chars::Unicode(iter) => iter.next_back().copied(), + } + } +} diff --git a/src/utf32_string.rs b/src/utf32_string.rs index da69cf3..97d4e25 100644 --- a/src/utf32_string.rs +++ b/src/utf32_string.rs @@ -46,8 +46,8 @@ impl Utf32String { Bound::Unbounded => 0, }; let end = match range.end_bound() { - Bound::Included(&end) => end as usize, - Bound::Excluded(&end) => end as usize + 1, + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, Bound::Unbounded => self.len(), }; match self {