correctly handle empty strings and strip leading and trailing whitespace

2024-12-22 09:57:49 +00:00 · 2023-08-06 19:55:31 +02:00 · 2023-08-06 19:55:31 +02:00 · 4fc0281dd7
commit 4fc0281dd7
parent 6b08991fac
4 changed files with 280 additions and 43 deletions
--- a/matcher/src/lib.rs
+++ b/matcher/src/lib.rs
@ -129,9 +129,12 @@ impl Matcher {
        needle_: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if needle_.len() > haystack_.len() || needle_.is_empty() {
+        if needle_.len() > haystack_.len() {
            return None;
        }
        if needle_.is_empty() {
            return Some(0);
        }
        if needle_.len() == haystack_.len() {
            return self.exact_match_impl::<INDICES>(
                haystack_,
@ -262,9 +265,12 @@ impl Matcher {
        needle_: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if needle_.len() > haystack.len() || needle_.is_empty() {
+        if needle_.len() > haystack.len() {
            return None;
        }
        if needle_.is_empty() {
            return Some(0);
        }
        if needle_.len() == haystack.len() {
            return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
        }
@ -358,9 +364,12 @@ impl Matcher {
        needle_: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if needle_.len() > haystack.len() || needle_.is_empty() {
+        if needle_.len() > haystack.len() {
            return None;
        }
        if needle_.is_empty() {
            return Some(0);
        }
        if needle_.len() == haystack.len() {
            return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices);
        }
@ -425,7 +434,28 @@ impl Matcher {
    ///
    /// See the [matcher documentation](crate::Matcher) for more details.
    pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
-        self.exact_match_impl::<false>(haystack, needle, 0, haystack.len(), &mut Vec::new())
+        if needle.is_empty() {
            return Some(0);
        }
        let mut leading_space = 0;
        let mut trailing_space = 0;
        if !needle.first().is_whitespace() {
            leading_space = haystack.leading_white_space()
        }
        if !needle.last().is_whitespace() {
            trailing_space = haystack.trailing_white_space()
        }
        // avoid wraparound in size check
        if trailing_space == haystack.len() {
            return None;
        }
        self.exact_match_impl::<false>(
            haystack,
            needle,
            leading_space,
            haystack.len() - trailing_space,
            &mut Vec::new(),
        )
    }
    /// Checks whether needle and haystack match exactly and computes the matched indices.
@ -439,7 +469,28 @@ impl Matcher {
        needle: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        self.exact_match_impl::<true>(haystack, needle, 0, haystack.len(), indices)
+        if needle.is_empty() {
            return Some(0);
        }
        let mut leading_space = 0;
        let mut trailing_space = 0;
        if !needle.first().is_whitespace() {
            leading_space = haystack.leading_white_space()
        }
        if !needle.last().is_whitespace() {
            trailing_space = haystack.trailing_white_space()
        }
        // avoid wraparound in size check
        if trailing_space == haystack.len() {
            return None;
        }
        self.exact_match_impl::<true>(
            haystack,
            needle,
            leading_space,
            haystack.len() - trailing_space,
            indices,
        )
    }
    /// Checks whether needle is a prefix of the haystack.
@ -448,10 +499,23 @@ impl Matcher {
    ///
    /// See the [matcher documentation](crate::Matcher) for more details.
    pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
-        if haystack.len() < needle.len() {
+        if needle.is_empty() {
            return Some(0);
        }
        let mut leading_space = 0;
        if !needle.first().is_whitespace() {
            leading_space = haystack.leading_white_space()
        }
        if haystack.len() - leading_space < needle.len() {
            None
        } else {
-            self.exact_match_impl::<false>(haystack, needle, 0, needle.len(), &mut Vec::new())
+            self.exact_match_impl::<false>(
                haystack,
                needle,
                leading_space,
                needle.len() + leading_space,
                &mut Vec::new(),
            )
        }
    }
@ -466,10 +530,23 @@ impl Matcher {
        needle: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if haystack.len() < needle.len() {
+        if needle.is_empty() {
            return Some(0);
        }
        let mut leading_space = 0;
        if !needle.first().is_whitespace() {
            leading_space = haystack.leading_white_space()
        }
        if haystack.len() - leading_space < needle.len() {
            None
        } else {
-            self.exact_match_impl::<true>(haystack, needle, 0, needle.len(), indices)
+            self.exact_match_impl::<true>(
                haystack,
                needle,
                leading_space,
                needle.len() + leading_space,
                indices,
            )
        }
    }
@ -479,14 +556,21 @@ impl Matcher {
    ///
    /// See the [matcher documentation](crate::Matcher) for more details.
    pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> {
-        if haystack.len() < needle.len() {
+        if needle.is_empty() {
            return Some(0);
        }
        let mut trailing_spaces = 0;
        if !needle.last().is_whitespace() {
            trailing_spaces = haystack.trailing_white_space()
        }
        if haystack.len() - trailing_spaces < needle.len() {
            None
        } else {
            self.exact_match_impl::<false>(
                haystack,
                needle,
-                haystack.len() - needle.len(),
+                haystack.len() - needle.len() - trailing_spaces,
-                haystack.len(),
+                haystack.len() - trailing_spaces,
                &mut Vec::new(),
            )
        }
@ -503,14 +587,21 @@ impl Matcher {
        needle: Utf32Str<'_>,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if haystack.len() < needle.len() {
+        if needle.is_empty() {
            return Some(0);
        }
        let mut trailing_spaces = 0;
        if !needle.last().is_whitespace() {
            trailing_spaces = haystack.trailing_white_space()
        }
        if haystack.len() - trailing_spaces < needle.len() {
            None
        } else {
            self.exact_match_impl::<true>(
                haystack,
                needle,
-                haystack.len() - needle.len(),
+                haystack.len() - needle.len() - trailing_spaces,
-                haystack.len(),
+                haystack.len() - trailing_spaces,
                indices,
            )
        }
@ -524,7 +615,7 @@ impl Matcher {
        end: usize,
        indices: &mut Vec<u32>,
    ) -> Option<u16> {
-        if needle_.len() != end - start || needle_.is_empty() {
+        if needle_.len() != end - start {
            return None;
        }
        assert!(
--- a/matcher/src/tests.rs
+++ b/matcher/src/tests.rs
@ -13,6 +13,9 @@ enum Algorithm {
    FuzzyOptimal,
    FuzzyGreedy,
    Substring,
    Prefix,
    Postfix,
    Exact,
 }
 fn assert_matches(
@ -50,6 +53,9 @@ fn assert_matches(
                FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices),
                FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices),
                Substring => matcher.substring_indices(haystack, needle, &mut matched_indices),
                Prefix => matcher.prefix_indices(haystack, needle, &mut matched_indices),
                Postfix => matcher.postfix_indices(haystack, needle, &mut matched_indices),
                Exact => matcher.exact_indices(haystack, needle, &mut matched_indices),
            };
            println!("{matched_indices:?}");
            let match_chars: Vec<_> = matched_indices
@ -107,7 +113,22 @@ pub fn assert_not_matches(
        assert_eq!(
            res, None,
            "{needle:?} should not match {haystack:?} (greedy)"
-        )
+        );
        let res = matcher.substring_match(haystack, needle);
        assert_eq!(
            res, None,
            "{needle:?} should not match {haystack:?} (substring)"
        );
        let res = matcher.prefix_match(haystack, needle);
        assert_eq!(
            res, None,
            "{needle:?} should not match {haystack:?} (prefix)"
        );
        let res = matcher.postfix_match(haystack, needle);
        assert_eq!(
            res, None,
            "{needle:?} should not match {haystack:?} (postfix)"
        );
    }
 }
@ -222,8 +243,97 @@ fn test_fuzzy() {
    );
 }
 // Every listed algorithm is run against an empty needle: the single case
 // expects no matched indices (`&[]`) and a score of `0`.
 #[test]
 fn empty_needle() {
    assert_matches(
        &[Substring, Prefix, Postfix, FuzzyGreedy, FuzzyOptimal, Exact],
        false,
        false,
        false,
        // (haystack, needle, expected indices, expected score)
        &[("foo bar baz", "", &[], 0)],
    );
 }
 #[test]
 fn test_substring() {
    assert_matches(
        &[Substring, Prefix],
        false,
        false,
        false,
        &[
            (
                "foo bar baz",
                "foo",
                &[0, 1, 2],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                " foo bar baz",
                "FOO",
                &[1, 2, 3],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                " foo bar baz",
                " FOO",
                &[0, 1, 2, 3],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
            ),
        ],
    );
    assert_matches(
        &[Substring, Postfix],
        false,
        false,
        false,
        &[
            (
                "foo bar baz",
                "baz",
                &[8, 9, 10],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                "foo bar baz ",
                "baz",
                &[8, 9, 10],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                "foo bar baz ",
                "baz ",
                &[8, 9, 10, 11],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
            ),
        ],
    );
    assert_matches(
        &[Substring, Prefix, Postfix, Exact, FuzzyGreedy, FuzzyOptimal],
        false,
        false,
        false,
        &[
            (
                "foo",
                "foo",
                &[0, 1, 2],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                " foo",
                "foo",
                &[1, 2, 3],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                " foo",
                " foo",
                &[0, 1, 2, 3],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3),
            ),
        ],
    );
    assert_matches(
        &[Substring],
        false,
@ -236,18 +346,6 @@ fn test_substring() {
                &[2, 3, 4],
                BONUS_CAMEL123 + BONUS_CONSECUTIVE,
            ),
            (
                "foo bar baz",
                "foo",
                &[0, 1, 2],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                "foo bar baz",
                "FOO",
                &[0, 1, 2],
                BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2),
            ),
            (
                "/AutomatorDocument.icns",
                "rdoc",
--- a/matcher/src/utf32_str.rs
+++ b/matcher/src/utf32_str.rs
@ -52,14 +52,14 @@ impl<'a> Utf32Str<'a> {
    }
    #[inline]
-    pub fn len(&self) -> usize {
+    pub fn len(self) -> usize {
        match self {
            Utf32Str::Unicode(codepoints) => codepoints.len(),
            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
        }
    }
    #[inline]
-    pub fn is_empty(&self) -> bool {
+    pub fn is_empty(self) -> bool {
        match self {
            Utf32Str::Unicode(codepoints) => codepoints.is_empty(),
            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
@ -67,15 +67,15 @@ impl<'a> Utf32Str<'a> {
    }
    #[inline]
-    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str {
+    pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
        let start = match range.start_bound() {
            Bound::Included(&start) => start,
            Bound::Excluded(&start) => start + 1,
            Bound::Unbounded => 0,
        };
        let end = match range.end_bound() {
-            Bound::Included(&end) => end,
+            Bound::Included(&end) => end + 1,
-            Bound::Excluded(&end) => end + 1,
+            Bound::Excluded(&end) => end,
            Bound::Unbounded => self.len(),
        };
        match self {
@ -84,18 +84,50 @@ impl<'a> Utf32Str<'a> {
        }
    }
    /// Counts the whitespace characters at the start of this string.
    ///
    /// The ASCII variant tests each byte with `is_ascii_whitespace`, the
    /// Unicode variant tests each codepoint with `is_whitespace`.
    ///
    /// The result is the index of the first non-whitespace character. If no
    /// such character exists (the string is empty or consists solely of
    /// whitespace) the result is `0`, not the length of the string.
    #[inline]
    pub fn leading_white_space(self) -> usize {
        let first_non_space = match self {
            Utf32Str::Ascii(bytes) => bytes
                .iter()
                .position(|byte| !byte.is_ascii_whitespace()),
            Utf32Str::Unicode(codepoints) => {
                codepoints.iter().position(|ch| !ch.is_whitespace())
            }
        };
        // No non-whitespace character found: report zero leading whitespace.
        first_non_space.unwrap_or(0)
    }
    /// Counts the whitespace characters at the end of this string.
    ///
    /// The ASCII variant tests each byte with `is_ascii_whitespace`, the
    /// Unicode variant tests each codepoint with `is_whitespace`.
    ///
    /// The string is scanned back to front, so the result is the distance of
    /// the last non-whitespace character from the end. If no such character
    /// exists (the string is empty or consists solely of whitespace) the
    /// result is `0`, not the length of the string.
    #[inline]
    pub fn trailing_white_space(self) -> usize {
        let last_non_space = match self {
            Utf32Str::Ascii(bytes) => bytes
                .iter()
                .rev()
                .position(|byte| !byte.is_ascii_whitespace()),
            Utf32Str::Unicode(codepoints) => codepoints
                .iter()
                .rev()
                .position(|ch| !ch.is_whitespace()),
        };
        // No non-whitespace character found: report zero trailing whitespace.
        last_non_space.unwrap_or(0)
    }
    /// Same as `slice` but accepts a u32 range for convenience since
    /// those are the indices returned by the matcher
    #[inline]
-    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
+    pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
        let start = match range.start_bound() {
            Bound::Included(&start) => start as usize,
            Bound::Excluded(&start) => start as usize + 1,
            Bound::Unbounded => 0,
        };
        let end = match range.end_bound() {
-            Bound::Included(&end) => end as usize,
+            Bound::Included(&end) => end as usize + 1,
-            Bound::Excluded(&end) => end as usize + 1,
+            Bound::Excluded(&end) => end as usize,
            Bound::Unbounded => self.len(),
        };
        match self {
@ -103,23 +135,30 @@ impl<'a> Utf32Str<'a> {
            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
        }
    }
-    pub fn is_ascii(&self) -> bool {
+    pub fn is_ascii(self) -> bool {
        matches!(self, Utf32Str::Ascii(_))
    }
-    pub fn get(&self, idx: u32) -> char {
+    pub fn get(self, idx: u32) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[idx as usize] as char,
            Utf32Str::Unicode(codepoints) => codepoints[idx as usize],
        }
    }
-    pub fn last(&self) -> char {
+    pub fn last(self) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
            Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
        }
    }
-    pub fn chars(&self) -> Chars<'_> {
+    pub fn first(self) -> char {
        match self {
            Utf32Str::Ascii(bytes) => bytes[0] as char,
            Utf32Str::Unicode(codepoints) => codepoints[0],
        }
    }
    pub fn chars(self) -> Chars<'a> {
        match self {
            Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
            Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
@ -161,3 +200,12 @@ impl<'a> Iterator for Chars<'a> {
        }
    }
 }
 impl DoubleEndedIterator for Chars<'_> {
    /// Yields the next character from the back of the underlying iterator.
    ///
    /// ASCII bytes are widened to `char`; Unicode codepoints are copied as-is.
    fn next_back(&mut self) -> Option<Self::Item> {
        match self {
            Chars::Ascii(bytes) => bytes.next_back().copied().map(char::from),
            Chars::Unicode(codepoints) => codepoints.next_back().copied(),
        }
    }
 }
--- a/src/utf32_string.rs
+++ b/src/utf32_string.rs
@ -46,8 +46,8 @@ impl Utf32String {
            Bound::Unbounded => 0,
        };
        let end = match range.end_bound() {
-            Bound::Included(&end) => end as usize,
+            Bound::Included(&end) => end as usize + 1,
-            Bound::Excluded(&end) => end as usize + 1,
+            Bound::Excluded(&end) => end as usize,
            Bound::Unbounded => self.len(),
        };
        match self {