Grapheme segment input and match on first grapheme char

This commit is contained in:
Pascal Kuthe 2023-07-30 16:48:15 +02:00
parent 878fd7b7ea
commit 1cb902ccdb
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
8 changed files with 56 additions and 17 deletions

7
Cargo.lock generated
View File

@ -165,6 +165,7 @@ version = "0.1.0"
dependencies = [
"cov-mark",
"memchr",
"unicode-segmentation",
]
[[package]]
@ -277,6 +278,12 @@ dependencies = [
"once_cell",
]
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
version = "0.1.10"

View File

@ -9,6 +9,7 @@ edition = "2021"
[dependencies]
memchr = "2.5.0"
cov-mark = { version = "1.1.0", default-features = false }
unicode-segmentation = "1.10"
[dev-dependencies]
cov-mark = { version = "1.1.0", default-features = true }

View File

@ -135,6 +135,7 @@ impl Char for char {
}
pub use normalize::normalize;
use unicode_segmentation::UnicodeSegmentation;
#[inline(always)]
pub fn to_lower_case(c: char) -> char {
@ -154,3 +155,14 @@ pub enum CharClass {
Letter,
Number,
}
/// Iterates over the grapheme clusters of `text`, yielding one `char` per
/// cluster.
///
/// nucleo cannot match a grapheme as a single unit. To work around that,
/// each grapheme is represented by its first codepoint only; any further
/// codepoints belonging to the same grapheme are dropped.
///
/// # Panics
///
/// Panics if the segmenter ever produces an empty grapheme, which is
/// treated here as a broken invariant.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
    // `true` is forwarded as the flag of `UnicodeSegmentation::graphemes`
    // (extended grapheme clusters, per that crate's documentation).
    text.graphemes(true).map(|cluster| {
        let mut codepoints = cluster.chars();
        codepoints.next().expect("graphemes must be non-empty")
    })
}

View File

@ -43,7 +43,10 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Ascii(str.as_bytes())
} else {
buf.clear();
buf.extend(str.chars());
buf.extend(crate::chars::graphemes(str));
if buf.iter().all(|c| c.is_ascii()) {
return Utf32Str::Ascii(str.as_bytes());
}
Utf32Str::Unicode(&*buf)
}
}

View File

@ -9,11 +9,11 @@ use crate::worker::Worker;
use parking_lot::lock_api::ArcMutexGuard;
use rayon::ThreadPool;
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::utf32_string::Utf32String;
mod items;
mod query;
mod pattern;
mod utf32_string;
mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
@ -124,14 +124,14 @@ impl<T: Sync + Send> Nucleo<T> {
self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status();
let items = self.items.cache.lock_arc();
let canceled = status != query::Status::Unchanged || items.cleared();
let canceled = status != pattern::Status::Unchanged || items.cleared();
let res = self.tick_inner(timeout, canceled, items, status);
if !canceled {
self.should_notify.store(true, atomic::Ordering::Relaxed);
return res;
}
let items = self.items.cache.lock_arc();
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
let res = self.tick_inner(timeout, false, items, pattern::Status::Unchanged);
self.should_notify.store(true, atomic::Ordering::Relaxed);
res
}
@ -141,7 +141,7 @@ impl<T: Sync + Send> Nucleo<T> {
timeout: u64,
canceled: bool,
items: ArcMutexGuard<RawMutex, ItemCache>,
status: query::Status,
status: pattern::Status,
) -> Status {
let mut inner = if canceled {
self.pattern.reset_status();

View File

@ -65,7 +65,7 @@ impl PatternAtom {
let mut needle_ = Vec::with_capacity(needle.len());
if escape_whitespace {
let mut saw_backslash = false;
for mut c in needle.chars() {
for mut c in chars::graphemes(needle) {
if saw_backslash {
if c == ' ' {
needle_.push(' ');
@ -88,6 +88,21 @@ impl PatternAtom {
}
needle_.push(c);
}
} else {
let chars = chars::graphemes(needle).map(|mut c| {
if normalize {
c = chars::normalize(c);
}
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase();
}
CaseMatching::Respect => (),
}
c
});
needle_.extend(chars);
};
Utf32String::Unicode(needle_.into_boxed_slice())
};

View File

@ -4,7 +4,7 @@ use std::fmt;
use std::mem::take;
use std::ops::{Bound, RangeBounds};
use nucleo_matcher::Utf32Str;
use nucleo_matcher::{chars, Utf32Str};
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
pub enum Utf32String {
@ -94,12 +94,13 @@ impl Utf32String {
*self = Self::Ascii(bytes.into_boxed_str());
return;
}
Utf32String::Ascii(bytes) => bytes.chars().collect(),
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
};
codeboints.extend(text.chars());
codeboints.extend(chars::graphemes(text));
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
}
#[inline]
pub fn push(&mut self, c: char) {
let mut codeboints = match take(self) {
@ -109,7 +110,7 @@ impl Utf32String {
*self = Self::Ascii(bytes.into_boxed_str());
return;
}
Utf32String::Ascii(bytes) => bytes.chars().collect(),
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
};
codeboints.push(c);
@ -123,7 +124,7 @@ impl From<&str> for Utf32String {
if value.is_ascii() {
Self::Ascii(value.to_owned().into_boxed_str())
} else {
Self::Unicode(value.chars().collect())
Self::Unicode(chars::graphemes(value).collect())
}
}
}
@ -133,7 +134,7 @@ impl From<Box<str>> for Utf32String {
if value.is_ascii() {
Self::Ascii(value)
} else {
Self::Unicode(value.chars().collect())
Self::Unicode(chars::graphemes(&value).collect())
}
}
}

View File

@ -8,7 +8,7 @@ use parking_lot::RawMutex;
use rayon::{prelude::*, ThreadPool};
use crate::items::{ItemCache, ItemsSnapshot};
use crate::query::{self, MultiPattern};
use crate::pattern::{self, MultiPattern};
use crate::Match;
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
@ -76,7 +76,7 @@ impl Worker {
pub(crate) unsafe fn run(
&mut self,
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
query_status: query::Status,
query_status: pattern::Status,
) {
self.running = true;
let mut last_scored_item = self.items.len();
@ -84,7 +84,7 @@ impl Worker {
drop(items_lock);
// TODO: be smarter around reusing past results for rescoring
if cleared || query_status == query::Status::Rescore {
if cleared || query_status == pattern::Status::Rescore {
self.matches.clear();
last_scored_item = 0;
}
@ -103,7 +103,7 @@ impl Worker {
}
return;
}
if query_status != query::Status::Unchanged && !self.matches.is_empty() {
if query_status != pattern::Status::Unchanged && !self.matches.is_empty() {
self.matches
.par_iter_mut()
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))