Grapheme segment input and match on first grapheme char

This commit is contained in:
Pascal Kuthe 2023-07-30 16:48:15 +02:00
parent 878fd7b7ea
commit 1cb902ccdb
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
8 changed files with 56 additions and 17 deletions

7
Cargo.lock generated
View File

@ -165,6 +165,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"cov-mark", "cov-mark",
"memchr", "memchr",
"unicode-segmentation",
] ]
[[package]] [[package]]
@ -277,6 +278,12 @@ dependencies = [
"once_cell", "once_cell",
] ]
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.10" version = "0.1.10"

View File

@ -9,6 +9,7 @@ edition = "2021"
[dependencies] [dependencies]
memchr = "2.5.0" memchr = "2.5.0"
cov-mark = { version = "1.1.0", default-features = false } cov-mark = { version = "1.1.0", default-features = false }
unicode-segmentation = "1.10"
[dev-dependencies] [dev-dependencies]
cov-mark = { version = "1.1.0", default-features = true } cov-mark = { version = "1.1.0", default-features = true }

View File

@ -135,6 +135,7 @@ impl Char for char {
} }
pub use normalize::normalize; pub use normalize::normalize;
use unicode_segmentation::UnicodeSegmentation;
#[inline(always)] #[inline(always)]
pub fn to_lower_case(c: char) -> char { pub fn to_lower_case(c: char) -> char {
@ -154,3 +155,14 @@ pub enum CharClass {
Letter, Letter,
Number, Number,
} }
/// Iterates over `text` one grapheme at a time, yielding a single `char`
/// per grapheme.
///
/// nucleo cannot match a grapheme as a single unit. To work around that,
/// every grapheme is represented by its first codepoint only; any further
/// codepoints belonging to the same grapheme are dropped.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
    text.graphemes(true).map(|cluster| {
        let mut codepoints = cluster.chars();
        match codepoints.next() {
            Some(first) => first,
            // The segmenter is not expected to yield an empty grapheme, so
            // reaching this arm indicates a broken invariant.
            None => panic!("graphemes must be non-empty"),
        }
    })
}

View File

@ -43,7 +43,10 @@ impl<'a> Utf32Str<'a> {
Utf32Str::Ascii(str.as_bytes()) Utf32Str::Ascii(str.as_bytes())
} else { } else {
buf.clear(); buf.clear();
buf.extend(str.chars()); buf.extend(crate::chars::graphemes(str));
if buf.iter().all(|c| c.is_ascii()) {
return Utf32Str::Ascii(str.as_bytes());
}
Utf32Str::Unicode(&*buf) Utf32Str::Unicode(&*buf)
} }
} }

View File

@ -9,11 +9,11 @@ use crate::worker::Worker;
use parking_lot::lock_api::ArcMutexGuard; use parking_lot::lock_api::ArcMutexGuard;
use rayon::ThreadPool; use rayon::ThreadPool;
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind}; pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
pub use crate::utf32_string::Utf32String; pub use crate::utf32_string::Utf32String;
mod items; mod items;
mod query; mod pattern;
mod utf32_string; mod utf32_string;
mod worker; mod worker;
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str}; pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
@ -124,14 +124,14 @@ impl<T: Sync + Send> Nucleo<T> {
self.should_notify.store(false, atomic::Ordering::Relaxed); self.should_notify.store(false, atomic::Ordering::Relaxed);
let status = self.pattern.status(); let status = self.pattern.status();
let items = self.items.cache.lock_arc(); let items = self.items.cache.lock_arc();
let canceled = status != query::Status::Unchanged || items.cleared(); let canceled = status != pattern::Status::Unchanged || items.cleared();
let res = self.tick_inner(timeout, canceled, items, status); let res = self.tick_inner(timeout, canceled, items, status);
if !canceled { if !canceled {
self.should_notify.store(true, atomic::Ordering::Relaxed); self.should_notify.store(true, atomic::Ordering::Relaxed);
return res; return res;
} }
let items = self.items.cache.lock_arc(); let items = self.items.cache.lock_arc();
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged); let res = self.tick_inner(timeout, false, items, pattern::Status::Unchanged);
self.should_notify.store(true, atomic::Ordering::Relaxed); self.should_notify.store(true, atomic::Ordering::Relaxed);
res res
} }
@ -141,7 +141,7 @@ impl<T: Sync + Send> Nucleo<T> {
timeout: u64, timeout: u64,
canceled: bool, canceled: bool,
items: ArcMutexGuard<RawMutex, ItemCache>, items: ArcMutexGuard<RawMutex, ItemCache>,
status: query::Status, status: pattern::Status,
) -> Status { ) -> Status {
let mut inner = if canceled { let mut inner = if canceled {
self.pattern.reset_status(); self.pattern.reset_status();

View File

@ -65,7 +65,7 @@ impl PatternAtom {
let mut needle_ = Vec::with_capacity(needle.len()); let mut needle_ = Vec::with_capacity(needle.len());
if escape_whitespace { if escape_whitespace {
let mut saw_backslash = false; let mut saw_backslash = false;
for mut c in needle.chars() { for mut c in chars::graphemes(needle) {
if saw_backslash { if saw_backslash {
if c == ' ' { if c == ' ' {
needle_.push(' '); needle_.push(' ');
@ -88,6 +88,21 @@ impl PatternAtom {
} }
needle_.push(c); needle_.push(c);
} }
} else {
let chars = chars::graphemes(needle).map(|mut c| {
if normalize {
c = chars::normalize(c);
}
match case {
CaseMatching::Ignore => c = chars::to_lower_case(c),
CaseMatching::Smart => {
ignore_case = ignore_case && !c.is_uppercase();
}
CaseMatching::Respect => (),
}
c
});
needle_.extend(chars);
}; };
Utf32String::Unicode(needle_.into_boxed_slice()) Utf32String::Unicode(needle_.into_boxed_slice())
}; };

View File

@ -4,7 +4,7 @@ use std::fmt;
use std::mem::take; use std::mem::take;
use std::ops::{Bound, RangeBounds}; use std::ops::{Bound, RangeBounds};
use nucleo_matcher::Utf32Str; use nucleo_matcher::{chars, Utf32Str};
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
pub enum Utf32String { pub enum Utf32String {
@ -94,12 +94,13 @@ impl Utf32String {
*self = Self::Ascii(bytes.into_boxed_str()); *self = Self::Ascii(bytes.into_boxed_str());
return; return;
} }
Utf32String::Ascii(bytes) => bytes.chars().collect(), Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints), Utf32String::Unicode(codepoints) => Vec::from(codepoints),
}; };
codeboints.extend(text.chars()); codeboints.extend(chars::graphemes(text));
*self = Utf32String::Unicode(codeboints.into_boxed_slice()); *self = Utf32String::Unicode(codeboints.into_boxed_slice());
} }
#[inline] #[inline]
pub fn push(&mut self, c: char) { pub fn push(&mut self, c: char) {
let mut codeboints = match take(self) { let mut codeboints = match take(self) {
@ -109,7 +110,7 @@ impl Utf32String {
*self = Self::Ascii(bytes.into_boxed_str()); *self = Self::Ascii(bytes.into_boxed_str());
return; return;
} }
Utf32String::Ascii(bytes) => bytes.chars().collect(), Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
Utf32String::Unicode(codepoints) => Vec::from(codepoints), Utf32String::Unicode(codepoints) => Vec::from(codepoints),
}; };
codeboints.push(c); codeboints.push(c);
@ -123,7 +124,7 @@ impl From<&str> for Utf32String {
if value.is_ascii() { if value.is_ascii() {
Self::Ascii(value.to_owned().into_boxed_str()) Self::Ascii(value.to_owned().into_boxed_str())
} else { } else {
Self::Unicode(value.chars().collect()) Self::Unicode(chars::graphemes(value).collect())
} }
} }
} }
@ -133,7 +134,7 @@ impl From<Box<str>> for Utf32String {
if value.is_ascii() { if value.is_ascii() {
Self::Ascii(value) Self::Ascii(value)
} else { } else {
Self::Unicode(value.chars().collect()) Self::Unicode(chars::graphemes(&value).collect())
} }
} }
} }

View File

@ -8,7 +8,7 @@ use parking_lot::RawMutex;
use rayon::{prelude::*, ThreadPool}; use rayon::{prelude::*, ThreadPool};
use crate::items::{ItemCache, ItemsSnapshot}; use crate::items::{ItemCache, ItemsSnapshot};
use crate::query::{self, MultiPattern}; use crate::pattern::{self, MultiPattern};
use crate::Match; use crate::Match;
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>); struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
@ -76,7 +76,7 @@ impl Worker {
pub(crate) unsafe fn run( pub(crate) unsafe fn run(
&mut self, &mut self,
items_lock: ArcMutexGuard<RawMutex, ItemCache>, items_lock: ArcMutexGuard<RawMutex, ItemCache>,
query_status: query::Status, query_status: pattern::Status,
) { ) {
self.running = true; self.running = true;
let mut last_scored_item = self.items.len(); let mut last_scored_item = self.items.len();
@ -84,7 +84,7 @@ impl Worker {
drop(items_lock); drop(items_lock);
// TODO: be smarter around reusing past results for rescoring // TODO: be smarter around reusing past results for rescoring
if cleared || query_status == query::Status::Rescore { if cleared || query_status == pattern::Status::Rescore {
self.matches.clear(); self.matches.clear();
last_scored_item = 0; last_scored_item = 0;
} }
@ -103,7 +103,7 @@ impl Worker {
} }
return; return;
} }
if query_status != query::Status::Unchanged && !self.matches.is_empty() { if query_status != pattern::Status::Unchanged && !self.matches.is_empty() {
self.matches self.matches
.par_iter_mut() .par_iter_mut()
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed)) .take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))