mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 01:47:49 +00:00
Grapheme segment input and match on first grapheme char
This commit is contained in:
parent
878fd7b7ea
commit
1cb902ccdb
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -165,6 +165,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"cov-mark",
|
||||
"memchr",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -277,6 +278,12 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.10"
|
||||
|
@ -9,6 +9,7 @@ edition = "2021"
|
||||
[dependencies]
|
||||
memchr = "2.5.0"
|
||||
cov-mark = { version = "1.1.0", default-features = false }
|
||||
unicode-segmentation = "1.10"
|
||||
|
||||
[dev-dependencies]
|
||||
cov-mark = { version = "1.1.0", default-features = true }
|
||||
|
@ -135,6 +135,7 @@ impl Char for char {
|
||||
}
|
||||
|
||||
pub use normalize::normalize;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn to_lower_case(c: char) -> char {
|
||||
@ -154,3 +155,14 @@ pub enum CharClass {
|
||||
Letter,
|
||||
Number,
|
||||
}
|
||||
|
||||
/// Yields one `char` for every grapheme of `text`.
///
/// nucleo cannot match graphemes as single units. To work around
|
||||
/// that, we only use the first codepoint of each grapheme; any further
/// codepoints belonging to the same grapheme (for example combining marks)
/// are dropped.
///
/// The returned iterator borrows from `text`.
///
/// # Panics
///
/// Panics if the segmentation yields an empty grapheme, which is not
/// expected to happen.
|
||||
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
|
||||
// `true` requests extended grapheme clusters (see the unicode-segmentation docs).
text.graphemes(true).map(|grapheme| {
|
||||
grapheme
|
||||
.chars()
|
||||
// Keep only the leading codepoint of the grapheme.
.next()
|
||||
.expect("graphemes must be non-empty")
|
||||
})
|
||||
}
|
||||
|
@ -43,7 +43,10 @@ impl<'a> Utf32Str<'a> {
|
||||
Utf32Str::Ascii(str.as_bytes())
|
||||
} else {
|
||||
buf.clear();
|
||||
buf.extend(str.chars());
|
||||
buf.extend(crate::chars::graphemes(str));
|
||||
if buf.iter().all(|c| c.is_ascii()) {
|
||||
return Utf32Str::Ascii(str.as_bytes());
|
||||
}
|
||||
Utf32Str::Unicode(&*buf)
|
||||
}
|
||||
}
|
||||
|
10
src/lib.rs
10
src/lib.rs
@ -9,11 +9,11 @@ use crate::worker::Worker;
|
||||
use parking_lot::lock_api::ArcMutexGuard;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
|
||||
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
|
||||
pub use crate::utf32_string::Utf32String;
|
||||
|
||||
mod items;
|
||||
mod query;
|
||||
mod pattern;
|
||||
mod utf32_string;
|
||||
mod worker;
|
||||
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
|
||||
@ -124,14 +124,14 @@ impl<T: Sync + Send> Nucleo<T> {
|
||||
self.should_notify.store(false, atomic::Ordering::Relaxed);
|
||||
let status = self.pattern.status();
|
||||
let items = self.items.cache.lock_arc();
|
||||
let canceled = status != query::Status::Unchanged || items.cleared();
|
||||
let canceled = status != pattern::Status::Unchanged || items.cleared();
|
||||
let res = self.tick_inner(timeout, canceled, items, status);
|
||||
if !canceled {
|
||||
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
||||
return res;
|
||||
}
|
||||
let items = self.items.cache.lock_arc();
|
||||
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
|
||||
let res = self.tick_inner(timeout, false, items, pattern::Status::Unchanged);
|
||||
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
||||
res
|
||||
}
|
||||
@ -141,7 +141,7 @@ impl<T: Sync + Send> Nucleo<T> {
|
||||
timeout: u64,
|
||||
canceled: bool,
|
||||
items: ArcMutexGuard<RawMutex, ItemCache>,
|
||||
status: query::Status,
|
||||
status: pattern::Status,
|
||||
) -> Status {
|
||||
let mut inner = if canceled {
|
||||
self.pattern.reset_status();
|
||||
|
@ -65,7 +65,7 @@ impl PatternAtom {
|
||||
let mut needle_ = Vec::with_capacity(needle.len());
|
||||
if escape_whitespace {
|
||||
let mut saw_backslash = false;
|
||||
for mut c in needle.chars() {
|
||||
for mut c in chars::graphemes(needle) {
|
||||
if saw_backslash {
|
||||
if c == ' ' {
|
||||
needle_.push(' ');
|
||||
@ -88,6 +88,21 @@ impl PatternAtom {
|
||||
}
|
||||
needle_.push(c);
|
||||
}
|
||||
} else {
|
||||
let chars = chars::graphemes(needle).map(|mut c| {
|
||||
if normalize {
|
||||
c = chars::normalize(c);
|
||||
}
|
||||
match case {
|
||||
CaseMatching::Ignore => c = chars::to_lower_case(c),
|
||||
CaseMatching::Smart => {
|
||||
ignore_case = ignore_case && !c.is_uppercase();
|
||||
}
|
||||
CaseMatching::Respect => (),
|
||||
}
|
||||
c
|
||||
});
|
||||
needle_.extend(chars);
|
||||
};
|
||||
Utf32String::Unicode(needle_.into_boxed_slice())
|
||||
};
|
@ -4,7 +4,7 @@ use std::fmt;
|
||||
use std::mem::take;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
|
||||
use nucleo_matcher::Utf32Str;
|
||||
use nucleo_matcher::{chars, Utf32Str};
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
|
||||
pub enum Utf32String {
|
||||
@ -94,12 +94,13 @@ impl Utf32String {
|
||||
*self = Self::Ascii(bytes.into_boxed_str());
|
||||
return;
|
||||
}
|
||||
Utf32String::Ascii(bytes) => bytes.chars().collect(),
|
||||
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
|
||||
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
||||
};
|
||||
codeboints.extend(text.chars());
|
||||
codeboints.extend(chars::graphemes(text));
|
||||
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn push(&mut self, c: char) {
|
||||
let mut codeboints = match take(self) {
|
||||
@ -109,7 +110,7 @@ impl Utf32String {
|
||||
*self = Self::Ascii(bytes.into_boxed_str());
|
||||
return;
|
||||
}
|
||||
Utf32String::Ascii(bytes) => bytes.chars().collect(),
|
||||
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
|
||||
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
||||
};
|
||||
codeboints.push(c);
|
||||
@ -123,7 +124,7 @@ impl From<&str> for Utf32String {
|
||||
if value.is_ascii() {
|
||||
Self::Ascii(value.to_owned().into_boxed_str())
|
||||
} else {
|
||||
Self::Unicode(value.chars().collect())
|
||||
Self::Unicode(chars::graphemes(value).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,7 +134,7 @@ impl From<Box<str>> for Utf32String {
|
||||
if value.is_ascii() {
|
||||
Self::Ascii(value)
|
||||
} else {
|
||||
Self::Unicode(value.chars().collect())
|
||||
Self::Unicode(chars::graphemes(&value).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ use parking_lot::RawMutex;
|
||||
use rayon::{prelude::*, ThreadPool};
|
||||
|
||||
use crate::items::{ItemCache, ItemsSnapshot};
|
||||
use crate::query::{self, MultiPattern};
|
||||
use crate::pattern::{self, MultiPattern};
|
||||
use crate::Match;
|
||||
|
||||
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
|
||||
@ -76,7 +76,7 @@ impl Worker {
|
||||
pub(crate) unsafe fn run(
|
||||
&mut self,
|
||||
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
|
||||
query_status: query::Status,
|
||||
query_status: pattern::Status,
|
||||
) {
|
||||
self.running = true;
|
||||
let mut last_scored_item = self.items.len();
|
||||
@ -84,7 +84,7 @@ impl Worker {
|
||||
drop(items_lock);
|
||||
|
||||
// TODO: be smarter around reusing past results for rescoring
|
||||
if cleared || query_status == query::Status::Rescore {
|
||||
if cleared || query_status == pattern::Status::Rescore {
|
||||
self.matches.clear();
|
||||
last_scored_item = 0;
|
||||
}
|
||||
@ -103,7 +103,7 @@ impl Worker {
|
||||
}
|
||||
return;
|
||||
}
|
||||
if query_status != query::Status::Unchanged && !self.matches.is_empty() {
|
||||
if query_status != pattern::Status::Unchanged && !self.matches.is_empty() {
|
||||
self.matches
|
||||
.par_iter_mut()
|
||||
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
|
||||
|
Loading…
Reference in New Issue
Block a user