mirror of
https://github.com/solaeus/nucleo.git
synced 2024-12-22 09:57:49 +00:00
Grapheme segment input and match on first grapheme char
This commit is contained in:
parent
878fd7b7ea
commit
1cb902ccdb
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -165,6 +165,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"cov-mark",
|
"cov-mark",
|
||||||
"memchr",
|
"memchr",
|
||||||
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -277,6 +278,12 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-width"
|
name = "unicode-width"
|
||||||
version = "0.1.10"
|
version = "0.1.10"
|
||||||
|
@ -9,6 +9,7 @@ edition = "2021"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
memchr = "2.5.0"
|
memchr = "2.5.0"
|
||||||
cov-mark = { version = "1.1.0", default-features = false }
|
cov-mark = { version = "1.1.0", default-features = false }
|
||||||
|
unicode-segmentation = "1.10"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
cov-mark = { version = "1.1.0", default-features = true }
|
cov-mark = { version = "1.1.0", default-features = true }
|
||||||
|
@ -135,6 +135,7 @@ impl Char for char {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub use normalize::normalize;
|
pub use normalize::normalize;
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn to_lower_case(c: char) -> char {
|
pub fn to_lower_case(c: char) -> char {
|
||||||
@ -154,3 +155,14 @@ pub enum CharClass {
|
|||||||
Letter,
|
Letter,
|
||||||
Number,
|
Number,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator that yields the first codepoint of each grapheme in `text`.
|
||||||
|
/// nucleo cannot match graphemes as single units; as a workaround, only that first codepoint is used.
|
||||||
|
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
|
||||||
|
text.graphemes(true).map(|grapheme| {
|
||||||
|
grapheme
|
||||||
|
.chars()
|
||||||
|
.next()
|
||||||
|
.expect("graphemes must be non-empty")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
@ -43,7 +43,10 @@ impl<'a> Utf32Str<'a> {
|
|||||||
Utf32Str::Ascii(str.as_bytes())
|
Utf32Str::Ascii(str.as_bytes())
|
||||||
} else {
|
} else {
|
||||||
buf.clear();
|
buf.clear();
|
||||||
buf.extend(str.chars());
|
buf.extend(crate::chars::graphemes(str));
|
||||||
|
if buf.iter().all(|c| c.is_ascii()) {
|
||||||
|
return Utf32Str::Ascii(str.as_bytes());
|
||||||
|
}
|
||||||
Utf32Str::Unicode(&*buf)
|
Utf32Str::Unicode(&*buf)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
10
src/lib.rs
10
src/lib.rs
@ -9,11 +9,11 @@ use crate::worker::Worker;
|
|||||||
use parking_lot::lock_api::ArcMutexGuard;
|
use parking_lot::lock_api::ArcMutexGuard;
|
||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
|
|
||||||
pub use crate::query::{CaseMatching, MultiPattern, Pattern, PatternKind};
|
pub use crate::pattern::{CaseMatching, MultiPattern, Pattern, PatternKind};
|
||||||
pub use crate::utf32_string::Utf32String;
|
pub use crate::utf32_string::Utf32String;
|
||||||
|
|
||||||
mod items;
|
mod items;
|
||||||
mod query;
|
mod pattern;
|
||||||
mod utf32_string;
|
mod utf32_string;
|
||||||
mod worker;
|
mod worker;
|
||||||
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
|
pub use nucleo_matcher::{chars, Matcher, MatcherConfig, Utf32Str};
|
||||||
@ -124,14 +124,14 @@ impl<T: Sync + Send> Nucleo<T> {
|
|||||||
self.should_notify.store(false, atomic::Ordering::Relaxed);
|
self.should_notify.store(false, atomic::Ordering::Relaxed);
|
||||||
let status = self.pattern.status();
|
let status = self.pattern.status();
|
||||||
let items = self.items.cache.lock_arc();
|
let items = self.items.cache.lock_arc();
|
||||||
let canceled = status != query::Status::Unchanged || items.cleared();
|
let canceled = status != pattern::Status::Unchanged || items.cleared();
|
||||||
let res = self.tick_inner(timeout, canceled, items, status);
|
let res = self.tick_inner(timeout, canceled, items, status);
|
||||||
if !canceled {
|
if !canceled {
|
||||||
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
let items = self.items.cache.lock_arc();
|
let items = self.items.cache.lock_arc();
|
||||||
let res = self.tick_inner(timeout, false, items, query::Status::Unchanged);
|
let res = self.tick_inner(timeout, false, items, pattern::Status::Unchanged);
|
||||||
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
self.should_notify.store(true, atomic::Ordering::Relaxed);
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
@ -141,7 +141,7 @@ impl<T: Sync + Send> Nucleo<T> {
|
|||||||
timeout: u64,
|
timeout: u64,
|
||||||
canceled: bool,
|
canceled: bool,
|
||||||
items: ArcMutexGuard<RawMutex, ItemCache>,
|
items: ArcMutexGuard<RawMutex, ItemCache>,
|
||||||
status: query::Status,
|
status: pattern::Status,
|
||||||
) -> Status {
|
) -> Status {
|
||||||
let mut inner = if canceled {
|
let mut inner = if canceled {
|
||||||
self.pattern.reset_status();
|
self.pattern.reset_status();
|
||||||
|
@ -65,7 +65,7 @@ impl PatternAtom {
|
|||||||
let mut needle_ = Vec::with_capacity(needle.len());
|
let mut needle_ = Vec::with_capacity(needle.len());
|
||||||
if escape_whitespace {
|
if escape_whitespace {
|
||||||
let mut saw_backslash = false;
|
let mut saw_backslash = false;
|
||||||
for mut c in needle.chars() {
|
for mut c in chars::graphemes(needle) {
|
||||||
if saw_backslash {
|
if saw_backslash {
|
||||||
if c == ' ' {
|
if c == ' ' {
|
||||||
needle_.push(' ');
|
needle_.push(' ');
|
||||||
@ -88,6 +88,21 @@ impl PatternAtom {
|
|||||||
}
|
}
|
||||||
needle_.push(c);
|
needle_.push(c);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
let chars = chars::graphemes(needle).map(|mut c| {
|
||||||
|
if normalize {
|
||||||
|
c = chars::normalize(c);
|
||||||
|
}
|
||||||
|
match case {
|
||||||
|
CaseMatching::Ignore => c = chars::to_lower_case(c),
|
||||||
|
CaseMatching::Smart => {
|
||||||
|
ignore_case = ignore_case && !c.is_uppercase();
|
||||||
|
}
|
||||||
|
CaseMatching::Respect => (),
|
||||||
|
}
|
||||||
|
c
|
||||||
|
});
|
||||||
|
needle_.extend(chars);
|
||||||
};
|
};
|
||||||
Utf32String::Unicode(needle_.into_boxed_slice())
|
Utf32String::Unicode(needle_.into_boxed_slice())
|
||||||
};
|
};
|
@ -4,7 +4,7 @@ use std::fmt;
|
|||||||
use std::mem::take;
|
use std::mem::take;
|
||||||
use std::ops::{Bound, RangeBounds};
|
use std::ops::{Bound, RangeBounds};
|
||||||
|
|
||||||
use nucleo_matcher::Utf32Str;
|
use nucleo_matcher::{chars, Utf32Str};
|
||||||
|
|
||||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
|
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
|
||||||
pub enum Utf32String {
|
pub enum Utf32String {
|
||||||
@ -94,12 +94,13 @@ impl Utf32String {
|
|||||||
*self = Self::Ascii(bytes.into_boxed_str());
|
*self = Self::Ascii(bytes.into_boxed_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Utf32String::Ascii(bytes) => bytes.chars().collect(),
|
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
|
||||||
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
||||||
};
|
};
|
||||||
codeboints.extend(text.chars());
|
codeboints.extend(chars::graphemes(text));
|
||||||
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
|
*self = Utf32String::Unicode(codeboints.into_boxed_slice());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn push(&mut self, c: char) {
|
pub fn push(&mut self, c: char) {
|
||||||
let mut codeboints = match take(self) {
|
let mut codeboints = match take(self) {
|
||||||
@ -109,7 +110,7 @@ impl Utf32String {
|
|||||||
*self = Self::Ascii(bytes.into_boxed_str());
|
*self = Self::Ascii(bytes.into_boxed_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Utf32String::Ascii(bytes) => bytes.chars().collect(),
|
Utf32String::Ascii(bytes) => bytes.bytes().map(|c| c as char).collect(),
|
||||||
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
Utf32String::Unicode(codepoints) => Vec::from(codepoints),
|
||||||
};
|
};
|
||||||
codeboints.push(c);
|
codeboints.push(c);
|
||||||
@ -123,7 +124,7 @@ impl From<&str> for Utf32String {
|
|||||||
if value.is_ascii() {
|
if value.is_ascii() {
|
||||||
Self::Ascii(value.to_owned().into_boxed_str())
|
Self::Ascii(value.to_owned().into_boxed_str())
|
||||||
} else {
|
} else {
|
||||||
Self::Unicode(value.chars().collect())
|
Self::Unicode(chars::graphemes(value).collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -133,7 +134,7 @@ impl From<Box<str>> for Utf32String {
|
|||||||
if value.is_ascii() {
|
if value.is_ascii() {
|
||||||
Self::Ascii(value)
|
Self::Ascii(value)
|
||||||
} else {
|
} else {
|
||||||
Self::Unicode(value.chars().collect())
|
Self::Unicode(chars::graphemes(&value).collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ use parking_lot::RawMutex;
|
|||||||
use rayon::{prelude::*, ThreadPool};
|
use rayon::{prelude::*, ThreadPool};
|
||||||
|
|
||||||
use crate::items::{ItemCache, ItemsSnapshot};
|
use crate::items::{ItemCache, ItemsSnapshot};
|
||||||
use crate::query::{self, MultiPattern};
|
use crate::pattern::{self, MultiPattern};
|
||||||
use crate::Match;
|
use crate::Match;
|
||||||
|
|
||||||
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
|
struct Matchers(Box<[UnsafeCell<nucleo_matcher::Matcher>]>);
|
||||||
@ -76,7 +76,7 @@ impl Worker {
|
|||||||
pub(crate) unsafe fn run(
|
pub(crate) unsafe fn run(
|
||||||
&mut self,
|
&mut self,
|
||||||
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
|
items_lock: ArcMutexGuard<RawMutex, ItemCache>,
|
||||||
query_status: query::Status,
|
query_status: pattern::Status,
|
||||||
) {
|
) {
|
||||||
self.running = true;
|
self.running = true;
|
||||||
let mut last_scored_item = self.items.len();
|
let mut last_scored_item = self.items.len();
|
||||||
@ -84,7 +84,7 @@ impl Worker {
|
|||||||
drop(items_lock);
|
drop(items_lock);
|
||||||
|
|
||||||
// TODO: be smarter around reusing past results for rescoring
|
// TODO: be smarter around reusing past results for rescoring
|
||||||
if cleared || query_status == query::Status::Rescore {
|
if cleared || query_status == pattern::Status::Rescore {
|
||||||
self.matches.clear();
|
self.matches.clear();
|
||||||
last_scored_item = 0;
|
last_scored_item = 0;
|
||||||
}
|
}
|
||||||
@ -103,7 +103,7 @@ impl Worker {
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if query_status != query::Status::Unchanged && !self.matches.is_empty() {
|
if query_status != pattern::Status::Unchanged && !self.matches.is_empty() {
|
||||||
self.matches
|
self.matches
|
||||||
.par_iter_mut()
|
.par_iter_mut()
|
||||||
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
|
.take_any_while(|_| !self.canceled.load(atomic::Ordering::Relaxed))
|
||||||
|
Loading…
Reference in New Issue
Block a user