feature gate unicode support in nucleo-matcher

This commit is contained in:
Pascal Kuthe 2023-08-29 14:25:00 +02:00
parent 2de732889f
commit 8b22bc28cb
No known key found for this signature in database
GPG Key ID: D715E8655AE166A6
4 changed files with 55 additions and 10 deletions

View File

@ -27,6 +27,8 @@ jobs:
- name: Run cargo check - name: Run cargo check
run: cargo check run: cargo check
- name: Run cargo check without default features
run: cargo check --no-default-features
test: test:
name: Test name: Test
@ -62,6 +64,8 @@ jobs:
- name: Run cargo clippy - name: Run cargo clippy
run: cargo clippy --workspace --all-targets -- -D warnings run: cargo clippy --workspace --all-targets -- -D warnings
- name: Run cargo clippy without default features
run: cargo clippy --workspace --all-targets --no-default-features -- -D warnings
- name: Run cargo doc - name: Run cargo doc
run: cargo doc --no-deps --workspace --document-private-items run: cargo doc --no-deps --workspace --document-private-items

View File

@ -11,7 +11,13 @@ readme = "../README.md"
[dependencies] [dependencies]
memchr = "2.5.0" memchr = "2.5.0"
cov-mark = { version = "1.1.0", default-features = false } cov-mark = { version = "1.1.0", default-features = false }
unicode-segmentation = "1.10" unicode-segmentation = { version = "1.10", optional = true }
[features]
default = ["unicode-normalization", "unicode-casefold", "unicode-segmentation"]
unicode-normalization = []
unicode-casefold = []
unicode-segmentation = ["dep:unicode-segmentation"]
[dev-dependencies] [dev-dependencies]
cov-mark = { version = "1.1.0", default-features = true } cov-mark = { version = "1.1.0", default-features = true }

View File

@ -2,13 +2,16 @@
use std::fmt::{self, Debug, Display}; use std::fmt::{self, Debug, Display};
#[cfg(feature = "unicode-casefold")]
use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
use crate::Config; use crate::Config;
//autogenerated by generate-ucd //autogenerated by generate-ucd
#[allow(warnings)] #[allow(warnings)]
#[rustfmt::skip] #[rustfmt::skip]
#[cfg(feature = "unicode-casefold")]
mod case_fold; mod case_fold;
#[cfg(feature = "unicode-normalization")]
mod normalize; mod normalize;
pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
@ -111,11 +114,14 @@ impl Char for char {
return (c.0 as char, class); return (c.0 as char, class);
} }
let char_class = char_class_non_ascii(self); let char_class = char_class_non_ascii(self);
#[cfg(feature = "unicode-casefold")]
let mut case_fold = char_class == CharClass::Upper; let mut case_fold = char_class == CharClass::Upper;
#[cfg(feature = "unicode-normalization")]
if config.normalize { if config.normalize {
self = normalize::normalize(self); self = normalize::normalize(self);
case_fold = true case_fold = true
} }
#[cfg(feature = "unicode-casefold")]
if case_fold && config.ignore_case { if case_fold && config.ignore_case {
self = CASE_FOLDING_SIMPLE self = CASE_FOLDING_SIMPLE
.binary_search_by_key(&self, |(upper, _)| *upper) .binary_search_by_key(&self, |(upper, _)| *upper)
@ -126,9 +132,11 @@ impl Char for char {
#[inline(always)] #[inline(always)]
fn normalize(mut self, config: &Config) -> Self { fn normalize(mut self, config: &Config) -> Self {
#[cfg(feature = "unicode-normalization")]
if config.normalize { if config.normalize {
self = normalize::normalize(self); self = normalize::normalize(self);
} }
#[cfg(feature = "unicode-casefold")]
if config.ignore_case { if config.ignore_case {
self = to_lower_case(self) self = to_lower_case(self)
} }
@ -136,23 +144,31 @@ impl Char for char {
} }
} }
#[cfg(feature = "unicode-normalization")]
pub use normalize::normalize; pub use normalize::normalize;
#[cfg(feature = "unicode-segmentation")]
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
#[inline(always)]
/// Converts a character to lower case using simple unicode case folding /// Converts a character to lower case using simple unicode case folding
#[cfg(feature = "unicode-casefold")]
#[inline(always)]
pub fn to_lower_case(c: char) -> char { pub fn to_lower_case(c: char) -> char {
CASE_FOLDING_SIMPLE CASE_FOLDING_SIMPLE
.binary_search_by_key(&c, |(upper, _)| *upper) .binary_search_by_key(&c, |(upper, _)| *upper)
.map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
} }
/// Converts a character to upper case using simple unicode case folding /// Checks if a character is upper case according to simple unicode case folding.
/// If the `unicode-casefold` feature is disabled, the equivalent std function is used.
#[inline(always)] #[inline(always)]
pub fn is_upper_case(c: char) -> bool { pub fn is_upper_case(c: char) -> bool {
CASE_FOLDING_SIMPLE #[cfg(feature = "unicode-casefold")]
let val = CASE_FOLDING_SIMPLE
.binary_search_by_key(&c, |(upper, _)| *upper) .binary_search_by_key(&c, |(upper, _)| *upper)
.is_ok() .is_ok();
#[cfg(not(feature = "unicode-casefold"))]
let val = c.is_uppercase();
val
} }
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
@ -171,10 +187,14 @@ pub(crate) enum CharClass {
/// iterator returns the first character of each unicode grapheme /// iterator returns the first character of each unicode grapheme
/// in a string and is used for constructing `Utf32Str(ing)`. /// in a string and is used for constructing `Utf32Str(ing)`.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ { pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
text.graphemes(true).map(|grapheme| { #[cfg(feature = "unicode-segmentation")]
let res = text.graphemes(true).map(|grapheme| {
grapheme grapheme
.chars() .chars()
.next() .next()
.expect("graphemes must be non-empty") .expect("graphemes must be non-empty")
}) });
#[cfg(not(feature = "unicode-segmentation"))]
let res = text.chars();
res
} }

View File

@ -13,13 +13,15 @@ use crate::Utf32String;
#[non_exhaustive] #[non_exhaustive]
/// How to treat a case mismatch between two characters. /// How to treat a case mismatch between two characters.
pub enum CaseMatching { pub enum CaseMatching {
/// Characters always match their case folded version (`a == A`).
Ignore,
/// Characters never match their case folded version (`a != A`). /// Characters never match their case folded version (`a != A`).
Respect, Respect,
/// Characters always match their case folded version (`a == A`).
#[cfg(feature = "unicode-casefold")]
Ignore,
/// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
/// lowercase and like [`Respect`](CaseMatching::Respect) otherwise. /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
#[default] #[default]
#[cfg(feature = "unicode-casefold")]
Smart, Smart,
} }
@ -106,10 +108,12 @@ impl Atom {
}; };
match case { match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => { CaseMatching::Ignore => {
ignore_case = true; ignore_case = true;
needle.make_ascii_lowercase() needle.make_ascii_lowercase()
} }
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
} }
@ -121,7 +125,14 @@ impl Atom {
Utf32String::Ascii(needle.into_boxed_str()) Utf32String::Ascii(needle.into_boxed_str())
} else { } else {
let mut needle_ = Vec::with_capacity(needle.len()); let mut needle_ = Vec::with_capacity(needle.len());
#[cfg(feature = "unicode-casefold")]
{
ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
}
#[cfg(not(feature = "unicode-casefold"))]
{
ignore_case = false;
}
if escape_whitespace { if escape_whitespace {
let mut saw_backslash = false; let mut saw_backslash = false;
for mut c in chars::graphemes(needle) { for mut c in chars::graphemes(needle) {
@ -136,7 +147,9 @@ impl Atom {
} }
saw_backslash = c == '\\'; saw_backslash = c == '\\';
match case { match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Ignore => c = chars::to_lower_case(c),
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c) ignore_case = ignore_case && !chars::is_upper_case(c)
} }
@ -147,7 +160,9 @@ impl Atom {
} else { } else {
let chars = chars::graphemes(needle).map(|mut c| { let chars = chars::graphemes(needle).map(|mut c| {
match case { match case {
#[cfg(feature = "unicode-casefold")]
CaseMatching::Ignore => c = chars::to_lower_case(c), CaseMatching::Ignore => c = chars::to_lower_case(c),
#[cfg(feature = "unicode-casefold")]
CaseMatching::Smart => { CaseMatching::Smart => {
ignore_case = ignore_case && !chars::is_upper_case(c); ignore_case = ignore_case && !chars::is_upper_case(c);
} }