mirror of
https://github.com/solaeus/nucleo.git
synced 2025-01-22 07:47:47 +00:00
Initial fuzzy matcher implementation
This commit is contained in:
parent
93c75f6867
commit
d0703bb6e0
16
Cargo.lock
generated
Normal file
16
Cargo.lock
generated
Normal file
@ -0,0 +1,16 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "fzf_oxide"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "fzf_oxide"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
memchr = "2.5.0"
|
245
foo.c
Normal file
245
foo.c
Normal file
@ -0,0 +1,245 @@
|
||||
|
||||
fzf_result_t fzf_fuzzy_match_v2(bool case_sensitive, bool normalize,
|
||||
fzf_string_t *text, fzf_string_t *pattern,
|
||||
fzf_position_t *pos, fzf_slab_t *slab) {
|
||||
const size_t M = pattern->size;
|
||||
const size_t N = text->size;
|
||||
if (M == 0) {
|
||||
return (fzf_result_t){0, 0, 0};
|
||||
}
|
||||
if (slab != NULL && N * M > slab->I16.cap) {
|
||||
return fzf_fuzzy_match_v1(case_sensitive, normalize, text, pattern, pos,
|
||||
slab);
|
||||
}
|
||||
|
||||
size_t idx;
|
||||
{
|
||||
int32_t tmp_idx = ascii_fuzzy_index(text, pattern->data, M, case_sensitive);
|
||||
if (tmp_idx < 0) {
|
||||
return (fzf_result_t){-1, -1, 0};
|
||||
}
|
||||
idx = (size_t)tmp_idx;
|
||||
}
|
||||
|
||||
size_t offset16 = 0;
|
||||
size_t offset32 = 0;
|
||||
|
||||
fzf_i16_t h0 = alloc16(&offset16, slab, N);
|
||||
fzf_i16_t c0 = alloc16(&offset16, slab, N);
|
||||
// Bonus points for each position
|
||||
fzf_i16_t bo = alloc16(&offset16, slab, N);
|
||||
// The first occurrence of each character in the pattern
|
||||
fzf_i32_t f = alloc32(&offset32, slab, M);
|
||||
// Rune array
|
||||
fzf_i32_t t = alloc32(&offset32, slab, N);
|
||||
copy_runes(text, &t); // input.CopyRunes(T)
|
||||
|
||||
// Phase 2. Calculate bonus for each point
|
||||
int16_t max_score = 0;
|
||||
size_t max_score_pos = 0;
|
||||
|
||||
size_t pidx = 0;
|
||||
size_t last_idx = 0;
|
||||
|
||||
char pchar0 = pattern->data[0];
|
||||
char pchar = pattern->data[0];
|
||||
int16_t prev_h0 = 0;
|
||||
int32_t prev_class = CharNonWord;
|
||||
bool in_gap = false;
|
||||
|
||||
i32_slice_t t_sub = slice_i32(t.data, idx, t.size); // T[idx:];
|
||||
i16_slice_t h0_sub =
|
||||
slice_i16_right(slice_i16(h0.data, idx, h0.size).data, t_sub.size);
|
||||
i16_slice_t c0_sub =
|
||||
slice_i16_right(slice_i16(c0.data, idx, c0.size).data, t_sub.size);
|
||||
i16_slice_t b_sub =
|
||||
slice_i16_right(slice_i16(bo.data, idx, bo.size).data, t_sub.size);
|
||||
|
||||
for (size_t off = 0; off < t_sub.size; off++) {
|
||||
char_class class;
|
||||
char c = (char)t_sub.data[off];
|
||||
class = char_class_of_ascii(c);
|
||||
if (!case_sensitive && class == CharUpper) {
|
||||
/* TODO(conni2461): unicode support */
|
||||
c = (char)tolower((uint8_t)c);
|
||||
}
|
||||
if (normalize) {
|
||||
c = normalize_rune(c);
|
||||
}
|
||||
|
||||
t_sub.data[off] = (uint8_t)c;
|
||||
int16_t bonus = bonus_for(prev_class, class);
|
||||
b_sub.data[off] = bonus;
|
||||
prev_class = class;
|
||||
if (c == pchar) {
|
||||
if (pidx < M) {
|
||||
f.data[pidx] = (int32_t)(idx + off);
|
||||
pidx++;
|
||||
pchar = pattern->data[min64u(pidx, M - 1)];
|
||||
}
|
||||
last_idx = idx + off;
|
||||
}
|
||||
|
||||
if (c == pchar0) {
|
||||
int16_t score = ScoreMatch + bonus * BonusFirstCharMultiplier;
|
||||
h0_sub.data[off] = score;
|
||||
c0_sub.data[off] = 1;
|
||||
if (M == 1 && (score > max_score)) {
|
||||
max_score = score;
|
||||
max_score_pos = idx + off;
|
||||
if (bonus == BonusBoundary) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
in_gap = false;
|
||||
} else {
|
||||
if (in_gap) {
|
||||
h0_sub.data[off] = max16(prev_h0 + ScoreGapExtention, 0);
|
||||
} else {
|
||||
h0_sub.data[off] = max16(prev_h0 + ScoreGapStart, 0);
|
||||
}
|
||||
c0_sub.data[off] = 0;
|
||||
in_gap = true;
|
||||
}
|
||||
prev_h0 = h0_sub.data[off];
|
||||
}
|
||||
if (pidx != M) {
|
||||
free_alloc(t);
|
||||
free_alloc(f);
|
||||
free_alloc(bo);
|
||||
free_alloc(c0);
|
||||
free_alloc(h0);
|
||||
return (fzf_result_t){-1, -1, 0};
|
||||
}
|
||||
if (M == 1) {
|
||||
free_alloc(t);
|
||||
free_alloc(f);
|
||||
free_alloc(bo);
|
||||
free_alloc(c0);
|
||||
free_alloc(h0);
|
||||
fzf_result_t res = {(int32_t)max_score_pos, (int32_t)max_score_pos + 1,
|
||||
max_score};
|
||||
append_pos(pos, max_score_pos);
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t f0 = (size_t)f.data[0];
|
||||
size_t width = last_idx - f0 + 1;
|
||||
fzf_i16_t h = alloc16(&offset16, slab, width * M);
|
||||
{
|
||||
i16_slice_t h0_tmp_slice = slice_i16(h0.data, f0, last_idx + 1);
|
||||
copy_into_i16(&h0_tmp_slice, &h);
|
||||
}
|
||||
|
||||
fzf_i16_t c = alloc16(&offset16, slab, width * M);
|
||||
{
|
||||
i16_slice_t c0_tmp_slice = slice_i16(c0.data, f0, last_idx + 1);
|
||||
copy_into_i16(&c0_tmp_slice, &c);
|
||||
}
|
||||
|
||||
i32_slice_t f_sub = slice_i32(f.data, 1, f.size);
|
||||
str_slice_t p_sub =
|
||||
slice_str_right(slice_str(pattern->data, 1, M).data, f_sub.size);
|
||||
for (size_t off = 0; off < f_sub.size; off++) {
|
||||
size_t f = (size_t)f_sub.data[off];
|
||||
pchar = p_sub.data[off];
|
||||
pidx = off + 1;
|
||||
size_t row = pidx * width;
|
||||
in_gap = false;
|
||||
t_sub = slice_i32(t.data, f, last_idx + 1);
|
||||
b_sub = slice_i16_right(slice_i16(bo.data, f, bo.size).data, t_sub.size);
|
||||
i16_slice_t c_sub = slice_i16_right(
|
||||
slice_i16(c.data, row + f - f0, c.size).data, t_sub.size);
|
||||
i16_slice_t c_diag = slice_i16_right(
|
||||
slice_i16(c.data, row + f - f0 - 1 - width, c.size).data, t_sub.size);
|
||||
i16_slice_t h_sub = slice_i16_right(
|
||||
slice_i16(h.data, row + f - f0, h.size).data, t_sub.size);
|
||||
i16_slice_t h_diag = slice_i16_right(
|
||||
slice_i16(h.data, row + f - f0 - 1 - width, h.size).data, t_sub.size);
|
||||
i16_slice_t h_left = slice_i16_right(
|
||||
slice_i16(h.data, row + f - f0 - 1, h.size).data, t_sub.size);
|
||||
h_left.data[0] = 0;
|
||||
for (size_t j = 0; j < t_sub.size; j++) {
|
||||
char ch = (char)t_sub.data[j];
|
||||
size_t col = j + f;
|
||||
int16_t s1 = 0;
|
||||
int16_t s2 = 0;
|
||||
int16_t consecutive = 0;
|
||||
|
||||
if (in_gap) {
|
||||
s2 = h_left.data[j] + ScoreGapExtention;
|
||||
} else {
|
||||
s2 = h_left.data[j] + ScoreGapStart;
|
||||
}
|
||||
|
||||
if (pchar == ch) {
|
||||
s1 = h_diag.data[j] + ScoreMatch;
|
||||
int16_t b = b_sub.data[j];
|
||||
consecutive = c_diag.data[j] + 1;
|
||||
if (b == BonusBoundary) {
|
||||
consecutive = 1;
|
||||
} else if (consecutive > 1) {
|
||||
b = max16(b, max16(BonusConsecutive,
|
||||
bo.data[col - ((size_t)consecutive) + 1]));
|
||||
}
|
||||
if (s1 + b < s2) {
|
||||
s1 += b_sub.data[j];
|
||||
consecutive = 0;
|
||||
} else {
|
||||
s1 += b;
|
||||
}
|
||||
}
|
||||
c_sub.data[j] = consecutive;
|
||||
in_gap = s1 < s2;
|
||||
int16_t score = max16(max16(s1, s2), 0);
|
||||
if (pidx == M - 1 && (score > max_score)) {
|
||||
max_score = score;
|
||||
max_score_pos = col;
|
||||
}
|
||||
h_sub.data[j] = score;
|
||||
}
|
||||
}
|
||||
|
||||
resize_pos(pos, M, M);
|
||||
size_t j = max_score_pos;
|
||||
if (pos) {
|
||||
size_t i = M - 1;
|
||||
bool prefer_match = true;
|
||||
for (;;) {
|
||||
size_t ii = i * width;
|
||||
size_t j0 = j - f0;
|
||||
int16_t s = h.data[ii + j0];
|
||||
|
||||
int16_t s1 = 0;
|
||||
int16_t s2 = 0;
|
||||
if (i > 0 && j >= f.data[i]) {
|
||||
s1 = h.data[ii - width + j0 - 1];
|
||||
}
|
||||
if (j > f.data[i]) {
|
||||
s2 = h.data[ii + j0 - 1];
|
||||
}
|
||||
|
||||
if (s > s1 && (s > s2 || (s == s2 && prefer_match))) {
|
||||
unsafe_append_pos(pos, j);
|
||||
if (i == 0) {
|
||||
break;
|
||||
}
|
||||
i--;
|
||||
}
|
||||
prefer_match = c.data[ii + j0] > 1 || (ii + width + j0 + 1 < c.size &&
|
||||
c.data[ii + width + j0 + 1] > 0);
|
||||
j--;
|
||||
}
|
||||
}
|
||||
|
||||
free_alloc(h);
|
||||
free_alloc(c);
|
||||
free_alloc(t);
|
||||
free_alloc(f);
|
||||
free_alloc(bo);
|
||||
free_alloc(c0);
|
||||
free_alloc(h0);
|
||||
return (fzf_result_t){(int32_t)j, (int32_t)max_score_pos + 1,
|
||||
(int32_t)max_score};
|
||||
}
|
||||
|
13
generate_case_fold_table.sh
Executable file
13
generate_case_fold_table.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
# Regenerates src/case_fold.rs (the simple Unicode case-folding table) from the
# Unicode Character Database using `ucd-generate`.
#
# Run from the repository root: the output path is relative to the directory
# the script is started in.
set -euo pipefail

ucd_version="15.0.0"
# The generated header records the ucd-generate command line, so the data
# directory keeps its fixed, version-named path to keep the output reproducible.
ucd_dir="/tmp/ucd-${ucd_version}"
out_file="src/case_fold.rs"
tmp_out="${out_file}.tmp"

# Remove the download directory and any partial output on every exit path,
# not only after a successful run.
cleanup() {
  rm -rf "${ucd_dir}"
  rm -f "${tmp_out}"
}
trap cleanup EXIT

# Start from a clean directory: a leftover from an aborted run would otherwise
# make a plain `mkdir` fail under `set -e`.
rm -rf "${ucd_dir}"
mkdir -p "${ucd_dir}"

# -f makes curl fail on HTTP errors instead of saving the error page as UCD.zip.
curl -fL -o "${ucd_dir}/UCD.zip" "https://www.unicode.org/Public/zipped/${ucd_version}/UCD.zip"
unzip "${ucd_dir}/UCD.zip" -d "${ucd_dir}"

cargo install ucd-generate

# Write to a temporary file first: redirecting straight into the target would
# truncate the checked-in table even when ucd-generate fails.
ucd-generate case-folding-simple "${ucd_dir}" --chars > "${tmp_out}"
mv "${tmp_out}" "${out_file}"
|
347
src/case_fold.rs
Normal file
347
src/case_fold.rs
Normal file
@ -0,0 +1,347 @@
|
||||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.3.0 is available on crates.io.
|
||||
|
||||
pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[
|
||||
('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'),
|
||||
('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'),
|
||||
('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'),
|
||||
('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'),
|
||||
('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'),
|
||||
('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'),
|
||||
('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'),
|
||||
('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'),
|
||||
('Ñ', 'ñ'), ('Ò', 'ò'), ('Ó', 'ó'), ('Ô', 'ô'), ('Õ', 'õ'),
|
||||
('Ö', 'ö'), ('Ø', 'ø'), ('Ù', 'ù'), ('Ú', 'ú'), ('Û', 'û'),
|
||||
('Ü', 'ü'), ('Ý', 'ý'), ('Þ', 'þ'), ('Ā', 'ā'), ('Ă', 'ă'),
|
||||
('Ą', 'ą'), ('Ć', 'ć'), ('Ĉ', 'ĉ'), ('Ċ', 'ċ'), ('Č', 'č'),
|
||||
('Ď', 'ď'), ('Đ', 'đ'), ('Ē', 'ē'), ('Ĕ', 'ĕ'), ('Ė', 'ė'),
|
||||
('Ę', 'ę'), ('Ě', 'ě'), ('Ĝ', 'ĝ'), ('Ğ', 'ğ'), ('Ġ', 'ġ'),
|
||||
('Ģ', 'ģ'), ('Ĥ', 'ĥ'), ('Ħ', 'ħ'), ('Ĩ', 'ĩ'), ('Ī', 'ī'),
|
||||
('Ĭ', 'ĭ'), ('Į', 'į'), ('\u{132}', '\u{133}'), ('Ĵ', 'ĵ'), ('Ķ', 'ķ'),
|
||||
('Ĺ', 'ĺ'), ('Ļ', 'ļ'), ('Ľ', 'ľ'), ('Ŀ', 'ŀ'), ('Ł', 'ł'),
|
||||
('Ń', 'ń'), ('Ņ', 'ņ'), ('Ň', 'ň'), ('Ŋ', 'ŋ'), ('Ō', 'ō'),
|
||||
('Ŏ', 'ŏ'), ('Ő', 'ő'), ('Œ', 'œ'), ('Ŕ', 'ŕ'), ('Ŗ', 'ŗ'),
|
||||
('Ř', 'ř'), ('Ś', 'ś'), ('Ŝ', 'ŝ'), ('Ş', 'ş'), ('Š', 'š'),
|
||||
('Ţ', 'ţ'), ('Ť', 'ť'), ('Ŧ', 'ŧ'), ('Ũ', 'ũ'), ('Ū', 'ū'),
|
||||
('Ŭ', 'ŭ'), ('Ů', 'ů'), ('Ű', 'ű'), ('Ų', 'ų'), ('Ŵ', 'ŵ'),
|
||||
('Ŷ', 'ŷ'), ('Ÿ', 'ÿ'), ('Ź', 'ź'), ('Ż', 'ż'), ('Ž', 'ž'),
|
||||
('ſ', 's'), ('Ɓ', 'ɓ'), ('Ƃ', 'ƃ'), ('Ƅ', 'ƅ'), ('Ɔ', 'ɔ'),
|
||||
('Ƈ', 'ƈ'), ('Ɖ', 'ɖ'), ('Ɗ', 'ɗ'), ('Ƌ', 'ƌ'), ('Ǝ', 'ǝ'),
|
||||
('Ə', 'ə'), ('Ɛ', 'ɛ'), ('Ƒ', 'ƒ'), ('Ɠ', 'ɠ'), ('Ɣ', 'ɣ'),
|
||||
('Ɩ', 'ɩ'), ('Ɨ', 'ɨ'), ('Ƙ', 'ƙ'), ('Ɯ', 'ɯ'), ('Ɲ', 'ɲ'),
|
||||
('Ɵ', 'ɵ'), ('Ơ', 'ơ'), ('Ƣ', 'ƣ'), ('Ƥ', 'ƥ'), ('Ʀ', 'ʀ'),
|
||||
('Ƨ', 'ƨ'), ('Ʃ', 'ʃ'), ('Ƭ', 'ƭ'), ('Ʈ', 'ʈ'), ('Ư', 'ư'),
|
||||
('Ʊ', 'ʊ'), ('Ʋ', 'ʋ'), ('Ƴ', 'ƴ'), ('Ƶ', 'ƶ'), ('Ʒ', 'ʒ'),
|
||||
('Ƹ', 'ƹ'), ('Ƽ', 'ƽ'), ('\u{1c4}', '\u{1c6}'), ('\u{1c5}', '\u{1c6}'), ('\u{1c7}', '\u{1c9}'),
|
||||
('\u{1c8}', '\u{1c9}'), ('\u{1ca}', '\u{1cc}'), ('\u{1cb}', '\u{1cc}'), ('Ǎ', 'ǎ'), ('Ǐ', 'ǐ'),
|
||||
('Ǒ', 'ǒ'), ('Ǔ', 'ǔ'), ('Ǖ', 'ǖ'), ('Ǘ', 'ǘ'), ('Ǚ', 'ǚ'),
|
||||
('Ǜ', 'ǜ'), ('Ǟ', 'ǟ'), ('Ǡ', 'ǡ'), ('Ǣ', 'ǣ'), ('Ǥ', 'ǥ'),
|
||||
('Ǧ', 'ǧ'), ('Ǩ', 'ǩ'), ('Ǫ', 'ǫ'), ('Ǭ', 'ǭ'), ('Ǯ', 'ǯ'),
|
||||
('\u{1f1}', '\u{1f3}'), ('\u{1f2}', '\u{1f3}'), ('Ǵ', 'ǵ'), ('Ƕ', 'ƕ'), ('Ƿ', 'ƿ'),
|
||||
('Ǹ', 'ǹ'), ('Ǻ', 'ǻ'), ('Ǽ', 'ǽ'), ('Ǿ', 'ǿ'), ('Ȁ', 'ȁ'),
|
||||
('Ȃ', 'ȃ'), ('Ȅ', 'ȅ'), ('Ȇ', 'ȇ'), ('Ȉ', 'ȉ'), ('Ȋ', 'ȋ'),
|
||||
('Ȍ', 'ȍ'), ('Ȏ', 'ȏ'), ('Ȑ', 'ȑ'), ('Ȓ', 'ȓ'), ('Ȕ', 'ȕ'),
|
||||
('Ȗ', 'ȗ'), ('Ș', 'ș'), ('Ț', 'ț'), ('Ȝ', 'ȝ'), ('Ȟ', 'ȟ'),
|
||||
('Ƞ', 'ƞ'), ('Ȣ', 'ȣ'), ('Ȥ', 'ȥ'), ('Ȧ', 'ȧ'), ('Ȩ', 'ȩ'),
|
||||
('Ȫ', 'ȫ'), ('Ȭ', 'ȭ'), ('Ȯ', 'ȯ'), ('Ȱ', 'ȱ'), ('Ȳ', 'ȳ'),
|
||||
('Ⱥ', 'ⱥ'), ('Ȼ', 'ȼ'), ('Ƚ', 'ƚ'), ('Ⱦ', 'ⱦ'), ('Ɂ', 'ɂ'),
|
||||
('Ƀ', 'ƀ'), ('Ʉ', 'ʉ'), ('Ʌ', 'ʌ'), ('Ɇ', 'ɇ'), ('Ɉ', 'ɉ'),
|
||||
('Ɋ', 'ɋ'), ('Ɍ', 'ɍ'), ('Ɏ', 'ɏ'), ('\u{345}', 'ι'), ('Ͱ', 'ͱ'),
|
||||
('Ͳ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϳ', 'ϳ'), ('Ά', 'ά'), ('Έ', 'έ'),
|
||||
('Ή', 'ή'), ('Ί', 'ί'), ('Ό', 'ό'), ('Ύ', 'ύ'), ('Ώ', 'ώ'),
|
||||
('Α', 'α'), ('Β', 'β'), ('Γ', 'γ'), ('Δ', 'δ'), ('Ε', 'ε'),
|
||||
('Ζ', 'ζ'), ('Η', 'η'), ('Θ', 'θ'), ('Ι', 'ι'), ('Κ', 'κ'),
|
||||
('Λ', 'λ'), ('Μ', 'μ'), ('Ν', 'ν'), ('Ξ', 'ξ'), ('Ο', 'ο'),
|
||||
('Π', 'π'), ('Ρ', 'ρ'), ('Σ', 'σ'), ('Τ', 'τ'), ('Υ', 'υ'),
|
||||
('Φ', 'φ'), ('Χ', 'χ'), ('Ψ', 'ψ'), ('Ω', 'ω'), ('Ϊ', 'ϊ'),
|
||||
('Ϋ', 'ϋ'), ('ς', 'σ'), ('Ϗ', 'ϗ'), ('ϐ', 'β'), ('ϑ', 'θ'),
|
||||
('ϕ', 'φ'), ('ϖ', 'π'), ('Ϙ', 'ϙ'), ('Ϛ', 'ϛ'), ('Ϝ', 'ϝ'),
|
||||
('Ϟ', 'ϟ'), ('Ϡ', 'ϡ'), ('Ϣ', 'ϣ'), ('Ϥ', 'ϥ'), ('Ϧ', 'ϧ'),
|
||||
('Ϩ', 'ϩ'), ('Ϫ', 'ϫ'), ('Ϭ', 'ϭ'), ('Ϯ', 'ϯ'), ('ϰ', 'κ'),
|
||||
('ϱ', 'ρ'), ('ϴ', 'θ'), ('ϵ', 'ε'), ('Ϸ', 'ϸ'), ('Ϲ', 'ϲ'),
|
||||
('Ϻ', 'ϻ'), ('Ͻ', 'ͻ'), ('Ͼ', 'ͼ'), ('Ͽ', 'ͽ'), ('Ѐ', 'ѐ'),
|
||||
('Ё', 'ё'), ('Ђ', 'ђ'), ('Ѓ', 'ѓ'), ('Є', 'є'), ('Ѕ', 'ѕ'),
|
||||
('І', 'і'), ('Ї', 'ї'), ('Ј', 'ј'), ('Љ', 'љ'), ('Њ', 'њ'),
|
||||
('Ћ', 'ћ'), ('Ќ', 'ќ'), ('Ѝ', 'ѝ'), ('Ў', 'ў'), ('Џ', 'џ'),
|
||||
('А', 'а'), ('Б', 'б'), ('В', 'в'), ('Г', 'г'), ('Д', 'д'),
|
||||
('Е', 'е'), ('Ж', 'ж'), ('З', 'з'), ('И', 'и'), ('Й', 'й'),
|
||||
('К', 'к'), ('Л', 'л'), ('М', 'м'), ('Н', 'н'), ('О', 'о'),
|
||||
('П', 'п'), ('Р', 'р'), ('С', 'с'), ('Т', 'т'), ('У', 'у'),
|
||||
('Ф', 'ф'), ('Х', 'х'), ('Ц', 'ц'), ('Ч', 'ч'), ('Ш', 'ш'),
|
||||
('Щ', 'щ'), ('Ъ', 'ъ'), ('Ы', 'ы'), ('Ь', 'ь'), ('Э', 'э'),
|
||||
('Ю', 'ю'), ('Я', 'я'), ('Ѡ', 'ѡ'), ('Ѣ', 'ѣ'), ('Ѥ', 'ѥ'),
|
||||
('Ѧ', 'ѧ'), ('Ѩ', 'ѩ'), ('Ѫ', 'ѫ'), ('Ѭ', 'ѭ'), ('Ѯ', 'ѯ'),
|
||||
('Ѱ', 'ѱ'), ('Ѳ', 'ѳ'), ('Ѵ', 'ѵ'), ('Ѷ', 'ѷ'), ('Ѹ', 'ѹ'),
|
||||
('Ѻ', 'ѻ'), ('Ѽ', 'ѽ'), ('Ѿ', 'ѿ'), ('Ҁ', 'ҁ'), ('Ҋ', 'ҋ'),
|
||||
('Ҍ', 'ҍ'), ('Ҏ', 'ҏ'), ('Ґ', 'ґ'), ('Ғ', 'ғ'), ('Ҕ', 'ҕ'),
|
||||
('Җ', 'җ'), ('Ҙ', 'ҙ'), ('Қ', 'қ'), ('Ҝ', 'ҝ'), ('Ҟ', 'ҟ'),
|
||||
('Ҡ', 'ҡ'), ('Ң', 'ң'), ('Ҥ', 'ҥ'), ('Ҧ', 'ҧ'), ('Ҩ', 'ҩ'),
|
||||
('Ҫ', 'ҫ'), ('Ҭ', 'ҭ'), ('Ү', 'ү'), ('Ұ', 'ұ'), ('Ҳ', 'ҳ'),
|
||||
('Ҵ', 'ҵ'), ('Ҷ', 'ҷ'), ('Ҹ', 'ҹ'), ('Һ', 'һ'), ('Ҽ', 'ҽ'),
|
||||
('Ҿ', 'ҿ'), ('Ӏ', 'ӏ'), ('Ӂ', 'ӂ'), ('Ӄ', 'ӄ'), ('Ӆ', 'ӆ'),
|
||||
('Ӈ', 'ӈ'), ('Ӊ', 'ӊ'), ('Ӌ', 'ӌ'), ('Ӎ', 'ӎ'), ('Ӑ', 'ӑ'),
|
||||
('Ӓ', 'ӓ'), ('Ӕ', 'ӕ'), ('Ӗ', 'ӗ'), ('Ә', 'ә'), ('Ӛ', 'ӛ'),
|
||||
('Ӝ', 'ӝ'), ('Ӟ', 'ӟ'), ('Ӡ', 'ӡ'), ('Ӣ', 'ӣ'), ('Ӥ', 'ӥ'),
|
||||
('Ӧ', 'ӧ'), ('Ө', 'ө'), ('Ӫ', 'ӫ'), ('Ӭ', 'ӭ'), ('Ӯ', 'ӯ'),
|
||||
('Ӱ', 'ӱ'), ('Ӳ', 'ӳ'), ('Ӵ', 'ӵ'), ('Ӷ', 'ӷ'), ('Ӹ', 'ӹ'),
|
||||
('Ӻ', 'ӻ'), ('Ӽ', 'ӽ'), ('Ӿ', 'ӿ'), ('Ԁ', 'ԁ'), ('Ԃ', 'ԃ'),
|
||||
('Ԅ', 'ԅ'), ('Ԇ', 'ԇ'), ('Ԉ', 'ԉ'), ('Ԋ', 'ԋ'), ('Ԍ', 'ԍ'),
|
||||
('Ԏ', 'ԏ'), ('Ԑ', 'ԑ'), ('Ԓ', 'ԓ'), ('Ԕ', 'ԕ'), ('Ԗ', 'ԗ'),
|
||||
('Ԙ', 'ԙ'), ('Ԛ', 'ԛ'), ('Ԝ', 'ԝ'), ('Ԟ', 'ԟ'), ('Ԡ', 'ԡ'),
|
||||
('Ԣ', 'ԣ'), ('Ԥ', 'ԥ'), ('Ԧ', 'ԧ'), ('Ԩ', 'ԩ'), ('Ԫ', 'ԫ'),
|
||||
('Ԭ', 'ԭ'), ('Ԯ', 'ԯ'), ('Ա', 'ա'), ('Բ', 'բ'), ('Գ', 'գ'),
|
||||
('Դ', 'դ'), ('Ե', 'ե'), ('Զ', 'զ'), ('Է', 'է'), ('Ը', 'ը'),
|
||||
('Թ', 'թ'), ('Ժ', 'ժ'), ('Ի', 'ի'), ('Լ', 'լ'), ('Խ', 'խ'),
|
||||
('Ծ', 'ծ'), ('Կ', 'կ'), ('Հ', 'հ'), ('Ձ', 'ձ'), ('Ղ', 'ղ'),
|
||||
('Ճ', 'ճ'), ('Մ', 'մ'), ('Յ', 'յ'), ('Ն', 'ն'), ('Շ', 'շ'),
|
||||
('Ո', 'ո'), ('Չ', 'չ'), ('Պ', 'պ'), ('Ջ', 'ջ'), ('Ռ', 'ռ'),
|
||||
('Ս', 'ս'), ('Վ', 'վ'), ('Տ', 'տ'), ('Ր', 'ր'), ('Ց', 'ց'),
|
||||
('Ւ', 'ւ'), ('Փ', 'փ'), ('Ք', 'ք'), ('Օ', 'օ'), ('Ֆ', 'ֆ'),
|
||||
('Ⴀ', 'ⴀ'), ('Ⴁ', 'ⴁ'), ('Ⴂ', 'ⴂ'), ('Ⴃ', 'ⴃ'),
|
||||
('Ⴄ', 'ⴄ'), ('Ⴅ', 'ⴅ'), ('Ⴆ', 'ⴆ'), ('Ⴇ', 'ⴇ'),
|
||||
('Ⴈ', 'ⴈ'), ('Ⴉ', 'ⴉ'), ('Ⴊ', 'ⴊ'), ('Ⴋ', 'ⴋ'),
|
||||
('Ⴌ', 'ⴌ'), ('Ⴍ', 'ⴍ'), ('Ⴎ', 'ⴎ'), ('Ⴏ', 'ⴏ'),
|
||||
('Ⴐ', 'ⴐ'), ('Ⴑ', 'ⴑ'), ('Ⴒ', 'ⴒ'), ('Ⴓ', 'ⴓ'),
|
||||
('Ⴔ', 'ⴔ'), ('Ⴕ', 'ⴕ'), ('Ⴖ', 'ⴖ'), ('Ⴗ', 'ⴗ'),
|
||||
('Ⴘ', 'ⴘ'), ('Ⴙ', 'ⴙ'), ('Ⴚ', 'ⴚ'), ('Ⴛ', 'ⴛ'),
|
||||
('Ⴜ', 'ⴜ'), ('Ⴝ', 'ⴝ'), ('Ⴞ', 'ⴞ'), ('Ⴟ', 'ⴟ'),
|
||||
('Ⴠ', 'ⴠ'), ('Ⴡ', 'ⴡ'), ('Ⴢ', 'ⴢ'), ('Ⴣ', 'ⴣ'),
|
||||
('Ⴤ', 'ⴤ'), ('Ⴥ', 'ⴥ'), ('Ⴧ', 'ⴧ'), ('Ⴭ', 'ⴭ'),
|
||||
('ᏸ', 'Ᏸ'), ('ᏹ', 'Ᏹ'), ('ᏺ', 'Ᏺ'), ('ᏻ', 'Ᏻ'),
|
||||
('ᏼ', 'Ᏼ'), ('ᏽ', 'Ᏽ'), ('ᲀ', 'в'), ('ᲁ', 'д'), ('ᲂ', 'о'),
|
||||
('ᲃ', 'с'), ('ᲄ', 'т'), ('ᲅ', 'т'), ('ᲆ', 'ъ'), ('ᲇ', 'ѣ'),
|
||||
('ᲈ', 'ꙋ'), ('Ა', 'ა'), ('Ბ', 'ბ'), ('Გ', 'გ'),
|
||||
('Დ', 'დ'), ('Ე', 'ე'), ('Ვ', 'ვ'), ('Ზ', 'ზ'),
|
||||
('Თ', 'თ'), ('Ი', 'ი'), ('Კ', 'კ'), ('Ლ', 'ლ'),
|
||||
('Მ', 'მ'), ('Ნ', 'ნ'), ('Ო', 'ო'), ('Პ', 'პ'),
|
||||
('Ჟ', 'ჟ'), ('Რ', 'რ'), ('Ს', 'ს'), ('Ტ', 'ტ'),
|
||||
('Უ', 'უ'), ('Ფ', 'ფ'), ('Ქ', 'ქ'), ('Ღ', 'ღ'),
|
||||
('Ყ', 'ყ'), ('Შ', 'შ'), ('Ჩ', 'ჩ'), ('Ც', 'ც'),
|
||||
('Ძ', 'ძ'), ('Წ', 'წ'), ('Ჭ', 'ჭ'), ('Ხ', 'ხ'),
|
||||
('Ჯ', 'ჯ'), ('Ჰ', 'ჰ'), ('Ჱ', 'ჱ'), ('Ჲ', 'ჲ'),
|
||||
('Ჳ', 'ჳ'), ('Ჴ', 'ჴ'), ('Ჵ', 'ჵ'), ('Ჶ', 'ჶ'),
|
||||
('Ჷ', 'ჷ'), ('Ჸ', 'ჸ'), ('Ჹ', 'ჹ'), ('Ჺ', 'ჺ'),
|
||||
('Ჽ', 'ჽ'), ('Ჾ', 'ჾ'), ('Ჿ', 'ჿ'), ('Ḁ', 'ḁ'),
|
||||
('Ḃ', 'ḃ'), ('Ḅ', 'ḅ'), ('Ḇ', 'ḇ'), ('Ḉ', 'ḉ'),
|
||||
('Ḋ', 'ḋ'), ('Ḍ', 'ḍ'), ('Ḏ', 'ḏ'), ('Ḑ', 'ḑ'),
|
||||
('Ḓ', 'ḓ'), ('Ḕ', 'ḕ'), ('Ḗ', 'ḗ'), ('Ḙ', 'ḙ'),
|
||||
('Ḛ', 'ḛ'), ('Ḝ', 'ḝ'), ('Ḟ', 'ḟ'), ('Ḡ', 'ḡ'),
|
||||
('Ḣ', 'ḣ'), ('Ḥ', 'ḥ'), ('Ḧ', 'ḧ'), ('Ḩ', 'ḩ'),
|
||||
('Ḫ', 'ḫ'), ('Ḭ', 'ḭ'), ('Ḯ', 'ḯ'), ('Ḱ', 'ḱ'),
|
||||
('Ḳ', 'ḳ'), ('Ḵ', 'ḵ'), ('Ḷ', 'ḷ'), ('Ḹ', 'ḹ'),
|
||||
('Ḻ', 'ḻ'), ('Ḽ', 'ḽ'), ('Ḿ', 'ḿ'), ('Ṁ', 'ṁ'),
|
||||
('Ṃ', 'ṃ'), ('Ṅ', 'ṅ'), ('Ṇ', 'ṇ'), ('Ṉ', 'ṉ'),
|
||||
('Ṋ', 'ṋ'), ('Ṍ', 'ṍ'), ('Ṏ', 'ṏ'), ('Ṑ', 'ṑ'),
|
||||
('Ṓ', 'ṓ'), ('Ṕ', 'ṕ'), ('Ṗ', 'ṗ'), ('Ṙ', 'ṙ'),
|
||||
('Ṛ', 'ṛ'), ('Ṝ', 'ṝ'), ('Ṟ', 'ṟ'), ('Ṡ', 'ṡ'),
|
||||
('Ṣ', 'ṣ'), ('Ṥ', 'ṥ'), ('Ṧ', 'ṧ'), ('Ṩ', 'ṩ'),
|
||||
('Ṫ', 'ṫ'), ('Ṭ', 'ṭ'), ('Ṯ', 'ṯ'), ('Ṱ', 'ṱ'),
|
||||
('Ṳ', 'ṳ'), ('Ṵ', 'ṵ'), ('Ṷ', 'ṷ'), ('Ṹ', 'ṹ'),
|
||||
('Ṻ', 'ṻ'), ('Ṽ', 'ṽ'), ('Ṿ', 'ṿ'), ('Ẁ', 'ẁ'),
|
||||
('Ẃ', 'ẃ'), ('Ẅ', 'ẅ'), ('Ẇ', 'ẇ'), ('Ẉ', 'ẉ'),
|
||||
('Ẋ', 'ẋ'), ('Ẍ', 'ẍ'), ('Ẏ', 'ẏ'), ('Ẑ', 'ẑ'),
|
||||
('Ẓ', 'ẓ'), ('Ẕ', 'ẕ'), ('ẛ', 'ṡ'), ('ẞ', 'ß'),
|
||||
('Ạ', 'ạ'), ('Ả', 'ả'), ('Ấ', 'ấ'), ('Ầ', 'ầ'),
|
||||
('Ẩ', 'ẩ'), ('Ẫ', 'ẫ'), ('Ậ', 'ậ'), ('Ắ', 'ắ'),
|
||||
('Ằ', 'ằ'), ('Ẳ', 'ẳ'), ('Ẵ', 'ẵ'), ('Ặ', 'ặ'),
|
||||
('Ẹ', 'ẹ'), ('Ẻ', 'ẻ'), ('Ẽ', 'ẽ'), ('Ế', 'ế'),
|
||||
('Ề', 'ề'), ('Ể', 'ể'), ('Ễ', 'ễ'), ('Ệ', 'ệ'),
|
||||
('Ỉ', 'ỉ'), ('Ị', 'ị'), ('Ọ', 'ọ'), ('Ỏ', 'ỏ'),
|
||||
('Ố', 'ố'), ('Ồ', 'ồ'), ('Ổ', 'ổ'), ('Ỗ', 'ỗ'),
|
||||
('Ộ', 'ộ'), ('Ớ', 'ớ'), ('Ờ', 'ờ'), ('Ở', 'ở'),
|
||||
('Ỡ', 'ỡ'), ('Ợ', 'ợ'), ('Ụ', 'ụ'), ('Ủ', 'ủ'),
|
||||
('Ứ', 'ứ'), ('Ừ', 'ừ'), ('Ử', 'ử'), ('Ữ', 'ữ'),
|
||||
('Ự', 'ự'), ('Ỳ', 'ỳ'), ('Ỵ', 'ỵ'), ('Ỷ', 'ỷ'),
|
||||
('Ỹ', 'ỹ'), ('Ỻ', 'ỻ'), ('Ỽ', 'ỽ'), ('Ỿ', 'ỿ'),
|
||||
('Ἀ', 'ἀ'), ('Ἁ', 'ἁ'), ('Ἂ', 'ἂ'), ('Ἃ', 'ἃ'),
|
||||
('Ἄ', 'ἄ'), ('Ἅ', 'ἅ'), ('Ἆ', 'ἆ'), ('Ἇ', 'ἇ'),
|
||||
('Ἐ', 'ἐ'), ('Ἑ', 'ἑ'), ('Ἒ', 'ἒ'), ('Ἓ', 'ἓ'),
|
||||
('Ἔ', 'ἔ'), ('Ἕ', 'ἕ'), ('Ἠ', 'ἠ'), ('Ἡ', 'ἡ'),
|
||||
('Ἢ', 'ἢ'), ('Ἣ', 'ἣ'), ('Ἤ', 'ἤ'), ('Ἥ', 'ἥ'),
|
||||
('Ἦ', 'ἦ'), ('Ἧ', 'ἧ'), ('Ἰ', 'ἰ'), ('Ἱ', 'ἱ'),
|
||||
('Ἲ', 'ἲ'), ('Ἳ', 'ἳ'), ('Ἴ', 'ἴ'), ('Ἵ', 'ἵ'),
|
||||
('Ἶ', 'ἶ'), ('Ἷ', 'ἷ'), ('Ὀ', 'ὀ'), ('Ὁ', 'ὁ'),
|
||||
('Ὂ', 'ὂ'), ('Ὃ', 'ὃ'), ('Ὄ', 'ὄ'), ('Ὅ', 'ὅ'),
|
||||
('Ὑ', 'ὑ'), ('Ὓ', 'ὓ'), ('Ὕ', 'ὕ'), ('Ὗ', 'ὗ'),
|
||||
('Ὠ', 'ὠ'), ('Ὡ', 'ὡ'), ('Ὢ', 'ὢ'), ('Ὣ', 'ὣ'),
|
||||
('Ὤ', 'ὤ'), ('Ὥ', 'ὥ'), ('Ὦ', 'ὦ'), ('Ὧ', 'ὧ'),
|
||||
('ᾈ', 'ᾀ'), ('ᾉ', 'ᾁ'), ('ᾊ', 'ᾂ'), ('ᾋ', 'ᾃ'),
|
||||
('ᾌ', 'ᾄ'), ('ᾍ', 'ᾅ'), ('ᾎ', 'ᾆ'), ('ᾏ', 'ᾇ'),
|
||||
('ᾘ', 'ᾐ'), ('ᾙ', 'ᾑ'), ('ᾚ', 'ᾒ'), ('ᾛ', 'ᾓ'),
|
||||
('ᾜ', 'ᾔ'), ('ᾝ', 'ᾕ'), ('ᾞ', 'ᾖ'), ('ᾟ', 'ᾗ'),
|
||||
('ᾨ', 'ᾠ'), ('ᾩ', 'ᾡ'), ('ᾪ', 'ᾢ'), ('ᾫ', 'ᾣ'),
|
||||
('ᾬ', 'ᾤ'), ('ᾭ', 'ᾥ'), ('ᾮ', 'ᾦ'), ('ᾯ', 'ᾧ'),
|
||||
('Ᾰ', 'ᾰ'), ('Ᾱ', 'ᾱ'), ('Ὰ', 'ὰ'), ('\u{1fbb}', '\u{1f71}'),
|
||||
('ᾼ', 'ᾳ'), ('\u{1fbe}', 'ι'), ('Ὲ', 'ὲ'), ('\u{1fc9}', '\u{1f73}'),
|
||||
('Ὴ', 'ὴ'), ('\u{1fcb}', '\u{1f75}'), ('ῌ', 'ῃ'), ('Ῐ', 'ῐ'),
|
||||
('Ῑ', 'ῑ'), ('Ὶ', 'ὶ'), ('\u{1fdb}', '\u{1f77}'), ('Ῠ', 'ῠ'),
|
||||
('Ῡ', 'ῡ'), ('Ὺ', 'ὺ'), ('\u{1feb}', '\u{1f7b}'), ('Ῥ', 'ῥ'),
|
||||
('Ὸ', 'ὸ'), ('\u{1ff9}', '\u{1f79}'), ('Ὼ', 'ὼ'), ('\u{1ffb}', '\u{1f7d}'),
|
||||
('ῼ', 'ῳ'), ('\u{2126}', 'ω'), ('\u{212a}', 'k'), ('\u{212b}', 'å'), ('Ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ⅰ'), ('Ⅱ', 'ⅱ'), ('Ⅲ', 'ⅲ'), ('Ⅳ', 'ⅳ'),
|
||||
('Ⅴ', 'ⅴ'), ('Ⅵ', 'ⅵ'), ('Ⅶ', 'ⅶ'), ('Ⅷ', 'ⅷ'),
|
||||
('Ⅸ', 'ⅸ'), ('Ⅹ', 'ⅹ'), ('Ⅺ', 'ⅺ'), ('Ⅻ', 'ⅻ'),
|
||||
('Ⅼ', 'ⅼ'), ('Ⅽ', 'ⅽ'), ('Ⅾ', 'ⅾ'), ('Ⅿ', 'ⅿ'),
|
||||
('Ↄ', 'ↄ'), ('Ⓐ', 'ⓐ'), ('Ⓑ', 'ⓑ'), ('Ⓒ', 'ⓒ'),
|
||||
('Ⓓ', 'ⓓ'), ('Ⓔ', 'ⓔ'), ('Ⓕ', 'ⓕ'), ('Ⓖ', 'ⓖ'),
|
||||
('Ⓗ', 'ⓗ'), ('Ⓘ', 'ⓘ'), ('Ⓙ', 'ⓙ'), ('Ⓚ', 'ⓚ'),
|
||||
('Ⓛ', 'ⓛ'), ('Ⓜ', 'ⓜ'), ('Ⓝ', 'ⓝ'), ('Ⓞ', 'ⓞ'),
|
||||
('Ⓟ', 'ⓟ'), ('Ⓠ', 'ⓠ'), ('Ⓡ', 'ⓡ'), ('Ⓢ', 'ⓢ'),
|
||||
('Ⓣ', 'ⓣ'), ('Ⓤ', 'ⓤ'), ('Ⓥ', 'ⓥ'), ('Ⓦ', 'ⓦ'),
|
||||
('Ⓧ', 'ⓧ'), ('Ⓨ', 'ⓨ'), ('Ⓩ', 'ⓩ'), ('Ⰰ', 'ⰰ'),
|
||||
('Ⰱ', 'ⰱ'), ('Ⰲ', 'ⰲ'), ('Ⰳ', 'ⰳ'), ('Ⰴ', 'ⰴ'),
|
||||
('Ⰵ', 'ⰵ'), ('Ⰶ', 'ⰶ'), ('Ⰷ', 'ⰷ'), ('Ⰸ', 'ⰸ'),
|
||||
('Ⰹ', 'ⰹ'), ('Ⰺ', 'ⰺ'), ('Ⰻ', 'ⰻ'), ('Ⰼ', 'ⰼ'),
|
||||
('Ⰽ', 'ⰽ'), ('Ⰾ', 'ⰾ'), ('Ⰿ', 'ⰿ'), ('Ⱀ', 'ⱀ'),
|
||||
('Ⱁ', 'ⱁ'), ('Ⱂ', 'ⱂ'), ('Ⱃ', 'ⱃ'), ('Ⱄ', 'ⱄ'),
|
||||
('Ⱅ', 'ⱅ'), ('Ⱆ', 'ⱆ'), ('Ⱇ', 'ⱇ'), ('Ⱈ', 'ⱈ'),
|
||||
('Ⱉ', 'ⱉ'), ('Ⱊ', 'ⱊ'), ('Ⱋ', 'ⱋ'), ('Ⱌ', 'ⱌ'),
|
||||
('Ⱍ', 'ⱍ'), ('Ⱎ', 'ⱎ'), ('Ⱏ', 'ⱏ'), ('Ⱐ', 'ⱐ'),
|
||||
('Ⱑ', 'ⱑ'), ('Ⱒ', 'ⱒ'), ('Ⱓ', 'ⱓ'), ('Ⱔ', 'ⱔ'),
|
||||
('Ⱕ', 'ⱕ'), ('Ⱖ', 'ⱖ'), ('Ⱗ', 'ⱗ'), ('Ⱘ', 'ⱘ'),
|
||||
('Ⱙ', 'ⱙ'), ('Ⱚ', 'ⱚ'), ('Ⱛ', 'ⱛ'), ('Ⱜ', 'ⱜ'),
|
||||
('Ⱝ', 'ⱝ'), ('Ⱞ', 'ⱞ'), ('Ⱟ', 'ⱟ'), ('Ⱡ', 'ⱡ'),
|
||||
('Ɫ', 'ɫ'), ('Ᵽ', 'ᵽ'), ('Ɽ', 'ɽ'), ('Ⱨ', 'ⱨ'),
|
||||
('Ⱪ', 'ⱪ'), ('Ⱬ', 'ⱬ'), ('Ɑ', 'ɑ'), ('Ɱ', 'ɱ'), ('Ɐ', 'ɐ'),
|
||||
('Ɒ', 'ɒ'), ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ȿ'), ('Ɀ', 'ɀ'),
|
||||
('Ⲁ', 'ⲁ'), ('Ⲃ', 'ⲃ'), ('Ⲅ', 'ⲅ'), ('Ⲇ', 'ⲇ'),
|
||||
('Ⲉ', 'ⲉ'), ('Ⲋ', 'ⲋ'), ('Ⲍ', 'ⲍ'), ('Ⲏ', 'ⲏ'),
|
||||
('Ⲑ', 'ⲑ'), ('Ⲓ', 'ⲓ'), ('Ⲕ', 'ⲕ'), ('Ⲗ', 'ⲗ'),
|
||||
('Ⲙ', 'ⲙ'), ('Ⲛ', 'ⲛ'), ('Ⲝ', 'ⲝ'), ('Ⲟ', 'ⲟ'),
|
||||
('Ⲡ', 'ⲡ'), ('Ⲣ', 'ⲣ'), ('Ⲥ', 'ⲥ'), ('Ⲧ', 'ⲧ'),
|
||||
('Ⲩ', 'ⲩ'), ('Ⲫ', 'ⲫ'), ('Ⲭ', 'ⲭ'), ('Ⲯ', 'ⲯ'),
|
||||
('Ⲱ', 'ⲱ'), ('Ⲳ', 'ⲳ'), ('Ⲵ', 'ⲵ'), ('Ⲷ', 'ⲷ'),
|
||||
('Ⲹ', 'ⲹ'), ('Ⲻ', 'ⲻ'), ('Ⲽ', 'ⲽ'), ('Ⲿ', 'ⲿ'),
|
||||
('Ⳁ', 'ⳁ'), ('Ⳃ', 'ⳃ'), ('Ⳅ', 'ⳅ'), ('Ⳇ', 'ⳇ'),
|
||||
('Ⳉ', 'ⳉ'), ('Ⳋ', 'ⳋ'), ('Ⳍ', 'ⳍ'), ('Ⳏ', 'ⳏ'),
|
||||
('Ⳑ', 'ⳑ'), ('Ⳓ', 'ⳓ'), ('Ⳕ', 'ⳕ'), ('Ⳗ', 'ⳗ'),
|
||||
('Ⳙ', 'ⳙ'), ('Ⳛ', 'ⳛ'), ('Ⳝ', 'ⳝ'), ('Ⳟ', 'ⳟ'),
|
||||
('Ⳡ', 'ⳡ'), ('Ⳣ', 'ⳣ'), ('Ⳬ', 'ⳬ'), ('Ⳮ', 'ⳮ'),
|
||||
('Ⳳ', 'ⳳ'), ('Ꙁ', 'ꙁ'), ('Ꙃ', 'ꙃ'), ('Ꙅ', 'ꙅ'),
|
||||
('Ꙇ', 'ꙇ'), ('Ꙉ', 'ꙉ'), ('Ꙋ', 'ꙋ'), ('Ꙍ', 'ꙍ'),
|
||||
('Ꙏ', 'ꙏ'), ('Ꙑ', 'ꙑ'), ('Ꙓ', 'ꙓ'), ('Ꙕ', 'ꙕ'),
|
||||
('Ꙗ', 'ꙗ'), ('Ꙙ', 'ꙙ'), ('Ꙛ', 'ꙛ'), ('Ꙝ', 'ꙝ'),
|
||||
('Ꙟ', 'ꙟ'), ('Ꙡ', 'ꙡ'), ('Ꙣ', 'ꙣ'), ('Ꙥ', 'ꙥ'),
|
||||
('Ꙧ', 'ꙧ'), ('Ꙩ', 'ꙩ'), ('Ꙫ', 'ꙫ'), ('Ꙭ', 'ꙭ'),
|
||||
('Ꚁ', 'ꚁ'), ('Ꚃ', 'ꚃ'), ('Ꚅ', 'ꚅ'), ('Ꚇ', 'ꚇ'),
|
||||
('Ꚉ', 'ꚉ'), ('Ꚋ', 'ꚋ'), ('Ꚍ', 'ꚍ'), ('Ꚏ', 'ꚏ'),
|
||||
('Ꚑ', 'ꚑ'), ('Ꚓ', 'ꚓ'), ('Ꚕ', 'ꚕ'), ('Ꚗ', 'ꚗ'),
|
||||
('Ꚙ', 'ꚙ'), ('Ꚛ', 'ꚛ'), ('Ꜣ', 'ꜣ'), ('Ꜥ', 'ꜥ'),
|
||||
('Ꜧ', 'ꜧ'), ('Ꜩ', 'ꜩ'), ('Ꜫ', 'ꜫ'), ('Ꜭ', 'ꜭ'),
|
||||
('Ꜯ', 'ꜯ'), ('Ꜳ', 'ꜳ'), ('Ꜵ', 'ꜵ'), ('Ꜷ', 'ꜷ'),
|
||||
('Ꜹ', 'ꜹ'), ('Ꜻ', 'ꜻ'), ('Ꜽ', 'ꜽ'), ('Ꜿ', 'ꜿ'),
|
||||
('Ꝁ', 'ꝁ'), ('Ꝃ', 'ꝃ'), ('Ꝅ', 'ꝅ'), ('Ꝇ', 'ꝇ'),
|
||||
('Ꝉ', 'ꝉ'), ('Ꝋ', 'ꝋ'), ('Ꝍ', 'ꝍ'), ('Ꝏ', 'ꝏ'),
|
||||
('Ꝑ', 'ꝑ'), ('Ꝓ', 'ꝓ'), ('Ꝕ', 'ꝕ'), ('Ꝗ', 'ꝗ'),
|
||||
('Ꝙ', 'ꝙ'), ('Ꝛ', 'ꝛ'), ('Ꝝ', 'ꝝ'), ('Ꝟ', 'ꝟ'),
|
||||
('Ꝡ', 'ꝡ'), ('Ꝣ', 'ꝣ'), ('Ꝥ', 'ꝥ'), ('Ꝧ', 'ꝧ'),
|
||||
('Ꝩ', 'ꝩ'), ('Ꝫ', 'ꝫ'), ('Ꝭ', 'ꝭ'), ('Ꝯ', 'ꝯ'),
|
||||
('Ꝺ', 'ꝺ'), ('Ꝼ', 'ꝼ'), ('Ᵹ', 'ᵹ'), ('Ꝿ', 'ꝿ'),
|
||||
('Ꞁ', 'ꞁ'), ('Ꞃ', 'ꞃ'), ('Ꞅ', 'ꞅ'), ('Ꞇ', 'ꞇ'),
|
||||
('Ꞌ', 'ꞌ'), ('Ɥ', 'ɥ'), ('Ꞑ', 'ꞑ'), ('Ꞓ', 'ꞓ'),
|
||||
('Ꞗ', 'ꞗ'), ('Ꞙ', 'ꞙ'), ('Ꞛ', 'ꞛ'), ('Ꞝ', 'ꞝ'),
|
||||
('Ꞟ', 'ꞟ'), ('Ꞡ', 'ꞡ'), ('Ꞣ', 'ꞣ'), ('Ꞥ', 'ꞥ'),
|
||||
('Ꞧ', 'ꞧ'), ('Ꞩ', 'ꞩ'), ('Ɦ', 'ɦ'), ('Ɜ', 'ɜ'), ('Ɡ', 'ɡ'),
|
||||
('Ɬ', 'ɬ'), ('Ɪ', 'ɪ'), ('Ʞ', 'ʞ'), ('Ʇ', 'ʇ'), ('Ʝ', 'ʝ'),
|
||||
('Ꭓ', 'ꭓ'), ('Ꞵ', 'ꞵ'), ('Ꞷ', 'ꞷ'), ('Ꞹ', 'ꞹ'),
|
||||
('Ꞻ', 'ꞻ'), ('Ꞽ', 'ꞽ'), ('Ꞿ', 'ꞿ'), ('Ꟁ', 'ꟁ'),
|
||||
('Ꟃ', 'ꟃ'), ('Ꞔ', 'ꞔ'), ('Ʂ', 'ʂ'), ('Ᶎ', 'ᶎ'),
|
||||
('Ꟈ', 'ꟈ'), ('Ꟊ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('Ꟗ', 'ꟗ'),
|
||||
('Ꟙ', 'ꟙ'), ('Ꟶ', 'ꟶ'), ('ꭰ', 'Ꭰ'), ('ꭱ', 'Ꭱ'),
|
||||
('ꭲ', 'Ꭲ'), ('ꭳ', 'Ꭳ'), ('ꭴ', 'Ꭴ'), ('ꭵ', 'Ꭵ'),
|
||||
('ꭶ', 'Ꭶ'), ('ꭷ', 'Ꭷ'), ('ꭸ', 'Ꭸ'), ('ꭹ', 'Ꭹ'),
|
||||
('ꭺ', 'Ꭺ'), ('ꭻ', 'Ꭻ'), ('ꭼ', 'Ꭼ'), ('ꭽ', 'Ꭽ'),
|
||||
('ꭾ', 'Ꭾ'), ('ꭿ', 'Ꭿ'), ('ꮀ', 'Ꮀ'), ('ꮁ', 'Ꮁ'),
|
||||
('ꮂ', 'Ꮂ'), ('ꮃ', 'Ꮃ'), ('ꮄ', 'Ꮄ'), ('ꮅ', 'Ꮅ'),
|
||||
('ꮆ', 'Ꮆ'), ('ꮇ', 'Ꮇ'), ('ꮈ', 'Ꮈ'), ('ꮉ', 'Ꮉ'),
|
||||
('ꮊ', 'Ꮊ'), ('ꮋ', 'Ꮋ'), ('ꮌ', 'Ꮌ'), ('ꮍ', 'Ꮍ'),
|
||||
('ꮎ', 'Ꮎ'), ('ꮏ', 'Ꮏ'), ('ꮐ', 'Ꮐ'), ('ꮑ', 'Ꮑ'),
|
||||
('ꮒ', 'Ꮒ'), ('ꮓ', 'Ꮓ'), ('ꮔ', 'Ꮔ'), ('ꮕ', 'Ꮕ'),
|
||||
('ꮖ', 'Ꮖ'), ('ꮗ', 'Ꮗ'), ('ꮘ', 'Ꮘ'), ('ꮙ', 'Ꮙ'),
|
||||
('ꮚ', 'Ꮚ'), ('ꮛ', 'Ꮛ'), ('ꮜ', 'Ꮜ'), ('ꮝ', 'Ꮝ'),
|
||||
('ꮞ', 'Ꮞ'), ('ꮟ', 'Ꮟ'), ('ꮠ', 'Ꮠ'), ('ꮡ', 'Ꮡ'),
|
||||
('ꮢ', 'Ꮢ'), ('ꮣ', 'Ꮣ'), ('ꮤ', 'Ꮤ'), ('ꮥ', 'Ꮥ'),
|
||||
('ꮦ', 'Ꮦ'), ('ꮧ', 'Ꮧ'), ('ꮨ', 'Ꮨ'), ('ꮩ', 'Ꮩ'),
|
||||
('ꮪ', 'Ꮪ'), ('ꮫ', 'Ꮫ'), ('ꮬ', 'Ꮬ'), ('ꮭ', 'Ꮭ'),
|
||||
('ꮮ', 'Ꮮ'), ('ꮯ', 'Ꮯ'), ('ꮰ', 'Ꮰ'), ('ꮱ', 'Ꮱ'),
|
||||
('ꮲ', 'Ꮲ'), ('ꮳ', 'Ꮳ'), ('ꮴ', 'Ꮴ'), ('ꮵ', 'Ꮵ'),
|
||||
('ꮶ', 'Ꮶ'), ('ꮷ', 'Ꮷ'), ('ꮸ', 'Ꮸ'), ('ꮹ', 'Ꮹ'),
|
||||
('ꮺ', 'Ꮺ'), ('ꮻ', 'Ꮻ'), ('ꮼ', 'Ꮼ'), ('ꮽ', 'Ꮽ'),
|
||||
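('ꮾ', 'Ꮾ'), ('ꮿ', 'Ꮿ'), ('\u{ff21}', '\u{ff41}'), ('\u{ff22}', '\u{ff42}'),
|
||||
('\u{ff23}', '\u{ff43}'), ('\u{ff24}', '\u{ff44}'), ('\u{ff25}', '\u{ff45}'), ('\u{ff26}', '\u{ff46}'),
|
||||
('\u{ff27}', '\u{ff47}'), ('\u{ff28}', '\u{ff48}'), ('\u{ff29}', '\u{ff49}'), ('\u{ff2a}', '\u{ff4a}'),
|
||||
('\u{ff2b}', '\u{ff4b}'), ('\u{ff2c}', '\u{ff4c}'), ('\u{ff2d}', '\u{ff4d}'), ('\u{ff2e}', '\u{ff4e}'),
|
||||
('\u{ff2f}', '\u{ff4f}'), ('\u{ff30}', '\u{ff50}'), ('\u{ff31}', '\u{ff51}'), ('\u{ff32}', '\u{ff52}'),
|
||||
('\u{ff33}', '\u{ff53}'), ('\u{ff34}', '\u{ff54}'), ('\u{ff35}', '\u{ff55}'), ('\u{ff36}', '\u{ff56}'),
|
||||
('\u{ff37}', '\u{ff57}'), ('\u{ff38}', '\u{ff58}'), ('\u{ff39}', '\u{ff59}'), ('\u{ff3a}', '\u{ff5a}'),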
|
||||
('𐐀', '𐐨'), ('𐐁', '𐐩'), ('𐐂', '𐐪'), ('𐐃', '𐐫'),
|
||||
('𐐄', '𐐬'), ('𐐅', '𐐭'), ('𐐆', '𐐮'), ('𐐇', '𐐯'),
|
||||
('𐐈', '𐐰'), ('𐐉', '𐐱'), ('𐐊', '𐐲'), ('𐐋', '𐐳'),
|
||||
('𐐌', '𐐴'), ('𐐍', '𐐵'), ('𐐎', '𐐶'), ('𐐏', '𐐷'),
|
||||
('𐐐', '𐐸'), ('𐐑', '𐐹'), ('𐐒', '𐐺'), ('𐐓', '𐐻'),
|
||||
('𐐔', '𐐼'), ('𐐕', '𐐽'), ('𐐖', '𐐾'), ('𐐗', '𐐿'),
|
||||
('𐐘', '𐑀'), ('𐐙', '𐑁'), ('𐐚', '𐑂'), ('𐐛', '𐑃'),
|
||||
('𐐜', '𐑄'), ('𐐝', '𐑅'), ('𐐞', '𐑆'), ('𐐟', '𐑇'),
|
||||
('𐐠', '𐑈'), ('𐐡', '𐑉'), ('𐐢', '𐑊'), ('𐐣', '𐑋'),
|
||||
('𐐤', '𐑌'), ('𐐥', '𐑍'), ('𐐦', '𐑎'), ('𐐧', '𐑏'),
|
||||
('𐒰', '𐓘'), ('𐒱', '𐓙'), ('𐒲', '𐓚'), ('𐒳', '𐓛'),
|
||||
('𐒴', '𐓜'), ('𐒵', '𐓝'), ('𐒶', '𐓞'), ('𐒷', '𐓟'),
|
||||
('𐒸', '𐓠'), ('𐒹', '𐓡'), ('𐒺', '𐓢'), ('𐒻', '𐓣'),
|
||||
('𐒼', '𐓤'), ('𐒽', '𐓥'), ('𐒾', '𐓦'), ('𐒿', '𐓧'),
|
||||
('𐓀', '𐓨'), ('𐓁', '𐓩'), ('𐓂', '𐓪'), ('𐓃', '𐓫'),
|
||||
('𐓄', '𐓬'), ('𐓅', '𐓭'), ('𐓆', '𐓮'), ('𐓇', '𐓯'),
|
||||
('𐓈', '𐓰'), ('𐓉', '𐓱'), ('𐓊', '𐓲'), ('𐓋', '𐓳'),
|
||||
('𐓌', '𐓴'), ('𐓍', '𐓵'), ('𐓎', '𐓶'), ('𐓏', '𐓷'),
|
||||
('𐓐', '𐓸'), ('𐓑', '𐓹'), ('𐓒', '𐓺'), ('𐓓', '𐓻'),
|
||||
('𐕰', '𐖗'), ('𐕱', '𐖘'), ('𐕲', '𐖙'), ('𐕳', '𐖚'),
|
||||
('𐕴', '𐖛'), ('𐕵', '𐖜'), ('𐕶', '𐖝'), ('𐕷', '𐖞'),
|
||||
('𐕸', '𐖟'), ('𐕹', '𐖠'), ('𐕺', '𐖡'), ('𐕼', '𐖣'),
|
||||
('𐕽', '𐖤'), ('𐕾', '𐖥'), ('𐕿', '𐖦'), ('𐖀', '𐖧'),
|
||||
('𐖁', '𐖨'), ('𐖂', '𐖩'), ('𐖃', '𐖪'), ('𐖄', '𐖫'),
|
||||
('𐖅', '𐖬'), ('𐖆', '𐖭'), ('𐖇', '𐖮'), ('𐖈', '𐖯'),
|
||||
('𐖉', '𐖰'), ('𐖊', '𐖱'), ('𐖌', '𐖳'), ('𐖍', '𐖴'),
|
||||
('𐖎', '𐖵'), ('𐖏', '𐖶'), ('𐖐', '𐖷'), ('𐖑', '𐖸'),
|
||||
('𐖒', '𐖹'), ('𐖔', '𐖻'), ('𐖕', '𐖼'), ('𐲀', '𐳀'),
|
||||
('𐲁', '𐳁'), ('𐲂', '𐳂'), ('𐲃', '𐳃'), ('𐲄', '𐳄'),
|
||||
('𐲅', '𐳅'), ('𐲆', '𐳆'), ('𐲇', '𐳇'), ('𐲈', '𐳈'),
|
||||
('𐲉', '𐳉'), ('𐲊', '𐳊'), ('𐲋', '𐳋'), ('𐲌', '𐳌'),
|
||||
('𐲍', '𐳍'), ('𐲎', '𐳎'), ('𐲏', '𐳏'), ('𐲐', '𐳐'),
|
||||
('𐲑', '𐳑'), ('𐲒', '𐳒'), ('𐲓', '𐳓'), ('𐲔', '𐳔'),
|
||||
('𐲕', '𐳕'), ('𐲖', '𐳖'), ('𐲗', '𐳗'), ('𐲘', '𐳘'),
|
||||
('𐲙', '𐳙'), ('𐲚', '𐳚'), ('𐲛', '𐳛'), ('𐲜', '𐳜'),
|
||||
('𐲝', '𐳝'), ('𐲞', '𐳞'), ('𐲟', '𐳟'), ('𐲠', '𐳠'),
|
||||
('𐲡', '𐳡'), ('𐲢', '𐳢'), ('𐲣', '𐳣'), ('𐲤', '𐳤'),
|
||||
('𐲥', '𐳥'), ('𐲦', '𐳦'), ('𐲧', '𐳧'), ('𐲨', '𐳨'),
|
||||
('𐲩', '𐳩'), ('𐲪', '𐳪'), ('𐲫', '𐳫'), ('𐲬', '𐳬'),
|
||||
('𐲭', '𐳭'), ('𐲮', '𐳮'), ('𐲯', '𐳯'), ('𐲰', '𐳰'),
|
||||
('𐲱', '𐳱'), ('𐲲', '𐳲'), ('𑢠', '𑣀'), ('𑢡', '𑣁'),
|
||||
('𑢢', '𑣂'), ('𑢣', '𑣃'), ('𑢤', '𑣄'), ('𑢥', '𑣅'),
|
||||
('𑢦', '𑣆'), ('𑢧', '𑣇'), ('𑢨', '𑣈'), ('𑢩', '𑣉'),
|
||||
('𑢪', '𑣊'), ('𑢫', '𑣋'), ('𑢬', '𑣌'), ('𑢭', '𑣍'),
|
||||
('𑢮', '𑣎'), ('𑢯', '𑣏'), ('𑢰', '𑣐'), ('𑢱', '𑣑'),
|
||||
('𑢲', '𑣒'), ('𑢳', '𑣓'), ('𑢴', '𑣔'), ('𑢵', '𑣕'),
|
||||
('𑢶', '𑣖'), ('𑢷', '𑣗'), ('𑢸', '𑣘'), ('𑢹', '𑣙'),
|
||||
('𑢺', '𑣚'), ('𑢻', '𑣛'), ('𑢼', '𑣜'), ('𑢽', '𑣝'),
|
||||
('𑢾', '𑣞'), ('𑢿', '𑣟'), ('𖹀', '𖹠'), ('𖹁', '𖹡'),
|
||||
('𖹂', '𖹢'), ('𖹃', '𖹣'), ('𖹄', '𖹤'), ('𖹅', '𖹥'),
|
||||
('𖹆', '𖹦'), ('𖹇', '𖹧'), ('𖹈', '𖹨'), ('𖹉', '𖹩'),
|
||||
('𖹊', '𖹪'), ('𖹋', '𖹫'), ('𖹌', '𖹬'), ('𖹍', '𖹭'),
|
||||
('𖹎', '𖹮'), ('𖹏', '𖹯'), ('𖹐', '𖹰'), ('𖹑', '𖹱'),
|
||||
('𖹒', '𖹲'), ('𖹓', '𖹳'), ('𖹔', '𖹴'), ('𖹕', '𖹵'),
|
||||
('𖹖', '𖹶'), ('𖹗', '𖹷'), ('𖹘', '𖹸'), ('𖹙', '𖹹'),
|
||||
('𖹚', '𖹺'), ('𖹛', '𖹻'), ('𖹜', '𖹼'), ('𖹝', '𖹽'),
|
||||
('𖹞', '𖹾'), ('𖹟', '𖹿'), ('𞤀', '𞤢'), ('𞤁', '𞤣'),
|
||||
('𞤂', '𞤤'), ('𞤃', '𞤥'), ('𞤄', '𞤦'), ('𞤅', '𞤧'),
|
||||
('𞤆', '𞤨'), ('𞤇', '𞤩'), ('𞤈', '𞤪'), ('𞤉', '𞤫'),
|
||||
('𞤊', '𞤬'), ('𞤋', '𞤭'), ('𞤌', '𞤮'), ('𞤍', '𞤯'),
|
||||
('𞤎', '𞤰'), ('𞤏', '𞤱'), ('𞤐', '𞤲'), ('𞤑', '𞤳'),
|
||||
('𞤒', '𞤴'), ('𞤓', '𞤵'), ('𞤔', '𞤶'), ('𞤕', '𞤷'),
|
||||
('𞤖', '𞤸'), ('𞤗', '𞤹'), ('𞤘', '𞤺'), ('𞤙', '𞤻'),
|
||||
('𞤚', '𞤼'), ('𞤛', '𞤽'), ('𞤜', '𞤾'), ('𞤝', '𞤿'),
|
||||
('𞤞', '𞥀'), ('𞤟', '𞥁'), ('𞤠', '𞥂'), ('𞤡', '𞥃'),
|
||||
];
|
187
src/config.rs
Normal file
187
src/config.rs
Normal file
@ -0,0 +1,187 @@
|
||||
pub struct MatcherConfig {
|
||||
pub score_match: i16,
|
||||
pub score_gap_start: i16,
|
||||
pub score_gap_extension: i16,
|
||||
|
||||
// We prefer matches at the beginning of a word, but the bonus should not be
|
||||
// too great to prevent the longer acronym matches from always winning over
|
||||
// shorter fuzzy matches. The bonus point here was specifically chosen that
|
||||
// the bonus is cancelled when the gap between the acronyms grows over
|
||||
// 8 characters, which is approximately the average length of the words found
|
||||
// in web2 dictionary and my file system.
|
||||
pub bonus_boundary: i16,
|
||||
|
||||
// Although bonus point for non-word characters is non-contextual, we need it
|
||||
// for computing bonus points for consecutive chunks starting with a non-word
|
||||
// character.
|
||||
pub bonus_non_word: i16,
|
||||
|
||||
// Edge-triggered bonus for matches in camelCase words.
|
||||
// Compared to word-boundary case, they don't accompany single-character gaps
|
||||
// (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
|
||||
pub bonus_camel123: i16,
|
||||
|
||||
// Minimum bonus point given to characters in consecutive chunks.
|
||||
// Note that bonus points for consecutive matches shouldn't have needed if we
|
||||
// used fixed match score as in the original algorithm.
|
||||
pub bonus_consecutive: i16,
|
||||
|
||||
// The first character in the typed pattern usually has more significance
|
||||
// than the rest so it's important that it appears at special positions where
|
||||
// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo".
|
||||
// The amount of the extra bonus should be limited so that the gap penalty is
|
||||
// still respected.
|
||||
pub bonus_first_char_multiplier: i16,
|
||||
|
||||
pub delimeter_chars: &'static [u8],
|
||||
/// Extra bonus for word boundary after whitespace character or beginning of the string
|
||||
pub bonus_boundary_white: i16,
|
||||
|
||||
// Extra bonus for word boundary after slash, colon, semi-colon, and comma
|
||||
pub bonus_boundary_delimiter: i16,
|
||||
pub inital_char_class: CharClass,
|
||||
/// Whether to normalize latin script charaters to ASCII
|
||||
/// this significantly degrades performance so its not recommended
|
||||
/// to be truned on by default
|
||||
pub normalize: bool,
|
||||
/// use faster/simpler algorithm at the cost of (potentially) much worse results
|
||||
/// For long inputs this algorithm is always used as a fallback to avoid
|
||||
/// blowups in time complexity
|
||||
pub use_v1: bool,
|
||||
/// The case matching to perform
|
||||
pub case_matching: CaseMatching,
|
||||
}
|
||||
|
||||
/// Coarse classification of a single character.
///
/// The derived ordering follows the declaration order of the variants and is
/// used by comparisons such as `class > CharClass::NonWord`, so the variants
/// must not be reordered.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CharClass {
    /// A whitespace character.
    Whitespace,
    /// Anything that falls into none of the other classes.
    NonWord,
    /// An ASCII character listed in the configured delimiter set.
    Delimiter,
    /// A lowercase letter.
    Lower,
    /// An uppercase letter.
    Upper,
    /// A non-ASCII alphabetic character that is neither lowercase nor uppercase.
    Letter,
    /// A digit or other numeric character.
    Number,
}
|
||||
|
||||
/// How the case of letters is treated when comparing needle and haystack.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CaseMatching {
    /// Characters are compared exactly as written.
    Respect,
    /// Characters are lowercased before they are compared.
    Ignore,
    /// Case handling is decided per query by looking for uppercase
    /// characters in the needle.
    Smart,
}
|
||||
|
||||
impl MatcherConfig {
|
||||
pub const DEFAULT: Self = {
|
||||
let score_match = 16;
|
||||
let score_gap_start = -3;
|
||||
let score_gap_extension = -1;
|
||||
let bonus_boundary = score_match / 2;
|
||||
MatcherConfig {
|
||||
score_match,
|
||||
score_gap_start,
|
||||
score_gap_extension,
|
||||
bonus_boundary,
|
||||
bonus_non_word: score_match / 2,
|
||||
bonus_camel123: bonus_boundary + score_gap_extension,
|
||||
bonus_consecutive: -(score_gap_start + score_gap_extension),
|
||||
bonus_first_char_multiplier: 2,
|
||||
delimeter_chars: b"/,:;|",
|
||||
bonus_boundary_white: bonus_boundary + 2,
|
||||
bonus_boundary_delimiter: bonus_boundary + 1,
|
||||
inital_char_class: CharClass::Whitespace,
|
||||
normalize: false,
|
||||
use_v1: false,
|
||||
case_matching: CaseMatching::Smart,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
impl MatcherConfig {
|
||||
pub fn set_match_paths(&mut self) {
|
||||
if cfg!(windows) {
|
||||
self.delimeter_chars = b"/\\";
|
||||
} else {
|
||||
self.delimeter_chars = b"/";
|
||||
}
|
||||
self.bonus_boundary_white = self.bonus_boundary;
|
||||
self.inital_char_class = CharClass::Delimiter;
|
||||
}
|
||||
|
||||
pub const fn match_paths(mut self) -> Self {
|
||||
if cfg!(windows) {
|
||||
self.delimeter_chars = b"/\\";
|
||||
} else {
|
||||
self.delimeter_chars = b"/";
|
||||
}
|
||||
self.bonus_boundary_white = self.bonus_boundary;
|
||||
self.inital_char_class = CharClass::Delimiter;
|
||||
self
|
||||
}
|
||||
|
||||
fn char_class_non_ascii(c: char) -> CharClass {
|
||||
if c.is_lowercase() {
|
||||
CharClass::Lower
|
||||
} else if c.is_uppercase() {
|
||||
CharClass::Upper
|
||||
} else if c.is_numeric() {
|
||||
CharClass::Number
|
||||
} else if c.is_alphabetic() {
|
||||
CharClass::Letter
|
||||
} else if c.is_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
|
||||
fn char_class_ascii(&self, c: char) -> CharClass {
|
||||
// using manual if conditions instead optimizes better
|
||||
if c >= 'a' && c <= 'z' {
|
||||
CharClass::Lower
|
||||
} else if c >= 'A' && c <= 'Z' {
|
||||
CharClass::Upper
|
||||
} else if c >= '0' && c <= '9' {
|
||||
CharClass::Number
|
||||
} else if c.is_ascii_whitespace() {
|
||||
CharClass::Whitespace
|
||||
} else if self.delimeter_chars.contains(&(c as u8)) {
|
||||
CharClass::Delimiter
|
||||
} else {
|
||||
CharClass::NonWord
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn char_class(&self, c: char) -> CharClass {
|
||||
if c.is_ascii() {
|
||||
self.char_class_ascii(c)
|
||||
} else {
|
||||
Self::char_class_non_ascii(c)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> i16 {
|
||||
if class > CharClass::NonWord {
|
||||
// transition from non word to word
|
||||
match prev_class {
|
||||
CharClass::Whitespace => return self.bonus_boundary_white,
|
||||
CharClass::Delimiter => return self.bonus_boundary_delimiter,
|
||||
CharClass::NonWord => return self.bonus_boundary,
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
if prev_class == CharClass::Lower && class == CharClass::Upper
|
||||
|| prev_class != CharClass::Number && class == CharClass::Number
|
||||
{
|
||||
// camelCase letter123
|
||||
self.bonus_camel123
|
||||
} else if class == CharClass::NonWord {
|
||||
self.bonus_non_word
|
||||
} else if class == CharClass::Whitespace {
|
||||
self.bonus_boundary_white
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
605
src/lib.rs
Normal file
605
src/lib.rs
Normal file
@ -0,0 +1,605 @@
|
||||
// sadly this doesn't optimize well currently
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::cmp::max;
|
||||
|
||||
use memchr::{memchr, memchr2};
|
||||
use normalize::normalize;
|
||||
|
||||
//autogenerated by generate-ucd
|
||||
#[allow(warnings)]
|
||||
#[rustfmt::skip]
|
||||
mod case_fold;
|
||||
mod config;
|
||||
mod normalize;
|
||||
|
||||
pub use config::{CaseMatching, CharClass, MatcherConfig};
|
||||
|
||||
const MAX_MATRIX_SIZE: usize = 75 * 1024; // 300KB
|
||||
const MAX_HAYSTACK_LEN: usize = 8192; // 64KB
|
||||
|
||||
/// One cell of the scoring matrix used by the matcher.
#[derive(Copy, Clone, Eq, PartialEq)]
struct MatrixCell {
    /// Score stored for this cell.
    score: i16,
    /// Length of the run of consecutively matched characters that ends in
    /// this cell; `0` when the cell does not hold a match.
    consecutive_chars: u16,
}
|
||||
|
||||
/// A haystack character as cached by the matcher, together with its bonus.
#[derive(Copy, Clone, Eq, PartialEq)]
struct HaystackChar {
    /// The character after case folding / normalization has been applied.
    char: char,
    /// Positional bonus awarded when this character is matched.
    bonus: i16,
}
|
||||
|
||||
pub struct Matcher {
|
||||
pub config: MatcherConfig,
|
||||
matrix: Box<[MatrixCell; MAX_MATRIX_SIZE]>,
|
||||
haystack: Box<[HaystackChar; MAX_HAYSTACK_LEN]>,
|
||||
// needle can be at most as long as the haystack
|
||||
first_needle_occurance: Box<[u16; MAX_HAYSTACK_LEN]>,
|
||||
}
|
||||
|
||||
/// A needle that has been preprocessed for repeated matching.
pub struct Query {
    /// The needle characters, already case folded / normalized as required.
    needle_chars: Vec<char>,
    /// `true` as long as every character pushed so far was ASCII.
    is_ascii: bool,
    /// Whether haystack characters are lowercased before being compared.
    ignore_case: bool,
}
|
||||
|
||||
impl Query {
|
||||
fn push(&mut self, needle: &str, normalize_: bool, smart_case: bool) {
|
||||
self.needle_chars.reserve(needle.len());
|
||||
self.needle_chars.extend(needle.chars().map(|mut c| {
|
||||
if !c.is_ascii() {
|
||||
self.is_ascii = false;
|
||||
}
|
||||
if smart_case {
|
||||
if c.is_uppercase() {
|
||||
self.ignore_case = false;
|
||||
}
|
||||
} else if self.ignore_case {
|
||||
if self.is_ascii {
|
||||
c = to_lower_case::<true>(c)
|
||||
} else {
|
||||
c = to_lower_case::<false>(c)
|
||||
}
|
||||
}
|
||||
if normalize_ && !self.is_ascii {
|
||||
c = normalize(c);
|
||||
}
|
||||
c
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> {
|
||||
if c >= b'a' || c <= b'z' {
|
||||
memchr2(c, c + 32, haystack)
|
||||
} else {
|
||||
memchr(c, haystack)
|
||||
}
|
||||
}
|
||||
/// Allocates a `[T; LEN]` directly on the heap with every byte set to zero,
/// without first building the array on the stack.
///
/// # Safety
///
/// The all-zero bit pattern must be a valid value of `T`.
unsafe fn zeroed_array_on_heap<T: Copy, const LEN: usize>() -> Box<[T; LEN]> {
    let layout = Layout::new::<[T; LEN]>();
    if layout.size() == 0 {
        // The global allocator must never be asked for a zero-sized
        // allocation. A box of a zero-sized type only needs a non-null,
        // well-aligned pointer, which `dangling` provides.
        return Box::from_raw(std::ptr::NonNull::<[T; LEN]>::dangling().as_ptr());
    }
    let res = std::alloc::alloc_zeroed(layout);
    if res.is_null() {
        std::alloc::handle_alloc_error(layout)
    }
    Box::from_raw(res.cast::<[T; LEN]>())
}
|
||||
|
||||
impl Matcher {
|
||||
pub fn new(config: MatcherConfig) -> Self {
|
||||
// Safety: all data allocated here is just integers/structs that contain
|
||||
// integers so zeroed values are legal
|
||||
unsafe {
|
||||
Self {
|
||||
config,
|
||||
matrix: zeroed_array_on_heap(),
|
||||
haystack: zeroed_array_on_heap(),
|
||||
first_needle_occurance: zeroed_array_on_heap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compile_query(&self, needle: &str) -> Query {
|
||||
let mut query = Query {
|
||||
needle_chars: Vec::new(),
|
||||
is_ascii: true,
|
||||
ignore_case: self.config.case_matching == CaseMatching::Ignore,
|
||||
};
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
query
|
||||
}
|
||||
pub fn recompile_query(&self, query: &mut Query, needle: &str) {
|
||||
query.needle_chars.clear();
|
||||
query.is_ascii = false;
|
||||
query.ignore_case = self.config.case_matching == CaseMatching::Ignore;
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
}
|
||||
pub fn append_query(&self, query: &mut Query, needle: &str) {
|
||||
query.push(
|
||||
needle,
|
||||
self.config.normalize,
|
||||
self.config.case_matching == CaseMatching::Smart,
|
||||
);
|
||||
}
|
||||
|
||||
/// Matches `query` against `haystack` and returns the score of the match,
/// or `None` if the haystack does not match.
///
/// An empty query matches every haystack with a score of `0`. Haystacks
/// longer than `u32::MAX` bytes are truncated (on a character boundary)
/// before matching.
pub fn fuzzy_match(&mut self, query: &Query, mut haystack: &str) -> Option<i32> {
    if query.needle_chars.is_empty() {
        // nothing to search for; the matchers below require a first needle char
        return Some(0);
    }
    if haystack.len() > u32::MAX as usize {
        // step back to a character boundary: slicing in the middle of a
        // multi-byte sequence would panic
        let mut cut = u32::MAX as usize;
        while !haystack.is_char_boundary(cut) {
            cut -= 1;
        }
        haystack = &haystack[..cut];
    }
    let ascii_only = query.is_ascii && !self.config.normalize;
    // positions are not collected here, this vector stays empty
    let mut unused = Vec::new();
    match (self.config.use_v1, ascii_only) {
        (true, true) => self.fuzzy_matcher_v1::<false, true>(query, haystack, &mut unused),
        (true, false) => self.fuzzy_matcher_v1::<false, false>(query, haystack, &mut unused),
        (false, true) => self.fuzzy_matcher_v2::<false, true>(query, haystack, &mut unused),
        (false, false) => self.fuzzy_matcher_v2::<false, false>(query, haystack, &mut unused),
    }
}
|
||||
|
||||
pub fn fuzzy_indicies(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
mut haystack: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<i32> {
|
||||
if haystack.len() > u32::MAX as usize {
|
||||
haystack = &haystack[..u32::MAX as usize]
|
||||
}
|
||||
if self.config.use_v1 {
|
||||
if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v1::<true, true>(query, haystack, indicies)
|
||||
} else {
|
||||
self.fuzzy_matcher_v1::<true, false>(query, haystack, indicies)
|
||||
}
|
||||
} else if query.is_ascii && !self.config.normalize {
|
||||
self.fuzzy_matcher_v2::<true, true>(query, haystack, indicies)
|
||||
} else {
|
||||
self.fuzzy_matcher_v2::<true, false>(query, haystack, indicies)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn normalize_char<const ASCII_ONLY: bool>(&self, ignore_case: bool, mut c: char) -> char {
|
||||
if ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c)
|
||||
}
|
||||
if !ASCII_ONLY && self.config.normalize {
|
||||
c = normalize(c)
|
||||
}
|
||||
c
|
||||
}
|
||||
|
||||
fn prefilter_ascii(&self, query: &Query, mut haystack: &[u8]) -> Option<(usize, usize)> {
|
||||
let needle = &query.needle_chars;
|
||||
if query.ignore_case {
|
||||
let first_idx = find_ascii_ignore_case(needle[0] as u8, haystack)?;
|
||||
let mut last_idx = first_idx + 1;
|
||||
haystack = &haystack[last_idx..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = find_ascii_ignore_case(c as u8, haystack)? + 1;
|
||||
last_idx += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
Some((first_idx, last_idx))
|
||||
} else {
|
||||
let first_idx = memchr(needle[0] as u8, haystack)?;
|
||||
let mut last_idx = first_idx + 1;
|
||||
haystack = &haystack[last_idx..];
|
||||
for &c in &needle[1..] {
|
||||
let idx = memchr(c as u8, haystack)? + 1;
|
||||
last_idx += idx;
|
||||
haystack = &haystack[idx..];
|
||||
}
|
||||
Some((first_idx, last_idx))
|
||||
}
|
||||
}
|
||||
|
||||
/// Prefilter for the non-ASCII path: locates the first haystack character
/// that equals the first needle character after folding.
///
/// Returns the byte range of that character in `haystack`, or `None` if
/// there is no such character.
fn prefilter_non_ascii(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
    let needle_char = query.needle_chars[0];
    haystack.char_indices().find_map(|(match_start, raw)| {
        let folded = self.normalize_char::<false>(query.ignore_case, raw);
        if folded == needle_char {
            // Use the length of the character as it appears in the haystack:
            // the folded form may be encoded with fewer bytes, which would
            // put the end offset in the middle of the character.
            Some((match_start, match_start + raw.len_utf8()))
        } else {
            None
        }
    })
}
|
||||
|
||||
/// Cheap rejection test that runs before the actual matching.
///
/// Returns `None` if `haystack` cannot match `query`. Otherwise returns a
/// byte range whose start is the first occurrence of the first needle
/// character; the meaning of the end depends on the prefilter that ran
/// (end of the greedy match on the byte-level path, end of the first matched
/// character otherwise).
fn prefilter(&self, query: &Query, haystack: &str) -> Option<(usize, usize)> {
    // quickly reject small matches (the byte length is an upper bound for
    // the number of characters)
    if query.needle_chars.len() > haystack.len() {
        return None;
    }
    // The raw byte search cannot see haystack characters that only equal a
    // needle character after normalization, so it may only be used when
    // normalization is off. This is the same condition the callers use to
    // select their `ASCII_ONLY` code path.
    if query.is_ascii && !self.config.normalize {
        self.prefilter_ascii(query, haystack.as_bytes())
    } else {
        self.prefilter_non_ascii(query, haystack)
    }
}
|
||||
|
||||
fn fuzzy_matcher_v1<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
haystack: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<i32> {
|
||||
let (start, end) = self.prefilter(query, haystack)?;
|
||||
self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
||||
query, haystack, start, end, indicies,
|
||||
)
|
||||
}
|
||||
|
||||
fn fuzzy_matcher_v1_with_prefilter<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
haystack: &str,
|
||||
mut start: usize,
|
||||
mut end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<i32> {
|
||||
let first_char_end = if ASCII_ONLY { start + 1 } else { end };
|
||||
if !ASCII_ONLY && query.needle_chars.len() != 1 {
|
||||
let mut needle_iter = query.needle_chars[1..].iter().copied();
|
||||
if let Some(mut needle_char) = needle_iter.next() {
|
||||
let haystack = haystack[first_char_end..]
|
||||
.char_indices()
|
||||
.rev()
|
||||
.map(|(i, c)| (i, self.normalize_char::<false>(query.ignore_case, c)));
|
||||
for (i, c) in haystack {
|
||||
if c == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
end = i + c.len_utf8();
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// very simple, just mimimize from the back
|
||||
let match_ = haystack[first_char_end..end]
|
||||
.char_indices()
|
||||
.rev()
|
||||
.map(|(i, c)| (i, self.normalize_char::<ASCII_ONLY>(query.ignore_case, c)));
|
||||
|
||||
let mut needle_iter = query.needle_chars[..].iter().rev().copied();
|
||||
let mut needle_char = needle_iter.next().unwrap();
|
||||
for (i, c) in match_ {
|
||||
if c == needle_char {
|
||||
let Some(next_needle_char) = needle_iter.next() else {
|
||||
start = i;
|
||||
break;
|
||||
};
|
||||
needle_char = next_needle_char;
|
||||
}
|
||||
}
|
||||
Some(self.calculate_score::<INDICIES, ASCII_ONLY>(query, haystack, start, end, indicies))
|
||||
}
|
||||
|
||||
fn calculate_score<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
text: &str,
|
||||
match_start: usize,
|
||||
match_end: usize,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> i32 {
|
||||
if INDICIES {
|
||||
indicies.reserve(query.needle_chars.len());
|
||||
}
|
||||
let mut prev_class = text[..match_start]
|
||||
.chars()
|
||||
.next_back()
|
||||
.map(|c| self.config.char_class(c))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
let mut needle_idx = 0;
|
||||
let mut score = 0i32;
|
||||
let mut in_gap = false;
|
||||
let mut consecutive = 0;
|
||||
let mut first_bonus = 0i16;
|
||||
for (i, mut c) in text[match_start..match_end].char_indices() {
|
||||
let class = self.config.char_class(c);
|
||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c);
|
||||
}
|
||||
if self.config.normalize && !ASCII_ONLY {
|
||||
c = normalize(c)
|
||||
}
|
||||
if c == query.needle_chars[needle_idx] {
|
||||
if INDICIES {
|
||||
indicies.push(i as u32)
|
||||
}
|
||||
score += self.config.score_match as i32;
|
||||
let mut bonus = self.config.bonus_for(prev_class, class);
|
||||
if consecutive == 0 {
|
||||
first_bonus = bonus
|
||||
} else {
|
||||
// Break consecutive chunk
|
||||
if bonus >= self.config.bonus_boundary && bonus > first_bonus {
|
||||
first_bonus = bonus
|
||||
}
|
||||
bonus = max(
|
||||
max(bonus, first_bonus),
|
||||
self.config.bonus_first_char_multiplier,
|
||||
);
|
||||
}
|
||||
if needle_idx == 0 {
|
||||
bonus *= self.config.bonus_first_char_multiplier
|
||||
}
|
||||
score += bonus as i32;
|
||||
needle_idx += 1;
|
||||
in_gap = false;
|
||||
consecutive += 1;
|
||||
} else {
|
||||
if in_gap {
|
||||
score += self.config.score_gap_extension as i32
|
||||
} else {
|
||||
score += self.config.score_gap_start as i32
|
||||
}
|
||||
in_gap = true;
|
||||
consecutive = 0;
|
||||
first_bonus = 0;
|
||||
}
|
||||
prev_class = class;
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
fn fuzzy_matcher_v2<const INDICIES: bool, const ASCII_ONLY: bool>(
|
||||
&mut self,
|
||||
query: &Query,
|
||||
text: &str,
|
||||
indicies: &mut Vec<u32>,
|
||||
) -> Option<i32> {
|
||||
let (start, prefilter_end) = self.prefilter(query, text)?;
|
||||
let text_len = text.len() - start;
|
||||
// fallback to v1 algorithms for long haystacks
|
||||
// technically we need to multiply by char len here
|
||||
// but counting chars has a lot of unecessary overhead that we can avoid
|
||||
// here in practice using bytelen should be a reasonable approximation
|
||||
// we also differ from fzf here in that we never allocate and instead stringintly check here
|
||||
if text_len > u16::MAX as usize || text_len * query.needle_chars.len() > MAX_HAYSTACK_LEN {
|
||||
return self.fuzzy_matcher_v1_with_prefilter::<INDICIES, ASCII_ONLY>(
|
||||
query,
|
||||
text,
|
||||
start,
|
||||
prefilter_end,
|
||||
indicies,
|
||||
);
|
||||
}
|
||||
|
||||
let mut prev_class = text[..start]
|
||||
.chars()
|
||||
.next_back()
|
||||
.map(|c| self.config.char_class(c))
|
||||
.unwrap_or(self.config.inital_char_class);
|
||||
|
||||
let text = &text[start..];
|
||||
|
||||
let mut needle_iter = query.needle_chars[..]
|
||||
.iter()
|
||||
.copied()
|
||||
.zip(self.first_needle_occurance.iter_mut());
|
||||
let (mut needle_char, mut needle_char_idx) = needle_iter.next().unwrap();
|
||||
|
||||
let iter = text[start..]
|
||||
.chars()
|
||||
.zip(self.matrix.iter_mut())
|
||||
.zip(self.haystack.iter_mut())
|
||||
.enumerate();
|
||||
|
||||
let mut last_matched_idx = 0;
|
||||
let mut max_score = 0;
|
||||
let mut max_score_pos = 0;
|
||||
let mut in_gap = false;
|
||||
let mut prev_score = 0;
|
||||
let mut matched = false;
|
||||
|
||||
let first_needle_char = query.needle_chars[0];
|
||||
for (i, ((mut c, matrix_cell), char_info)) in iter {
|
||||
let class = self.config.char_class(c);
|
||||
if (ASCII_ONLY || class == CharClass::Upper) && query.ignore_case {
|
||||
c = to_lower_case::<ASCII_ONLY>(c);
|
||||
}
|
||||
if self.config.normalize && !ASCII_ONLY {
|
||||
c = normalize(c)
|
||||
}
|
||||
char_info.char = c;
|
||||
let bonus = self.config.bonus_for(prev_class, class);
|
||||
char_info.char = c;
|
||||
prev_class = class;
|
||||
|
||||
let i = i as u16;
|
||||
if c == needle_char {
|
||||
// save the first idx of each char
|
||||
if let Some(next) = needle_iter.next() {
|
||||
*needle_char_idx = i;
|
||||
(needle_char, needle_char_idx) = next
|
||||
} else {
|
||||
// we have atleast one match
|
||||
matched = true;
|
||||
}
|
||||
// and the last matched char
|
||||
last_matched_idx = i;
|
||||
}
|
||||
if c == first_needle_char {
|
||||
let score =
|
||||
self.config.score_match + bonus * self.config.bonus_first_char_multiplier;
|
||||
matrix_cell.consecutive_chars = 1;
|
||||
if query.needle_chars.len() == 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_pos = i;
|
||||
// can't get better than this
|
||||
if bonus >= self.config.bonus_boundary {
|
||||
break;
|
||||
}
|
||||
}
|
||||
in_gap = false;
|
||||
} else {
|
||||
let gap_score = if in_gap {
|
||||
self.config.score_gap_extension
|
||||
} else {
|
||||
self.config.score_gap_start
|
||||
};
|
||||
matrix_cell.score = max(0, gap_score + prev_score);
|
||||
matrix_cell.consecutive_chars = 0;
|
||||
in_gap = true;
|
||||
}
|
||||
prev_score = matrix_cell.score;
|
||||
}
|
||||
if !matched {
|
||||
debug_assert!(!ASCII_ONLY, "prefilter should have rejected");
|
||||
return None;
|
||||
}
|
||||
if query.needle_chars.len() == 1 {
|
||||
indicies.push(max_score_pos as u32);
|
||||
return Some(max_score as i32);
|
||||
}
|
||||
assert_eq!(
|
||||
self.first_needle_occurance[0], 0,
|
||||
"prefilter should have put us at the start of the match"
|
||||
);
|
||||
let haystack_len = last_matched_idx as usize + 1;
|
||||
let (max_score, best_match_end) = self.popultate_matrix(haystack_len, query);
|
||||
if INDICIES {
|
||||
indicies.reserve(query.needle_chars.len());
|
||||
let mut col = best_match_end;
|
||||
let mut needle_iter = self.matrix[..haystack_len * query.needle_chars.len()]
|
||||
.windows(haystack_len)
|
||||
.zip(self.first_needle_occurance[..haystack_len].iter())
|
||||
.rev()
|
||||
.peekable();
|
||||
let mut next_row = None;
|
||||
let (mut row, mut first_needle_occurance) = needle_iter.next().unwrap();
|
||||
let mut prefer_match = true;
|
||||
loop {
|
||||
let score = row[col as usize].score;
|
||||
let mut score1 = 0;
|
||||
let mut score2 = 0;
|
||||
if let Some((prev_row, _)) = needle_iter.peek() {
|
||||
if col >= *first_needle_occurance {
|
||||
score1 = prev_row[col as usize].score;
|
||||
}
|
||||
}
|
||||
if col > *first_needle_occurance {
|
||||
score2 = row[col as usize - 1].score;
|
||||
}
|
||||
if score > score1 && (score > score2 || score == score2 && prefer_match) {
|
||||
indicies.push(col as u32 + start as u32);
|
||||
next_row = Some(row);
|
||||
let Some(next) = needle_iter.next() else {
|
||||
break;
|
||||
};
|
||||
(row, first_needle_occurance) = next
|
||||
}
|
||||
prefer_match = row[col as usize].consecutive_chars > 1;
|
||||
if !prefer_match && col + 1 < query.needle_chars.len() as u16 {
|
||||
if let Some(next_row) = next_row {
|
||||
prefer_match = next_row[col as usize + 1].consecutive_chars > 0
|
||||
}
|
||||
}
|
||||
col -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
Some(max_score as i32)
|
||||
}
|
||||
|
||||
fn popultate_matrix(&mut self, haystack_len: usize, query: &Query) -> (i16, u16) {
|
||||
let mut max_score = 0;
|
||||
let mut max_score_end = 0;
|
||||
let mut iter = query
|
||||
.needle_chars
|
||||
.iter()
|
||||
.zip(self.first_needle_occurance.iter())
|
||||
.zip(self.matrix.chunks_mut(haystack_len))
|
||||
.enumerate();
|
||||
// skip the first row we already calculated the initial scores
|
||||
let (_, ((&_, &_), mut prev_matrix_row)) = iter.next().unwrap();
|
||||
for (i, ((&needle_char, &first_occurance), matrix_row)) in iter {
|
||||
// help the optimizer out a little
|
||||
assert!((first_occurance as usize) < matrix_row.len());
|
||||
assert!(first_occurance != 0);
|
||||
let mut in_gap = false;
|
||||
let haystack = &self.haystack[first_occurance as usize..haystack_len];
|
||||
let mut prev_matrix_cell = matrix_row[first_occurance as usize - 1];
|
||||
let matrix_row = &mut matrix_row[first_occurance as usize..haystack_len];
|
||||
let prev_matrix_diagonal =
|
||||
&mut prev_matrix_row[first_occurance as usize - 1..haystack_len - 1];
|
||||
for (j, ((&haystack_char, matrix_cell), &diag_matrix_cell)) in haystack
|
||||
.iter()
|
||||
.zip(matrix_row.iter_mut())
|
||||
.zip(prev_matrix_diagonal.iter())
|
||||
.enumerate()
|
||||
{
|
||||
let col = j + first_occurance as usize;
|
||||
let gap_score = if in_gap {
|
||||
self.config.score_gap_extension
|
||||
} else {
|
||||
self.config.score_gap_start
|
||||
};
|
||||
let mut score1 = 0;
|
||||
let score2 = prev_matrix_cell.score + gap_score;
|
||||
let mut consecutive = 0;
|
||||
if haystack_char.char == needle_char {
|
||||
score1 = diag_matrix_cell.score + self.config.score_match;
|
||||
let mut bonus = haystack_char.bonus;
|
||||
consecutive = diag_matrix_cell.consecutive_chars + 1;
|
||||
if consecutive > 1 {
|
||||
let first_bonus = self.haystack[col - consecutive as usize].bonus;
|
||||
if bonus > self.config.bonus_boundary && bonus > first_bonus {
|
||||
consecutive = 1
|
||||
} else {
|
||||
bonus = max(bonus, max(self.config.bonus_consecutive, first_bonus))
|
||||
}
|
||||
}
|
||||
if score1 + bonus < score2 {
|
||||
score1 += haystack_char.bonus;
|
||||
consecutive = 0;
|
||||
} else {
|
||||
score1 += bonus;
|
||||
}
|
||||
}
|
||||
in_gap = score1 < score2;
|
||||
let score = max(max(score1, score2), 0);
|
||||
prev_matrix_cell = *matrix_cell;
|
||||
if i == query.needle_chars.len() - 1 && score > max_score {
|
||||
max_score = score;
|
||||
max_score_end = col as u16;
|
||||
}
|
||||
matrix_cell.consecutive_chars = consecutive;
|
||||
matrix_cell.score = score;
|
||||
}
|
||||
prev_matrix_row = matrix_row;
|
||||
}
|
||||
(max_score, max_score_end)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_lower_case<const ASCII_ONLY: bool>(c: char) -> char {
|
||||
if c >= 'A' && c <= 'Z' {
|
||||
char::from_u32(c as u32 + 32).unwrap()
|
||||
} else if !c.is_ascii() && !ASCII_ONLY {
|
||||
case_fold::CASE_FOLDING_SIMPLE
|
||||
.binary_search_by_key(&c, |(upper, _)| *upper)
|
||||
.map_or(c, |idx| case_fold::CASE_FOLDING_SIMPLE[idx].1)
|
||||
} else {
|
||||
c
|
||||
}
|
||||
}
|
514
src/normalize.rs
Normal file
514
src/normalize.rs
Normal file
@ -0,0 +1,514 @@
|
||||
const DATA1: [(char, char); 277] = [
|
||||
('\u{00C0}', 'A'), // WITH GRAVE, LATIN CAPITAL LETTER
|
||||
('\u{00C1}', 'A'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00C2}', 'A'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER
|
||||
('\u{00C3}', 'A'), // WITH TILDE, LATIN CAPITAL LETTER
|
||||
('\u{00C4}', 'A'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{00C5}', 'A'), // WITH RING ABOVE, LATIN CAPITAL LETTER
|
||||
('\u{00C7}', 'C'), // WITH CEDILLA, LATIN CAPITAL LETTER
|
||||
('\u{00C8}', 'E'), // WITH GRAVE, LATIN CAPITAL LETTER
|
||||
('\u{00C9}', 'E'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00CA}', 'E'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER
|
||||
('\u{00CB}', 'E'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{00CC}', 'I'), // WITH GRAVE, LATIN CAPITAL LETTER
|
||||
('\u{00CD}', 'I'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00CE}', 'I'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER
|
||||
('\u{00CF}', 'I'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{00D1}', 'N'), // WITH TILDE, LATIN CAPITAL LETTER
|
||||
('\u{00D2}', 'O'), // WITH GRAVE, LATIN CAPITAL LETTER
|
||||
('\u{00D3}', 'O'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00D4}', 'O'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER
|
||||
('\u{00D5}', 'O'), // WITH TILDE, LATIN CAPITAL LETTER
|
||||
('\u{00D6}', 'O'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{00D8}', 'O'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{00D9}', 'U'), // WITH GRAVE, LATIN CAPITAL LETTER
|
||||
('\u{00DA}', 'U'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00DB}', 'U'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER
|
||||
('\u{00DC}', 'U'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{00DD}', 'Y'), // WITH ACUTE, LATIN CAPITAL LETTER
|
||||
('\u{00DF}', 's'), // , LATIN SMALL LETTER SHARP
|
||||
('\u{00E0}', 'a'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{00E1}', 'a'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00E2}', 'a'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{00E3}', 'a'), // WITH TILDE, LATIN SMALL LETTER
|
||||
('\u{00E4}', 'a'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{00E5}', 'a'), // WITH RING ABOVE, LATIN SMALL LETTER
|
||||
('\u{00E7}', 'c'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{00E8}', 'e'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{00E9}', 'e'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00EA}', 'e'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{00EB}', 'e'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{00EC}', 'i'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{00ED}', 'i'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00EE}', 'i'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{00EF}', 'i'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{00F1}', 'n'), // WITH TILDE, LATIN SMALL LETTER
|
||||
('\u{00F2}', 'o'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{00F3}', 'o'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00F4}', 'o'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{00F5}', 'o'), // WITH TILDE, LATIN SMALL LETTER
|
||||
('\u{00F6}', 'o'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{00F8}', 'o'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{00F9}', 'u'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{00FA}', 'u'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00FB}', 'u'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{00FC}', 'u'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{00FD}', 'y'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{00FF}', 'y'), // WITH DIAERESIS, LATIN SMALL LETTER
|
||||
('\u{0101}', 'a'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{0103}', 'a'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{0105}', 'a'), // WITH OGONEK, LATIN SMALL LETTER
|
||||
('\u{0107}', 'c'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{0109}', 'c'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{010B}', 'c'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{010D}', 'c'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{010F}', 'd'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{0111}', 'd'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0113}', 'e'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{0115}', 'e'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{0117}', 'e'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{0119}', 'e'), // WITH OGONEK, LATIN SMALL LETTER
|
||||
('\u{011B}', 'e'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{011D}', 'g'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{011F}', 'g'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{0121}', 'g'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{0123}', 'g'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{0125}', 'h'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{0127}', 'h'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0129}', 'i'), // WITH TILDE, LATIN SMALL LETTER
|
||||
('\u{012B}', 'i'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{012D}', 'i'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{012F}', 'i'), // WITH OGONEK, LATIN SMALL LETTER
|
||||
('\u{0130}', 'I'), // WITH DOT ABOVE, LATIN CAPITAL LETTER
|
||||
('\u{0131}', 'i'), // , LATIN SMALL LETTER DOTLESS
|
||||
('\u{0135}', 'j'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{0137}', 'k'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{013A}', 'l'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{013C}', 'l'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{013E}', 'l'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{0140}', 'l'), // WITH MIDDLE DOT, LATIN SMALL LETTER
|
||||
('\u{0142}', 'l'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0144}', 'n'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{0146}', 'n'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{0148}', 'n'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{014D}', 'o'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{014F}', 'o'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{0151}', 'o'), // WITH DOUBLE ACUTE, LATIN SMALL LETTER
|
||||
('\u{0155}', 'r'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{0157}', 'r'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{0159}', 'r'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{015B}', 's'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{015D}', 's'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{015F}', 's'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{0161}', 's'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{0163}', 't'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{0165}', 't'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{0167}', 't'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0169}', 'u'), // WITH TILDE, LATIN SMALL LETTER
|
||||
('\u{016B}', 'u'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{016D}', 'u'), // WITH BREVE, LATIN SMALL LETTER
|
||||
('\u{016F}', 'u'), // WITH RING ABOVE, LATIN SMALL LETTER
|
||||
('\u{0171}', 'u'), // WITH DOUBLE ACUTE, LATIN SMALL LETTER
|
||||
('\u{0173}', 'u'), // WITH OGONEK, LATIN SMALL LETTER
|
||||
('\u{0175}', 'w'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{0177}', 'y'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
|
||||
('\u{0178}', 'Y'), // WITH DIAERESIS, LATIN CAPITAL LETTER
|
||||
('\u{017A}', 'z'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{017C}', 'z'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{017E}', 'z'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{017F}', 's'), // , LATIN SMALL LETTER LONG
|
||||
('\u{0180}', 'b'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0181}', 'B'), // WITH HOOK, LATIN CAPITAL LETTER
|
||||
('\u{0183}', 'b'), // WITH TOPBAR, LATIN SMALL LETTER
|
||||
('\u{0186}', 'O'), // , LATIN CAPITAL LETTER OPEN
|
||||
('\u{0188}', 'c'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0189}', 'D'), // , LATIN CAPITAL LETTER AFRICAN
|
||||
('\u{018A}', 'D'), // WITH HOOK, LATIN CAPITAL LETTER
|
||||
('\u{018C}', 'd'), // WITH TOPBAR, LATIN SMALL LETTER
|
||||
('\u{018E}', 'E'), // , LATIN CAPITAL LETTER REVERSED
|
||||
('\u{0190}', 'E'), // , LATIN CAPITAL LETTER OPEN
|
||||
('\u{0192}', 'f'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0193}', 'G'), // WITH HOOK, LATIN CAPITAL LETTER
|
||||
('\u{0197}', 'I'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{0199}', 'k'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{019A}', 'l'), // WITH BAR, LATIN SMALL LETTER
|
||||
('\u{019C}', 'M'), // , LATIN CAPITAL LETTER TURNED
|
||||
('\u{019D}', 'N'), // WITH LEFT HOOK, LATIN CAPITAL LETTER
|
||||
('\u{019E}', 'n'), // WITH LONG RIGHT LEG, LATIN SMALL LETTER
|
||||
('\u{019F}', 'O'), // WITH MIDDLE TILDE, LATIN CAPITAL LETTER
|
||||
('\u{01A1}', 'o'), // WITH HORN, LATIN SMALL LETTER
|
||||
('\u{01A5}', 'p'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{01AB}', 't'), // WITH PALATAL HOOK, LATIN SMALL LETTER
|
||||
('\u{01AD}', 't'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{01AE}', 'T'), // WITH RETROFLEX HOOK, LATIN CAPITAL LETTER
|
||||
('\u{01B0}', 'u'), // WITH HORN, LATIN SMALL LETTER
|
||||
('\u{01B2}', 'V'), // WITH HOOK, LATIN CAPITAL LETTER
|
||||
('\u{01B4}', 'y'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{01B6}', 'z'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{01CE}', 'a'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01D0}', 'i'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01D2}', 'o'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01D4}', 'u'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01DD}', 'e'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{01E5}', 'g'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{01E7}', 'g'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01E9}', 'k'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01EB}', 'o'), // WITH OGONEK, LATIN SMALL LETTER
|
||||
('\u{01F0}', 'j'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{01F5}', 'g'), // WITH ACUTE, LATIN SMALL LETTER
|
||||
('\u{01F9}', 'n'), // WITH GRAVE, LATIN SMALL LETTER
|
||||
('\u{0201}', 'a'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{0203}', 'a'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{0205}', 'e'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{0207}', 'e'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{0209}', 'i'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{020B}', 'i'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{020D}', 'o'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{020F}', 'o'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{0211}', 'r'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{0213}', 'r'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{0215}', 'u'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER
|
||||
('\u{0217}', 'u'), // WITH INVERTED BREVE, LATIN SMALL LETTER
|
||||
('\u{0219}', 's'), // WITH COMMA BELOW, LATIN SMALL LETTER
|
||||
('\u{021B}', 't'), // WITH COMMA BELOW, LATIN SMALL LETTER
|
||||
('\u{021F}', 'h'), // WITH CARON, LATIN SMALL LETTER
|
||||
('\u{0220}', 'N'), // WITH LONG RIGHT LEG, LATIN CAPITAL LETTER
|
||||
('\u{0221}', 'd'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0225}', 'z'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0227}', 'a'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{0229}', 'e'), // WITH CEDILLA, LATIN SMALL LETTER
|
||||
('\u{022F}', 'o'), // WITH DOT ABOVE, LATIN SMALL LETTER
|
||||
('\u{0233}', 'y'), // WITH MACRON, LATIN SMALL LETTER
|
||||
('\u{0234}', 'l'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0235}', 'n'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0236}', 't'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0237}', 'j'), // , LATIN SMALL LETTER DOTLESS
|
||||
('\u{023A}', 'A'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{023B}', 'C'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{023C}', 'c'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{023D}', 'L'), // WITH BAR, LATIN CAPITAL LETTER
|
||||
('\u{023E}', 'T'), // WITH DIAGONAL STROKE, LATIN CAPITAL LETTER
|
||||
('\u{023F}', 's'), // WITH SWASH TAIL, LATIN SMALL LETTER
|
||||
('\u{0240}', 'z'), // WITH SWASH TAIL, LATIN SMALL LETTER
|
||||
('\u{0243}', 'B'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{0244}', 'U'), // BAR, LATIN CAPITAL LETTER
|
||||
('\u{0245}', 'V'), // , LATIN CAPITAL LETTER TURNED
|
||||
('\u{0246}', 'E'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{0247}', 'e'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0248}', 'J'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{0249}', 'j'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{024A}', 'Q'), // WITH HOOK TAIL, LATIN CAPITAL LETTER SMALL
|
||||
('\u{024B}', 'q'), // WITH HOOK TAIL, LATIN SMALL LETTER
|
||||
('\u{024C}', 'R'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{024D}', 'r'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{024E}', 'Y'), // WITH STROKE, LATIN CAPITAL LETTER
|
||||
('\u{024F}', 'y'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{0250}', 'a'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{0251}', 'a'), // , latin small letter script
|
||||
('\u{0253}', 'b'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0254}', 'o'), // , LATIN SMALL LETTER OPEN
|
||||
('\u{0255}', 'c'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0256}', 'd'), // WITH TAIL, LATIN SMALL LETTER
|
||||
('\u{0257}', 'd'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0258}', 'e'), // , LATIN SMALL LETTER REVERSED
|
||||
('\u{025B}', 'e'), // , LATIN SMALL LETTER OPEN
|
||||
('\u{025C}', 'e'), // , LATIN SMALL LETTER REVERSED OPEN
|
||||
('\u{025D}', 'e'), // WITH HOOK, LATIN SMALL LETTER REVERSED OPEN
|
||||
('\u{025E}', 'e'), // , LATIN SMALL LETTER CLOSED REVERSED OPEN
|
||||
('\u{025F}', 'j'), // WITH STROKE, LATIN SMALL LETTER DOTLESS
|
||||
('\u{0260}', 'g'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0261}', 'g'), // , LATIN SMALL LETTER SCRIPT
|
||||
('\u{0262}', 'G'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{0265}', 'h'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{0266}', 'h'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0268}', 'i'), // WITH STROKE, LATIN SMALL LETTER
|
||||
('\u{026A}', 'I'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{026B}', 'l'), // WITH MIDDLE TILDE, LATIN SMALL LETTER
|
||||
('\u{026C}', 'l'), // WITH BELT, LATIN SMALL LETTER
|
||||
('\u{026D}', 'l'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER
|
||||
('\u{026F}', 'm'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{0270}', 'm'), // WITH LONG LEG, LATIN SMALL LETTER TURNED
|
||||
('\u{0271}', 'm'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0272}', 'n'), // WITH LEFT HOOK, LATIN SMALL LETTER
|
||||
('\u{0273}', 'n'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER
|
||||
('\u{0274}', 'N'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{0275}', 'o'), // , LATIN SMALL LETTER BARRED
|
||||
('\u{0279}', 'r'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{027A}', 'r'), // WITH LONG LEG, LATIN SMALL LETTER TURNED
|
||||
('\u{027B}', 'r'), // WITH HOOK, LATIN SMALL LETTER TURNED
|
||||
('\u{027C}', 'r'), // WITH LONG LEG, LATIN SMALL LETTER
|
||||
('\u{027D}', 'r'), // WITH TAIL, LATIN SMALL LETTER
|
||||
('\u{027E}', 'r'), // WITH FISHHOOK, LATIN SMALL LETTER
|
||||
('\u{027F}', 'r'), // WITH FISHHOOK, LATIN SMALL LETTER REVERSED
|
||||
('\u{0280}', 'R'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{0281}', 'R'), // , LATIN LETTER SMALL CAPITAL INVERTED
|
||||
('\u{0282}', 's'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{0287}', 't'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{0288}', 't'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER
|
||||
('\u{0289}', 'u'), // BAR, LATIN SMALL LETTER
|
||||
('\u{028B}', 'v'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{028C}', 'v'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{028D}', 'w'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{028E}', 'y'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{028F}', 'Y'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{0290}', 'z'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER
|
||||
('\u{0291}', 'z'), // WITH CURL, LATIN SMALL LETTER
|
||||
('\u{0297}', 'c'), // , LATIN LETTER STRETCHED
|
||||
('\u{0299}', 'B'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{029A}', 'e'), // , LATIN SMALL LETTER CLOSED OPEN
|
||||
('\u{029B}', 'G'), // WITH HOOK, LATIN LETTER SMALL CAPITAL
|
||||
('\u{029C}', 'H'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{029D}', 'j'), // WITH CROSSED-TAIL, LATIN SMALL LETTER
|
||||
('\u{029E}', 'k'), // , LATIN SMALL LETTER TURNED
|
||||
('\u{029F}', 'L'), // , LATIN LETTER SMALL CAPITAL
|
||||
('\u{02A0}', 'q'), // WITH HOOK, LATIN SMALL LETTER
|
||||
('\u{02AE}', 'h'), // WITH FISHHOOK, LATIN SMALL LETTER TURNED
|
||||
('\u{0363}', 'a'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0364}', 'e'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0365}', 'i'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0366}', 'o'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0367}', 'u'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0368}', 'c'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{0369}', 'd'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036A}', 'h'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036B}', 'm'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036C}', 'r'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036D}', 't'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036E}', 'v'), // , COMBINING LATIN SMALL LETTER
|
||||
('\u{036F}', 'x'), // , COMBINING LATIN SMALL LETTER
|
||||
];
|
||||
|
||||
/// Sparse map from Latin characters in the U+1D00..=U+1EF9 range to an ASCII
/// base letter, expanded into `TABLE2` by `generate_table`.
///
/// Entries are kept in strictly ascending code point order so that the last
/// entry carries the largest key (which `DATA2_END` is derived from). Seven
/// entries that had been swallowed by the trailing comment of the preceding
/// line (U+1E3B, U+1E59, U+1E77, U+1E87, U+1E93, U+1EBB, U+1EF9) are restored.
const DATA2: [(char, char); 174] = [
    ('\u{1D00}', 'A'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D03}', 'B'), // , LATIN LETTER SMALL CAPITAL BARRED
    ('\u{1D04}', 'C'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D05}', 'D'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D07}', 'E'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D08}', 'e'), // , LATIN SMALL LETTER TURNED OPEN
    ('\u{1D09}', 'i'), // , LATIN SMALL LETTER TURNED
    ('\u{1D0A}', 'J'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D0B}', 'K'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D0C}', 'L'), // WITH STROKE, LATIN LETTER SMALL CAPITAL
    ('\u{1D0D}', 'M'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D0E}', 'N'), // , LATIN LETTER SMALL CAPITAL REVERSED
    ('\u{1D0F}', 'O'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D10}', 'O'), // , LATIN LETTER SMALL CAPITAL OPEN
    ('\u{1D11}', 'o'), // , LATIN SMALL LETTER SIDEWAYS
    ('\u{1D12}', 'o'), // , LATIN SMALL LETTER SIDEWAYS OPEN
    ('\u{1D13}', 'o'), // WITH STROKE, LATIN SMALL LETTER SIDEWAYS
    ('\u{1D16}', 'o'), // , LATIN SMALL LETTER TOP HALF
    ('\u{1D17}', 'o'), // , LATIN SMALL LETTER BOTTOM HALF
    ('\u{1D18}', 'P'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D19}', 'R'), // , LATIN LETTER SMALL CAPITAL REVERSED
    ('\u{1D1A}', 'R'), // , LATIN LETTER SMALL CAPITAL TURNED
    ('\u{1D1B}', 'T'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D1C}', 'U'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D1D}', 'u'), // , LATIN SMALL LETTER SIDEWAYS
    ('\u{1D1E}', 'u'), // , LATIN SMALL LETTER SIDEWAYS DIAERESIZED
    ('\u{1D1F}', 'm'), // , LATIN SMALL LETTER SIDEWAYS TURNED
    ('\u{1D20}', 'V'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D21}', 'W'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D22}', 'Z'), // , LATIN LETTER SMALL CAPITAL
    ('\u{1D62}', 'i'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{1D63}', 'r'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{1D64}', 'u'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{1D65}', 'v'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{1E01}', 'a'), // WITH RING BELOW, LATIN SMALL LETTER
    ('\u{1E03}', 'b'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E05}', 'b'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E07}', 'b'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E0B}', 'd'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E0D}', 'd'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E0F}', 'd'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E11}', 'd'), // WITH CEDILLA, LATIN SMALL LETTER
    ('\u{1E13}', 'd'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E19}', 'e'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E1B}', 'e'), // WITH TILDE BELOW, LATIN SMALL LETTER
    ('\u{1E1F}', 'f'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E21}', 'g'), // WITH MACRON, LATIN SMALL LETTER
    ('\u{1E23}', 'h'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E25}', 'h'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E27}', 'h'), // WITH DIAERESIS, LATIN SMALL LETTER
    ('\u{1E29}', 'h'), // WITH CEDILLA, LATIN SMALL LETTER
    ('\u{1E2B}', 'h'), // WITH BREVE BELOW, LATIN SMALL LETTER
    ('\u{1E2D}', 'i'), // WITH TILDE BELOW, LATIN SMALL LETTER
    ('\u{1E31}', 'k'), // WITH ACUTE, LATIN SMALL LETTER
    ('\u{1E33}', 'k'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E35}', 'k'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E37}', 'l'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E3B}', 'l'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E3D}', 'l'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E3F}', 'm'), // WITH ACUTE, LATIN SMALL LETTER
    ('\u{1E41}', 'm'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E43}', 'm'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E45}', 'n'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E47}', 'n'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E49}', 'n'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E4B}', 'n'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E55}', 'p'), // WITH ACUTE, LATIN SMALL LETTER
    ('\u{1E57}', 'p'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E59}', 'r'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E5B}', 'r'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E5F}', 'r'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E61}', 's'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E63}', 's'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E6B}', 't'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E6D}', 't'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E6F}', 't'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E71}', 't'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E73}', 'u'), // WITH DIAERESIS BELOW, LATIN SMALL LETTER
    ('\u{1E75}', 'u'), // WITH TILDE BELOW, LATIN SMALL LETTER
    ('\u{1E77}', 'u'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER
    ('\u{1E7D}', 'v'), // WITH TILDE, LATIN SMALL LETTER
    ('\u{1E7F}', 'v'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E81}', 'w'), // WITH GRAVE, LATIN SMALL LETTER
    ('\u{1E83}', 'w'), // WITH ACUTE, LATIN SMALL LETTER
    ('\u{1E85}', 'w'), // WITH DIAERESIS, LATIN SMALL LETTER
    ('\u{1E87}', 'w'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E89}', 'w'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E8B}', 'x'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E8D}', 'x'), // WITH DIAERESIS, LATIN SMALL LETTER
    ('\u{1E8F}', 'y'), // WITH DOT ABOVE, LATIN SMALL LETTER
    ('\u{1E91}', 'z'), // WITH CIRCUMFLEX, LATIN SMALL LETTER
    ('\u{1E93}', 'z'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1E95}', 'z'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E96}', 'h'), // WITH LINE BELOW, LATIN SMALL LETTER
    ('\u{1E97}', 't'), // WITH DIAERESIS, LATIN SMALL LETTER
    ('\u{1E98}', 'w'), // WITH RING ABOVE, LATIN SMALL LETTER
    ('\u{1E99}', 'y'), // WITH RING ABOVE, LATIN SMALL LETTER
    ('\u{1E9A}', 'a'), // WITH RIGHT HALF RING, LATIN SMALL LETTER
    ('\u{1E9B}', 's'), // WITH DOT ABOVE, LATIN SMALL LETTER LONG
    ('\u{1EA1}', 'a'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1EA3}', 'a'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1EA4}', 'A'),
    ('\u{1EA5}', 'a'),
    ('\u{1EA6}', 'A'),
    ('\u{1EA7}', 'a'),
    ('\u{1EA8}', 'A'),
    ('\u{1EA9}', 'a'),
    ('\u{1EAA}', 'A'),
    ('\u{1EAB}', 'a'),
    ('\u{1EAC}', 'A'),
    ('\u{1EAD}', 'a'),
    ('\u{1EAE}', 'A'),
    ('\u{1EAF}', 'a'),
    ('\u{1EB0}', 'A'),
    ('\u{1EB1}', 'a'),
    ('\u{1EB2}', 'A'),
    ('\u{1EB3}', 'a'),
    ('\u{1EB4}', 'A'),
    ('\u{1EB5}', 'a'),
    ('\u{1EB6}', 'A'),
    ('\u{1EB7}', 'a'),
    ('\u{1EB9}', 'e'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1EBB}', 'e'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1EBD}', 'e'), // WITH TILDE, LATIN SMALL LETTER
    ('\u{1EBE}', 'E'),
    ('\u{1EBF}', 'e'),
    ('\u{1EC0}', 'E'),
    ('\u{1EC1}', 'e'),
    ('\u{1EC2}', 'E'),
    ('\u{1EC3}', 'e'),
    ('\u{1EC4}', 'E'),
    ('\u{1EC5}', 'e'),
    ('\u{1EC6}', 'E'),
    ('\u{1EC7}', 'e'),
    ('\u{1EC9}', 'i'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1ECB}', 'i'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1ECD}', 'o'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1ECF}', 'o'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1ED0}', 'O'),
    ('\u{1ED1}', 'o'),
    ('\u{1ED2}', 'O'),
    ('\u{1ED3}', 'o'),
    ('\u{1ED4}', 'O'),
    ('\u{1ED5}', 'o'),
    ('\u{1ED6}', 'O'),
    ('\u{1ED7}', 'o'),
    ('\u{1ED8}', 'O'),
    ('\u{1ED9}', 'o'),
    ('\u{1EDA}', 'O'),
    ('\u{1EDB}', 'o'),
    ('\u{1EDC}', 'O'),
    ('\u{1EDD}', 'o'),
    ('\u{1EDE}', 'O'),
    ('\u{1EDF}', 'o'),
    ('\u{1EE0}', 'O'),
    ('\u{1EE1}', 'o'),
    ('\u{1EE2}', 'O'),
    ('\u{1EE3}', 'o'),
    ('\u{1EE5}', 'u'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1EE7}', 'u'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1EE8}', 'U'),
    ('\u{1EE9}', 'u'),
    ('\u{1EEA}', 'U'),
    ('\u{1EEB}', 'u'),
    ('\u{1EEC}', 'U'),
    ('\u{1EED}', 'u'),
    ('\u{1EEE}', 'U'),
    ('\u{1EEF}', 'u'),
    ('\u{1EF0}', 'U'),
    ('\u{1EF1}', 'u'),
    ('\u{1EF3}', 'y'), // WITH GRAVE, LATIN SMALL LETTER
    ('\u{1EF5}', 'y'), // WITH DOT BELOW, LATIN SMALL LETTER
    ('\u{1EF7}', 'y'), // WITH HOOK ABOVE, LATIN SMALL LETTER
    ('\u{1EF9}', 'y'), // WITH TILDE, LATIN SMALL LETTER
];
|
||||
|
||||
/// Sparse map from Latin superscript/subscript letters (and U+2184) to an
/// ASCII base letter, expanded into `TABLE3` by `generate_table`.
///
/// Entries are in ascending code point order. The U+2099 entry had been
/// swallowed by the trailing comment of the U+2098 line and is restored here.
const DATA3: [(char, char); 10] = [
    ('\u{2071}', 'i'), // , SUPERSCRIPT LATIN SMALL LETTER
    ('\u{2095}', 'h'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{2096}', 'k'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{2097}', 'l'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{2098}', 'm'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{2099}', 'n'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{209A}', 'p'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{209B}', 's'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{209C}', 't'), // , LATIN SUBSCRIPT SMALL LETTER
    ('\u{2184}', 'c'), // , LATIN SMALL LETTER REVERSED
];
|
||||
|
||||
const DATA1_START: u32 = DATA1[0].0 as u32;
|
||||
const DATA1_END: u32 = DATA1[DATA1.len() - 1].0 as u32 + 1;
|
||||
const LEN1: usize = (DATA1_END - DATA1_START) as usize;
|
||||
static TABLE1: [char; LEN1] = generate_table(&DATA1);
|
||||
|
||||
/// Expands a sparse `(key, replacement)` list into a dense lookup table.
///
/// Slot `n` of the returned table describes the character whose code point is
/// `sparse_data[0].0 + n`: it holds the replacement from `sparse_data` when
/// that character is a key, and the character itself otherwise.
///
/// The replacements are read from `sparse_data` itself, so the function works
/// for any table and not only `DATA1`. Entries may appear in any order. Keys
/// that fall outside the `LEN` code points starting at the first key are
/// ignored, and if a key occurs more than once the last occurrence wins.
///
/// # Panics
///
/// Panics (at compile time when used in a const context) if `sparse_data` is
/// empty or if the covered range contains a value that is not a valid `char`.
const fn generate_table<const LEN: usize>(sparse_data: &[(char, char)]) -> [char; LEN] {
    let start = sparse_data[0].0 as u32;
    let mut table: [char; LEN] = ['\0'; LEN];

    // First pass: every slot maps its character to itself.
    let mut offset = 0;
    while offset < LEN {
        let Some(key) = char::from_u32(start + offset as u32) else { panic!("invalid char") };
        table[offset] = key;
        offset += 1;
    }

    // Second pass: overlay the replacements listed in `sparse_data`.
    let mut entry = 0;
    while entry < sparse_data.len() {
        let key = sparse_data[entry].0 as u32;
        if key >= start {
            let slot = (key - start) as usize;
            if slot < LEN {
                table[slot] = sparse_data[entry].1;
            }
        }
        entry += 1;
    }

    table
}
|
||||
const DATA2_START: u32 = DATA2[0].0 as u32;
|
||||
const DATA2_END: u32 = DATA2[DATA2.len() - 1].0 as u32 + 1;
|
||||
const LEN2: usize = (DATA2_END - DATA2_START) as usize;
|
||||
static TABLE2: [char; LEN2] = generate_table(&DATA2);
|
||||
|
||||
const DATA3_START: u32 = DATA3[0].0 as u32;
|
||||
const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1;
|
||||
const LEN3: usize = (DATA3_END - DATA3_START) as usize;
|
||||
static TABLE3: [char; LEN3] = generate_table(&DATA3);
|
||||
|
||||
pub fn normalize(c: char) -> char {
|
||||
let i = c as u32;
|
||||
if i < DATA1_START || DATA3_END >= i {
|
||||
return c;
|
||||
}
|
||||
if i < DATA1_END {
|
||||
return TABLE1[(i - DATA1_START) as usize];
|
||||
}
|
||||
if i < DATA2_START {
|
||||
return c;
|
||||
}
|
||||
if i < DATA2_END {
|
||||
return TABLE2[(i - DATA2_START) as usize];
|
||||
}
|
||||
if i < DATA3_START {
|
||||
return c;
|
||||
}
|
||||
TABLE3[(i - DATA3_START) as usize]
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user