diff --git a/Cargo.lock b/Cargo.lock index 1968688..478f6e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,56 +2,38 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "aho-corasick" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" -dependencies = [ - "memchr", -] - -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" - [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "benches" +version = "0.1.0" +dependencies = [ + "brunch", + "fuzzy-matcher", + "nucleo", + "nucleo-matcher", + "walkdir", +] + [[package]] name = "bitflags" -version = "2.3.3" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "bumpalo" -version = "3.13.0" +name = "brunch" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" - -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - -[[package]] -name = "cc" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "12ec8866ee8d4ec8c0770203322d7cfa2d2183b03fc788611c5ffc191f2a5688" +dependencies = [ + "dactyl", + "unicode-width", +] [[package]] name = "cfg-if" @@ -59,100 +41,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "ciborium" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" -dependencies = [ - "ciborium-io", - "ciborium-ll", - "serde", -] - -[[package]] -name = "ciborium-io" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" - -[[package]] -name = "ciborium-ll" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" -dependencies = [ - "ciborium-io", - "half", -] - -[[package]] -name = "clap" -version = "4.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" - [[package]] name = "cov-mark" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ffa3d3e0138386cd4361f63537765cac7ee40698028844635a54495a92f67f3" 
-[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools", -] - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -196,33 +90,21 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "dactyl" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f762271c6826d426c3fd2e37aa827fa039596bc7050e9289cb713265be3d7f" +dependencies = [ + "num-traits", +] + [[package]] name = "either" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" -[[package]] -name = "errno" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "fuzzy-matcher" version = "0.3.7" @@ -232,53 +114,12 @@ dependencies = [ "thread_local", ] -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - [[package]] name = "hermit-abi" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" -[[package]] -name = "is-terminal" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" -dependencies = [ - "hermit-abi", - "rustix", - "windows-sys", -] - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" - -[[package]] -name = "js-sys" -version = "0.3.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" -dependencies = [ - "wasm-bindgen", -] - [[package]] name = "libc" version = "0.2.147" @@ -286,16 +127,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] -name = "linux-raw-sys" -version = "0.4.3" +name = "lock_api" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" - -[[package]] -name = "log" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] [[package]] name = "memchr" @@ 
-312,13 +151,20 @@ dependencies = [ "autocfg", ] +[[package]] +name = "nucleo" +version = "0.1.0" +dependencies = [ + "nucleo-matcher", + "parking_lot", + "rayon", +] + [[package]] name = "nucleo-matcher" version = "0.1.0" dependencies = [ "cov-mark", - "criterion", - "fuzzy-matcher", "memchr", ] @@ -348,55 +194,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] -name = "oorandom" -version = "11.1.3" +name = "parking_lot" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "plotters" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", + "lock_api", + "parking_lot_core", ] [[package]] -name = "plotters-backend" -version = "0.3.5" +name = "parking_lot_core" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" - -[[package]] -name = "plotters-svg" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ - "plotters-backend", -] - -[[package]] -name = "proc-macro2" -version = "1.0.66" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.32" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" -dependencies = [ - "proc-macro2", + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", ] [[package]] @@ -422,53 +239,14 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.9.1" +name = "redox_syscall" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" - -[[package]] -name = "rustix" -version = "0.38.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys", ] -[[package]] -name = "ryu" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" - [[package]] name = "same-file" version = "1.0.6" @@ -485,46 +263,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "serde" -version = "1.0.175" +name = "smallvec" +version = "1.11.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.175" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.103" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d03b412469450d4404fe8499a268edd7f8b79fecb074b0d812ad64ca21f4031b" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "syn" -version = "2.0.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" [[package]] name = "thread_local" @@ -537,20 +279,10 @@ dependencies = [ ] [[package]] -name = "tinytemplate" -version = "1.2.1" +name = "unicode-width" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "unicode-ident" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "walkdir" @@ -562,70 +294,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasm-bindgen" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" -dependencies = [ - "cfg-if", - 
"wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" - -[[package]] -name = "web-sys" -version = "0.3.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi" version = "0.3.9" @@ -657,15 +325,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets", -] - [[package]] name = "windows-targets" version = "0.48.1" @@ -722,7 +381,3 @@ name = "windows_x86_64_msvc" 
version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" - -[[package]] -name = "worker" -version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 4024f6c..b9a2d3a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = [ "matcher", "worker" ] +members = [ "matcher", "worker", "benches" ] diff --git a/README.md b/README.md index a00ab8d..19e892c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,41 @@ -# fzf_oxide +# Nucleo An optimized rust port of the fzf fuzzy matching algorithm -## TODO: +## Notes: -* case mismatch penalty -* substring/prefix/postfix/exact matcher -* high level API (worker thread, query parsing, sorting) +* [x] fuzzy matcher + * based on https://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/gaps.pdf + * compared to theory we don't store the p-matrix at all and instead just store the value in a variable as we iterate the row + * this is possible because we precompute the m-matrix for the next row. This is super confusing but greatly increases cache locality and halves the amount of space we need during matching for the m-matrix too + * during index calculation a full `O(mn)` space matrix is required. We only store + two bools to allow backtracking indices, skim stores the full p and m matrix in that case => doesn't matter too much as indices are only computed for visible elements + * space complexity: skim needs at least 8x more memory => much worse cache locality, fzf always allocates a full `O(mn)` matrix even during matching => bye-bye cache (at least they use smaller integers though) + * nucleo's matrix only has width `n-m+1` instead of width `n`. This comes from the observation that the `p`-th needle char requires `p-1` chars before it and `m-p` chars after it, so there are always `p-1 + m-p = m-1` chars that can never match the current char.
This works especially well with only using a single row because the first relevant char is always at the same position even though it's technically further to the right. This is particularly nice because we precalculate the m-matrix which is computed from diagonal elements, so the precalculated values stay in the same matrix cell. + * a couple simpler (but arguably even more impactful) optimizations: + * we presegment unicode, unicode segmentation is somewhat slow and matcher will filter the same elements quite often so only doing it once is nice. It also prevents a very common source of bugs (mixing of char indices which we use here and utf8 indices) and makes the code a lot simpler as a result. + * we special case ASCII since 90% of practical text is ASCII. ASCII can be stored as bytes instead of `chars` => much better cache locality => we can use memchr (SIMD!). + * we aggressively prefilter (especially ASCII but also unicode to a lesser extent) to ensure we reject non-matching haystacks as fast as possible. Usually most haystacks will not match when fuzzy matching large lists so having a very quick reject path is good + * for very long matches we fall back to a greedy matcher which runs in `O(N)` (and `O(1)` space complexity) to avoid the `O(mn)` blowup. This is fzf's old algorithm and yields decent (but not great) results. + * There is a misunderstanding in both skim and fzf. Basically what they do is give a bonus to each character (like word boundaries). That makes sense and is reasonable, but the problem is that they use the **maximum bonus** when multiple chars match in sequence. That means that the bonus of a character depends on which characters exactly matched around it. But the fundamental assumption of this algorithm (and why it doesn't require backtracking) is that the score of each character is independent of what other chars matched (this is the difference between the affine gap and the generic gap case shown in the paper too).
During fuzzing I found many cases where this mechanism leads to a non-optimal match being reported (so the sort order and fuzzy indices would be wrong). In my testing removing this mechanism and slightly tweaking the bonus calculation results in similar match quality but made sure the algorithm always worked correctly (and removed a bunch of weird edge cases). +* [x] substring/prefix/postfix/exact matcher +* [ ] case mismatch penalty. This doesn't seem like a good idea to me. Fzf doesn't do this (only skim), smartcase should cover most cases. It would be nice for fully case insensitive matching without smartcase like in autocompletions though. Realistically there won't be more than 3 items that are identical with different casing though, so I don't think it matters too much. It is a bit annoying to implement since you can no longer prenormalize queries (or need two queries) :/ +* [ ] high level API (worker thread, query parsing, sorting), in progress + * apparently sorting is superfast (at most 5% of match time for nucleo matcher with a highly selective query, otherwise it's completely negligible compared to fuzzy matching). All the bending over backwards fzf does (and skim copied but way worse) seems a little silly. I think fzf does it because Go doesn't have a good parallel sort. Fzf divides the matches into a couple fairly large chunks and sorts those on each worker thread and then lazily merges the result. That makes the sorting without the merging `Nlog(N/M)` which is basically equivalent for large `N` and small `M` as is the case here. At least it's parallel though. In Rust we have a great pattern-defeating parallel quicksort though (rayon) which is way easier. + * [x] basic implementation (workers, streaming, invalidation) + * [ ] verify it actually works + * [ ] query parsing + * [ ] hook up to helix + * [ ] currently I simply use a tick system (called on every redraw) + together with a redraw/tick notification (ideally debounced) is that enough?
+ * [ ] for streaming callers should buffer their data. Can we provide a better API for that beyond what is currently there? + * [ ] cleanup code, improve API + * [ ] write docs + +* tests + * [x] fuzz the fuzzy matcher + * [x] port the full fzf testsuite for fuzzy matching + * [ ] port the full skim testsuite for fuzzy matching + * [ ] highlevel API + * [ ] test bustring/exact/prefix/postfix match + * [ ] coverage report (fuzzy matcher was at 86%) diff --git a/benches/Cargo.toml b/benches/Cargo.toml new file mode 100644 index 0000000..89f10b4 --- /dev/null +++ b/benches/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "benches" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nucleo-matcher = { version = "0.1", path = "../matcher" } +nucleo = { version = "0.1", path = "../worker" } +brunch = "0.5.0" +fuzzy-matcher = "0.3.7" +walkdir = "2" \ No newline at end of file diff --git a/benches/src/main.rs b/benches/src/main.rs new file mode 100644 index 0000000..34f83f2 --- /dev/null +++ b/benches/src/main.rs @@ -0,0 +1,78 @@ +use std::hint::black_box; +use std::path::PathBuf; +use std::process::Command; + +use brunch::{Bench, Benches}; +use fuzzy_matcher::FuzzyMatcher; +use nucleo::Utf32String; +use nucleo_matcher::Utf32Str; + +fn bench_dir() -> PathBuf { + std::env::var_os("BENCHMARK_DIR") + .expect("the BENCHMARK_DIR must be set to the directory to traverse for the benchmark") + .into() +} + +fn checkout_linux_if_needed() { + let linux_dir = bench_dir(); + if !linux_dir.exists() { + println!("will git clone linux..."); + let output = Command::new("git") + .arg("clone") + .arg("https://github.com/BurntSushi/linux.git") + .arg("--depth") + .arg("1") + .arg("--branch") + .arg("master") + .arg("--single-branch") + .arg(&linux_dir) + .stdout(std::process::Stdio::inherit()) + .status() + .expect("failed to git clone linux"); + println!("did git clone linux...{:?}", 
output); + } +} + +fn main() { + checkout_linux_if_needed(); + let dir = bench_dir(); + let paths: (Vec, Vec) = walkdir::WalkDir::new(dir) + .into_iter() + .filter_map(|path| { + let dent = path.ok()?; + let path = dent.into_path().to_string_lossy().into_owned(); + Some((path.as_str().into(), path)) + }) + .unzip(); + let mut nucleo = + nucleo_matcher::Matcher::new(nucleo_matcher::MatcherConfig::DEFAULT.match_paths()); + let skim = fuzzy_matcher::skim::SkimMatcherV2::default().ignore_case(); + + // TODO: unicode? + let needles = ["never_matches", "copying", "/doc/kernel", "//.h"]; + // Announce that we've started. + ::std::eprint!("\x1b[1;38;5;199mStarting:\x1b[0m Running benchmark(s). Stand by!\n\n"); + let mut benches = Benches::default(); + // let mut scores = Vec::with_capacity(paths.0.len()); + for needle in needles { + println!("running {needle:?}..."); + benches.push(Bench::new(format!("nucleo {needle:?}")).run(|| { + // scores.clear(); + // scores.extend(paths.0.iter().filter_map(|haystack| { + for haystack in &paths.0 { + black_box( + nucleo.fuzzy_match(haystack.slice(..), Utf32Str::Ascii(needle.as_bytes())), + ); + } + // })); + // scores.sort_unstable(); + })); + benches.push(Bench::new(format!("skim {needle:?}")).run(|| { + for haystack in &paths.1 { + let res = skim.fuzzy_match(haystack, needle); + let _ = black_box(res); + } + })); + } + benches.finish(); +} diff --git a/matcher/Cargo.toml b/matcher/Cargo.toml index ddf725c..c609a21 100644 --- a/matcher/Cargo.toml +++ b/matcher/Cargo.toml @@ -12,6 +12,3 @@ cov-mark = { version = "1.1.0", default-features = false } [dev-dependencies] cov-mark = { version = "1.1.0", default-features = true } -criterion = "0.5.1" -fuzzy-matcher = "0.3.7" - diff --git a/matcher/src/config.rs b/matcher/src/config.rs index b228e82..931b032 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -6,10 +6,10 @@ use crate::score::BONUS_BOUNDARY; pub struct MatcherConfig { pub delimiter_chars: &'static [u8], /// Extra 
bonus for word boundary after whitespace character or beginning of the string - pub bonus_boundary_white: u16, + pub(crate) bonus_boundary_white: u16, - // Extra bonus for word boundary after slash, colon, semi-colon, and comma - pub bonus_boundary_delimiter: u16, + /// Extra bonus for word boundary after slash, colon, semi-colon, and comma + pub(crate) bonus_boundary_delimiter: u16, pub initial_char_class: CharClass, /// Whether to normalize latin script characters to ASCII /// this significantly degrades performance so its not recommended diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index c478348..dda12b8 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -58,6 +58,24 @@ pub struct Matcher { slab: MatrixSlab, } +// this is just here for convenience not ruse if we should implement this +impl Clone for Matcher { + fn clone(&self) -> Self { + Matcher { + config: self.config, + slab: MatrixSlab::new(), + } + } +} + +impl std::fmt::Debug for Matcher { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Matcher") + .field("config", &self.config) + .finish_non_exhaustive() + } +} + impl Default for Matcher { fn default() -> Self { Matcher { @@ -504,12 +522,16 @@ impl Matcher { return None; } (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - haystack - .iter() - .map(|c| c.normalize(&self.config)) - .eq(AsciiChar::cast(needle) + let matched = + haystack .iter() - .map(|c| c.normalize(&self.config))); + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))); + if !matched { + return None; + } self.calculate_score::( haystack, diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs index 550a5d7..412f1bf 100644 --- a/matcher/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -1,6 +1,7 @@ use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; use std::marker::PhantomData; use std::mem::size_of; +use std::panic::{RefUnwindSafe, UnwindSafe}; 
use std::ptr::{slice_from_raw_parts_mut, NonNull}; use crate::chars::Char; @@ -125,6 +126,10 @@ struct MatcherData { } pub(crate) struct MatrixSlab(NonNull); +unsafe impl Sync for MatrixSlab {} +unsafe impl Send for MatrixSlab {} +impl UnwindSafe for MatrixSlab {} +impl RefUnwindSafe for MatrixSlab {} impl MatrixSlab { pub fn new() -> Self { diff --git a/worker/Cargo.toml b/worker/Cargo.toml new file mode 100644 index 0000000..483c133 --- /dev/null +++ b/worker/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "nucleo" +authors = ["Pascal Kuthe "] +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nucleo-matcher = { version = "0.1", path = "../matcher" } +parking_lot = { version = "0.12.1", features = ["send_guard", "arc_lock"]} +rayon = "1.7.0" diff --git a/worker/src/items.rs b/worker/src/items.rs new file mode 100644 index 0000000..27ec6a1 --- /dev/null +++ b/worker/src/items.rs @@ -0,0 +1,109 @@ +use std::mem::swap; +use std::ptr::NonNull; + +use crate::Utf32String; + +pub(crate) struct ItemCache { + live: Vec, + evicted: Vec, +} +impl ItemCache { + pub(crate) fn clear(&mut self) { + if self.evicted.is_empty() { + swap(&mut self.evicted, &mut self.live) + } else { + self.evicted.append(&mut self.live) + } + } + pub(crate) fn cleared(&self) -> bool { + !self.evicted.is_empty() + } + + pub(crate) fn push(&mut self, item: Box<[Utf32String]>) { + self.live.push(Item { + cols: Box::leak(item).into(), + }) + } +} + +// deliberately not `Clone`: `Drop` frees `cols`, so a cloned `Item` would free the same allocation twice +#[derive(PartialEq, Eq, Debug)] +pub(crate) struct Item { + // TODO: small vec optimization??
+ cols: NonNull<[Utf32String]>, +} + +unsafe impl Send for Item {} +unsafe impl Sync for Item {} + +impl Item { + fn cols(&self) -> &[Utf32String] { + // safety: cols is basically a box and treated the same as a box, + // however there can be other references so using a box (unique ptr) + // would be an alias violation + unsafe { self.cols.as_ref() } + } +} +impl Drop for Item { + fn drop(&mut self) { + // safety: cols is basically a box and treated the same as a box, + // however there can be other references (that won't be accessed + // anymore at this point) so using a box (unique ptr) would be an alias + // violation + unsafe { drop(Box::from_raw(self.cols.as_ptr())) } + } +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct ItemSnapshot { + cols: NonNull<[Utf32String]>, + pub(crate) len: u32, +} + +unsafe impl Send for ItemSnapshot {} +unsafe impl Sync for ItemSnapshot {} + +#[derive(Debug, Clone)] +pub(crate) struct ItemsSnapshot { + items: Vec, +} + +impl ItemsSnapshot { + pub(crate) fn outdated(&self, items: &ItemCache) -> bool { + items.live.len() != self.items.len() + } + + pub(crate) fn len(&self) -> usize { + self.items.len() + } + + /// Brings the snapshot up to date with `items`. + /// Returns `true` if `items` was cleared, in which case the snapshot was rebuilt from scratch. + pub(crate) fn update(&mut self, items: &ItemCache) -> bool { + let cleared = items.cleared(); + // a cleared cache invalidates every snapshot entry, so start over from index 0 + if cleared { + self.items.clear(); + }; + let start = self.items.len(); + self.items + .extend(items.live[start..].iter().map(|item| ItemSnapshot { + cols: item.cols, + len: item.cols().iter().map(|s| s.len() as u32).sum(), + })); + cleared + } + + pub(crate) unsafe fn get(&self) -> &[ItemSnapshot] { + &self.items + } +} + +impl ItemSnapshot { + pub(crate) fn cols(&self) -> &[Utf32String] { + // safety: we only hand out ItemSnapshot ranges + // if the caller asserted via the unsafe ItemsSnapshot::get + // function that the pointers are valid + unsafe { self.cols.as_ref() } + } +} diff --git a/worker/src/lib.rs b/worker/src/lib.rs new file mode
100644 index 0000000..aa047fc --- /dev/null +++ b/worker/src/lib.rs @@ -0,0 +1,211 @@ +use std::cell::UnsafeCell; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::Arc; +use std::time::Duration; + +use crate::items::{ItemCache, ItemsSnapshot}; +use crate::query::Query; +pub use crate::utf32_string::Utf32String; +use parking_lot::lock_api::ArcMutexGuard; +use rayon::prelude::*; + +mod items; +mod query; +mod utf32_string; + +use parking_lot::{Mutex, MutexGuard, RawMutex}; + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub struct Match { + score: u32, + idx: u32, +} + +struct Matchers(Box<[UnsafeCell]>); + +impl Matchers { + // this is not a true mut from ref, we use a cell here + #[allow(clippy::mut_from_ref)] + unsafe fn get(&self) -> &mut nucleo_matcher::Matcher { + &mut *self.0[rayon::current_thread_index().unwrap()].get() + } +} + +unsafe impl Sync for Matchers {} +unsafe impl Send for Matchers {} + +struct Worker { + notify: Arc<(dyn Fn() + Sync + Send)>, + running: bool, + items: ItemsSnapshot, + matchers: Matchers, + matches: Vec, + query: Query, + canceled: Arc, +} + +impl Worker { + /// Syncs the item snapshot, (re)scores items against the query, sorts the matches and calls `notify`. + /// + /// # Safety + /// The items referenced by the snapshot must stay alive until this function returns. + unsafe fn run( + &mut self, + items_lock: ArcMutexGuard, + query_status: query::Status, + canceled: Arc, + ) { + self.running = true; + let mut last_scored_item = self.items.len(); + let cleared = self.items.update(&items_lock); + drop(items_lock); + + // TODO: be smarter around reusing past results for rescoring + if cleared || query_status == query::Status::Rescore { + self.matches.clear(); + last_scored_item = 0; + } + + let matchers = &self.matchers; + let query = &self.query; + let items = unsafe { self.items.get() }; + + if query_status != query::Status::Unchanged && !self.matches.is_empty() { + // rescore the existing matches, stopping early once the run gets canceled + self.matches + .par_iter_mut() + .take_any_while(|_| !canceled.load(atomic::Ordering::Relaxed)) + .for_each(|match_| { + let item = &items[match_.idx as usize]; + match_.score = query + .score(item.cols(), unsafe { matchers.get() })
.unwrap_or(u32::MAX); + }); + // TODO: do this in parallel? + self.matches.retain(|m| m.score != u32::MAX) + } + + if last_scored_item != self.items.len() { + self.running = true; + let items = items[last_scored_item..] + .par_iter() + .enumerate() + .filter_map(|(i, item)| { + let score = if canceled.load(atomic::Ordering::Relaxed) { + 0 + } else { + query.score(item.cols(), unsafe { matchers.get() })? + }; + Some(Match { + score, + // `i` counts from `last_scored_item`, so offset it to index into the full snapshot + idx: (last_scored_item + i) as u32, + }) + }); + self.matches.par_extend(items) + } + + if !self.canceled.load(atomic::Ordering::Relaxed) { + // TODO: cancel sort in progress? + self.matches.par_sort_unstable_by(|match1, match2| { + // best (highest) score first + match2.score.cmp(&match1.score).then_with(|| { + // the tie breaker is comparatively rarely needed so we keep it + // in a branch especially because we need to access the items + // array here which involves some pointer chasing + let item1 = &items[match1.idx as usize]; + let item2 = &items[match2.idx as usize]; + (item1.len, match1.idx).cmp(&(item2.len, match2.idx)) + }) + }); + } + + (self.notify)(); + } +} + +pub struct Items { + cache: Arc>, + items: Arc>>, + notify: Arc<(dyn Fn() + Sync + Send)>, +} + +impl Items { + pub fn clear(&mut self) { + self.items.lock().clear(); + self.cache.lock().clear(); + } + + pub fn append(&mut self, items: impl Iterator)>) { + let mut cache = self.cache.lock(); + let mut items_ = self.items.lock(); + items_.extend(items.map(|(item, text)| { + cache.push(text); + item + })); + // notify that a new tick will be necessary + (self.notify)(); + } + + pub fn get(&self) -> impl Deref + '_ { + MutexGuard::map(self.items.lock(), |items| items.as_mut_slice()) + } + + pub fn push() {} +} + +pub struct Nucleo { + // the way the API is built we totally don't actually need these to be Arcs + // but this lets us avoid some unsafe + worker: Arc>, + canceled: Arc, + items: Items, + thread_pool: rayon::ThreadPool, + pub matches: Vec, + pub query: Query, +} + +impl Nucleo { + pub fn tick(&mut self, timeout: u64)
-> bool { + let status = self.query.status(); + let items = self.items.cache.lock_arc(); + let canceled = status != query::Status::Unchanged || items.cleared(); + let mut inner = if canceled { + self.canceled.store(true, atomic::Ordering::Relaxed); + let worker = self.worker.lock_arc(); + // we hold the worker lock, so the canceled run has returned: re-arm the flag for the next run + self.canceled.store(false, atomic::Ordering::Relaxed); + worker + } else { + let Some(worker) = self.worker.try_lock_arc_for(Duration::from_millis(timeout)) else { + return true; + }; + worker + }; + + if inner.running { + inner.running = false; + self.matches.clone_from(&inner.matches); + } else if !canceled { + // nothing has changed + return false; + } + + if canceled || inner.items.outdated(&items) { + let canceled = self.canceled.clone(); + self.thread_pool + .spawn(move || unsafe { inner.run(items, status, canceled) }) + } + true + } +} + +impl Drop for Nucleo { + fn drop(&mut self) { + // we ensure the worker quits before dropping items to ensure that + // the worker can always assume the items outlive it + self.canceled.store(true, atomic::Ordering::Relaxed); + drop(self.worker.lock()); + } +} diff --git a/worker/src/matcher.rs b/worker/src/matcher.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/worker/src/matcher.rs @@ -0,0 +1 @@ + diff --git a/worker/src/query.rs b/worker/src/query.rs new file mode 100644 index 0000000..936c6bf --- /dev/null +++ b/worker/src/query.rs @@ -0,0 +1,84 @@ +use nucleo_matcher::{Matcher, Utf32Str}; + +use crate::Utf32String; + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +enum PatternKind { + Exact, + Fuzzy, + Substring, + Prefix, + Postfix, +} + +#[derive(Debug, PartialEq, Eq, Clone)] +struct PatternAtom { + kind: PatternKind, + needle: Utf32String, + invert: bool, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)] +pub enum Status { + Unchanged, + Update, + Rescore, +} + +#[derive(Debug, Clone)] +pub struct Query { + pub cols: Vec, +} + +impl Query { + pub(crate) fn status(&self) -> Status { + self.cols + .iter() + .map(|col| col.status) + .max() + .unwrap_or(Status::Unchanged) + } +
+ pub(crate) fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { + // TODO: weight columns? + let mut score = 0; + for (pattern, haystack) in self.cols.iter().zip(haystack) { + score += pattern.score(haystack.slice(..), matcher)? + } + Some(score) + } +} + +#[derive(Clone, Debug)] +pub struct Pattern { + terms: Vec, + status: Status, +} + +impl Pattern { + pub(crate) fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { + if self.terms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.terms { + let pattern_score = match pattern.kind { + PatternKind::Exact => matcher.exact_match(haystack, pattern.needle.slice(..)), + PatternKind::Fuzzy => matcher.fuzzy_match(haystack, pattern.needle.slice(..)), + PatternKind::Substring => { + matcher.substring_match(haystack, pattern.needle.slice(..)) + } + PatternKind::Prefix => matcher.prefix_match(haystack, pattern.needle.slice(..)), + PatternKind::Postfix => matcher.postfix_match(haystack, pattern.needle.slice(..)), + }; + if pattern.invert { + if pattern_score.is_some() { + return None; + } + } else { + score += pattern_score? as u32 + } + } + Some(score) + } +} diff --git a/worker/src/results.rs b/worker/src/results.rs new file mode 100644 index 0000000..dea0b4b --- /dev/null +++ b/worker/src/results.rs @@ -0,0 +1,9 @@ +pub struct MatchSnapshot { + chunks: Vec, +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] +struct Match { + score: u32, + idx: u32, +} diff --git a/worker/src/utf32_string.rs b/worker/src/utf32_string.rs new file mode 100644 index 0000000..1a2604a --- /dev/null +++ b/worker/src/utf32_string.rs @@ -0,0 +1,137 @@ +use core::slice; +use std::fmt; +use std::ops::{Bound, RangeBounds}; + +use nucleo_matcher::Utf32Str; + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +pub enum Utf32String { + /// A string represented as ASCII encoded bytes.
+ /// Correctness invariant: must only contain valid ASCII (<=127) + Ascii(Box), + /// A string represented as an array of unicode codepoints (basically UTF-32). + Unicode(Box<[char]>), +} +impl Utf32String { + #[inline] + pub fn len(&self) -> usize { + match self { + Utf32String::Unicode(codepoints) => codepoints.len(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Utf32String::Unicode(codepoints) => codepoints.is_empty(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } + + /// Creates a slice of the given range. Accepts a u32 range for convenience since + /// those are the indices returned by the matcher + #[inline] + pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start as usize, + Bound::Excluded(&start) => start as usize + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, + Bound::Unbounded => self.len(), + }; + match self { + Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]), + Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + pub fn is_ascii(&self) -> bool { + matches!(self, Utf32String::Ascii(_)) + } + + /// Returns the character at `idx`; panics if `idx` is out of bounds. + pub fn get(&self, idx: u32) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[idx as usize] as char, + Utf32String::Unicode(codepoints) => codepoints[idx as usize], + } + } + /// Returns the last character; panics if the string is empty. + pub fn last(&self) -> char { + match self { + Utf32String::Ascii(bytes) => bytes.as_bytes()[bytes.len() - 1] as char, + Utf32String::Unicode(codepoints) => codepoints[codepoints.len() - 1], + } + } + pub fn chars(&self) -> Chars<'_> { + match self { + Utf32String::Ascii(bytes) => Chars::Ascii(bytes.as_bytes().iter()), + Utf32String::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), + } + } +} + +impl From<&str> for
Utf32String { + fn from(value: &str) -> Self { + if value.is_ascii() { + Self::Ascii(value.to_owned().into_boxed_str()) + } else { + Self::Unicode(value.chars().collect()) + } + } +} + +impl From> for Utf32String { + fn from(value: Box) -> Self { + if value.is_ascii() { + Self::Ascii(value) + } else { + Self::Unicode(value.chars().collect()) + } + } +} +impl From for Utf32String { + fn from(value: String) -> Self { + value.into_boxed_str().into() + } +} + +pub enum Chars<'a> { + Ascii(slice::Iter<'a, u8>), + Unicode(slice::Iter<'a, char>), +} +impl<'a> Iterator for Chars<'a> { + type Item = char; + + fn next(&mut self) -> Option { + match self { + Chars::Ascii(iter) => iter.next().map(|&c| c as char), + Chars::Unicode(iter) => iter.next().copied(), + } + } +} + +impl fmt::Debug for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "\"")?; + for c in self.chars() { + for c in c.escape_debug() { + write!(f, "{c}")? + } + } + write!(f, "\"") + } +} + +impl fmt::Display for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "\"")?; + for c in self.chars() { + write!(f, "{c}")? + } + write!(f, "\"") + } +}