diff --git a/Cargo.lock b/Cargo.lock index 2867753..1968688 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,115 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + +[[package]] +name = "bumpalo" +version = "3.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" + +[[package]] +name = "ciborium-ll" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" + [[package]] name = "cov-mark" version = "1.1.0" @@ -9,15 +118,611 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ffa3d3e0138386cd4361f63537765cac7ee40698028844635a54495a92f67f3" [[package]] -name = "fzf_oxide" -version = "0.1.0" +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ - "cov-mark", - "memchr", + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", ] +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fuzzy-matcher" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94" +dependencies = [ + "thread_local", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi", + "rustix", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" + +[[package]] +name = "linux-raw-sys" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" + +[[package]] +name = "log" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" + [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "nucleo-matcher" +version = "0.1.0" +dependencies = [ + "cov-mark", + "criterion", + "fuzzy-matcher", + "memchr", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "regex" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" + +[[package]] +name = "rustix" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d03b412469450d4404fe8499a268edd7f8b79fecb074b0d812ad64ca21f4031b" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" + +[[package]] +name = "walkdir" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "web-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "worker" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index cf6b496..4024f6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,14 +1,2 @@ -[package] -name = "fzf_oxide" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -memchr = "2.5.0" -cov-mark = { version = "1.1.0", default-features = false } - -[dev-dependencies] -cov-mark = { version = "1.1.0", default-features = true } - +[workspace] +members = [ "matcher", "worker" ] diff --git a/matcher/Cargo.toml b/matcher/Cargo.toml new file mode 100644 index 0000000..ddf725c --- /dev/null +++ b/matcher/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "nucleo-matcher" +authors = ["Pascal Kuthe "] +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +memchr = "2.5.0" +cov-mark = { version = "1.1.0", default-features = false } + +[dev-dependencies] +cov-mark = { version = "1.1.0", default-features = true } +criterion = "0.5.1" +fuzzy-matcher = "0.3.7" + diff --git a/fuzz.sh b/matcher/fuzz.sh similarity index 100% rename from fuzz.sh rename to matcher/fuzz.sh diff --git a/matcher/fuzz/.gitignore b/matcher/fuzz/.gitignore new file mode 100644 index 0000000..1a45eee --- /dev/null +++ b/matcher/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/fuzz/Cargo.toml b/matcher/fuzz/Cargo.toml similarity index 100% rename from fuzz/Cargo.toml rename to matcher/fuzz/Cargo.toml diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/matcher/fuzz/fuzz_targets/fuzz_target_1.rs similarity index 100% rename from fuzz/fuzz_targets/fuzz_target_1.rs rename to matcher/fuzz/fuzz_targets/fuzz_target_1.rs diff --git a/generate_case_fold_table.sh b/matcher/generate_case_fold_table.sh similarity index 100% rename from generate_case_fold_table.sh rename to matcher/generate_case_fold_table.sh diff --git a/src/chars.rs b/matcher/src/chars.rs similarity index 100% rename from src/chars.rs rename to matcher/src/chars.rs diff --git a/src/chars/case_fold.rs b/matcher/src/chars/case_fold.rs similarity index 100% rename from src/chars/case_fold.rs rename to matcher/src/chars/case_fold.rs diff --git a/src/chars/normalize.rs b/matcher/src/chars/normalize.rs similarity index 99% rename from src/chars/normalize.rs rename to matcher/src/chars/normalize.rs index 772d768..66a4db1 100644 --- a/src/chars/normalize.rs +++ b/matcher/src/chars/normalize.rs @@ -1,3 +1,5 @@ +use std::mem::transmute; + const DATA1: [(char, char); 277] = [ ('\u{00C0}', 'A'), // WITH GRAVE, LATIN CAPITAL LETTER ('\u{00C1}', 'A'), // WITH ACUTE, LATIN CAPITAL LETTER @@ -471,7 +473,7 @@ const fn generate_table(sparse_data: &[(char, char)]) -> [char let mut i = 0u32; let mut j = 0; while i < table.len() as u32 { - let Some(key) = char::from_u32(start + i) else { panic!("invalid char") }; + let key = unsafe { transmute(start + i) }; if sparse_data[j].0 == key { table[i as usize] = DATA1[j].1; j += 1; diff --git a/src/config.rs b/matcher/src/config.rs similarity index 100% rename from src/config.rs rename to matcher/src/config.rs diff --git a/src/debug.rs b/matcher/src/debug.rs similarity index 93% rename from src/debug.rs rename to matcher/src/debug.rs index d167aeb..364676c 100644 --- a/src/debug.rs +++ b/matcher/src/debug.rs @@ -21,7 +21,7 @@ use std::fmt::{Debug, Formatter, Result}; impl Debug for ScoreCell { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {}, {})", self.score, self.bonus, self.matched) + write!(f, "({}, {})", self.score, self.matched) } } diff --git a/matcher/src/exact.rs b/matcher/src/exact.rs new file mode 100644 index 0000000..8b5115b --- /dev/null +++ b/matcher/src/exact.rs @@ -0,0 +1,271 @@ +use memchr::memmem; +use memchr::{Memchr, Memchr2}; + +use crate::chars::{AsciiChar, Char}; +use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH}; +use crate::Matcher; + +impl Matcher { + pub(crate) fn substring_match_1_ascii( + &mut self, + haystack: &[u8], + c: u8, + indices: &mut Vec, + ) -> Option { + let mut max_score = 0; + let mut max_pos = 0; + if self.config.ignore_case && c >= b'a' && c <= b'z' { + for i in Memchr2::new(c, c - 32, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let char_class = AsciiChar(haystack[i]).char_class(&self.config); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + } else { + let char_class = AsciiChar(c).char_class(&self.config); + for i in Memchr::new(c, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + } + if max_score == 0 { + return None; + } + + if INDICES { + indices.push(max_pos); + } + Some(max_score) + } + + pub(crate) fn substring_match_ascii_with_prefilter( + &mut self, + haystack: &[u8], + needle: &[u8], + prefilter_len: usize, + prefilter: impl Iterator, + ) -> (u16, usize) { + let needle_without_prefilter = &needle[prefilter_len..]; + let mut max_score = 0; + let mut max_pos = 0; + for i in prefilter { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let char_class = AsciiChar(haystack[i]).char_class(&self.config); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score + && haystack[i + prefilter_len..] + .iter() + .map(|&c| AsciiChar(c).normalize(&self.config).0) + .eq(needle_without_prefilter.iter().copied()) + { + max_pos = i; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + (max_score, max_pos) + } + + pub(crate) fn substring_match_ascii( + &mut self, + haystack: &[u8], + needle: &[u8], + indices: &mut Vec, + ) -> Option { + let mut max_score = 0; + let mut max_pos = 0; + if self.config.ignore_case { + match needle.iter().position(|&c| c >= b'a' && c <= b'z') { + // start with char do case insensitive search + Some(0) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + Memchr2::new( + needle[0], + needle[0] - 32, + &haystack[..haystack.len() - needle.len() + 1], + ), + ); + if max_score == 0 { + return None; + } + } + Some(1) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + Memchr::new(needle[0], &haystack[..haystack.len() - needle.len() + 1]), + ); + if max_score == 0 { + return None; + } + } + Some(len) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + memmem::find_iter(&haystack[..haystack.len() - needle.len() + len], needle), + ); + if max_score == 0 { + return None; + } + } + // in case we don't have any letter in the needle + // we can treat the search as case sensitive and use memmem dircedly which is way faster + None => (), + } + } + + if max_score == 0 { + let char_class = AsciiChar(needle[0]).char_class(&self.config); + for i in memmem::find_iter(haystack, needle) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + if max_score == 0 { + return None; + } + } + let score = self.calculate_score::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + max_pos, + max_pos + needle.len(), + indices, + ); + Some(score) + } + + pub(crate) fn substring_match_1_non_ascii( + &mut self, + haystack: &[char], + needle: char, + start: usize, + indices: &mut Vec, + ) -> u16 { + let mut max_score = 0; + let mut max_pos = 0; + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + for (i, &c) in haystack[start..].iter().enumerate() { + let (c, char_class) = c.char_class_and_normalize(&self.config); + if c != needle { + continue; + } + let bonus = self.config.bonus_for(prev_class, char_class); + prev_class = char_class; + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + + if INDICES { + indices.push(max_pos + start as u32); + } + max_score + } + + pub(crate) fn substring_match_non_ascii( + &mut self, + haystack: &[char], + needle: &[N], + start: usize, + indices: &mut Vec, + ) -> Option + where + N: Char, + char: PartialEq, + { + let mut max_score = 0; + let mut max_pos = 0; + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + for (i, &c) in haystack[start..].iter().enumerate() { + let (c, char_class) = c.char_class_and_normalize(&self.config); + if c != needle[0] { + continue; + } + let bonus = self.config.bonus_for(prev_class, char_class); + prev_class = char_class; + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score + && haystack[i + 1..] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(needle[1..].iter().copied()) + { + max_pos = i; + max_score = score; + // can't get better than this + if score >= self.config.bonus_boundary_white { + break; + } + } + } + + let score = self.calculate_score::( + haystack, + needle, + max_pos, + max_pos + needle.len(), + indices, + ); + Some(score) + } +} diff --git a/src/fuzzy_greedy.rs b/matcher/src/fuzzy_greedy.rs similarity index 100% rename from src/fuzzy_greedy.rs rename to matcher/src/fuzzy_greedy.rs diff --git a/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs similarity index 90% rename from src/fuzzy_optimal.rs rename to matcher/src/fuzzy_optimal.rs index e56ff46..4034f31 100644 --- a/src/fuzzy_optimal.rs +++ b/matcher/src/fuzzy_optimal.rs @@ -61,20 +61,18 @@ impl Matcher { } } -fn next_m_score(p_score: i32, m_score: i32, bonus: u16, next_bonus: u16) -> ScoreCell { - let consecutive_bonus = max(bonus, max(next_bonus, BONUS_CONSECUTIVE)); +fn next_m_score(p_score: i32, m_score: i32, bonus: u16) -> ScoreCell { + let consecutive_bonus = max(bonus, BONUS_CONSECUTIVE); let score_match = m_score + consecutive_bonus as i32; - let score_skip = p_score + next_bonus as i32; - if score_match >= score_skip { + let score_skip = p_score + bonus as i32; + if score_match > score_skip { ScoreCell { score: score_match + SCORE_MATCH as i32, - bonus: consecutive_bonus, matched: true, } } else { ScoreCell { score: score_skip + SCORE_MATCH as i32, - bonus: next_bonus, matched: false, } } @@ -91,7 +89,7 @@ fn p_score(prev_p_score: i32, prev_m_score: i32) -> (i32, bool) { } else { i32::MIN / 2 }; - if score_match >= score_skip { + if score_match > score_skip { (score_match, true) } else { (score_skip, false) @@ -185,15 +183,10 @@ impl MatcherDataView<'_, H> { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { if c == needle_char { - // TODO: do we really want to start with a penalty here?? - let mut cell = - next_m_score(0, i32::MIN / 2, 0, bonus * BONUS_FIRST_CHAR_MULTIPLIER); - cell.bonus = *bonus; - cell + next_m_score(0, i32::MIN / 2, bonus * BONUS_FIRST_CHAR_MULTIPLIER) } else { ScoreCell { score: i32::MIN / 2, - bonus: 0, matched: false, } } @@ -215,15 +208,10 @@ impl MatcherDataView<'_, H> { let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); let m_cell = if FIRST_ROW { if c[0] == needle_char { - // TODO: do we really want to start with a penalty here?? - let mut cell = - next_m_score(0, i32::MIN / 2, 0, bonus[0] * BONUS_FIRST_CHAR_MULTIPLIER); - cell.bonus = bonus[0]; - cell + next_m_score(0, i32::MIN / 2, bonus[0] * BONUS_FIRST_CHAR_MULTIPLIER) } else { ScoreCell { score: i32::MIN / 2, - bonus: 0, matched: false, } } @@ -231,11 +219,10 @@ impl MatcherDataView<'_, H> { *score_cell }; *score_cell = if c[1] == next_needle_char { - next_m_score(p_score, m_cell.score, m_cell.bonus, bonus[1]) + next_m_score(p_score, m_cell.score, bonus[1]) } else { ScoreCell { score: i32::MIN / 2, - bonus: 0, matched: false, } }; @@ -284,8 +271,9 @@ impl MatcherDataView<'_, H> { matrix_len: usize, start: u32, ) { - indices.clear(); - indices.resize(self.row_offs.len(), 0); + let indices_start = indices.len(); + indices.resize(indices_start + self.row_offs.len(), 0); + let indices = &mut indices[indices_start..]; let last_row_off = *self.row_offs.last().unwrap(); indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32; diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs new file mode 100644 index 0000000..c478348 --- /dev/null +++ b/matcher/src/lib.rs @@ -0,0 +1,535 @@ +/*! +`nucleo_matcher` is a low level crate that contains the matcher implementation +used by the other nucleo crates. + +The matcher is hightly optimized and can significantly outperform `fzf` and +`skim` (the `fuzzy-matcher` crate). However some of these optimizations require +a slightly less convenient API. Particularly, `nucleo_matcher` requires that +needles and haystacks are provided as [UTF32 strings](crate::Utf32Str) instead +of rusts normal utf32 strings. +*/ + +// sadly ranges don't optmimzie well +#![allow(clippy::manual_range_contains)] + +pub mod chars; +mod config; +#[cfg(test)] +mod debug; +mod exact; +mod fuzzy_greedy; +mod fuzzy_optimal; +mod matrix; +mod prefilter; +mod score; +mod utf32_str; + +#[cfg(test)] +mod tests; + +pub use crate::config::MatcherConfig; +pub use crate::utf32_str::Utf32Str; + +use crate::chars::{AsciiChar, Char}; +use crate::matrix::MatrixSlab; + +/// A matcher engine that can execute (fuzzy) matches. +/// +/// A matches contains **heap allocated** scratch memory that is reused during +/// matching. This scratch memory allows the matcher to garunte that it will +/// **never allocate** during matching (with the exception of pushing to the +/// `indices` vector if there isn't enough capacity). However this scratch +/// memory is fairly large (around 135KB) so creating a matcher is expensive and +/// should be reused. +/// +/// All `.._match` functions will not compute the indices of the matched chars +/// and are therefore significantly faster. These should be used to prefitler +/// and sort all matches. All `.._indices` functions will compute the indices of +/// the computed chars. These should be used when rendering the best N matches. +/// Note that the `indices` argument is **never cleared**. This allows running +/// multiple different matches on the same haystack and merging the indices by +/// sorting and deduplicating the vector. +/// +/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than +/// that the matcher *will panic*. The caller must decide whether it wants to +/// filter out long haystacks or truncate them. +pub struct Matcher { + pub config: MatcherConfig, + slab: MatrixSlab, +} + +impl Default for Matcher { + fn default() -> Self { + Matcher { + config: MatcherConfig::DEFAULT, + slab: MatrixSlab::new(), + } + } +} + +impl Matcher { + pub fn new(config: MatcherConfig) -> Self { + Self { + config, + slab: MatrixSlab::new(), + } + } + + /// Find the fuzzy match with the higehest score in the `haystack`. + /// + /// This functions has `O(mn)` time complexity for short inputs. To + /// avoid slowdowns it automatically falls back to [greedy matching] + /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) + } + + /// Find the fuzzy match with the higehest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(mn)` time complexity for short inputs. To + /// avoid slowdowns it automatically falls back to [greedy matching] + /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::(haystack, needle, indices) + } + + fn fuzzy_matcher_impl( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if needle_.len() > haystack.len() || needle_.is_empty() { + return None; + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::(haystack, needle_, indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "fuzzy matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::(haystack, needle, indices); + } + let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; + self.fuzzy_match_optimal::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.fuzzy_match_optimal::( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + end, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::(haystack, needle, start, indices); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.fuzzy_match_optimal::( + haystack, + needle, + start, + start + 1, + end, + indices, + ) + } + } + } + + /// Greedly find a fuzzy match in the `haystack`. + /// + /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) + /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + ) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::(haystack, needle, &mut Vec::new()) + } + + /// Greedly find a fuzzy match in the `haystack` and compute its indices. + /// + /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) + /// indices and scores. Usually [fuzz_indices](crate::Matcher::fuzzy_indices) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::(haystack, needle, indices) + } + + fn fuzzy_match_greedy_impl( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if needle_.len() > haystack.len() || needle_.is_empty() { + return None; + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::(haystack, needle_, indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; + self.fuzzy_match_greedy_::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::( + haystack, + needle, + start, + start + 1, + indices, + ) + } + } + } + + /// Finds the substring match with the highest score in the `haystack`. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_match( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + ) -> Option { + self.substring_match_impl::(haystack, needle_, &mut Vec::new()) + } + + /// Finds the substring match with the highest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_indices( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + self.substring_match_impl::(haystack, needle_, indices) + } + + fn substring_match_impl( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if needle_.len() > haystack.len() || needle_.is_empty() { + return None; + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::(haystack, needle_, indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::(haystack, needle, indices); + } + self.substring_match_ascii::(haystack, needle, indices) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.substring_match_non_ascii::( + haystack, + AsciiChar::cast(needle), + start, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::(haystack, needle, start, indices); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.fuzzy_match_optimal::( + haystack, + needle, + start, + start + 1, + end, + indices, + ) + } + } + } + + /// Checks whether needle and haystack match exactly. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { + self.exact_match_impl::(haystack, needle, &mut Vec::new()) + } + + /// Checks whether needle and haystack match exactly and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + self.exact_match_impl::(haystack, needle, indices) + } + + /// Checks whether needle is a prefix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { + if haystack.len() < needle.len() { + None + } else { + self.exact_match_impl::(haystack.slice(..needle.len()), needle, &mut Vec::new()) + } + } + + /// Checks whether needle is a prefix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if haystack.len() < needle.len() { + None + } else { + self.exact_match_impl::(haystack.slice(..needle.len()), needle, indices) + } + } + + /// Checks whether needle is a postfix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { + if haystack.len() < needle.len() { + None + } else { + self.exact_match_impl::( + haystack.slice(haystack.len() - needle.len()..), + needle, + &mut Vec::new(), + ) + } + } + + /// Checks whether needle is a postfix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if haystack.len() < needle.len() { + None + } else { + self.exact_match_impl::( + haystack.slice(haystack.len() - needle.len()..), + needle, + indices, + ) + } + } + + fn exact_match_impl( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec, + ) -> Option { + if needle_.len() != haystack.len() || needle_.is_empty() { + return None; + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + let score = match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let matched = if self.config.ignore_case { + AsciiChar::cast(haystack) + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))) + } else { + haystack == needle + }; + if !matched { + return None; + } + self.calculate_score::( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + 0, + haystack.len(), + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + return None; + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + haystack + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))); + + self.calculate_score::( + haystack, + AsciiChar::cast(needle), + 0, + haystack.len(), + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let matched = haystack + .iter() + .map(|c| c.normalize(&self.config)) + .eq(needle.iter().map(|c| c.normalize(&self.config))); + if !matched { + return None; + } + self.calculate_score::(haystack, needle, 0, haystack.len(), indices) + } + }; + Some(score) + } +} diff --git a/src/matrix.rs b/matcher/src/matrix.rs similarity index 99% rename from src/matrix.rs rename to matcher/src/matrix.rs index 755b5d3..550a5d7 100644 --- a/src/matrix.rs +++ b/matcher/src/matrix.rs @@ -88,7 +88,6 @@ impl MatrixLayout { #[derive(Clone, Copy)] pub(crate) struct ScoreCell { pub score: i32, - pub bonus: u16, pub matched: bool, } diff --git a/src/multizip.rs b/matcher/src/multizip.rs similarity index 100% rename from src/multizip.rs rename to matcher/src/multizip.rs diff --git a/src/prefilter.rs b/matcher/src/prefilter.rs similarity index 87% rename from src/prefilter.rs rename to matcher/src/prefilter.rs index 918c7b2..92fbb7a 100644 --- a/src/prefilter.rs +++ b/matcher/src/prefilter.rs @@ -30,7 +30,8 @@ impl Matcher { only_greedy: bool, ) -> Option<(usize, usize, usize)> { if self.config.ignore_case { - let start = find_ascii_ignore_case(needle[0], haystack)?; + let start = + find_ascii_ignore_case(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; let mut greedy_end = start + 1; haystack = &haystack[greedy_end..]; for &c in &needle[1..] { @@ -47,7 +48,7 @@ impl Matcher { Some((start, greedy_end, end)) } } else { - let start = memchr(needle[0], haystack)?; + let start = memchr(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; let mut greedy_end = start + 1; haystack = &haystack[greedy_end..]; for &c in &needle[1..] { @@ -72,7 +73,7 @@ impl Matcher { only_greedy: bool, ) -> Option<(usize, usize)> { let needle_char = needle.get(0); - let start = haystack + let start = haystack[..haystack.len() - needle.len() + 1] .iter() .position(|c| c.normalize(&self.config) == needle_char)?; let needle_char = needle.last(); @@ -80,15 +81,10 @@ impl Matcher { Some((start, start + 1)) } else { let end = haystack.len() - - haystack[start..] + - haystack[start + 1..] .iter() .rev() .position(|c| c.normalize(&self.config) == needle_char)?; - // matches are never possible in this case - if end - start < needle.len() { - cov_mark::hit!(small_haystack); - return None; - } Some((start, end)) } diff --git a/src/score.rs b/matcher/src/score.rs similarity index 78% rename from src/score.rs rename to matcher/src/score.rs index 4a14c7c..05e591c 100644 --- a/src/score.rs +++ b/matcher/src/score.rs @@ -15,11 +15,6 @@ pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; // in web2 dictionary and my file system. pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; -// Although bonus point for non-word characters is non-contextual, we need it -// for computing bonus points for consecutive chunks starting with a non-word -// character. -pub(crate) const BONUS_NON_WORD: u16 = SCORE_MATCH / 2; - // Edge-triggered bonus for matches in camelCase words. // Compared to word-boundary case, they don't accompany single-character gaps // (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly. @@ -28,19 +23,20 @@ pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_EXTENSION; // Minimum bonus point given to characters in consecutive chunks. // Note that bonus points for consecutive matches shouldn't have needed if we // used fixed match score as in the original algorithm. -pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; +pub(crate) const BONUS_CONSECUTIVE: u16 = + PENALTY_GAP_START + PENALTY_GAP_EXTENSION + PENALTY_GAP_EXTENSION; // The first character in the typed pattern usually has more significance // than the rest so it's important that it appears at special positions where // bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". // The amount of the extra bonus should be limited so that the gap penalty is // still respected. -pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 1; +pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; impl MatcherConfig { #[inline] pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { - if class > CharClass::NonWord { + if class > CharClass::Delimiter { // transition from non word to word match prev_class { CharClass::Whitespace => return self.bonus_boundary_white, @@ -54,8 +50,6 @@ impl MatcherConfig { { // camelCase letter123 BONUS_CAMEL123 - } else if class == CharClass::NonWord { - BONUS_NON_WORD } else if class == CharClass::Whitespace { self.bonus_boundary_white } else { @@ -78,7 +72,6 @@ impl Matcher { indices: &mut Vec, ) -> u16 { if INDICES { - indices.clear(); indices.reserve(needle.len()); } @@ -97,8 +90,8 @@ impl Matcher { indices.push(start as u32) } let class = haystack[start].char_class(&self.config); - let mut first_bonus = self.bonus_for(prev_class, class); - let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER; + let mut bonus = self.bonus_for(prev_class, class); + let mut score = SCORE_MATCH + bonus * BONUS_FIRST_CHAR_MULTIPLIER; prev_class = class; needle_char = *needle_iter.next().unwrap_or(&needle_char); @@ -108,17 +101,9 @@ impl Matcher { if INDICES { indices.push(i as u32 + start as u32 + 1) } - let mut bonus = self.bonus_for(prev_class, class); - if consecutive == 0 { - first_bonus = bonus - } else { - // Break consecutive chunk - if bonus > first_bonus { - first_bonus = bonus; - bonus = max(max(bonus, first_bonus), BONUS_CONSECUTIVE); - } else { - bonus = max(first_bonus, BONUS_CONSECUTIVE); - } + bonus = self.bonus_for(prev_class, class); + if consecutive != 0 { + bonus = max(bonus, BONUS_CONSECUTIVE); } score += SCORE_MATCH + bonus; in_gap = false; @@ -135,7 +120,6 @@ impl Matcher { score = score.saturating_sub(penalty); in_gap = true; consecutive = 0; - first_bonus = 0; } prev_class = class; } diff --git a/src/tests.rs b/matcher/src/tests.rs similarity index 78% rename from src/tests.rs rename to matcher/src/tests.rs index f6357ba..91b4a3e 100644 --- a/src/tests.rs +++ b/matcher/src/tests.rs @@ -1,8 +1,6 @@ -use cov_mark::check; - use crate::chars::Char; use crate::score::{ - BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, + BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, }; use crate::utf32_str::Utf32Str; @@ -46,13 +44,12 @@ fn assert_matches( score += needle.len() as u16 * SCORE_MATCH; for algo in algorithm { println!("xx {matched_indices:?} {algo:?}"); + matched_indices.clear(); let res = match algo { Algorithm::FuzzyOptimal => { - matched_indices.clear(); matcher.fuzzy_indices(haystack, needle, &mut matched_indices) } Algorithm::FuzzyGreedy => { - matched_indices.clear(); matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices) } }; @@ -115,6 +112,7 @@ pub fn assert_not_matches( ) } } + const BONUS_BOUNDARY_WHITE: u16 = MatcherConfig::DEFAULT.bonus_boundary_white; const BONUS_BOUNDARY_DELIMITER: u16 = MatcherConfig::DEFAULT.bonus_boundary_delimiter; @@ -144,20 +142,19 @@ fn test_fuzzy() { "/AutomatorDocument.icns", "rdoc", &[9, 10, 11, 12], - BONUS_CAMEL123 * 3, + BONUS_CAMEL123 + 2 * BONUS_CONSECUTIVE, ), ( "/man1/zshcompctl.1", "zshc", &[6, 7, 8, 9], - BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER - + BONUS_BOUNDARY_DELIMITER * 3, + BONUS_BOUNDARY_DELIMITER * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE * 3, ), ( "/.oh-my-zsh/cache", "zshc", &[8, 9, 10, 12], - BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 2 + BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE * 2 - PENALTY_GAP_START + BONUS_BOUNDARY_DELIMITER, ), @@ -171,9 +168,7 @@ fn test_fuzzy() { "abc123 456", "12356", &[3, 4, 5, 8, 9], - BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER - + BONUS_CAMEL123 * 2 - + BONUS_CONSECUTIVE + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, ), @@ -205,37 +200,42 @@ fn test_fuzzy() { "fooBar Baz", "foob", &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CONSECUTIVE * 2 + + BONUS_CAMEL123, ), ( "xFoo-Bar Baz", "foo-b", &[1, 2, 3, 4, 5], BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER - + BONUS_CAMEL123 * 2 - + BONUS_NON_WORD + + BONUS_CONSECUTIVE * 3 + BONUS_BOUNDARY, ), ( "]\0\0\0H\0\0\0rrrrrrrrrrrrrrrrrrrrrrrVVVVVVVV\0", "H\0\0VV", &[4, 5, 6, 31, 32], - BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + 2 * BONUS_CAMEL123 + BONUS_BOUNDARY * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE * 2 - PENALTY_GAP_START - - 23 * PENALTY_GAP_EXTENSION, + - 23 * PENALTY_GAP_EXTENSION + + BONUS_CAMEL123 + + BONUS_CONSECUTIVE, ), ( "\nץ&`@ `---\0\0\0\0", "`@ `--\0\0", &[3, 4, 5, 6, 7, 8, 10, 11], - BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 3) + BONUS_BOUNDARY_WHITE * 4 - - PENALTY_GAP_START, + BONUS_BOUNDARY_WHITE * 2 + 2 * BONUS_CONSECUTIVE - PENALTY_GAP_START + + BONUS_CONSECUTIVE, ), ( " 1111111u11111uuu111", "11111uuu1", &[9, 10, 11, 12, 13, 14, 15, 16, 17], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 8), + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + + 7 * BONUS_CONSECUTIVE + + BONUS_CAMEL123, ), ], ); @@ -275,14 +275,15 @@ fn test_fuzzy_case_sensitive() { "FooBar Baz", "FooB", &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 3, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CONSECUTIVE * 2 + + BONUS_CAMEL123, ), - // Consecutive bonus updated ( "foo-bar", "o-ba", &[2, 3, 4, 5], - BONUS_BOUNDARY * 2 + BONUS_NON_WORD, + BONUS_BOUNDARY + 2 * BONUS_CONSECUTIVE, ), ], ); @@ -300,13 +301,13 @@ fn test_normalize() { "Só Danço Samba", "So", &[0, 1], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE, ), ( "Só Danço Samba", "sodc", &[0, 1, 3, 6], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE - PENALTY_GAP_START + BONUS_BOUNDARY_WHITE - PENALTY_GAP_START @@ -316,19 +317,21 @@ fn test_normalize() { "Danço", "danco", &[0, 1, 2, 3, 4], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + 4 * BONUS_CONSECUTIVE, ), ( "DanÇo", "danco", &[0, 1, 2, 3, 4], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_CAMEL123 + + 3 * BONUS_CONSECUTIVE, ), ( "xÇando", "cando", &[1, 2, 3, 4, 5], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER + 4 * BONUS_CONSECUTIVE, ), ("ۂ(GCGɴCG", "n", &[5], 0), ], @@ -347,7 +350,7 @@ fn test_unicode1() { "你好世界", "你好", &[0, 1], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CONSECUTIVE, ), ( "你好世界", @@ -370,11 +373,55 @@ fn test_long_str() { &"x".repeat(u16::MAX as usize + 1), "xx", &[0, 1], - (BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE, + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + BONUS_CONSECUTIVE, )], ); } +#[test] +fn test_casing() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + &[ + // score 143 we currently slightly prefer camel + ( + "fooBar", + "foobar", + &[0, 1, 2, 3, 4, 5], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + + BONUS_CAMEL123 + + 4 * BONUS_CONSECUTIVE, + ), + // score 141 for perfect match + ( + "foobar", + "foobar", + &[0, 1, 2, 3, 4, 5], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + 5 * BONUS_CONSECUTIVE, + ), + // score 141 here too since the boundary bonus and the gap penalty/missed consecutive bonus cancel perfectly + ( + "foo-bar", + "foobar", + &[0, 1, 2, 4, 5, 6], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + BONUS_BOUNDARY + - PENALTY_GAP_START + + 4 * BONUS_CONSECUTIVE, + ), + ( + "foo_bar", + "foobar", + &[0, 1, 2, 4, 5, 6], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + BONUS_BOUNDARY + - PENALTY_GAP_START + + 4 * BONUS_CONSECUTIVE, + ), + ], + ) +} #[test] fn test_optimal() { assert_matches( @@ -387,60 +434,38 @@ fn test_optimal() { "axxx xx ", "xx", &[5, 6], - (BONUS_FIRST_CHAR_MULTIPLIER + 1) * BONUS_BOUNDARY_WHITE, - ), - ( - "I\0I", - "\0", - &[1], - BONUS_FIRST_CHAR_MULTIPLIER * BONUS_NON_WORD, + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_BOUNDARY_WHITE + BONUS_CONSECUTIVE, ), ( "SS!H", "S!", &[0, 2], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_NON_WORD - - PENALTY_GAP_START, + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, ), ( "^^^\u{7f}\0\0E%\u{1a}^", "^^\0E", &[1, 2, 5, 6], - BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 3) - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION, + BONUS_CONSECUTIVE + BONUS_BOUNDARY - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, ), ( - "Hٷ!!-!!!\n--\u{4}\u{c}-\u{8}-!\u{c}", - "-!--!", - &[4, 5, 13, 15, 16], - BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 4) - - 2 * PENALTY_GAP_START - - 6 * PENALTY_GAP_EXTENSION, - ), - ( - "C8Gۂ(GECGS", - "8GCG", - &[1, 2, 7, 8], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + "8gx(gecg)", + "8gcg", + &[0, 4, 6, 7], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + - PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION + + BONUS_BOUNDARY - PENALTY_GAP_START - - 3 * PENALTY_GAP_EXTENSION + BONUS_CONSECUTIVE, ), - ( - "\nץ&`@ `;;;\0\0\0\0", - "`@ `;;\0\0", - &[3, 4, 5, 6, 7, 9, 10, 11], - BONUS_NON_WORD * (BONUS_FIRST_CHAR_MULTIPLIER + 1) - + BONUS_BOUNDARY_DELIMITER * 3 - + BONUS_BOUNDARY_WHITE * 3 - - PENALTY_GAP_START, - ), ( "dddddd\0\0\0ddddfdddddd", "dddddfddddd", &[0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY * 10 + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + + BONUS_BOUNDARY + + 9 * BONUS_CONSECUTIVE - PENALTY_GAP_START - 7 * PENALTY_GAP_EXTENSION, ), @@ -476,9 +501,11 @@ fn test_reject() { false, &[ ("你好界", "abc"), + ("你好界", "a"), ("你好世界", "富"), ("Só Danço Samba", "sox"), ("fooBarbaz", "fooBarbazz"), + ("fooBarbaz", "c"), ], ); assert_not_matches( @@ -488,6 +515,8 @@ fn test_reject() { &[ ("你好界", "abc"), ("abc", "你"), + ("abc", "A"), + ("abc", "d"), ("你好世界", "富"), ("Só Danço Samba", "sox"), ("fooBarbaz", "oBZ"), @@ -499,8 +528,11 @@ fn test_reject() { false, true, false, - &[("Só Danço Samba", "sod"), ("Só Danço Samba", "soc")], + &[ + ("Só Danço Samba", "sod"), + ("Só Danço Samba", "soc"), + ("Só Danç", "So"), + ], ); - check!(small_haystack); assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]); } diff --git a/src/utf32_str.rs b/matcher/src/utf32_str.rs similarity index 97% rename from src/utf32_str.rs rename to matcher/src/utf32_str.rs index cf66091..70945be 100644 --- a/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,9 +1,9 @@ use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; -/// A UTF32 encoded (char array) String that can be used as an input to fuzzy matching. +/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. /// -/// Usually rusts utf8 encoded strings are great. However during fuzzy matching +/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching /// operates on codepoints (it should operate on graphemes but that's too much /// hassle to deal with). We want to quickly iterate these codeboints between /// (up to 5 times) during matching. diff --git a/src/exact.rs b/src/exact.rs deleted file mode 100644 index 70e42d1..0000000 --- a/src/exact.rs +++ /dev/null @@ -1,108 +0,0 @@ -use memchr::{Memchr, Memchr2}; - -use crate::chars::{AsciiChar, Char}; -use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH}; -use crate::Matcher; - -impl Matcher { - pub(crate) fn substring_match_1_ascii( - &mut self, - haystack: &[u8], - c: u8, - indices: &mut Vec, - ) -> Option { - let mut max_score = 0; - let mut max_pos = 0; - if self.config.ignore_case && c >= b'a' && c <= b'z' { - for i in Memchr2::new(c, c - 32, haystack) { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let char_class = AsciiChar(haystack[i]).char_class(&self.config); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if score >= self.config.bonus_boundary_white - && score >= self.config.bonus_boundary_delimiter - { - break; - } - } - } - } else { - let char_class = AsciiChar(c).char_class(&self.config); - for i in Memchr::new(c, haystack) { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if score >= self.config.bonus_boundary_white - && score >= self.config.bonus_boundary_delimiter - { - break; - } - } - } - } - if max_score == 0 { - return None; - } - - if INDICES { - indices.clear(); - indices.push(max_pos); - } - Some(max_score) - } - - pub(crate) fn substring_match_1_non_ascii( - &mut self, - haystack: &[char], - needle: char, - start: usize, - indices: &mut Vec, - ) -> u16 { - let mut max_score = 0; - let mut max_pos = 0; - let mut prev_class = start - .checked_sub(1) - .map(|i| haystack[i].char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - for (i, &c) in haystack[start..].iter().enumerate() { - let (c, char_class) = c.char_class_and_normalize(&self.config); - if c != needle { - println!("ups {c} {needle}"); - continue; - } - let bonus = self.config.bonus_for(prev_class, char_class); - prev_class = char_class; - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if score >= self.config.bonus_boundary_white - && score >= self.config.bonus_boundary_delimiter - { - break; - } - } - } - - if INDICES { - indices.clear(); - indices.push(max_pos + start as u32); - } - max_score - } -} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index d21bb0d..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,202 +0,0 @@ -// sadly ranges don't optmimzie well -#![allow(clippy::manual_range_contains)] - -pub mod chars; -mod config; -#[cfg(test)] -mod debug; -mod exact; -mod fuzzy_greedy; -mod fuzzy_optimal; -mod matrix; -mod prefilter; -mod score; -mod utf32_str; - -#[cfg(test)] -mod tests; - -pub use crate::config::MatcherConfig; -pub use crate::utf32_str::Utf32Str; - -use crate::chars::AsciiChar; -use crate::matrix::MatrixSlab; - -pub struct Matcher { - pub config: MatcherConfig, - slab: MatrixSlab, -} - -impl Matcher { - pub fn new(config: MatcherConfig) -> Self { - Self { - config, - slab: MatrixSlab::new(), - } - } - - pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) - } - - pub fn fuzzy_indices( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indidies: &mut Vec, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_matcher_impl::(haystack, needle, indidies) - } - - fn fuzzy_matcher_impl( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indidies: &mut Vec, - ) -> Option { - if needle_.len() > haystack.len() || needle_.is_empty() { - return None; - } - // if needle_.len() == haystack.len() { - // return self.exact_match(); - // } - assert!( - haystack.len() <= u32::MAX as usize, - "fuzzy matching is only support for up to 2^32-1 codepoints" - ); - match (haystack, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - return self.substring_match_1_ascii::(haystack, needle, indidies); - } - let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; - self.fuzzy_match_optimal::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - end, - indidies, - ) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - None - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self.substring_match_1_non_ascii::( - haystack, - needle as char, - start, - indidies, - ); - return Some(res); - } - let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; - self.fuzzy_match_optimal::( - haystack, - AsciiChar::cast(needle), - start, - start + 1, - end, - indidies, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self - .substring_match_1_non_ascii::(haystack, needle, start, indidies); - return Some(res); - } - let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; - self.fuzzy_match_optimal::( - haystack, - needle, - start, - start + 1, - end, - indidies, - ) - } - } - } - pub fn fuzzy_match_greedy( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_match_greedy_impl::(haystack, needle, &mut Vec::new()) - } - - pub fn fuzzy_indices_greedy( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indidies: &mut Vec, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_match_greedy_impl::(haystack, needle, indidies) - } - - fn fuzzy_match_greedy_impl( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indidies: &mut Vec, - ) -> Option { - if needle_.len() > haystack.len() || needle_.is_empty() { - return None; - } - // if needle_.len() == haystack.len() { - // return self.exact_match(); - // } - assert!( - haystack.len() <= u32::MAX as usize, - "fuzzy matching is only support for up to 2^32-1 codepoints" - ); - match (haystack, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; - self.fuzzy_match_greedy_::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - indidies, - ) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - None - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - self.fuzzy_match_greedy_::( - haystack, - AsciiChar::cast(needle), - start, - start + 1, - indidies, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - self.fuzzy_match_greedy_::( - haystack, - needle, - start, - start + 1, - indidies, - ) - } - } - } -} diff --git a/tarpulin.toml b/tarpulin.toml index 3a54abf..4869f25 100644 --- a/tarpulin.toml +++ b/tarpulin.toml @@ -1,3 +1 @@ -exclude = ["src/tests.rs", "src/debug.rs", "src/chars/normalize.rs"] -[report] -out = ["Html", "Xml"] \ No newline at end of file +exclude = ["matcher/src/tests.rs", "matcher/src/debug.rs", "matcher/src/chars/normalize.rs"] diff --git a/typos.toml b/typos.toml index 322202d..900e3df 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,3 @@ default.extend-ignore-re = ["\\\\u\\{[0-9A-Za-z]*\\}"] [files] -extend-exclude = ["src/tests.rs", "*.html"] +extend-exclude = ["matcher/src/tests.rs", "*.html"]