diff --git a/Cargo.lock b/Cargo.lock index 1c2028b..6514d5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -311,18 +311,6 @@ dependencies = [ "rustc_version", ] -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata 0.1.10", - "serde", -] - [[package]] name = "built" version = "0.7.7" @@ -454,8 +442,6 @@ dependencies = [ "lazy_static", "libsql", "log", - "prettytable-rs", - "regex", "rusqlite", "serde", "serde_json", @@ -463,6 +449,7 @@ dependencies = [ "tokio", "treexml", "turso", + "unicode-width 0.2.2", "uuid 0.6.5", "xml-rs", ] @@ -612,28 +599,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "csv" -version = "1.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" -dependencies = [ - "bstr", - "csv-core", - "itoa 0.4.8", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - [[package]] name = "ctr" version = "0.9.2" @@ -662,27 +627,6 @@ dependencies = [ "crypto-common", ] -[[package]] -name = "dirs-next" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" -dependencies = [ - "cfg-if 1.0.0", - "dirs-sys-next", -] - -[[package]] -name = "dirs-sys-next" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "dot_json" version = "0.2.0" @@ -712,12 +656,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "encode_unicode" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" - [[package]] name = "env_filter" version = "1.0.0" @@ -747,17 +685,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" -[[package]] -name = "errno" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" -dependencies = [ - "errno-dragonfly", - "libc", - "winapi", -] - [[package]] name = "errno" version = "0.3.14" @@ -768,16 +695,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "failure" version = "0.1.8" @@ -1086,15 +1003,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.5.2" @@ -1124,7 +1032,7 @@ checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" dependencies = [ "bytes", "fnv", - "itoa 1.0.1", + "itoa", ] [[package]] @@ -1134,7 +1042,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ "bytes", - "itoa 1.0.1", + "itoa", ] [[package]] @@ -1175,7 +1083,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.1", + "itoa", "pin-project-lite", "socket2 0.5.10", "tokio", @@ -1272,16 +1180,6 @@ dependencies = [ "memoffset", ] -[[package]] -name = "io-lifetimes" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" -dependencies = [ - "libc", - "windows-sys 0.42.0", -] - [[package]] name = "io-uring" version = "0.7.11" @@ -1293,18 +1191,6 @@ dependencies = [ "libc", ] -[[package]] -name = "is-terminal" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" -dependencies = [ - "hermit-abi 0.2.6", - "io-lifetimes", - "rustix 0.36.6", - "windows-sys 0.42.0", -] - [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -1329,12 +1215,6 @@ dependencies = [ "either", ] -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - [[package]] name = "itoa" version = "1.0.1" @@ -1519,12 +1399,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -1580,7 +1454,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" dependencies = [ - "regex-automata 0.4.14", + "regex-automata", ] [[package]] @@ -1606,7 +1480,7 @@ checksum = "5f98efec8807c63c752b5bd61f862c165c115b0a35685bdcfd9238c7aeb592b7" dependencies = [ "cfg-if 1.0.0", "miette-derive", - "unicode-width", + "unicode-width 0.1.14", ] [[package]] @@ -1775,7 +1649,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall 0.5.18", + "redox_syscall", "smallvec", "windows-link", ] @@ -1865,7 +1739,7 @@ checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" dependencies = [ "cfg-if 1.0.0", "concurrent-queue", - "hermit-abi 0.5.2", + "hermit-abi", "pin-project-lite", "rustix 1.1.4", "windows-sys 0.61.2", @@ -1920,20 +1794,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "prettytable-rs" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eea25e07510aa6ab6547308ebe3c036016d162b8da920dbb079e3ba8acf3d95a" -dependencies = [ - "csv", - "encode_unicode", - "is-terminal", - "lazy_static", - "term", - "unicode-width", -] - [[package]] name = "proc-macro2" version = "1.0.106" @@ -2124,15 +1984,6 @@ dependencies = [ "rand_core 0.3.1", ] -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.5.18" @@ -2142,17 +1993,6 @@ dependencies = [ "bitflags 2.11.0", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom 0.2.17", - "redox_syscall 0.2.16", - "thiserror 1.0.69", -] - [[package]] name = "regex" version = "1.12.3" @@ -2161,16 +2001,10 @@ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.14", + "regex-automata", "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-automata" version = "0.4.14" @@ -2261,20 +2095,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rustix" -version = "0.36.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" -dependencies = [ - "bitflags 1.3.2", - "errno 0.2.8", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.42.0", -] - [[package]] name = "rustix" version = "0.38.44" @@ -2282,7 +2102,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags 2.11.0", - "errno 0.3.14", + "errno", "libc", "linux-raw-sys 0.4.15", "windows-sys 0.59.0", @@ -2295,7 +2115,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags 2.11.0", - "errno 0.3.14", + "errno", "libc", "linux-raw-sys 0.12.1", "windows-sys 0.61.2", @@ -2456,7 +2276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2bb9cd061c5865d345bb02ca49fcef1391741b672b54a0bf7b679badec3142" dependencies = [ "indexmap 1.7.0", - "itoa 1.0.1", + "itoa", "ryu", "serde", ] @@ -2662,17 +2482,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "term" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" -dependencies = [ - "dirs-next", - "rustversion", - "winapi", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -2739,7 +2548,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", - "itoa 1.0.1", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -2932,7 +2741,7 @@ dependencies = [ "matchers", "nu-ansi-term", "once_cell", - "regex-automata 0.4.14", + "regex-automata", "sharded-slab", "smallvec", "thread_local", @@ -3177,6 +2986,12 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.2" @@ -3488,21 +3303,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm 0.42.0", - "windows_aarch64_msvc 0.42.0", - "windows_i686_gnu 0.42.0", - "windows_i686_msvc 0.42.0", - "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm 0.42.0", - "windows_x86_64_msvc 0.42.0", -] - [[package]] name = "windows-sys" version = "0.52.0" @@ -3572,12 +3372,6 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -3590,12 +3384,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -3608,12 +3396,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -3638,12 +3420,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" - [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -3656,12 +3432,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -3674,12 +3444,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -3692,12 +3456,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/cirup_core/Cargo.toml b/cirup_core/Cargo.toml index 5b06f37..db5c316 100644 --- a/cirup_core/Cargo.toml +++ b/cirup_core/Cargo.toml @@ -10,14 +10,13 @@ turso-rust = ["dep:tokio", "dep:turso", "dep:libsql"] rusqlite-c = ["dep:rusqlite"] [dependencies] -regex = "1.0" serde = { version = "1.0", features = ["derive"] } xml-rs = "0.8.0" dot_json = "0.2.0" lazy_static = "1.0.0" -prettytable-rs = "^0.10" log = "0.4" sha2 = "0.10" +unicode-width = "0.2" [dependencies.uuid] version = "0.6" diff --git a/cirup_core/src/file.rs b/cirup_core/src/file.rs index 7aade8e..c016bef 100644 --- a/cirup_core/src/file.rs +++ b/cirup_core/src/file.rs @@ -4,6 +4,8 @@ use std::path::Path; use std::collections::HashMap; use std::sync::Mutex; +#[cfg(test)] +use std::time::Instant; use sha2::{Digest, Sha256}; @@ -50,8 +52,14 @@ pub(crate) fn load_string_from_file(filename: &str) -> Result, tou output_hash != sha256_hash(existing_bytes) } -fn encode_utf8(text: &str, output_encoding: OutputEncoding) -> Vec { +fn encode_utf8_owned(text: String, output_encoding: OutputEncoding) -> Vec { match output_encoding { - OutputEncoding::Utf8NoBom => text.as_bytes().to_vec(), + OutputEncoding::Utf8NoBom => text.into_bytes(), OutputEncoding::Utf8Bom => { + let text = text.into_bytes(); let mut output = Vec::with_capacity(UTF8_BOM.len() + text.len()); output.extend_from_slice(&UTF8_BOM); - output.extend_from_slice(text.as_bytes()); + output.extend_from_slice(&text); output } } @@ -95,17 +104,17 @@ fn output_bytes_for_format( FormatType::Json => { let file_format = JsonFileFormat {}; let text = file_format.write_to_str(resources); - encode_utf8(&text, output_encoding) + encode_utf8_owned(text, output_encoding) } FormatType::Resx => { let file_format = ResxFileFormat {}; let text = file_format.write_to_str(resources); - encode_utf8(&text, output_encoding) + encode_utf8_owned(text, output_encoding) } FormatType::Restext => { let file_format = RestextFileFormat {}; let text = file_format.write_to_str(resources); - encode_utf8(&text, output_encoding) + encode_utf8_owned(text, output_encoding) } FormatType::Unknown => Vec::new(), } @@ -355,3 +364,31 @@ fn would_save_resource_file_reports_true_for_missing_output() { assert!(would_write); } + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_output_bytes_for_format_large_input() { + let resources = (0..50_000usize) + .map(|index| Resource::new(&format!("group{index}.key{}", index % 13), &format!("value{index}"))) + .collect::>(); + + let started = Instant::now(); + let utf8_no_bom = output_bytes_for_format(FormatType::Json, &resources, OutputEncoding::Utf8NoBom); + let utf8_no_bom_elapsed = started.elapsed(); + + let started = Instant::now(); + let utf8_bom = output_bytes_for_format(FormatType::Json, &resources, OutputEncoding::Utf8Bom); + let utf8_bom_elapsed = started.elapsed(); + + assert!(utf8_no_bom.len() < utf8_bom.len()); + + println!( + "output-bytes benchmark: resources={} utf8_no_bom_bytes={} utf8_bom_bytes={} no_bom={:?} bom={:?}", + resources.len(), + utf8_no_bom.len(), + utf8_bom.len(), + utf8_no_bom_elapsed, + utf8_bom_elapsed + ); +} diff --git a/cirup_core/src/json.rs b/cirup_core/src/json.rs index e65f75c..c5a789c 100644 --- a/cirup_core/src/json.rs +++ b/cirup_core/src/json.rs @@ -1,10 +1,10 @@ -extern crate dot_json; extern crate serde; extern crate serde_json; -use dot_json::value_to_dot; use serde::Serialize; use serde_json::{Map, Value}; +#[cfg(test)] +use std::time::Instant; use crate::Resource; use crate::file::FileFormat; @@ -14,16 +14,12 @@ use std::error::Error; pub(crate) struct JsonFileFormat {} fn json_dot_insert(root_map: &mut Map, name: &str, value: &str) { - if let Some(dot_index) = name.find('.') { - let root_path = &name[0..dot_index]; - let child_path = &name[dot_index + 1..name.len()]; + if let Some((root_path, child_path)) = name.split_once('.') { + let child_value = root_map + .entry(root_path.to_owned()) + .or_insert_with(|| Value::Object(Map::new())); - if !root_map.contains_key(root_path) { - let child_map: Map = Map::new(); - root_map.insert(root_path.to_owned(), Value::Object(child_map)); - } - - if let Some(Value::Object(child_map)) = root_map.get_mut(root_path) { + if let Value::Object(child_map) = child_value { json_dot_insert(child_map, child_path, value); } } else { @@ -31,6 +27,24 @@ fn json_dot_insert(root_map: &mut Map, name: &str, value: &str) { } } +fn flatten_json_value(value: &Value, path: &mut String, resources: &mut Vec) { + match value { + Value::Object(object) => { + for (key, child_value) in object { + let prefix_len = path.len(); + if prefix_len > 0 { + path.push('.'); + } + path.push_str(key); + flatten_json_value(child_value, path, resources); + path.truncate(prefix_len); + } + } + Value::String(text) => resources.push(Resource::new(path, text)), + _ => {} + } +} + fn json_to_string_pretty(value: &Map) -> String { let writer = Vec::new(); let formatter = serde_json::ser::PrettyFormatter::with_indent(b" "); @@ -47,17 +61,18 @@ impl FileFormat for JsonFileFormat { fn parse_from_str(&self, text: &str) -> Result, Box> { let mut resources: Vec = Vec::new(); let root_value: Value = serde_json::from_str(text)?; - let root_value_dot = value_to_dot(&root_value); - let root_object_dot = match root_value_dot.as_object() { + let root_object = match root_value.as_object() { Some(object) => object, - None => Err("json dot value is not an object")?, + None => Err("json value is not an object")?, }; - for (key, value) in root_object_dot.iter() { - if let Some(value) = value.as_str() { - let resource = Resource::new(key.as_str(), value); - resources.push(resource); - } + + let mut path = String::new(); + for (key, value) in root_object { + path.clear(); + path.push_str(key); + flatten_json_value(value, &mut path, &mut resources); } + Ok(resources) } @@ -161,3 +176,42 @@ fn test_json_write() { //println!("{}", expected_text); assert_eq!(actual_text, expected_text); } + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_json_parse_and_write_large_input() { + let file_format = JsonFileFormat {}; + let repetitions = 5_000usize; + let mut resources = Vec::with_capacity(repetitions * 6); + + for index in 0..repetitions { + let prefix = format!("group{index}"); + resources.push(Resource::new(&format!("{prefix}.lblBoat"), "I'm on a boat.")); + resources.push(Resource::new(&format!("{prefix}.lblYolo"), "You only live once")); + resources.push(Resource::new(&format!("{prefix}.lblDogs"), "Who let the dogs out?")); + resources.push(Resource::new(&format!("{prefix}.language.en"), "English")); + resources.push(Resource::new(&format!("{prefix}.language.fr"), "French")); + resources.push(Resource::new(&format!("{prefix}.very.deep.object"), "value")); + } + + let started = Instant::now(); + let written = file_format.write_to_str(&resources); + let write_elapsed = started.elapsed(); + + let started = Instant::now(); + let reparsed = file_format + .parse_from_str(&written) + .unwrap_or_else(|e| panic!("json benchmark parse failed: {}", e)); + let parse_elapsed = started.elapsed(); + + assert_eq!(reparsed.len(), resources.len()); + + println!( + "json benchmark: resources={} bytes={} write={:?} parse={:?}", + resources.len(), + written.len(), + write_elapsed, + parse_elapsed + ); +} diff --git a/cirup_core/src/lib.rs b/cirup_core/src/lib.rs index ffb78e4..918fda4 100644 --- a/cirup_core/src/lib.rs +++ b/cirup_core/src/lib.rs @@ -1,4 +1,3 @@ -extern crate regex; extern crate treexml; extern crate uuid; @@ -10,9 +9,6 @@ extern crate serde_json; #[macro_use] extern crate log; -#[macro_use] -extern crate prettytable; - #[macro_use] extern crate lazy_static; diff --git a/cirup_core/src/query.rs b/cirup_core/src/query.rs index d07a7c4..030806d 100644 --- a/cirup_core/src/query.rs +++ b/cirup_core/src/query.rs @@ -1,9 +1,10 @@ #![allow(clippy::self_named_module_files)] use std::io; +#[cfg(test)] +use std::time::Instant; -use prettytable::{Cell, Row, Table}; -use regex::Regex; +use unicode_width::UnicodeWidthStr; use crate::config::{QueryBackendKind, QueryConfig}; use crate::file::{ @@ -65,12 +66,43 @@ enum TextPatternSegment { AnyMany, } +#[derive(Debug, Clone)] +enum TextPatternMatcher { + Exact(String), + Prefix(String), + Suffix(String), + Contains(String), + PrefixSuffix { prefix: String, suffix: String }, + Wildcard(Vec), +} + +impl TextPatternMatcher { + fn matches(&self, value: &str) -> bool { + match self { + Self::Exact(expected) => value == expected, + Self::Prefix(prefix) => value.starts_with(prefix), + Self::Suffix(suffix) => value.ends_with(suffix), + Self::Contains(fragment) => value.contains(fragment), + Self::PrefixSuffix { prefix, suffix } => { + value.len() >= prefix.len() + suffix.len() && value.starts_with(prefix) && value.ends_with(suffix) + } + Self::Wildcard(segments) => wildcard_matches(segments, value), + } + } +} + #[derive(Debug, Clone)] struct CompiledTextPattern { - regex: Regex, + matcher: TextPatternMatcher, glob_pattern: String, } +impl CompiledTextPattern { + fn matches(&self, value: &str) -> bool { + self.matcher.matches(value) + } +} + #[derive(Debug, Clone, Default)] struct CompiledTextFilter { patterns: Vec, @@ -78,21 +110,34 @@ struct CompiledTextFilter { impl CompiledTextFilter { fn matches(&self, value: &str) -> bool { - self.patterns.iter().any(|pattern| pattern.regex.is_match(value)) + self.patterns.iter().any(|pattern| pattern.matches(value)) } fn sql_condition(&self, value_expr: &str) -> String { - let clauses = self - .patterns - .iter() - .map(|pattern| format!("{value_expr} GLOB {}", sql_quote_literal(&pattern.glob_pattern))) - .collect::>(); - - if clauses.len() == 1 { - clauses[0].clone() - } else { - format!("({})", clauses.join(" OR ")) + let mut clauses = self.patterns.iter(); + let Some(first_pattern) = clauses.next() else { + return String::new(); + }; + + let mut condition = String::with_capacity((value_expr.len() + 24) * self.patterns.len() + 4); + let needs_wrap = self.patterns.len() > 1; + + if needs_wrap { + condition.push('('); + } + + append_glob_clause(&mut condition, value_expr, &first_pattern.glob_pattern); + + for pattern in clauses { + condition.push_str(" OR "); + append_glob_clause(&mut condition, value_expr, &pattern.glob_pattern); } + + if needs_wrap { + condition.push(')'); + } + + condition } fn is_empty(&self) -> bool { @@ -104,39 +149,38 @@ fn make_error(message: impl Into) -> io::Error { io::Error::other(message.into()) } -fn sql_quote_literal(value: &str) -> String { - format!("'{}'", value.replace('\'', "''")) +fn append_sql_quote_literal(output: &mut String, value: &str) { + output.push('\''); + for ch in value.chars() { + if ch == '\'' { + output.push('\''); + } + output.push(ch); + } + output.push('\''); +} + +fn append_glob_clause(output: &mut String, value_expr: &str, glob_pattern: &str) { + output.push_str(value_expr); + output.push_str(" GLOB "); + append_sql_quote_literal(output, glob_pattern); } -fn escape_glob_char(ch: char) -> String { +fn append_escaped_glob_char(output: &mut String, ch: char) { match ch { - '*' => String::from("[*]"), - '?' => String::from("[?]"), - '[' => String::from("[[]"), - ']' => String::from("[]]"), - _ => ch.to_string(), + '*' => output.push_str("[*]"), + '?' => output.push_str("[?]"), + '[' => output.push_str("[[]"), + ']' => output.push_str("[]]"), + _ => output.push(ch), } } -fn compress_glob_stars(glob_pattern: &str) -> String { - let mut output = String::new(); - let mut previous_star = false; - - for ch in glob_pattern.chars() { - if ch == '*' { - if previous_star { - continue; - } - - previous_star = true; - } else { - previous_star = false; - } - - output.push(ch); +fn push_glob_wildcard(output: &mut String, previous_star: &mut bool) { + if !*previous_star { + output.push('*'); + *previous_star = true; } - - output } fn is_escaped_char(chars: &[char], index: usize) -> bool { @@ -155,6 +199,91 @@ fn is_escaped_char(chars: &[char], index: usize) -> bool { backslash_count % 2 == 1 } +fn push_match_segment(output: &mut Vec, segment: TextPatternSegment) { + if matches!(segment, TextPatternSegment::AnyMany) && matches!(output.last(), Some(TextPatternSegment::AnyMany)) { + return; + } + + output.push(segment); +} + +fn advance_char_boundary(value: &str, index: usize) -> Option { + value[index..].chars().next().map(|ch| index + ch.len_utf8()) +} + +fn wildcard_matches(segments: &[TextPatternSegment], value: &str) -> bool { + let mut segment_index = 0usize; + let mut value_index = 0usize; + let mut star_segment_index = None; + let mut star_value_index = 0usize; + + while value_index < value.len() { + match segments.get(segment_index) { + Some(TextPatternSegment::Literal(literal)) if value[value_index..].starts_with(literal) => { + value_index += literal.len(); + segment_index += 1; + } + Some(TextPatternSegment::AnyOne) => { + let Some(next_index) = advance_char_boundary(value, value_index) else { + return false; + }; + value_index = next_index; + segment_index += 1; + } + Some(TextPatternSegment::AnyMany) => { + star_segment_index = Some(segment_index); + star_value_index = value_index; + segment_index += 1; + } + _ => { + let Some(star_index) = star_segment_index else { + return false; + }; + let Some(next_index) = advance_char_boundary(value, star_value_index) else { + return false; + }; + star_value_index = next_index; + value_index = next_index; + segment_index = star_index + 1; + } + } + } + + while matches!(segments.get(segment_index), Some(TextPatternSegment::AnyMany)) { + segment_index += 1; + } + + segment_index == segments.len() +} + +fn matcher_from_segments(segments: Vec) -> TextPatternMatcher { + match segments.as_slice() { + [] => TextPatternMatcher::Exact(String::new()), + [TextPatternSegment::AnyMany] => TextPatternMatcher::Contains(String::new()), + [TextPatternSegment::Literal(literal)] => TextPatternMatcher::Exact(literal.clone()), + [TextPatternSegment::Literal(prefix), TextPatternSegment::AnyMany] => { + TextPatternMatcher::Prefix(prefix.clone()) + } + [TextPatternSegment::AnyMany, TextPatternSegment::Literal(suffix)] => { + TextPatternMatcher::Suffix(suffix.clone()) + } + [ + TextPatternSegment::AnyMany, + TextPatternSegment::Literal(fragment), + TextPatternSegment::AnyMany, + ] => TextPatternMatcher::Contains(fragment.clone()), + [ + TextPatternSegment::Literal(prefix), + TextPatternSegment::AnyMany, + TextPatternSegment::Literal(suffix), + ] => TextPatternMatcher::PrefixSuffix { + prefix: prefix.clone(), + suffix: suffix.clone(), + }, + _ => TextPatternMatcher::Wildcard(segments), + } +} + fn compile_text_pattern(flag_name: &str, pattern: &str) -> Result { if pattern.is_empty() { return Err(make_error(format!("{flag_name} cannot be empty"))); @@ -239,51 +368,44 @@ fn compile_text_pattern(flag_name: &str, pattern: &str) -> Result { - regex_pattern.push_str(®ex::escape(value)); for ch in value.chars() { - glob_pattern.push_str(&escape_glob_char(ch)); + append_escaped_glob_char(&mut glob_pattern, ch); + previous_glob_star = false; } + push_match_segment(&mut match_segments, TextPatternSegment::Literal(value.clone())); } TextPatternSegment::AnyOne => { - regex_pattern.push('.'); glob_pattern.push('?'); + previous_glob_star = false; + push_match_segment(&mut match_segments, TextPatternSegment::AnyOne); } TextPatternSegment::AnyMany => { - regex_pattern.push_str(".*"); - glob_pattern.push('*'); + push_glob_wildcard(&mut glob_pattern, &mut previous_glob_star); + push_match_segment(&mut match_segments, TextPatternSegment::AnyMany); } } } - if anchored_end { - regex_pattern.push('$'); - } else { - glob_pattern.push('*'); + if !anchored_end { + push_glob_wildcard(&mut glob_pattern, &mut previous_glob_star); + push_match_segment(&mut match_segments, TextPatternSegment::AnyMany); } - let regex = Regex::new(®ex_pattern).map_err(|error| { - make_error(format!( - "invalid {flag_name} '{}': failed to compile generated regex '{}': {}", - pattern, regex_pattern, error, - )) - })?; - Ok(CompiledTextPattern { - regex, - glob_pattern: compress_glob_stars(&glob_pattern), + matcher: matcher_from_segments(match_segments), + glob_pattern, }) } @@ -320,11 +442,22 @@ fn compile_query_filters(options: &QueryRunOptions) -> Result String { - input - .split_whitespace() - .collect::>() - .join(" ") - .to_ascii_lowercase() + let mut output = String::with_capacity(input.len()); + let mut saw_token = false; + + for token in input.split_whitespace() { + if saw_token { + output.push(' '); + } + + for ch in token.chars() { + output.push(ch.to_ascii_lowercase()); + } + + saw_token = true; + } + + output } fn wrap_resource_query_with_filters(query: &str, filters: &CompiledQueryFilters) -> String { @@ -460,46 +593,99 @@ fn ensure_trailing_newline(mut text: String) -> String { text } +fn ascii_table_border(output: &mut String, widths: &[usize; N]) { + output.push('+'); + for width in widths { + for _ in 0..(*width + 2) { + output.push('-'); + } + output.push('+'); + } + output.push('\n'); +} + +fn ascii_table_row(output: &mut String, widths: &[usize; N], cells: [&str; N]) { + output.push('|'); + for (width, cell) in widths.iter().zip(cells) { + output.push(' '); + output.push_str(cell); + let padding = width.saturating_sub(UnicodeWidthStr::width(cell)) + 1; + for _ in 0..padding { + output.push(' '); + } + output.push('|'); + } + output.push('\n'); +} + +fn estimate_ascii_table_capacity(column_widths: &[usize], row_count: usize) -> usize { + let line_len = column_widths.iter().sum::() + column_widths.len() * 3 + 2; + line_len * (row_count * 2 + 1) +} + fn resources_to_table(resources: &[Resource]) -> String { - let mut table: Table = Table::new(); + let mut widths = [UnicodeWidthStr::width("name"), UnicodeWidthStr::width("value")]; - table.add_row(row!["name", "value"]); + for resource in resources { + widths[0] = widths[0].max(UnicodeWidthStr::width(resource.name.as_str())); + widths[1] = widths[1].max(UnicodeWidthStr::width(resource.value.as_str())); + } + + let mut output = String::with_capacity(estimate_ascii_table_capacity(&widths, resources.len() + 1)); + + ascii_table_border(&mut output, &widths); + ascii_table_row(&mut output, &widths, ["name", "value"]); + ascii_table_border(&mut output, &widths); for resource in resources { - let mut row = Row::empty(); - row.add_cell(Cell::new(resource.name.as_str())); - row.add_cell(Cell::new(resource.value.as_str())); - table.add_row(row); + ascii_table_row(&mut output, &widths, [resource.name.as_str(), resource.value.as_str()]); + ascii_table_border(&mut output, &widths); } - ensure_trailing_newline(table.to_string()) + output } fn triples_to_table(triples: &[Triple]) -> String { - let mut table: Table = Table::new(); + let mut widths = [ + UnicodeWidthStr::width("name"), + UnicodeWidthStr::width("value"), + UnicodeWidthStr::width("base"), + ]; + + for triple in triples { + widths[0] = widths[0].max(UnicodeWidthStr::width(triple.name.as_str())); + widths[1] = widths[1].max(UnicodeWidthStr::width(triple.value.as_str())); + widths[2] = widths[2].max(UnicodeWidthStr::width(triple.base.as_str())); + } + + let mut output = String::with_capacity(estimate_ascii_table_capacity(&widths, triples.len() + 1)); - table.add_row(row!["name", "value", "base"]); + ascii_table_border(&mut output, &widths); + ascii_table_row(&mut output, &widths, ["name", "value", "base"]); + ascii_table_border(&mut output, &widths); for triple in triples { - let mut row = Row::empty(); - row.add_cell(Cell::new(triple.name.as_str())); - row.add_cell(Cell::new(triple.value.as_str())); - row.add_cell(Cell::new(triple.base.as_str())); - table.add_row(row); + ascii_table_row( + &mut output, + &widths, + [triple.name.as_str(), triple.value.as_str(), triple.base.as_str()], + ); + ascii_table_border(&mut output, &widths); } - ensure_trailing_newline(table.to_string()) + output } fn render_jsonl(values: &[T]) -> String { - let mut output = String::new(); + let mut output = Vec::with_capacity(values.len().saturating_mul(32)); for value in values { - output.push_str(&serde_json::to_string(value).expect("failed to serialize JSONL row")); - output.push('\n'); + serde_json::to_writer(&mut output, value).expect("failed to serialize JSONL row"); + output.push(b'\n'); } - output + // SAFETY: serde_json emits valid UTF-8 and this function only appends ASCII newlines. + unsafe { String::from_utf8_unchecked(output) } } fn render_resources(resources: &[Resource], output_format: QueryOutputFormat) -> String { @@ -526,35 +712,48 @@ fn render_count(count: usize) -> String { format!("{count}\n") } -fn report_to_table(report: &QueryExecutionReport) -> String { - let mut table: Table = Table::new(); - - table.add_row(row!["field", "value"]); +fn bool_str(value: bool) -> &'static str { + if value { "true" } else { "false" } +} +fn report_to_table(report: &QueryExecutionReport) -> String { + let input_files = report.input_files.join(","); + let matched_count = report.matched_count.to_string(); + let filtered_count = report.filtered_count.to_string(); + let output_count = report.output_count.to_string(); let rows = [ - ("operation", report.operation.as_deref().unwrap_or_default().to_owned()), - ("result_kind", report.result_kind.clone()), - ("input_files", report.input_files.join(",")), - ("output_file", report.output_file.clone().unwrap_or_default()), - ("matched_count", report.matched_count.to_string()), - ("filtered_count", report.filtered_count.to_string()), - ("output_count", report.output_count.to_string()), - ("truncated", report.truncated.to_string()), - ("dry_run", report.dry_run.to_string()), - ("check", report.check.to_string()), - ("would_write", report.would_write.to_string()), - ("wrote_output", report.wrote_output.to_string()), - ("change_detected", report.change_detected.to_string()), + ("operation", report.operation.as_deref().unwrap_or_default()), + ("result_kind", report.result_kind.as_str()), + ("input_files", input_files.as_str()), + ("output_file", report.output_file.as_deref().unwrap_or_default()), + ("matched_count", matched_count.as_str()), + ("filtered_count", filtered_count.as_str()), + ("output_count", output_count.as_str()), + ("truncated", bool_str(report.truncated)), + ("dry_run", bool_str(report.dry_run)), + ("check", bool_str(report.check)), + ("would_write", bool_str(report.would_write)), + ("wrote_output", bool_str(report.wrote_output)), + ("change_detected", bool_str(report.change_detected)), ]; + let mut widths = [UnicodeWidthStr::width("field"), UnicodeWidthStr::width("value")]; for (field, value) in rows { - let mut row = Row::empty(); - row.add_cell(Cell::new(field)); - row.add_cell(Cell::new(value.as_str())); - table.add_row(row); + widths[0] = widths[0].max(UnicodeWidthStr::width(field)); + widths[1] = widths[1].max(UnicodeWidthStr::width(value)); } - ensure_trailing_newline(table.to_string()) + let mut output = String::with_capacity(estimate_ascii_table_capacity(&widths, rows.len() + 1)); + ascii_table_border(&mut output, &widths); + ascii_table_row(&mut output, &widths, ["field", "value"]); + ascii_table_border(&mut output, &widths); + + for (field, value) in rows { + ascii_table_row(&mut output, &widths, [field, value]); + ascii_table_border(&mut output, &widths); + } + + output } fn render_report(report: &QueryExecutionReport, output_format: QueryOutputFormat) -> String { @@ -573,12 +772,19 @@ fn filter_resources( limit: Option, ) -> (QueryExecutionCounts, Vec) { let matched_count = resources.len(); - if let Some(key_filter) = filters.key_filter.as_ref() { - resources.retain(|resource| key_filter.matches(&resource.name)); - } - if let Some(value_filter) = filters.value_filter.as_ref() { - resources.retain(|resource| value_filter.matches(&resource.value)); + match (filters.key_filter.as_ref(), filters.value_filter.as_ref()) { + (Some(key_filter), Some(value_filter)) => { + resources.retain(|resource| key_filter.matches(&resource.name) && value_filter.matches(&resource.value)); + } + (Some(key_filter), None) => { + resources.retain(|resource| key_filter.matches(&resource.name)); + } + (None, Some(value_filter)) => { + resources.retain(|resource| value_filter.matches(&resource.value)); + } + (None, None) => {} } + let filtered_count = resources.len(); let mut truncated = false; @@ -604,12 +810,19 @@ fn filter_triples( limit: Option, ) -> (QueryExecutionCounts, Vec) { let matched_count = triples.len(); - if let Some(key_filter) = filters.key_filter.as_ref() { - triples.retain(|triple| key_filter.matches(&triple.name)); - } - if let Some(value_filter) = filters.value_filter.as_ref() { - triples.retain(|triple| value_filter.matches(&triple.value)); + match (filters.key_filter.as_ref(), filters.value_filter.as_ref()) { + (Some(key_filter), Some(value_filter)) => { + triples.retain(|triple| key_filter.matches(&triple.name) && value_filter.matches(&triple.value)); + } + (Some(key_filter), None) => { + triples.retain(|triple| key_filter.matches(&triple.name)); + } + (None, Some(value_filter)) => { + triples.retain(|triple| value_filter.matches(&triple.value)); + } + (None, None) => {} } + let filtered_count = triples.len(); let mut truncated = false; @@ -721,42 +934,16 @@ pub struct CirupQuery { query: String, } -const PRINT_QUERY: &str = "SELECT * FROM A"; -const DIFF_QUERY: &str = r" - SELECT A.key, A.val, B.val - FROM A - LEFT OUTER JOIN B ON A.key = B.key - WHERE (B.val IS NULL)"; -const DIFF_WITH_BASE_QUERY: &str = r" - SELECT B.key, B.val, C.val - FROM B - LEFT OUTER JOIN A ON B.key = A.key - INNER JOIN C ON B.key = C.key - WHERE (A.val IS NULL)"; -const CHANGE_QUERY: &str = r" - SELECT A.key, A.val, B.val - FROM A - LEFT OUTER JOIN B ON A.key = B.key - WHERE (B.val IS NULL) OR (A.val <> B.val)"; -const MERGE_QUERY: &str = r" - SELECT A.key, CASE WHEN B.val IS NOT NULL THEN B.val ELSE A.val END - FROM A - LEFT OUTER JOIN B on A.key = B.key - UNION - SELECT B.key, B.val - FROM B - LEFT OUTER JOIN A on A.key = B.key - WHERE (A.key IS NULL)"; -const INTERSECT_QUERY: &str = r" - SELECT * FROM A - INTERSECT - SELECT * from B"; -const SUBTRACT_QUERY: &str = r" - SELECT * FROM A - WHERE A.key NOT IN - (SELECT B.key FROM B)"; -const CONVERT_QUERY: &str = "SELECT * FROM A"; -const SORT_QUERY: &str = "SELECT * FROM A ORDER BY A.key"; +const PRINT_QUERY: &str = "select * from a"; +const DIFF_QUERY: &str = "select a.key, a.val, b.val from a left outer join b on a.key = b.key where (b.val is null)"; +const DIFF_WITH_BASE_QUERY: &str = "select b.key, b.val, c.val from b left outer join a on b.key = a.key inner join c on b.key = c.key where (a.val is null)"; +const CHANGE_QUERY: &str = + "select a.key, a.val, b.val from a left outer join b on a.key = b.key where (b.val is null) or (a.val <> b.val)"; +const MERGE_QUERY: &str = "select a.key, case when b.val is not null then b.val else a.val end from a left outer join b on a.key = b.key union select b.key, b.val from b left outer join a on a.key = b.key where (a.key is null)"; +const INTERSECT_QUERY: &str = "select * from a intersect select * from b"; +const SUBTRACT_QUERY: &str = "select * from a where a.key not in (select b.key from b)"; +const CONVERT_QUERY: &str = PRINT_QUERY; +const SORT_QUERY: &str = "select * from a order by a.key"; pub fn query_print(file: &str) -> CirupQuery { query_print_with_backend(file, default_query_backend()) @@ -1141,6 +1328,39 @@ fn test_render_resources_jsonl() { ); } +#[test] +fn test_render_resources_table_preserves_ascii_layout() { + let resources = vec![ + Resource::new("lblBoat", "I'm on a boat."), + Resource::new("lblYolo", "You only live once"), + Resource::new("lblDogs", "Who let the dogs out?"), + Resource::new("language.en", "English"), + Resource::new("language.fr", "French"), + Resource::new("very.deep.object", "value"), + ]; + + let output = render_resources(&resources, QueryOutputFormat::Table); + let expected = concat!( + "+------------------+-----------------------+\n", + "| name | value |\n", + "+------------------+-----------------------+\n", + "| lblBoat | I'm on a boat. |\n", + "+------------------+-----------------------+\n", + "| lblYolo | You only live once |\n", + "+------------------+-----------------------+\n", + "| lblDogs | Who let the dogs out? |\n", + "+------------------+-----------------------+\n", + "| language.en | English |\n", + "+------------------+-----------------------+\n", + "| language.fr | French |\n", + "+------------------+-----------------------+\n", + "| very.deep.object | value |\n", + "+------------------+-----------------------+\n", + ); + + assert_eq!(output, expected); +} + #[test] fn test_render_triples_json() { let triples = vec![Triple::new("hello", "world", "base")]; @@ -1149,6 +1369,260 @@ fn test_render_triples_json() { assert_eq!(output, "[{\"name\":\"hello\",\"value\":\"world\",\"base\":\"base\"}]\n"); } +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_render_and_filter_resources_large_input() { + let resources = (0..50_000usize) + .map(|index| { + Resource::new( + &format!("group{index}.key{}", index % 17), + if index % 3 == 0 { "English" } else { "French" }, + ) + }) + .collect::>(); + + let options = QueryRunOptions { + key_filters: vec![String::from("^group.*")], + value_filters: vec![String::from("^English$")], + limit: Some(10_000), + ..QueryRunOptions::default() + }; + let filters = compile_query_filters(&options).expect("failed to compile benchmark filters"); + + let started = Instant::now(); + let rendered = render_resources(&resources, QueryOutputFormat::Jsonl); + let render_elapsed = started.elapsed(); + + let started = Instant::now(); + let (counts, filtered) = filter_resources(resources.clone(), &filters, options.limit); + let filter_elapsed = started.elapsed(); + + assert!(!rendered.is_empty()); + assert_eq!(counts.output_count, filtered.len()); + + println!( + "query render/filter benchmark: input={} rendered_bytes={} filtered={} render={:?} filter={:?}", + resources.len(), + rendered.len(), + filtered.len(), + render_elapsed, + filter_elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_compile_query_filters_large_input() { + let options = QueryRunOptions { + key_filters: (0..5_000usize) + .map(|index| format!("^group{index}.*value{}$", index % 23)) + .collect(), + value_filters: (0..5_000usize) + .map(|index| format!("^lang{}.*English$", index % 19)) + .collect(), + ..QueryRunOptions::default() + }; + + let started = Instant::now(); + let filters = compile_query_filters(&options).expect("failed to compile benchmark filters"); + let compile_elapsed = started.elapsed(); + + let started = Instant::now(); + let key_sql = filters + .key_filter + .as_ref() + .expect("expected key filter") + .sql_condition("filtered.key"); + let value_sql = filters + .value_filter + .as_ref() + .expect("expected value filter") + .sql_condition("filtered.val"); + let sql_elapsed = started.elapsed(); + + assert!(!key_sql.is_empty()); + assert!(!value_sql.is_empty()); + + println!( + "query filter benchmark: key_patterns={} value_patterns={} compile={:?} sql={:?}", + options.key_filters.len(), + options.value_filters.len(), + compile_elapsed, + sql_elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_push_changed_values_duplicate_keys() { + let mut left = String::new(); + let mut right = String::new(); + + for key_index in 0..8_000usize { + for left_variant in 0..6usize { + left.push_str(&format!("key{key_index}=left{left_variant}\r\n")); + } + + right.push_str(&format!("key{key_index}=left0\r\n")); + right.push_str(&format!("key{key_index}=right{}\r\n", key_index % 7)); + } + + let mut engine = CirupEngine::with_backend(QueryBackendKind::TursoLocal); + engine.register_table_from_str("A", "left.restext", &left); + engine.register_table_from_str("B", "right.restext", &right); + + let query = r" + SELECT + B.key, B.val + FROM B + INNER JOIN A on (A.key = B.key) AND (A.val <> B.val)"; + + let started = Instant::now(); + let resources = engine.query_resource(query); + let elapsed = started.elapsed(); + + assert!(!resources.is_empty()); + + println!( + "push-changed-values benchmark: left_rows={} right_rows={} output_rows={} elapsed={:?}", + left.lines().count(), + right.lines().count(), + resources.len(), + elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_pull_left_join_duplicate_keys() { + let mut left = String::new(); + let mut right = String::new(); + + for key_index in 0..8_000usize { + for left_variant in 0..6usize { + left.push_str(&format!("key{key_index}=left{left_variant}\r\n")); + } + + right.push_str(&format!("key{key_index}=right{}\r\n", key_index % 7)); + right.push_str(&format!("key{key_index}=right{}\r\n", (key_index + 1) % 7)); + } + + let mut engine = CirupEngine::with_backend(QueryBackendKind::TursoLocal); + engine.register_table_from_str("A", "left.restext", &left); + engine.register_table_from_str("B", "right.restext", &right); + + let query = r" + SELECT + A.key, A.val + FROM A + LEFT OUTER JOIN B on A.key = B.key"; + + let started = Instant::now(); + let resources = engine.query_resource(query); + let elapsed = started.elapsed(); + + assert!(!resources.is_empty()); + + println!( + "pull-left-join benchmark: left_rows={} right_rows={} output_rows={} elapsed={:?}", + left.lines().count(), + right.lines().count(), + resources.len(), + elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_fast_query_dispatch_repeated() { + let mut engine = CirupEngine::with_backend(QueryBackendKind::TursoLocal); + engine.register_table_from_str("A", "empty.restext", ""); + + let iterations = 200_000usize; + let started = Instant::now(); + + for _ in 0..iterations { + let resources = engine.query_resource(PRINT_QUERY); + assert!(resources.is_empty()); + } + + let elapsed = started.elapsed(); + println!( + "fast-query-dispatch benchmark: iterations={} elapsed={:?}", + iterations, elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_render_resource_table_repeated() { + let resources = vec![ + Resource::new("lblBoat", "I'm on a boat."), + Resource::new("lblYolo", "You only live once"), + Resource::new("lblDogs", "Who let the dogs out?"), + Resource::new("language.en", "English"), + Resource::new("language.fr", "French"), + Resource::new("very.deep.object", "value"), + ]; + + let started = Instant::now(); + let mut total_bytes = 0usize; + + for _ in 0..20_000 { + total_bytes += render_resources(&resources, QueryOutputFormat::Table).len(); + } + + let elapsed = started.elapsed(); + + println!( + "resource-table benchmark: iterations={} total_bytes={} elapsed={:?}", + 20_000, total_bytes, elapsed + ); +} + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_render_report_table_repeated() { + let report = QueryExecutionReport { + operation: Some(String::from("file-merge")), + result_kind: String::from("resource"), + input_files: vec![String::from("a.json"), String::from("b.json"), String::from("c.json")], + output_file: Some(String::from("out.json")), + matched_count: 50_000, + filtered_count: 37_500, + output_count: 10_000, + truncated: true, + dry_run: false, + check: true, + would_write: true, + wrote_output: false, + change_detected: true, + }; + + let iterations = 20_000usize; + let started = Instant::now(); + let mut total_bytes = 0usize; + + for _ in 0..iterations { + total_bytes += render_report(&report, QueryOutputFormat::Table).len(); + } + + let elapsed = started.elapsed(); + assert!(total_bytes > 0); + + println!( + "render-report benchmark: iterations={} total_bytes={} elapsed={:?}", + iterations, total_bytes, elapsed + ); +} + #[test] fn test_count_only_rejects_output_file() { let options = QueryRunOptions { @@ -1216,12 +1690,66 @@ fn test_report_renders_as_json_summary() { assert!(output.ends_with('\n')); } +#[test] +fn test_report_renders_as_table_summary() { + let report = QueryExecutionReport { + operation: Some(String::from("file-sort")), + result_kind: String::from("resource"), + input_files: vec![String::from("cirup_core/test/test.json")], + output_file: Some(String::from("cirup_core/test/test.json")), + matched_count: 6, + filtered_count: 6, + output_count: 6, + truncated: false, + dry_run: true, + check: false, + would_write: true, + wrote_output: false, + change_detected: true, + }; + + let output = render_report(&report, QueryOutputFormat::Table); + let expected = concat!( + "+-----------------+---------------------------+\n", + "| field | value |\n", + "+-----------------+---------------------------+\n", + "| operation | file-sort |\n", + "+-----------------+---------------------------+\n", + "| result_kind | resource |\n", + "+-----------------+---------------------------+\n", + "| input_files | cirup_core/test/test.json |\n", + "+-----------------+---------------------------+\n", + "| output_file | cirup_core/test/test.json |\n", + "+-----------------+---------------------------+\n", + "| matched_count | 6 |\n", + "+-----------------+---------------------------+\n", + "| filtered_count | 6 |\n", + "+-----------------+---------------------------+\n", + "| output_count | 6 |\n", + "+-----------------+---------------------------+\n", + "| truncated | false |\n", + "+-----------------+---------------------------+\n", + "| dry_run | true |\n", + "+-----------------+---------------------------+\n", + "| check | false |\n", + "+-----------------+---------------------------+\n", + "| would_write | true |\n", + "+-----------------+---------------------------+\n", + "| wrote_output | false |\n", + "+-----------------+---------------------------+\n", + "| change_detected | true |\n", + "+-----------------+---------------------------+\n", + ); + + assert_eq!(output, expected); +} + #[test] fn test_compile_text_pattern_supports_simple_regex_subset() { let compiled = compile_text_pattern("--key-filter", "^lbl.*Yolo$").expect("expected valid pattern"); - assert!(compiled.regex.is_match("lblMyYolo")); - assert!(!compiled.regex.is_match("prefix_lblMyYolo")); + assert!(compiled.matches("lblMyYolo")); + assert!(!compiled.matches("prefix_lblMyYolo")); assert_eq!(compiled.glob_pattern, "lbl*Yolo"); } @@ -1257,7 +1785,7 @@ fn test_wrap_resource_query_with_key_filter_uses_glob_condition() { let wrapped = wrap_resource_query_with_filters(PRINT_QUERY, &filters); assert!(wrapped.contains("filtered.key GLOB 'lbl*'")); - assert!(wrapped.starts_with("WITH filtered(key, val) AS (SELECT * FROM A)")); + assert!(wrapped.starts_with(&format!("WITH filtered(key, val) AS ({PRINT_QUERY})"))); } #[test] diff --git a/cirup_core/src/query_backend.rs b/cirup_core/src/query_backend.rs index 5b26f9b..303975a 100644 --- a/cirup_core/src/query_backend.rs +++ b/cirup_core/src/query_backend.rs @@ -11,7 +11,6 @@ use rusqlite::{Connection, Error as SqlError, Statement}; #[cfg(feature = "turso-rust")] use std::cell::RefCell; -#[cfg(feature = "turso-rust")] use std::collections::{HashMap, HashSet}; #[cfg(feature = "turso-rust")] @@ -54,33 +53,21 @@ fn load_resources(filename: &str) -> Vec { } } -#[cfg(feature = "turso-rust")] const TURSO_INSERT_CHUNK_SIZE: usize = 2000; -#[cfg(feature = "turso-rust")] const QUERY_SELECT_A: &str = "select * from a"; -#[cfg(feature = "turso-rust")] const QUERY_SORT_A: &str = "select * from a order by a.key"; -#[cfg(feature = "turso-rust")] const QUERY_DIFF: &str = "select a.key, a.val, b.val from a left outer join b on a.key = b.key where (b.val is null)"; -#[cfg(feature = "turso-rust")] const QUERY_DIFF_WITH_BASE: &str = "select b.key, b.val, c.val from b left outer join a on b.key = a.key inner join c on b.key = c.key where (a.val is null)"; -#[cfg(feature = "turso-rust")] const QUERY_CHANGE: &str = "select a.key, a.val, b.val from a left outer join b on a.key = b.key where (b.val is null) or (a.val <> b.val)"; -#[cfg(feature = "turso-rust")] const QUERY_MERGE: &str = "select a.key, case when b.val is not null then b.val else a.val end from a left outer join b on a.key = b.key union select b.key, b.val from b left outer join a on a.key = b.key where (a.key is null)"; -#[cfg(feature = "turso-rust")] const QUERY_INTERSECT: &str = "select * from a intersect select * from b"; -#[cfg(feature = "turso-rust")] const QUERY_SUBTRACT: &str = "select * from a where a.key not in (select b.key from b)"; -#[cfg(feature = "turso-rust")] const QUERY_PULL_LEFT_JOIN: &str = "select a.key, a.val from a left outer join b on a.key = b.key"; -#[cfg(feature = "turso-rust")] const QUERY_PUSH_CHANGED_VALUES: &str = "select b.key, b.val from b inner join a on (a.key = b.key) and (a.val <> b.val)"; -#[cfg(feature = "turso-rust")] fn append_sql_quoted(out: &mut String, value: &str) { out.push('\''); for ch in value.chars() { @@ -94,7 +81,6 @@ fn append_sql_quoted(out: &mut String, value: &str) { out.push('\''); } -#[cfg(feature = "turso-rust")] fn build_multi_insert_sql(table: &str, resources: &[Resource], out: &mut String) { out.clear(); out.push_str("INSERT INTO "); @@ -116,18 +102,281 @@ fn build_multi_insert_sql(table: &str, resources: &[Resource], out: &mut String) out.push(';'); } -#[cfg(feature = "turso-rust")] fn build_key_index_sql(table: &str) -> String { format!("CREATE INDEX IF NOT EXISTS idx_{table}_key ON {table} (key);") } -#[cfg(feature = "turso-rust")] fn canonical_sql(input: &str) -> String { - input - .split_whitespace() - .collect::>() - .join(" ") - .to_ascii_lowercase() + let mut output = String::with_capacity(input.len()); + let mut saw_token = false; + + for token in input.split_whitespace() { + if saw_token { + output.push(' '); + } + + for ch in token.chars() { + output.push(ch.to_ascii_lowercase()); + } + + saw_token = true; + } + + output +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum FastResourceQueryKind { + SelectA, + SortA, + Diff, + Change, + Merge, + Intersect, + Subtract, + PullLeftJoin, + PushChangedValues, +} + +fn classify_fast_resource_query(query: &str) -> Option { + match query { + QUERY_SELECT_A => Some(FastResourceQueryKind::SelectA), + QUERY_SORT_A => Some(FastResourceQueryKind::SortA), + QUERY_DIFF => Some(FastResourceQueryKind::Diff), + QUERY_CHANGE => Some(FastResourceQueryKind::Change), + QUERY_MERGE => Some(FastResourceQueryKind::Merge), + QUERY_INTERSECT => Some(FastResourceQueryKind::Intersect), + QUERY_SUBTRACT => Some(FastResourceQueryKind::Subtract), + QUERY_PULL_LEFT_JOIN => Some(FastResourceQueryKind::PullLeftJoin), + QUERY_PUSH_CHANGED_VALUES => Some(FastResourceQueryKind::PushChangedValues), + _ => { + let canonical = canonical_sql(query); + match canonical.as_str() { + QUERY_SELECT_A => Some(FastResourceQueryKind::SelectA), + QUERY_SORT_A => Some(FastResourceQueryKind::SortA), + QUERY_DIFF => Some(FastResourceQueryKind::Diff), + QUERY_CHANGE => Some(FastResourceQueryKind::Change), + QUERY_MERGE => Some(FastResourceQueryKind::Merge), + QUERY_INTERSECT => Some(FastResourceQueryKind::Intersect), + QUERY_SUBTRACT => Some(FastResourceQueryKind::Subtract), + QUERY_PULL_LEFT_JOIN => Some(FastResourceQueryKind::PullLeftJoin), + QUERY_PUSH_CHANGED_VALUES => Some(FastResourceQueryKind::PushChangedValues), + _ => None, + } + } + } +} + +fn is_fast_triple_diff_with_base_query(query: &str) -> bool { + if query == QUERY_DIFF_WITH_BASE { + return true; + } + + canonical_sql(query) == QUERY_DIFF_WITH_BASE +} + +fn query_resource_fast_from_tables(tables: &HashMap>, query: &str) -> Option> { + let query_kind = classify_fast_resource_query(query)?; + + let table_a = tables.get("A"); + let table_b = tables.get("B"); + + if query_kind == FastResourceQueryKind::SelectA { + return table_a.cloned(); + } + + if query_kind == FastResourceQueryKind::SortA { + let mut resources = table_a?.clone(); + resources.sort_by(|left, right| left.name.cmp(&right.name)); + return Some(resources); + } + + if query_kind == FastResourceQueryKind::Diff { + let a = table_a?; + let b = table_b?; + let b_keys: HashSet<&str> = b.iter().map(|resource| resource.name.as_str()).collect(); + + let resources = a + .iter() + .filter(|resource| !b_keys.contains(resource.name.as_str())) + .cloned() + .collect::>(); + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::Change { + let a = table_a?; + let b = table_b?; + let b_values: HashMap<&str, &str> = b + .iter() + .map(|resource| (resource.name.as_str(), resource.value.as_str())) + .collect(); + + let resources = a + .iter() + .filter(|resource| { + let key = resource.name.as_str(); + let value = resource.value.as_str(); + match b_values.get(key) { + None => true, + Some(other) => *other != value, + } + }) + .cloned() + .collect::>(); + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::Merge { + let a = table_a?; + let b = table_b?; + + let a_values: HashMap<&str, &str> = a + .iter() + .map(|resource| (resource.name.as_str(), resource.value.as_str())) + .collect(); + let b_values: HashMap<&str, &str> = b + .iter() + .map(|resource| (resource.name.as_str(), resource.value.as_str())) + .collect(); + + let mut resources = Vec::with_capacity(a.len() + b.len()); + let mut dedupe: HashSet<(&str, &str)> = HashSet::with_capacity(a.len() + b.len()); + + for resource in a { + let key = resource.name.as_str(); + let merged_value = b_values.get(key).copied().unwrap_or(resource.value.as_str()); + if dedupe.insert((key, merged_value)) { + resources.push(Resource::new(key, merged_value)); + } + } + + for resource in b { + let key = resource.name.as_str(); + let value = resource.value.as_str(); + if !a_values.contains_key(key) && dedupe.insert((key, value)) { + resources.push(resource.clone()); + } + } + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::Intersect { + let a = table_a?; + let b = table_b?; + let b_pairs: HashSet<(&str, &str)> = b + .iter() + .map(|resource| (resource.name.as_str(), resource.value.as_str())) + .collect(); + + let mut resources = Vec::with_capacity(a.len().min(b.len())); + let mut dedupe: HashSet<(&str, &str)> = HashSet::with_capacity(a.len().min(b.len())); + + for resource in a { + let pair = (resource.name.as_str(), resource.value.as_str()); + if b_pairs.contains(&pair) && dedupe.insert(pair) { + resources.push(resource.clone()); + } + } + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::Subtract { + let a = table_a?; + let b = table_b?; + let b_keys: HashSet<&str> = b.iter().map(|resource| resource.name.as_str()).collect(); + + let resources = a + .iter() + .filter(|resource| !b_keys.contains(resource.name.as_str())) + .cloned() + .collect::>(); + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::PullLeftJoin { + let a = table_a?; + let b = table_b?; + + let mut b_match_count: HashMap<&str, usize> = HashMap::new(); + for resource in b { + *b_match_count.entry(resource.name.as_str()).or_insert(0) += 1; + } + + let mut resources = Vec::new(); + for resource in a { + let repeat = b_match_count.get(resource.name.as_str()).copied().unwrap_or(1); + for _ in 0..repeat { + resources.push(resource.clone()); + } + } + + return Some(resources); + } + + if query_kind == FastResourceQueryKind::PushChangedValues { + let a = table_a?; + let b = table_b?; + + let mut a_values: HashMap<&str, Vec<&str>> = HashMap::new(); + for resource in a { + a_values + .entry(resource.name.as_str()) + .or_default() + .push(resource.value.as_str()); + } + + let mut resources = Vec::new(); + for resource in b { + let key = resource.name.as_str(); + let Some(left_values) = a_values.get(key) else { + continue; + }; + + for left_value in left_values { + if *left_value != resource.value.as_str() { + resources.push(resource.clone()); + } + } + } + + return Some(resources); + } + + None +} + +fn query_triple_fast_from_tables(tables: &HashMap>, query: &str) -> Option> { + if !is_fast_triple_diff_with_base_query(query) { + return None; + } + + let a = tables.get("A")?; + let b = tables.get("B")?; + let c = tables.get("C")?; + + let a_keys: HashSet<&str> = a.iter().map(|resource| resource.name.as_str()).collect(); + let c_values: HashMap<&str, &str> = c + .iter() + .map(|resource| (resource.name.as_str(), resource.value.as_str())) + .collect(); + + let mut triples = Vec::new(); + for resource in b { + let key = resource.name.as_str(); + if !a_keys.contains(key) + && let Some(base) = c_values.get(key) + { + triples.push(Triple::new(key, resource.value.as_str(), base)); + } + } + + Some(triples) } #[cfg(feature = "turso-rust")] @@ -164,9 +413,9 @@ fn query_resource_from_statement(statement: &mut Statement<'_>) -> Vec while let Some(v) = response.next() { if let Ok(res) = v { - let name = &res.get::(0); - let value = &res.get::(1); - let resource = Resource::new(name, value); + let name: String = res.get(0); + let value: String = res.get(1); + let resource = Resource::from_owned(name, value); resources.push(resource); } } @@ -187,10 +436,10 @@ fn query_triple_from_statement(statement: &mut Statement<'_>) -> Vec { while let Some(v) = response.next() { if let Ok(res) = v { - let name = &res.get::(0); - let value = &res.get::(1); - let base = &res.get::(2); - let resource = Triple::new(name, value, base); + let name: String = res.get(0); + let value: String = res.get(1); + let base: String = res.get(2); + let resource = Triple::from_owned(name, value, base); resources.push(resource); } } @@ -201,16 +450,20 @@ fn query_triple_from_statement(statement: &mut Statement<'_>) -> Vec { #[cfg(feature = "rusqlite-c")] pub(crate) struct RusqliteBackend { db: Connection, + tables: HashMap>, } #[cfg(feature = "rusqlite-c")] impl RusqliteBackend { pub(crate) fn new() -> Self { let db = Connection::open_in_memory().expect("failed to open in-memory database"); - Self { db } + Self { + db, + tables: HashMap::new(), + } } - fn register_table_with_resources(&mut self, table: &str, resources: &[Resource]) { + fn register_table_with_resources(&mut self, table: &str, resources: Vec) { if !valid_table_name(table) { error!("invalid table name {}", table); return; @@ -242,7 +495,7 @@ impl RusqliteBackend { } }; - for resource in resources { + for resource in &resources { if let Err(e) = statement.execute(&[&resource.name, &resource.value]) { error!("failed to insert resource into {}: {}", table, e); return; @@ -250,6 +503,8 @@ impl RusqliteBackend { } } + self.tables.insert(table.to_owned(), resources); + if let Err(e) = tx.commit() { error!("failed to commit transaction for {}: {}", table, e); } @@ -266,15 +521,19 @@ impl QueryBackend for RusqliteBackend { fn register_table_from_str(&mut self, table: &str, filename: &str, data: &str) { vfile_set(filename, data); let resources = load_resources(filename); - self.register_table_with_resources(table, &resources); + self.register_table_with_resources(table, resources); } fn register_table_from_file(&mut self, table: &str, filename: &str) { let resources = load_resources(filename); - self.register_table_with_resources(table, &resources); + self.register_table_with_resources(table, resources); } fn query_resource(&self, query: &str) -> Vec { + if let Some(resources) = query_resource_fast_from_tables(&self.tables, query) { + return resources; + } + let mut statement = match self.prepare_statement(query) { Ok(statement) => statement, Err(e) => { @@ -287,6 +546,10 @@ impl QueryBackend for RusqliteBackend { } fn query_triple(&self, query: &str) -> Vec { + if let Some(triples) = query_triple_fast_from_tables(&self.tables, query) { + return triples; + } + let mut statement = match self.prepare_statement(query) { Ok(statement) => statement, Err(e) => { @@ -401,208 +664,11 @@ impl TursoLocalBackend { } fn query_resource_fast(&self, query: &str) -> Option> { - let query = canonical_sql(query); - - let table_a = self.tables.get("A"); - let table_b = self.tables.get("B"); - - if query == QUERY_SELECT_A { - return table_a.cloned(); - } - - if query == QUERY_SORT_A { - let mut resources = table_a?.clone(); - resources.sort_by(|left, right| left.name.cmp(&right.name)); - return Some(resources); - } - - if query == QUERY_DIFF { - let a = table_a?; - let b = table_b?; - let b_keys: HashSet<&str> = b.iter().map(|resource| resource.name.as_str()).collect(); - - let resources = a - .iter() - .filter(|resource| !b_keys.contains(resource.name.as_str())) - .cloned() - .collect::>(); - - return Some(resources); - } - - if query == QUERY_CHANGE { - let a = table_a?; - let b = table_b?; - let b_values: HashMap<&str, &str> = b - .iter() - .map(|resource| (resource.name.as_str(), resource.value.as_str())) - .collect(); - - let resources = a - .iter() - .filter(|resource| { - let key = resource.name.as_str(); - let value = resource.value.as_str(); - match b_values.get(key) { - None => true, - Some(other) => *other != value, - } - }) - .cloned() - .collect::>(); - - return Some(resources); - } - - if query == QUERY_MERGE { - let a = table_a?; - let b = table_b?; - - let a_values: HashMap<&str, &str> = a - .iter() - .map(|resource| (resource.name.as_str(), resource.value.as_str())) - .collect(); - let b_values: HashMap<&str, &str> = b - .iter() - .map(|resource| (resource.name.as_str(), resource.value.as_str())) - .collect(); - - let mut resources = Vec::with_capacity(a.len() + b.len()); - let mut dedupe: HashSet<(String, String)> = HashSet::with_capacity(a.len() + b.len()); - - for resource in a { - let key = resource.name.as_str(); - let merged_value = b_values.get(key).copied().unwrap_or(resource.value.as_str()); - let merged = Resource::new(key, merged_value); - if dedupe.insert((merged.name.clone(), merged.value.clone())) { - resources.push(merged); - } - } - - for resource in b { - let key = resource.name.as_str(); - if !a_values.contains_key(key) && dedupe.insert((resource.name.clone(), resource.value.clone())) { - resources.push(resource.clone()); - } - } - - return Some(resources); - } - - if query == QUERY_INTERSECT { - let a = table_a?; - let b = table_b?; - let b_pairs: HashSet<(&str, &str)> = b - .iter() - .map(|resource| (resource.name.as_str(), resource.value.as_str())) - .collect(); - - let mut resources = Vec::new(); - let mut dedupe: HashSet<(String, String)> = HashSet::new(); - - for resource in a { - let pair = (resource.name.as_str(), resource.value.as_str()); - if b_pairs.contains(&pair) && dedupe.insert((resource.name.clone(), resource.value.clone())) { - resources.push(resource.clone()); - } - } - - return Some(resources); - } - - if query == QUERY_SUBTRACT { - let a = table_a?; - let b = table_b?; - let b_keys: HashSet<&str> = b.iter().map(|resource| resource.name.as_str()).collect(); - - let resources = a - .iter() - .filter(|resource| !b_keys.contains(resource.name.as_str())) - .cloned() - .collect::>(); - - return Some(resources); - } - - if query == QUERY_PULL_LEFT_JOIN { - let a = table_a?; - let b = table_b?; - - let mut b_match_count: HashMap<&str, usize> = HashMap::new(); - for resource in b { - *b_match_count.entry(resource.name.as_str()).or_insert(0) += 1; - } - - let mut resources = Vec::new(); - for resource in a { - let repeat = b_match_count.get(resource.name.as_str()).copied().unwrap_or(1); - for _ in 0..repeat { - resources.push(resource.clone()); - } - } - - return Some(resources); - } - - if query == QUERY_PUSH_CHANGED_VALUES { - let a = table_a?; - let b = table_b?; - - let mut a_values: HashMap<&str, Vec<&str>> = HashMap::new(); - for resource in a { - a_values - .entry(resource.name.as_str()) - .or_default() - .push(resource.value.as_str()); - } - - let mut resources = Vec::new(); - for resource in b { - let key = resource.name.as_str(); - let Some(left_values) = a_values.get(key) else { - continue; - }; - - for left_value in left_values { - if *left_value != resource.value.as_str() { - resources.push(resource.clone()); - } - } - } - - return Some(resources); - } - - None + query_resource_fast_from_tables(&self.tables, query) } fn query_triple_fast(&self, query: &str) -> Option> { - let query = canonical_sql(query); - if query != QUERY_DIFF_WITH_BASE { - return None; - } - - let a = self.tables.get("A")?; - let b = self.tables.get("B")?; - let c = self.tables.get("C")?; - - let a_keys: HashSet<&str> = a.iter().map(|resource| resource.name.as_str()).collect(); - let c_values: HashMap<&str, &str> = c - .iter() - .map(|resource| (resource.name.as_str(), resource.value.as_str())) - .collect(); - - let mut triples = Vec::new(); - for resource in b { - let key = resource.name.as_str(); - if !a_keys.contains(key) - && let Some(base) = c_values.get(key) - { - triples.push(Triple::new(key, resource.value.as_str(), base)); - } - } - - Some(triples) + query_triple_fast_from_tables(&self.tables, query) } } @@ -843,7 +909,7 @@ pub(crate) fn build_backend(query_config: &QueryConfig) -> Box QueryBackendKind::Rusqlite => { #[cfg(feature = "rusqlite-c")] { - return Box::new(RusqliteBackend::new()); + Box::new(RusqliteBackend::new()) } #[cfg(not(feature = "rusqlite-c"))] diff --git a/cirup_core/src/resource.rs b/cirup_core/src/resource.rs index 0295583..c1e5eae 100644 --- a/cirup_core/src/resource.rs +++ b/cirup_core/src/resource.rs @@ -33,4 +33,9 @@ impl Resource { value: value.to_owned(), } } + + #[cfg(feature = "rusqlite-c")] + pub(crate) fn from_owned(name: String, value: String) -> Self { + Resource { name, value } + } } diff --git a/cirup_core/src/restext.rs b/cirup_core/src/restext.rs index 611ebcb..040cc66 100644 --- a/cirup_core/src/restext.rs +++ b/cirup_core/src/restext.rs @@ -1,5 +1,5 @@ -use regex::Regex; -use std::fmt; +#[cfg(test)] +use std::time::Instant; use crate::Resource; use crate::file::FileFormat; @@ -12,17 +12,11 @@ use std::error::Error; * https://docs.microsoft.com/en-us/dotnet/framework/resources/creating-resource-files-for-desktop-apps */ -lazy_static! { - static ref REGEX_RESTEXT: Regex = - Regex::new(r"^\s*(\w+)=(.*)$").unwrap_or_else(|e| panic!("invalid restext regex: {}", e)); -} - pub(crate) struct RestextFileFormat {} /* https://lise-henry.github.io/articles/optimising_strings.html */ -pub(crate) fn escape_newlines(input: &str) -> String { - let mut output = String::new(); +fn push_escaped_newlines(output: &mut String, input: &str) { for c in input.chars() { match c { '\\' => output.push_str("\\\\"), @@ -31,6 +25,23 @@ pub(crate) fn escape_newlines(input: &str) -> String { _ => output.push(c), } } +} + +fn parse_restext_line(line: &str) -> Option<(&str, &str)> { + let (name_part, value) = line.split_once('=')?; + let name = name_part.trim_start_matches(char::is_whitespace); + + if name.is_empty() || !name.chars().all(|ch| ch == '_' || ch.is_alphanumeric()) { + return None; + } + + Some((name, value)) +} + +#[cfg(test)] +pub(crate) fn escape_newlines(input: &str) -> String { + let mut output = String::with_capacity(input.len()); + push_escaped_newlines(&mut output, input); output } @@ -42,13 +53,7 @@ impl FileFormat for RestextFileFormat { let text = text.strip_prefix('\u{feff}').unwrap_or(text); for line in text.lines() { - if REGEX_RESTEXT.is_match(line) { - let captures = match REGEX_RESTEXT.captures(line) { - Some(captures) => captures, - None => continue, - }; - let name = &captures[1]; - let value = &captures[2]; + if let Some((name, value)) = parse_restext_line(line) { let resource = Resource::new(name, value); resources.push(resource); } @@ -63,13 +68,17 @@ impl FileFormat for RestextFileFormat { } fn write_to_str(&self, resources: &[Resource]) -> String { - let mut output = String::new(); + let estimated_len = resources + .iter() + .map(|resource| resource.name.len() + resource.value.len() + 3) + .sum::(); + let mut output = String::with_capacity(estimated_len); for resource in resources { - let escaped_value = escape_newlines(resource.value.as_str()); - if fmt::write(&mut output, format_args!("{}={}\r\n", resource.name, escaped_value)).is_err() { - break; - } + output.push_str(&resource.name); + output.push('='); + push_escaped_newlines(&mut output, resource.value.as_str()); + output.push_str("\r\n"); } output @@ -142,3 +151,38 @@ fn test_escape_newlines() { let escaped = escape_newlines(text); assert_eq!(escaped, "line1\\\\line2\\r\\nline3"); } + +#[test] +#[ignore = "benchmark: run manually with --ignored --nocapture"] +#[allow(clippy::print_stdout)] +fn benchmark_restext_parse_and_write_large_input() { + let file_format = RestextFileFormat {}; + let base = include_str!("../test/test.restext"); + let repetitions = 20_000usize; + let mut text = String::with_capacity(base.len() * repetitions); + + for _ in 0..repetitions { + text.push_str(base); + } + + let started = Instant::now(); + let resources = file_format + .parse_from_str(&text) + .unwrap_or_else(|e| panic!("restext parse benchmark failed: {}", e)); + let parse_elapsed = started.elapsed(); + + let started = Instant::now(); + let written = file_format.write_to_str(&resources); + let write_elapsed = started.elapsed(); + + assert_eq!(resources.len(), 3 * repetitions); + assert!(!written.is_empty()); + + println!( + "restext benchmark: lines={} bytes={} parse={:?} write={:?}", + resources.len(), + text.len(), + parse_elapsed, + write_elapsed + ); +} diff --git a/cirup_core/src/resx.rs b/cirup_core/src/resx.rs index 05867a1..dc549b7 100644 --- a/cirup_core/src/resx.rs +++ b/cirup_core/src/resx.rs @@ -1,5 +1,5 @@ extern crate treexml; -use treexml::{Document, Element}; +use treexml::Document; use crate::Resource; use crate::file::FileFormat; @@ -16,12 +16,28 @@ fn without_bom(text: &str) -> &[u8] { text.as_bytes() } -fn escape_xml_text(value: &str) -> String { - value.replace('&', "&").replace('<', "<").replace('>', ">") +fn push_escaped_xml_text(output: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '&' => output.push_str("&"), + '<' => output.push_str("<"), + '>' => output.push_str(">"), + _ => output.push(ch), + } + } } -fn escape_xml_attr(value: &str) -> String { - escape_xml_text(value).replace('"', """).replace('\'', "'") +fn push_escaped_xml_attr(output: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '&' => output.push_str("&"), + '<' => output.push_str("<"), + '>' => output.push_str(">"), + '"' => output.push_str("""), + '\'' => output.push_str("'"), + _ => output.push(ch), + } + } } impl FileFormat for ResxFileFormat { @@ -35,14 +51,12 @@ impl FileFormat for ResxFileFormat { let doc = Document::parse(bytes).map_err(|e| format!("resx parse error: {:?}", e))?; let root = doc.root.ok_or("resx root not found")?; - let children: Vec<&Element> = root.filter_children(|t| t.name == "data").collect(); - - for data in children { + for data in root.filter_children(|t| t.name == "data") { if let Some(data_name) = data.attributes.get("name") && let Some(value) = data.find_child(|tag| tag.name == "value") { - let data_value = value.text.clone().unwrap_or_default(); - let resource = Resource::new(data_name, data_value.as_ref()); + let data_value = value.text.as_deref().unwrap_or_default(); + let resource = Resource::new(data_name, data_value); resources.push(resource); } } @@ -57,13 +71,18 @@ impl FileFormat for ResxFileFormat { } fn write_to_str(&self, resources: &[Resource]) -> String { - let mut output = String::from("\n"); + let estimated_body_len = resources + .iter() + .map(|resource| resource.name.len() + resource.value.len() + 64) + .sum::(); + let mut output = String::with_capacity(48 + estimated_body_len); + output.push_str("\n"); for resource in resources { output.push_str("\n \n "); - output.push_str(&escape_xml_text(resource.value.as_str())); + push_escaped_xml_text(&mut output, resource.value.as_str()); output.push_str("\n "); } diff --git a/cirup_core/src/triple.rs b/cirup_core/src/triple.rs index 67a0d76..ce6a3fc 100644 --- a/cirup_core/src/triple.rs +++ b/cirup_core/src/triple.rs @@ -35,4 +35,9 @@ impl Triple { base: base.to_owned(), } } + + #[cfg(feature = "rusqlite-c")] + pub(crate) fn from_owned(name: String, value: String, base: String) -> Self { + Triple { name, value, base } + } }