From 8881bc561bdd7ca748d2e6b197416f4f037ae5a6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:46:50 +0000 Subject: [PATCH 1/4] fix: iOS TCP connection failures (issue #79) - retry logic, spawn_blocking, UIBackgroundModes Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/272a2117-aebc-4a10-a2ab-0348f3040921 Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com> --- cake-core/src/cake/sharding/mod.rs | 46 ++++++++++++++++++++++-- cake-mobile-app/iosApp/iosApp/Info.plist | 5 +++ cake-mobile/src/lib.rs | 37 +++++++++++++++---- 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/cake-core/src/cake/sharding/mod.rs b/cake-core/src/cake/sharding/mod.rs index f761e30e..7b90f01a 100644 --- a/cake-core/src/cake/sharding/mod.rs +++ b/cake-core/src/cake/sharding/mod.rs @@ -382,9 +382,49 @@ pub async fn master_setup( &worker.host ); - let mut stream = TcpStream::connect(&worker.host) - .await - .map_err(|e| anyhow!("can't connect to {}: {}", &worker.host, e))?; + // Retry up to 3 times with exponential backoff (1 s, 2 s, 4 s). + // iOS workers may need a brief moment for the TCP listener to become + // fully reachable after the UDP discovery advertisement is sent. 
+ const CONNECT_TIMEOUT: Duration = Duration::from_secs(10); + const MAX_ATTEMPTS: u32 = 3; + let mut last_err = anyhow!("no attempt made"); + let mut stream = None; + for attempt in 0..MAX_ATTEMPTS { + if attempt > 0 { + let delay = Duration::from_secs(1u64 << (attempt - 1)); + log::warn!( + "retrying connection to '{}' at {} in {:.1}s (attempt {}/{}) ...", + &worker.name, + &worker.host, + delay.as_secs_f32(), + attempt + 1, + MAX_ATTEMPTS, + ); + tokio::time::sleep(delay).await; + } + match tokio::time::timeout(CONNECT_TIMEOUT, TcpStream::connect(&worker.host)).await { + Ok(Ok(s)) => { + stream = Some(s); + break; + } + Ok(Err(e)) => { + log::warn!( + "connect attempt {}/{} to '{}' at {} failed: {}", + attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, e + ); + last_err = anyhow!("can't connect to {}: {}", &worker.host, e); + } + Err(_) => { + log::warn!( + "connect attempt {}/{} to '{}' at {} timed out after {:.0}s", + attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, + CONNECT_TIMEOUT.as_secs_f32(), + ); + last_err = anyhow!("can't connect to {}: connection timed out", &worker.host); + } + } + } + let mut stream = stream.ok_or(last_err)?; let _ = stream.set_nodelay(true); // Mutual authentication diff --git a/cake-mobile-app/iosApp/iosApp/Info.plist b/cake-mobile-app/iosApp/iosApp/Info.plist index 36f949e9..ea6b3bf4 100644 --- a/cake-mobile-app/iosApp/iosApp/Info.plist +++ b/cake-mobile-app/iosApp/iosApp/Info.plist @@ -25,6 +25,11 @@ NSBonjourServices _cake._udp + _cake._tcp + + UIBackgroundModes + + voip CADisableMinimumFrameDurationOnPhone diff --git a/cake-mobile/src/lib.rs b/cake-mobile/src/lib.rs index 5a33b646..23a60d65 100644 --- a/cake-mobile/src/lib.rs +++ b/cake-mobile/src/lib.rs @@ -391,15 +391,32 @@ async fn run_zero_config_worker( update_status("loading", "Loading model weights...", 0.0); log_mobile(&format!("[cake-mobile] creating Context::from_args (cpu={})...", force_cpu)); + // Install a panic hook so that any panic during model 
loading is logged to + // the mobile log (visible in the app's diagnostic output). let prev_hook = std::panic::take_hook(); std::panic::set_hook(Box::new(move |info| { let msg = format!("[cake-mobile] PANIC: {}", info); log_mobile(&msg); })); - let ctx_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - Context::from_args(args) - })); + // Context::from_args is CPU- and I/O-intensive (loads model weight files). + // Run it on a dedicated blocking thread to avoid starving the Tokio async + // runtime while the listener is still open and waiting for the master's + // inference reconnect. + let ctx_result = match tokio::task::spawn_blocking(move || { + std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| Context::from_args(args))) + }) + .await + { + Ok(r) => r, + Err(join_err) => { + std::panic::set_hook(prev_hook); + let msg = format!("context loading task failed: {}", join_err); + log_mobile(&format!("[cake-mobile] ERROR: {}", msg)); + update_status("error", &msg, 0.0); + return msg; + } + }; std::panic::set_hook(prev_hook); @@ -456,15 +473,23 @@ async fn run_direct_worker(name: &str, model: &str, address: &str) -> String { update_status("loading", "Downloading model...", 0.0); log_mobile("[cake-mobile] creating context..."); - let mut ctx = match Context::from_args(args) { - Ok(ctx) => { + // Context::from_args downloads / loads large model weight files. Run it + // on a dedicated blocking thread so the Tokio async runtime stays live. 
+ let ctx_result = tokio::task::spawn_blocking(move || Context::from_args(args)).await; + let mut ctx = match ctx_result { + Ok(Ok(ctx)) => { log_mobile(&format!("[cake-mobile] context created, device={:?}", ctx.device)); ctx } - Err(e) => { + Ok(Err(e)) => { update_status("error", &format!("Failed: {}", e), 0.0); return format!("context creation failed: {}", e); } + Err(join_err) => { + let msg = format!("context loading task failed: {}", join_err); + update_status("error", &msg, 0.0); + return msg; + } }; update_status("serving", "Ready — serving inference", 1.0); From 6c8a4bcba104d0ba038a457d70167014af1e08dd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:49:59 +0000 Subject: [PATCH 2/4] fix: address code review - Option-typed last_err, document CONNECT_TIMEOUT value Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/272a2117-aebc-4a10-a2ab-0348f3040921 Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com> --- cake-core/src/cake/sharding/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cake-core/src/cake/sharding/mod.rs b/cake-core/src/cake/sharding/mod.rs index 7b90f01a..39e27f7e 100644 --- a/cake-core/src/cake/sharding/mod.rs +++ b/cake-core/src/cake/sharding/mod.rs @@ -385,9 +385,11 @@ pub async fn master_setup( // Retry up to 3 times with exponential backoff (1 s, 2 s, 4 s). // iOS workers may need a brief moment for the TCP listener to become // fully reachable after the UDP discovery advertisement is sent. + // 10 s per attempt is enough for a LAN connection while still failing + // fast enough to give a useful error within ~30 s overall. 
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10); const MAX_ATTEMPTS: u32 = 3; - let mut last_err = anyhow!("no attempt made"); + let mut last_err: Option<anyhow::Error> = None; let mut stream = None; for attempt in 0..MAX_ATTEMPTS { if attempt > 0 { @@ -412,7 +414,7 @@ pub async fn master_setup( "connect attempt {}/{} to '{}' at {} failed: {}", attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, e ); - last_err = anyhow!("can't connect to {}: {}", &worker.host, e); + last_err = Some(anyhow!("can't connect to {}: {}", &worker.host, e)); } Err(_) => { log::warn!( @@ -420,11 +422,13 @@ pub async fn master_setup( attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, CONNECT_TIMEOUT.as_secs_f32(), ); - last_err = anyhow!("can't connect to {}: connection timed out", &worker.host); + last_err = Some(anyhow!("can't connect to {}: connection timed out", &worker.host)); } } } - let mut stream = stream.ok_or(last_err)?; + let mut stream = stream.ok_or_else(|| { + last_err.unwrap_or_else(|| anyhow!("can't connect to {}: all attempts failed", &worker.host)) + })?; let _ = stream.set_nodelay(true); // Mutual authentication From e58cb3ae2e87782893adcf55df25c932b3fa3b0e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:02:07 +0000 Subject: [PATCH 3/4] fix: resolve pre-existing CI failures - madvise on Android, missing files field on Windows Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/4cd3dc02-5334-464d-8e1d-e9a2135e8792 Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com> --- cake-core/src/utils/tensor_storage.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cake-core/src/utils/tensor_storage.rs b/cake-core/src/utils/tensor_storage.rs index b4beee6a..a4c38ca6 100644 --- a/cake-core/src/utils/tensor_storage.rs +++ b/cake-core/src/utils/tensor_storage.rs @@ -190,8 +190,10 @@ impl MappedShard { if ptr == libc::MAP_FAILED { anyhow::bail!("mmap 
failed: {}", std::io::Error::last_os_error()); } - // Hint: sequential access pattern — triggers aggressive readahead on NVMe - unsafe { libc::posix_madvise(ptr, len, libc::POSIX_MADV_SEQUENTIAL); } + // Hint: sequential access pattern — triggers aggressive readahead on NVMe. + // Use madvise (a BSD/Linux call; POSIX only specifies posix_madvise), which is available on all Unix platforms + // including Android. posix_madvise is not defined in Android's libc. + unsafe { libc::madvise(ptr, len, libc::MADV_SEQUENTIAL); } Ok(Self { mmap_ptr: ptr as *const u8, mmap_len: len }) } @@ -238,9 +240,6 @@ pub struct SafetensorsStorage { index: HashMap, /// Memory-mapped shard files (indexed by shard_idx in TensorMeta). shards: Vec, - /// File handles for non-mmap fallback. - #[cfg(not(unix))] - files: Vec, } impl SafetensorsStorage { From 0f1eebbc3491c4ae1ef375f31ed6dd0dd597c6c2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:17:54 +0000 Subject: [PATCH 4/4] fix: replace posix_madvise/POSIX_MADV_WILLNEED in disk_expert_provider.rs for Android compat Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/8c7587c9-58e7-4276-80dd-82a6b595cbef Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com> --- cake-core/src/models/common/disk_expert_provider.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cake-core/src/models/common/disk_expert_provider.rs b/cake-core/src/models/common/disk_expert_provider.rs index 95f58021..7d0a5120 100644 --- a/cake-core/src/models/common/disk_expert_provider.rs +++ b/cake-core/src/models/common/disk_expert_provider.rs @@ -511,10 +511,10 @@ impl ExpertProvider for DiskExpertProvider { for name in [&names.gate_proj, &names.up_proj, &names.down_proj] { if let Some((bytes, _, _)) = self.storage.tensor_bytes(name) { unsafe { - libc::posix_madvise( - bytes.as_ptr() as *mut _, + libc::madvise( + bytes.as_ptr() as *mut libc::c_void, bytes.len(), - 
libc::POSIX_MADV_WILLNEED, + libc::MADV_WILLNEED, ); } }