From 8881bc561bdd7ca748d2e6b197416f4f037ae5a6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 24 Apr 2026 11:46:50 +0000
Subject: [PATCH 1/4] fix: iOS TCP connection failures (issue #79) - retry
logic, spawn_blocking, UIBackgroundModes
Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/272a2117-aebc-4a10-a2ab-0348f3040921
Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com>
---
cake-core/src/cake/sharding/mod.rs | 46 ++++++++++++++++++++++--
cake-mobile-app/iosApp/iosApp/Info.plist | 5 +++
cake-mobile/src/lib.rs | 37 +++++++++++++++----
3 files changed, 79 insertions(+), 9 deletions(-)
diff --git a/cake-core/src/cake/sharding/mod.rs b/cake-core/src/cake/sharding/mod.rs
index f761e30e..7b90f01a 100644
--- a/cake-core/src/cake/sharding/mod.rs
+++ b/cake-core/src/cake/sharding/mod.rs
@@ -382,9 +382,49 @@ pub async fn master_setup(
&worker.host
);
- let mut stream = TcpStream::connect(&worker.host)
- .await
- .map_err(|e| anyhow!("can't connect to {}: {}", &worker.host, e))?;
+ // Retry up to 3 times with exponential backoff (1 s, 2 s, 4 s).
+ // iOS workers may need a brief moment for the TCP listener to become
+ // fully reachable after the UDP discovery advertisement is sent.
+ const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
+ const MAX_ATTEMPTS: u32 = 3;
+ let mut last_err = anyhow!("no attempt made");
+ let mut stream = None;
+ for attempt in 0..MAX_ATTEMPTS {
+ if attempt > 0 {
+ let delay = Duration::from_secs(1u64 << (attempt - 1));
+ log::warn!(
+ "retrying connection to '{}' at {} in {:.1}s (attempt {}/{}) ...",
+ &worker.name,
+ &worker.host,
+ delay.as_secs_f32(),
+ attempt + 1,
+ MAX_ATTEMPTS,
+ );
+ tokio::time::sleep(delay).await;
+ }
+ match tokio::time::timeout(CONNECT_TIMEOUT, TcpStream::connect(&worker.host)).await {
+ Ok(Ok(s)) => {
+ stream = Some(s);
+ break;
+ }
+ Ok(Err(e)) => {
+ log::warn!(
+ "connect attempt {}/{} to '{}' at {} failed: {}",
+ attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, e
+ );
+ last_err = anyhow!("can't connect to {}: {}", &worker.host, e);
+ }
+ Err(_) => {
+ log::warn!(
+ "connect attempt {}/{} to '{}' at {} timed out after {:.0}s",
+ attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host,
+ CONNECT_TIMEOUT.as_secs_f32(),
+ );
+ last_err = anyhow!("can't connect to {}: connection timed out", &worker.host);
+ }
+ }
+ }
+ let mut stream = stream.ok_or(last_err)?;
let _ = stream.set_nodelay(true);
// Mutual authentication
diff --git a/cake-mobile-app/iosApp/iosApp/Info.plist b/cake-mobile-app/iosApp/iosApp/Info.plist
index 36f949e9..ea6b3bf4 100644
--- a/cake-mobile-app/iosApp/iosApp/Info.plist
+++ b/cake-mobile-app/iosApp/iosApp/Info.plist
@@ -25,6 +25,11 @@
NSBonjourServices
_cake._udp
+ _cake._tcp
+
+ UIBackgroundModes
+
+ voip
CADisableMinimumFrameDurationOnPhone
diff --git a/cake-mobile/src/lib.rs b/cake-mobile/src/lib.rs
index 5a33b646..23a60d65 100644
--- a/cake-mobile/src/lib.rs
+++ b/cake-mobile/src/lib.rs
@@ -391,15 +391,32 @@ async fn run_zero_config_worker(
update_status("loading", "Loading model weights...", 0.0);
log_mobile(&format!("[cake-mobile] creating Context::from_args (cpu={})...", force_cpu));
+ // Install a panic hook so that any panic during model loading is logged to
+ // the mobile log (visible in the app's diagnostic output).
let prev_hook = std::panic::take_hook();
std::panic::set_hook(Box::new(move |info| {
let msg = format!("[cake-mobile] PANIC: {}", info);
log_mobile(&msg);
}));
- let ctx_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
- Context::from_args(args)
- }));
+ // Context::from_args is CPU- and I/O-intensive (loads model weight files).
+ // Run it on a dedicated blocking thread to avoid starving the Tokio async
+ // runtime while the listener is still open and waiting for the master's
+ // inference reconnect.
+ let ctx_result = match tokio::task::spawn_blocking(move || {
+ std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| Context::from_args(args)))
+ })
+ .await
+ {
+ Ok(r) => r,
+ Err(join_err) => {
+ std::panic::set_hook(prev_hook);
+ let msg = format!("context loading task failed: {}", join_err);
+ log_mobile(&format!("[cake-mobile] ERROR: {}", msg));
+ update_status("error", &msg, 0.0);
+ return msg;
+ }
+ };
std::panic::set_hook(prev_hook);
@@ -456,15 +473,23 @@ async fn run_direct_worker(name: &str, model: &str, address: &str) -> String {
update_status("loading", "Downloading model...", 0.0);
log_mobile("[cake-mobile] creating context...");
- let mut ctx = match Context::from_args(args) {
- Ok(ctx) => {
+ // Context::from_args downloads / loads large model weight files. Run it
+ // on a dedicated blocking thread so the Tokio async runtime stays live.
+ let ctx_result = tokio::task::spawn_blocking(move || Context::from_args(args)).await;
+ let mut ctx = match ctx_result {
+ Ok(Ok(ctx)) => {
log_mobile(&format!("[cake-mobile] context created, device={:?}", ctx.device));
ctx
}
- Err(e) => {
+ Ok(Err(e)) => {
update_status("error", &format!("Failed: {}", e), 0.0);
return format!("context creation failed: {}", e);
}
+ Err(join_err) => {
+ let msg = format!("context loading task failed: {}", join_err);
+ update_status("error", &msg, 0.0);
+ return msg;
+ }
};
update_status("serving", "Ready — serving inference", 1.0);
From 6c8a4bcba104d0ba038a457d70167014af1e08dd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 24 Apr 2026 11:49:59 +0000
Subject: [PATCH 2/4] fix: address code review - Option-typed last_err,
document CONNECT_TIMEOUT value
Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/272a2117-aebc-4a10-a2ab-0348f3040921
Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com>
---
cake-core/src/cake/sharding/mod.rs | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/cake-core/src/cake/sharding/mod.rs b/cake-core/src/cake/sharding/mod.rs
index 7b90f01a..39e27f7e 100644
--- a/cake-core/src/cake/sharding/mod.rs
+++ b/cake-core/src/cake/sharding/mod.rs
@@ -385,9 +385,11 @@ pub async fn master_setup(
// Retry up to 3 times with exponential backoff (1 s, 2 s, 4 s).
// iOS workers may need a brief moment for the TCP listener to become
// fully reachable after the UDP discovery advertisement is sent.
+ // 10 s per attempt is enough for a LAN connection while still failing
+ // fast enough to give a useful error within ~30 s overall.
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
const MAX_ATTEMPTS: u32 = 3;
- let mut last_err = anyhow!("no attempt made");
+ let mut last_err: Option = None;
let mut stream = None;
for attempt in 0..MAX_ATTEMPTS {
if attempt > 0 {
@@ -412,7 +414,7 @@ pub async fn master_setup(
"connect attempt {}/{} to '{}' at {} failed: {}",
attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, e
);
- last_err = anyhow!("can't connect to {}: {}", &worker.host, e);
+ last_err = Some(anyhow!("can't connect to {}: {}", &worker.host, e));
}
Err(_) => {
log::warn!(
@@ -420,11 +422,13 @@ pub async fn master_setup(
attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host,
CONNECT_TIMEOUT.as_secs_f32(),
);
- last_err = anyhow!("can't connect to {}: connection timed out", &worker.host);
+ last_err = Some(anyhow!("can't connect to {}: connection timed out", &worker.host));
}
}
}
- let mut stream = stream.ok_or(last_err)?;
+ let mut stream = stream.ok_or_else(|| {
+ last_err.unwrap_or_else(|| anyhow!("can't connect to {}: all attempts failed", &worker.host))
+ })?;
let _ = stream.set_nodelay(true);
// Mutual authentication
From e58cb3ae2e87782893adcf55df25c932b3fa3b0e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 24 Apr 2026 14:02:07 +0000
Subject: [PATCH 3/4] fix: resolve pre-existing CI failures - madvise on
Android, missing files field on Windows
Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/4cd3dc02-5334-464d-8e1d-e9a2135e8792
Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com>
---
cake-core/src/utils/tensor_storage.rs | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/cake-core/src/utils/tensor_storage.rs b/cake-core/src/utils/tensor_storage.rs
index b4beee6a..a4c38ca6 100644
--- a/cake-core/src/utils/tensor_storage.rs
+++ b/cake-core/src/utils/tensor_storage.rs
@@ -190,8 +190,10 @@ impl MappedShard {
if ptr == libc::MAP_FAILED {
anyhow::bail!("mmap failed: {}", std::io::Error::last_os_error());
}
- // Hint: sequential access pattern — triggers aggressive readahead on NVMe
- unsafe { libc::posix_madvise(ptr, len, libc::POSIX_MADV_SEQUENTIAL); }
+ // Hint: sequential access pattern — triggers aggressive readahead on NVMe.
+ // Use madvise (a BSD-derived call, not a POSIX one) which is available on all Unix
+ // platforms including Android. posix_madvise is not defined in Android's libc.
+ unsafe { libc::madvise(ptr, len, libc::MADV_SEQUENTIAL); }
Ok(Self { mmap_ptr: ptr as *const u8, mmap_len: len })
}
@@ -238,9 +240,6 @@ pub struct SafetensorsStorage {
index: HashMap,
/// Memory-mapped shard files (indexed by shard_idx in TensorMeta).
shards: Vec,
- /// File handles for non-mmap fallback.
- #[cfg(not(unix))]
- files: Vec,
}
impl SafetensorsStorage {
From 0f1eebbc3491c4ae1ef375f31ed6dd0dd597c6c2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 24 Apr 2026 14:17:54 +0000
Subject: [PATCH 4/4] fix: replace posix_madvise/POSIX_MADV_WILLNEED in
disk_expert_provider.rs for Android compat
Agent-Logs-Url: https://github.com/evilsocket/cake/sessions/8c7587c9-58e7-4276-80dd-82a6b595cbef
Co-authored-by: evilsocket <86922+evilsocket@users.noreply.github.com>
---
cake-core/src/models/common/disk_expert_provider.rs | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/cake-core/src/models/common/disk_expert_provider.rs b/cake-core/src/models/common/disk_expert_provider.rs
index 95f58021..7d0a5120 100644
--- a/cake-core/src/models/common/disk_expert_provider.rs
+++ b/cake-core/src/models/common/disk_expert_provider.rs
@@ -511,10 +511,10 @@ impl ExpertProvider for DiskExpertProvider {
for name in [&names.gate_proj, &names.up_proj, &names.down_proj] {
if let Some((bytes, _, _)) = self.storage.tensor_bytes(name) {
unsafe {
- libc::posix_madvise(
- bytes.as_ptr() as *mut _,
+ libc::madvise(
+ bytes.as_ptr() as *mut libc::c_void,
bytes.len(),
- libc::POSIX_MADV_WILLNEED,
+ libc::MADV_WILLNEED,
);
}
}