Skip to content

Commit be522af

Browse files
authored
Merge pull request #84 from evilsocket/copilot/verify-debug-issue-79
fix: iOS worker TCP connection failures (issue #79)
2 parents 3870042 + 0f1eebb commit be522af

5 files changed

Lines changed: 90 additions & 17 deletions

File tree

cake-core/src/cake/sharding/mod.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,9 +382,53 @@ pub async fn master_setup(
382382
&worker.host
383383
);
384384

385-
let mut stream = TcpStream::connect(&worker.host)
386-
.await
387-
.map_err(|e| anyhow!("can't connect to {}: {}", &worker.host, e))?;
385+
// Try up to 3 times in total, with exponential backoff between attempts (1 s, 2 s).
386+
// iOS workers may need a brief moment for the TCP listener to become
387+
// fully reachable after the UDP discovery advertisement is sent.
388+
// 10 s per attempt is enough for a LAN connection while still failing
389+
// fast enough to give a useful error within ~30 s overall.
390+
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
391+
const MAX_ATTEMPTS: u32 = 3;
392+
let mut last_err: Option<anyhow::Error> = None;
393+
let mut stream = None;
394+
for attempt in 0..MAX_ATTEMPTS {
395+
if attempt > 0 {
396+
let delay = Duration::from_secs(1u64 << (attempt - 1));
397+
log::warn!(
398+
"retrying connection to '{}' at {} in {:.1}s (attempt {}/{}) ...",
399+
&worker.name,
400+
&worker.host,
401+
delay.as_secs_f32(),
402+
attempt + 1,
403+
MAX_ATTEMPTS,
404+
);
405+
tokio::time::sleep(delay).await;
406+
}
407+
match tokio::time::timeout(CONNECT_TIMEOUT, TcpStream::connect(&worker.host)).await {
408+
Ok(Ok(s)) => {
409+
stream = Some(s);
410+
break;
411+
}
412+
Ok(Err(e)) => {
413+
log::warn!(
414+
"connect attempt {}/{} to '{}' at {} failed: {}",
415+
attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host, e
416+
);
417+
last_err = Some(anyhow!("can't connect to {}: {}", &worker.host, e));
418+
}
419+
Err(_) => {
420+
log::warn!(
421+
"connect attempt {}/{} to '{}' at {} timed out after {:.0}s",
422+
attempt + 1, MAX_ATTEMPTS, &worker.name, &worker.host,
423+
CONNECT_TIMEOUT.as_secs_f32(),
424+
);
425+
last_err = Some(anyhow!("can't connect to {}: connection timed out", &worker.host));
426+
}
427+
}
428+
}
429+
let mut stream = stream.ok_or_else(|| {
430+
last_err.unwrap_or_else(|| anyhow!("can't connect to {}: all attempts failed", &worker.host))
431+
})?;
388432
let _ = stream.set_nodelay(true);
389433

390434
// Mutual authentication

cake-core/src/models/common/disk_expert_provider.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,10 @@ impl ExpertProvider for DiskExpertProvider {
511511
for name in [&names.gate_proj, &names.up_proj, &names.down_proj] {
512512
if let Some((bytes, _, _)) = self.storage.tensor_bytes(name) {
513513
unsafe {
514-
libc::posix_madvise(
515-
bytes.as_ptr() as *mut _,
514+
libc::madvise(
515+
bytes.as_ptr() as *mut libc::c_void,
516516
bytes.len(),
517-
libc::POSIX_MADV_WILLNEED,
517+
libc::MADV_WILLNEED,
518518
);
519519
}
520520
}

cake-core/src/utils/tensor_storage.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,10 @@ impl MappedShard {
190190
if ptr == libc::MAP_FAILED {
191191
anyhow::bail!("mmap failed: {}", std::io::Error::last_os_error());
192192
}
193-
// Hint: sequential access pattern — triggers aggressive readahead on NVMe
194-
unsafe { libc::posix_madvise(ptr, len, libc::POSIX_MADV_SEQUENTIAL); }
193+
// Hint: sequential access pattern — triggers aggressive readahead on NVMe.
194+
// Use madvise (a BSD/Linux call, not part of POSIX) which is available on all Unix platforms
195+
// including Android. posix_madvise is not defined in Android's libc.
196+
unsafe { libc::madvise(ptr, len, libc::MADV_SEQUENTIAL); }
195197
Ok(Self { mmap_ptr: ptr as *const u8, mmap_len: len })
196198
}
197199

@@ -238,9 +240,6 @@ pub struct SafetensorsStorage {
238240
index: HashMap<String, TensorMeta>,
239241
/// Memory-mapped shard files (indexed by shard_idx in TensorMeta).
240242
shards: Vec<MappedShard>,
241-
/// File handles for non-mmap fallback.
242-
#[cfg(not(unix))]
243-
files: Vec<File>,
244243
}
245244

246245
impl SafetensorsStorage {

cake-mobile-app/iosApp/iosApp/Info.plist

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
<key>NSBonjourServices</key>
2626
<array>
2727
<string>_cake._udp</string>
28+
<string>_cake._tcp</string>
29+
</array>
30+
<key>UIBackgroundModes</key>
31+
<array>
32+
<string>voip</string>
2833
</array>
2934
<key>CADisableMinimumFrameDurationOnPhone</key>
3035
<true/>

cake-mobile/src/lib.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -391,15 +391,32 @@ async fn run_zero_config_worker(
391391
update_status("loading", "Loading model weights...", 0.0);
392392
log_mobile(&format!("[cake-mobile] creating Context::from_args (cpu={})...", force_cpu));
393393

394+
// Install a panic hook so that any panic during model loading is logged to
395+
// the mobile log (visible in the app's diagnostic output).
394396
let prev_hook = std::panic::take_hook();
395397
std::panic::set_hook(Box::new(move |info| {
396398
let msg = format!("[cake-mobile] PANIC: {}", info);
397399
log_mobile(&msg);
398400
}));
399401

400-
let ctx_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
401-
Context::from_args(args)
402-
}));
402+
// Context::from_args is CPU- and I/O-intensive (loads model weight files).
403+
// Run it on a dedicated blocking thread to avoid starving the Tokio async
404+
// runtime while the listener is still open and waiting for the master's
405+
// inference reconnect.
406+
let ctx_result = match tokio::task::spawn_blocking(move || {
407+
std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| Context::from_args(args)))
408+
})
409+
.await
410+
{
411+
Ok(r) => r,
412+
Err(join_err) => {
413+
std::panic::set_hook(prev_hook);
414+
let msg = format!("context loading task failed: {}", join_err);
415+
log_mobile(&format!("[cake-mobile] ERROR: {}", msg));
416+
update_status("error", &msg, 0.0);
417+
return msg;
418+
}
419+
};
403420

404421
std::panic::set_hook(prev_hook);
405422

@@ -456,15 +473,23 @@ async fn run_direct_worker(name: &str, model: &str, address: &str) -> String {
456473
update_status("loading", "Downloading model...", 0.0);
457474
log_mobile("[cake-mobile] creating context...");
458475

459-
let mut ctx = match Context::from_args(args) {
460-
Ok(ctx) => {
476+
// Context::from_args downloads / loads large model weight files. Run it
477+
// on a dedicated blocking thread so the Tokio async runtime stays live.
478+
let ctx_result = tokio::task::spawn_blocking(move || Context::from_args(args)).await;
479+
let mut ctx = match ctx_result {
480+
Ok(Ok(ctx)) => {
461481
log_mobile(&format!("[cake-mobile] context created, device={:?}", ctx.device));
462482
ctx
463483
}
464-
Err(e) => {
484+
Ok(Err(e)) => {
465485
update_status("error", &format!("Failed: {}", e), 0.0);
466486
return format!("context creation failed: {}", e);
467487
}
488+
Err(join_err) => {
489+
let msg = format!("context loading task failed: {}", join_err);
490+
update_status("error", &msg, 0.0);
491+
return msg;
492+
}
468493
};
469494

470495
update_status("serving", "Ready — serving inference", 1.0);

0 commit comments

Comments
 (0)