// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! gRPC service implementation.
#![allow(clippy::ignored_unit_patterns)] // Tokio select! macro generates unit patterns
use crate::persistence::{
DraftChunkRecord, ObjectId, ObjectName, ObjectType, PolicyRecord, Store, generate_name,
};
use futures::future;
use openshell_core::proto::setting_value;
use openshell_core::proto::{
ApproveAllDraftChunksRequest, ApproveAllDraftChunksResponse, ApproveDraftChunkRequest,
ApproveDraftChunkResponse, ClearDraftChunksRequest, ClearDraftChunksResponse,
CreateProviderRequest, CreateSandboxRequest, CreateSshSessionRequest, CreateSshSessionResponse,
DeleteProviderRequest, DeleteProviderResponse, DeleteSandboxRequest, DeleteSandboxResponse,
DraftHistoryEntry, EditDraftChunkRequest, EditDraftChunkResponse, EffectiveSetting,
ExecSandboxEvent, ExecSandboxExit, ExecSandboxRequest, ExecSandboxStderr, ExecSandboxStdout,
GetDraftHistoryRequest, GetDraftHistoryResponse, GetDraftPolicyRequest, GetDraftPolicyResponse,
GetGatewayConfigRequest, GetGatewayConfigResponse, GetProviderRequest, GetSandboxConfigRequest,
GetSandboxConfigResponse, GetSandboxLogsRequest, GetSandboxLogsResponse,
GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse,
GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest,
HealthRequest, HealthResponse, ListProvidersRequest, ListProvidersResponse,
ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxesRequest,
ListSandboxesResponse, PolicyChunk, PolicySource, PolicyStatus, Provider, ProviderResponse,
PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest,
RejectDraftChunkResponse, ReportPolicyStatusRequest, ReportPolicyStatusResponse,
RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxLogLine, SandboxPolicyRevision,
SandboxResponse, SandboxStreamEvent, ServiceStatus, SettingScope, SettingValue, SshSession,
SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UndoDraftChunkRequest,
UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest,
WatchSandboxRequest, open_shell_server::OpenShell,
};
use openshell_core::proto::{
Sandbox, SandboxPhase, SandboxPolicy as ProtoSandboxPolicy, SandboxTemplate,
};
use openshell_core::settings::{self, SettingValueKind};
use prost::Message;
use russh::ChannelMsg;
use russh::client::AuthResult;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use tokio::io::AsyncReadExt;
use tokio::io::AsyncWriteExt;
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tonic::{Request, Response, Status};
use tracing::{debug, info, warn};
use crate::ServerState;
/// Maximum number of records a single list RPC may return.
///
/// Client-provided `limit` values are clamped to this ceiling to prevent
/// unbounded memory allocation from an excessively large page request.
pub const MAX_PAGE_SIZE: u32 = 1000;
// ---------------------------------------------------------------------------
// Field-level size limits
//
// Named constants for easy tuning. Each limit is chosen to be generous
// enough for legitimate payloads while capping resource-exhaustion vectors.
// ---------------------------------------------------------------------------
/// Maximum length for a sandbox or provider name (Kubernetes name limit).
const MAX_NAME_LEN: usize = 253;
/// Maximum number of providers that can be attached to a sandbox.
const MAX_PROVIDERS: usize = 32;
/// Maximum length for the `log_level` field.
const MAX_LOG_LEVEL_LEN: usize = 32;
/// Maximum number of entries in `spec.environment`.
const MAX_ENVIRONMENT_ENTRIES: usize = 128;
/// Maximum length for an environment map key (bytes).
const MAX_MAP_KEY_LEN: usize = 256;
/// Maximum length for an environment map value (bytes).
const MAX_MAP_VALUE_LEN: usize = 8192;
/// Maximum length for template string fields (`image`, `runtime_class_name`, `agent_socket`).
const MAX_TEMPLATE_STRING_LEN: usize = 1024;
/// Maximum number of entries in template map fields (`labels`, `annotations`, `environment`).
const MAX_TEMPLATE_MAP_ENTRIES: usize = 128;
/// Maximum serialized size (bytes) for template Struct fields (`resources`,
/// `volume_claim_templates`).
const MAX_TEMPLATE_STRUCT_SIZE: usize = 65_536;
/// Maximum serialized size (bytes) for the policy field.
const MAX_POLICY_SIZE: usize = 262_144;
/// Maximum length for a provider type slug.
const MAX_PROVIDER_TYPE_LEN: usize = 64;
/// Maximum number of entries in the provider `credentials` map.
const MAX_PROVIDER_CREDENTIALS_ENTRIES: usize = 32;
/// Maximum number of entries in the provider `config` map.
const MAX_PROVIDER_CONFIG_ENTRIES: usize = 64;
/// Internal object type for durable gateway-global settings.
const GLOBAL_SETTINGS_OBJECT_TYPE: &str = "gateway_settings";
/// Internal object id for the singleton global settings record.
///
/// Prefixed to avoid collision with other object types in the shared
/// `objects` table (PRIMARY KEY is on `id` alone, not `(object_type, id)`).
const GLOBAL_SETTINGS_ID: &str = "gateway_settings:global";
const GLOBAL_SETTINGS_NAME: &str = "global";
/// Internal object type for durable sandbox-scoped settings.
const SANDBOX_SETTINGS_OBJECT_TYPE: &str = "sandbox_settings";
/// Reserved settings key used to store global policy payload.
const POLICY_SETTING_KEY: &str = "policy";
/// Sentinel `sandbox_id` used to store global policy revisions in the
/// `sandbox_policies` table alongside sandbox-scoped revisions.
const GLOBAL_POLICY_SANDBOX_ID: &str = "__global__";
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
struct StoredSettings {
revision: u64,
settings: BTreeMap<String, StoredSettingValue>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", content = "value")]
enum StoredSettingValue {
String(String),
Bool(bool),
Int(i64),
/// Hex-encoded binary payload.
Bytes(String),
}
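// Sketch of the serialized shape, assuming the store persists these
// records as JSON: with `#[serde(tag = "type", content = "value")]`,
// values are adjacently tagged, so a `StoredSettings` record might
// look like (the `policy` entry uses the hex-encoded `Bytes` variant):
//
//   {
//     "revision": 3,
//     "settings": {
//       "log_level": { "type": "String", "value": "debug" },
//       "policy": { "type": "Bytes", "value": "0a04..." }
//     }
//   }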
/// Clamp a client-provided page `limit`.
///
/// Returns `default` when `raw` is 0 (the protobuf zero-value convention),
/// otherwise returns the smaller of `raw` and `max`.
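///
/// A minimal illustration (marked `ignore` since the doctest import path
/// depends on the enclosing crate):
///
/// ```ignore
/// assert_eq!(clamp_limit(0, 100, MAX_PAGE_SIZE), 100); // 0 selects the default
/// assert_eq!(clamp_limit(250, 100, MAX_PAGE_SIZE), 250); // under the cap: unchanged
/// assert_eq!(clamp_limit(50_000, 100, MAX_PAGE_SIZE), 1_000); // clamped to the ceiling
/// ```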
pub fn clamp_limit(raw: u32, default: u32, max: u32) -> u32 {
if raw == 0 { default } else { raw.min(max) }
}
/// OpenShell gRPC service implementation.
#[derive(Debug, Clone)]
pub struct OpenShellService {
state: Arc<ServerState>,
}
impl OpenShellService {
/// Create a new OpenShell service.
#[must_use]
#[allow(clippy::missing_const_for_fn)]
pub fn new(state: Arc<ServerState>) -> Self {
Self { state }
}
}
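// Wiring sketch (illustrative, not part of this module): the service is
// typically mounted via the tonic-generated `OpenShellServer` wrapper.
// The module path, listen address, and error type below are assumptions.
//
// ```ignore
// use openshell_core::proto::open_shell_server::OpenShellServer;
//
// async fn serve(state: Arc<ServerState>) -> Result<(), tonic::transport::Error> {
//     tonic::transport::Server::builder()
//         .add_service(OpenShellServer::new(OpenShellService::new(state)))
//         .serve("0.0.0.0:50051".parse().expect("valid socket address"))
//         .await
// }
// ```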
#[tonic::async_trait]
impl OpenShell for OpenShellService {
async fn health(
&self,
_request: Request<HealthRequest>,
) -> Result<Response<HealthResponse>, Status> {
Ok(Response::new(HealthResponse {
status: ServiceStatus::Healthy.into(),
version: openshell_core::VERSION.to_string(),
}))
}
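// Client-side sketch for the RPC above (assumptions: the tonic-generated
// client lives at `openshell_core::proto::open_shell_client::OpenShellClient`;
// the address is illustrative):
//
// ```ignore
// let mut client = OpenShellClient::connect("http://127.0.0.1:50051").await?;
// let resp = client.health(HealthRequest::default()).await?.into_inner();
// println!("status={} version={}", resp.status, resp.version);
// ```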
async fn create_sandbox(
&self,
request: Request<CreateSandboxRequest>,
) -> Result<Response<SandboxResponse>, Status> {
let request = request.into_inner();
let spec = request
.spec
.ok_or_else(|| Status::invalid_argument("spec is required"))?;
// Validate field sizes before any I/O (fail fast on oversized payloads).
validate_sandbox_spec(&request.name, &spec)?;
// Validate provider names exist (fail fast). Credentials are fetched at
// runtime by the sandbox supervisor via GetSandboxProviderEnvironment.
for name in &spec.providers {
self.state
.store
.get_message_by_name::<Provider>(name)
.await
.map_err(|e| Status::internal(format!("fetch provider failed: {e}")))?
.ok_or_else(|| {
Status::failed_precondition(format!("provider '{name}' not found"))
})?;
}
// Ensure the template always carries the resolved image so clients
// (CLI, TUI, etc.) can read the actual image from the stored sandbox.
let mut spec = spec;
let template = spec.template.get_or_insert_with(SandboxTemplate::default);
if template.image.is_empty() {
template.image = self.state.sandbox_client.default_image().to_string();
}
if spec.gpu {
self.state
.sandbox_client
.validate_gpu_support()
.await
.map_err(|status| {
warn!(error = %status, "Rejecting GPU sandbox request");
status
})?;
}
// Ensure process identity defaults to "sandbox" when missing or
// empty, then validate policy safety before persisting.
if let Some(ref mut policy) = spec.policy {
openshell_policy::ensure_sandbox_process_identity(policy);
validate_policy_safety(policy)?;
}
let id = uuid::Uuid::new_v4().to_string();
let name = if request.name.is_empty() {
petname::petname(2, "-").unwrap_or_else(generate_name)
} else {
request.name.clone()
};
let namespace = self.state.config.sandbox_namespace.clone();
let sandbox = Sandbox {
id: id.clone(),
name: name.clone(),
namespace,
spec: Some(spec),
status: None,
phase: SandboxPhase::Provisioning as i32,
..Default::default()
};
// Reject duplicate names early, before touching the index or store.
// This mirrors the provider-creation pattern (see `create_provider`).
let existing = self
.state
.store
.get_message_by_name::<Sandbox>(&name)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?;
if existing.is_some() {
return Err(Status::already_exists(format!(
"sandbox '{name}' already exists"
)));
}
// Persist to the store FIRST so the sandbox watcher always finds
// the record with `spec` populated. If we created the k8s
// resource first, the watcher could race us and write a fallback
// record with `spec: None`, causing the supervisor to fail with
// "sandbox has no spec".
self.state.sandbox_index.update_from_sandbox(&sandbox);
self.state
.store
.put_message(&sandbox)
.await
.map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?;
// Now create the Kubernetes resource. If this fails, clean up
// the store entry to avoid orphans.
match self.state.sandbox_client.create(&sandbox).await {
Ok(_) => {}
Err(kube::Error::Api(err)) if err.code == 409 => {
// Clean up the store entry we just wrote.
let _ = self.state.store.delete("sandbox", &id).await;
self.state.sandbox_index.remove_sandbox(&id);
warn!(
sandbox_id = %id,
sandbox_name = %name,
"Sandbox already exists in Kubernetes"
);
return Err(Status::already_exists("sandbox already exists"));
}
Err(err) => {
// Clean up the store entry we just wrote.
let _ = self.state.store.delete("sandbox", &id).await;
self.state.sandbox_index.remove_sandbox(&id);
warn!(
sandbox_id = %id,
sandbox_name = %name,
error = %err,
"CreateSandbox request failed"
);
return Err(Status::internal(format!(
"create sandbox in kubernetes failed: {err}"
)));
}
}
self.state.sandbox_watch_bus.notify(&id);
info!(
sandbox_id = %id,
sandbox_name = %name,
"CreateSandbox request completed successfully"
);
Ok(Response::new(SandboxResponse {
sandbox: Some(sandbox),
}))
}
type WatchSandboxStream = ReceiverStream<Result<SandboxStreamEvent, Status>>;
type ExecSandboxStream = ReceiverStream<Result<ExecSandboxEvent, Status>>;
async fn watch_sandbox(
&self,
request: Request<WatchSandboxRequest>,
) -> Result<Response<Self::WatchSandboxStream>, Status> {
let req = request.into_inner();
if req.id.is_empty() {
return Err(Status::invalid_argument("id is required"));
}
let sandbox_id = req.id.clone();
let follow_status = req.follow_status;
let follow_logs = req.follow_logs;
let follow_events = req.follow_events;
let log_tail = if req.log_tail_lines == 0 {
200
} else {
req.log_tail_lines
};
let stop_on_terminal = req.stop_on_terminal;
let log_since_ms = req.log_since_ms;
let log_sources = req.log_sources;
let log_min_level = req.log_min_level;
let (tx, rx) = mpsc::channel::<Result<SandboxStreamEvent, Status>>(256);
let state = self.state.clone();
// Spawn producer task.
tokio::spawn(async move {
// Validate that the sandbox exists BEFORE subscribing to any buses.
// This prevents creating bus entries for non-existent sandbox IDs.
match state.store.get_message::<Sandbox>(&sandbox_id).await {
Ok(Some(_)) => {} // sandbox exists, proceed
Ok(None) => {
let _ = tx.send(Err(Status::not_found("sandbox not found"))).await;
return;
}
Err(e) => {
let _ = tx
.send(Err(Status::internal(format!("fetch sandbox failed: {e}"))))
.await;
return;
}
}
// Subscribe to all buses BEFORE reading the snapshot to avoid
// missing notifications that fire between the snapshot read and subscribe.
let mut status_rx = if follow_status {
Some(state.sandbox_watch_bus.subscribe(&sandbox_id))
} else {
None
};
let mut log_rx = if follow_logs {
Some(state.tracing_log_bus.subscribe(&sandbox_id))
} else {
None
};
let mut platform_rx = if follow_events {
Some(
state
.tracing_log_bus
.platform_event_bus
.subscribe(&sandbox_id),
)
} else {
None
};
// Re-read the snapshot now that we have subscriptions active
// (avoids missing notifications between validate and subscribe).
match state.store.get_message::<Sandbox>(&sandbox_id).await {
Ok(Some(sandbox)) => {
state.sandbox_index.update_from_sandbox(&sandbox);
let _ = tx
.send(Ok(SandboxStreamEvent {
payload: Some(
openshell_core::proto::sandbox_stream_event::Payload::Sandbox(
sandbox.clone(),
),
),
}))
.await;
if stop_on_terminal {
let phase =
SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown);
// Only stop on Ready; the Error phase may be transient (e.g.,
// ReconcilerError) and the sandbox may recover. Let the client decide
// how to handle errors.
if phase == SandboxPhase::Ready {
return;
}
}
}
Ok(None) => {
// Sandbox was deleted between validate and subscribe — end stream.
let _ = tx.send(Err(Status::not_found("sandbox not found"))).await;
return;
}
Err(e) => {
let _ = tx
.send(Err(Status::internal(format!("fetch sandbox failed: {e}"))))
.await;
return;
}
}
// Replay tail logs (best-effort), filtered by log_since_ms, log_sources,
// and log_min_level.
if follow_logs {
for evt in state.tracing_log_bus.tail(&sandbox_id, log_tail as usize) {
if let Some(openshell_core::proto::sandbox_stream_event::Payload::Log(
ref log,
)) = evt.payload
{
if log_since_ms > 0 && log.timestamp_ms < log_since_ms {
continue;
}
if !log_sources.is_empty() && !source_matches(&log.source, &log_sources) {
continue;
}
if !level_matches(&log.level, &log_min_level) {
continue;
}
}
if tx.send(Ok(evt)).await.is_err() {
return;
}
}
}
// Replay buffered platform events (best-effort) so late subscribers
// see Kubernetes events (Scheduled, Pulling, etc.) that already fired.
if follow_events {
for evt in state
.tracing_log_bus
.platform_event_bus
.tail(&sandbox_id, 50)
{
if tx.send(Ok(evt)).await.is_err() {
return;
}
}
}
loop {
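// Each arm below wraps an `Option` of a receiver: when the corresponding
// follow flag was not requested, the arm awaits `future::pending()` and
// never completes, effectively disabling that branch of the `select!`.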
tokio::select! {
res = async {
match status_rx.as_mut() {
Some(rx) => rx.recv().await,
None => future::pending().await,
}
} => {
match res {
Ok(()) => {
match state.store.get_message::<Sandbox>(&sandbox_id).await {
Ok(Some(sandbox)) => {
state.sandbox_index.update_from_sandbox(&sandbox);
if tx.send(Ok(SandboxStreamEvent { payload: Some(openshell_core::proto::sandbox_stream_event::Payload::Sandbox(sandbox.clone()))})).await.is_err() {
return;
}
if stop_on_terminal {
let phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown);
// Only stop on Ready; the Error phase may be transient (e.g.,
// ReconcilerError) and the sandbox may recover. Let the client decide
// how to handle errors.
if phase == SandboxPhase::Ready {
return;
}
}
}
Ok(None) => {
// Deleted; end stream.
return;
}
Err(e) => {
let _ = tx.send(Err(Status::internal(format!("fetch sandbox failed: {e}")))).await;
return;
}
}
}
Err(err) => {
let _ = tx.send(Err(crate::sandbox_watch::broadcast_to_status(err))).await;
return;
}
}
}
res = async {
match log_rx.as_mut() {
Some(rx) => rx.recv().await,
None => future::pending().await,
}
} => {
match res {
Ok(evt) => {
// Apply source + level filter on live log events.
if let Some(openshell_core::proto::sandbox_stream_event::Payload::Log(ref log)) = evt.payload {
if !log_sources.is_empty() && !source_matches(&log.source, &log_sources) {
continue;
}
if !level_matches(&log.level, &log_min_level) {
continue;
}
}
if tx.send(Ok(evt)).await.is_err() {
return;
}
}
Err(err) => {
let _ = tx.send(Err(crate::sandbox_watch::broadcast_to_status(err))).await;
return;
}
}
}
res = async {
match platform_rx.as_mut() {
Some(rx) => rx.recv().await,
None => future::pending().await,
}
} => {
match res {
Ok(evt) => {
if tx.send(Ok(evt)).await.is_err() {
return;
}
}
Err(err) => {
let _ = tx.send(Err(crate::sandbox_watch::broadcast_to_status(err))).await;
return;
}
}
}
}
}
});
Ok(Response::new(ReceiverStream::new(rx)))
}
async fn get_sandbox(
&self,
request: Request<GetSandboxRequest>,
) -> Result<Response<SandboxResponse>, Status> {
let name = request.into_inner().name;
if name.is_empty() {
return Err(Status::invalid_argument("name is required"));
}
let sandbox = self
.state
.store
.get_message_by_name::<Sandbox>(&name)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?;
let sandbox = sandbox.ok_or_else(|| Status::not_found("sandbox not found"))?;
Ok(Response::new(SandboxResponse {
sandbox: Some(sandbox),
}))
}
async fn list_sandboxes(
&self,
request: Request<ListSandboxesRequest>,
) -> Result<Response<ListSandboxesResponse>, Status> {
let request = request.into_inner();
let limit = clamp_limit(request.limit, 100, MAX_PAGE_SIZE);
let records = self
.state
.store
.list(Sandbox::object_type(), limit, request.offset)
.await
.map_err(|e| Status::internal(format!("list sandboxes failed: {e}")))?;
let mut sandboxes = Vec::with_capacity(records.len());
for record in records {
let mut sandbox = Sandbox::decode(record.payload.as_slice())
.map_err(|e| Status::internal(format!("decode sandbox failed: {e}")))?;
sandbox.created_at_ms = record.created_at_ms;
sandboxes.push(sandbox);
}
Ok(Response::new(ListSandboxesResponse { sandboxes }))
}
async fn delete_sandbox(
&self,
request: Request<DeleteSandboxRequest>,
) -> Result<Response<DeleteSandboxResponse>, Status> {
let name = request.into_inner().name;
if name.is_empty() {
return Err(Status::invalid_argument("name is required"));
}
let sandbox = self
.state
.store
.get_message_by_name::<Sandbox>(&name)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?;
let Some(mut sandbox) = sandbox else {
return Err(Status::not_found("sandbox not found"));
};
let id = sandbox.id.clone();
sandbox.phase = SandboxPhase::Deleting as i32;
self.state
.store
.put_message(&sandbox)
.await
.map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?;
self.state.sandbox_index.update_from_sandbox(&sandbox);
self.state.sandbox_watch_bus.notify(&id);
// Clean up SSH sessions associated with this sandbox.
if let Ok(records) = self
.state
.store
.list(SshSession::object_type(), 1000, 0)
.await
{
for record in records {
if let Ok(session) = SshSession::decode(record.payload.as_slice())
&& session.sandbox_id == id
&& let Err(e) = self
.state
.store
.delete(SshSession::object_type(), &session.id)
.await
{
warn!(
session_id = %session.id,
error = %e,
"Failed to delete SSH session during sandbox cleanup"
);
}
}
}
// Clean up sandbox-scoped settings record.
if let Err(e) = self
.state
.store
.delete(SANDBOX_SETTINGS_OBJECT_TYPE, &sandbox_settings_id(&id))
.await
{
warn!(
sandbox_id = %id,
error = %e,
"Failed to delete sandbox settings during cleanup"
);
}
let deleted = match self.state.sandbox_client.delete(&sandbox.name).await {
Ok(deleted) => deleted,
Err(err) => {
warn!(
sandbox_id = %id,
sandbox_name = %sandbox.name,
error = %err,
"DeleteSandbox request failed"
);
return Err(Status::internal(format!(
"delete sandbox in kubernetes failed: {err}"
)));
}
};
if !deleted && let Err(e) = self.state.store.delete(Sandbox::object_type(), &id).await {
warn!(sandbox_id = %id, error = %e, "Failed to clean up store after delete");
}
// Clean up bus entries to prevent unbounded memory growth.
self.state.tracing_log_bus.remove(&id);
self.state.tracing_log_bus.platform_event_bus.remove(&id);
self.state.sandbox_watch_bus.remove(&id);
info!(
sandbox_id = %id,
sandbox_name = %sandbox.name,
"DeleteSandbox request completed successfully"
);
Ok(Response::new(DeleteSandboxResponse { deleted }))
}
async fn create_provider(
&self,
request: Request<CreateProviderRequest>,
) -> Result<Response<ProviderResponse>, Status> {
let req = request.into_inner();
let provider = req
.provider
.ok_or_else(|| Status::invalid_argument("provider is required"))?;
let provider = create_provider_record(self.state.store.as_ref(), provider).await?;
Ok(Response::new(ProviderResponse {
provider: Some(provider),
}))
}
async fn get_provider(
&self,
request: Request<GetProviderRequest>,
) -> Result<Response<ProviderResponse>, Status> {
let name = request.into_inner().name;
let provider = get_provider_record(self.state.store.as_ref(), &name).await?;
Ok(Response::new(ProviderResponse {
provider: Some(provider),
}))
}
async fn list_providers(
&self,
request: Request<ListProvidersRequest>,
) -> Result<Response<ListProvidersResponse>, Status> {
let request = request.into_inner();
let limit = clamp_limit(request.limit, 100, MAX_PAGE_SIZE);
let providers =
list_provider_records(self.state.store.as_ref(), limit, request.offset).await?;
Ok(Response::new(ListProvidersResponse { providers }))
}
async fn update_provider(
&self,
request: Request<UpdateProviderRequest>,
) -> Result<Response<ProviderResponse>, Status> {
let req = request.into_inner();
let provider = req
.provider
.ok_or_else(|| Status::invalid_argument("provider is required"))?;
let provider = update_provider_record(self.state.store.as_ref(), provider).await?;
Ok(Response::new(ProviderResponse {
provider: Some(provider),
}))
}
async fn delete_provider(
&self,
request: Request<DeleteProviderRequest>,
) -> Result<Response<DeleteProviderResponse>, Status> {
let name = request.into_inner().name;
let deleted = delete_provider_record(self.state.store.as_ref(), &name).await?;
Ok(Response::new(DeleteProviderResponse { deleted }))
}
async fn get_sandbox_config(
&self,
request: Request<GetSandboxConfigRequest>,
) -> Result<Response<GetSandboxConfigResponse>, Status> {
let sandbox_id = request.into_inner().sandbox_id;
let sandbox = self
.state
.store
.get_message::<Sandbox>(&sandbox_id)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?
.ok_or_else(|| Status::not_found("sandbox not found"))?;
// Try to get the latest policy from the policy history table.
let latest = self
.state
.store
.get_latest_policy(&sandbox_id)
.await
.map_err(|e| Status::internal(format!("fetch policy history failed: {e}")))?;
let mut policy_source = PolicySource::Sandbox;
let (mut policy, mut version, mut policy_hash) = if let Some(record) = latest {
let decoded = ProtoSandboxPolicy::decode(record.policy_payload.as_slice())
.map_err(|e| Status::internal(format!("decode policy failed: {e}")))?;
debug!(
sandbox_id = %sandbox_id,
version = record.version,
"GetSandboxConfig served from policy history"
);
(
Some(decoded),
u32::try_from(record.version).unwrap_or(0),
record.policy_hash,
)
} else {
// Lazy backfill: no policy history exists yet.
let spec = sandbox
.spec
.ok_or_else(|| Status::internal("sandbox has no spec"))?;
match spec.policy {
// If spec.policy is None, the sandbox was created without a policy.
// Return an empty policy payload so the sandbox can discover policy
// from disk or fall back to its restrictive default.
None => {
debug!(
sandbox_id = %sandbox_id,
"GetSandboxConfig: no policy configured, returning empty response"
);
(None, 0, String::new())
}
Some(spec_policy) => {
let hash = deterministic_policy_hash(&spec_policy);
let payload = spec_policy.encode_to_vec();
let policy_id = uuid::Uuid::new_v4().to_string();
// Best-effort backfill: if it fails (e.g., concurrent backfill race), we still
// return the policy from spec.
if let Err(e) = self
.state
.store
.put_policy_revision(&policy_id, &sandbox_id, 1, &payload, &hash)
.await
{
warn!(
sandbox_id = %sandbox_id,
error = %e,
"Failed to backfill policy version 1"
);
} else if let Err(e) = self
.state
.store
.update_policy_status(&sandbox_id, 1, "loaded", None, None)
.await
{
warn!(
sandbox_id = %sandbox_id,
error = %e,
"Failed to mark backfilled policy as loaded"
);
}
info!(
sandbox_id = %sandbox_id,
"GetSandboxConfig served from spec (backfilled version 1)"
);
(Some(spec_policy), 1, hash)
}
}
};
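// Resolution order so far: (1) latest revision from the policy history
// table, (2) lazy backfill from `spec.policy` as version 1, (3) no
// policy at all. A global policy configured in gateway settings
// (checked below) overrides all three.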
let global_settings = load_global_settings(self.state.store.as_ref()).await?;
let sandbox_settings =
load_sandbox_settings(self.state.store.as_ref(), &sandbox_id).await?;
let mut global_policy_version: u32 = 0;
if let Some(global_policy) = decode_policy_from_global_settings(&global_settings)? {
policy = Some(global_policy.clone());
policy_hash = deterministic_policy_hash(&global_policy);
policy_source = PolicySource::Global;
// Keep sandbox policy version for status APIs, but global policy
// updates are tracked via config_revision.
if version == 0 {
version = 1;
}
// Look up the global policy revision version number.
if let Ok(Some(global_rev)) = self
.state
.store
.get_latest_policy(GLOBAL_POLICY_SANDBOX_ID)
.await
{
global_policy_version = u32::try_from(global_rev.version).unwrap_or(0);
}
}
let settings = merge_effective_settings(&global_settings, &sandbox_settings)?;
let config_revision = compute_config_revision(policy.as_ref(), &settings, policy_source);
Ok(Response::new(GetSandboxConfigResponse {
policy,
version,
policy_hash,
settings,
config_revision,
policy_source: policy_source.into(),
global_policy_version,
}))
}
async fn get_gateway_config(
&self,
_request: Request<GetGatewayConfigRequest>,
) -> Result<Response<GetGatewayConfigResponse>, Status> {
let global_settings = load_global_settings(self.state.store.as_ref()).await?;
let settings = materialize_global_settings(&global_settings)?;
Ok(Response::new(GetGatewayConfigResponse {
settings,
settings_revision: global_settings.revision,
}))
}
async fn get_sandbox_provider_environment(
&self,
request: Request<GetSandboxProviderEnvironmentRequest>,
) -> Result<Response<GetSandboxProviderEnvironmentResponse>, Status> {
let sandbox_id = request.into_inner().sandbox_id;
let sandbox = self
.state
.store
.get_message::<Sandbox>(&sandbox_id)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?
.ok_or_else(|| Status::not_found("sandbox not found"))?;
let spec = sandbox
.spec
.ok_or_else(|| Status::internal("sandbox has no spec"))?;
let environment =
resolve_provider_environment(self.state.store.as_ref(), &spec.providers).await?;
info!(
sandbox_id = %sandbox_id,
provider_count = spec.providers.len(),
env_count = environment.len(),
"GetSandboxProviderEnvironment request completed successfully"
);
Ok(Response::new(GetSandboxProviderEnvironmentResponse {
environment,
}))
}
async fn create_ssh_session(
&self,
request: Request<CreateSshSessionRequest>,
) -> Result<Response<CreateSshSessionResponse>, Status> {
let req = request.into_inner();
if req.sandbox_id.is_empty() {
return Err(Status::invalid_argument("sandbox_id is required"));
}
let sandbox = self
.state
.store
.get_message::<Sandbox>(&req.sandbox_id)
.await
.map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?
.ok_or_else(|| Status::not_found("sandbox not found"))?;
if SandboxPhase::try_from(sandbox.phase).ok() != Some(SandboxPhase::Ready) {
return Err(Status::failed_precondition("sandbox is not ready"));
}
let token = uuid::Uuid::new_v4().to_string();
let now_ms = current_time_ms()
.map_err(|e| Status::internal(format!("timestamp generation failed: {e}")))?;
let expires_at_ms = if self.state.config.ssh_session_ttl_secs > 0 {
now_ms + (self.state.config.ssh_session_ttl_secs as i64 * 1000)
} else {
0
};
let session = SshSession {
id: token.clone(),
sandbox_id: req.sandbox_id.clone(),
token: token.clone(),
created_at_ms: now_ms,
revoked: false,
name: generate_name(),
expires_at_ms,
};
self.state
.store
.put_message(&session)
.await
.map_err(|e| Status::internal(format!("persist ssh session failed: {e}")))?;
let (gateway_host, gateway_port) = resolve_gateway(&self.state.config);
let scheme = if self.state.config.tls.is_some() {
"https"
} else {
"http"
};
Ok(Response::new(CreateSshSessionResponse {
sandbox_id: req.sandbox_id,