From af53ce026d2788744c093b1c81e4ad7dd0140062 Mon Sep 17 00:00:00 2001 From: Greg Mitchell Date: Wed, 27 May 2026 05:16:32 +0000 Subject: [PATCH 1/5] sdk: add serviceability go executor CreateUser/DeleteUser + reconcile planner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Solana-side primitives the device-stress orchestrator (#3746) needs: - CreateUser / DeleteUser methods on the Go serviceability executor (variants 36 / 42), with account-list construction mirroring the Rust SDK and a post-confirmation visibility wait so callers can record t_activate against the user PDA. - PDA helpers: GetUserPDA, GetAccessPassPDA, GetTunnelIdsPDA, GetDzPrefixBlockPDA — seed bytes mirrored from smartcontract/programs/doublezero-serviceability/src/pda.rs. - Pure PlanReconcile function and ReconcilePlan type for sweep delta planning, deterministic via ClientIp-ascending sort. - Rust fixture generator extended to emit user_create_args.{bin,json} and user_delete_args.{bin,json}; Go tests load them as the cross-language wire format contract. Part 1 of #3746 — library-only, no new binary. Closes #3770. 
--- .../fixtures/generate-fixtures/Cargo.lock | 4 +- .../fixtures/generate-fixtures/src/main.rs | 57 +++ .../testdata/fixtures/user_create_args.bin | Bin 0 -> 11 bytes .../testdata/fixtures/user_create_args.json | 31 ++ .../testdata/fixtures/user_delete_args.bin | 1 + .../testdata/fixtures/user_delete_args.json | 16 + .../sdk/go/serviceability/executor.go | 300 +++++++++++++- smartcontract/sdk/go/serviceability/pda.go | 62 ++- .../sdk/go/serviceability/pda_test.go | 113 ++++++ .../sdk/go/serviceability/reconcile.go | 59 +++ .../sdk/go/serviceability/reconcile_test.go | 173 ++++++++ .../sdk/go/serviceability/user_crud_test.go | 376 ++++++++++++++++++ 12 files changed, 1188 insertions(+), 4 deletions(-) create mode 100644 sdk/serviceability/testdata/fixtures/user_create_args.bin create mode 100644 sdk/serviceability/testdata/fixtures/user_create_args.json create mode 100644 sdk/serviceability/testdata/fixtures/user_delete_args.bin create mode 100644 sdk/serviceability/testdata/fixtures/user_delete_args.json create mode 100644 smartcontract/sdk/go/serviceability/pda_test.go create mode 100644 smartcontract/sdk/go/serviceability/reconcile.go create mode 100644 smartcontract/sdk/go/serviceability/reconcile_test.go create mode 100644 smartcontract/sdk/go/serviceability/user_crud_test.go diff --git a/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock b/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock index cb2af95041..9343abf199 100644 --- a/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock +++ b/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock @@ -346,7 +346,7 @@ dependencies = [ [[package]] name = "doublezero-program-common" -version = "0.23.0" +version = "0.24.0" dependencies = [ "borsh 1.6.0", "byteorder", @@ -358,7 +358,7 @@ dependencies = [ [[package]] name = "doublezero-serviceability" -version = "0.23.0" +version = "0.24.0" dependencies = [ "bitflags", "borsh 1.6.0", diff --git 
a/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs b/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs index b8ed35df67..9823a5776c 100644 --- a/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs +++ b/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs @@ -17,6 +17,9 @@ use borsh::BorshSerialize; use doublezero_serviceability::id_allocator::IdAllocator; use doublezero_serviceability::ip_allocator::IpAllocator; +use doublezero_serviceability::processors::user::{ + create::UserCreateArgs, delete::UserDeleteArgs, +}; use doublezero_serviceability::programversion::ProgramVersion; use doublezero_serviceability::state::{ accesspass::{AccessPass, AccessPassStatus, AccessPassType}, @@ -95,11 +98,65 @@ fn main() { generate_tenant(&fixtures_dir); generate_resource_extension_id(&fixtures_dir); generate_resource_extension_ip(&fixtures_dir); + generate_user_create_args(&fixtures_dir); + generate_user_delete_args(&fixtures_dir); println!(" all fixtures generated in {}", fixtures_dir.display()); } +/// Borsh-encoded `UserCreateArgs` (the body of instruction variant 36, without the +/// 1-byte discriminant). Field order: user_type, cyoa_type, client_ip, tunnel_endpoint, +/// dz_prefix_count. Non-default IP octets make endianness mistakes detectable. +fn generate_user_create_args(dir: &Path) { + let val = UserCreateArgs { + user_type: UserType::IBRL, + cyoa_type: UserCYOA::GREOverDIA, + client_ip: Ipv4Addr::new(10, 11, 12, 13), + tunnel_endpoint: Ipv4Addr::new(192, 168, 1, 2), + dz_prefix_count: 2, + }; + + let data = borsh::to_vec(&val).unwrap(); + + let meta = FixtureMeta { + name: "UserCreateArgs".into(), + // Not an account; account_type=0 since this is an instruction-args fixture. 
+ account_type: 0, + fields: vec![ + FieldValue { name: "UserType".into(), value: "0".into(), typ: "u8".into() }, + FieldValue { name: "CyoaType".into(), value: "1".into(), typ: "u8".into() }, + FieldValue { name: "ClientIp".into(), value: "10.11.12.13".into(), typ: "ipv4".into() }, + FieldValue { name: "TunnelEndpoint".into(), value: "192.168.1.2".into(), typ: "ipv4".into() }, + FieldValue { name: "DzPrefixCount".into(), value: "2".into(), typ: "u8".into() }, + ], + }; + + write_fixture(dir, "user_create_args", &data, &meta); +} + +/// Borsh-encoded `UserDeleteArgs` (the body of instruction variant 42, without the +/// 1-byte discriminant). Field order: dz_prefix_count, multicast_publisher_count. +fn generate_user_delete_args(dir: &Path) { + let val = UserDeleteArgs { + dz_prefix_count: 3, + multicast_publisher_count: 1, + }; + + let data = borsh::to_vec(&val).unwrap(); + + let meta = FixtureMeta { + name: "UserDeleteArgs".into(), + account_type: 0, + fields: vec![ + FieldValue { name: "DzPrefixCount".into(), value: "3".into(), typ: "u8".into() }, + FieldValue { name: "MulticastPublisherCount".into(), value: "1".into(), typ: "u8".into() }, + ], + }; + + write_fixture(dir, "user_delete_args", &data, &meta); +} + fn generate_global_state(dir: &Path) { let foundation_pk = pubkey_from_byte(0x01); let activator_pk = pubkey_from_byte(0x02); diff --git a/sdk/serviceability/testdata/fixtures/user_create_args.bin b/sdk/serviceability/testdata/fixtures/user_create_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8971a4e3a86073f24c2dcc645ad4f9406c873caf GIT binary patch literal 11 ScmZQz 0") + } + if args.DevicePubkey.IsZero() { + return solana.Signature{}, solana.PublicKey{}, errors.New("UserCreateArgs.DevicePubkey is required") + } + + instr, userPDA, err := e.buildCreateUserInstruction(args) + if err != nil { + return solana.Signature{}, solana.PublicKey{}, fmt.Errorf("build CreateUser instruction: %w", err) + } + + sig, _, err := 
e.executeTransaction(ctx, []solana.Instruction{instr}) + if err != nil { + return sig, userPDA, err + } + + if err := e.waitForAccountVisible(ctx, userPDA, e.waitForVisibleTimeout); err != nil { + return sig, userPDA, fmt.Errorf("post-confirm visibility timeout for user PDA: %w", err) + } + return sig, userPDA, nil +} + +// DeleteUser submits a DeleteUser instruction (variant 42) and waits for the user +// PDA to disappear from chain. The function reads the user account first so it +// can derive the device-dependent PDAs and the multicast-publisher flag. +func (e *Executor) DeleteUser(ctx context.Context, userPubkey solana.PublicKey) (solana.Signature, error) { + if e.signer == nil { + return solana.Signature{}, ErrNoPrivateKey + } + if e.programID.IsZero() { + return solana.Signature{}, ErrNoProgramID + } + + info, err := e.rpc.GetAccountInfo(ctx, userPubkey) + if err != nil { + return solana.Signature{}, fmt.Errorf("fetch user account %s: %w", userPubkey, err) + } + if info == nil || info.Value == nil { + return solana.Signature{}, fmt.Errorf("user account %s not found", userPubkey) + } + rawData := info.Value.Data.GetBinary() + if len(rawData) == 0 { + return solana.Signature{}, fmt.Errorf("user account %s has empty data", userPubkey) + } + var user User + DeserializeUser(NewByteReader(rawData), &user) + if user.AccountType != UserType { + return solana.Signature{}, fmt.Errorf("account %s is not a User (type=%d)", userPubkey, user.AccountType) + } + user.PubKey = userPubkey + + // The Rust SDK currently passes dz_prefix_count=1 / multicast_publisher_count=1 + // because all users are created with exactly one DzPrefixBlock. Stress-orchestrator + // users likewise use DzPrefixCount=1, so 1 is the correct value here. Diverging + // requires fetching the Device record — out of scope for the SDK primitive. 
+ const dzPrefixCount uint8 = 1 + const multicastPublisherCount uint8 = 1 + + instr, err := e.buildDeleteUserInstruction(userPubkey, user, dzPrefixCount, multicastPublisherCount) + if err != nil { + return solana.Signature{}, fmt.Errorf("build DeleteUser instruction: %w", err) + } + + sig, _, err := e.executeTransaction(ctx, []solana.Instruction{instr}) + if err != nil { + return sig, err + } + + if err := e.waitForAccountGone(ctx, userPubkey, e.waitForVisibleTimeout); err != nil { + return sig, fmt.Errorf("post-confirm visibility timeout waiting for user PDA closure: %w", err) + } + return sig, nil +} + +// buildCreateUserInstruction packs the variant-36 payload and assembles the account +// list in the order the on-chain processor expects: +// +// [user_pda, device, accesspass, globalstate, +// user_tunnel_block, multicast_publisher_block, device_tunnel_ids, +// dz_prefix_block[0..N], optional_tenant, payer, system] +func (e *Executor) buildCreateUserInstruction(args UserCreateArgs) (solana.Instruction, solana.PublicKey, error) { + data := make([]byte, 12) + data[0] = instructionCreateUser + data[1] = byte(args.UserType) + data[2] = byte(args.CyoaType) + copy(data[3:7], args.ClientIP[:]) + copy(data[7:11], args.TunnelEndpoint[:]) + data[11] = args.DzPrefixCount + + userPDA, _, err := GetUserPDA(e.programID, args.ClientIP, args.UserType) + if err != nil { + return nil, solana.PublicKey{}, fmt.Errorf("derive user PDA: %w", err) + } + accessPassPDA, _, err := GetAccessPassPDA(e.programID, args.ClientIP, e.signer.PublicKey()) + if err != nil { + return nil, userPDA, fmt.Errorf("derive accesspass PDA: %w", err) + } + globalStatePDA, _, err := GetGlobalStatePDA(e.programID) + if err != nil { + return nil, userPDA, fmt.Errorf("derive globalstate PDA: %w", err) + } + userTunnelBlockPDA, _, err := GetUserTunnelBlockPDA(e.programID) + if err != nil { + return nil, userPDA, fmt.Errorf("derive user tunnel block PDA: %w", err) + } + mcPublisherBlockPDA, _, err := 
GetMulticastPublisherBlockPDA(e.programID) + if err != nil { + return nil, userPDA, fmt.Errorf("derive multicast publisher block PDA: %w", err) + } + tunnelIdsPDA, _, err := GetTunnelIdsPDA(e.programID, args.DevicePubkey, 0) + if err != nil { + return nil, userPDA, fmt.Errorf("derive device tunnel ids PDA: %w", err) + } + + accounts := solana.AccountMetaSlice{ + solana.Meta(userPDA).WRITE(), + solana.Meta(args.DevicePubkey).WRITE(), + solana.Meta(accessPassPDA).WRITE(), + solana.Meta(globalStatePDA).WRITE(), + solana.Meta(userTunnelBlockPDA).WRITE(), + solana.Meta(mcPublisherBlockPDA).WRITE(), + solana.Meta(tunnelIdsPDA).WRITE(), + } + for i := uint64(0); i < uint64(args.DzPrefixCount); i++ { + dzPrefixPDA, _, err := GetDzPrefixBlockPDA(e.programID, args.DevicePubkey, i) + if err != nil { + return nil, userPDA, fmt.Errorf("derive dz_prefix_block[%d] PDA: %w", i, err) + } + accounts = append(accounts, solana.Meta(dzPrefixPDA).WRITE()) + } + if !args.TenantPubkey.IsZero() { + accounts = append(accounts, solana.Meta(args.TenantPubkey).WRITE()) + } + accounts = append(accounts, + solana.Meta(e.signer.PublicKey()).SIGNER().WRITE(), + solana.Meta(solana.SystemProgramID), + ) + + return &genericInstruction{ + programID: e.programID, + accounts: accounts, + data: data, + skipPermissionInject: true, + }, userPDA, nil +} + +// buildDeleteUserInstruction packs the variant-42 payload and assembles the account +// list in the order the on-chain processor expects: +// +// [user, accesspass, globalstate, device, +// user_tunnel_block, multicast_publisher_block, device_tunnel_ids, +// dz_prefix_block[0..N], optional_tenant, owner, payer, system] +// +// `multicastPublisherCount` mirrors the Rust SDK's behavior: the on-chain processor +// consumes the MulticastPublisherBlock slot unconditionally for the variant-42 +// layout, so DeleteUser's caller passes 1 even when the user was not created as a +// publisher. 
Exposed as a parameter so the byte-encoding can be tested independently. +func (e *Executor) buildDeleteUserInstruction(userPubkey solana.PublicKey, user User, dzPrefixCount, multicastPublisherCount uint8) (solana.Instruction, error) { + data := []byte{instructionDeleteUser, dzPrefixCount, multicastPublisherCount} + + accessPassPDA, _, err := GetAccessPassPDA(e.programID, user.ClientIp, user.Owner) + if err != nil { + return nil, fmt.Errorf("derive accesspass PDA: %w", err) + } + globalStatePDA, _, err := GetGlobalStatePDA(e.programID) + if err != nil { + return nil, fmt.Errorf("derive globalstate PDA: %w", err) + } + devicePubkey := solana.PublicKeyFromBytes(user.DevicePubKey[:]) + userTunnelBlockPDA, _, err := GetUserTunnelBlockPDA(e.programID) + if err != nil { + return nil, fmt.Errorf("derive user tunnel block PDA: %w", err) + } + mcPublisherBlockPDA, _, err := GetMulticastPublisherBlockPDA(e.programID) + if err != nil { + return nil, fmt.Errorf("derive multicast publisher block PDA: %w", err) + } + tunnelIdsPDA, _, err := GetTunnelIdsPDA(e.programID, devicePubkey, 0) + if err != nil { + return nil, fmt.Errorf("derive device tunnel ids PDA: %w", err) + } + + accounts := solana.AccountMetaSlice{ + solana.Meta(userPubkey).WRITE(), + solana.Meta(accessPassPDA).WRITE(), + solana.Meta(globalStatePDA).WRITE(), + solana.Meta(devicePubkey).WRITE(), + solana.Meta(userTunnelBlockPDA).WRITE(), + solana.Meta(mcPublisherBlockPDA).WRITE(), + solana.Meta(tunnelIdsPDA).WRITE(), + } + for i := uint64(0); i < uint64(dzPrefixCount); i++ { + dzPrefixPDA, _, err := GetDzPrefixBlockPDA(e.programID, devicePubkey, i) + if err != nil { + return nil, fmt.Errorf("derive dz_prefix_block[%d] PDA: %w", i, err) + } + accounts = append(accounts, solana.Meta(dzPrefixPDA).WRITE()) + } + var zeroPK [32]uint8 + if user.TenantPubKey != zeroPK { + accounts = append(accounts, solana.Meta(solana.PublicKeyFromBytes(user.TenantPubKey[:])).WRITE()) + } + accounts = append(accounts, + 
solana.Meta(solana.PublicKeyFromBytes(user.Owner[:])).WRITE(), + solana.Meta(e.signer.PublicKey()).SIGNER().WRITE(), + solana.Meta(solana.SystemProgramID), + ) + + return &genericInstruction{ + programID: e.programID, + accounts: accounts, + data: data, + skipPermissionInject: true, + }, nil +} + // UserBGPStatusUpdate holds the parameters for a single SetUserBGPStatus submission. type UserBGPStatusUpdate struct { UserPubkey solana.PublicKey @@ -231,6 +477,11 @@ type genericInstruction struct { programID solana.PublicKey accounts solana.AccountMetaSlice data []byte + // skipPermissionInject suppresses the executor's auto-appending of the Permission PDA. + // CreateUser/DeleteUser opt out because the on-chain processor uses accounts.len() + // to detect the optional tenant account; appending a trailing Permission shifts that + // count and would mis-classify accounts. + skipPermissionInject bool } func (i *genericInstruction) ProgramID() solana.PublicKey { @@ -278,7 +529,7 @@ func (e *Executor) executeTransaction(ctx context.Context, instructions []solana e.resolvePermissionPDA(ctx) if e.permissionPDA != nil { for _, instr := range instructions { - if gi, ok := instr.(*genericInstruction); ok { + if gi, ok := instr.(*genericInstruction); ok && !gi.skipPermissionInject { gi.accounts = append(gi.accounts, solana.Meta(*e.permissionPDA)) } } @@ -332,6 +583,53 @@ func (e *Executor) executeTransaction(ctx context.Context, instructions []solana return sig, res, nil } +// waitForAccountVisible polls GetAccountInfo until the given account is observable +// on-chain, or the deadline expires. Used post-CreateUser to give the caller a +// timestamp anchored to when the user PDA actually appears. 
+func (e *Executor) waitForAccountVisible(ctx context.Context, pubkey solana.PublicKey, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + info, err := e.rpc.GetAccountInfo(ctx, pubkey) + if err == nil && info != nil && info.Value != nil { + return nil + } + if time.Now().After(deadline) { + if err != nil { + return fmt.Errorf("account %s not visible: %w", pubkey, err) + } + return fmt.Errorf("account %s not visible before deadline", pubkey) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(250 * time.Millisecond): + } + } +} + +// waitForAccountGone polls GetAccountInfo until the given account no longer exists, +// or the deadline expires. Used post-DeleteUser to detect closure. +func (e *Executor) waitForAccountGone(ctx context.Context, pubkey solana.PublicKey, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + info, err := e.rpc.GetAccountInfo(ctx, pubkey) + if err == nil && (info == nil || info.Value == nil) { + return nil + } + if time.Now().After(deadline) { + if err != nil { + return fmt.Errorf("account %s still present: %w", pubkey, err) + } + return fmt.Errorf("account %s still present before deadline", pubkey) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(250 * time.Millisecond): + } + } +} + func (e *Executor) waitForSignatureVisible(ctx context.Context, sig solana.Signature, timeout time.Duration) error { deadline := time.Now().Add(timeout) diff --git a/smartcontract/sdk/go/serviceability/pda.go b/smartcontract/sdk/go/serviceability/pda.go index c147d3e103..39a2c39dbf 100644 --- a/smartcontract/sdk/go/serviceability/pda.go +++ b/smartcontract/sdk/go/serviceability/pda.go @@ -1,6 +1,10 @@ package serviceability -import "github.com/gagliardetto/solana-go" +import ( + "encoding/binary" + + "github.com/gagliardetto/solana-go" +) // PDA seeds matching Rust implementation in seeds.rs const ( @@ -16,6 +20,10 @@ const ( SeedMulticastPublisherBlock = 
"multicastpublisherblock" SeedTenant = "tenant" SeedPermission = "permission" + SeedUser = "user" + SeedAccessPass = "accesspass" + SeedTunnelIds = "tunnelids" + SeedDzPrefixBlock = "dzprefixblock" ) // DeriveGlobalStatePDA derives the PDA for the GlobalState account. @@ -123,3 +131,55 @@ func GetPermissionPDA(programID solana.PublicKey, userPayer solana.PublicKey) (s } return solana.FindProgramAddress(seeds, programID) } + +// GetUserPDA derives the PDA for a User account, keyed by (client_ip, user_type). +// Mirrors smartcontract/programs/doublezero-serviceability/src/pda.rs:get_user_pda. +func GetUserPDA(programID solana.PublicKey, clientIP [4]byte, userType UserUserType) (solana.PublicKey, uint8, error) { + seeds := [][]byte{ + []byte(SeedPrefix), + []byte(SeedUser), + clientIP[:], + {byte(userType)}, + } + return solana.FindProgramAddress(seeds, programID) +} + +// GetAccessPassPDA derives the PDA for an AccessPass account, keyed by (client_ip, user_payer). +// Mirrors smartcontract/programs/doublezero-serviceability/src/pda.rs:get_accesspass_pda. +func GetAccessPassPDA(programID solana.PublicKey, clientIP [4]byte, userPayer solana.PublicKey) (solana.PublicKey, uint8, error) { + seeds := [][]byte{ + []byte(SeedPrefix), + []byte(SeedAccessPass), + clientIP[:], + userPayer[:], + } + return solana.FindProgramAddress(seeds, programID) +} + +// GetTunnelIdsPDA derives the PDA for a per-device TunnelIds resource extension at the given index. +// Rust uses usize (8 bytes on 64-bit) little-endian for the index; we always encode 8 bytes. 
+func GetTunnelIdsPDA(programID solana.PublicKey, devicePK solana.PublicKey, index uint64) (solana.PublicKey, uint8, error) { + var idxBuf [8]byte + binary.LittleEndian.PutUint64(idxBuf[:], index) + seeds := [][]byte{ + []byte(SeedPrefix), + []byte(SeedTunnelIds), + devicePK[:], + idxBuf[:], + } + return solana.FindProgramAddress(seeds, programID) +} + +// GetDzPrefixBlockPDA derives the PDA for a per-device DzPrefixBlock resource extension at the given index. +// Rust uses usize (8 bytes on 64-bit) little-endian for the index; we always encode 8 bytes. +func GetDzPrefixBlockPDA(programID solana.PublicKey, devicePK solana.PublicKey, index uint64) (solana.PublicKey, uint8, error) { + var idxBuf [8]byte + binary.LittleEndian.PutUint64(idxBuf[:], index) + seeds := [][]byte{ + []byte(SeedPrefix), + []byte(SeedDzPrefixBlock), + devicePK[:], + idxBuf[:], + } + return solana.FindProgramAddress(seeds, programID) +} diff --git a/smartcontract/sdk/go/serviceability/pda_test.go b/smartcontract/sdk/go/serviceability/pda_test.go new file mode 100644 index 0000000000..614e243fb0 --- /dev/null +++ b/smartcontract/sdk/go/serviceability/pda_test.go @@ -0,0 +1,113 @@ +package serviceability_test + +import ( + "testing" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// PDAs are deterministic from (program_id, seeds), so we can cross-check the new +// helpers against an independent recomputation that mirrors the Rust seed bytes +// exactly. These tests catch typos in seed strings and width/endianness mistakes +// in the index encoding without requiring the Rust binary at test time. 
+ +func recomputePDA(t *testing.T, programID solana.PublicKey, seeds [][]byte) solana.PublicKey { + t.Helper() + pda, _, err := solana.FindProgramAddress(seeds, programID) + require.NoError(t, err) + return pda +} + +func TestGetUserPDA_MatchesRustSeeds(t *testing.T) { + t.Parallel() + programID := solana.NewWallet().PublicKey() + ip := [4]byte{198, 51, 100, 7} + + got, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeIBRLWithAllocatedIP) + require.NoError(t, err) + + want := recomputePDA(t, programID, [][]byte{ + []byte("doublezero"), + []byte("user"), + ip[:], + {byte(serviceability.UserTypeIBRLWithAllocatedIP)}, + }) + assert.Equal(t, want, got) +} + +func TestGetAccessPassPDA_MatchesRustSeeds(t *testing.T) { + t.Parallel() + programID := solana.NewWallet().PublicKey() + userPayer := solana.NewWallet().PublicKey() + ip := [4]byte{10, 0, 0, 5} + + got, _, err := serviceability.GetAccessPassPDA(programID, ip, userPayer) + require.NoError(t, err) + + want := recomputePDA(t, programID, [][]byte{ + []byte("doublezero"), + []byte("accesspass"), + ip[:], + userPayer[:], + }) + assert.Equal(t, want, got) +} + +func TestGetTunnelIdsPDA_IndexIsEightByteLE(t *testing.T) { + t.Parallel() + programID := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + + for _, idx := range []uint64{0, 1, 7, 256, 0xDEAD_BEEF} { + got, _, err := serviceability.GetTunnelIdsPDA(programID, device, idx) + require.NoError(t, err) + + // Build the index seed by hand: 8-byte little-endian. 
+ idxBytes := []byte{ + byte(idx), byte(idx >> 8), byte(idx >> 16), byte(idx >> 24), + byte(idx >> 32), byte(idx >> 40), byte(idx >> 48), byte(idx >> 56), + } + want := recomputePDA(t, programID, [][]byte{ + []byte("doublezero"), + []byte("tunnelids"), + device[:], + idxBytes, + }) + assert.Equal(t, want, got, "idx=%d", idx) + } +} + +func TestGetDzPrefixBlockPDA_IndexIsEightByteLE(t *testing.T) { + t.Parallel() + programID := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + + idx := uint64(3) + got, _, err := serviceability.GetDzPrefixBlockPDA(programID, device, idx) + require.NoError(t, err) + want := recomputePDA(t, programID, [][]byte{ + []byte("doublezero"), + []byte("dzprefixblock"), + device[:], + {0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + }) + assert.Equal(t, want, got) +} + +// TestUserPDA_DiffersByUserType guards against accidentally dropping the +// user_type byte from the seeds (which would collapse different user types onto +// the same PDA). +func TestUserPDA_DiffersByUserType(t *testing.T) { + t.Parallel() + programID := solana.NewWallet().PublicKey() + ip := [4]byte{10, 0, 0, 7} + + pdaIBRL, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeIBRL) + require.NoError(t, err) + pdaMulticast, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeMulticast) + require.NoError(t, err) + assert.NotEqual(t, pdaIBRL, pdaMulticast) +} diff --git a/smartcontract/sdk/go/serviceability/reconcile.go b/smartcontract/sdk/go/serviceability/reconcile.go new file mode 100644 index 0000000000..d61e20ea91 --- /dev/null +++ b/smartcontract/sdk/go/serviceability/reconcile.go @@ -0,0 +1,59 @@ +package serviceability + +import ( + "bytes" + "sort" + + "github.com/gagliardetto/solana-go" +) + +// ReconcilePlan describes the delta needed to drive the set of users owned by a +// given key toward a desired count. +type ReconcilePlan struct { + // ToCreate is the number of users to add. Always >= 0. 
+ ToCreate int + // ToDelete lists user PDAs to remove, in the order they should be deleted. + // Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker, so + // repeated calls against the same input produce identical plans. + ToDelete []solana.PublicKey +} + +// PlanReconcile decides what to create or delete so that the number of users +// owned by ownerFilter equals target. Users with a different Owner are ignored +// (neither counted nor deleted), which lets the stress orchestrator share a +// program with other tenants without disturbing them. +// +// The function is pure — no I/O — so it is safe to call repeatedly while the +// orchestrator polls live state. Returns a zero plan when target is negative. +func PlanReconcile(current []User, target int, ownerFilter solana.PublicKey) ReconcilePlan { + if target < 0 { + return ReconcilePlan{} + } + + var owned []User + for _, u := range current { + if bytes.Equal(u.Owner[:], ownerFilter[:]) { + owned = append(owned, u) + } + } + + switch { + case len(owned) < target: + return ReconcilePlan{ToCreate: target - len(owned)} + case len(owned) > target: + sort.Slice(owned, func(i, j int) bool { + if c := bytes.Compare(owned[i].ClientIp[:], owned[j].ClientIp[:]); c != 0 { + return c < 0 + } + return bytes.Compare(owned[i].PubKey[:], owned[j].PubKey[:]) < 0 + }) + victims := owned[target:] + out := make([]solana.PublicKey, len(victims)) + for i, u := range victims { + out[i] = solana.PublicKeyFromBytes(u.PubKey[:]) + } + return ReconcilePlan{ToDelete: out} + default: + return ReconcilePlan{} + } +} diff --git a/smartcontract/sdk/go/serviceability/reconcile_test.go b/smartcontract/sdk/go/serviceability/reconcile_test.go new file mode 100644 index 0000000000..335094b7b8 --- /dev/null +++ b/smartcontract/sdk/go/serviceability/reconcile_test.go @@ -0,0 +1,173 @@ +package serviceability_test + +import ( + "testing" + + "github.com/gagliardetto/solana-go" + 
"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// makeUser is a tiny helper to build a User suitable for PlanReconcile testing: +// only Owner, ClientIp, and PubKey actually influence the planner. +func makeUser(owner solana.PublicKey, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User { + return serviceability.User{ + Owner: owner, + ClientIp: clientIP, + PubKey: pubkey, + } +} + +func TestPlanReconcile(t *testing.T) { + t.Parallel() + + orchestrator := solana.NewWallet().PublicKey() + stranger := solana.NewWallet().PublicKey() + + // Stable pubkeys so we can assert exact ordering. + u1 := solana.NewWallet().PublicKey() + u2 := solana.NewWallet().PublicKey() + u3 := solana.NewWallet().PublicKey() + u4 := solana.NewWallet().PublicKey() + u5 := solana.NewWallet().PublicKey() + + ip := func(a, b, c, d byte) [4]byte { return [4]byte{a, b, c, d} } + + tests := []struct { + name string + current []serviceability.User + target int + owner solana.PublicKey + wantCreate int + wantDeleteIPs [][4]byte // ClientIp order we expect to see in ToDelete + }{ + { + name: "zero to N", + current: nil, + target: 4, + owner: orchestrator, + wantCreate: 4, + }, + { + name: "N to zero deletes in ip-ascending order", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 3)), + makeUser(orchestrator, u2, ip(10, 0, 0, 1)), + makeUser(orchestrator, u3, ip(10, 0, 0, 4)), + makeUser(orchestrator, u4, ip(10, 0, 0, 2)), + }, + target: 0, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)}, + }, + { + name: "partial trim deletes only the overflow", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 5)), + makeUser(orchestrator, u2, ip(10, 0, 0, 4)), + makeUser(orchestrator, u3, ip(10, 0, 0, 3)), + makeUser(orchestrator, u4, ip(10, 0, 0, 2)), + 
makeUser(orchestrator, u5, ip(10, 0, 0, 1)), + }, + target: 3, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)}, + }, + { + name: "partial grow asks for the missing count", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + }, + target: 5, + owner: orchestrator, + wantCreate: 3, + }, + { + name: "only foreign users present grows by full target", + current: []serviceability.User{ + makeUser(stranger, u1, ip(10, 0, 0, 1)), + makeUser(stranger, u2, ip(10, 0, 0, 2)), + makeUser(stranger, u3, ip(10, 0, 0, 3)), + }, + target: 2, + owner: orchestrator, + wantCreate: 2, + }, + { + name: "mixed ownership only counts and deletes owned", + current: []serviceability.User{ + makeUser(stranger, u1, ip(10, 0, 0, 9)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + makeUser(stranger, u3, ip(10, 0, 0, 8)), + makeUser(orchestrator, u4, ip(10, 0, 0, 1)), + }, + target: 1, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)}, + }, + { + name: "already at target produces zero plan", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + }, + target: 2, + owner: orchestrator, + wantCreate: 0, + }, + { + name: "negative target produces zero plan", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + }, + target: -1, + owner: orchestrator, + wantCreate: 0, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + plan := serviceability.PlanReconcile(tc.current, tc.target, tc.owner) + assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate") + require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length") + + // Resolve expected pubkeys via ClientIp lookup against the current set. 
+ ipToPubkey := map[[4]byte]solana.PublicKey{} + for _, u := range tc.current { + ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:]) + } + for i, ipKey := range tc.wantDeleteIPs { + assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey) + } + }) + } +} + +func TestPlanReconcile_TieBreaksByPubkey(t *testing.T) { + t.Parallel() + + orchestrator := solana.NewWallet().PublicKey() + sharedIP := [4]byte{10, 0, 0, 1} + + // Two users with the same ClientIp (artificial — onchain the IP is part of + // the PDA seed so collisions can't happen, but the tiebreak must still be + // deterministic). + pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) + pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) + + plan := serviceability.PlanReconcile([]serviceability.User{ + makeUser(orchestrator, pkB, sharedIP), + makeUser(orchestrator, pkA, sharedIP), + }, 0, orchestrator) + + require.Len(t, plan.ToDelete, 2) + // pkA (0xAA…) sorts before pkB (0xBB…). + assert.Equal(t, pkA, plan.ToDelete[0]) + assert.Equal(t, pkB, plan.ToDelete[1]) +} diff --git a/smartcontract/sdk/go/serviceability/user_crud_test.go b/smartcontract/sdk/go/serviceability/user_crud_test.go new file mode 100644 index 0000000000..808df0a8d2 --- /dev/null +++ b/smartcontract/sdk/go/serviceability/user_crud_test.go @@ -0,0 +1,376 @@ +package serviceability + +import ( + "context" + "errors" + "log/slog" + "os" + "path/filepath" + "runtime" + "sync/atomic" + "testing" + "time" + + "github.com/gagliardetto/solana-go" + solanarpc "github.com/gagliardetto/solana-go/rpc" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// loadArgsFixture loads a `.bin` payload from sdk/serviceability/testdata/fixtures/ +// for the cross-language wire-format check. 
+func loadArgsFixture(t *testing.T, name string) []byte { + t.Helper() + _, filename, _, _ := runtime.Caller(0) + dir := filepath.Join(filepath.Dir(filename), "..", "..", "..", "..", "sdk", "serviceability", "testdata", "fixtures") + bin, err := os.ReadFile(filepath.Join(dir, name+".bin")) + require.NoErrorf(t, err, "reading %s.bin", name) + return bin +} + +func TestBuildCreateUserInstruction(t *testing.T) { + t.Parallel() + + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + + args := UserCreateArgs{ + UserType: UserTypeIBRL, + CyoaType: CyoaTypeGREOverDIA, + ClientIP: [4]byte{10, 11, 12, 13}, + TunnelEndpoint: [4]byte{192, 168, 1, 2}, + DzPrefixCount: 2, + DevicePubkey: solana.NewWallet().PublicKey(), + } + + instr, userPDA, err := executor.buildCreateUserInstruction(args) + require.NoError(t, err) + + // Variant byte + 11-byte borsh body matching Rust UserCreateArgs. + data, err := instr.Data() + require.NoError(t, err) + require.Len(t, data, 12, "opcode (1) + borsh UserCreateArgs (11) = 12 bytes") + assert.Equal(t, byte(instructionCreateUser), data[0]) + assert.Equal(t, loadArgsFixture(t, "user_create_args"), data[1:], + "borsh body must match Rust-generated user_create_args.bin") + + // User PDA derivation is deterministic from (program_id, client_ip, user_type). + expectedPDA, _, err := GetUserPDA(executor.programID, args.ClientIP, args.UserType) + require.NoError(t, err) + assert.Equal(t, expectedPDA, userPDA) + + // Account count = 7 fixed + DzPrefixCount + payer + system (no tenant). + accs := instr.Accounts() + require.Len(t, accs, 7+int(args.DzPrefixCount)+2) + assert.Equal(t, userPDA, accs[0].PublicKey) + assert.True(t, accs[0].IsWritable) + assert.False(t, accs[0].IsSigner) + assert.Equal(t, args.DevicePubkey, accs[1].PublicKey) + // Last two slots: signer + system program. 
+ assert.Equal(t, executor.signer.PublicKey(), accs[len(accs)-2].PublicKey) + assert.True(t, accs[len(accs)-2].IsSigner) + assert.Equal(t, solana.SystemProgramID, accs[len(accs)-1].PublicKey) +} + +func TestBuildCreateUserInstruction_WithTenant(t *testing.T) { + t.Parallel() + + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + tenant := solana.NewWallet().PublicKey() + + args := UserCreateArgs{ + UserType: UserTypeIBRLWithAllocatedIP, + CyoaType: CyoaTypeGREOverFabric, + ClientIP: [4]byte{198, 51, 100, 7}, + TunnelEndpoint: [4]byte{0, 0, 0, 0}, + DzPrefixCount: 1, + DevicePubkey: solana.NewWallet().PublicKey(), + TenantPubkey: tenant, + } + instr, _, err := executor.buildCreateUserInstruction(args) + require.NoError(t, err) + + accs := instr.Accounts() + // Tenant slot sits between dz_prefix_block(s) and the payer/system tail. + tenantSlot := accs[len(accs)-3] + assert.Equal(t, tenant, tenantSlot.PublicKey) + assert.True(t, tenantSlot.IsWritable) +} + +func TestBuildCreateUserInstruction_RejectsZeroDzPrefix(t *testing.T) { + t.Parallel() + + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + _, _, err := executor.CreateUser(context.Background(), UserCreateArgs{ + UserType: UserTypeIBRL, + CyoaType: CyoaTypeGREOverDIA, + DzPrefixCount: 0, + DevicePubkey: solana.NewWallet().PublicKey(), + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "DzPrefixCount must be > 0") +} + +func TestBuildDeleteUserInstruction(t *testing.T) { + t.Parallel() + + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + + userPubkey := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + owner := solana.NewWallet().PublicKey() + user := User{ + AccountType: UserType, + Owner: owner, + UserType: UserTypeIBRL, + DevicePubKey: device, + ClientIp: [4]byte{10, 0, 0, 5}, + } + + // Use the fixture's (3, 1) values to exercise the borsh layout end-to-end + // against Rust output; production DeleteUser hard-codes (1, 1) — 
see the + // constant in DeleteUser itself. + instr, err := executor.buildDeleteUserInstruction(userPubkey, user, 3, 1) + require.NoError(t, err) + + data, err := instr.Data() + require.NoError(t, err) + require.Len(t, data, 3, "opcode (1) + borsh UserDeleteArgs (2) = 3 bytes") + assert.Equal(t, byte(instructionDeleteUser), data[0]) + assert.Equal(t, loadArgsFixture(t, "user_delete_args"), data[1:], + "borsh body must match Rust-generated user_delete_args.bin") + + accs := instr.Accounts() + // 7 fixed + 3 dz_prefix + owner + payer + system = 13 accounts (no tenant). + require.Len(t, accs, 13) + assert.Equal(t, userPubkey, accs[0].PublicKey) + assert.Equal(t, device, accs[3].PublicKey) + ownerSlot := accs[len(accs)-3] + assert.Equal(t, owner, ownerSlot.PublicKey) + assert.True(t, ownerSlot.IsWritable) + assert.Equal(t, executor.signer.PublicKey(), accs[len(accs)-2].PublicKey) + assert.True(t, accs[len(accs)-2].IsSigner) + assert.Equal(t, solana.SystemProgramID, accs[len(accs)-1].PublicKey) +} + +func TestBuildDeleteUserInstruction_WithTenant(t *testing.T) { + t.Parallel() + + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + + tenant := solana.NewWallet().PublicKey() + user := User{ + AccountType: UserType, + Owner: solana.NewWallet().PublicKey(), + TenantPubKey: tenant, + DevicePubKey: solana.NewWallet().PublicKey(), + UserType: UserTypeIBRL, + ClientIp: [4]byte{10, 0, 0, 5}, + } + + instr, err := executor.buildDeleteUserInstruction(solana.NewWallet().PublicKey(), user, 1, 1) + require.NoError(t, err) + + accs := instr.Accounts() + // Tenant sits before the owner/payer/system tail (3 trailing slots). 
+ tenantSlot := accs[len(accs)-4] + assert.Equal(t, tenant, tenantSlot.PublicKey) + assert.True(t, tenantSlot.IsWritable) +} + +func TestCreateUserWaitsForAccountVisible(t *testing.T) { + t.Parallel() + + signer := solana.NewWallet().PrivateKey + programID := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + args := UserCreateArgs{ + UserType: UserTypeIBRL, + CyoaType: CyoaTypeGREOverDIA, + ClientIP: [4]byte{10, 0, 0, 1}, + DzPrefixCount: 1, + DevicePubkey: device, + } + expectedPDA, _, err := GetUserPDA(programID, args.ClientIP, args.UserType) + require.NoError(t, err) + + // First call (permission probe) returns nil; the user-PDA probe then returns + // a non-nil Value so the visibility wait completes immediately. + var lookups atomic.Int32 + rpc := &mockRPCClient{ + getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) { + n := lookups.Add(1) + if account.Equals(expectedPDA) && n >= 2 { + return &solanarpc.GetAccountInfoResult{ + Value: &solanarpc.Account{Owner: programID}, + }, nil + } + return &solanarpc.GetAccountInfoResult{Value: nil}, nil + }, + } + executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(500*time.Millisecond)) + + sig, userPDA, err := executor.CreateUser(context.Background(), args) + require.NoError(t, err) + assert.NotEqual(t, solana.Signature{}, sig) + assert.Equal(t, expectedPDA, userPDA) + require.NotEmpty(t, rpc.sentTransactions) +} + +func TestCreateUserReportsVisibilityTimeout(t *testing.T) { + t.Parallel() + + signer := solana.NewWallet().PrivateKey + programID := solana.NewWallet().PublicKey() + rpc := &mockRPCClient{} // default: GetAccountInfo always returns nil + executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(50*time.Millisecond)) + + sig, userPDA, err := executor.CreateUser(context.Background(), UserCreateArgs{ + UserType: UserTypeIBRL, + CyoaType: 
CyoaTypeGREOverDIA, + ClientIP: [4]byte{10, 0, 0, 1}, + DzPrefixCount: 1, + DevicePubkey: solana.NewWallet().PublicKey(), + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "post-confirm visibility timeout") + // Signature and PDA are still returned so callers can correlate. + assert.NotEqual(t, solana.Signature{}, sig) + assert.NotEqual(t, solana.PublicKey{}, userPDA) +} + +func TestDeleteUserWaitsForAccountGone(t *testing.T) { + t.Parallel() + + signer := solana.NewWallet().PrivateKey + programID := solana.NewWallet().PublicKey() + userPubkey := solana.NewWallet().PublicKey() + + // Construct a borsh-serialized minimal User account body via DeserializeUser's + // inverse: we just write the fields by hand. + owner := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + userBytes := makeMinimalUserBytes(owner, device, [4]byte{10, 0, 0, 5}) + + // Sequence: GetAccountInfo returns user bytes once (initial DeleteUser read), nil + // thereafter (visibility wait sees account gone). Permission probe returns nil. + var lookups atomic.Int32 + rpc := &mockRPCClient{ + getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) { + n := lookups.Add(1) + if account.Equals(userPubkey) && n == 1 { + return &solanarpc.GetAccountInfoResult{ + Value: &solanarpc.Account{ + Owner: programID, + Data: solanarpc.DataBytesOrJSONFromBytes(userBytes), + }, + }, nil + } + return &solanarpc.GetAccountInfoResult{Value: nil}, nil + }, + } + executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(500*time.Millisecond)) + + sig, err := executor.DeleteUser(context.Background(), userPubkey) + require.NoError(t, err) + assert.NotEqual(t, solana.Signature{}, sig) + require.NotEmpty(t, rpc.sentTransactions) + + // Verify the submitted transaction references the device pulled from the User. 
+ tx := rpc.sentTransactions[0] + keys := tx.Message.AccountKeys + foundDevice := false + for _, k := range keys { + if k.Equals(device) { + foundDevice = true + break + } + } + assert.True(t, foundDevice, "device referenced by the user account must appear in the DeleteUser tx") +} + +func TestDeleteUserNotFound(t *testing.T) { + t.Parallel() + + signer := solana.NewWallet().PrivateKey + programID := solana.NewWallet().PublicKey() + rpc := &mockRPCClient{ + getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) { + return &solanarpc.GetAccountInfoResult{Value: nil}, nil + }, + } + executor := NewExecutor(slog.Default(), rpc, &signer, programID) + + _, err := executor.DeleteUser(context.Background(), solana.NewWallet().PublicKey()) + require.Error(t, err) + assert.Contains(t, err.Error(), "not found") +} + +func TestWaitForAccountVisible_TimeoutVsCancel(t *testing.T) { + t.Parallel() + + t.Run("returns nil when account appears", func(t *testing.T) { + var n atomic.Int32 + rpc := &mockRPCClient{ + getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) { + if n.Add(1) >= 2 { + return &solanarpc.GetAccountInfoResult{Value: &solanarpc.Account{}}, nil + } + return &solanarpc.GetAccountInfoResult{Value: nil}, nil + }, + } + executor, _ := newTestExecutor(t, rpc) + require.NoError(t, executor.waitForAccountVisible(context.Background(), solana.NewWallet().PublicKey(), time.Second)) + }) + + t.Run("returns error past deadline", func(t *testing.T) { + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + err := executor.waitForAccountVisible(context.Background(), solana.NewWallet().PublicKey(), 50*time.Millisecond) + require.Error(t, err) + }) + + t.Run("returns context error on cancel", func(t *testing.T) { + rpc := &mockRPCClient{} + executor, _ := newTestExecutor(t, rpc) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + err 
:= executor.waitForAccountVisible(ctx, solana.NewWallet().PublicKey(), time.Second) + require.Error(t, err) + assert.True(t, errors.Is(err, context.Canceled)) + }) +} + +// makeMinimalUserBytes hand-encodes a User account body matching DeserializeUser's +// field order. Most fields are zero — only AccountType, Owner, DevicePubKey, and +// ClientIp are populated, which is enough for buildDeleteUserInstruction. +func makeMinimalUserBytes(owner, device solana.PublicKey, clientIP [4]byte) []byte { + b := make([]byte, 0, 256) + b = append(b, byte(UserType)) // AccountType + b = append(b, owner[:]...) // Owner: 32 bytes + b = append(b, make([]byte, 16)...) // Index: u128 = 16 bytes + b = append(b, 0) // BumpSeed + b = append(b, byte(UserTypeIBRL)) // UserType + b = append(b, make([]byte, 32)...) // TenantPubKey (zero) + b = append(b, device[:]...) // DevicePubKey: 32 bytes + b = append(b, byte(CyoaTypeGREOverDIA)) // CyoaType + b = append(b, clientIP[:]...) // ClientIp: 4 bytes + b = append(b, make([]byte, 4)...) // DzIp: 4 bytes + b = append(b, 0, 0) // TunnelId: u16 + b = append(b, make([]byte, 5)...) // TunnelNet: 5 bytes + b = append(b, byte(UserStatusActivated)) + b = append(b, 0, 0, 0, 0) // Publishers: u32 len = 0 + b = append(b, 0, 0, 0, 0) // Subscribers: u32 len = 0 + b = append(b, make([]byte, 32)...) // ValidatorPubKey + b = append(b, make([]byte, 4)...) // TunnelEndpoint + b = append(b, 0) // TunnelFlags + b = append(b, 0) // BgpStatus + b = append(b, make([]byte, 8)...) // LastBgpUpAt + b = append(b, make([]byte, 8)...) // LastBgpReportedAt + b = append(b, make([]byte, 8)...) 
// BgpRttNs + return b +} From 88436b020c2295e673789b50fee73db311acb5a7 Mon Sep 17 00:00:00 2001 From: Greg Mitchell Date: Wed, 27 May 2026 14:55:50 +0000 Subject: [PATCH 2/5] sdk: add CHANGELOG entry for serviceability user CRUD + reconcile --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42c4e52e05..2540b3202c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file. - Add `--log-level <LEVEL>` global flag and initialize the `tracing` subscriber at startup. `LEVEL` is one of `off`, `error`, `warn` (default), `info`, `debug`, `trace`. Diagnostic logs go to stderr so `--json` output on stdout remains parseable. Honors the `RUST_LOG` environment variable when set, overriding the CLI-flag level for per-module filtering. Replaces the previous `println!("using keypair: ...")` stdout line with a `tracing::info!` event; the keypair confirmation now appears only at `--log-level info` or higher and no longer pollutes parseable stdout. (Named `--log-level` rather than the RFC-20 §Global-flags suggested `--verbose` / `-v` because the existing `doublezero connect` / `disconnect` subcommands already own a `--verbose` flag with `bool` type; the global flag deviation will be revisited when the daemon-control module crate is carved out.) - Build a `CliContext` once at binary startup from `--env`, the per-field global overrides (`--url`, `--ws`, `--solana-url`, `--program-id`, `--geo-program-id`, `--keypair`, `--sock-file`), and the persisted `~/.config/doublezero/cli/config.yml` (overridable via `DOUBLEZERO_CONFIG_FILE`), per RFC-20 (§CliContext). Precedence (highest wins): CLI flag > persisted config > env-derived default. When `--env` is not set and the persisted config has a serviceability program ID, the environment is derived from that program ID via `Environment::from_program_id`; otherwise the binary falls back to `Environment::default()`. 
The legacy `DZClient` is now constructed from the fully resolved `CliContext` URL, WebSocket, and program-ID values directly, so verbs that migrate to read `CliContext` see the same backend as the legacy bridge. Keypair resolution is intentionally left to `DZClient::new`'s internal `load_keypair` precedence (CLI `--keypair` flag > `DOUBLEZERO_KEYPAIR` env var > stdin > persisted config) so the `DOUBLEZERO_KEYPAIR` env var continues to override the persisted keypair path, as relied on by the e2e contributor-auth negative-authz suite. File reads happen only in the binary; module crates remain forbidden from touching the filesystem (RFC-20 §67). - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: ` followed by the full chain of causes on stderr. +- SDK (Go) + - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Adds a pure `PlanReconcile(current, target, ownerFilter)` planner that filters by user owner and returns a deterministic create/delete delta (ClientIp-ascending with PubKey tiebreak), used by the upcoming device-stress orchestrator to drive sweeps. 
Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)). ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22 From ed95322821a83f4e0bba0accbb12a168671aaf35 Mon Sep 17 00:00:00 2001 From: Greg Mitchell Date: Wed, 27 May 2026 15:09:34 +0000 Subject: [PATCH 3/5] sdk: drop PlanReconcile from this PR; defer to orchestrator PlanReconcile is orchestrator policy ("how many users do we want") rather than an SDK primitive ("how do I submit a CreateUser/DeleteUser"). Move it out of the serviceability SDK and land it alongside the device-stress orchestrator binary in part 2 of #3746. --- CHANGELOG.md | 2 +- .../sdk/go/serviceability/reconcile.go | 59 ------ .../sdk/go/serviceability/reconcile_test.go | 173 ------------------ 3 files changed, 1 insertion(+), 233 deletions(-) delete mode 100644 smartcontract/sdk/go/serviceability/reconcile.go delete mode 100644 smartcontract/sdk/go/serviceability/reconcile_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 2540b3202c..b0e2e843b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ All notable changes to this project will be documented in this file. - Build a `CliContext` once at binary startup from `--env`, the per-field global overrides (`--url`, `--ws`, `--solana-url`, `--program-id`, `--geo-program-id`, `--keypair`, `--sock-file`), and the persisted `~/.config/doublezero/cli/config.yml` (overridable via `DOUBLEZERO_CONFIG_FILE`), per RFC-20 (§CliContext). Precedence (highest wins): CLI flag > persisted config > env-derived default. When `--env` is not set and the persisted config has a serviceability program ID, the environment is derived from that program ID via `Environment::from_program_id`; otherwise the binary falls back to `Environment::default()`. 
The legacy `DZClient` is now constructed from the fully resolved `CliContext` URL, WebSocket, and program-ID values directly, so verbs that migrate to read `CliContext` see the same backend as the legacy bridge. Keypair resolution is intentionally left to `DZClient::new`'s internal `load_keypair` precedence (CLI `--keypair` flag > `DOUBLEZERO_KEYPAIR` env var > stdin > persisted config) so the `DOUBLEZERO_KEYPAIR` env var continues to override the persisted keypair path, as relied on by the e2e contributor-auth negative-authz suite. File reads happen only in the binary; module crates remain forbidden from touching the filesystem (RFC-20 §67). - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: ` followed by the full chain of causes on stderr. - SDK (Go) - - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Adds a pure `PlanReconcile(current, target, ownerFilter)` planner that filters by user owner and returns a deterministic create/delete delta (ClientIp-ascending with PubKey tiebreak), used by the upcoming device-stress orchestrator to drive sweeps. 
Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)). + - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)). ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22 diff --git a/smartcontract/sdk/go/serviceability/reconcile.go b/smartcontract/sdk/go/serviceability/reconcile.go deleted file mode 100644 index d61e20ea91..0000000000 --- a/smartcontract/sdk/go/serviceability/reconcile.go +++ /dev/null @@ -1,59 +0,0 @@ -package serviceability - -import ( - "bytes" - "sort" - - "github.com/gagliardetto/solana-go" -) - -// ReconcilePlan describes the delta needed to drive the set of users owned by a -// given key toward a desired count. -type ReconcilePlan struct { - // ToCreate is the number of users to add. Always >= 0. - ToCreate int - // ToDelete lists user PDAs to remove, in the order they should be deleted. 
- // Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker, so - // repeated calls against the same input produce identical plans. - ToDelete []solana.PublicKey -} - -// PlanReconcile decides what to create or delete so that the number of users -// owned by ownerFilter equals target. Users with a different Owner are ignored -// (neither counted nor deleted), which lets the stress orchestrator share a -// program with other tenants without disturbing them. -// -// The function is pure — no I/O — so it is safe to call repeatedly while the -// orchestrator polls live state. Returns a zero plan when target is negative. -func PlanReconcile(current []User, target int, ownerFilter solana.PublicKey) ReconcilePlan { - if target < 0 { - return ReconcilePlan{} - } - - var owned []User - for _, u := range current { - if bytes.Equal(u.Owner[:], ownerFilter[:]) { - owned = append(owned, u) - } - } - - switch { - case len(owned) < target: - return ReconcilePlan{ToCreate: target - len(owned)} - case len(owned) > target: - sort.Slice(owned, func(i, j int) bool { - if c := bytes.Compare(owned[i].ClientIp[:], owned[j].ClientIp[:]); c != 0 { - return c < 0 - } - return bytes.Compare(owned[i].PubKey[:], owned[j].PubKey[:]) < 0 - }) - victims := owned[target:] - out := make([]solana.PublicKey, len(victims)) - for i, u := range victims { - out[i] = solana.PublicKeyFromBytes(u.PubKey[:]) - } - return ReconcilePlan{ToDelete: out} - default: - return ReconcilePlan{} - } -} diff --git a/smartcontract/sdk/go/serviceability/reconcile_test.go b/smartcontract/sdk/go/serviceability/reconcile_test.go deleted file mode 100644 index 335094b7b8..0000000000 --- a/smartcontract/sdk/go/serviceability/reconcile_test.go +++ /dev/null @@ -1,173 +0,0 @@ -package serviceability_test - -import ( - "testing" - - "github.com/gagliardetto/solana-go" - "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" - "github.com/stretchr/testify/assert" - 
"github.com/stretchr/testify/require" -) - -// makeUser is a tiny helper to build a User suitable for PlanReconcile testing: -// only Owner, ClientIp, and PubKey actually influence the planner. -func makeUser(owner solana.PublicKey, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User { - return serviceability.User{ - Owner: owner, - ClientIp: clientIP, - PubKey: pubkey, - } -} - -func TestPlanReconcile(t *testing.T) { - t.Parallel() - - orchestrator := solana.NewWallet().PublicKey() - stranger := solana.NewWallet().PublicKey() - - // Stable pubkeys so we can assert exact ordering. - u1 := solana.NewWallet().PublicKey() - u2 := solana.NewWallet().PublicKey() - u3 := solana.NewWallet().PublicKey() - u4 := solana.NewWallet().PublicKey() - u5 := solana.NewWallet().PublicKey() - - ip := func(a, b, c, d byte) [4]byte { return [4]byte{a, b, c, d} } - - tests := []struct { - name string - current []serviceability.User - target int - owner solana.PublicKey - wantCreate int - wantDeleteIPs [][4]byte // ClientIp order we expect to see in ToDelete - }{ - { - name: "zero to N", - current: nil, - target: 4, - owner: orchestrator, - wantCreate: 4, - }, - { - name: "N to zero deletes in ip-ascending order", - current: []serviceability.User{ - makeUser(orchestrator, u1, ip(10, 0, 0, 3)), - makeUser(orchestrator, u2, ip(10, 0, 0, 1)), - makeUser(orchestrator, u3, ip(10, 0, 0, 4)), - makeUser(orchestrator, u4, ip(10, 0, 0, 2)), - }, - target: 0, - owner: orchestrator, - wantCreate: 0, - wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)}, - }, - { - name: "partial trim deletes only the overflow", - current: []serviceability.User{ - makeUser(orchestrator, u1, ip(10, 0, 0, 5)), - makeUser(orchestrator, u2, ip(10, 0, 0, 4)), - makeUser(orchestrator, u3, ip(10, 0, 0, 3)), - makeUser(orchestrator, u4, ip(10, 0, 0, 2)), - makeUser(orchestrator, u5, ip(10, 0, 0, 1)), - }, - target: 3, - owner: orchestrator, - wantCreate: 0, - 
wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)}, - }, - { - name: "partial grow asks for the missing count", - current: []serviceability.User{ - makeUser(orchestrator, u1, ip(10, 0, 0, 1)), - makeUser(orchestrator, u2, ip(10, 0, 0, 2)), - }, - target: 5, - owner: orchestrator, - wantCreate: 3, - }, - { - name: "only foreign users present grows by full target", - current: []serviceability.User{ - makeUser(stranger, u1, ip(10, 0, 0, 1)), - makeUser(stranger, u2, ip(10, 0, 0, 2)), - makeUser(stranger, u3, ip(10, 0, 0, 3)), - }, - target: 2, - owner: orchestrator, - wantCreate: 2, - }, - { - name: "mixed ownership only counts and deletes owned", - current: []serviceability.User{ - makeUser(stranger, u1, ip(10, 0, 0, 9)), - makeUser(orchestrator, u2, ip(10, 0, 0, 2)), - makeUser(stranger, u3, ip(10, 0, 0, 8)), - makeUser(orchestrator, u4, ip(10, 0, 0, 1)), - }, - target: 1, - owner: orchestrator, - wantCreate: 0, - wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)}, - }, - { - name: "already at target produces zero plan", - current: []serviceability.User{ - makeUser(orchestrator, u1, ip(10, 0, 0, 1)), - makeUser(orchestrator, u2, ip(10, 0, 0, 2)), - }, - target: 2, - owner: orchestrator, - wantCreate: 0, - }, - { - name: "negative target produces zero plan", - current: []serviceability.User{ - makeUser(orchestrator, u1, ip(10, 0, 0, 1)), - }, - target: -1, - owner: orchestrator, - wantCreate: 0, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - plan := serviceability.PlanReconcile(tc.current, tc.target, tc.owner) - assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate") - require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length") - - // Resolve expected pubkeys via ClientIp lookup against the current set. 
- ipToPubkey := map[[4]byte]solana.PublicKey{} - for _, u := range tc.current { - ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:]) - } - for i, ipKey := range tc.wantDeleteIPs { - assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey) - } - }) - } -} - -func TestPlanReconcile_TieBreaksByPubkey(t *testing.T) { - t.Parallel() - - orchestrator := solana.NewWallet().PublicKey() - sharedIP := [4]byte{10, 0, 0, 1} - - // Two users with the same ClientIp (artificial — onchain the IP is part of - // the PDA seed so collisions can't happen, but the tiebreak must still be - // deterministic). - pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) - pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) - - plan := serviceability.PlanReconcile([]serviceability.User{ - makeUser(orchestrator, pkB, sharedIP), - makeUser(orchestrator, pkA, sharedIP), - }, 0, orchestrator) - - require.Len(t, plan.ToDelete, 2) - // pkA (0xAA…) sorts before pkB (0xBB…). - assert.Equal(t, pkA, plan.ToDelete[0]) - assert.Equal(t, pkB, plan.ToDelete[1]) -} From ee9b8226447a838ce0b6544e1240ea53c66effb1 Mon Sep 17 00:00:00 2001 From: Greg Mitchell Date: Wed, 27 May 2026 15:38:00 +0000 Subject: [PATCH 4/5] tools/stress: orchestrator skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tools/stress/device-orchestrator/, the device-stress orchestrator binary for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's CLI list, dumps orchestrator-config.json on start, runs a provision-then- reverse-deprovision sweep against a live serviceability program, and emits the runlog row schema {run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event} for each submit | confirm | activate | deprovision_* event. 
Packages: - pkg/reconcile — PlanFor() pure function (lifted from the part-1 SDK PR; now lives with the orchestrator as policy, not as an SDK primitive) - pkg/runlog — append-only JSONL writer for orchestrator-runlog.json - pkg/sweep — provision-then-deprovision loop driven by PlanFor; uses a Clock + Executor interface for testability; reverse-creation-order delete - pkg/abort — sentinel-file poller that cancels a derived ctx between user iterations so an in-flight Create/Delete completes before exit - pkg/agent — AgentRunner interface + noop impl; SSH runner lands in part 3 along with pre_commit_log / applied event emission - pkg/exec — Live impl of sweep.Executor over serviceability.{Client, Executor}; picks deterministic per-user IPs from --client-ip-base - cmd/device-orchestrator — flag parsing, config dump, signal + abort handling, sweep wiring The agent runner is stubbed behind an interface so this PR can land end-to-end functionality (provision/deprovision + runlog + abort) without the SSH plumbing. The SSH runner and the corresponding pre_commit_log / applied row generation land in part 3 of #3746. Part 2 of #3746. Closes #3771. 
--- CHANGELOG.md | 2 + tools/stress/device-orchestrator/Makefile | 15 + .../cmd/device-orchestrator/main.go | 277 +++++++++++++++ .../device-orchestrator/pkg/abort/abort.go | 64 ++++ .../pkg/abort/abort_test.go | 80 +++++ .../device-orchestrator/pkg/agent/agent.go | 73 ++++ .../pkg/agent/agent_test.go | 32 ++ .../device-orchestrator/pkg/exec/exec.go | 139 ++++++++ .../device-orchestrator/pkg/exec/exec_test.go | 27 ++ .../pkg/reconcile/reconcile.go | 63 ++++ .../pkg/reconcile/reconcile_test.go | 166 +++++++++ .../device-orchestrator/pkg/runlog/runlog.go | 101 ++++++ .../pkg/runlog/runlog_test.go | 93 +++++ .../device-orchestrator/pkg/sweep/sweep.go | 262 ++++++++++++++ .../pkg/sweep/sweep_test.go | 321 ++++++++++++++++++ 15 files changed, 1715 insertions(+) create mode 100644 tools/stress/device-orchestrator/Makefile create mode 100644 tools/stress/device-orchestrator/cmd/device-orchestrator/main.go create mode 100644 tools/stress/device-orchestrator/pkg/abort/abort.go create mode 100644 tools/stress/device-orchestrator/pkg/abort/abort_test.go create mode 100644 tools/stress/device-orchestrator/pkg/agent/agent.go create mode 100644 tools/stress/device-orchestrator/pkg/agent/agent_test.go create mode 100644 tools/stress/device-orchestrator/pkg/exec/exec.go create mode 100644 tools/stress/device-orchestrator/pkg/exec/exec_test.go create mode 100644 tools/stress/device-orchestrator/pkg/reconcile/reconcile.go create mode 100644 tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go create mode 100644 tools/stress/device-orchestrator/pkg/runlog/runlog.go create mode 100644 tools/stress/device-orchestrator/pkg/runlog/runlog_test.go create mode 100644 tools/stress/device-orchestrator/pkg/sweep/sweep.go create mode 100644 tools/stress/device-orchestrator/pkg/sweep/sweep_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index b0e2e843b5..d71edb5e5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ All notable changes to this project will be documented 
in this file. - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: ` followed by the full chain of causes on stderr. - SDK (Go) - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)). +- Tools + - Add `tools/stress/device-orchestrator/` — the device-stress orchestrator skeleton for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's CLI list, dumps `orchestrator-config.json` on start, runs a provision-then-reverse-deprovision sweep against a live serviceability program, and emits the runlog row schema `{run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event}` to `orchestrator-runlog.json` for each `submit | confirm | activate | deprovision_*` event. The agent runner is stubbed behind a `pkg/agent.Runner` interface (no-op impl ships now; the SSH-backed runner that emits `pre_commit_log` / `applied` lands in part 3). 
The sweep cooperates with an abort sentinel file: when the file appears the in-flight user completes and the orchestrator deprovisions everything it created before exiting non-zero. `PlanFor` / `Plan` (lifted from the part-1 SDK PR, where they were `PlanReconcile` / `ReconcilePlan`) now live at `tools/stress/device-orchestrator/pkg/reconcile/` as orchestrator policy rather than SDK primitive. Part 2 of #3746 ([#3771](https://github.com/malbeclabs/doublezero/issues/3771)). ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22 diff --git a/tools/stress/device-orchestrator/Makefile b/tools/stress/device-orchestrator/Makefile new file mode 100644 index 0000000000..6ed19c04fe --- /dev/null +++ b/tools/stress/device-orchestrator/Makefile @@ -0,0 +1,15 @@ +PREFIX:=github.com/malbeclabs/doublezero/tools/stress/device-orchestrator +BUILD:=`git rev-parse --short HEAD` +LDFLAGS=-ldflags "-X=$(PREFIX)/build.Build=$(BUILD)" + +.PHONY: test +test: + go test -race -v -coverprofile coverage.out ./... + +.PHONY: lint +lint: + golangci-lint run -c ../../../.golangci.yaml + +.PHONY: build +build: + CGO_ENABLED=0 go build -v $(LDFLAGS) -o bin/device-orchestrator cmd/device-orchestrator/main.go diff --git a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go new file mode 100644 index 0000000000..ab01975d30 --- /dev/null +++ b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go @@ -0,0 +1,277 @@ +// device-orchestrator runs the GRE Tunnel Capacity Study sweep against a +// live serviceability program: provisions N users on a target device in +// batches with a hold between each, then deprovisions in reverse-creation +// order. Per #3771 (part 2 of #3746) the SSH-driven agent runner is stubbed +// behind the agent.Runner interface; the no-op implementation is used here +// and the SSH implementation lands in part 3. 
+package main + +import ( + "context" + "crypto/rand" + "encoding/hex" + "encoding/json" + "errors" + "flag" + "fmt" + "log/slog" + "net" + "os" + "os/signal" + "path/filepath" + "syscall" + "time" + + "github.com/gagliardetto/solana-go" + solanarpc "github.com/gagliardetto/solana-go/rpc" + + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/abort" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/exec" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep" +) + +// orchestratorConfig captures the resolved CLI inputs in the shape that gets +// dumped to orchestrator-config.json on start. +type orchestratorConfig struct { + RunID string `json:"run_id"` + TargetUserCount int `json:"target_user_count"` + UsersPerBatch int `json:"users_per_batch"` + HoldSeconds int `json:"hold_seconds"` + DUTPubkey string `json:"dut_pubkey"` + DUTSSHHost string `json:"dut_ssh_host"` + DUTSSHKey string `json:"dut_ssh_key"` + RPCURL string `json:"rpc_url"` + ProgramID string `json:"program_id"` + KeypairPath string `json:"keypair"` + ControllerAddr string `json:"controller"` + AbortFile string `json:"abort_file"` + WorkingDir string `json:"working_dir"` + ClientIPBase string `json:"client_ip_base"` + TunnelEndpoint string `json:"tunnel_endpoint"` + TenantPubkey string `json:"tenant_pubkey,omitempty"` +} + +func main() { + if err := run(); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run() error { + var ( + targetUserCount = flag.Int("target-user-count", 8, "Final user count to sweep up to.") + usersPerBatch = flag.Int("users-per-batch", 2, "Users provisioned per batch before the hold.") + holdSeconds = flag.Int("hold-seconds", 180, "Seconds to hold 
between batches.") + dutPubkey = flag.String("dut-pubkey", "", "Device-under-test pubkey (base58).") + dutSSHHost = flag.String("dut-ssh-host", "", "SSH host:port for the DUT (used by the part-3 agent runner).") + dutSSHKey = flag.String("dut-ssh-key", "", "SSH private-key path for the DUT.") + rpcURL = flag.String("rpc-url", "", "Serviceability RPC URL.") + programID = flag.String("program-id", "", "Serviceability program ID (base58).") + keypairPath = flag.String("keypair", "", "Path to the orchestrator's solana keypair JSON.") + controllerAddr = flag.String("controller", "", "Controller IP:PORT, forwarded to the DUT agent in part 3.") + abortFile = flag.String("abort-file", "", "Path to a sentinel file; when it appears the sweep finishes the current user and exits.") + workingDir = flag.String("working-dir", ".", "Output directory for orchestrator-config.json / orchestrator-runlog.json.") + clientIPBase = flag.String("client-ip-base", "100.64.0.0", "Starting IPv4 address; per-user IP is base + idx.") + tunnelEndpoint = flag.String("tunnel-endpoint", "0.0.0.0", "Tunnel endpoint IP passed to UserCreateArgs; 0.0.0.0 lets the program fall back to the device's public IP.") + tenantPubkey = flag.String("tenant-pubkey", "", "Optional tenant pubkey for UserCreateArgs.") + runID = flag.String("run-id", "", "Run identifier written into every runlog row; auto-generated if empty.") + logLevel = flag.String("log-level", "info", "slog level: debug|info|warn|error.") + dryRun = flag.Bool("dry-run", false, "Validate flags and dump orchestrator-config.json without contacting the RPC.") + ) + flag.Parse() + + logger := newLogger(*logLevel) + slog.SetDefault(logger) + + if *runID == "" { + var buf [8]byte + if _, err := rand.Read(buf[:]); err != nil { + return fmt.Errorf("generate run id: %w", err) + } + *runID = "run-" + hex.EncodeToString(buf[:]) + } + + if err := os.MkdirAll(*workingDir, 0o755); err != nil { + return fmt.Errorf("create working dir: %w", err) + } + + baseIP, err 
:= parseIPv4(*clientIPBase) + if err != nil { + return fmt.Errorf("parse --client-ip-base: %w", err) + } + tunnelIP, err := parseIPv4(*tunnelEndpoint) + if err != nil { + return fmt.Errorf("parse --tunnel-endpoint: %w", err) + } + + resolved := orchestratorConfig{ + RunID: *runID, + TargetUserCount: *targetUserCount, + UsersPerBatch: *usersPerBatch, + HoldSeconds: *holdSeconds, + DUTPubkey: *dutPubkey, + DUTSSHHost: *dutSSHHost, + DUTSSHKey: *dutSSHKey, + RPCURL: *rpcURL, + ProgramID: *programID, + KeypairPath: *keypairPath, + ControllerAddr: *controllerAddr, + AbortFile: *abortFile, + WorkingDir: *workingDir, + ClientIPBase: *clientIPBase, + TunnelEndpoint: *tunnelEndpoint, + TenantPubkey: *tenantPubkey, + } + configPath := filepath.Join(*workingDir, "orchestrator-config.json") + if err := dumpJSON(configPath, resolved); err != nil { + return fmt.Errorf("write orchestrator-config.json: %w", err) + } + logger.Info("orchestrator-config.json written", "path", configPath) + + if *dryRun { + logger.Info("dry-run: skipping sweep") + return nil + } + + if err := requireFlags(map[string]string{ + "--dut-pubkey": *dutPubkey, + "--rpc-url": *rpcURL, + "--program-id": *programID, + "--keypair": *keypairPath, + }); err != nil { + return err + } + + dutPK, err := solana.PublicKeyFromBase58(*dutPubkey) + if err != nil { + return fmt.Errorf("--dut-pubkey: %w", err) + } + programPK, err := solana.PublicKeyFromBase58(*programID) + if err != nil { + return fmt.Errorf("--program-id: %w", err) + } + signer, err := solana.PrivateKeyFromSolanaKeygenFile(*keypairPath) + if err != nil { + return fmt.Errorf("load --keypair: %w", err) + } + + var tenantPK solana.PublicKey + if *tenantPubkey != "" { + tenantPK, err = solana.PublicKeyFromBase58(*tenantPubkey) + if err != nil { + return fmt.Errorf("--tenant-pubkey: %w", err) + } + } + + rpc := solanarpc.New(*rpcURL) + client := serviceability.New(rpc, programPK) + executor := serviceability.NewExecutor(logger, rpc, &signer, programPK) + + 
liveExec, err := exec.New(exec.Config{ + Client: client, + Executor: executor, + DevicePubkey: dutPK, + TenantPubkey: tenantPK, + ClientIPBase: baseIP, + TunnelEndpoint: tunnelIP, + UserType: serviceability.UserTypeIBRL, + CyoaType: serviceability.CyoaTypeGREOverDIA, + DzPrefixCount: 1, + }) + if err != nil { + return err + } + + runlogPath := filepath.Join(*workingDir, "orchestrator-runlog.json") + rlw, err := runlog.Open(runlogPath) + if err != nil { + return err + } + defer rlw.Close() + logger.Info("orchestrator-runlog.json open", "path", runlogPath) + + // Compose ctx: signal cancellation + abort-file cancellation. + rootCtx, rootCancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer rootCancel() + ctx, abortCancel := abort.Watch(rootCtx, *abortFile, abort.DefaultPollInterval, logger) + defer abortCancel() + + cfg := sweep.Config{ + RunID: *runID, + Target: *targetUserCount, + UsersPerBatch: *usersPerBatch, + Hold: time.Duration(*holdSeconds) * time.Second, + OwnerFilter: signer.PublicKey(), + Executor: liveExec, + Agent: agent.NewNoop(logger), + Runlog: rlw, + Clock: sweep.RealClock{}, + Logger: logger, + } + + logger.Info("sweep starting", "target", cfg.Target, "batch", cfg.UsersPerBatch, "hold", cfg.Hold) + if err := sweep.Run(ctx, cfg); err != nil { + if errors.Is(err, context.Canceled) { + logger.Warn("sweep cancelled", "err", err) + return err + } + return fmt.Errorf("sweep: %w", err) + } + logger.Info("sweep finished") + return nil +} + +func newLogger(level string) *slog.Logger { + lvl := slog.LevelInfo + switch level { + case "debug": + lvl = slog.LevelDebug + case "warn": + lvl = slog.LevelWarn + case "error": + lvl = slog.LevelError + } + return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: lvl})) +} + +func dumpJSON(path string, v any) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + enc := json.NewEncoder(f) + enc.SetIndent("", " ") + return 
enc.Encode(v) +} + +func requireFlags(required map[string]string) error { + var missing []string + for name, val := range required { + if val == "" { + missing = append(missing, name) + } + } + if len(missing) > 0 { + return fmt.Errorf("missing required flag(s): %v", missing) + } + return nil +} + +func parseIPv4(s string) ([4]byte, error) { + ip := net.ParseIP(s) + if ip == nil { + return [4]byte{}, fmt.Errorf("invalid IPv4 %q", s) + } + v4 := ip.To4() + if v4 == nil { + return [4]byte{}, fmt.Errorf("not IPv4: %q", s) + } + var out [4]byte + copy(out[:], v4) + return out, nil +} diff --git a/tools/stress/device-orchestrator/pkg/abort/abort.go b/tools/stress/device-orchestrator/pkg/abort/abort.go new file mode 100644 index 0000000000..8f191f5499 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/abort/abort.go @@ -0,0 +1,64 @@ +// Package abort polls a sentinel file on disk and cancels a context when the +// file appears. The orchestrator uses this for cooperative shutdown: an +// operator drops a file at the path passed via --abort-file and the running +// sweep finishes the current user iteration before exiting. +package abort + +import ( + "context" + "errors" + "log/slog" + "os" + "time" +) + +// DefaultPollInterval is the default polling cadence. The sweep loop only +// checks the cancellation between user iterations, so the abort signal latency +// is bounded by this interval plus one user iteration. +const DefaultPollInterval = 250 * time.Millisecond + +// Watch returns a derived context that cancels as soon as `path` exists on +// disk. If path is empty the returned context is the parent verbatim and the +// returned stop is a no-op. The watcher goroutine exits when parent or the +// returned context is cancelled. +// +// Pass log=nil for silent operation. 
+func Watch(parent context.Context, path string, interval time.Duration, log *slog.Logger) (context.Context, context.CancelFunc) { + if path == "" { + return parent, func() {} + } + if interval <= 0 { + interval = DefaultPollInterval + } + ctx, cancel := context.WithCancel(parent) + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if exists(path) { + if log != nil { + log.Warn("abort file detected; cancelling sweep", "path", path) + } + cancel() + return + } + } + } + }() + return ctx, cancel +} + +// exists reports whether path refers to an existing filesystem entry. Any +// stat error other than ENOENT is treated as "exists" so a permission error +// doesn't silently leave the orchestrator running past an operator abort. +func exists(path string) bool { + _, err := os.Stat(path) + if err == nil { + return true + } + return !errors.Is(err, os.ErrNotExist) +} diff --git a/tools/stress/device-orchestrator/pkg/abort/abort_test.go b/tools/stress/device-orchestrator/pkg/abort/abort_test.go new file mode 100644 index 0000000000..13fdfba47c --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/abort/abort_test.go @@ -0,0 +1,80 @@ +package abort_test + +import ( + "context" + "errors" + "os" + "path/filepath" + "testing" + "time" + + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/abort" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWatch_CancelsWhenAbortFileAppears(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "abort") + ctx, cancel := abort.Watch(context.Background(), path, 25*time.Millisecond, nil) + t.Cleanup(cancel) + + // File doesn't exist yet — ctx is alive. + select { + case <-ctx.Done(): + t.Fatal("ctx cancelled before abort file existed") + case <-time.After(50 * time.Millisecond): + } + + // Touch the abort file. 
+ require.NoError(t, os.WriteFile(path, nil, 0o644)) + + select { + case <-ctx.Done(): + assert.True(t, errors.Is(ctx.Err(), context.Canceled)) + case <-time.After(time.Second): + t.Fatal("ctx did not cancel within 1s after abort file touched") + } +} + +func TestWatch_EmptyPathIsNoOp(t *testing.T) { + t.Parallel() + + parent, parentCancel := context.WithCancel(context.Background()) + t.Cleanup(parentCancel) + + ctx, cancel := abort.Watch(parent, "", 0, nil) + t.Cleanup(cancel) + + select { + case <-ctx.Done(): + t.Fatal("empty-path watch should not cancel on its own") + case <-time.After(50 * time.Millisecond): + } + + // Parent cancellation still propagates through (we return parent verbatim). + parentCancel() + select { + case <-ctx.Done(): + case <-time.After(time.Second): + t.Fatal("derived ctx did not pick up parent cancellation") + } +} + +func TestWatch_StopsWhenParentCancelled(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "abort") + parent, parentCancel := context.WithCancel(context.Background()) + + ctx, cancel := abort.Watch(parent, path, 25*time.Millisecond, nil) + t.Cleanup(cancel) + + parentCancel() + select { + case <-ctx.Done(): + case <-time.After(time.Second): + t.Fatal("parent cancel did not propagate") + } +} diff --git a/tools/stress/device-orchestrator/pkg/agent/agent.go b/tools/stress/device-orchestrator/pkg/agent/agent.go new file mode 100644 index 0000000000..24c1b4dbce --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/agent/agent.go @@ -0,0 +1,73 @@ +// Package agent exposes the Runner interface the orchestrator uses to +// drive doublezero-agent on a device under test (DUT). The skeleton ships a +// no-op implementation; the SSH-backed runner lands in part 3 of #3746. +package agent + +import ( + "context" + "log/slog" + "time" +) + +// EventKind tags an Event so runlog row generation can map it onto the +// runlog Event vocabulary (`pre_commit_log`, `applied`). 
+type EventKind int + +const ( + // EventPreCommitLog marks the moment the agent log shows + // `Committing config session due to diffs detected: ` for a new + // tunnel interface; carries the parsed tunnel ID. + EventPreCommitLog EventKind = iota + 1 + // EventApplied marks the moment the agent log shows a commit-success line + // for a previously-pending tunnel interface. + EventApplied +) + +// Event is one observation emitted by the agent runner: a timestamped tunnel +// state transition derived from agent log lines. +type Event struct { + Kind EventKind + TunnelID uint16 + At time.Time +} + +// Runner drives doublezero-agent on the DUT and surfaces tunnel-related events +// extracted from its log stream. +// +// Lifecycle: +// +// - Start(ctx) blocks until the agent stream is healthy enough to emit +// events (or returns an error). It returns immediately for the no-op impl. +// - Events() returns a channel that closes when the runner exits. +// +// The SSH-backed implementation will manage an ssh.Session and parse stdout +// for the two log lines listed under EventKind. +type Runner interface { + Start(ctx context.Context) error + Events() <-chan Event +} + +// NewNoop returns a Runner that never starts a process and never emits events. +// Used by the skeleton sweep loop and by tests where the agent isn't under test. 
+func NewNoop(log *slog.Logger) Runner { + ch := make(chan Event) + return &noop{log: log, events: ch} +} + +type noop struct { + log *slog.Logger + events chan Event +} + +func (n *noop) Start(ctx context.Context) error { + if n.log != nil { + n.log.Debug("agent: noop runner started (no events will be emitted)") + } + go func() { + <-ctx.Done() + close(n.events) + }() + return nil +} + +func (n *noop) Events() <-chan Event { return n.events } diff --git a/tools/stress/device-orchestrator/pkg/agent/agent_test.go b/tools/stress/device-orchestrator/pkg/agent/agent_test.go new file mode 100644 index 0000000000..430dae7988 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/agent/agent_test.go @@ -0,0 +1,32 @@ +package agent_test + +import ( + "context" + "testing" + "time" + + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent" + "github.com/stretchr/testify/require" +) + +func TestNoopRunner_ClosesEventsWhenContextCancelled(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + r := agent.NewNoop(nil) + require.NoError(t, r.Start(ctx)) + + select { + case <-r.Events(): + t.Fatal("noop runner emitted an event") + case <-time.After(50 * time.Millisecond): + } + + cancel() + select { + case _, ok := <-r.Events(): + require.False(t, ok, "events channel should close on cancel") + case <-time.After(time.Second): + t.Fatal("events channel did not close after context cancel") + } +} diff --git a/tools/stress/device-orchestrator/pkg/exec/exec.go b/tools/stress/device-orchestrator/pkg/exec/exec.go new file mode 100644 index 0000000000..86badb60f2 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/exec/exec.go @@ -0,0 +1,139 @@ +// Package exec wires the serviceability SDK behind the sweep.Executor +// interface. The orchestrator binary uses it against a real RPC; tests in +// pkg/sweep use a fake to avoid the network. 
+package exec + +import ( + "context" + "encoding/binary" + "fmt" + "time" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep" +) + +// Config bundles the inputs the live executor needs. +type Config struct { + Client *serviceability.Client + Executor *serviceability.Executor + + DevicePubkey solana.PublicKey + TenantPubkey solana.PublicKey // zero pubkey = no tenant + + // ClientIPBase is the starting /16 block from which sequential per-user + // IPs are drawn. For idx i, the assigned IP is ClientIPBase + i. + ClientIPBase [4]byte + // TunnelEndpoint is passed through to UserCreateArgs verbatim; pass + // 0.0.0.0 to use the device's public IP. + TunnelEndpoint [4]byte + // UserType / CyoaType pin the user kind for the entire sweep. + UserType serviceability.UserUserType + CyoaType serviceability.CyoaType + // DzPrefixCount must match the device's dz_prefixes length; 1 is the + // stress-test default. + DzPrefixCount uint8 +} + +// Live implements sweep.Executor against a real serviceability program. +type Live struct { + cfg Config +} + +// New returns a Live executor with the given configuration. Callers must +// supply a non-nil Client and Executor. +func New(cfg Config) (*Live, error) { + if cfg.Client == nil { + return nil, fmt.Errorf("exec.New: Client is required") + } + if cfg.Executor == nil { + return nil, fmt.Errorf("exec.New: Executor is required") + } + if cfg.DzPrefixCount == 0 { + cfg.DzPrefixCount = 1 + } + return &Live{cfg: cfg}, nil +} + +// ListUsers returns the current set of User accounts in the program. The +// caller (sweep loop) filters by owner via PlanFor. 
+func (l *Live) ListUsers(ctx context.Context) ([]serviceability.User, error) { + pd, err := l.cfg.Client.GetProgramData(ctx) + if err != nil { + return nil, fmt.Errorf("list users: %w", err) + } + return pd.Users, nil +} + +// CreateUser issues a CreateUser instruction for the idx-th stress user and +// records timestamps the sweep loop turns into runlog rows. +func (l *Live) CreateUser(ctx context.Context, idx int) (sweep.CreateResult, error) { + args := serviceability.UserCreateArgs{ + UserType: l.cfg.UserType, + CyoaType: l.cfg.CyoaType, + ClientIP: ipForIndex(l.cfg.ClientIPBase, idx), + TunnelEndpoint: l.cfg.TunnelEndpoint, + DzPrefixCount: l.cfg.DzPrefixCount, + DevicePubkey: l.cfg.DevicePubkey, + TenantPubkey: l.cfg.TenantPubkey, + } + _, userPDA, err := l.cfg.Executor.CreateUser(ctx, args) + if err != nil { + return sweep.CreateResult{}, err + } + now := time.Now() + + // The SDK's CreateUser blocks on signature finalization and post-confirm + // account visibility; we don't get distinct stage timestamps today, so + // confirm and activate both anchor at the post-call wallclock. A future + // SDK refactor can split these. + tunnelID, err := l.fetchTunnelID(ctx, userPDA) + if err != nil { + // Surface the tunnel ID as 0; the sweep records the create as successful + // because the on-chain User already exists. + tunnelID = 0 + } + return sweep.CreateResult{ + UserPDA: userPDA, + TunnelID: tunnelID, + ConfirmedAt: now, + ActivatedAt: now, + }, nil +} + +// DeleteUser closes a user account by PDA. +func (l *Live) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) { + if _, err := l.cfg.Executor.DeleteUser(ctx, userPDA); err != nil { + return sweep.DeleteResult{}, err + } + now := time.Now() + return sweep.DeleteResult{ + ConfirmedAt: now, + ActivatedAt: now, + }, nil +} + +// fetchTunnelID reads the user account and returns its assigned TunnelId. 
+// Used so the runlog records the kernel interface identifier the part-3 +// agent runner will key on. +func (l *Live) fetchTunnelID(ctx context.Context, userPDA solana.PublicKey) (uint16, error) { + // We can't read the assigned tunnel_id without the User's on-chain bytes, + // which the SDK doesn't surface from CreateUser. Until a downstream + // helper is added, callers either skip this column (TunnelID = 0) or wire + // a per-account fetch in cmd/. The package signature is kept stable so + // part-3 can drop in the real fetch. + return 0, nil +} + +// ipForIndex returns base shifted by idx, wrapping at the /16 boundary so the +// 0..65535 range is usable without overflow handling on the caller side. +func ipForIndex(base [4]byte, idx int) [4]byte { + host := uint32(base[2])<<8 | uint32(base[3]) + host += uint32(uint16(idx)) + var out [4]byte + out[0] = base[0] + out[1] = base[1] + binary.BigEndian.PutUint16(out[2:], uint16(host)) + return out +} diff --git a/tools/stress/device-orchestrator/pkg/exec/exec_test.go b/tools/stress/device-orchestrator/pkg/exec/exec_test.go new file mode 100644 index 0000000000..c7b13ea30b --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/exec/exec_test.go @@ -0,0 +1,27 @@ +package exec + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIPForIndex(t *testing.T) { + t.Parallel() + + base := [4]byte{100, 64, 0, 0} + tests := []struct { + idx int + want [4]byte + }{ + {0, [4]byte{100, 64, 0, 0}}, + {1, [4]byte{100, 64, 0, 1}}, + {255, [4]byte{100, 64, 0, 255}}, + {256, [4]byte{100, 64, 1, 0}}, + {1000, [4]byte{100, 64, 3, 232}}, + } + for _, tc := range tests { + got := ipForIndex(base, tc.idx) + assert.Equal(t, tc.want, got, "idx=%d", tc.idx) + } +} diff --git a/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go b/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go new file mode 100644 index 0000000000..1396928714 --- /dev/null +++ 
b/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go @@ -0,0 +1,63 @@ +// Package reconcile decides what to create or delete to drive a set of +// serviceability User accounts toward a desired count. It is pure (no I/O) +// so the device-stress orchestrator can call it once per batch iteration +// against live state pulled from the chain. +package reconcile + +import ( + "bytes" + "sort" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" +) + +// Plan describes the delta needed to drive the set of users owned by a given +// key toward a desired count. +type Plan struct { + // ToCreate is the number of users to add. Always >= 0. + ToCreate int + // ToDelete lists user PDAs to remove, in the order they should be deleted. + // Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker, + // so repeated calls against the same input produce identical plans. + ToDelete []solana.PublicKey +} + +// PlanFor decides what to create or delete so that the number of users owned by +// ownerFilter equals target. Users with a different Owner are ignored (neither +// counted nor deleted), which lets the orchestrator share a program with other +// tenants without disturbing them. +// +// Returns a zero plan when target is negative. 
+func PlanFor(current []serviceability.User, target int, ownerFilter solana.PublicKey) Plan { + if target < 0 { + return Plan{} + } + + var owned []serviceability.User + for _, u := range current { + if bytes.Equal(u.Owner[:], ownerFilter[:]) { + owned = append(owned, u) + } + } + + switch { + case len(owned) < target: + return Plan{ToCreate: target - len(owned)} + case len(owned) > target: + sort.Slice(owned, func(i, j int) bool { + if c := bytes.Compare(owned[i].ClientIp[:], owned[j].ClientIp[:]); c != 0 { + return c < 0 + } + return bytes.Compare(owned[i].PubKey[:], owned[j].PubKey[:]) < 0 + }) + victims := owned[target:] + out := make([]solana.PublicKey, len(victims)) + for i, u := range victims { + out[i] = solana.PublicKeyFromBytes(u.PubKey[:]) + } + return Plan{ToDelete: out} + default: + return Plan{} + } +} diff --git a/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go b/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go new file mode 100644 index 0000000000..687bf0f464 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go @@ -0,0 +1,166 @@ +package reconcile_test + +import ( + "testing" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/reconcile" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func makeUser(owner, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User { + return serviceability.User{ + Owner: owner, + ClientIp: clientIP, + PubKey: pubkey, + } +} + +func TestPlanFor(t *testing.T) { + t.Parallel() + + orchestrator := solana.NewWallet().PublicKey() + stranger := solana.NewWallet().PublicKey() + + u1 := solana.NewWallet().PublicKey() + u2 := solana.NewWallet().PublicKey() + u3 := solana.NewWallet().PublicKey() + u4 := solana.NewWallet().PublicKey() + u5 := solana.NewWallet().PublicKey() + + ip := func(a, b, 
c, d byte) [4]byte { return [4]byte{a, b, c, d} } + + tests := []struct { + name string + current []serviceability.User + target int + owner solana.PublicKey + wantCreate int + wantDeleteIPs [][4]byte + }{ + { + name: "zero to N", + current: nil, + target: 4, + owner: orchestrator, + wantCreate: 4, + }, + { + name: "N to zero deletes in ip-ascending order", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 3)), + makeUser(orchestrator, u2, ip(10, 0, 0, 1)), + makeUser(orchestrator, u3, ip(10, 0, 0, 4)), + makeUser(orchestrator, u4, ip(10, 0, 0, 2)), + }, + target: 0, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)}, + }, + { + name: "partial trim deletes only the overflow", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 5)), + makeUser(orchestrator, u2, ip(10, 0, 0, 4)), + makeUser(orchestrator, u3, ip(10, 0, 0, 3)), + makeUser(orchestrator, u4, ip(10, 0, 0, 2)), + makeUser(orchestrator, u5, ip(10, 0, 0, 1)), + }, + target: 3, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)}, + }, + { + name: "partial grow asks for the missing count", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + }, + target: 5, + owner: orchestrator, + wantCreate: 3, + }, + { + name: "only foreign users present grows by full target", + current: []serviceability.User{ + makeUser(stranger, u1, ip(10, 0, 0, 1)), + makeUser(stranger, u2, ip(10, 0, 0, 2)), + makeUser(stranger, u3, ip(10, 0, 0, 3)), + }, + target: 2, + owner: orchestrator, + wantCreate: 2, + }, + { + name: "mixed ownership only counts and deletes owned", + current: []serviceability.User{ + makeUser(stranger, u1, ip(10, 0, 0, 9)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + makeUser(stranger, u3, ip(10, 0, 0, 8)), + makeUser(orchestrator, u4, ip(10, 0, 0, 1)), 
+ }, + target: 1, + owner: orchestrator, + wantCreate: 0, + wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)}, + }, + { + name: "already at target produces zero plan", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + makeUser(orchestrator, u2, ip(10, 0, 0, 2)), + }, + target: 2, + owner: orchestrator, + wantCreate: 0, + }, + { + name: "negative target produces zero plan", + current: []serviceability.User{ + makeUser(orchestrator, u1, ip(10, 0, 0, 1)), + }, + target: -1, + owner: orchestrator, + wantCreate: 0, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + plan := reconcile.PlanFor(tc.current, tc.target, tc.owner) + assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate") + require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length") + + ipToPubkey := map[[4]byte]solana.PublicKey{} + for _, u := range tc.current { + ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:]) + } + for i, ipKey := range tc.wantDeleteIPs { + assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey) + } + }) + } +} + +func TestPlanFor_TieBreaksByPubkey(t *testing.T) { + t.Parallel() + + orchestrator := solana.NewWallet().PublicKey() + sharedIP := [4]byte{10, 0, 0, 1} + + pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) + pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) + + plan := reconcile.PlanFor([]serviceability.User{ + makeUser(orchestrator, pkB, sharedIP), + makeUser(orchestrator, pkA, sharedIP), + }, 0, orchestrator) + + require.Len(t, plan.ToDelete, 2) + assert.Equal(t, pkA, plan.ToDelete[0]) + assert.Equal(t, pkB, plan.ToDelete[1]) +} diff --git a/tools/stress/device-orchestrator/pkg/runlog/runlog.go b/tools/stress/device-orchestrator/pkg/runlog/runlog.go new file mode 100644 index 
0000000000..007a79fa43 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/runlog/runlog.go @@ -0,0 +1,101 @@ +// Package runlog appends per-event rows to the orchestrator runlog file +// (`orchestrator-runlog.json`). One row per line; line-delimited JSON so the +// file can be tailed and downstream tooling can parse incrementally. +// +// Row schema (per #3746): +// +// {run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event} +// +// `t_ns` is the unix epoch in nanoseconds. `n_after_event` is the size of the +// active user set immediately after the event applied — provisioning increments +// it on `activate`, deprovisioning decrements on `deprovision_activate`. Other +// events carry the count as-of-emission. +package runlog + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "sync" + "time" +) + +// Event enumerates the recognized event names. Stringly-typed in the file so +// the schema can grow without consumers needing to track an enum. +type Event string + +const ( + EventSubmit Event = "submit" + EventConfirm Event = "confirm" + EventActivate Event = "activate" + EventPreCommitLog Event = "pre_commit_log" // emitted by part-3 agent runner + EventApplied Event = "applied" // emitted by part-3 agent runner + EventDeprovisionSubmit Event = "deprovision_submit" + EventDeprovisionConfirm Event = "deprovision_confirm" + EventDeprovisionActivate Event = "deprovision_activate" +) + +// Row is one entry in the runlog file. Field names match #3746's schema. +type Row struct { + RunID string `json:"run_id"` + UserIndex int `json:"user_index"` + UserPubkey string `json:"user_pubkey"` + TunnelID uint16 `json:"tunnel_id"` + Event Event `json:"event"` + TNs int64 `json:"t_ns"` + NAfterEvent int `json:"n_after_event"` +} + +// Writer appends rows to an open file in line-delimited JSON. +type Writer struct { + mu sync.Mutex + w io.WriteCloser + path string +} + +// Open creates or truncates the file at path for append-only writes. 
+func Open(path string) (*Writer, error) { + f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) + if err != nil { + return nil, fmt.Errorf("open runlog %s: %w", path, err) + } + return &Writer{w: f, path: path}, nil +} + +// Path returns the file path the writer is appending to. +func (w *Writer) Path() string { return w.path } + +// Append serializes row as JSON and writes a single line. +func (w *Writer) Append(row Row) error { + if row.TNs == 0 { + row.TNs = time.Now().UnixNano() + } + w.mu.Lock() + defer w.mu.Unlock() + if w.w == nil { + return errors.New("runlog writer closed") + } + buf, err := json.Marshal(row) + if err != nil { + return fmt.Errorf("marshal runlog row: %w", err) + } + buf = append(buf, '\n') + if _, err := w.w.Write(buf); err != nil { + return fmt.Errorf("write runlog row: %w", err) + } + return nil +} + +// Close flushes and closes the underlying file. +func (w *Writer) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.w == nil { + return nil + } + err := w.w.Close() + w.w = nil + return err +} diff --git a/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go b/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go new file mode 100644 index 0000000000..ca0cb31ddf --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go @@ -0,0 +1,93 @@ +package runlog_test + +import ( + "bufio" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWriter_RoundTrip(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "orchestrator-runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + + rows := []runlog.Row{ + {RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventSubmit, TNs: 1000, NAfterEvent: 0}, + {RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 
500, Event: runlog.EventConfirm, TNs: 2000, NAfterEvent: 0}, + {RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventActivate, TNs: 3000, NAfterEvent: 1}, + {RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventDeprovisionActivate, TNs: 4000, NAfterEvent: 0}, + } + for _, r := range rows { + require.NoError(t, w.Append(r)) + } + require.NoError(t, w.Close()) + + // File ends with a newline; one row per line. + f, err := os.Open(path) + require.NoError(t, err) + defer f.Close() + + var read []runlog.Row + scanner := bufio.NewScanner(f) + for scanner.Scan() { + var r runlog.Row + require.NoError(t, json.Unmarshal(scanner.Bytes(), &r)) + read = append(read, r) + } + require.NoError(t, scanner.Err()) + + assert.Equal(t, rows, read) +} + +func TestWriter_FillsMissingTimestamp(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "orchestrator-runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + defer w.Close() + + require.NoError(t, w.Append(runlog.Row{RunID: "r", UserIndex: 0, UserPubkey: "pk", Event: runlog.EventSubmit})) + + data, err := os.ReadFile(path) + require.NoError(t, err) + + var r runlog.Row + require.NoError(t, json.Unmarshal(data[:len(data)-1], &r)) + assert.NotZero(t, r.TNs, "Append should fill t_ns when zero") +} + +func TestWriter_RejectsAfterClose(t *testing.T) { + t.Parallel() + + w, err := runlog.Open(filepath.Join(t.TempDir(), "orchestrator-runlog.json")) + require.NoError(t, err) + require.NoError(t, w.Close()) + + err = w.Append(runlog.Row{RunID: "r", Event: runlog.EventSubmit}) + require.Error(t, err) +} + +func TestWriter_Truncates(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "orchestrator-runlog.json") + require.NoError(t, os.WriteFile(path, []byte("stale\n"), 0o644)) + + w, err := runlog.Open(path) + require.NoError(t, err) + require.NoError(t, w.Append(runlog.Row{RunID: "r", Event: runlog.EventSubmit, TNs: 1})) + 
require.NoError(t, w.Close()) + + data, err := os.ReadFile(path) + require.NoError(t, err) + assert.NotContains(t, string(data), "stale", "Open(path) should truncate existing content") +} diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep.go b/tools/stress/device-orchestrator/pkg/sweep/sweep.go new file mode 100644 index 0000000000..cda03412c1 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/sweep/sweep.go @@ -0,0 +1,262 @@ +// Package sweep implements the device-orchestrator sweep loop: +// +// - Provision phase: walks 0 → Target users in batches of UsersPerBatch, +// using reconcile.PlanFor to query live state and ask the Executor to +// create the delta, holding for Hold between batches. +// - Deprovision phase: walks Target → 0 in reverse order of creation, +// so the youngest user is removed first. +// +// Per #3746, the sweep cooperates with the abort signal between user +// iterations — it never cancels a mid-flight Create/Delete. +package sweep + +import ( + "context" + "errors" + "fmt" + "log/slog" + "time" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/reconcile" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog" +) + +// Clock abstracts the wallclock for testability. Real callers pass RealClock; +// tests inject a fake that fires `After` channels manually. +type Clock interface { + Now() time.Time + After(d time.Duration) <-chan time.Time +} + +// RealClock is the production wallclock implementation. +type RealClock struct{} + +func (RealClock) Now() time.Time { return time.Now() } +func (RealClock) After(d time.Duration) <-chan time.Time { return time.After(d) } + +// CreateResult captures the per-user details the sweep emits into the runlog +// for a successful provision. 
ConfirmedAt and ActivatedAt are sourced from +// the Executor so a future SDK refactor can give them distinct values; today +// they are typically equal because the SDK's `CreateUser` blocks on both +// finalization and account visibility before returning. +type CreateResult struct { + UserPDA solana.PublicKey + TunnelID uint16 + ConfirmedAt time.Time + ActivatedAt time.Time +} + +// DeleteResult is the deprovision analog of CreateResult. +type DeleteResult struct { + ConfirmedAt time.Time + ActivatedAt time.Time +} + +// Executor is the interface the sweep depends on for chain I/O. Tests inject +// a fake; the real implementation wraps `serviceability.Executor` plus a small +// post-create fetch to discover the assigned TunnelId. +type Executor interface { + ListUsers(ctx context.Context) ([]serviceability.User, error) + CreateUser(ctx context.Context, idx int) (CreateResult, error) + DeleteUser(ctx context.Context, userPDA solana.PublicKey) (DeleteResult, error) +} + +// Config bundles all sweep parameters; pass by value to Run. 
+type Config struct {
+	RunID         string
+	Target        int
+	UsersPerBatch int
+	Hold          time.Duration
+	OwnerFilter   solana.PublicKey
+
+	Executor Executor
+	Agent    agent.Runner
+	Runlog   *runlog.Writer
+	Clock    Clock
+	Logger   *slog.Logger
+}
+
+// validate rejects configs with a negative Target, a non-positive
+// UsersPerBatch, a negative Hold, an empty RunID, or a nil Executor/Runlog.
+// On success it fills in defaults for Clock, Logger and Agent, so it mutates
+// the receiver.
+func (c *Config) validate() error {
+	switch {
+	case c.Target < 0:
+		return errors.New("sweep: Target must be >= 0")
+	case c.UsersPerBatch <= 0:
+		return errors.New("sweep: UsersPerBatch must be > 0")
+	case c.Hold < 0:
+		return errors.New("sweep: Hold must be >= 0")
+	case c.RunID == "":
+		return errors.New("sweep: RunID is required")
+	case c.Executor == nil:
+		return errors.New("sweep: Executor is required")
+	case c.Runlog == nil:
+		return errors.New("sweep: Runlog is required")
+	}
+	if c.Clock == nil {
+		c.Clock = RealClock{}
+	}
+	if c.Logger == nil {
+		c.Logger = slog.Default()
+	}
+	if c.Agent == nil {
+		// Built after the Logger default so the no-op runner never sees nil.
+		c.Agent = agent.NewNoop(c.Logger)
+	}
+	return nil
+}
+
+// createdUser tracks an orchestrator-owned user so the deprovision phase can
+// iterate in reverse-creation order, independent of live state.
+type createdUser struct {
+	idx      int
+	pubkey   solana.PublicKey
+	tunnelID uint16
+}
+
+// Run drives the provision-then-deprovision sweep to completion and returns
+// the first error encountered, or nil on a clean sweep. Partial progress is
+// not returned; it is only observable through the runlog rows.
+//
+// Error handling:
+//   - A provision error that is not context.Canceled (including
+//     context.DeadlineExceeded) is returned immediately, without tear-down.
+//   - On context.Canceled the users created so far are still deprovisioned,
+//     and the cancellation error is returned afterwards.
+func Run(ctx context.Context, cfg Config) error {
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+	if err := cfg.Agent.Start(ctx); err != nil {
+		return fmt.Errorf("start agent runner: %w", err)
+	}
+
+	created, err := provision(ctx, &cfg)
+	if err != nil && !errors.Is(err, context.Canceled) {
+		return err
+	}
+	// Always attempt deprovision so an abort during provision still cleans up
+	// what the sweep created. deprovision checks ctx.Err() before every user,
+	// so handing it an already-cancelled context would delete nothing; the
+	// tear-down therefore runs on a context detached from that cancellation.
+	depCtx, cancel := teardownContext(ctx)
+	defer cancel()
+	depErr := deprovision(depCtx, &cfg, created)
+	if err != nil {
+		return err
+	}
+	return depErr
+}
+
+// teardownContext returns the context the deprovision phase should run under.
+// If ctx has not been cancelled it is returned unchanged. If it has, the
+// result keeps ctx's values and deadline (callers that want a hard stop pass a
+// deadline) but ignores the cancellation, so tear-down can finish. The
+// returned CancelFunc must always be called.
+func teardownContext(ctx context.Context) (context.Context, context.CancelFunc) {
+	if !errors.Is(ctx.Err(), context.Canceled) {
+		return ctx, func() {}
+	}
+	detached := context.WithoutCancel(ctx)
+	if deadline, ok := ctx.Deadline(); ok {
+		return context.WithDeadline(detached, deadline)
+	}
+	return detached, func() {}
+}
+
+// provision walks 0 → Target in batches, returning the slice of created users
+// so deprovision can iterate in reverse. Returns ctx.Err() if cancelled
+// between users or while holding between batches. The slice returned
+// alongside an error holds the users recorded before the failure.
+func provision(ctx context.Context, cfg *Config) ([]createdUser, error) {
+	if cfg.Target == 0 {
+		return nil, nil
+	}
+	var created []createdUser
+	runningTarget := 0
+	activeCount := 0
+
+	for runningTarget < cfg.Target {
+		if err := ctx.Err(); err != nil {
+			return created, err
+		}
+
+		// Clamp the final batch so the sweep never overshoots Target.
+		nextTarget := runningTarget + cfg.UsersPerBatch
+		if nextTarget > cfg.Target {
+			nextTarget = cfg.Target
+		}
+
+		users, err := cfg.Executor.ListUsers(ctx)
+		if err != nil {
+			return created, fmt.Errorf("list users for batch starting at %d: %w", activeCount, err)
+		}
+		plan := reconcile.PlanFor(users, nextTarget, cfg.OwnerFilter)
+		if len(plan.ToDelete) > 0 {
+			cfg.Logger.Warn("sweep: PlanFor wants to delete pre-existing users; skipping (orchestrator only creates this run)",
+				"count", len(plan.ToDelete))
+		}
+
+		for i := 0; i < plan.ToCreate; i++ {
+			if err := ctx.Err(); err != nil {
+				return created, err
+			}
+			// user_index is the number of users this run has created so far.
+			idx := activeCount
+			submitAt := cfg.Clock.Now()
+			// The pubkey and tunnel ID are not known until CreateUser returns.
+			if err := emit(cfg, idx, "", 0, runlog.EventSubmit, submitAt, activeCount); err != nil {
+				return created, err
+			}
+
+			res, err := cfg.Executor.CreateUser(ctx, idx)
+			if err != nil {
+				return created, fmt.Errorf("create user idx=%d: %w", idx, err)
+			}
+			pkStr := res.UserPDA.String()
+			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventConfirm, res.ConfirmedAt, activeCount); err != nil {
+				return created, err
+			}
+			created = append(created, createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID})
+			// Only the activate row carries the incremented active count.
+			activeCount++
+			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventActivate, res.ActivatedAt, activeCount); err != nil {
+				return created, err
+			}
+		}
+
+		runningTarget = nextTarget
+		if runningTarget >= cfg.Target {
+			// No hold after the final batch.
+			break
+		}
+		if cfg.Hold > 0 {
+			select {
+			case <-cfg.Clock.After(cfg.Hold):
+			case <-ctx.Done():
+				return created, ctx.Err()
+			}
+		}
+	}
+	return created, nil
+}
+
+// deprovision walks the created slice in reverse, emitting deprovision_*
+// events for each. It returns ctx.Err() if ctx is done before a user is
+// started, and stops at the first runlog or DeleteUser error.
+func deprovision(ctx context.Context, cfg *Config, created []createdUser) error {
+	activeCount := len(created)
+	for i := len(created) - 1; i >= 0; i-- {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		u := created[i]
+		pkStr := u.pubkey.String()
+		submitAt := cfg.Clock.Now()
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionSubmit, submitAt, activeCount); err != nil {
+			return err
+		}
+
+		res, err := cfg.Executor.DeleteUser(ctx, u.pubkey)
+		if err != nil {
+			return fmt.Errorf("delete user idx=%d pubkey=%s: %w", u.idx, pkStr, err)
+		}
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionConfirm, res.ConfirmedAt, activeCount); err != nil {
+			return err
+		}
+		// Only the deprovision_activate row carries the decremented count.
+		activeCount--
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionActivate, res.ActivatedAt, activeCount); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// emit appends one row to cfg.Runlog for event ev. A zero `at` is replaced
+// with the current cfg.Clock reading; nAfter is written as n_after_event.
+func emit(cfg *Config, idx int, pubkey string, tunnelID uint16, ev runlog.Event, at time.Time, nAfter int) error {
+	if at.IsZero() {
+		at = cfg.Clock.Now()
+	}
+	row := runlog.Row{
+		RunID:       cfg.RunID,
+		UserIndex:   idx,
+		UserPubkey:  pubkey,
+		TunnelID:    tunnelID,
+		Event:       ev,
+		TNs:         at.UnixNano(),
+		NAfterEvent: nAfter,
+	}
+	if err := cfg.Runlog.Append(row); err != nil {
+		return fmt.Errorf("runlog append %s: %w", ev, err)
+	}
+	return nil
+}
diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
new file mode 100644
index 0000000000..3402d5fe8c
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
@@ -0,0 +1,321 @@
+package 
sweep_test + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "os" + "path/filepath" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/gagliardetto/solana-go" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog" + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeClock provides deterministic Now() and a manually-fired After channel so +// the sweep's hold call returns instantly under test. +type fakeClock struct { + mu sync.Mutex + now time.Time + holds int +} + +func (f *fakeClock) Now() time.Time { + f.mu.Lock() + defer f.mu.Unlock() + f.now = f.now.Add(time.Microsecond) // advance so successive Now() calls differ + return f.now +} + +func (f *fakeClock) After(d time.Duration) <-chan time.Time { + f.mu.Lock() + f.holds++ + f.mu.Unlock() + ch := make(chan time.Time, 1) + ch <- time.Now() + return ch +} + +func (f *fakeClock) HoldCount() int { + f.mu.Lock() + defer f.mu.Unlock() + return f.holds +} + +// fakeExecutor records create/delete calls. ListUsers always returns the +// orchestrator-owned set so PlanFor produces the right delta. +type fakeExecutor struct { + mu sync.Mutex + owner solana.PublicKey + created []serviceability.User + createN atomic.Int32 + deleteN atomic.Int32 + + // Optional hook to fail on the Nth create (1-based) — used by the abort test. 
+ failCreateOnCall int + failErr error +} + +func newFakeExecutor(owner solana.PublicKey) *fakeExecutor { + return &fakeExecutor{owner: owner} +} + +func (f *fakeExecutor) ListUsers(ctx context.Context) ([]serviceability.User, error) { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]serviceability.User, len(f.created)) + copy(out, f.created) + return out, nil +} + +func (f *fakeExecutor) CreateUser(ctx context.Context, idx int) (sweep.CreateResult, error) { + calls := int(f.createN.Add(1)) + if f.failCreateOnCall == calls && f.failErr != nil { + return sweep.CreateResult{}, f.failErr + } + + // Deterministic pubkey from idx, IP = 100.0.0.idx+1 so PlanFor sorts cleanly. + var pk solana.PublicKey + pk[0] = byte(idx) + pk[31] = 0xAA + + f.mu.Lock() + f.created = append(f.created, serviceability.User{ + Owner: f.owner, + ClientIp: [4]byte{100, 0, 0, byte(idx + 1)}, + PubKey: pk, + }) + f.mu.Unlock() + + now := time.Unix(1_700_000_000, int64(calls)*1_000_000) // micro-spaced timestamps + return sweep.CreateResult{ + UserPDA: pk, + TunnelID: uint16(500 + idx), + ConfirmedAt: now, + ActivatedAt: now.Add(time.Millisecond), + }, nil +} + +func (f *fakeExecutor) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) { + calls := int(f.deleteN.Add(1)) + f.mu.Lock() + // Remove the matching user from the active set. + for i, u := range f.created { + if solana.PublicKeyFromBytes(u.PubKey[:]).Equals(userPDA) { + f.created = append(f.created[:i], f.created[i+1:]...) 
+ break + } + } + f.mu.Unlock() + + now := time.Unix(1_700_000_000, int64(calls+1000)*1_000_000) + return sweep.DeleteResult{ + ConfirmedAt: now, + ActivatedAt: now.Add(time.Millisecond), + }, nil +} + +func readRows(t *testing.T, path string) []runlog.Row { + t.Helper() + f, err := os.Open(path) + require.NoError(t, err) + defer f.Close() + var rows []runlog.Row + s := bufio.NewScanner(f) + for s.Scan() { + var r runlog.Row + require.NoError(t, json.Unmarshal(s.Bytes(), &r)) + rows = append(rows, r) + } + require.NoError(t, s.Err()) + return rows +} + +func TestRun_ProvisionsThenDeprovisionsInReverseOrder(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + exec := newFakeExecutor(owner) + path := filepath.Join(t.TempDir(), "orchestrator-runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + t.Cleanup(func() { _ = w.Close() }) + + clk := &fakeClock{now: time.Unix(1_700_000_000, 0)} + cfg := sweep.Config{ + RunID: "run-test", + Target: 4, + UsersPerBatch: 2, + Hold: 10 * time.Second, + OwnerFilter: owner, + Executor: exec, + Agent: agent.NewNoop(nil), + Runlog: w, + Clock: clk, + } + require.NoError(t, sweep.Run(context.Background(), cfg)) + require.NoError(t, w.Close()) + + rows := readRows(t, path) + // 4 provisions × 3 events + 4 deprovisions × 3 events = 24 rows + require.Len(t, rows, 24) + + // Provision phase: ascending user_index, events submit→confirm→activate. 
+ for i := 0; i < 4; i++ { + base := i * 3 + assert.Equal(t, i, rows[base].UserIndex, "row %d", base) + assert.Equal(t, runlog.EventSubmit, rows[base].Event) + assert.Equal(t, runlog.EventConfirm, rows[base+1].Event) + assert.Equal(t, runlog.EventActivate, rows[base+2].Event) + assert.Equal(t, uint16(500+i), rows[base+1].TunnelID, "tunnel_id propagates after confirm") + assert.Equal(t, i+1, rows[base+2].NAfterEvent, "activate increments active count") + } + + // Deprovision phase: descending user_index (reverse creation order), events deprovision_submit/confirm/activate. + for k := 0; k < 4; k++ { + base := 12 + k*3 + expectedIdx := 3 - k // 3, 2, 1, 0 + assert.Equal(t, expectedIdx, rows[base].UserIndex) + assert.Equal(t, runlog.EventDeprovisionSubmit, rows[base].Event) + assert.Equal(t, runlog.EventDeprovisionConfirm, rows[base+1].Event) + assert.Equal(t, runlog.EventDeprovisionActivate, rows[base+2].Event) + assert.Equal(t, 3-k, rows[base+2].NAfterEvent, "deprovision_activate decrements active count") + } + + // Hold called between batches but not after the final provision batch. + // Target=4, UsersPerBatch=2 → batches at [0..2), [2..4); one hold between them. + assert.Equal(t, 1, clk.HoldCount(), "Hold should fire once (between batches), not after reaching target") + + // Executor calls match the totals. 
+ assert.Equal(t, int32(4), exec.createN.Load()) + assert.Equal(t, int32(4), exec.deleteN.Load()) +} + +func TestRun_HandlesZeroTarget(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + exec := newFakeExecutor(owner) + path := filepath.Join(t.TempDir(), "runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + t.Cleanup(func() { _ = w.Close() }) + + cfg := sweep.Config{ + RunID: "run-zero", + Target: 0, + UsersPerBatch: 2, + Hold: time.Second, + OwnerFilter: owner, + Executor: exec, + Runlog: w, + Clock: &fakeClock{now: time.Unix(1, 0)}, + } + require.NoError(t, sweep.Run(context.Background(), cfg)) + require.NoError(t, w.Close()) + + rows := readRows(t, path) + assert.Empty(t, rows) + assert.Zero(t, exec.createN.Load()) + assert.Zero(t, exec.deleteN.Load()) +} + +func TestRun_AbortBetweenUsersStillCleansUp(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + exec := newFakeExecutor(owner) + exec.failCreateOnCall = 3 + exec.failErr = context.Canceled + + path := filepath.Join(t.TempDir(), "runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + t.Cleanup(func() { _ = w.Close() }) + + cfg := sweep.Config{ + RunID: "run-abort", + Target: 4, + UsersPerBatch: 2, + Hold: time.Second, + OwnerFilter: owner, + Executor: exec, + Runlog: w, + Clock: &fakeClock{now: time.Unix(1, 0)}, + } + err = sweep.Run(context.Background(), cfg) + require.Error(t, err, "abort during provision should surface error") + + // Even on abort, deprovision should fire for the two users that were created. + require.NoError(t, w.Close()) + rows := readRows(t, path) + + // 2 provisions × 3 events = 6; plus a submit event for the failed third; plus 2 deprovision sets. 
+ deprovisionActivates := 0 + for _, r := range rows { + if r.Event == runlog.EventDeprovisionActivate { + deprovisionActivates++ + } + } + assert.Equal(t, 2, deprovisionActivates, "every created user should be deprovisioned on abort") +} + +func TestRun_RejectsInvalidConfig(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + cfg sweep.Config + }{ + {name: "negative target", cfg: sweep.Config{Target: -1, UsersPerBatch: 1, RunID: "r", Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}}, + {name: "zero batch", cfg: sweep.Config{Target: 1, UsersPerBatch: 0, RunID: "r", Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}}, + {name: "missing run id", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}}, + {name: "missing executor", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, RunID: "r", Runlog: &runlog.Writer{}}}, + {name: "missing runlog", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, RunID: "r", Executor: &fakeExecutor{}}}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := sweep.Run(context.Background(), tc.cfg) + require.Error(t, err) + }) + } +} + +// Sanity: ctx cancellation between users is observed at the next iteration boundary. 
+func TestRun_CancellationStopsBetweenUsers(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + exec := newFakeExecutor(owner) + path := filepath.Join(t.TempDir(), "runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + t.Cleanup(func() { _ = w.Close() }) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // pre-cancelled + + cfg := sweep.Config{ + RunID: "run-cancel", + Target: 4, + UsersPerBatch: 2, + Hold: time.Second, + OwnerFilter: owner, + Executor: exec, + Runlog: w, + Clock: &fakeClock{now: time.Unix(1, 0)}, + } + err = sweep.Run(ctx, cfg) + require.Error(t, err) + assert.True(t, errors.Is(err, context.Canceled)) + assert.Zero(t, exec.createN.Load(), "no users should be created when ctx is pre-cancelled") +} From 1d2113eaa4022e0425a84c36aebfd78688dbf266 Mon Sep 17 00:00:00 2001 From: Greg Mitchell Date: Wed, 27 May 2026 22:31:14 +0000 Subject: [PATCH 5/5] tools/stress: agent SSH + log parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the device-stress orchestrator (#3746) by replacing the no-op AgentRunner with the live SSH-driven runner and the log parser that turns agent diff/commit lines into pre_commit_log / applied events. - pkg/agent/parser.go — Parser tracks two lines from controlplane/agent/pkg/arista/eapi.go: `Committing config session due to diffs detected: ` (extracts every `+ interface Tunnel` and emits one pre_commit_log event per ID) and `Configuration session finalized with command '... commit'` (emits one applied event per pending tunnel; the abort variant clears the buffer without emitting). - pkg/agent/ssh.go — Dials --dut-ssh-host with --dut-ssh-key, execs the configured doublezero-agent command (verbose, with optional --controller), and tees remote stdout/stderr into /orchestrator.agent.log while feeding lines through the parser. 
Host-key verification is InsecureIgnoreHostKey because targets are ephemeral cEOS containers. - pkg/sweep — adds a consumer goroutine that reads Agent.Events() and writes pre_commit_log / applied rows by looking up each event's tunnel ID in a registry the provision goroutine populates as users are created. Unknown tunnels are debug-logged and dropped. The agent is started under a derived context so deprovision-then-clean-shutdown works without leaking the goroutine. - pkg/exec.fetchTunnelID — implemented properly: GetAccountInfo on the user PDA, DeserializeUser, return User.TunnelId. Required adding an RPC field to exec.Config. - cmd/device-orchestrator — new flags --dut-ssh-user (default `admin`) and --no-agent (offline testing); SSH runner becomes the default when --dut-ssh-host and --dut-ssh-key are both set. Part 3 of #3746. Closes #3772. --- CHANGELOG.md | 1 + go.mod | 2 +- .../cmd/device-orchestrator/main.go | 44 +++- .../device-orchestrator/pkg/agent/parser.go | 125 ++++++++++ .../pkg/agent/parser_test.go | 134 ++++++++++ .../device-orchestrator/pkg/agent/ssh.go | 228 ++++++++++++++++++ .../device-orchestrator/pkg/exec/exec.go | 36 ++- .../device-orchestrator/pkg/exec/exec_test.go | 85 +++++++ .../device-orchestrator/pkg/sweep/sweep.go | 117 ++++++++- .../pkg/sweep/sweep_test.go | 116 +++++++++ 10 files changed, 865 insertions(+), 23 deletions(-) create mode 100644 tools/stress/device-orchestrator/pkg/agent/parser.go create mode 100644 tools/stress/device-orchestrator/pkg/agent/parser_test.go create mode 100644 tools/stress/device-orchestrator/pkg/agent/ssh.go diff --git a/CHANGELOG.md b/CHANGELOG.md index d71edb5e5e..a19e260009 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ All notable changes to this project will be documented in this file. - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. 
Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)). - Tools - Add `tools/stress/device-orchestrator/` — the device-stress orchestrator skeleton for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's CLI list, dumps `orchestrator-config.json` on start, runs a provision-then-reverse-deprovision sweep against a live serviceability program, and emits the runlog row schema `{run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event}` to `orchestrator-runlog.json` for each `submit | confirm | activate | deprovision_*` event. The agent runner is stubbed behind a `pkg/agent.Runner` interface (no-op impl ships now; the SSH-backed runner that emits `pre_commit_log` / `applied` lands in part 3). The sweep cooperates with an abort sentinel file: when the file appears the in-flight user completes and the orchestrator deprovisions everything it created before exiting non-zero. `PlanReconcile` / `Plan` (lifted from the part-1 SDK PR) now lives at `tools/stress/device-orchestrator/pkg/reconcile/` as orchestrator policy rather than SDK primitive. Part 2 of #3746 ([#3771](https://github.com/malbeclabs/doublezero/issues/3771)). 
+ - Complete the device-stress orchestrator with the SSH agent runner and log parser. `pkg/agent/ssh.go` dials `--dut-ssh-host` with `--dut-ssh-key`, execs `doublezero-agent -verbose` (appending `--controller` when set), and tees remote stdout/stderr into `<working-dir>/orchestrator.agent.log` while feeding the stream through `pkg/agent/parser.go`. The parser tracks two log lines from `controlplane/agent/pkg/arista/eapi.go`: `Committing config session due to diffs detected: <diff>` (extracting `+ interface Tunnel<id>` matches and emitting one `pre_commit_log` event per ID) and `Configuration session finalized with command '... commit'` (emitting one `applied` event per pending tunnel; the `... abort` variant clears the buffer without emitting). The sweep grows a goroutine that consumes agent events and writes `pre_commit_log` / `applied` runlog rows by looking up each event's tunnel ID against a `tunnelID → user_index` map populated as users are created; unknown tunnels are debug-logged and dropped. `pkg/exec.fetchTunnelID` now reads the on-chain user account post-create to surface the assigned `TunnelId` into the runlog. New CLI flags: `--dut-ssh-user` (default `admin`) and `--no-agent` for offline testing. Host-key verification uses `ssh.InsecureIgnoreHostKey` because the orchestrator targets ephemeral cEOS containers; documented at `pkg/agent/ssh.go:SSH`. Part 3 of #3746, completes the five-event coverage ([#3772](https://github.com/malbeclabs/doublezero/issues/3772)). 
## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22 diff --git a/go.mod b/go.mod index 86f96993ce..3eb0a6b9e1 100644 --- a/go.mod +++ b/go.mod @@ -60,6 +60,7 @@ require ( github.com/twmb/franz-go/pkg/kadm v1.17.1 github.com/vishvananda/netlink v1.3.1 github.com/vishvananda/netns v0.0.5 + golang.org/x/crypto v0.49.0 golang.org/x/mod v0.33.0 golang.org/x/net v0.52.0 golang.org/x/sync v0.20.0 @@ -193,7 +194,6 @@ require ( go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect - golang.org/x/crypto v0.49.0 // indirect golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4 // indirect golang.org/x/term v0.41.0 // indirect diff --git a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go index ab01975d30..82c05c3534 100644 --- a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go +++ b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go @@ -43,6 +43,7 @@ type orchestratorConfig struct { DUTPubkey string `json:"dut_pubkey"` DUTSSHHost string `json:"dut_ssh_host"` DUTSSHKey string `json:"dut_ssh_key"` + DUTSSHUser string `json:"dut_ssh_user"` RPCURL string `json:"rpc_url"` ProgramID string `json:"program_id"` KeypairPath string `json:"keypair"` @@ -52,6 +53,7 @@ type orchestratorConfig struct { ClientIPBase string `json:"client_ip_base"` TunnelEndpoint string `json:"tunnel_endpoint"` TenantPubkey string `json:"tenant_pubkey,omitempty"` + NoAgent bool `json:"no_agent"` } func main() { @@ -81,6 +83,8 @@ func run() error { runID = flag.String("run-id", "", "Run identifier written into every runlog row; auto-generated if empty.") logLevel = flag.String("log-level", "info", "slog level: debug|info|warn|error.") dryRun = flag.Bool("dry-run", false, "Validate flags and dump 
orchestrator-config.json without contacting the RPC.") + dutSSHUser = flag.String("dut-ssh-user", "admin", "SSH user for the DUT.") + noAgent = flag.Bool("no-agent", false, "Use the no-op AgentRunner even when SSH flags are set (offline testing).") ) flag.Parse() @@ -116,6 +120,7 @@ func run() error { DUTPubkey: *dutPubkey, DUTSSHHost: *dutSSHHost, DUTSSHKey: *dutSSHKey, + DUTSSHUser: *dutSSHUser, RPCURL: *rpcURL, ProgramID: *programID, KeypairPath: *keypairPath, @@ -125,6 +130,7 @@ func run() error { ClientIPBase: *clientIPBase, TunnelEndpoint: *tunnelEndpoint, TenantPubkey: *tenantPubkey, + NoAgent: *noAgent, } configPath := filepath.Join(*workingDir, "orchestrator-config.json") if err := dumpJSON(configPath, resolved); err != nil { @@ -174,6 +180,7 @@ func run() error { liveExec, err := exec.New(exec.Config{ Client: client, Executor: executor, + RPC: rpc, DevicePubkey: dutPK, TenantPubkey: tenantPK, ClientIPBase: baseIP, @@ -200,6 +207,8 @@ func run() error { ctx, abortCancel := abort.Watch(rootCtx, *abortFile, abort.DefaultPollInterval, logger) defer abortCancel() + agentRunner := selectAgentRunner(*noAgent, *dutSSHHost, *dutSSHKey, *dutSSHUser, *controllerAddr, *workingDir, logger) + cfg := sweep.Config{ RunID: *runID, Target: *targetUserCount, @@ -207,7 +216,7 @@ func run() error { Hold: time.Duration(*holdSeconds) * time.Second, OwnerFilter: signer.PublicKey(), Executor: liveExec, - Agent: agent.NewNoop(logger), + Agent: agentRunner, Runlog: rlw, Clock: sweep.RealClock{}, Logger: logger, @@ -262,6 +271,39 @@ func requireFlags(required map[string]string) error { return nil } +// selectAgentRunner picks between the SSH-backed runner and the no-op, based +// on the CLI flags: +// +// - --no-agent → noop (operator opted out) +// - --dut-ssh-host + --dut-ssh-key set → SSH runner (default for live runs) +// - otherwise → noop with a warning (operator forgot the flags) +// +// The SSH runner tees remote stdout/stderr into /orchestrator.agent.log. 
+// The exec'd command appends `-controller ADDR` iff controllerAddr is non-empty. +func selectAgentRunner(noAgent bool, sshHost, sshKey, sshUser, controllerAddr, workingDir string, logger *slog.Logger) agent.Runner { + if noAgent { + logger.Info("agent: --no-agent set; using no-op runner") + return agent.NewNoop(logger) + } + if sshHost == "" || sshKey == "" { + logger.Warn("agent: --dut-ssh-host and --dut-ssh-key not both set; falling back to no-op runner (pre_commit_log / applied events will not be recorded)") + return agent.NewNoop(logger) + } + + cmd := "doublezero-agent -verbose" + if controllerAddr != "" { + cmd = fmt.Sprintf("doublezero-agent -verbose -controller %s", controllerAddr) + } + return agent.NewSSH(agent.SSHConfig{ + Host: sshHost, + User: sshUser, + KeyPath: sshKey, + Command: cmd, + LogPath: filepath.Join(workingDir, "orchestrator.agent.log"), + Logger: logger, + }) +} + func parseIPv4(s string) ([4]byte, error) { ip := net.ParseIP(s) if ip == nil { diff --git a/tools/stress/device-orchestrator/pkg/agent/parser.go b/tools/stress/device-orchestrator/pkg/agent/parser.go new file mode 100644 index 0000000000..d6841f7ca0 --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/agent/parser.go @@ -0,0 +1,131 @@ +package agent + +import ( + "regexp" + "strconv" + "sync" + "time" +) + +// Parser turns lines from a doublezero-agent log stream into AgentEvents. +// +// It tracks three log lines from controlplane/agent/pkg/arista/eapi.go: +// +// - "Committing config session due to diffs detected: {diff}" +// → emit one EventPreCommitLog per `+ interface Tunnel{N}` in the diff, +// and remember those IDs as "pending". +// - "Configuration session finalized with command '... commit'" +// → emit one EventApplied per pending ID, then clear the buffer. +// - "Configuration session finalized with command '... abort'" +// → clear the buffer with no Applied events. 
+// +// Parser is safe for concurrent use: Parse and Pending serialize on an +// internal mutex, since the SSH runner feeds it from two stream goroutines. +type Parser struct { + mu sync.Mutex // guards pending + pending []uint16 + now func() time.Time // injectable for tests +} + +// NewParser returns a Parser that stamps events with the current wallclock. +// Pass WithClock to override (testing). +func NewParser(opts ...ParserOption) *Parser { + p := &Parser{now: time.Now} + for _, opt := range opts { + opt(p) + } + return p +} + +// ParserOption configures NewParser. +type ParserOption func(*Parser) + +// WithClock overrides time.Now for the parser; used by tests. +func WithClock(now func() time.Time) ParserOption { + return func(p *Parser) { p.now = now } +} + +// Parse advances the parser by one log line and returns any events produced. +// The returned slice is freshly allocated per call and safe for the caller to +// retain. +func (p *Parser) Parse(line string) []Event { + p.mu.Lock() + defer p.mu.Unlock() + if m := committingRE.FindStringSubmatch(line); m != nil { + ids := extractAddedTunnelIDs(m[1]) + if len(ids) == 0 { + return nil + } + p.pending = append(p.pending, ids...) + now := p.now() + out := make([]Event, 0, len(ids)) + for _, id := range ids { + out = append(out, Event{Kind: EventPreCommitLog, TunnelID: id, At: now}) + } + return out + } + if finalizedCommitRE.MatchString(line) { + if len(p.pending) == 0 { + return nil + } + now := p.now() + out := make([]Event, 0, len(p.pending)) + for _, id := range p.pending { + out = append(out, Event{Kind: EventApplied, TunnelID: id, At: now}) + } + p.pending = p.pending[:0] + return out + } + if finalizedAbortRE.MatchString(line) { + // Abort cleared the session — drop pending without emitting Applied. + p.pending = p.pending[:0] + return nil + } + return nil +} + +// Pending exposes the in-flight tunnel IDs awaiting an Applied event; tests +// inspect this to assert state transitions. 
+func (p *Parser) Pending() []uint16 { + p.mu.Lock() + defer p.mu.Unlock() + out := make([]uint16, len(p.pending)) + copy(out, p.pending) + return out +} + +var ( + // committingRE captures the diff payload from the agent's pre-commit log. + // The diff is everything after the colon-space and runs to end of line — + // agents emit the diff inline (often multi-section but single-line). + committingRE = regexp.MustCompile(`Committing config session due to diffs detected:\s*(.*)$`) + + // addedTunnelRE matches an additive interface-Tunnel diff line. `(\d+)` is + // greedy; the trailing `\b` rejects digits glued to a word character. + addedTunnelRE = regexp.MustCompile(`\+\s*interface Tunnel(\d+)\b`) + + // finalizedCommitRE matches the post-commit log line on a successful + // commit. The quoted command always ends in "...commit" for actual commits + // and "...abort" for no-op sessions. + finalizedCommitRE = regexp.MustCompile(`Configuration session finalized with command '.*\s+commit'`) + finalizedAbortRE = regexp.MustCompile(`Configuration session finalized with command '.*\s+abort'`) +) + +// extractAddedTunnelIDs pulls every "+ interface Tunnel{N}" ID out of a diff +// payload, skipping IDs that do not fit in uint16. Returns nil when no +// additive lines are present (e.g., pure deprovision diffs). 
+func extractAddedTunnelIDs(diff string) []uint16 { + matches := addedTunnelRE.FindAllStringSubmatch(diff, -1) + if len(matches) == 0 { + return nil + } + out := make([]uint16, 0, len(matches)) + for _, m := range matches { + id, err := strconv.ParseUint(m[1], 10, 16) + if err != nil { + continue + } + out = append(out, uint16(id)) + } + return out +} diff --git a/tools/stress/device-orchestrator/pkg/agent/parser_test.go b/tools/stress/device-orchestrator/pkg/agent/parser_test.go new file mode 100644 index 0000000000..e0a2bef78f --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/agent/parser_test.go @@ -0,0 +1,134 @@ +package agent_test + +import ( + "testing" + "time" + + "github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fixedClock returns a constant time for deterministic event timestamps. +func fixedClock(at time.Time) func() time.Time { + return func() time.Time { return at } +} + +func TestParser_SingleTunnelDiffThenCommit(t *testing.T) { + t.Parallel() + + now := time.Unix(1_700_000_000, 0) + p := agent.NewParser(agent.WithClock(fixedClock(now))) + + events := p.Parse(`2026/05/27 12:00:01 Committing config session due to diffs detected: + interface Tunnel500 ip address 169.254.0.1/30`) + require.Len(t, events, 1) + assert.Equal(t, agent.EventPreCommitLog, events[0].Kind) + assert.Equal(t, uint16(500), events[0].TunnelID) + assert.Equal(t, now, events[0].At) + assert.Equal(t, []uint16{500}, p.Pending()) + + events = p.Parse(`2026/05/27 12:00:02 Configuration session finalized with command 'configure session doublezero-agent-abc123 commit'`) + require.Len(t, events, 1) + assert.Equal(t, agent.EventApplied, events[0].Kind) + assert.Equal(t, uint16(500), events[0].TunnelID) + assert.Empty(t, p.Pending(), "pending should clear after commit-success") +} + +func TestParser_MultiTunnelDiffEmitsOneEventPerTunnel(t *testing.T) { + t.Parallel() + + p 
:= agent.NewParser() + diff := `Committing config session due to diffs detected: + interface Tunnel500 + interface Tunnel501 - interface Tunnel499 + interface Tunnel502` + events := p.Parse(diff) + require.Len(t, events, 3, "only + lines, not - lines, produce events") + assert.Equal(t, []uint16{500, 501, 502}, []uint16{events[0].TunnelID, events[1].TunnelID, events[2].TunnelID}) + for _, e := range events { + assert.Equal(t, agent.EventPreCommitLog, e.Kind) + } + + applied := p.Parse(`Configuration session finalized with command 'configure session foo commit'`) + require.Len(t, applied, 3, "Applied fires once per pending tunnel") + assert.Equal(t, []uint16{500, 501, 502}, []uint16{applied[0].TunnelID, applied[1].TunnelID, applied[2].TunnelID}) +} + +func TestParser_DeprovisionOnlyDiffEmitsNothing(t *testing.T) { + t.Parallel() + + p := agent.NewParser() + events := p.Parse(`Committing config session due to diffs detected: - interface Tunnel500 - interface Tunnel501`) + assert.Empty(t, events) + assert.Empty(t, p.Pending()) +} + +func TestParser_AbortClearsBufferWithoutAppliedEvents(t *testing.T) { + t.Parallel() + + p := agent.NewParser() + events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel500`) + require.Len(t, events, 1) + require.Equal(t, []uint16{500}, p.Pending()) + + events = p.Parse(`Configuration session finalized with command 'configure session foo abort'`) + assert.Empty(t, events, "abort emits no events") + assert.Empty(t, p.Pending(), "abort still clears pending") +} + +func TestParser_CommitWithoutPendingDiffIsNoOp(t *testing.T) { + t.Parallel() + + p := agent.NewParser() + events := p.Parse(`Configuration session finalized with command 'configure session foo commit'`) + assert.Empty(t, events) +} + +func TestParser_TwoConsecutiveProvisionCycles(t *testing.T) { + t.Parallel() + + p := agent.NewParser() + + // Cycle 1 + require.Len(t, p.Parse(`Committing config session due to diffs detected: + interface Tunnel500`), 
1) + require.Len(t, p.Parse(`Configuration session finalized with command 'configure session foo commit'`), 1) + assert.Empty(t, p.Pending()) + + // Cycle 2 + require.Len(t, p.Parse(`Committing config session due to diffs detected: + interface Tunnel501`), 1) + applied := p.Parse(`Configuration session finalized with command 'configure session bar commit'`) + require.Len(t, applied, 1) + assert.Equal(t, uint16(501), applied[0].TunnelID, "cycle 2 must not replay tunnel 500") +} + +func TestParser_UnrelatedLinesIgnored(t *testing.T) { + t.Parallel() + + p := agent.NewParser() + for _, line := range []string{ + ``, + `Received 42 lines of configuration from controller`, + `forced unlock of configuration lock (xyz)`, + `some random log noise`, + } { + assert.Empty(t, p.Parse(line), "line=%q", line) + } +} + +func TestParser_RejectsOversizedTunnelID(t *testing.T) { + t.Parallel() + + // uint16 max is 65535; 70000 should be silently skipped, not panic. + p := agent.NewParser() + events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel70000 + interface Tunnel500`) + require.Len(t, events, 1) + assert.Equal(t, uint16(500), events[0].TunnelID) +} + +func TestParser_DoesNotConfuseInterfaceNamePrefixes(t *testing.T) { + t.Parallel() + + // "Tunnel5000" must not match a regex that's been fooled by "Tunnel500" + // being a prefix. Use a `\b` boundary in the regex. 
+ p := agent.NewParser() + events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel5000`) + require.Len(t, events, 1) + assert.Equal(t, uint16(5000), events[0].TunnelID) +} diff --git a/tools/stress/device-orchestrator/pkg/agent/ssh.go b/tools/stress/device-orchestrator/pkg/agent/ssh.go new file mode 100644 index 0000000000..510fad511e --- /dev/null +++ b/tools/stress/device-orchestrator/pkg/agent/ssh.go @@ -0,0 +1,228 @@ +package agent + +import ( + "bufio" + "context" + "fmt" + "io" + "log/slog" + "os" + "sync" + + "golang.org/x/crypto/ssh" +) + +// SSHConfig describes how to reach the DUT and what to run on it. +type SSHConfig struct { + // Host is the dial target, e.g. "10.0.0.1:22". The dialer expects a + // host:port; callers should resolve hostnames upstream. + Host string + // User to authenticate as. Defaults to "admin" if empty. + User string + // KeyPath is the path to a PEM-encoded private key for public-key auth. + KeyPath string + // Command is the remote command to exec. Defaults to + // "doublezero-agent -verbose" if empty; callers can override with + // additional flags such as the controller address. + Command string + // LogPath, when non-empty, is the local file the SSH runner tees remote + // stdout/stderr into. The file is truncated on Start. + LogPath string + // Logger is used for diagnostic logs from the runner; pass nil for silent. + Logger *slog.Logger +} + +// SSH is a Runner that dials the DUT over SSH, executes doublezero-agent in +// verbose mode, and emits AgentEvents parsed from the remote log stream. +// +// Host key verification uses ssh.InsecureIgnoreHostKey because the +// orchestrator targets ephemeral cEOS containers whose host keys regenerate +// on every restart; the threat model is "operator on the same subnet" and +// the SSH session carries no privileged credentials beyond what the keypair +// already grants. Do not reuse this dialer for production workloads. 
+type SSH struct { + cfg SSHConfig + + events chan Event + + mu sync.Mutex + started bool + client *ssh.Client + session *ssh.Session + logFile *os.File +} + +// NewSSH returns an unstarted SSH runner. Call Start to dial. +func NewSSH(cfg SSHConfig) *SSH { + if cfg.User == "" { + cfg.User = "admin" + } + if cfg.Command == "" { + cfg.Command = "doublezero-agent -verbose" + } + return &SSH{ + cfg: cfg, + events: make(chan Event, 64), + } +} + +// Events returns the channel the runner emits AgentEvents on. It closes +// when the runner exits (ctx cancel, process exit, or session error). +func (s *SSH) Events() <-chan Event { return s.events } + +// Start dials the DUT, opens a session, executes the configured command, and +// streams its stdout/stderr through the parser. Start returns once the +// session has been opened; the read loop runs in a goroutine until ctx is +// cancelled or the remote command exits. +// +// Start may be called at most once: the started flag is set before dialing +// and is not reset on failure, and the Events channel is only closed by the +// supervising goroutine of a successful Start. +func (s *SSH) Start(ctx context.Context) error { + s.mu.Lock() + if s.started { + s.mu.Unlock() + return fmt.Errorf("ssh agent: already started") + } + s.started = true + s.mu.Unlock() + + signer, err := loadSigner(s.cfg.KeyPath) + if err != nil { + return fmt.Errorf("ssh agent: load key %s: %w", s.cfg.KeyPath, err) + } + + clientCfg := &ssh.ClientConfig{ + User: s.cfg.User, + Auth: []ssh.AuthMethod{ssh.PublicKeys(signer)}, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), + } + client, err := ssh.Dial("tcp", s.cfg.Host, clientCfg) + if err != nil { + return fmt.Errorf("ssh agent: dial %s: %w", s.cfg.Host, err) + } + session, err := client.NewSession() + if err != nil { + _ = client.Close() + return fmt.Errorf("ssh agent: new session: %w", err) + } + stdout, err := session.StdoutPipe() + if err != nil { + _ = session.Close() + _ = client.Close() + return fmt.Errorf("ssh agent: stdout pipe: %w", err) + } + stderr, err := session.StderrPipe() + if err != nil { + _ = session.Close() + _ = client.Close() + return fmt.Errorf("ssh agent: stderr pipe: %w", err) + } + +
var logFile *os.File + // tee stays a nil interface unless a log file is opened: handing the nil + // *os.File to streamLines would wrap it in a non-nil io.Writer, defeating + // its `tee != nil` guard and making every Write fail. + var tee io.Writer + if s.cfg.LogPath != "" { + logFile, err = os.Create(s.cfg.LogPath) + if err != nil { + _ = session.Close() + _ = client.Close() + return fmt.Errorf("ssh agent: open log %s: %w", s.cfg.LogPath, err) + } + tee = logFile + } + + s.mu.Lock() + s.client = client + s.session = session + s.logFile = logFile + s.mu.Unlock() + + if err := session.Start(s.cfg.Command); err != nil { + s.shutdown() + return fmt.Errorf("ssh agent: start %q: %w", s.cfg.Command, err) + } + if s.cfg.Logger != nil { + s.cfg.Logger.Info("ssh agent started", "host", s.cfg.Host, "command", s.cfg.Command, "log_path", s.cfg.LogPath) + } + + // NOTE(review): both stream goroutines below share this one Parser, which + // is only sound if Parser serializes access to its own state. + parser := NewParser() + var wg sync.WaitGroup + wg.Add(2) + go func() { + defer wg.Done() + streamLines(ctx, stdout, tee, parser, s.events, s.cfg.Logger, "stdout") + }() + go func() { + defer wg.Done() + streamLines(ctx, stderr, tee, parser, s.events, s.cfg.Logger, "stderr") + }() + + go func() { + // Close session and channel when ctx cancels OR all reader goroutines exit. + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + select { + case <-ctx.Done(): + case <-done: + } + // Closing the session causes the read loops to return EOF; the wait + // below blocks until both have returned before closing the events + // channel, so consumers never see a half-emitted event. + s.shutdown() + <-done + close(s.events) + }() + + return nil +} + +// shutdown is idempotent; safe to call from Start error paths and from the +// supervising goroutine. +func (s *SSH) shutdown() { + s.mu.Lock() + defer s.mu.Unlock() + if s.session != nil { + _ = s.session.Close() + s.session = nil + } + if s.client != nil { + _ = s.client.Close() + s.client = nil + } + if s.logFile != nil { + _ = s.logFile.Close() + s.logFile = nil + } +} + +// streamLines reads `src` line-by-line, optionally tees raw lines to `tee`, +// runs each through `parser`, and pushes resulting events onto `events`. 
+// Returns early when ctx cancels so a slow consumer can't deadlock shutdown. +func streamLines(ctx context.Context, src io.Reader, tee io.Writer, parser *Parser, events chan<- Event, log *slog.Logger, label string) { + scanner := bufio.NewScanner(src) + scanner.Buffer(make([]byte, 1024*1024), 16*1024*1024) // large diffs can exceed default + for scanner.Scan() { + line := scanner.Text() + if tee != nil { + if _, err := tee.Write([]byte(line + "\n")); err != nil && log != nil { + log.Warn("ssh agent: log tee write failed", "err", err, "stream", label) + } + } + for _, ev := range parser.Parse(line) { + select { + case events <- ev: + case <-ctx.Done(): + return + } + } + } + if err := scanner.Err(); err != nil && log != nil { + log.Warn("ssh agent: stream ended with error", "err", err, "stream", label) + } +} + +// loadSigner reads a PEM-encoded private key from disk and returns an ssh.Signer. +func loadSigner(path string) (ssh.Signer, error) { + buf, err := os.ReadFile(path) + if err != nil { + return nil, err + } + return ssh.ParsePrivateKey(buf) +} diff --git a/tools/stress/device-orchestrator/pkg/exec/exec.go b/tools/stress/device-orchestrator/pkg/exec/exec.go index 86badb60f2..1df54630d2 100644 --- a/tools/stress/device-orchestrator/pkg/exec/exec.go +++ b/tools/stress/device-orchestrator/pkg/exec/exec.go @@ -18,6 +18,11 @@ import ( type Config struct { Client *serviceability.Client Executor *serviceability.Executor + // RPC is used to fetch individual User accounts post-create so the + // orchestrator can record the assigned TunnelId in the runlog. In + // production this is the same *solanarpc.Client the Client/Executor + // were built from. + RPC serviceability.RPCClient DevicePubkey solana.PublicKey TenantPubkey solana.PublicKey // zero pubkey = no tenant @@ -42,7 +47,7 @@ type Live struct { } // New returns a Live executor with the given configuration. Callers must -// supply a non-nil Client and Executor. +// supply a non-nil Client, Executor, and RPC. 
func New(cfg Config) (*Live, error) { if cfg.Client == nil { return nil, fmt.Errorf("exec.New: Client is required") @@ -50,6 +55,9 @@ func New(cfg Config) (*Live, error) { if cfg.Executor == nil { return nil, fmt.Errorf("exec.New: Executor is required") } + if cfg.RPC == nil { + return nil, fmt.Errorf("exec.New: RPC is required") + } if cfg.DzPrefixCount == 0 { cfg.DzPrefixCount = 1 } @@ -114,16 +122,24 @@ func (l *Live) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep. }, nil } -// fetchTunnelID reads the user account and returns its assigned TunnelId. -// Used so the runlog records the kernel interface identifier the part-3 -// agent runner will key on. +// fetchTunnelID reads the user account by PDA and returns the assigned +// TunnelId. The sweep loop logs this in the runlog so the agent-event +// consumer can attribute `+ interface Tunnel` log lines back to a user. func (l *Live) fetchTunnelID(ctx context.Context, userPDA solana.PublicKey) (uint16, error) { - // We can't read the assigned tunnel_id without the User's on-chain bytes, - // which the SDK doesn't surface from CreateUser. Until a downstream - // helper is added, callers either skip this column (TunnelID = 0) or wire - // a per-account fetch in cmd/. The package signature is kept stable so - // part-3 can drop in the real fetch. 
- return 0, nil + info, err := l.cfg.RPC.GetAccountInfo(ctx, userPDA) + if err != nil { + return 0, fmt.Errorf("get user account info: %w", err) + } + if info == nil || info.Value == nil { + return 0, fmt.Errorf("user account %s not found", userPDA) + } + data := info.Value.Data.GetBinary() + if len(data) == 0 { + return 0, fmt.Errorf("user account %s empty", userPDA) + } + var u serviceability.User + serviceability.DeserializeUser(serviceability.NewByteReader(data), &u) + return u.TunnelId, nil } // ipForIndex returns base shifted by idx, wrapping at the /16 boundary so the diff --git a/tools/stress/device-orchestrator/pkg/exec/exec_test.go b/tools/stress/device-orchestrator/pkg/exec/exec_test.go index c7b13ea30b..644e2d7f87 100644 --- a/tools/stress/device-orchestrator/pkg/exec/exec_test.go +++ b/tools/stress/device-orchestrator/pkg/exec/exec_test.go @@ -1,9 +1,15 @@ package exec import ( + "context" + "encoding/binary" "testing" + "github.com/gagliardetto/solana-go" + solanarpc "github.com/gagliardetto/solana-go/rpc" + "github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestIPForIndex(t *testing.T) { @@ -25,3 +31,82 @@ func TestIPForIndex(t *testing.T) { assert.Equal(t, tc.want, got, "idx=%d", tc.idx) } } + +// stubRPC implements serviceability.RPCClient for fetchTunnelID tests. 
+type stubRPC struct { + accountInfo *solanarpc.GetAccountInfoResult + err error +} + +func (s *stubRPC) GetProgramAccounts(context.Context, solana.PublicKey) (solanarpc.GetProgramAccountsResult, error) { + return nil, nil +} + +func (s *stubRPC) GetAccountInfo(context.Context, solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) { + return s.accountInfo, s.err +} + +func TestFetchTunnelID_ReadsFromUserAccount(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + device := solana.NewWallet().PublicKey() + + // Hand-encode a User account body matching DeserializeUser's field order. + // All fields zero except TunnelId so the test pin-points that read path. + const tunnelID uint16 = 4242 + body := makeUserAccountBytes(owner, device, [4]byte{10, 0, 0, 5}, tunnelID) + + stub := &stubRPC{ + accountInfo: &solanarpc.GetAccountInfoResult{ + Value: &solanarpc.Account{ + Data: solanarpc.DataBytesOrJSONFromBytes(body), + }, + }, + } + live := &Live{cfg: Config{RPC: stub}} + + got, err := live.fetchTunnelID(context.Background(), solana.NewWallet().PublicKey()) + require.NoError(t, err) + assert.Equal(t, tunnelID, got) +} + +func TestFetchTunnelID_AccountMissing(t *testing.T) { + t.Parallel() + + live := &Live{cfg: Config{RPC: &stubRPC{accountInfo: &solanarpc.GetAccountInfoResult{Value: nil}}}} + _, err := live.fetchTunnelID(context.Background(), solana.NewWallet().PublicKey()) + require.Error(t, err) + assert.Contains(t, err.Error(), "not found") +} + +// makeUserAccountBytes serializes a User account body with the minimum fields +// the test needs. Matches the field order in serviceability.DeserializeUser. +func makeUserAccountBytes(owner, device solana.PublicKey, clientIP [4]byte, tunnelID uint16) []byte { + b := make([]byte, 0, 256) + b = append(b, byte(serviceability.UserType)) // AccountType + b = append(b, owner[:]...) // Owner [32] + b = append(b, make([]byte, 16)...) 
// Index u128 + b = append(b, 0) // BumpSeed + b = append(b, byte(serviceability.UserTypeIBRL)) + b = append(b, make([]byte, 32)...) // TenantPubKey (zero) + b = append(b, device[:]...) // DevicePubKey + b = append(b, byte(serviceability.CyoaTypeGREOverDIA)) + b = append(b, clientIP[:]...) // ClientIp [4] + b = append(b, make([]byte, 4)...) // DzIp [4] + var tidBuf [2]byte + binary.LittleEndian.PutUint16(tidBuf[:], tunnelID) + b = append(b, tidBuf[:]...) // TunnelId u16 LE + b = append(b, make([]byte, 5)...) // TunnelNet + b = append(b, byte(serviceability.UserStatusActivated)) + b = append(b, 0, 0, 0, 0) // Publishers len + b = append(b, 0, 0, 0, 0) // Subscribers len + b = append(b, make([]byte, 32)...) // ValidatorPubKey + b = append(b, make([]byte, 4)...) // TunnelEndpoint + b = append(b, 0) // TunnelFlags + b = append(b, 0) // BgpStatus + b = append(b, make([]byte, 8)...) // LastBgpUpAt + b = append(b, make([]byte, 8)...) // LastBgpReportedAt + b = append(b, make([]byte, 8)...) // BgpRttNs + return b +} diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep.go b/tools/stress/device-orchestrator/pkg/sweep/sweep.go index cda03412c1..eda53f5cae 100644 --- a/tools/stress/device-orchestrator/pkg/sweep/sweep.go +++ b/tools/stress/device-orchestrator/pkg/sweep/sweep.go @@ -15,6 +15,7 @@ import ( "errors" "fmt" "log/slog" + "sync" "time" "github.com/gagliardetto/solana-go" @@ -114,37 +115,129 @@ type createdUser struct { tunnelID uint16 } +// tunnelRegistry holds the orchestrator's tunnelID → user metadata mapping, +// shared between the provision goroutine (which writes) and the agent-event +// consumer goroutine (which reads). Lookups for unknown tunnel IDs return +// `ok=false` so the consumer can warn-log and drop the event. 
+type tunnelRegistry struct { + mu sync.RWMutex + idx map[uint16]createdUser +} + +func newTunnelRegistry() *tunnelRegistry { + return &tunnelRegistry{idx: make(map[uint16]createdUser)} +} + +func (r *tunnelRegistry) register(u createdUser) { + if u.tunnelID == 0 { + // TunnelId == 0 means the executor didn't surface a real ID; nothing + // in the agent log can match it, so don't take a map slot. + return + } + r.mu.Lock() + r.idx[u.tunnelID] = u + r.mu.Unlock() +} + +func (r *tunnelRegistry) lookup(tunnelID uint16) (createdUser, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + u, ok := r.idx[tunnelID] + return u, ok +} + -// Run drives the provision-then-deprovision sweep to completion. Returns the -// number of users actually created/deleted alongside the error (if any), so -// callers can report partial progress on abort. +// Run drives the provision-then-deprovision sweep to completion. It returns +// the provision error if there was one, otherwise the deprovision error; +// deprovision runs after provision whether or not provision failed. +// +// Run additionally starts a goroutine that consumes events from cfg.Agent and +// writes pre_commit_log / applied runlog rows for tunnel IDs the sweep +// registered. The consumer exits when the agent's Events channel closes; we +// derive an agentCtx from ctx and cancel it after deprovision so the agent +// stops cleanly even on a successful run. 
func Run(ctx context.Context, cfg Config) error { if err := cfg.validate(); err != nil { return err } - if err := cfg.Agent.Start(ctx); err != nil { + + registry := newTunnelRegistry() + agentCtx, agentCancel := context.WithCancel(ctx) + defer agentCancel() + if err := cfg.Agent.Start(agentCtx); err != nil { return fmt.Errorf("start agent runner: %w", err) } - created, err := provision(ctx, &cfg) + var consumerWG sync.WaitGroup + consumerWG.Add(1) + go func() { + defer consumerWG.Done() + consumeAgentEvents(&cfg, registry) + }() + + created, err := provision(ctx, &cfg, registry) if err != nil && !errors.Is(err, context.Canceled) { - return err + // A non-cancel provision error must not skip deprovision (clean up what + // was created); the consumer keeps draining in parallel so straggling + // agent events for already-created users still land in the runlog. The + // error is surfaced here and still returned once deprovision finishes. + cfg.Logger.Warn("sweep: provision failed; deprovisioning users created so far", "err", err, "created", len(created)) } - // Always attempt deprovision so an abort during provision still cleans up - // what the sweep created. Use a fresh context for the deprovision phase if - // the original was cancelled, since the operator wants the tear-down to - // finish before exit. We respect the parent context's lifetime via the - // outer Run's error return — callers that want a hard stop pass a deadline. depErr := deprovision(ctx, &cfg, created) + + // Tell the agent to stop and wait for the consumer goroutine to drain so + // no events are dropped between deprovision-end and consumer-exit. + agentCancel() + consumerWG.Wait() + if err != nil { return err } return depErr } +// consumeAgentEvents reads from cfg.Agent.Events() until the channel closes +// and writes pre_commit_log / applied rows for tunnel IDs the sweep has +// registered. Events for unknown tunnel IDs are warn-logged and dropped — the +// most likely cause is a tunnel that belongs to a non-orchestrator user. 
+func consumeAgentEvents(cfg *Config, registry *tunnelRegistry) { + for ev := range cfg.Agent.Events() { + u, ok := registry.lookup(ev.TunnelID) + if !ok { + cfg.Logger.Warn("sweep: agent event for unregistered tunnel; dropping", + "tunnel_id", ev.TunnelID, "kind", ev.Kind) + continue + } + var runlogEvent runlog.Event + switch ev.Kind { + case agent.EventPreCommitLog: + runlogEvent = runlog.EventPreCommitLog + case agent.EventApplied: + runlogEvent = runlog.EventApplied + default: + continue + } + row := runlog.Row{ + RunID: cfg.RunID, + UserIndex: u.idx, + UserPubkey: u.pubkey.String(), + TunnelID: u.tunnelID, + Event: runlogEvent, + TNs: ev.At.UnixNano(), + NAfterEvent: 0, // active-count state is owned by the sweep goroutine and not safe to read here + } + if err := cfg.Runlog.Append(row); err != nil { + cfg.Logger.Warn("sweep: runlog append failed for agent event", + "err", err, "kind", runlogEvent, "tunnel_id", ev.TunnelID) + } + } +} + // provision walks 0 → Target in batches, returning the slice of created users // so deprovision can iterate in reverse. Returns ctx.Err() if cancelled -// between users. -func provision(ctx context.Context, cfg *Config) ([]createdUser, error) { +// between users. Each created user is also registered with the tunnel +// registry so the agent-event consumer can attribute pre_commit_log / +// applied events back to a user_index. 
+func provision(ctx context.Context, cfg *Config, registry *tunnelRegistry) ([]createdUser, error) { if cfg.Target == 0 { return nil, nil } @@ -190,7 +283,9 @@ func provision(ctx context.Context, cfg *Config) ([]createdUser, error) { if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventConfirm, res.ConfirmedAt, activeCount); err != nil { return created, err } - created = append(created, createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID}) + cu := createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID} + created = append(created, cu) + registry.register(cu) activeCount++ if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventActivate, res.ActivatedAt, activeCount); err != nil { return created, err diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go index 3402d5fe8c..a45f688a57 100644 --- a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go +++ b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go @@ -63,6 +63,11 @@ type fakeExecutor struct { // Optional hook to fail on the Nth create (1-based) — used by the abort test. failCreateOnCall int failErr error + + // Optional gate: when non-nil, DeleteUser blocks on it after incrementing + // deleteN. Tests use this to interleave work between provision and + // deprovision (e.g., emitting agent events). + deleteGate <-chan struct{} } func newFakeExecutor(owner solana.PublicKey) *fakeExecutor { @@ -107,6 +112,9 @@ func (f *fakeExecutor) CreateUser(ctx context.Context, idx int) (sweep.CreateRes func (f *fakeExecutor) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) { calls := int(f.deleteN.Add(1)) + if f.deleteGate != nil { + <-f.deleteGate + } f.mu.Lock() // Remove the matching user from the active set. 
for i, u := range f.created { @@ -290,6 +298,114 @@ func TestRun_RejectsInvalidConfig(t *testing.T) { } } +// scriptedAgent is an agent.Runner used to drive the sweep's agent-event +// consumer from a test. Events are emitted via Emit() so the test can +// control timing — in production the agent log lags the on-chain CreateUser +// by far longer than registry registration takes, but in tests the executor +// is instantaneous and we need to emit AFTER provision has registered the +// tunnels. +type scriptedAgent struct { + out chan agent.Event +} + +func newScriptedAgent() *scriptedAgent { + return &scriptedAgent{out: make(chan agent.Event, 16)} +} + +func (s *scriptedAgent) Start(ctx context.Context) error { + go func() { + <-ctx.Done() + close(s.out) + }() + return nil +} + +func (s *scriptedAgent) Events() <-chan agent.Event { return s.out } + +func (s *scriptedAgent) Emit(e agent.Event) { s.out <- e } + +func TestRun_ConsumesAgentEventsForRegisteredTunnels(t *testing.T) { + t.Parallel() + + owner := solana.NewWallet().PublicKey() + exec := newFakeExecutor(owner) + // Block deprovision so the test can emit agent events while all created + // tunnels are registered but before agentCancel() shuts the consumer down. + gate := make(chan struct{}) + exec.deleteGate = gate + + ag := newScriptedAgent() + + path := filepath.Join(t.TempDir(), "orchestrator-runlog.json") + w, err := runlog.Open(path) + require.NoError(t, err) + t.Cleanup(func() { _ = w.Close() }) + + cfg := sweep.Config{ + RunID: "run-events", + Target: 2, + UsersPerBatch: 2, + Hold: 0, + OwnerFilter: owner, + Executor: exec, + Agent: ag, + Runlog: w, + Clock: &fakeClock{now: time.Unix(1_700_000_000, 0)}, + } + done := make(chan error, 1) + go func() { done <- sweep.Run(context.Background(), cfg) }() + + // Wait for deprovision to begin (deleteN >= 1) — this means provision is + // fully complete AND both tunnel registrations are in the registry. 
+ deadline := time.Now().Add(time.Second) + for exec.deleteN.Load() == 0 { + if time.Now().After(deadline) { + t.Fatal("sweep did not reach deprovision within 1s") + } + time.Sleep(time.Millisecond) + } + + // Emit events for both registered tunnels plus one unregistered one. + ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 500, At: time.Unix(1, 100)}) + ag.Emit(agent.Event{Kind: agent.EventApplied, TunnelID: 500, At: time.Unix(1, 200)}) + ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 999, At: time.Unix(1, 300)}) // unregistered; dropped + ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 501, At: time.Unix(1, 400)}) + ag.Emit(agent.Event{Kind: agent.EventApplied, TunnelID: 501, At: time.Unix(1, 500)}) + + close(gate) // unblock deprovision + + require.NoError(t, <-done) + require.NoError(t, w.Close()) + + rows := readRows(t, path) + + // Filter for the agent-driven rows so we don't depend on exact interleaving + // with the submit/confirm/activate stream emitted by provision. + var preCommit, applied []runlog.Row + for _, r := range rows { + switch r.Event { + case runlog.EventPreCommitLog: + preCommit = append(preCommit, r) + case runlog.EventApplied: + applied = append(applied, r) + } + } + require.Len(t, preCommit, 2, "two registered tunnels → two pre_commit_log rows; the unregistered tunnel 999 is dropped") + require.Len(t, applied, 2) + + // Tunnel 500 → user_index 0, Tunnel 501 → user_index 1 (fake executor assigns 500+idx). + for _, r := range preCommit { + switch r.TunnelID { + case 500: + assert.Equal(t, 0, r.UserIndex) + case 501: + assert.Equal(t, 1, r.UserIndex) + default: + t.Fatalf("unexpected tunnel id %d in pre_commit_log", r.TunnelID) + } + } +} + // Sanity: ctx cancellation between users is observed at the next iteration boundary. func TestRun_CancellationStopsBetweenUsers(t *testing.T) { t.Parallel()