From af53ce026d2788744c093b1c81e4ad7dd0140062 Mon Sep 17 00:00:00 2001
From: Greg Mitchell <greg@malbeclabs.com>
Date: Wed, 27 May 2026 05:16:32 +0000
Subject: [PATCH 1/5] sdk: add serviceability go executor CreateUser/DeleteUser
 + reconcile planner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the Solana-side primitives the device-stress orchestrator (#3746) needs:

- CreateUser / DeleteUser methods on the Go serviceability executor (variants
  36 / 42), with account-list construction mirroring the Rust SDK and a
  post-confirmation visibility wait so callers can record t_activate against
  the user PDA.
- PDA helpers: GetUserPDA, GetAccessPassPDA, GetTunnelIdsPDA,
  GetDzPrefixBlockPDA — seed bytes mirrored from
  smartcontract/programs/doublezero-serviceability/src/pda.rs.
- Pure PlanReconcile function and ReconcilePlan type for sweep delta planning,
  deterministic via ClientIp-ascending sort.
- Rust fixture generator extended to emit user_create_args.{bin,json} and
  user_delete_args.{bin,json}; Go tests load them as the cross-language wire
  format contract.

Part 1 of #3746 — library-only, no new binary.

Closes #3770.
---
 .../fixtures/generate-fixtures/Cargo.lock     |   4 +-
 .../fixtures/generate-fixtures/src/main.rs    |  57 +++
 .../testdata/fixtures/user_create_args.bin    | Bin 0 -> 11 bytes
 .../testdata/fixtures/user_create_args.json   |  31 ++
 .../testdata/fixtures/user_delete_args.bin    |   1 +
 .../testdata/fixtures/user_delete_args.json   |  16 +
 .../sdk/go/serviceability/executor.go         | 300 +++++++++++++-
 smartcontract/sdk/go/serviceability/pda.go    |  62 ++-
 .../sdk/go/serviceability/pda_test.go         | 113 ++++++
 .../sdk/go/serviceability/reconcile.go        |  59 +++
 .../sdk/go/serviceability/reconcile_test.go   | 173 ++++++++
 .../sdk/go/serviceability/user_crud_test.go   | 376 ++++++++++++++++++
 12 files changed, 1188 insertions(+), 4 deletions(-)
 create mode 100644 sdk/serviceability/testdata/fixtures/user_create_args.bin
 create mode 100644 sdk/serviceability/testdata/fixtures/user_create_args.json
 create mode 100644 sdk/serviceability/testdata/fixtures/user_delete_args.bin
 create mode 100644 sdk/serviceability/testdata/fixtures/user_delete_args.json
 create mode 100644 smartcontract/sdk/go/serviceability/pda_test.go
 create mode 100644 smartcontract/sdk/go/serviceability/reconcile.go
 create mode 100644 smartcontract/sdk/go/serviceability/reconcile_test.go
 create mode 100644 smartcontract/sdk/go/serviceability/user_crud_test.go

diff --git a/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock b/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock
index cb2af95041..9343abf199 100644
--- a/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock
+++ b/sdk/serviceability/testdata/fixtures/generate-fixtures/Cargo.lock
@@ -346,7 +346,7 @@ dependencies = [
 
 [[package]]
 name = "doublezero-program-common"
-version = "0.23.0"
+version = "0.24.0"
 dependencies = [
  "borsh 1.6.0",
  "byteorder",
@@ -358,7 +358,7 @@ dependencies = [
 
 [[package]]
 name = "doublezero-serviceability"
-version = "0.23.0"
+version = "0.24.0"
 dependencies = [
  "bitflags",
  "borsh 1.6.0",
diff --git a/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs b/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs
index b8ed35df67..9823a5776c 100644
--- a/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs
+++ b/sdk/serviceability/testdata/fixtures/generate-fixtures/src/main.rs
@@ -17,6 +17,9 @@ use borsh::BorshSerialize;
 
 use doublezero_serviceability::id_allocator::IdAllocator;
 use doublezero_serviceability::ip_allocator::IpAllocator;
+use doublezero_serviceability::processors::user::{
+    create::UserCreateArgs, delete::UserDeleteArgs,
+};
 use doublezero_serviceability::programversion::ProgramVersion;
 use doublezero_serviceability::state::{
     accesspass::{AccessPass, AccessPassStatus, AccessPassType},
@@ -95,11 +98,65 @@ fn main() {
     generate_tenant(&fixtures_dir);
     generate_resource_extension_id(&fixtures_dir);
     generate_resource_extension_ip(&fixtures_dir);
+    generate_user_create_args(&fixtures_dir);
+    generate_user_delete_args(&fixtures_dir);
 
     println!("
 all fixtures generated in {}", fixtures_dir.display());
 }
 
+/// Writes the `user_create_args` fixture: borsh-encoded `UserCreateArgs` (the body of
+/// instruction variant 36, without the 1-byte discriminant). Field order: user_type, cyoa_type,
+/// client_ip, tunnel_endpoint, dz_prefix_count. Non-default IP octets expose byte-order mistakes.
+fn generate_user_create_args(dir: &Path) {
+    let val = UserCreateArgs {
+        user_type: UserType::IBRL,
+        cyoa_type: UserCYOA::GREOverDIA,
+        client_ip: Ipv4Addr::new(10, 11, 12, 13),
+        tunnel_endpoint: Ipv4Addr::new(192, 168, 1, 2),
+        dz_prefix_count: 2,
+    };
+
+    let data = borsh::to_vec(&val).unwrap(); // args only; no discriminant byte is prepended
+
+    let meta = FixtureMeta {
+        name: "UserCreateArgs".into(),
+        // Not an account; account_type=0 since this is an instruction-args fixture.
+        account_type: 0,
+        fields: vec![ // hand-written mirror of `val` above; keep the two in sync when editing
+            FieldValue { name: "UserType".into(), value: "0".into(), typ: "u8".into() },
+            FieldValue { name: "CyoaType".into(), value: "1".into(), typ: "u8".into() },
+            FieldValue { name: "ClientIp".into(), value: "10.11.12.13".into(), typ: "ipv4".into() },
+            FieldValue { name: "TunnelEndpoint".into(), value: "192.168.1.2".into(), typ: "ipv4".into() },
+            FieldValue { name: "DzPrefixCount".into(), value: "2".into(), typ: "u8".into() },
+        ],
+    };
+
+    write_fixture(dir, "user_create_args", &data, &meta); // base name shared by the .bin/.json pair
+}
+
+/// Writes the `user_delete_args` fixture: borsh-encoded `UserDeleteArgs` (the body of instruction
+/// variant 42, without the 1-byte discriminant). Field order: dz_prefix_count, multicast_publisher_count.
+fn generate_user_delete_args(dir: &Path) {
+    let val = UserDeleteArgs {
+        dz_prefix_count: 3, // distinct from multicast_publisher_count so a field swap is detectable
+        multicast_publisher_count: 1,
+    };
+
+    let data = borsh::to_vec(&val).unwrap(); // args only; no discriminant byte is prepended
+
+    let meta = FixtureMeta {
+        name: "UserDeleteArgs".into(),
+        account_type: 0, // same value the UserCreateArgs fixture uses for "not an account"
+        fields: vec![ // hand-written mirror of `val` above; keep the two in sync when editing
+            FieldValue { name: "DzPrefixCount".into(), value: "3".into(), typ: "u8".into() },
+            FieldValue { name: "MulticastPublisherCount".into(), value: "1".into(), typ: "u8".into() },
+        ],
+    };
+
+    write_fixture(dir, "user_delete_args", &data, &meta); // base name shared by the .bin/.json pair
+}
+
 fn generate_global_state(dir: &Path) {
     let foundation_pk = pubkey_from_byte(0x01);
     let activator_pk = pubkey_from_byte(0x02);
diff --git a/sdk/serviceability/testdata/fixtures/user_create_args.bin b/sdk/serviceability/testdata/fixtures/user_create_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8971a4e3a86073f24c2dcc645ad4f9406c873caf
GIT binary patch
literal 11
ScmZQz<l^SxJ+Ok2i3tD)(gB?S

literal 0
HcmV?d00001

diff --git a/sdk/serviceability/testdata/fixtures/user_create_args.json b/sdk/serviceability/testdata/fixtures/user_create_args.json
new file mode 100644
index 0000000000..93922f3343
--- /dev/null
+++ b/sdk/serviceability/testdata/fixtures/user_create_args.json
@@ -0,0 +1,31 @@
+{
+  "name": "UserCreateArgs",
+  "account_type": 0,
+  "fields": [
+    {
+      "name": "UserType",
+      "value": "0",
+      "typ": "u8"
+    },
+    {
+      "name": "CyoaType",
+      "value": "1",
+      "typ": "u8"
+    },
+    {
+      "name": "ClientIp",
+      "value": "10.11.12.13",
+      "typ": "ipv4"
+    },
+    {
+      "name": "TunnelEndpoint",
+      "value": "192.168.1.2",
+      "typ": "ipv4"
+    },
+    {
+      "name": "DzPrefixCount",
+      "value": "2",
+      "typ": "u8"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/sdk/serviceability/testdata/fixtures/user_delete_args.bin b/sdk/serviceability/testdata/fixtures/user_delete_args.bin
new file mode 100644
index 0000000000..d8d3825962
--- /dev/null
+++ b/sdk/serviceability/testdata/fixtures/user_delete_args.bin
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/sdk/serviceability/testdata/fixtures/user_delete_args.json b/sdk/serviceability/testdata/fixtures/user_delete_args.json
new file mode 100644
index 0000000000..46d892aac6
--- /dev/null
+++ b/sdk/serviceability/testdata/fixtures/user_delete_args.json
@@ -0,0 +1,16 @@
+{
+  "name": "UserDeleteArgs",
+  "account_type": 0,
+  "fields": [
+    {
+      "name": "DzPrefixCount",
+      "value": "3",
+      "typ": "u8"
+    },
+    {
+      "name": "MulticastPublisherCount",
+      "value": "1",
+      "typ": "u8"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/smartcontract/sdk/go/serviceability/executor.go b/smartcontract/sdk/go/serviceability/executor.go
index eefffa5bd6..1049d4da6d 100644
--- a/smartcontract/sdk/go/serviceability/executor.go
+++ b/smartcontract/sdk/go/serviceability/executor.go
@@ -16,6 +16,8 @@ import (
 )
 
 const (
+	instructionCreateUser       = 36
+	instructionDeleteUser       = 42
 	instructionSetDeviceHealth  = 83
 	instructionSetLinkHealth    = 84
 	instructionSetUserBGPStatus = 106
@@ -165,6 +167,250 @@ func (e *Executor) SetLinkHealthBatch(ctx context.Context, updates []LinkHealthU
 	return lastSig, ErrAllUpdatesFailed
 }
 
+// UserCreateArgs bundles every input the Go executor needs to submit a CreateUser
+// instruction (variant 36). The first five fields are borsh-encoded into the
+// instruction payload exactly matching Rust's `UserCreateArgs`; the trailing
+// DevicePubkey/TenantPubkey are only used to derive AccountMeta entries.
+type UserCreateArgs struct {
+	UserType       UserUserType // payload byte 1; also the last seed of the user PDA
+	CyoaType       CyoaType // payload byte 2
+	ClientIP       [4]byte // payload bytes 3-6; also seeds the user and accesspass PDAs
+	TunnelEndpoint [4]byte // payload bytes 7-10
+	DzPrefixCount  uint8 // payload byte 11; must be > 0; number of dz_prefix_block accounts passed
+
+	// DevicePubkey identifies the device the user attaches to; required.
+	DevicePubkey solana.PublicKey
+	// TenantPubkey is the optional tenant association; pass the zero pubkey to omit.
+	TenantPubkey solana.PublicKey
+}
+
+// CreateUser submits a CreateUser instruction (variant 36), then polls until the user PDA is
+// visible on-chain. Returns the signature and derived user PDA so the caller can correlate
+// (e.g., record t_activate); both are also returned alongside a send or visibility error.
+func (e *Executor) CreateUser(ctx context.Context, args UserCreateArgs) (solana.Signature, solana.PublicKey, error) {
+	if e.signer == nil {
+		return solana.Signature{}, solana.PublicKey{}, ErrNoPrivateKey
+	}
+	if e.programID.IsZero() {
+		return solana.Signature{}, solana.PublicKey{}, ErrNoProgramID
+	}
+	if args.DzPrefixCount == 0 { // validated before any PDA derivation or RPC traffic
+		return solana.Signature{}, solana.PublicKey{}, errors.New("UserCreateArgs.DzPrefixCount must be > 0")
+	}
+	if args.DevicePubkey.IsZero() {
+		return solana.Signature{}, solana.PublicKey{}, errors.New("UserCreateArgs.DevicePubkey is required")
+	}
+
+	instr, userPDA, err := e.buildCreateUserInstruction(args)
+	if err != nil {
+		return solana.Signature{}, solana.PublicKey{}, fmt.Errorf("build CreateUser instruction: %w", err)
+	}
+
+	sig, _, err := e.executeTransaction(ctx, []solana.Instruction{instr})
+	if err != nil {
+		return sig, userPDA, err // sig is whatever executeTransaction returned with the error
+	}
+
+	if err := e.waitForAccountVisible(ctx, userPDA, e.waitForVisibleTimeout); err != nil { // send succeeded; only visibility can fail here
+		return sig, userPDA, fmt.Errorf("post-confirm visibility timeout for user PDA: %w", err)
+	}
+	return sig, userPDA, nil
+}
+
+// DeleteUser submits a DeleteUser instruction (variant 42) and waits for the user
+// PDA to disappear from chain. The function reads the user account first so it can
+// derive the accesspass and device-dependent PDAs plus the optional tenant and owner accounts.
+func (e *Executor) DeleteUser(ctx context.Context, userPubkey solana.PublicKey) (solana.Signature, error) {
+	if e.signer == nil {
+		return solana.Signature{}, ErrNoPrivateKey
+	}
+	if e.programID.IsZero() {
+		return solana.Signature{}, ErrNoProgramID
+	}
+
+	info, err := e.rpc.GetAccountInfo(ctx, userPubkey)
+	if err != nil {
+		return solana.Signature{}, fmt.Errorf("fetch user account %s: %w", userPubkey, err)
+	}
+	if info == nil || info.Value == nil {
+		return solana.Signature{}, fmt.Errorf("user account %s not found", userPubkey)
+	}
+	rawData := info.Value.Data.GetBinary()
+	if len(rawData) == 0 {
+		return solana.Signature{}, fmt.Errorf("user account %s has empty data", userPubkey)
+	}
+	var user User
+	DeserializeUser(NewByteReader(rawData), &user) // result not inspected; the AccountType check below is the only validation
+	if user.AccountType != UserType {
+		return solana.Signature{}, fmt.Errorf("account %s is not a User (type=%d)", userPubkey, user.AccountType)
+	}
+	user.PubKey = userPubkey
+
+	// Both counts are fixed at 1, mirroring the Rust SDK, which assumes every user holds
+	// exactly one DzPrefixBlock; deriving real values would need the Device record.
+	// NOTE(review): CreateUser accepts any DzPrefixCount > 0, so a user created here with
+	// more than one block may get a mismatched account list on delete — confirm on-chain.
+	const dzPrefixCount uint8 = 1
+	const multicastPublisherCount uint8 = 1
+
+	instr, err := e.buildDeleteUserInstruction(userPubkey, user, dzPrefixCount, multicastPublisherCount)
+	if err != nil {
+		return solana.Signature{}, fmt.Errorf("build DeleteUser instruction: %w", err)
+	}
+
+	sig, _, err := e.executeTransaction(ctx, []solana.Instruction{instr})
+	if err != nil {
+		return sig, err // sig is whatever executeTransaction returned with the error
+	}
+
+	if err := e.waitForAccountGone(ctx, userPubkey, e.waitForVisibleTimeout); err != nil { // same timeout budget as CreateUser's visibility wait
+		return sig, fmt.Errorf("post-confirm visibility timeout waiting for user PDA closure: %w", err)
+	}
+	return sig, nil
+}
+
+// buildCreateUserInstruction packs the variant-36 payload and assembles the account
+// list in the order the on-chain processor expects:
+//
+//	[user_pda, device, accesspass, globalstate,
+//	 user_tunnel_block, multicast_publisher_block, device_tunnel_ids,
+//	 dz_prefix_block[0..N], optional_tenant, payer, system]
+func (e *Executor) buildCreateUserInstruction(args UserCreateArgs) (solana.Instruction, solana.PublicKey, error) {
+	data := make([]byte, 12) // 1 discriminant byte + 11 bytes of args
+	data[0] = instructionCreateUser
+	data[1] = byte(args.UserType)
+	data[2] = byte(args.CyoaType)
+	copy(data[3:7], args.ClientIP[:]) // octets copied in array order
+	copy(data[7:11], args.TunnelEndpoint[:])
+	data[11] = args.DzPrefixCount
+
+	userPDA, _, err := GetUserPDA(e.programID, args.ClientIP, args.UserType)
+	if err != nil {
+		return nil, solana.PublicKey{}, fmt.Errorf("derive user PDA: %w", err)
+	}
+	accessPassPDA, _, err := GetAccessPassPDA(e.programID, args.ClientIP, e.signer.PublicKey()) // signer acts as user_payer
+	if err != nil {
+		return nil, userPDA, fmt.Errorf("derive accesspass PDA: %w", err)
+	}
+	globalStatePDA, _, err := GetGlobalStatePDA(e.programID)
+	if err != nil {
+		return nil, userPDA, fmt.Errorf("derive globalstate PDA: %w", err)
+	}
+	userTunnelBlockPDA, _, err := GetUserTunnelBlockPDA(e.programID)
+	if err != nil {
+		return nil, userPDA, fmt.Errorf("derive user tunnel block PDA: %w", err)
+	}
+	mcPublisherBlockPDA, _, err := GetMulticastPublisherBlockPDA(e.programID)
+	if err != nil {
+		return nil, userPDA, fmt.Errorf("derive multicast publisher block PDA: %w", err)
+	}
+	tunnelIdsPDA, _, err := GetTunnelIdsPDA(e.programID, args.DevicePubkey, 0) // index is always 0 here
+	if err != nil {
+		return nil, userPDA, fmt.Errorf("derive device tunnel ids PDA: %w", err)
+	}
+
+	accounts := solana.AccountMetaSlice{ // fixed prefix; every entry is marked writable
+		solana.Meta(userPDA).WRITE(),
+		solana.Meta(args.DevicePubkey).WRITE(),
+		solana.Meta(accessPassPDA).WRITE(),
+		solana.Meta(globalStatePDA).WRITE(),
+		solana.Meta(userTunnelBlockPDA).WRITE(),
+		solana.Meta(mcPublisherBlockPDA).WRITE(),
+		solana.Meta(tunnelIdsPDA).WRITE(),
+	}
+	for i := uint64(0); i < uint64(args.DzPrefixCount); i++ { // one block per index 0..DzPrefixCount-1, in order
+		dzPrefixPDA, _, err := GetDzPrefixBlockPDA(e.programID, args.DevicePubkey, i)
+		if err != nil {
+			return nil, userPDA, fmt.Errorf("derive dz_prefix_block[%d] PDA: %w", i, err)
+		}
+		accounts = append(accounts, solana.Meta(dzPrefixPDA).WRITE())
+	}
+	if !args.TenantPubkey.IsZero() { // zero pubkey means "no tenant": the slot is omitted, not zero-filled
+		accounts = append(accounts, solana.Meta(args.TenantPubkey).WRITE())
+	}
+	accounts = append(accounts,
+		solana.Meta(e.signer.PublicKey()).SIGNER().WRITE(),
+		solana.Meta(solana.SystemProgramID), // neither signer nor writable
+	)
+
+	return &genericInstruction{
+		programID:            e.programID,
+		accounts:             accounts,
+		data:                 data,
+		skipPermissionInject: true, // executeTransaction must not append the Permission PDA to this list
+	}, userPDA, nil
+}
+
+// buildDeleteUserInstruction packs the variant-42 payload and assembles the account
+// list in the order the on-chain processor expects:
+//
+//	[user, accesspass, globalstate, device,
+//	 user_tunnel_block, multicast_publisher_block, device_tunnel_ids,
+//	 dz_prefix_block[0..N], optional_tenant, owner, payer, system]
+//
+// `multicastPublisherCount` mirrors the Rust SDK's behavior: the on-chain processor
+// consumes the MulticastPublisherBlock slot unconditionally for the variant-42
+// layout, so DeleteUser's caller passes 1 even when the user was not created as a
+// publisher. Exposed as a parameter so the byte-encoding can be tested independently.
+func (e *Executor) buildDeleteUserInstruction(userPubkey solana.PublicKey, user User, dzPrefixCount, multicastPublisherCount uint8) (solana.Instruction, error) {
+	data := []byte{instructionDeleteUser, dzPrefixCount, multicastPublisherCount} // discriminant, then the two u8 args
+
+	accessPassPDA, _, err := GetAccessPassPDA(e.programID, user.ClientIp, user.Owner) // keyed by the stored owner, not the current signer
+	if err != nil {
+		return nil, fmt.Errorf("derive accesspass PDA: %w", err)
+	}
+	globalStatePDA, _, err := GetGlobalStatePDA(e.programID)
+	if err != nil {
+		return nil, fmt.Errorf("derive globalstate PDA: %w", err)
+	}
+	devicePubkey := solana.PublicKeyFromBytes(user.DevicePubKey[:]) // device comes from the user record, not from the caller
+	userTunnelBlockPDA, _, err := GetUserTunnelBlockPDA(e.programID)
+	if err != nil {
+		return nil, fmt.Errorf("derive user tunnel block PDA: %w", err)
+	}
+	mcPublisherBlockPDA, _, err := GetMulticastPublisherBlockPDA(e.programID)
+	if err != nil {
+		return nil, fmt.Errorf("derive multicast publisher block PDA: %w", err)
+	}
+	tunnelIdsPDA, _, err := GetTunnelIdsPDA(e.programID, devicePubkey, 0) // index is always 0 here
+	if err != nil {
+		return nil, fmt.Errorf("derive device tunnel ids PDA: %w", err)
+	}
+
+	accounts := solana.AccountMetaSlice{ // note: accesspass/globalstate precede device, unlike CreateUser
+		solana.Meta(userPubkey).WRITE(),
+		solana.Meta(accessPassPDA).WRITE(),
+		solana.Meta(globalStatePDA).WRITE(),
+		solana.Meta(devicePubkey).WRITE(),
+		solana.Meta(userTunnelBlockPDA).WRITE(),
+		solana.Meta(mcPublisherBlockPDA).WRITE(),
+		solana.Meta(tunnelIdsPDA).WRITE(),
+	}
+	for i := uint64(0); i < uint64(dzPrefixCount); i++ { // one block per index 0..dzPrefixCount-1, in order
+		dzPrefixPDA, _, err := GetDzPrefixBlockPDA(e.programID, devicePubkey, i)
+		if err != nil {
+			return nil, fmt.Errorf("derive dz_prefix_block[%d] PDA: %w", i, err)
+		}
+		accounts = append(accounts, solana.Meta(dzPrefixPDA).WRITE())
+	}
+	var zeroPK [32]uint8
+	if user.TenantPubKey != zeroPK { // an all-zero tenant key means no tenant account is passed
+		accounts = append(accounts, solana.Meta(solana.PublicKeyFromBytes(user.TenantPubKey[:])).WRITE())
+	}
+	accounts = append(accounts,
+		solana.Meta(solana.PublicKeyFromBytes(user.Owner[:])).WRITE(), // owner is writable but not a signer
+		solana.Meta(e.signer.PublicKey()).SIGNER().WRITE(),
+		solana.Meta(solana.SystemProgramID),
+	)
+
+	return &genericInstruction{
+		programID:            e.programID,
+		accounts:             accounts,
+		data:                 data,
+		skipPermissionInject: true, // executeTransaction must not append the Permission PDA to this list
+	}, nil
+}
+
 // UserBGPStatusUpdate holds the parameters for a single SetUserBGPStatus submission.
 type UserBGPStatusUpdate struct {
 	UserPubkey   solana.PublicKey
@@ -231,6 +477,11 @@ type genericInstruction struct {
 	programID solana.PublicKey
 	accounts  solana.AccountMetaSlice
 	data      []byte
+	// skipPermissionInject suppresses the executor's auto-appending of the Permission PDA.
+	// CreateUser/DeleteUser opt out because the on-chain processor uses accounts.len()
+	// to detect the optional tenant account; appending a trailing Permission shifts that
+	// count and would mis-classify accounts.
+	skipPermissionInject bool
 }
 
 func (i *genericInstruction) ProgramID() solana.PublicKey {
@@ -278,7 +529,7 @@ func (e *Executor) executeTransaction(ctx context.Context, instructions []solana
 	e.resolvePermissionPDA(ctx)
 	if e.permissionPDA != nil {
 		for _, instr := range instructions {
-			if gi, ok := instr.(*genericInstruction); ok {
+			if gi, ok := instr.(*genericInstruction); ok && !gi.skipPermissionInject {
 				gi.accounts = append(gi.accounts, solana.Meta(*e.permissionPDA))
 			}
 		}
@@ -332,6 +583,53 @@ func (e *Executor) executeTransaction(ctx context.Context, instructions []solana
 	return sig, res, nil
 }
 
+// waitForAccountVisible polls GetAccountInfo until the given account is observable
+// on-chain, or the deadline expires. Used post-CreateUser to give the caller a
+// timestamp anchored to when the user PDA actually appears.
+func (e *Executor) waitForAccountVisible(ctx context.Context, pubkey solana.PublicKey, timeout time.Duration) error {
+	deadline := time.Now().Add(timeout)
+	for {
+		info, err := e.rpc.GetAccountInfo(ctx, pubkey)
+		if err == nil && info != nil && info.Value != nil { // any RPC error counts as "not yet visible" and is retried
+			return nil
+		}
+		if time.Now().After(deadline) { // checked after the poll, so at least one GetAccountInfo call is always made
+			if err != nil {
+				return fmt.Errorf("account %s not visible: %w", pubkey, err) // wraps the most recent RPC error
+			}
+			return fmt.Errorf("account %s not visible before deadline", pubkey)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err() // cancellation is returned unwrapped
+		case <-time.After(250 * time.Millisecond): // fixed poll interval
+		}
+	}
+}
+
+// waitForAccountGone polls GetAccountInfo until the account is absent (nil error with nil info/Value) or the
+// deadline expires. NOTE(review): solana-go's rpc.Client reports absence as rpc.ErrNotFound — confirm e.rpc does not.
+func (e *Executor) waitForAccountGone(ctx context.Context, pubkey solana.PublicKey, timeout time.Duration) error {
+	deadline := time.Now().Add(timeout)
+	for {
+		info, err := e.rpc.GetAccountInfo(ctx, pubkey)
+		if err == nil && (info == nil || info.Value == nil) { // an error never counts as "gone"; it is retried
+			return nil
+		}
+		if time.Now().After(deadline) { // checked after the poll, so at least one GetAccountInfo call is always made
+			if err != nil {
+				return fmt.Errorf("account %s still present: %w", pubkey, err) // wraps the most recent RPC error
+			}
+			return fmt.Errorf("account %s still present before deadline", pubkey)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err() // cancellation is returned unwrapped
+		case <-time.After(250 * time.Millisecond): // fixed poll interval
+		}
+	}
+}
+
 func (e *Executor) waitForSignatureVisible(ctx context.Context, sig solana.Signature, timeout time.Duration) error {
 	deadline := time.Now().Add(timeout)
 
diff --git a/smartcontract/sdk/go/serviceability/pda.go b/smartcontract/sdk/go/serviceability/pda.go
index c147d3e103..39a2c39dbf 100644
--- a/smartcontract/sdk/go/serviceability/pda.go
+++ b/smartcontract/sdk/go/serviceability/pda.go
@@ -1,6 +1,10 @@
 package serviceability
 
-import "github.com/gagliardetto/solana-go"
+import (
+	"encoding/binary"
+
+	"github.com/gagliardetto/solana-go"
+)
 
 // PDA seeds matching Rust implementation in seeds.rs
 const (
@@ -16,6 +20,10 @@ const (
 	SeedMulticastPublisherBlock = "multicastpublisherblock"
 	SeedTenant                  = "tenant"
 	SeedPermission              = "permission"
+	SeedUser                    = "user"
+	SeedAccessPass              = "accesspass"
+	SeedTunnelIds               = "tunnelids"
+	SeedDzPrefixBlock           = "dzprefixblock"
 )
 
 // DeriveGlobalStatePDA derives the PDA for the GlobalState account.
@@ -123,3 +131,55 @@ func GetPermissionPDA(programID solana.PublicKey, userPayer solana.PublicKey) (s
 	}
 	return solana.FindProgramAddress(seeds, programID)
 }
+
+// GetUserPDA derives the PDA (and bump) for a User account, keyed by (client_ip, user_type).
+// Mirrors smartcontract/programs/doublezero-serviceability/src/pda.rs:get_user_pda.
+func GetUserPDA(programID solana.PublicKey, clientIP [4]byte, userType UserUserType) (solana.PublicKey, uint8, error) {
+	seeds := [][]byte{
+		[]byte(SeedPrefix),
+		[]byte(SeedUser),
+		clientIP[:], // four octets in array order
+		{byte(userType)}, // single-byte seed, so each user type gets its own PDA per IP
+	}
+	return solana.FindProgramAddress(seeds, programID)
+}
+
+// GetAccessPassPDA derives the PDA (and bump) for an AccessPass account, keyed by (client_ip, user_payer).
+// Mirrors smartcontract/programs/doublezero-serviceability/src/pda.rs:get_accesspass_pda.
+func GetAccessPassPDA(programID solana.PublicKey, clientIP [4]byte, userPayer solana.PublicKey) (solana.PublicKey, uint8, error) {
+	seeds := [][]byte{
+		[]byte(SeedPrefix),
+		[]byte(SeedAccessPass),
+		clientIP[:], // four octets in array order
+		userPayer[:], // full 32-byte public key
+	}
+	return solana.FindProgramAddress(seeds, programID)
+}
+
+// GetTunnelIdsPDA derives the PDA (and bump) for a per-device TunnelIds resource extension at the given index.
+// Rust uses usize (8 bytes on 64-bit) little-endian for the index; we always encode 8 bytes.
+func GetTunnelIdsPDA(programID solana.PublicKey, devicePK solana.PublicKey, index uint64) (solana.PublicKey, uint8, error) {
+	var idxBuf [8]byte
+	binary.LittleEndian.PutUint64(idxBuf[:], index) // fixed width: index 0 still contributes eight zero bytes
+	seeds := [][]byte{
+		[]byte(SeedPrefix),
+		[]byte(SeedTunnelIds),
+		devicePK[:], // full 32-byte device key
+		idxBuf[:],
+	}
+	return solana.FindProgramAddress(seeds, programID)
+}
+
+// GetDzPrefixBlockPDA derives the PDA (and bump) for a per-device DzPrefixBlock resource extension at the given index.
+// Rust uses usize (8 bytes on 64-bit) little-endian for the index; we always encode 8 bytes.
+func GetDzPrefixBlockPDA(programID solana.PublicKey, devicePK solana.PublicKey, index uint64) (solana.PublicKey, uint8, error) {
+	var idxBuf [8]byte
+	binary.LittleEndian.PutUint64(idxBuf[:], index) // fixed width: index 0 still contributes eight zero bytes
+	seeds := [][]byte{
+		[]byte(SeedPrefix),
+		[]byte(SeedDzPrefixBlock),
+		devicePK[:], // full 32-byte device key
+		idxBuf[:],
+	}
+	return solana.FindProgramAddress(seeds, programID)
+}
diff --git a/smartcontract/sdk/go/serviceability/pda_test.go b/smartcontract/sdk/go/serviceability/pda_test.go
new file mode 100644
index 0000000000..614e243fb0
--- /dev/null
+++ b/smartcontract/sdk/go/serviceability/pda_test.go
@@ -0,0 +1,113 @@
+package serviceability_test
+
+import (
+	"testing"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// PDAs are deterministic from (program_id, seeds), so we can cross-check the new
+// helpers against an independent recomputation that mirrors the Rust seed bytes
+// exactly. These tests catch typos in seed strings and width/endianness mistakes
+// in the index encoding without requiring the Rust binary at test time.
+
+func recomputePDA(t *testing.T, programID solana.PublicKey, seeds [][]byte) solana.PublicKey { // reference derivation from raw seed bytes
+	t.Helper()
+	pda, _, err := solana.FindProgramAddress(seeds, programID) // bump is discarded; tests compare addresses only
+	require.NoError(t, err)
+	return pda
+}
+
+func TestGetUserPDA_MatchesRustSeeds(t *testing.T) { // GetUserPDA must equal a derivation from literal seed bytes
+	t.Parallel()
+	programID := solana.NewWallet().PublicKey() // fresh random program ID per run
+	ip := [4]byte{198, 51, 100, 7}
+
+	got, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeIBRLWithAllocatedIP)
+	require.NoError(t, err)
+
+	want := recomputePDA(t, programID, [][]byte{
+		[]byte("doublezero"), // string literals rather than the package Seed* constants, so a typo there is caught
+		[]byte("user"),
+		ip[:],
+		{byte(serviceability.UserTypeIBRLWithAllocatedIP)},
+	})
+	assert.Equal(t, want, got)
+}
+
+func TestGetAccessPassPDA_MatchesRustSeeds(t *testing.T) { // GetAccessPassPDA must equal a derivation from literal seed bytes
+	t.Parallel()
+	programID := solana.NewWallet().PublicKey() // fresh random program ID per run
+	userPayer := solana.NewWallet().PublicKey()
+	ip := [4]byte{10, 0, 0, 5}
+
+	got, _, err := serviceability.GetAccessPassPDA(programID, ip, userPayer)
+	require.NoError(t, err)
+
+	want := recomputePDA(t, programID, [][]byte{
+		[]byte("doublezero"), // string literals rather than the package Seed* constants, so a typo there is caught
+		[]byte("accesspass"),
+		ip[:],
+		userPayer[:],
+	})
+	assert.Equal(t, want, got)
+}
+
+func TestGetTunnelIdsPDA_IndexIsEightByteLE(t *testing.T) { // pins the index seed to 8 bytes, least-significant byte first
+	t.Parallel()
+	programID := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+
+	for _, idx := range []uint64{0, 1, 7, 256, 0xDEAD_BEEF} { // 256 and 0xDEADBEEF set bytes past the first, exposing width/order bugs
+		got, _, err := serviceability.GetTunnelIdsPDA(programID, device, idx)
+		require.NoError(t, err)
+
+		// Build the index seed by hand (shifts, not encoding/binary): 8-byte little-endian.
+		idxBytes := []byte{
+			byte(idx), byte(idx >> 8), byte(idx >> 16), byte(idx >> 24),
+			byte(idx >> 32), byte(idx >> 40), byte(idx >> 48), byte(idx >> 56),
+		}
+		want := recomputePDA(t, programID, [][]byte{
+			[]byte("doublezero"),
+			[]byte("tunnelids"),
+			device[:],
+			idxBytes,
+		})
+		assert.Equal(t, want, got, "idx=%d", idx)
+	}
+}
+
+func TestGetDzPrefixBlockPDA_IndexIsEightByteLE(t *testing.T) { // single-index check of the 8-byte little-endian seed
+	t.Parallel()
+	programID := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+
+	idx := uint64(3)
+	got, _, err := serviceability.GetDzPrefixBlockPDA(programID, device, idx)
+	require.NoError(t, err)
+	want := recomputePDA(t, programID, [][]byte{
+		[]byte("doublezero"),
+		[]byte("dzprefixblock"),
+		device[:],
+		{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // uint64(3) spelled out as 8 little-endian bytes
+	})
+	assert.Equal(t, want, got)
+}
+
+// TestUserPDA_DiffersByUserType guards against accidentally dropping the
+// user_type byte from the seeds (which would collapse different user types onto
+// the same PDA). It derives two PDAs that differ only in user type and requires them to differ.
+func TestUserPDA_DiffersByUserType(t *testing.T) {
+	t.Parallel()
+	programID := solana.NewWallet().PublicKey()
+	ip := [4]byte{10, 0, 0, 7} // same IP and program ID for both derivations
+
+	pdaIBRL, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeIBRL)
+	require.NoError(t, err)
+	pdaMulticast, _, err := serviceability.GetUserPDA(programID, ip, serviceability.UserTypeMulticast)
+	require.NoError(t, err)
+	assert.NotEqual(t, pdaIBRL, pdaMulticast)
+}
diff --git a/smartcontract/sdk/go/serviceability/reconcile.go b/smartcontract/sdk/go/serviceability/reconcile.go
new file mode 100644
index 0000000000..d61e20ea91
--- /dev/null
+++ b/smartcontract/sdk/go/serviceability/reconcile.go
@@ -0,0 +1,59 @@
+package serviceability
+
+import (
+	"bytes"
+	"sort"
+
+	"github.com/gagliardetto/solana-go"
+)
+
+// ReconcilePlan describes the delta needed to drive the set of users owned by a
+// given key toward a desired count. PlanReconcile sets at most one of the two fields.
+type ReconcilePlan struct {
+	// ToCreate is the number of users to add. Always >= 0; zero whenever ToDelete is non-empty.
+	ToCreate int
+	// ToDelete lists user PDAs to remove, in the order they should be deleted.
+	// Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker, so
+	// repeated calls against the same input produce identical plans. Nil when nothing is deleted.
+	ToDelete []solana.PublicKey
+}
+
+// PlanReconcile decides what to create or delete so that the number of users
+// owned by ownerFilter equals target. Users with a different Owner are ignored
+// (neither counted nor deleted), which lets the stress orchestrator share a
+// program with other tenants without disturbing them.
+//
+// The function is pure — no I/O, and current is never reordered — so it is safe to call repeatedly
+// while the orchestrator polls live state. Returns a zero plan when target is negative.
+func PlanReconcile(current []User, target int, ownerFilter solana.PublicKey) ReconcilePlan {
+	if target < 0 {
+		return ReconcilePlan{}
+	}
+
+	var owned []User
+	for _, u := range current {
+		if bytes.Equal(u.Owner[:], ownerFilter[:]) {
+			owned = append(owned, u) // copies into a fresh slice, so the sort below never touches current
+		}
+	}
+
+	switch {
+	case len(owned) < target:
+		return ReconcilePlan{ToCreate: target - len(owned)} // ToDelete stays nil
+	case len(owned) > target:
+		sort.Slice(owned, func(i, j int) bool { // ClientIp bytes ascending, PubKey bytes as tiebreaker
+			if c := bytes.Compare(owned[i].ClientIp[:], owned[j].ClientIp[:]); c != 0 {
+				return c < 0
+			}
+			return bytes.Compare(owned[i].PubKey[:], owned[j].PubKey[:]) < 0
+		})
+		victims := owned[target:] // the first `target` users in sort order are kept; the rest are deleted
+		out := make([]solana.PublicKey, len(victims))
+		for i, u := range victims {
+			out[i] = solana.PublicKeyFromBytes(u.PubKey[:])
+		}
+		return ReconcilePlan{ToDelete: out}
+	default:
+		return ReconcilePlan{} // already at target: nothing to create or delete
+	}
+}
diff --git a/smartcontract/sdk/go/serviceability/reconcile_test.go b/smartcontract/sdk/go/serviceability/reconcile_test.go
new file mode 100644
index 0000000000..335094b7b8
--- /dev/null
+++ b/smartcontract/sdk/go/serviceability/reconcile_test.go
@@ -0,0 +1,173 @@
+package serviceability_test
+
+import (
+	"testing"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// makeUser is a tiny helper to build a User suitable for PlanReconcile testing:
+// only Owner, ClientIp, and PubKey actually influence the planner.
+func makeUser(owner solana.PublicKey, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User {
+	return serviceability.User{
+		Owner:    owner,
+		ClientIp: clientIP,
+		PubKey:   pubkey,
+	}
+}
+
+func TestPlanReconcile(t *testing.T) {
+	t.Parallel()
+
+	orchestrator := solana.NewWallet().PublicKey()
+	stranger := solana.NewWallet().PublicKey()
+
+	// Stable pubkeys so we can assert exact ordering.
+	u1 := solana.NewWallet().PublicKey()
+	u2 := solana.NewWallet().PublicKey()
+	u3 := solana.NewWallet().PublicKey()
+	u4 := solana.NewWallet().PublicKey()
+	u5 := solana.NewWallet().PublicKey()
+
+	ip := func(a, b, c, d byte) [4]byte { return [4]byte{a, b, c, d} }
+
+	tests := []struct {
+		name          string
+		current       []serviceability.User
+		target        int
+		owner         solana.PublicKey
+		wantCreate    int
+		wantDeleteIPs [][4]byte // ClientIp order we expect to see in ToDelete
+	}{
+		{
+			name:       "zero to N",
+			current:    nil,
+			target:     4,
+			owner:      orchestrator,
+			wantCreate: 4,
+		},
+		{
+			name: "N to zero deletes in ip-ascending order",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 3)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u3, ip(10, 0, 0, 4)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
+			},
+			target:        0,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)},
+		},
+		{
+			name: "partial trim deletes only the overflow",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 5)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 4)),
+				makeUser(orchestrator, u3, ip(10, 0, 0, 3)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
+				makeUser(orchestrator, u5, ip(10, 0, 0, 1)),
+			},
+			target:        3,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)},
+		},
+		{
+			name: "partial grow asks for the missing count",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+			},
+			target:     5,
+			owner:      orchestrator,
+			wantCreate: 3,
+		},
+		{
+			name: "only foreign users present grows by full target",
+			current: []serviceability.User{
+				makeUser(stranger, u1, ip(10, 0, 0, 1)),
+				makeUser(stranger, u2, ip(10, 0, 0, 2)),
+				makeUser(stranger, u3, ip(10, 0, 0, 3)),
+			},
+			target:     2,
+			owner:      orchestrator,
+			wantCreate: 2,
+		},
+		{
+			name: "mixed ownership only counts and deletes owned",
+			current: []serviceability.User{
+				makeUser(stranger, u1, ip(10, 0, 0, 9)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+				makeUser(stranger, u3, ip(10, 0, 0, 8)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 1)),
+			},
+			target:        1,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)},
+		},
+		{
+			name: "already at target produces zero plan",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+			},
+			target:     2,
+			owner:      orchestrator,
+			wantCreate: 0,
+		},
+		{
+			name: "negative target produces zero plan",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+			},
+			target:     -1,
+			owner:      orchestrator,
+			wantCreate: 0,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			plan := serviceability.PlanReconcile(tc.current, tc.target, tc.owner)
+			assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate")
+			require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length")
+
+			// Resolve expected pubkeys via ClientIp lookup against the current set.
+			ipToPubkey := map[[4]byte]solana.PublicKey{}
+			for _, u := range tc.current {
+				ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:])
+			}
+			for i, ipKey := range tc.wantDeleteIPs {
+				assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey)
+			}
+		})
+	}
+}
+
+func TestPlanReconcile_TieBreaksByPubkey(t *testing.T) {
+	t.Parallel()
+
+	orchestrator := solana.NewWallet().PublicKey()
+	sharedIP := [4]byte{10, 0, 0, 1}
+
+	// Two users with the same ClientIp (artificial — onchain the IP is part of
+	// the PDA seed so collisions can't happen, but the tiebreak must still be
+	// deterministic).
+	pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
+	pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
+
+	plan := serviceability.PlanReconcile([]serviceability.User{
+		makeUser(orchestrator, pkB, sharedIP),
+		makeUser(orchestrator, pkA, sharedIP),
+	}, 0, orchestrator)
+
+	require.Len(t, plan.ToDelete, 2)
+	// pkA (0xAA…) sorts before pkB (0xBB…).
+	assert.Equal(t, pkA, plan.ToDelete[0])
+	assert.Equal(t, pkB, plan.ToDelete[1])
+}
diff --git a/smartcontract/sdk/go/serviceability/user_crud_test.go b/smartcontract/sdk/go/serviceability/user_crud_test.go
new file mode 100644
index 0000000000..808df0a8d2
--- /dev/null
+++ b/smartcontract/sdk/go/serviceability/user_crud_test.go
@@ -0,0 +1,376 @@
+package serviceability
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/gagliardetto/solana-go"
+	solanarpc "github.com/gagliardetto/solana-go/rpc"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// loadArgsFixture reads sdk/serviceability/testdata/fixtures/<name>.bin, resolved
+// four directories above this test file, for the cross-language wire-format check.
+func loadArgsFixture(t *testing.T, name string) []byte {
+	t.Helper()
+	_, filename, _, _ := runtime.Caller(0)
+	dir := filepath.Join(filepath.Dir(filename), "..", "..", "..", "..", "sdk", "serviceability", "testdata", "fixtures")
+	bin, err := os.ReadFile(filepath.Join(dir, name+".bin"))
+	require.NoErrorf(t, err, "reading %s.bin", name)
+	return bin
+}
+
+func TestBuildCreateUserInstruction(t *testing.T) {
+	t.Parallel()
+
+	rpc := &mockRPCClient{}
+	executor, _ := newTestExecutor(t, rpc)
+
+	args := UserCreateArgs{
+		UserType:       UserTypeIBRL,
+		CyoaType:       CyoaTypeGREOverDIA,
+		ClientIP:       [4]byte{10, 11, 12, 13},
+		TunnelEndpoint: [4]byte{192, 168, 1, 2},
+		DzPrefixCount:  2,
+		DevicePubkey:   solana.NewWallet().PublicKey(),
+	}
+
+	instr, userPDA, err := executor.buildCreateUserInstruction(args)
+	require.NoError(t, err)
+
+	// Variant byte + 11-byte borsh body matching Rust UserCreateArgs.
+	data, err := instr.Data()
+	require.NoError(t, err)
+	require.Len(t, data, 12, "opcode (1) + borsh UserCreateArgs (11) = 12 bytes")
+	assert.Equal(t, byte(instructionCreateUser), data[0])
+	assert.Equal(t, loadArgsFixture(t, "user_create_args"), data[1:],
+		"borsh body must match Rust-generated user_create_args.bin")
+
+	// User PDA derivation is deterministic from (program_id, client_ip, user_type).
+	expectedPDA, _, err := GetUserPDA(executor.programID, args.ClientIP, args.UserType)
+	require.NoError(t, err)
+	assert.Equal(t, expectedPDA, userPDA)
+
+	// Account count = 7 fixed + DzPrefixCount + payer + system (no tenant).
+	accs := instr.Accounts()
+	require.Len(t, accs, 7+int(args.DzPrefixCount)+2)
+	assert.Equal(t, userPDA, accs[0].PublicKey)
+	assert.True(t, accs[0].IsWritable)
+	assert.False(t, accs[0].IsSigner)
+	assert.Equal(t, args.DevicePubkey, accs[1].PublicKey)
+	// Last two slots: signer + system program.
+	assert.Equal(t, executor.signer.PublicKey(), accs[len(accs)-2].PublicKey)
+	assert.True(t, accs[len(accs)-2].IsSigner)
+	assert.Equal(t, solana.SystemProgramID, accs[len(accs)-1].PublicKey)
+}
+
+func TestBuildCreateUserInstruction_WithTenant(t *testing.T) {
+	t.Parallel()
+
+	rpc := &mockRPCClient{}
+	executor, _ := newTestExecutor(t, rpc)
+	tenant := solana.NewWallet().PublicKey()
+
+	args := UserCreateArgs{
+		UserType:       UserTypeIBRLWithAllocatedIP,
+		CyoaType:       CyoaTypeGREOverFabric,
+		ClientIP:       [4]byte{198, 51, 100, 7},
+		TunnelEndpoint: [4]byte{0, 0, 0, 0},
+		DzPrefixCount:  1,
+		DevicePubkey:   solana.NewWallet().PublicKey(),
+		TenantPubkey:   tenant,
+	}
+	instr, _, err := executor.buildCreateUserInstruction(args)
+	require.NoError(t, err)
+
+	accs := instr.Accounts()
+	// Tenant slot sits between dz_prefix_block(s) and the payer/system tail.
+	tenantSlot := accs[len(accs)-3]
+	assert.Equal(t, tenant, tenantSlot.PublicKey)
+	assert.True(t, tenantSlot.IsWritable)
+}
+
+func TestBuildCreateUserInstruction_RejectsZeroDzPrefix(t *testing.T) {
+	t.Parallel()
+
+	rpc := &mockRPCClient{}
+	executor, _ := newTestExecutor(t, rpc)
+	_, _, err := executor.CreateUser(context.Background(), UserCreateArgs{
+		UserType:      UserTypeIBRL,
+		CyoaType:      CyoaTypeGREOverDIA,
+		DzPrefixCount: 0,
+		DevicePubkey:  solana.NewWallet().PublicKey(),
+	})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "DzPrefixCount must be > 0")
+}
+
+func TestBuildDeleteUserInstruction(t *testing.T) {
+	t.Parallel()
+
+	rpc := &mockRPCClient{}
+	executor, _ := newTestExecutor(t, rpc)
+
+	userPubkey := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+	owner := solana.NewWallet().PublicKey()
+	user := User{
+		AccountType:  UserType,
+		Owner:        owner,
+		UserType:     UserTypeIBRL,
+		DevicePubKey: device,
+		ClientIp:     [4]byte{10, 0, 0, 5},
+	}
+
+	// Use the fixture's (3, 1) values to exercise the borsh layout end-to-end
+	// against Rust output; production DeleteUser hard-codes (1, 1) — see the
+	// constant in DeleteUser itself.
+	instr, err := executor.buildDeleteUserInstruction(userPubkey, user, 3, 1)
+	require.NoError(t, err)
+
+	data, err := instr.Data()
+	require.NoError(t, err)
+	require.Len(t, data, 3, "opcode (1) + borsh UserDeleteArgs (2) = 3 bytes")
+	assert.Equal(t, byte(instructionDeleteUser), data[0])
+	assert.Equal(t, loadArgsFixture(t, "user_delete_args"), data[1:],
+		"borsh body must match Rust-generated user_delete_args.bin")
+
+	accs := instr.Accounts()
+	// 7 fixed + 3 dz_prefix + owner + payer + system = 13 accounts (no tenant).
+	require.Len(t, accs, 13)
+	assert.Equal(t, userPubkey, accs[0].PublicKey)
+	assert.Equal(t, device, accs[3].PublicKey)
+	ownerSlot := accs[len(accs)-3]
+	assert.Equal(t, owner, ownerSlot.PublicKey)
+	assert.True(t, ownerSlot.IsWritable)
+	assert.Equal(t, executor.signer.PublicKey(), accs[len(accs)-2].PublicKey)
+	assert.True(t, accs[len(accs)-2].IsSigner)
+	assert.Equal(t, solana.SystemProgramID, accs[len(accs)-1].PublicKey)
+}
+
+func TestBuildDeleteUserInstruction_WithTenant(t *testing.T) {
+	t.Parallel()
+
+	rpc := &mockRPCClient{}
+	executor, _ := newTestExecutor(t, rpc)
+
+	tenant := solana.NewWallet().PublicKey()
+	user := User{
+		AccountType:  UserType,
+		Owner:        solana.NewWallet().PublicKey(),
+		TenantPubKey: tenant,
+		DevicePubKey: solana.NewWallet().PublicKey(),
+		UserType:     UserTypeIBRL,
+		ClientIp:     [4]byte{10, 0, 0, 5},
+	}
+
+	instr, err := executor.buildDeleteUserInstruction(solana.NewWallet().PublicKey(), user, 1, 1)
+	require.NoError(t, err)
+
+	accs := instr.Accounts()
+	// Tenant sits before the owner/payer/system tail (3 trailing slots).
+	tenantSlot := accs[len(accs)-4]
+	assert.Equal(t, tenant, tenantSlot.PublicKey)
+	assert.True(t, tenantSlot.IsWritable)
+}
+
+func TestCreateUserWaitsForAccountVisible(t *testing.T) {
+	t.Parallel()
+
+	signer := solana.NewWallet().PrivateKey
+	programID := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+	args := UserCreateArgs{
+		UserType:      UserTypeIBRL,
+		CyoaType:      CyoaTypeGREOverDIA,
+		ClientIP:      [4]byte{10, 0, 0, 1},
+		DzPrefixCount: 1,
+		DevicePubkey:  device,
+	}
+	expectedPDA, _, err := GetUserPDA(programID, args.ClientIP, args.UserType)
+	require.NoError(t, err)
+
+	// First call (permission probe) returns nil; the user-PDA probe then returns
+	// a non-nil Value so the visibility wait completes immediately.
+	var lookups atomic.Int32
+	rpc := &mockRPCClient{
+		getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) {
+			n := lookups.Add(1)
+			if account.Equals(expectedPDA) && n >= 2 {
+				return &solanarpc.GetAccountInfoResult{
+					Value: &solanarpc.Account{Owner: programID},
+				}, nil
+			}
+			return &solanarpc.GetAccountInfoResult{Value: nil}, nil
+		},
+	}
+	executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(500*time.Millisecond))
+
+	sig, userPDA, err := executor.CreateUser(context.Background(), args)
+	require.NoError(t, err)
+	assert.NotEqual(t, solana.Signature{}, sig)
+	assert.Equal(t, expectedPDA, userPDA)
+	require.NotEmpty(t, rpc.sentTransactions)
+}
+
+func TestCreateUserReportsVisibilityTimeout(t *testing.T) {
+	t.Parallel()
+
+	signer := solana.NewWallet().PrivateKey
+	programID := solana.NewWallet().PublicKey()
+	rpc := &mockRPCClient{} // default: GetAccountInfo always returns nil
+	executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(50*time.Millisecond))
+
+	sig, userPDA, err := executor.CreateUser(context.Background(), UserCreateArgs{
+		UserType:      UserTypeIBRL,
+		CyoaType:      CyoaTypeGREOverDIA,
+		ClientIP:      [4]byte{10, 0, 0, 1},
+		DzPrefixCount: 1,
+		DevicePubkey:  solana.NewWallet().PublicKey(),
+	})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "post-confirm visibility timeout")
+	// Signature and PDA are still returned so callers can correlate.
+	assert.NotEqual(t, solana.Signature{}, sig)
+	assert.NotEqual(t, solana.PublicKey{}, userPDA)
+}
+
+func TestDeleteUserWaitsForAccountGone(t *testing.T) {
+	t.Parallel()
+
+	signer := solana.NewWallet().PrivateKey
+	programID := solana.NewWallet().PublicKey()
+	userPubkey := solana.NewWallet().PublicKey()
+
+	// Hand-encode a minimal borsh User account body (the inverse of what
+	// DeserializeUser parses); makeMinimalUserBytes writes the fields one by one.
+	owner := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+	userBytes := makeMinimalUserBytes(owner, device, [4]byte{10, 0, 0, 5})
+
+	// GetAccountInfo serves the user bytes only on the very first call (n == 1, the
+	// initial DeleteUser read); every later lookup returns nil, so the wait sees it gone.
+	var lookups atomic.Int32
+	rpc := &mockRPCClient{
+		getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) {
+			n := lookups.Add(1)
+			if account.Equals(userPubkey) && n == 1 {
+				return &solanarpc.GetAccountInfoResult{
+					Value: &solanarpc.Account{
+						Owner: programID,
+						Data:  solanarpc.DataBytesOrJSONFromBytes(userBytes),
+					},
+				}, nil
+			}
+			return &solanarpc.GetAccountInfoResult{Value: nil}, nil
+		},
+	}
+	executor := NewExecutor(slog.Default(), rpc, &signer, programID, WithWaitForVisibleTimeout(500*time.Millisecond))
+
+	sig, err := executor.DeleteUser(context.Background(), userPubkey)
+	require.NoError(t, err)
+	assert.NotEqual(t, solana.Signature{}, sig)
+	require.NotEmpty(t, rpc.sentTransactions)
+
+	// The first submitted transaction must list the device read from the User account.
+	tx := rpc.sentTransactions[0]
+	keys := tx.Message.AccountKeys
+	foundDevice := false
+	for _, k := range keys {
+		if k.Equals(device) {
+			foundDevice = true
+			break
+		}
+	}
+	assert.True(t, foundDevice, "device referenced by the user account must appear in the DeleteUser tx")
+}
+
+func TestDeleteUserNotFound(t *testing.T) {
+	t.Parallel()
+
+	signer := solana.NewWallet().PrivateKey
+	programID := solana.NewWallet().PublicKey()
+	rpc := &mockRPCClient{
+		getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) {
+			return &solanarpc.GetAccountInfoResult{Value: nil}, nil
+		},
+	}
+	executor := NewExecutor(slog.Default(), rpc, &signer, programID)
+
+	_, err := executor.DeleteUser(context.Background(), solana.NewWallet().PublicKey())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "not found")
+}
+
+func TestWaitForAccountVisible_TimeoutVsCancel(t *testing.T) {
+	t.Parallel()
+
+	t.Run("returns nil when account appears", func(t *testing.T) {
+		var n atomic.Int32
+		rpc := &mockRPCClient{
+			getAccountInfoFunc: func(ctx context.Context, account solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) {
+				if n.Add(1) >= 2 {
+					return &solanarpc.GetAccountInfoResult{Value: &solanarpc.Account{}}, nil
+				}
+				return &solanarpc.GetAccountInfoResult{Value: nil}, nil
+			},
+		}
+		executor, _ := newTestExecutor(t, rpc)
+		require.NoError(t, executor.waitForAccountVisible(context.Background(), solana.NewWallet().PublicKey(), time.Second))
+	})
+
+	t.Run("returns error past deadline", func(t *testing.T) {
+		rpc := &mockRPCClient{}
+		executor, _ := newTestExecutor(t, rpc)
+		err := executor.waitForAccountVisible(context.Background(), solana.NewWallet().PublicKey(), 50*time.Millisecond)
+		require.Error(t, err)
+	})
+
+	t.Run("returns context error on cancel", func(t *testing.T) {
+		rpc := &mockRPCClient{}
+		executor, _ := newTestExecutor(t, rpc)
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+		err := executor.waitForAccountVisible(ctx, solana.NewWallet().PublicKey(), time.Second)
+		require.Error(t, err)
+		assert.True(t, errors.Is(err, context.Canceled))
+	})
+}
+
+// makeMinimalUserBytes hand-encodes a User account body in DeserializeUser's field
+// order. AccountType, Owner, UserType, DevicePubKey, CyoaType, ClientIp and the status
+// byte (the uncommented append) are set explicitly; every other field is zero bytes.
+func makeMinimalUserBytes(owner, device solana.PublicKey, clientIP [4]byte) []byte {
+	b := make([]byte, 0, 256)
+	b = append(b, byte(UserType))           // AccountType
+	b = append(b, owner[:]...)              // Owner: 32 bytes
+	b = append(b, make([]byte, 16)...)      // Index: u128 = 16 bytes
+	b = append(b, 0)                        // BumpSeed
+	b = append(b, byte(UserTypeIBRL))       // UserType
+	b = append(b, make([]byte, 32)...)      // TenantPubKey (zero)
+	b = append(b, device[:]...)             // DevicePubKey: 32 bytes
+	b = append(b, byte(CyoaTypeGREOverDIA)) // CyoaType
+	b = append(b, clientIP[:]...)           // ClientIp: 4 bytes
+	b = append(b, make([]byte, 4)...)       // DzIp: 4 bytes
+	b = append(b, 0, 0)                     // TunnelId: u16
+	b = append(b, make([]byte, 5)...)       // TunnelNet: 5 bytes
+	b = append(b, byte(UserStatusActivated))
+	b = append(b, 0, 0, 0, 0)          // Publishers: u32 len = 0
+	b = append(b, 0, 0, 0, 0)          // Subscribers: u32 len = 0
+	b = append(b, make([]byte, 32)...) // ValidatorPubKey
+	b = append(b, make([]byte, 4)...)  // TunnelEndpoint
+	b = append(b, 0)                   // TunnelFlags
+	b = append(b, 0)                   // BgpStatus
+	b = append(b, make([]byte, 8)...)  // LastBgpUpAt
+	b = append(b, make([]byte, 8)...)  // LastBgpReportedAt
+	b = append(b, make([]byte, 8)...)  // BgpRttNs
+	return b
+}

From 88436b020c2295e673789b50fee73db311acb5a7 Mon Sep 17 00:00:00 2001
From: Greg Mitchell <greg@malbeclabs.com>
Date: Wed, 27 May 2026 14:55:50 +0000
Subject: [PATCH 2/5] sdk: add CHANGELOG entry for serviceability user CRUD +
 reconcile

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42c4e52e05..2540b3202c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
   - Add `--log-level <LEVEL>` global flag and initialize the `tracing` subscriber at startup. `LEVEL` is one of `off`, `error`, `warn` (default), `info`, `debug`, `trace`. Diagnostic logs go to stderr so `--json` output on stdout remains parseable. Honors the `RUST_LOG` environment variable when set, overriding the CLI-flag level for per-module filtering. Replaces the previous `println!("using keypair: ...")` stdout line with a `tracing::info!` event; the keypair confirmation now appears only at `--log-level info` or higher and no longer pollutes parseable stdout. (Named `--log-level` rather than the RFC-20 §Global-flags suggested `--verbose` / `-v` because the existing `doublezero connect` / `disconnect` subcommands already own a `--verbose` flag with `bool` type; the global flag deviation will be revisited when the daemon-control module crate is carved out.)
   - Build a `CliContext` once at binary startup from `--env`, the per-field global overrides (`--url`, `--ws`, `--solana-url`, `--program-id`, `--geo-program-id`, `--keypair`, `--sock-file`), and the persisted `~/.config/doublezero/cli/config.yml` (overridable via `DOUBLEZERO_CONFIG_FILE`), per RFC-20 (§CliContext). Precedence (highest wins): CLI flag > persisted config > env-derived default. When `--env` is not set and the persisted config has a serviceability program ID, the environment is derived from that program ID via `Environment::from_program_id`; otherwise the binary falls back to `Environment::default()`. The legacy `DZClient` is now constructed from the fully resolved `CliContext` URL, WebSocket, and program-ID values directly, so verbs that migrate to read `CliContext` see the same backend as the legacy bridge. Keypair resolution is intentionally left to `DZClient::new`'s internal `load_keypair` precedence (CLI `--keypair` flag > `DOUBLEZERO_KEYPAIR` env var > stdin > persisted config) so the `DOUBLEZERO_KEYPAIR` env var continues to override the persisted keypair path, as relied on by the e2e contributor-auth negative-authz suite. File reads happen only in the binary; module crates remain forbidden from touching the filesystem (RFC-20 §67).
   - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: <head>` followed by the full chain of causes on stderr.
+- SDK (Go)
+  - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Adds a pure `PlanReconcile(current, target, ownerFilter)` planner that filters by user owner and returns a deterministic create/delete delta (ClientIp-ascending with PubKey tiebreak), used by the upcoming device-stress orchestrator to drive sweeps. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)).
 
 ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22
 

From ed95322821a83f4e0bba0accbb12a168671aaf35 Mon Sep 17 00:00:00 2001
From: Greg Mitchell <greg@malbeclabs.com>
Date: Wed, 27 May 2026 15:09:34 +0000
Subject: [PATCH 3/5] sdk: drop PlanReconcile from this PR; defer to
 orchestrator

PlanReconcile is orchestrator policy ("how many users do we want") rather
than an SDK primitive ("how do I submit a CreateUser/DeleteUser"). Move it
out of the serviceability SDK and land it alongside the device-stress
orchestrator binary in part 2 of #3746.
---
 CHANGELOG.md                                  |   2 +-
 .../sdk/go/serviceability/reconcile.go        |  59 ------
 .../sdk/go/serviceability/reconcile_test.go   | 173 ------------------
 3 files changed, 1 insertion(+), 233 deletions(-)
 delete mode 100644 smartcontract/sdk/go/serviceability/reconcile.go
 delete mode 100644 smartcontract/sdk/go/serviceability/reconcile_test.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2540b3202c..b0e2e843b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,7 @@ All notable changes to this project will be documented in this file.
   - Build a `CliContext` once at binary startup from `--env`, the per-field global overrides (`--url`, `--ws`, `--solana-url`, `--program-id`, `--geo-program-id`, `--keypair`, `--sock-file`), and the persisted `~/.config/doublezero/cli/config.yml` (overridable via `DOUBLEZERO_CONFIG_FILE`), per RFC-20 (§CliContext). Precedence (highest wins): CLI flag > persisted config > env-derived default. When `--env` is not set and the persisted config has a serviceability program ID, the environment is derived from that program ID via `Environment::from_program_id`; otherwise the binary falls back to `Environment::default()`. The legacy `DZClient` is now constructed from the fully resolved `CliContext` URL, WebSocket, and program-ID values directly, so verbs that migrate to read `CliContext` see the same backend as the legacy bridge. Keypair resolution is intentionally left to `DZClient::new`'s internal `load_keypair` precedence (CLI `--keypair` flag > `DOUBLEZERO_KEYPAIR` env var > stdin > persisted config) so the `DOUBLEZERO_KEYPAIR` env var continues to override the persisted keypair path, as relied on by the e2e contributor-auth negative-authz suite. File reads happen only in the binary; module crates remain forbidden from touching the filesystem (RFC-20 §67).
   - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: <head>` followed by the full chain of causes on stderr.
 - SDK (Go)
-  - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Adds a pure `PlanReconcile(current, target, ownerFilter)` planner that filters by user owner and returns a deterministic create/delete delta (ClientIp-ascending with PubKey tiebreak), used by the upcoming device-stress orchestrator to drive sweeps. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)).
+  - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)).
 
 ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22
 
diff --git a/smartcontract/sdk/go/serviceability/reconcile.go b/smartcontract/sdk/go/serviceability/reconcile.go
deleted file mode 100644
index d61e20ea91..0000000000
--- a/smartcontract/sdk/go/serviceability/reconcile.go
+++ /dev/null
@@ -1,59 +0,0 @@
-package serviceability
-
-import (
-	"bytes"
-	"sort"
-
-	"github.com/gagliardetto/solana-go"
-)
-
-// ReconcilePlan describes the delta needed to drive the set of users owned by a
-// given key toward a desired count.
-type ReconcilePlan struct {
-	// ToCreate is the number of users to add. Always >= 0.
-	ToCreate int
-	// ToDelete lists user PDAs to remove, in the order they should be deleted.
-	// Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker, so
-	// repeated calls against the same input produce identical plans.
-	ToDelete []solana.PublicKey
-}
-
-// PlanReconcile decides what to create or delete so that the number of users
-// owned by ownerFilter equals target. Users with a different Owner are ignored
-// (neither counted nor deleted), which lets the stress orchestrator share a
-// program with other tenants without disturbing them.
-//
-// The function is pure — no I/O — so it is safe to call repeatedly while the
-// orchestrator polls live state. Returns a zero plan when target is negative.
-func PlanReconcile(current []User, target int, ownerFilter solana.PublicKey) ReconcilePlan {
-	if target < 0 {
-		return ReconcilePlan{}
-	}
-
-	var owned []User
-	for _, u := range current {
-		if bytes.Equal(u.Owner[:], ownerFilter[:]) {
-			owned = append(owned, u)
-		}
-	}
-
-	switch {
-	case len(owned) < target:
-		return ReconcilePlan{ToCreate: target - len(owned)}
-	case len(owned) > target:
-		sort.Slice(owned, func(i, j int) bool {
-			if c := bytes.Compare(owned[i].ClientIp[:], owned[j].ClientIp[:]); c != 0 {
-				return c < 0
-			}
-			return bytes.Compare(owned[i].PubKey[:], owned[j].PubKey[:]) < 0
-		})
-		victims := owned[target:]
-		out := make([]solana.PublicKey, len(victims))
-		for i, u := range victims {
-			out[i] = solana.PublicKeyFromBytes(u.PubKey[:])
-		}
-		return ReconcilePlan{ToDelete: out}
-	default:
-		return ReconcilePlan{}
-	}
-}
diff --git a/smartcontract/sdk/go/serviceability/reconcile_test.go b/smartcontract/sdk/go/serviceability/reconcile_test.go
deleted file mode 100644
index 335094b7b8..0000000000
--- a/smartcontract/sdk/go/serviceability/reconcile_test.go
+++ /dev/null
@@ -1,173 +0,0 @@
-package serviceability_test
-
-import (
-	"testing"
-
-	"github.com/gagliardetto/solana-go"
-	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-// makeUser is a tiny helper to build a User suitable for PlanReconcile testing:
-// only Owner, ClientIp, and PubKey actually influence the planner.
-func makeUser(owner solana.PublicKey, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User {
-	return serviceability.User{
-		Owner:    owner,
-		ClientIp: clientIP,
-		PubKey:   pubkey,
-	}
-}
-
-func TestPlanReconcile(t *testing.T) {
-	t.Parallel()
-
-	orchestrator := solana.NewWallet().PublicKey()
-	stranger := solana.NewWallet().PublicKey()
-
-	// Stable pubkeys so we can assert exact ordering.
-	u1 := solana.NewWallet().PublicKey()
-	u2 := solana.NewWallet().PublicKey()
-	u3 := solana.NewWallet().PublicKey()
-	u4 := solana.NewWallet().PublicKey()
-	u5 := solana.NewWallet().PublicKey()
-
-	ip := func(a, b, c, d byte) [4]byte { return [4]byte{a, b, c, d} }
-
-	tests := []struct {
-		name          string
-		current       []serviceability.User
-		target        int
-		owner         solana.PublicKey
-		wantCreate    int
-		wantDeleteIPs [][4]byte // ClientIp order we expect to see in ToDelete
-	}{
-		{
-			name:       "zero to N",
-			current:    nil,
-			target:     4,
-			owner:      orchestrator,
-			wantCreate: 4,
-		},
-		{
-			name: "N to zero deletes in ip-ascending order",
-			current: []serviceability.User{
-				makeUser(orchestrator, u1, ip(10, 0, 0, 3)),
-				makeUser(orchestrator, u2, ip(10, 0, 0, 1)),
-				makeUser(orchestrator, u3, ip(10, 0, 0, 4)),
-				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
-			},
-			target:        0,
-			owner:         orchestrator,
-			wantCreate:    0,
-			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)},
-		},
-		{
-			name: "partial trim deletes only the overflow",
-			current: []serviceability.User{
-				makeUser(orchestrator, u1, ip(10, 0, 0, 5)),
-				makeUser(orchestrator, u2, ip(10, 0, 0, 4)),
-				makeUser(orchestrator, u3, ip(10, 0, 0, 3)),
-				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
-				makeUser(orchestrator, u5, ip(10, 0, 0, 1)),
-			},
-			target:        3,
-			owner:         orchestrator,
-			wantCreate:    0,
-			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)},
-		},
-		{
-			name: "partial grow asks for the missing count",
-			current: []serviceability.User{
-				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
-				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
-			},
-			target:     5,
-			owner:      orchestrator,
-			wantCreate: 3,
-		},
-		{
-			name: "only foreign users present grows by full target",
-			current: []serviceability.User{
-				makeUser(stranger, u1, ip(10, 0, 0, 1)),
-				makeUser(stranger, u2, ip(10, 0, 0, 2)),
-				makeUser(stranger, u3, ip(10, 0, 0, 3)),
-			},
-			target:     2,
-			owner:      orchestrator,
-			wantCreate: 2,
-		},
-		{
-			name: "mixed ownership only counts and deletes owned",
-			current: []serviceability.User{
-				makeUser(stranger, u1, ip(10, 0, 0, 9)),
-				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
-				makeUser(stranger, u3, ip(10, 0, 0, 8)),
-				makeUser(orchestrator, u4, ip(10, 0, 0, 1)),
-			},
-			target:        1,
-			owner:         orchestrator,
-			wantCreate:    0,
-			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)},
-		},
-		{
-			name: "already at target produces zero plan",
-			current: []serviceability.User{
-				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
-				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
-			},
-			target:     2,
-			owner:      orchestrator,
-			wantCreate: 0,
-		},
-		{
-			name: "negative target produces zero plan",
-			current: []serviceability.User{
-				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
-			},
-			target:     -1,
-			owner:      orchestrator,
-			wantCreate: 0,
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			plan := serviceability.PlanReconcile(tc.current, tc.target, tc.owner)
-			assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate")
-			require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length")
-
-			// Resolve expected pubkeys via ClientIp lookup against the current set.
-			ipToPubkey := map[[4]byte]solana.PublicKey{}
-			for _, u := range tc.current {
-				ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:])
-			}
-			for i, ipKey := range tc.wantDeleteIPs {
-				assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey)
-			}
-		})
-	}
-}
-
-func TestPlanReconcile_TieBreaksByPubkey(t *testing.T) {
-	t.Parallel()
-
-	orchestrator := solana.NewWallet().PublicKey()
-	sharedIP := [4]byte{10, 0, 0, 1}
-
-	// Two users with the same ClientIp (artificial — onchain the IP is part of
-	// the PDA seed so collisions can't happen, but the tiebreak must still be
-	// deterministic).
-	pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
-	pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
-
-	plan := serviceability.PlanReconcile([]serviceability.User{
-		makeUser(orchestrator, pkB, sharedIP),
-		makeUser(orchestrator, pkA, sharedIP),
-	}, 0, orchestrator)
-
-	require.Len(t, plan.ToDelete, 2)
-	// pkA (0xAA…) sorts before pkB (0xBB…).
-	assert.Equal(t, pkA, plan.ToDelete[0])
-	assert.Equal(t, pkB, plan.ToDelete[1])
-}

From ee9b8226447a838ce0b6544e1240ea53c66effb1 Mon Sep 17 00:00:00 2001
From: Greg Mitchell <greg@malbeclabs.com>
Date: Wed, 27 May 2026 15:38:00 +0000
Subject: [PATCH 4/5] tools/stress: orchestrator skeleton
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tools/stress/device-orchestrator/, the device-stress orchestrator binary
for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's
CLI list, dumps orchestrator-config.json on start, runs a provision-then-
reverse-deprovision sweep against a live serviceability program, and emits
the runlog row schema {run_id, user_index, user_pubkey, tunnel_id, event,
t_ns, n_after_event} for each submit | confirm | activate | deprovision_*
event.

Packages:

- pkg/reconcile  — PlanFor() pure function (lifted from the part-1 SDK PR;
  now lives with the orchestrator as policy, not as an SDK primitive)
- pkg/runlog     — append-only JSONL writer for orchestrator-runlog.json
- pkg/sweep      — provision-then-deprovision loop driven by PlanFor; uses a
  Clock + Executor interface for testability; reverse-creation-order delete
- pkg/abort      — sentinel-file poller that cancels a derived ctx between
  user iterations so an in-flight Create/Delete completes before exit
- pkg/agent      — Runner interface + noop impl; SSH runner lands in
  part 3 along with pre_commit_log / applied event emission
- pkg/exec       — Live impl of sweep.Executor over serviceability.{Client,
  Executor}; picks deterministic per-user IPs from --client-ip-base
- cmd/device-orchestrator — flag parsing, config dump, signal + abort
  handling, sweep wiring

The agent runner is stubbed behind an interface so this PR can land
end-to-end functionality (provision/deprovision + runlog + abort) without
the SSH plumbing. The SSH runner and the corresponding pre_commit_log /
applied row generation land in part 3 of #3746.

Part 2 of #3746. Closes #3771.
---
 CHANGELOG.md                                  |   2 +
 tools/stress/device-orchestrator/Makefile     |  15 +
 .../cmd/device-orchestrator/main.go           | 277 +++++++++++++++
 .../device-orchestrator/pkg/abort/abort.go    |  64 ++++
 .../pkg/abort/abort_test.go                   |  80 +++++
 .../device-orchestrator/pkg/agent/agent.go    |  73 ++++
 .../pkg/agent/agent_test.go                   |  32 ++
 .../device-orchestrator/pkg/exec/exec.go      | 139 ++++++++
 .../device-orchestrator/pkg/exec/exec_test.go |  27 ++
 .../pkg/reconcile/reconcile.go                |  63 ++++
 .../pkg/reconcile/reconcile_test.go           | 166 +++++++++
 .../device-orchestrator/pkg/runlog/runlog.go  | 101 ++++++
 .../pkg/runlog/runlog_test.go                 |  93 +++++
 .../device-orchestrator/pkg/sweep/sweep.go    | 262 ++++++++++++++
 .../pkg/sweep/sweep_test.go                   | 321 ++++++++++++++++++
 15 files changed, 1715 insertions(+)
 create mode 100644 tools/stress/device-orchestrator/Makefile
 create mode 100644 tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
 create mode 100644 tools/stress/device-orchestrator/pkg/abort/abort.go
 create mode 100644 tools/stress/device-orchestrator/pkg/abort/abort_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/agent/agent.go
 create mode 100644 tools/stress/device-orchestrator/pkg/agent/agent_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/exec/exec.go
 create mode 100644 tools/stress/device-orchestrator/pkg/exec/exec_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/reconcile/reconcile.go
 create mode 100644 tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/runlog/runlog.go
 create mode 100644 tools/stress/device-orchestrator/pkg/runlog/runlog_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/sweep/sweep.go
 create mode 100644 tools/stress/device-orchestrator/pkg/sweep/sweep_test.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b0e2e843b5..d71edb5e5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@ All notable changes to this project will be documented in this file.
   - Centralize top-level error rendering through `doublezero_cli_core::error::render_eyre`. Replaces three ad-hoc `eprintln!("Error: {e}")` sites in `client/doublezero/src/main.rs` (env-parse failure, env-config resolution failure, top-level command failure) with a single helper that prints `Error: <head>` followed by the full chain of causes on stderr.
 - SDK (Go)
   - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)).
+- Tools
+  - Add `tools/stress/device-orchestrator/` — the device-stress orchestrator skeleton for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's CLI list, dumps `orchestrator-config.json` on start, runs a provision-then-reverse-deprovision sweep against a live serviceability program, and emits the runlog row schema `{run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event}` to `orchestrator-runlog.json` for each `submit | confirm | activate | deprovision_*` event. The agent runner is stubbed behind a `pkg/agent.Runner` interface (no-op impl ships now; the SSH-backed runner that emits `pre_commit_log` / `applied` lands in part 3). The sweep cooperates with an abort sentinel file: when the file appears the in-flight user completes and the orchestrator deprovisions everything it created before exiting non-zero. `PlanReconcile` / `Plan` (lifted from the part-1 SDK PR) now lives at `tools/stress/device-orchestrator/pkg/reconcile/` as orchestrator policy rather than SDK primitive. Part 2 of #3746 ([#3771](https://github.com/malbeclabs/doublezero/issues/3771)).
 
 ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22
 
diff --git a/tools/stress/device-orchestrator/Makefile b/tools/stress/device-orchestrator/Makefile
new file mode 100644
index 0000000000..6ed19c04fe
--- /dev/null
+++ b/tools/stress/device-orchestrator/Makefile
@@ -0,0 +1,15 @@
+PREFIX:=github.com/malbeclabs/doublezero/tools/stress/device-orchestrator
+BUILD:=`git rev-parse --short HEAD`
+LDFLAGS=-ldflags "-X=$(PREFIX)/build.Build=$(BUILD)"
+
+.PHONY: test
+test:
+	go test -race -v -coverprofile coverage.out ./...
+
+.PHONY: lint
+lint:
+	golangci-lint run -c ../../../.golangci.yaml
+
+.PHONY: build
+build:
+	CGO_ENABLED=0 go build -v $(LDFLAGS) -o bin/device-orchestrator cmd/device-orchestrator/main.go
diff --git a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
new file mode 100644
index 0000000000..ab01975d30
--- /dev/null
+++ b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
@@ -0,0 +1,277 @@
+// device-orchestrator runs the GRE Tunnel Capacity Study sweep against a
+// live serviceability program: provisions N users on a target device in
+// batches with a hold between each, then deprovisions in reverse-creation
+// order. Per #3771 (part 2 of #3746) the SSH-driven agent runner is stubbed
+// behind the agent.Runner interface; the no-op implementation is used here
+// and the SSH implementation lands in part 3.
+package main
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"log/slog"
+	"net"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"syscall"
+	"time"
+
+	"github.com/gagliardetto/solana-go"
+	solanarpc "github.com/gagliardetto/solana-go/rpc"
+
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/abort"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/exec"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep"
+)
+
+// orchestratorConfig is the resolved set of CLI inputs for one run; run()
+// serializes it to orchestrator-config.json before any sweep work starts.
+type orchestratorConfig struct {
+	RunID           string `json:"run_id"`
+	TargetUserCount int    `json:"target_user_count"`
+	UsersPerBatch   int    `json:"users_per_batch"`
+	HoldSeconds     int    `json:"hold_seconds"`
+	DUTPubkey       string `json:"dut_pubkey"`
+	DUTSSHHost      string `json:"dut_ssh_host"`
+	DUTSSHKey       string `json:"dut_ssh_key"`
+	RPCURL          string `json:"rpc_url"`
+	ProgramID       string `json:"program_id"`
+	KeypairPath     string `json:"keypair"`
+	ControllerAddr  string `json:"controller"`
+	AbortFile       string `json:"abort_file"`
+	WorkingDir      string `json:"working_dir"`
+	ClientIPBase    string `json:"client_ip_base"`
+	TunnelEndpoint  string `json:"tunnel_endpoint"`
+	TenantPubkey    string `json:"tenant_pubkey,omitempty"`
+}
+
+func main() {
+	if failure := run(); failure != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", failure)
+		os.Exit(1) // run has returned, so its deferred cleanups already ran
+	}
+}
+
+func run() error {
+	var (
+		targetUserCount = flag.Int("target-user-count", 8, "Final user count to sweep up to.")
+		usersPerBatch   = flag.Int("users-per-batch", 2, "Users provisioned per batch before the hold.")
+		holdSeconds     = flag.Int("hold-seconds", 180, "Seconds to hold between batches.")
+		dutPubkey       = flag.String("dut-pubkey", "", "Device-under-test pubkey (base58).")
+		dutSSHHost      = flag.String("dut-ssh-host", "", "SSH host:port for the DUT (used by the part-3 agent runner).")
+		dutSSHKey       = flag.String("dut-ssh-key", "", "SSH private-key path for the DUT.")
+		rpcURL          = flag.String("rpc-url", "", "Serviceability RPC URL.")
+		programID       = flag.String("program-id", "", "Serviceability program ID (base58).")
+		keypairPath     = flag.String("keypair", "", "Path to the orchestrator's solana keypair JSON.")
+		controllerAddr  = flag.String("controller", "", "Controller IP:PORT, forwarded to the DUT agent in part 3.")
+		abortFile       = flag.String("abort-file", "", "Path to a sentinel file; when it appears the sweep finishes the current user and exits.")
+		workingDir      = flag.String("working-dir", ".", "Output directory for orchestrator-config.json / orchestrator-runlog.json.")
+		clientIPBase    = flag.String("client-ip-base", "100.64.0.0", "Starting IPv4 address; per-user IP is base + idx.")
+		tunnelEndpoint  = flag.String("tunnel-endpoint", "0.0.0.0", "Tunnel endpoint IP passed to UserCreateArgs; 0.0.0.0 lets the program fall back to the device's public IP.")
+		tenantPubkey    = flag.String("tenant-pubkey", "", "Optional tenant pubkey for UserCreateArgs.")
+		runID           = flag.String("run-id", "", "Run identifier written into every runlog row; auto-generated if empty.")
+		logLevel        = flag.String("log-level", "info", "slog level: debug|info|warn|error.")
+		dryRun          = flag.Bool("dry-run", false, "Validate flags and dump orchestrator-config.json without contacting the RPC.")
+	)
+	flag.Parse() // NOTE(review): the numeric flags are not range-checked in this function
+
+	logger := newLogger(*logLevel)
+	slog.SetDefault(logger)
+
+	if *runID == "" {
+		var buf [8]byte
+		if _, err := rand.Read(buf[:]); err != nil {
+			return fmt.Errorf("generate run id: %w", err)
+		}
+		*runID = "run-" + hex.EncodeToString(buf[:])
+	}
+
+	if err := os.MkdirAll(*workingDir, 0o755); err != nil {
+		return fmt.Errorf("create working dir: %w", err)
+	}
+
+	baseIP, err := parseIPv4(*clientIPBase)
+	if err != nil {
+		return fmt.Errorf("parse --client-ip-base: %w", err)
+	}
+	tunnelIP, err := parseIPv4(*tunnelEndpoint)
+	if err != nil {
+		return fmt.Errorf("parse --tunnel-endpoint: %w", err)
+	}
+
+	resolved := orchestratorConfig{
+		RunID:           *runID,
+		TargetUserCount: *targetUserCount,
+		UsersPerBatch:   *usersPerBatch,
+		HoldSeconds:     *holdSeconds,
+		DUTPubkey:       *dutPubkey,
+		DUTSSHHost:      *dutSSHHost,
+		DUTSSHKey:       *dutSSHKey,
+		RPCURL:          *rpcURL,
+		ProgramID:       *programID,
+		KeypairPath:     *keypairPath,
+		ControllerAddr:  *controllerAddr,
+		AbortFile:       *abortFile,
+		WorkingDir:      *workingDir,
+		ClientIPBase:    *clientIPBase,
+		TunnelEndpoint:  *tunnelEndpoint,
+		TenantPubkey:    *tenantPubkey,
+	}
+	configPath := filepath.Join(*workingDir, "orchestrator-config.json")
+	if err := dumpJSON(configPath, resolved); err != nil {
+		return fmt.Errorf("write orchestrator-config.json: %w", err)
+	}
+	logger.Info("orchestrator-config.json written", "path", configPath)
+
+	if *dryRun { // NOTE(review): returns before requireFlags below, so missing required flags go unreported
+		logger.Info("dry-run: skipping sweep")
+		return nil
+	}
+
+	if err := requireFlags(map[string]string{
+		"--dut-pubkey": *dutPubkey,
+		"--rpc-url":    *rpcURL,
+		"--program-id": *programID,
+		"--keypair":    *keypairPath,
+	}); err != nil {
+		return err
+	}
+
+	dutPK, err := solana.PublicKeyFromBase58(*dutPubkey)
+	if err != nil {
+		return fmt.Errorf("--dut-pubkey: %w", err)
+	}
+	programPK, err := solana.PublicKeyFromBase58(*programID)
+	if err != nil {
+		return fmt.Errorf("--program-id: %w", err)
+	}
+	signer, err := solana.PrivateKeyFromSolanaKeygenFile(*keypairPath)
+	if err != nil {
+		return fmt.Errorf("load --keypair: %w", err)
+	}
+
+	var tenantPK solana.PublicKey // stays the zero key when --tenant-pubkey is empty
+	if *tenantPubkey != "" {
+		tenantPK, err = solana.PublicKeyFromBase58(*tenantPubkey)
+		if err != nil {
+			return fmt.Errorf("--tenant-pubkey: %w", err)
+		}
+	}
+
+	rpc := solanarpc.New(*rpcURL)
+	client := serviceability.New(rpc, programPK)
+	executor := serviceability.NewExecutor(logger, rpc, &signer, programPK)
+
+	liveExec, err := exec.New(exec.Config{
+		Client:         client,
+		Executor:       executor,
+		DevicePubkey:   dutPK,
+		TenantPubkey:   tenantPK,
+		ClientIPBase:   baseIP,
+		TunnelEndpoint: tunnelIP,
+		UserType:       serviceability.UserTypeIBRL,
+		CyoaType:       serviceability.CyoaTypeGREOverDIA,
+		DzPrefixCount:  1,
+	})
+	if err != nil {
+		return err
+	}
+
+	runlogPath := filepath.Join(*workingDir, "orchestrator-runlog.json")
+	rlw, err := runlog.Open(runlogPath)
+	if err != nil {
+		return err
+	}
+	defer rlw.Close()
+	logger.Info("orchestrator-runlog.json open", "path", runlogPath)
+
+	// ctx ends on SIGINT/SIGTERM (via rootCtx) or when the abort file appears (via abort.Watch).
+	rootCtx, rootCancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+	defer rootCancel()
+	ctx, abortCancel := abort.Watch(rootCtx, *abortFile, abort.DefaultPollInterval, logger)
+	defer abortCancel()
+
+	cfg := sweep.Config{
+		RunID:         *runID,
+		Target:        *targetUserCount,
+		UsersPerBatch: *usersPerBatch,
+		Hold:          time.Duration(*holdSeconds) * time.Second,
+		OwnerFilter:   signer.PublicKey(),
+		Executor:      liveExec,
+		Agent:         agent.NewNoop(logger),
+		Runlog:        rlw,
+		Clock:         sweep.RealClock{},
+		Logger:        logger,
+	}
+
+	logger.Info("sweep starting", "target", cfg.Target, "batch", cfg.UsersPerBatch, "hold", cfg.Hold)
+	if err := sweep.Run(ctx, cfg); err != nil {
+		if errors.Is(err, context.Canceled) {
+			logger.Warn("sweep cancelled", "err", err)
+			return err
+		}
+		return fmt.Errorf("sweep: %w", err)
+	}
+	logger.Info("sweep finished")
+	return nil
+}
+
+// newLogger builds a stderr text logger; unrecognized levels fall back to info.
+func newLogger(level string) *slog.Logger {
+	threshold, known := map[string]slog.Level{
+		"debug": slog.LevelDebug,
+		"warn":  slog.LevelWarn,
+		"error": slog.LevelError,
+	}[level]
+	if !known {
+		threshold = slog.LevelInfo
+	}
+	return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: threshold}))
+}
+
+// dumpJSON writes v to path as two-space-indented JSON plus a trailing newline.
+// It encodes first and then goes through os.WriteFile, so an encoding failure
+// leaves path untouched and a failed close is returned instead of dropped.
+func dumpJSON(path string, v any) error {
+	data, err := json.MarshalIndent(v, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, append(data, '\n'), 0o666)
+}
+
+func requireFlags(required map[string]string) error {
+	absent := make([]string, 0, len(required))
+	for flagName := range required { // map order is random, so the listing order varies
+		if required[flagName] == "" {
+			absent = append(absent, flagName)
+		}
+	}
+	if len(absent) == 0 {
+		return nil
+	}
+	return fmt.Errorf("missing required flag(s): %v", absent)
+}
+
+// parseIPv4 parses s with net.ParseIP and returns its 4-byte IPv4 form.
+func parseIPv4(s string) ([4]byte, error) {
+	var octets [4]byte
+	parsed := net.ParseIP(s)
+	switch {
+	case parsed == nil:
+		return octets, fmt.Errorf("invalid IPv4 %q", s)
+	case parsed.To4() == nil:
+		return octets, fmt.Errorf("not IPv4: %q", s)
+	}
+	copy(octets[:], parsed.To4())
+	return octets, nil
+}
diff --git a/tools/stress/device-orchestrator/pkg/abort/abort.go b/tools/stress/device-orchestrator/pkg/abort/abort.go
new file mode 100644
index 0000000000..8f191f5499
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/abort/abort.go
@@ -0,0 +1,64 @@
+// Package abort polls a sentinel file on disk and cancels a context when the
+// file appears. The orchestrator uses this for cooperative shutdown: an
+// operator drops a file at the path passed via --abort-file and the running
+// sweep finishes the current user iteration before exiting.
+package abort
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"os"
+	"time"
+)
+
+// DefaultPollInterval is the default polling cadence. The sweep loop only
+// checks for cancellation between user iterations, so worst-case abort latency
+// is roughly this interval plus one user iteration.
+const DefaultPollInterval = 250 * time.Millisecond
+
+// Watch derives a context from parent that is cancelled once `path` exists on
+// disk, polling every interval (DefaultPollInterval when interval <= 0). An
+// empty path disables watching: parent itself is returned with a no-op stop.
+// The polling goroutine ends when parent or the returned context is cancelled.
+// A nil log silences the detection warning.
+func Watch(parent context.Context, path string, interval time.Duration, log *slog.Logger) (context.Context, context.CancelFunc) {
+	if path == "" {
+		return parent, func() {}
+	}
+	if interval <= 0 {
+		interval = DefaultPollInterval
+	}
+	watchCtx, stop := context.WithCancel(parent)
+	go func() {
+		poll := time.NewTicker(interval)
+		defer poll.Stop()
+		for {
+			select {
+			case <-watchCtx.Done():
+				return
+			case <-poll.C:
+			}
+			if !exists(path) {
+				continue
+			}
+			if log != nil {
+				log.Warn("abort file detected; cancelling sweep", "path", path)
+			}
+			stop()
+			return
+		}
+	}()
+	return watchCtx, stop
+}
+
+// exists reports whether path should be treated as present. It is true when
+// os.Stat succeeds, and also when Stat fails with anything other than "not
+// exist": a permission or I/O error must not let the orchestrator keep running
+// past an operator abort. Only a definite "not exist" result yields false.
+func exists(path string) bool {
+	_, statErr := os.Stat(path)
+	// Success and every non-ENOENT failure both count as "present".
+	present := statErr == nil || !errors.Is(statErr, os.ErrNotExist)
+	return present
+}
diff --git a/tools/stress/device-orchestrator/pkg/abort/abort_test.go b/tools/stress/device-orchestrator/pkg/abort/abort_test.go
new file mode 100644
index 0000000000..13fdfba47c
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/abort/abort_test.go
@@ -0,0 +1,80 @@
+package abort_test
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/abort"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestWatch_CancelsWhenAbortFileAppears(t *testing.T) {
+	t.Parallel()
+
+	sentinel := filepath.Join(t.TempDir(), "abort")
+	watchCtx, stop := abort.Watch(context.Background(), sentinel, 25*time.Millisecond, nil)
+	t.Cleanup(stop)
+
+	// With no sentinel on disk the context must survive a couple of polls.
+	select {
+	case <-watchCtx.Done():
+		t.Fatal("ctx cancelled before abort file existed")
+	case <-time.After(50 * time.Millisecond):
+	}
+
+	// Create the sentinel; the watcher should pick it up on a later poll.
+	require.NoError(t, os.WriteFile(sentinel, nil, 0o644))
+
+	select {
+	case <-watchCtx.Done():
+		assert.True(t, errors.Is(watchCtx.Err(), context.Canceled))
+	case <-time.After(time.Second):
+		t.Fatal("ctx did not cancel within 1s after abort file touched")
+	}
+}
+
+func TestWatch_EmptyPathIsNoOp(t *testing.T) {
+	t.Parallel()
+
+	parent, cancelParent := context.WithCancel(context.Background())
+	t.Cleanup(cancelParent)
+
+	watchCtx, stop := abort.Watch(parent, "", 0, nil)
+	t.Cleanup(stop)
+
+	select {
+	case <-watchCtx.Done():
+		t.Fatal("empty-path watch should not cancel on its own")
+	case <-time.After(50 * time.Millisecond):
+	}
+
+	// Watch hands back parent itself here, so cancelling parent must be visible.
+	cancelParent()
+	select {
+	case <-watchCtx.Done():
+	case <-time.After(time.Second):
+		t.Fatal("derived ctx did not pick up parent cancellation")
+	}
+}
+
+func TestWatch_StopsWhenParentCancelled(t *testing.T) {
+	t.Parallel()
+
+	sentinel := filepath.Join(t.TempDir(), "abort") // never created in this test
+	parent, cancelParent := context.WithCancel(context.Background())
+
+	watchCtx, stop := abort.Watch(parent, sentinel, 25*time.Millisecond, nil)
+	t.Cleanup(stop)
+
+	cancelParent()
+	select {
+	case <-watchCtx.Done():
+	case <-time.After(time.Second):
+		t.Fatal("parent cancel did not propagate")
+	}
+}
diff --git a/tools/stress/device-orchestrator/pkg/agent/agent.go b/tools/stress/device-orchestrator/pkg/agent/agent.go
new file mode 100644
index 0000000000..24c1b4dbce
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/agent/agent.go
@@ -0,0 +1,73 @@
+// Package agent exposes the Runner interface the orchestrator uses to drive
+// doublezero-agent on a device under test (DUT). The skeleton ships a no-op
+// implementation; the SSH-backed runner lands in part 3 of #3746.
+package agent
+
+import (
+	"context"
+	"log/slog"
+	"time"
+)
+
+// EventKind tags an Event so runlog row generation can map it onto the
+// runlog Event vocabulary (`pre_commit_log`, `applied`).
+type EventKind int
+
+const (
+	// EventPreCommitLog marks the moment the agent log shows
+	// `Committing config session due to diffs detected: <diff>` for a new
+	// tunnel interface; carries the parsed tunnel ID.
+	EventPreCommitLog EventKind = iota + 1 // starts at 1, so the zero EventKind matches neither constant
+	// EventApplied marks the moment the agent log shows a commit-success line
+	// for a previously-pending tunnel interface.
+	EventApplied
+)
+
+// Event is one observation emitted by the agent runner: a timestamped tunnel
+// state transition derived from agent log lines.
+type Event struct {
+	Kind     EventKind // which transition was observed
+	TunnelID uint16    // tunnel the transition applies to
+	At       time.Time // timestamp attached to the observation
+}
+
+// Runner drives the DUT's doublezero-agent and reports its tunnel log events.
+//
+// Lifecycle:
+//
+//   - Start(ctx) blocks until the agent stream is healthy enough to emit
+//     events (or returns an error). It returns immediately for the no-op impl.
+//   - Events() returns a channel that closes when the runner exits; the no-op
+//     impl closes it once the ctx passed to Start is done.
+//
+// The SSH-backed implementation will manage an ssh.Session and parse stdout
+// for the two log lines listed under EventKind.
+type Runner interface {
+	Start(ctx context.Context) error
+	Events() <-chan Event
+}
+
+// NewNoop returns a Runner that launches no process and emits no events; its
+// Events channel is unbuffered and only ever gets closed. log may be nil. The
+// orchestrator skeleton and agent-agnostic tests use it as a stand-in.
+func NewNoop(log *slog.Logger) Runner {
+	return &noop{log: log, events: make(chan Event)}
+}
+
+type noop struct {
+	log    *slog.Logger // optional; nil disables the debug line in Start
+	events chan Event   // never sent on; closed after Start's ctx is done
+}
+
+func (n *noop) Start(ctx context.Context) error {
+	if logger := n.log; logger != nil {
+		logger.Debug("agent: noop runner started (no events will be emitted)")
+	}
+	// Close the events channel once ctx is done so receivers observe the end.
+	// NOTE(review): Start is not idempotent; a second call would close
+	// n.events twice and panic.
+	context.AfterFunc(ctx, func() { close(n.events) })
+	return nil
+}
+
+func (n *noop) Events() <-chan Event { return n.events } // receive-only view of n.events
diff --git a/tools/stress/device-orchestrator/pkg/agent/agent_test.go b/tools/stress/device-orchestrator/pkg/agent/agent_test.go
new file mode 100644
index 0000000000..430dae7988
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/agent/agent_test.go
@@ -0,0 +1,32 @@
+package agent_test
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent"
+	"github.com/stretchr/testify/require"
+)
+
+func TestNoopRunner_ClosesEventsWhenContextCancelled(t *testing.T) {
+	t.Parallel()
+
+	runCtx, stop := context.WithCancel(context.Background())
+	runner := agent.NewNoop(nil)
+	require.NoError(t, runner.Start(runCtx))
+
+	select {
+	case <-runner.Events(): // any receive here means an event or an early close
+		t.Fatal("noop runner emitted an event")
+	case <-time.After(50 * time.Millisecond):
+	}
+
+	stop()
+	select {
+	case _, open := <-runner.Events():
+		require.False(t, open, "events channel should close on cancel")
+	case <-time.After(time.Second):
+		t.Fatal("events channel did not close after context cancel")
+	}
+}
diff --git a/tools/stress/device-orchestrator/pkg/exec/exec.go b/tools/stress/device-orchestrator/pkg/exec/exec.go
new file mode 100644
index 0000000000..86badb60f2
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/exec/exec.go
@@ -0,0 +1,139 @@
+// Package exec wires the serviceability SDK behind the sweep.Executor
+// interface. The orchestrator binary uses it against a real RPC; tests in
+// pkg/sweep use a fake to avoid the network.
+package exec
+
+import (
+	"context"
+	"encoding/binary"
+	"fmt"
+	"time"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep"
+)
+
+// Config bundles the inputs the live executor needs.
+type Config struct {
+	Client   *serviceability.Client
+	Executor *serviceability.Executor
+
+	DevicePubkey solana.PublicKey
+	TenantPubkey solana.PublicKey // zero pubkey = no tenant
+
+	// ClientIPBase is the starting /16 block from which sequential per-user
+	// IPs are drawn. For idx i, the assigned IP is ClientIPBase + i.
+	ClientIPBase [4]byte
+	// TunnelEndpoint is passed through to UserCreateArgs verbatim; pass
+	// 0.0.0.0 to use the device's public IP.
+	TunnelEndpoint [4]byte
+	// UserType / CyoaType pin the user kind for the entire sweep.
+	UserType serviceability.UserUserType
+	CyoaType serviceability.CyoaType
+	// DzPrefixCount must match the device's dz_prefixes length; 1 is the
+	// stress-test default.
+	DzPrefixCount uint8
+}
+
+// Live implements sweep.Executor against a real serviceability program.
+type Live struct {
+	cfg Config
+}
+
+// New validates cfg and returns a Live executor. Client and Executor must be
+// non-nil; a zero DzPrefixCount is replaced by the default of 1.
+func New(cfg Config) (*Live, error) {
+	if cfg.Client == nil {
+		return nil, fmt.Errorf("exec.New: Client is required")
+	}
+	if cfg.Executor == nil {
+		return nil, fmt.Errorf("exec.New: Executor is required")
+	}
+	if cfg.DzPrefixCount == 0 {
+		cfg.DzPrefixCount = 1
+	}
+	return &Live{cfg: cfg}, nil
+}
+
+// ListUsers fetches the program data and returns every User account in it,
+// unfiltered; the sweep loop narrows the set by owner through PlanFor.
+func (l *Live) ListUsers(ctx context.Context) ([]serviceability.User, error) {
+	data, err := l.cfg.Client.GetProgramData(ctx)
+	if err == nil {
+		return data.Users, nil
+	}
+	return nil, fmt.Errorf("list users: %w", err)
+}
+
+// CreateUser issues a CreateUser instruction for the idx-th stress user (client
+// IP = ClientIPBase + idx) and returns the timestamps the sweep loop logs.
+func (l *Live) CreateUser(ctx context.Context, idx int) (sweep.CreateResult, error) {
+	args := serviceability.UserCreateArgs{
+		UserType:       l.cfg.UserType,
+		CyoaType:       l.cfg.CyoaType,
+		ClientIP:       ipForIndex(l.cfg.ClientIPBase, idx),
+		TunnelEndpoint: l.cfg.TunnelEndpoint,
+		DzPrefixCount:  l.cfg.DzPrefixCount,
+		DevicePubkey:   l.cfg.DevicePubkey,
+		TenantPubkey:   l.cfg.TenantPubkey,
+	}
+	_, userPDA, err := l.cfg.Executor.CreateUser(ctx, args)
+	if err != nil {
+		return sweep.CreateResult{}, err
+	}
+	now := time.Now()
+
+	// ConfirmedAt and ActivatedAt both use the single wallclock reading above:
+	// the SDK's CreateUser blocks on signature finalization and post-confirm
+	// account visibility, so distinct stage timestamps are not available
+	// today. A future SDK refactor can split these.
+	tunnelID, err := l.fetchTunnelID(ctx, userPDA)
+	if err != nil {
+		// Best-effort: a failed lookup is reported as TunnelID 0 rather than
+		// an error, because the on-chain User already exists at this point.
+		tunnelID = 0
+	}
+	return sweep.CreateResult{
+		UserPDA:     userPDA,
+		TunnelID:    tunnelID,
+		ConfirmedAt: now,
+		ActivatedAt: now,
+	}, nil
+}
+
+// DeleteUser deletes the user at userPDA, stamping both times after the call.
+func (l *Live) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) {
+	var res sweep.DeleteResult
+	_, err := l.cfg.Executor.DeleteUser(ctx, userPDA)
+	if err != nil {
+		return res, err
+	}
+	done := time.Now()
+	res.ConfirmedAt, res.ActivatedAt = done, done
+	return res, nil
+}
+
+// fetchTunnelID is meant to return the TunnelId assigned to the user at
+// userPDA so the runlog can record it for the part-3 agent runner. It is
+// currently a stub: both parameters are ignored and it returns (0, nil).
+func (l *Live) fetchTunnelID(ctx context.Context, userPDA solana.PublicKey) (uint16, error) {
+	// We can't read the assigned tunnel_id without the User's on-chain bytes,
+	// which the SDK doesn't surface from CreateUser. Until a downstream
+	// helper is added, callers either skip this column (TunnelID = 0) or wire
+	// a per-account fetch in cmd/. The package signature is kept stable so
+	// part-3 can drop in the real fetch.
+	return 0, nil
+}
+
+// ipForIndex adds idx (mod 2^16) to the low 16 bits of base and keeps the top
+// two octets, so the result wraps inside base's /16 instead of overflowing.
+func ipForIndex(base [4]byte, idx int) [4]byte {
+	low := binary.BigEndian.Uint16(base[2:]) + uint16(idx) // wraps mod 2^16
+	return [4]byte{
+		base[0],
+		base[1],
+		byte(low >> 8),
+		byte(low),
+	}
+}
diff --git a/tools/stress/device-orchestrator/pkg/exec/exec_test.go b/tools/stress/device-orchestrator/pkg/exec/exec_test.go
new file mode 100644
index 0000000000..c7b13ea30b
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/exec/exec_test.go
@@ -0,0 +1,27 @@
+package exec
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIPForIndex(t *testing.T) {
+	t.Parallel()
+
+	base := [4]byte{100, 64, 0, 0}
+	tests := []struct {
+		idx  int
+		want [4]byte
+	}{
+		{0, [4]byte{100, 64, 0, 0}},
+		{1, [4]byte{100, 64, 0, 1}},
+		{255, [4]byte{100, 64, 0, 255}},
+		{256, [4]byte{100, 64, 1, 0}},
+		{1000, [4]byte{100, 64, 3, 232}},
+	}
+	for _, tc := range tests {
+		got := ipForIndex(base, tc.idx)
+		assert.Equal(t, tc.want, got, "idx=%d", tc.idx)
+	}
+}
diff --git a/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go b/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go
new file mode 100644
index 0000000000..1396928714
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/reconcile/reconcile.go
@@ -0,0 +1,63 @@
+// Package reconcile decides what to create or delete to drive a set of
+// serviceability User accounts toward a desired count. It is pure (no I/O)
+// so the device-stress orchestrator can call it once per batch iteration
+// against live state pulled from the chain.
+package reconcile
+
+import (
+	"bytes"
+	"sort"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+)
+
+// Plan describes the delta needed to drive the set of users owned by a given
+// key toward a desired count.
+type Plan struct {
+	// ToCreate is the number of users to add. Always >= 0.
+	ToCreate int
+	// ToDelete lists user PDAs to remove, in the order they should be deleted.
+	// Sorted by ClientIp ascending, then by PubKey ascending as a tiebreaker,
+	// so repeated calls against the same input produce identical plans.
+	ToDelete []solana.PublicKey
+}
+
+// PlanFor computes the delta that brings the number of users whose Owner is
+// ownerFilter to exactly target. Users owned by anyone else are skipped: they
+// are neither counted nor scheduled for deletion, so the orchestrator can share
+// a program with other tenants without disturbing them.
+//
+// A negative target yields the zero Plan.
+func PlanFor(current []serviceability.User, target int, ownerFilter solana.PublicKey) Plan {
+	if target < 0 {
+		return Plan{}
+	}
+
+	var mine []serviceability.User
+	for idx := range current {
+		if bytes.Equal(current[idx].Owner[:], ownerFilter[:]) {
+			mine = append(mine, current[idx])
+		}
+	}
+
+	if missing := target - len(mine); missing >= 0 {
+		// At or below target: nothing to delete, only (possibly zero) creates.
+		return Plan{ToCreate: missing}
+	}
+
+	// Over target: order deterministically, keep the first target entries.
+	sort.Slice(mine, func(a, b int) bool {
+		byIP := bytes.Compare(mine[a].ClientIp[:], mine[b].ClientIp[:])
+		if byIP == 0 {
+			return bytes.Compare(mine[a].PubKey[:], mine[b].PubKey[:]) < 0
+		}
+		return byIP < 0
+	})
+	surplus := mine[target:]
+	doomed := make([]solana.PublicKey, 0, len(surplus))
+	for _, u := range surplus {
+		doomed = append(doomed, solana.PublicKeyFromBytes(u.PubKey[:]))
+	}
+	return Plan{ToDelete: doomed}
+}
diff --git a/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go b/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go
new file mode 100644
index 0000000000..687bf0f464
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/reconcile/reconcile_test.go
@@ -0,0 +1,166 @@
+package reconcile_test
+
+import (
+	"testing"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/reconcile"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func makeUser(owner, pubkey solana.PublicKey, clientIP [4]byte) serviceability.User {
+	return serviceability.User{
+		Owner:    owner,
+		ClientIp: clientIP,
+		PubKey:   pubkey,
+	}
+}
+
+func TestPlanFor(t *testing.T) {
+	t.Parallel()
+
+	orchestrator := solana.NewWallet().PublicKey()
+	stranger := solana.NewWallet().PublicKey()
+
+	u1 := solana.NewWallet().PublicKey()
+	u2 := solana.NewWallet().PublicKey()
+	u3 := solana.NewWallet().PublicKey()
+	u4 := solana.NewWallet().PublicKey()
+	u5 := solana.NewWallet().PublicKey()
+
+	ip := func(a, b, c, d byte) [4]byte { return [4]byte{a, b, c, d} }
+
+	tests := []struct {
+		name          string
+		current       []serviceability.User
+		target        int
+		owner         solana.PublicKey
+		wantCreate    int
+		wantDeleteIPs [][4]byte
+	}{
+		{
+			name:       "zero to N",
+			current:    nil,
+			target:     4,
+			owner:      orchestrator,
+			wantCreate: 4,
+		},
+		{
+			name: "N to zero deletes in ip-ascending order",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 3)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u3, ip(10, 0, 0, 4)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
+			},
+			target:        0,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 1), ip(10, 0, 0, 2), ip(10, 0, 0, 3), ip(10, 0, 0, 4)},
+		},
+		{
+			name: "partial trim deletes only the overflow",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 5)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 4)),
+				makeUser(orchestrator, u3, ip(10, 0, 0, 3)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 2)),
+				makeUser(orchestrator, u5, ip(10, 0, 0, 1)),
+			},
+			target:        3,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 4), ip(10, 0, 0, 5)},
+		},
+		{
+			name: "partial grow asks for the missing count",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+			},
+			target:     5,
+			owner:      orchestrator,
+			wantCreate: 3,
+		},
+		{
+			name: "only foreign users present grows by full target",
+			current: []serviceability.User{
+				makeUser(stranger, u1, ip(10, 0, 0, 1)),
+				makeUser(stranger, u2, ip(10, 0, 0, 2)),
+				makeUser(stranger, u3, ip(10, 0, 0, 3)),
+			},
+			target:     2,
+			owner:      orchestrator,
+			wantCreate: 2,
+		},
+		{
+			name: "mixed ownership only counts and deletes owned",
+			current: []serviceability.User{
+				makeUser(stranger, u1, ip(10, 0, 0, 9)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+				makeUser(stranger, u3, ip(10, 0, 0, 8)),
+				makeUser(orchestrator, u4, ip(10, 0, 0, 1)),
+			},
+			target:        1,
+			owner:         orchestrator,
+			wantCreate:    0,
+			wantDeleteIPs: [][4]byte{ip(10, 0, 0, 2)},
+		},
+		{
+			name: "already at target produces zero plan",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+				makeUser(orchestrator, u2, ip(10, 0, 0, 2)),
+			},
+			target:     2,
+			owner:      orchestrator,
+			wantCreate: 0,
+		},
+		{
+			name: "negative target produces zero plan",
+			current: []serviceability.User{
+				makeUser(orchestrator, u1, ip(10, 0, 0, 1)),
+			},
+			target:     -1,
+			owner:      orchestrator,
+			wantCreate: 0,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			plan := reconcile.PlanFor(tc.current, tc.target, tc.owner)
+			assert.Equal(t, tc.wantCreate, plan.ToCreate, "ToCreate")
+			require.Len(t, plan.ToDelete, len(tc.wantDeleteIPs), "ToDelete length")
+
+			ipToPubkey := map[[4]byte]solana.PublicKey{}
+			for _, u := range tc.current {
+				ipToPubkey[u.ClientIp] = solana.PublicKeyFromBytes(u.PubKey[:])
+			}
+			for i, ipKey := range tc.wantDeleteIPs {
+				assert.Equal(t, ipToPubkey[ipKey], plan.ToDelete[i], "ToDelete[%d] (clientIp=%v)", i, ipKey)
+			}
+		})
+	}
+}
+
+func TestPlanFor_TieBreaksByPubkey(t *testing.T) {
+	t.Parallel()
+
+	orchestrator := solana.NewWallet().PublicKey()
+	sharedIP := [4]byte{10, 0, 0, 1}
+
+	pkA := solana.PublicKeyFromBytes([]byte{0xAA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
+	pkB := solana.PublicKeyFromBytes([]byte{0xBB, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
+
+	plan := reconcile.PlanFor([]serviceability.User{
+		makeUser(orchestrator, pkB, sharedIP),
+		makeUser(orchestrator, pkA, sharedIP),
+	}, 0, orchestrator)
+
+	require.Len(t, plan.ToDelete, 2)
+	assert.Equal(t, pkA, plan.ToDelete[0])
+	assert.Equal(t, pkB, plan.ToDelete[1])
+}
diff --git a/tools/stress/device-orchestrator/pkg/runlog/runlog.go b/tools/stress/device-orchestrator/pkg/runlog/runlog.go
new file mode 100644
index 0000000000..007a79fa43
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/runlog/runlog.go
@@ -0,0 +1,101 @@
+// Package runlog appends per-event rows to the orchestrator runlog file
+// (`orchestrator-runlog.json`). One row per line; line-delimited JSON so the
+// file can be tailed and downstream tooling can parse incrementally.
+//
+// Row schema (per #3746):
+//
+//	{run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event}
+//
+// `t_ns` is the unix epoch in nanoseconds. `n_after_event` is the size of the
+// active user set immediately after the event applied — provisioning increments
+// it on `activate`, deprovisioning decrements on `deprovision_activate`. Other
+// events carry the count as-of-emission.
+package runlog
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"sync"
+	"time"
+)
+
+// Event enumerates the recognized event names. Stringly-typed in the file so
+// the schema can grow without consumers needing to track an enum.
+type Event string
+
+const (
+	EventSubmit              Event = "submit"
+	EventConfirm             Event = "confirm"
+	EventActivate            Event = "activate"
+	EventPreCommitLog        Event = "pre_commit_log" // emitted by part-3 agent runner
+	EventApplied             Event = "applied"        // emitted by part-3 agent runner
+	EventDeprovisionSubmit   Event = "deprovision_submit"
+	EventDeprovisionConfirm  Event = "deprovision_confirm"
+	EventDeprovisionActivate Event = "deprovision_activate"
+)
+
+// Row is one entry in the runlog file. Field names match #3746's schema.
+type Row struct {
+	RunID       string `json:"run_id"`
+	UserIndex   int    `json:"user_index"`
+	UserPubkey  string `json:"user_pubkey"`
+	TunnelID    uint16 `json:"tunnel_id"`
+	Event       Event  `json:"event"`
+	TNs         int64  `json:"t_ns"`
+	NAfterEvent int    `json:"n_after_event"`
+}
+
+// Writer appends rows to an open file in line-delimited JSON.
+type Writer struct {
+	mu   sync.Mutex
+	w    io.WriteCloser
+	path string
+}
+
+// Open creates the file at path, truncating existing content, for writing.
+func Open(path string) (*Writer, error) {
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
+	if err != nil {
+		return nil, fmt.Errorf("open runlog %s: %w", path, err)
+	}
+	return &Writer{w: f, path: path}, nil
+}
+
+// Path returns the file path the writer is appending to.
+func (w *Writer) Path() string { return w.path }
+
+// Append writes row as one JSON line; a zero TNs is set to the current time.
+func (w *Writer) Append(row Row) error {
+	if row.TNs == 0 {
+		row.TNs = time.Now().UnixNano()
+	}
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.w == nil {
+		return errors.New("runlog writer closed")
+	}
+	buf, err := json.Marshal(row)
+	if err != nil {
+		return fmt.Errorf("marshal runlog row: %w", err)
+	}
+	buf = append(buf, '\n')
+	if _, err := w.w.Write(buf); err != nil {
+		return fmt.Errorf("write runlog row: %w", err)
+	}
+	return nil
+}
+
+// Close closes the file; repeat calls return nil and later Appends fail.
+func (w *Writer) Close() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.w == nil {
+		return nil
+	}
+	err := w.w.Close()
+	w.w = nil
+	return err
+}
diff --git a/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go b/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go
new file mode 100644
index 0000000000..ca0cb31ddf
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/runlog/runlog_test.go
@@ -0,0 +1,93 @@
+package runlog_test
+
+import (
+	"bufio"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestWriter_RoundTrip(t *testing.T) {
+	t.Parallel()
+
+	path := filepath.Join(t.TempDir(), "orchestrator-runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+
+	rows := []runlog.Row{
+		{RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventSubmit, TNs: 1000, NAfterEvent: 0},
+		{RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventConfirm, TNs: 2000, NAfterEvent: 0},
+		{RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventActivate, TNs: 3000, NAfterEvent: 1},
+		{RunID: "run-1", UserIndex: 0, UserPubkey: "pk0", TunnelID: 500, Event: runlog.EventDeprovisionActivate, TNs: 4000, NAfterEvent: 0},
+	}
+	for _, r := range rows {
+		require.NoError(t, w.Append(r))
+	}
+	require.NoError(t, w.Close())
+
+	// File ends with a newline; one row per line.
+	f, err := os.Open(path)
+	require.NoError(t, err)
+	defer f.Close()
+
+	var read []runlog.Row
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		var r runlog.Row
+		require.NoError(t, json.Unmarshal(scanner.Bytes(), &r))
+		read = append(read, r)
+	}
+	require.NoError(t, scanner.Err())
+
+	assert.Equal(t, rows, read)
+}
+
+func TestWriter_FillsMissingTimestamp(t *testing.T) {
+	t.Parallel()
+
+	path := filepath.Join(t.TempDir(), "orchestrator-runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	defer w.Close()
+
+	require.NoError(t, w.Append(runlog.Row{RunID: "r", UserIndex: 0, UserPubkey: "pk", Event: runlog.EventSubmit}))
+
+	data, err := os.ReadFile(path)
+	require.NoError(t, err)
+
+	var r runlog.Row
+	require.NoError(t, json.Unmarshal(data[:len(data)-1], &r))
+	assert.NotZero(t, r.TNs, "Append should fill t_ns when zero")
+}
+
+func TestWriter_RejectsAfterClose(t *testing.T) {
+	t.Parallel()
+
+	w, err := runlog.Open(filepath.Join(t.TempDir(), "orchestrator-runlog.json"))
+	require.NoError(t, err)
+	require.NoError(t, w.Close())
+
+	err = w.Append(runlog.Row{RunID: "r", Event: runlog.EventSubmit})
+	require.Error(t, err)
+}
+
+func TestWriter_Truncates(t *testing.T) {
+	t.Parallel()
+
+	path := filepath.Join(t.TempDir(), "orchestrator-runlog.json")
+	require.NoError(t, os.WriteFile(path, []byte("stale\n"), 0o644))
+
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	require.NoError(t, w.Append(runlog.Row{RunID: "r", Event: runlog.EventSubmit, TNs: 1}))
+	require.NoError(t, w.Close())
+
+	data, err := os.ReadFile(path)
+	require.NoError(t, err)
+	assert.NotContains(t, string(data), "stale", "Open(path) should truncate existing content")
+}
diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep.go b/tools/stress/device-orchestrator/pkg/sweep/sweep.go
new file mode 100644
index 0000000000..cda03412c1
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/sweep/sweep.go
@@ -0,0 +1,262 @@
+// Package sweep implements the device-orchestrator sweep loop:
+//
+//   - Provision phase: walks 0 → Target users in batches of UsersPerBatch,
+//     using reconcile.PlanFor to query live state and ask the Executor to
+//     create the delta, holding for Hold between batches.
+//   - Deprovision phase: walks Target → 0 in reverse order of creation,
+//     so the youngest user is removed first.
+//
+// Per #3746, the sweep cooperates with the abort signal between user
+// iterations — it never cancels a mid-flight Create/Delete.
+package sweep
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/reconcile"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog"
+)
+
+// Clock abstracts the wallclock for testability. Real callers pass RealClock;
+// tests inject a fake that fires `After` channels manually.
+type Clock interface {
+	Now() time.Time
+	After(d time.Duration) <-chan time.Time
+}
+
+// RealClock is the production wallclock implementation.
+type RealClock struct{}
+
+func (RealClock) Now() time.Time                         { return time.Now() }
+func (RealClock) After(d time.Duration) <-chan time.Time { return time.After(d) }
+
+// CreateResult captures the per-user details the sweep emits into the runlog
+// for a successful provision. ConfirmedAt and ActivatedAt are sourced from
+// the Executor so a future SDK refactor can give them distinct values; today
+// they are typically equal because the SDK's `CreateUser` blocks on both
+// finalization and account visibility before returning.
+type CreateResult struct {
+	UserPDA     solana.PublicKey
+	TunnelID    uint16
+	ConfirmedAt time.Time
+	ActivatedAt time.Time
+}
+
+// DeleteResult is the deprovision analog of CreateResult.
+type DeleteResult struct {
+	ConfirmedAt time.Time
+	ActivatedAt time.Time
+}
+
+// Executor is the interface the sweep depends on for chain I/O. Tests inject
+// a fake; the real implementation wraps `serviceability.Executor` plus a small
+// post-create fetch to discover the assigned TunnelId.
+type Executor interface {
+	ListUsers(ctx context.Context) ([]serviceability.User, error)
+	CreateUser(ctx context.Context, idx int) (CreateResult, error)
+	DeleteUser(ctx context.Context, userPDA solana.PublicKey) (DeleteResult, error)
+}
+
+// Config bundles all sweep parameters; pass by value to Run.
+type Config struct {
+	RunID         string
+	Target        int
+	UsersPerBatch int
+	Hold          time.Duration
+	OwnerFilter   solana.PublicKey
+
+	Executor Executor
+	Agent    agent.Runner
+	Runlog   *runlog.Writer
+	Clock    Clock
+	Logger   *slog.Logger
+}
+
+func (c *Config) validate() error {
+	switch {
+	case c.Target < 0:
+		return errors.New("sweep: Target must be >= 0")
+	case c.UsersPerBatch <= 0:
+		return errors.New("sweep: UsersPerBatch must be > 0")
+	case c.Hold < 0:
+		return errors.New("sweep: Hold must be >= 0")
+	case c.RunID == "":
+		return errors.New("sweep: RunID is required")
+	case c.Executor == nil:
+		return errors.New("sweep: Executor is required")
+	case c.Runlog == nil:
+		return errors.New("sweep: Runlog is required")
+	}
+	if c.Clock == nil {
+		c.Clock = RealClock{}
+	}
+	if c.Logger == nil {
+		c.Logger = slog.Default()
+	}
+	if c.Agent == nil {
+		c.Agent = agent.NewNoop(c.Logger)
+	}
+	return nil
+}
+
+// createdUser tracks an orchestrator-owned user so the deprovision phase can
+// iterate in reverse-creation order, independent of live state.
+type createdUser struct {
+	idx      int
+	pubkey   solana.PublicKey
+	tunnelID uint16
+}
+
+// Run drives the provision-then-deprovision sweep and returns its first error.
+// A provision failure other than context.Canceled returns without tear-down.
+func Run(ctx context.Context, cfg Config) error {
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+	if err := cfg.Agent.Start(ctx); err != nil {
+		return fmt.Errorf("start agent runner: %w", err)
+	}
+
+	created, err := provision(ctx, &cfg)
+	if err != nil && !errors.Is(err, context.Canceled) {
+		return err
+	}
+	// Always attempt deprovision so an abort during provision still cleans up
+	// what the sweep created. deprovision stops on a done context, so after a
+	// cancel it gets an uncancellable child and the tear-down can finish.
+	depCtx := ctx
+	if errors.Is(ctx.Err(), context.Canceled) {
+		depCtx = context.WithoutCancel(ctx)
+	}
+	if depErr := deprovision(depCtx, &cfg, created); err == nil {
+		return depErr
+	}
+	return err
+}
+
+// provision walks 0 → Target in batches, returning the slice of created users
+// so deprovision can iterate in reverse. Returns ctx.Err() if cancelled
+// between users.
+func provision(ctx context.Context, cfg *Config) ([]createdUser, error) {
+	if cfg.Target == 0 {
+		return nil, nil
+	}
+	var created []createdUser
+	runningTarget := 0
+	activeCount := 0
+
+	for runningTarget < cfg.Target {
+		if err := ctx.Err(); err != nil {
+			return created, err
+		}
+
+		nextTarget := runningTarget + cfg.UsersPerBatch
+		if nextTarget > cfg.Target {
+			nextTarget = cfg.Target
+		}
+
+		users, err := cfg.Executor.ListUsers(ctx)
+		if err != nil {
+			return created, fmt.Errorf("list users for batch starting at %d: %w", activeCount, err)
+		}
+		plan := reconcile.PlanFor(users, nextTarget, cfg.OwnerFilter)
+		if len(plan.ToDelete) > 0 {
+			cfg.Logger.Warn("sweep: PlanFor wants to delete pre-existing users; skipping (orchestrator only creates this run)",
+				"count", len(plan.ToDelete))
+		}
+
+		for i := 0; i < plan.ToCreate; i++ {
+			if err := ctx.Err(); err != nil {
+				return created, err
+			}
+			idx := activeCount
+			submitAt := cfg.Clock.Now()
+			if err := emit(cfg, idx, "", 0, runlog.EventSubmit, submitAt, activeCount); err != nil {
+				return created, err
+			}
+
+			res, err := cfg.Executor.CreateUser(ctx, idx)
+			if err != nil {
+				return created, fmt.Errorf("create user idx=%d: %w", idx, err)
+			}
+			pkStr := res.UserPDA.String()
+			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventConfirm, res.ConfirmedAt, activeCount); err != nil {
+				return created, err
+			}
+			created = append(created, createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID})
+			activeCount++
+			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventActivate, res.ActivatedAt, activeCount); err != nil {
+				return created, err
+			}
+		}
+
+		runningTarget = nextTarget
+		if runningTarget >= cfg.Target {
+			break
+		}
+		if cfg.Hold > 0 {
+			select {
+			case <-cfg.Clock.After(cfg.Hold):
+			case <-ctx.Done():
+				return created, ctx.Err()
+			}
+		}
+	}
+	return created, nil
+}
+
+// deprovision deletes the created users newest-first, emitting deprovision_*
+// events for each; it returns ctx.Err() once ctx is done between users.
+func deprovision(ctx context.Context, cfg *Config, created []createdUser) error {
+	activeCount := len(created)
+	for i := len(created) - 1; i >= 0; i-- {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		u := created[i]
+		pkStr := u.pubkey.String()
+		submitAt := cfg.Clock.Now()
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionSubmit, submitAt, activeCount); err != nil {
+			return err
+		}
+
+		res, err := cfg.Executor.DeleteUser(ctx, u.pubkey)
+		if err != nil {
+			return fmt.Errorf("delete user idx=%d pubkey=%s: %w", u.idx, pkStr, err)
+		}
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionConfirm, res.ConfirmedAt, activeCount); err != nil {
+			return err
+		}
+		activeCount--
+		if err := emit(cfg, u.idx, pkStr, u.tunnelID, runlog.EventDeprovisionActivate, res.ActivatedAt, activeCount); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// emit appends one runlog row for ev; a zero at is replaced by the clock's now.
+func emit(cfg *Config, idx int, pubkey string, tunnelID uint16, ev runlog.Event, at time.Time, nAfter int) error {
+	if at.IsZero() {
+		at = cfg.Clock.Now()
+	}
+	if err := cfg.Runlog.Append(runlog.Row{
+		RunID:       cfg.RunID,
+		UserIndex:   idx,
+		UserPubkey:  pubkey,
+		TunnelID:    tunnelID,
+		Event:       ev,
+		TNs:         at.UnixNano(),
+		NAfterEvent: nAfter,
+	}); err != nil {
+		return fmt.Errorf("runlog append %s: %w", ev, err)
+	}
+	return nil
+}
diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
new file mode 100644
index 0000000000..3402d5fe8c
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
@@ -0,0 +1,321 @@
+package sweep_test
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/gagliardetto/solana-go"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/runlog"
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/sweep"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// fakeClock provides deterministic Now() and a manually-fired After channel so
+// the sweep's hold call returns instantly under test.
+type fakeClock struct {
+	mu    sync.Mutex
+	now   time.Time
+	holds int
+}
+
+func (f *fakeClock) Now() time.Time {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.now = f.now.Add(time.Microsecond) // advance so successive Now() calls differ
+	return f.now
+}
+
+func (f *fakeClock) After(d time.Duration) <-chan time.Time {
+	f.mu.Lock()
+	f.holds++
+	f.mu.Unlock()
+	ch := make(chan time.Time, 1)
+	ch <- time.Now()
+	return ch
+}
+
+func (f *fakeClock) HoldCount() int {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return f.holds
+}
+
+// fakeExecutor records create/delete calls. ListUsers always returns the
+// orchestrator-owned set so PlanFor produces the right delta.
+type fakeExecutor struct {
+	mu      sync.Mutex
+	owner   solana.PublicKey
+	created []serviceability.User
+	createN atomic.Int32
+	deleteN atomic.Int32
+
+	// Optional hook to fail on the Nth create (1-based) — used by the abort test.
+	failCreateOnCall int
+	failErr          error
+}
+
+func newFakeExecutor(owner solana.PublicKey) *fakeExecutor {
+	return &fakeExecutor{owner: owner}
+}
+
+func (f *fakeExecutor) ListUsers(ctx context.Context) ([]serviceability.User, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]serviceability.User, len(f.created))
+	copy(out, f.created)
+	return out, nil
+}
+
+func (f *fakeExecutor) CreateUser(ctx context.Context, idx int) (sweep.CreateResult, error) {
+	calls := int(f.createN.Add(1))
+	if f.failCreateOnCall == calls && f.failErr != nil {
+		return sweep.CreateResult{}, f.failErr
+	}
+
+	// Deterministic pubkey from idx, IP = 100.0.0.idx+1 so PlanFor sorts cleanly.
+	var pk solana.PublicKey
+	pk[0] = byte(idx)
+	pk[31] = 0xAA
+
+	f.mu.Lock()
+	f.created = append(f.created, serviceability.User{
+		Owner:    f.owner,
+		ClientIp: [4]byte{100, 0, 0, byte(idx + 1)},
+		PubKey:   pk,
+	})
+	f.mu.Unlock()
+
+	now := time.Unix(1_700_000_000, int64(calls)*1_000_000) // micro-spaced timestamps
+	return sweep.CreateResult{
+		UserPDA:     pk,
+		TunnelID:    uint16(500 + idx),
+		ConfirmedAt: now,
+		ActivatedAt: now.Add(time.Millisecond),
+	}, nil
+}
+
+func (f *fakeExecutor) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) {
+	calls := int(f.deleteN.Add(1))
+	f.mu.Lock()
+	// Remove the matching user from the active set.
+	for i, u := range f.created {
+		if solana.PublicKeyFromBytes(u.PubKey[:]).Equals(userPDA) {
+			f.created = append(f.created[:i], f.created[i+1:]...)
+			break
+		}
+	}
+	f.mu.Unlock()
+
+	now := time.Unix(1_700_000_000, int64(calls+1000)*1_000_000)
+	return sweep.DeleteResult{
+		ConfirmedAt: now,
+		ActivatedAt: now.Add(time.Millisecond),
+	}, nil
+}
+
+func readRows(t *testing.T, path string) []runlog.Row {
+	t.Helper()
+	f, err := os.Open(path)
+	require.NoError(t, err)
+	defer f.Close()
+	var rows []runlog.Row
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		var r runlog.Row
+		require.NoError(t, json.Unmarshal(s.Bytes(), &r))
+		rows = append(rows, r)
+	}
+	require.NoError(t, s.Err())
+	return rows
+}
+
+func TestRun_ProvisionsThenDeprovisionsInReverseOrder(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	exec := newFakeExecutor(owner)
+	path := filepath.Join(t.TempDir(), "orchestrator-runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = w.Close() })
+
+	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
+	cfg := sweep.Config{
+		RunID:         "run-test",
+		Target:        4,
+		UsersPerBatch: 2,
+		Hold:          10 * time.Second,
+		OwnerFilter:   owner,
+		Executor:      exec,
+		Agent:         agent.NewNoop(nil),
+		Runlog:        w,
+		Clock:         clk,
+	}
+	require.NoError(t, sweep.Run(context.Background(), cfg))
+	require.NoError(t, w.Close())
+
+	rows := readRows(t, path)
+	// 4 provisions × 3 events + 4 deprovisions × 3 events = 24 rows
+	require.Len(t, rows, 24)
+
+	// Provision phase: ascending user_index, events submit→confirm→activate.
+	for i := 0; i < 4; i++ {
+		base := i * 3
+		assert.Equal(t, i, rows[base].UserIndex, "row %d", base)
+		assert.Equal(t, runlog.EventSubmit, rows[base].Event)
+		assert.Equal(t, runlog.EventConfirm, rows[base+1].Event)
+		assert.Equal(t, runlog.EventActivate, rows[base+2].Event)
+		assert.Equal(t, uint16(500+i), rows[base+1].TunnelID, "tunnel_id propagates after confirm")
+		assert.Equal(t, i+1, rows[base+2].NAfterEvent, "activate increments active count")
+	}
+
+	// Deprovision phase: descending user_index (reverse creation order), events deprovision_submit/confirm/activate.
+	for k := 0; k < 4; k++ {
+		base := 12 + k*3
+		expectedIdx := 3 - k // 3, 2, 1, 0
+		assert.Equal(t, expectedIdx, rows[base].UserIndex)
+		assert.Equal(t, runlog.EventDeprovisionSubmit, rows[base].Event)
+		assert.Equal(t, runlog.EventDeprovisionConfirm, rows[base+1].Event)
+		assert.Equal(t, runlog.EventDeprovisionActivate, rows[base+2].Event)
+		assert.Equal(t, 3-k, rows[base+2].NAfterEvent, "deprovision_activate decrements active count")
+	}
+
+	// Hold called between batches but not after the final provision batch.
+	// Target=4, UsersPerBatch=2 → batches at [0..2), [2..4); one hold between them.
+	assert.Equal(t, 1, clk.HoldCount(), "Hold should fire once (between batches), not after reaching target")
+
+	// Executor calls match the totals.
+	assert.Equal(t, int32(4), exec.createN.Load())
+	assert.Equal(t, int32(4), exec.deleteN.Load())
+}
+
+func TestRun_HandlesZeroTarget(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	exec := newFakeExecutor(owner)
+	path := filepath.Join(t.TempDir(), "runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = w.Close() })
+
+	cfg := sweep.Config{
+		RunID:         "run-zero",
+		Target:        0,
+		UsersPerBatch: 2,
+		Hold:          time.Second,
+		OwnerFilter:   owner,
+		Executor:      exec,
+		Runlog:        w,
+		Clock:         &fakeClock{now: time.Unix(1, 0)},
+	}
+	require.NoError(t, sweep.Run(context.Background(), cfg))
+	require.NoError(t, w.Close())
+
+	rows := readRows(t, path)
+	assert.Empty(t, rows)
+	assert.Zero(t, exec.createN.Load())
+	assert.Zero(t, exec.deleteN.Load())
+}
+
+func TestRun_AbortBetweenUsersStillCleansUp(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	exec := newFakeExecutor(owner)
+	exec.failCreateOnCall = 3
+	exec.failErr = context.Canceled
+
+	path := filepath.Join(t.TempDir(), "runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = w.Close() })
+
+	cfg := sweep.Config{
+		RunID:         "run-abort",
+		Target:        4,
+		UsersPerBatch: 2,
+		Hold:          time.Second,
+		OwnerFilter:   owner,
+		Executor:      exec,
+		Runlog:        w,
+		Clock:         &fakeClock{now: time.Unix(1, 0)},
+	}
+	err = sweep.Run(context.Background(), cfg)
+	require.Error(t, err, "abort during provision should surface error")
+
+	// Even on abort, deprovision should fire for the two users that were created.
+	require.NoError(t, w.Close())
+	rows := readRows(t, path)
+
+	// 2 provisions × 3 events = 6; plus a submit event for the failed third; plus 2 deprovision sets.
+	deprovisionActivates := 0
+	for _, r := range rows {
+		if r.Event == runlog.EventDeprovisionActivate {
+			deprovisionActivates++
+		}
+	}
+	assert.Equal(t, 2, deprovisionActivates, "every created user should be deprovisioned on abort")
+}
+
+func TestRun_RejectsInvalidConfig(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		cfg  sweep.Config
+	}{
+		{name: "negative target", cfg: sweep.Config{Target: -1, UsersPerBatch: 1, RunID: "r", Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}},
+		{name: "zero batch", cfg: sweep.Config{Target: 1, UsersPerBatch: 0, RunID: "r", Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}},
+		{name: "missing run id", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, Executor: &fakeExecutor{}, Runlog: &runlog.Writer{}}},
+		{name: "missing executor", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, RunID: "r", Runlog: &runlog.Writer{}}},
+		{name: "missing runlog", cfg: sweep.Config{Target: 1, UsersPerBatch: 1, RunID: "r", Executor: &fakeExecutor{}}},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			err := sweep.Run(context.Background(), tc.cfg)
+			require.Error(t, err)
+		})
+	}
+}
+
+// Sanity: ctx cancellation between users is observed at the next iteration boundary.
+func TestRun_CancellationStopsBetweenUsers(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	exec := newFakeExecutor(owner)
+	path := filepath.Join(t.TempDir(), "runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = w.Close() })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // pre-cancelled
+
+	cfg := sweep.Config{
+		RunID:         "run-cancel",
+		Target:        4,
+		UsersPerBatch: 2,
+		Hold:          time.Second,
+		OwnerFilter:   owner,
+		Executor:      exec,
+		Runlog:        w,
+		Clock:         &fakeClock{now: time.Unix(1, 0)},
+	}
+	err = sweep.Run(ctx, cfg)
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, context.Canceled))
+	assert.Zero(t, exec.createN.Load(), "no users should be created when ctx is pre-cancelled")
+}

From 1d2113eaa4022e0425a84c36aebfd78688dbf266 Mon Sep 17 00:00:00 2001
From: Greg Mitchell <greg@malbeclabs.com>
Date: Wed, 27 May 2026 22:31:14 +0000
Subject: [PATCH 5/5] tools/stress: agent SSH + log parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completes the device-stress orchestrator (#3746) by replacing the no-op
AgentRunner with the live SSH-driven runner and the log parser that turns
agent diff/commit lines into pre_commit_log / applied events.

- pkg/agent/parser.go — Parser tracks two lines from
  controlplane/agent/pkg/arista/eapi.go: `Committing config session due to
  diffs detected: <diff>` (extracts every `+ interface Tunnel<ID>` and emits
  one pre_commit_log event per ID) and `Configuration session finalized with
  command '... commit'` (emits one applied event per pending tunnel; the
  abort variant clears the buffer without emitting).
- pkg/agent/ssh.go — Dials --dut-ssh-host with --dut-ssh-key, execs the
  configured doublezero-agent command (verbose, with optional --controller),
  and tees remote stdout/stderr into <working-dir>/orchestrator.agent.log
  while feeding lines through the parser. Host-key verification is
  InsecureIgnoreHostKey because targets are ephemeral cEOS containers.
- pkg/sweep — adds a consumer goroutine that reads Agent.Events() and writes
  pre_commit_log / applied rows by looking up each event's tunnel ID in a
  registry the provision goroutine populates as users are created. Unknown
  tunnels are debug-logged and dropped. The agent is started under a derived
  context so deprovision-then-clean-shutdown works without leaking the
  goroutine.
- pkg/exec.fetchTunnelID — implemented properly: GetAccountInfo on the user
  PDA, DeserializeUser, return User.TunnelId. Required adding an RPC field
  to exec.Config.
- cmd/device-orchestrator — new flags --dut-ssh-user (default `admin`) and
  --no-agent (offline testing); SSH runner becomes the default when
  --dut-ssh-host and --dut-ssh-key are both set.

Part 3 of #3746. Closes #3772.
---
 CHANGELOG.md                                  |   1 +
 go.mod                                        |   2 +-
 .../cmd/device-orchestrator/main.go           |  44 +++-
 .../device-orchestrator/pkg/agent/parser.go   | 125 ++++++++++
 .../pkg/agent/parser_test.go                  | 134 ++++++++++
 .../device-orchestrator/pkg/agent/ssh.go      | 228 ++++++++++++++++++
 .../device-orchestrator/pkg/exec/exec.go      |  36 ++-
 .../device-orchestrator/pkg/exec/exec_test.go |  85 +++++++
 .../device-orchestrator/pkg/sweep/sweep.go    | 117 ++++++++-
 .../pkg/sweep/sweep_test.go                   | 116 +++++++++
 10 files changed, 865 insertions(+), 23 deletions(-)
 create mode 100644 tools/stress/device-orchestrator/pkg/agent/parser.go
 create mode 100644 tools/stress/device-orchestrator/pkg/agent/parser_test.go
 create mode 100644 tools/stress/device-orchestrator/pkg/agent/ssh.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d71edb5e5e..a19e260009 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ All notable changes to this project will be documented in this file.
   - Add `CreateUser` (instruction variant 36) and `DeleteUser` (variant 42) to the serviceability executor. Account ordering mirrors the Rust SDK at `smartcontract/sdk/rs/src/commands/user/{create,delete}.rs`; the borsh-encoded payload matches Rust's `UserCreateArgs` / `UserDeleteArgs` exactly. Both methods wait for the user PDA to become visible (or disappear) on-chain after finalization so callers can record a meaningful `t_activate` against the operation. `UserCreateArgs` bundles the borsh-encoded fields with `DevicePubkey` / optional `TenantPubkey` for account derivation. Introduces `GetUserPDA`, `GetAccessPassPDA`, `GetTunnelIdsPDA`, `GetDzPrefixBlockPDA` helpers in `pda.go`. Cross-language wire format is locked down by new Rust-generated `user_create_args.{bin,json}` and `user_delete_args.{bin,json}` fixtures that the Go tests load via the existing fixture pipeline ([#3770](https://github.com/malbeclabs/doublezero/issues/3770)).
 - Tools
   - Add `tools/stress/device-orchestrator/` — the device-stress orchestrator skeleton for the GRE Tunnel Capacity Study. The binary parses every flag from #3746's CLI list, dumps `orchestrator-config.json` on start, runs a provision-then-reverse-deprovision sweep against a live serviceability program, and emits the runlog row schema `{run_id, user_index, user_pubkey, tunnel_id, event, t_ns, n_after_event}` to `orchestrator-runlog.json` for each `submit | confirm | activate | deprovision_*` event. The agent runner is stubbed behind a `pkg/agent.Runner` interface (no-op impl ships now; the SSH-backed runner that emits `pre_commit_log` / `applied` lands in part 3). The sweep cooperates with an abort sentinel file: when the file appears the in-flight user completes and the orchestrator deprovisions everything it created before exiting non-zero. `PlanReconcile` / `Plan` (lifted from the part-1 SDK PR) now lives at `tools/stress/device-orchestrator/pkg/reconcile/` as orchestrator policy rather than SDK primitive. Part 2 of #3746 ([#3771](https://github.com/malbeclabs/doublezero/issues/3771)).
+  - Complete the device-stress orchestrator with the SSH agent runner and log parser. `pkg/agent/ssh.go` dials `--dut-ssh-host` with `--dut-ssh-key`, execs `doublezero-agent -verbose` (appending `--controller` when set), and tees remote stdout/stderr into `<working-dir>/orchestrator.agent.log` while feeding the stream through `pkg/agent/parser.go`. The parser tracks two log lines from `controlplane/agent/pkg/arista/eapi.go`: `Committing config session due to diffs detected: <diff>` (extracting `+ interface Tunnel<ID>` matches and emitting one `pre_commit_log` event per ID) and `Configuration session finalized with command '... commit'` (emitting one `applied` event per pending tunnel; the `... abort` variant clears the buffer without emitting). The sweep grows a goroutine that consumes agent events and writes `pre_commit_log` / `applied` runlog rows by looking up each event's tunnel ID against a `tunnelID → user_index` map populated as users are created; unknown tunnels are debug-logged and dropped. `pkg/exec.fetchTunnelID` now reads the on-chain user account post-create to surface the assigned `TunnelId` into the runlog. New CLI flags: `--dut-ssh-user` (default `admin`) and `--no-agent` for offline testing. Host-key verification uses `ssh.InsecureIgnoreHostKey` because the orchestrator targets ephemeral cEOS containers; documented at `pkg/agent/ssh.go:SSH`. Part 3 of #3746, completes the five-event coverage ([#3772](https://github.com/malbeclabs/doublezero/issues/3772)).
 
 ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22
 
diff --git a/go.mod b/go.mod
index 86f96993ce..3eb0a6b9e1 100644
--- a/go.mod
+++ b/go.mod
@@ -60,6 +60,7 @@ require (
 	github.com/twmb/franz-go/pkg/kadm v1.17.1
 	github.com/vishvananda/netlink v1.3.1
 	github.com/vishvananda/netns v0.0.5
+	golang.org/x/crypto v0.49.0
 	golang.org/x/mod v0.33.0
 	golang.org/x/net v0.52.0
 	golang.org/x/sync v0.20.0
@@ -193,7 +194,6 @@ require (
 	go.yaml.in/yaml/v2 v2.4.3 // indirect
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect
-	golang.org/x/crypto v0.49.0 // indirect
 	golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect
 	golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4 // indirect
 	golang.org/x/term v0.41.0 // indirect
diff --git a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
index ab01975d30..82c05c3534 100644
--- a/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
+++ b/tools/stress/device-orchestrator/cmd/device-orchestrator/main.go
@@ -43,6 +43,7 @@ type orchestratorConfig struct {
 	DUTPubkey       string `json:"dut_pubkey"`
 	DUTSSHHost      string `json:"dut_ssh_host"`
 	DUTSSHKey       string `json:"dut_ssh_key"`
+	DUTSSHUser      string `json:"dut_ssh_user"`
 	RPCURL          string `json:"rpc_url"`
 	ProgramID       string `json:"program_id"`
 	KeypairPath     string `json:"keypair"`
@@ -52,6 +53,7 @@ type orchestratorConfig struct {
 	ClientIPBase    string `json:"client_ip_base"`
 	TunnelEndpoint  string `json:"tunnel_endpoint"`
 	TenantPubkey    string `json:"tenant_pubkey,omitempty"`
+	NoAgent         bool   `json:"no_agent"`
 }
 
 func main() {
@@ -81,6 +83,8 @@ func run() error {
 		runID           = flag.String("run-id", "", "Run identifier written into every runlog row; auto-generated if empty.")
 		logLevel        = flag.String("log-level", "info", "slog level: debug|info|warn|error.")
 		dryRun          = flag.Bool("dry-run", false, "Validate flags and dump orchestrator-config.json without contacting the RPC.")
+		dutSSHUser      = flag.String("dut-ssh-user", "admin", "SSH user for the DUT.")
+		noAgent         = flag.Bool("no-agent", false, "Use the no-op AgentRunner even when SSH flags are set (offline testing).")
 	)
 	flag.Parse()
 
@@ -116,6 +120,7 @@ func run() error {
 		DUTPubkey:       *dutPubkey,
 		DUTSSHHost:      *dutSSHHost,
 		DUTSSHKey:       *dutSSHKey,
+		DUTSSHUser:      *dutSSHUser,
 		RPCURL:          *rpcURL,
 		ProgramID:       *programID,
 		KeypairPath:     *keypairPath,
@@ -125,6 +130,7 @@ func run() error {
 		ClientIPBase:    *clientIPBase,
 		TunnelEndpoint:  *tunnelEndpoint,
 		TenantPubkey:    *tenantPubkey,
+		NoAgent:         *noAgent,
 	}
 	configPath := filepath.Join(*workingDir, "orchestrator-config.json")
 	if err := dumpJSON(configPath, resolved); err != nil {
@@ -174,6 +180,7 @@ func run() error {
 	liveExec, err := exec.New(exec.Config{
 		Client:         client,
 		Executor:       executor,
+		RPC:            rpc,
 		DevicePubkey:   dutPK,
 		TenantPubkey:   tenantPK,
 		ClientIPBase:   baseIP,
@@ -200,6 +207,8 @@ func run() error {
 	ctx, abortCancel := abort.Watch(rootCtx, *abortFile, abort.DefaultPollInterval, logger)
 	defer abortCancel()
 
+	agentRunner := selectAgentRunner(*noAgent, *dutSSHHost, *dutSSHKey, *dutSSHUser, *controllerAddr, *workingDir, logger)
+
 	cfg := sweep.Config{
 		RunID:         *runID,
 		Target:        *targetUserCount,
@@ -207,7 +216,7 @@ func run() error {
 		Hold:          time.Duration(*holdSeconds) * time.Second,
 		OwnerFilter:   signer.PublicKey(),
 		Executor:      liveExec,
-		Agent:         agent.NewNoop(logger),
+		Agent:         agentRunner,
 		Runlog:        rlw,
 		Clock:         sweep.RealClock{},
 		Logger:        logger,
@@ -262,6 +271,39 @@ func requireFlags(required map[string]string) error {
 	return nil
 }
 
+// selectAgentRunner decides which agent.Runner the sweep uses, from the CLI
+// flags, checked in this order:
+//
+//   - noAgent set → no-op runner (operator opted out)
+//   - sshHost or sshKey empty → no-op runner, with a warning
+//   - otherwise → SSH runner
+//
+// The SSH runner is given <workingDir>/orchestrator.agent.log as its log path,
+// and its command gains "-controller <addr>" only when controllerAddr is set.
+func selectAgentRunner(noAgent bool, sshHost, sshKey, sshUser, controllerAddr, workingDir string, logger *slog.Logger) agent.Runner {
+	switch {
+	case noAgent:
+		logger.Info("agent: --no-agent set; using no-op runner")
+		return agent.NewNoop(logger)
+	case sshHost == "" || sshKey == "":
+		logger.Warn("agent: --dut-ssh-host and --dut-ssh-key not both set; falling back to no-op runner (pre_commit_log / applied events will not be recorded)")
+		return agent.NewNoop(logger)
+	}
+
+	sshCfg := agent.SSHConfig{
+		Host:    sshHost,
+		User:    sshUser,
+		KeyPath: sshKey,
+		Command: "doublezero-agent -verbose",
+		LogPath: filepath.Join(workingDir, "orchestrator.agent.log"),
+		Logger:  logger,
+	}
+	if controllerAddr != "" {
+		sshCfg.Command = fmt.Sprintf("doublezero-agent -verbose -controller %s", controllerAddr)
+	}
+	return agent.NewSSH(sshCfg)
+}
+
 func parseIPv4(s string) ([4]byte, error) {
 	ip := net.ParseIP(s)
 	if ip == nil {
diff --git a/tools/stress/device-orchestrator/pkg/agent/parser.go b/tools/stress/device-orchestrator/pkg/agent/parser.go
new file mode 100644
index 0000000000..d6841f7ca0
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/agent/parser.go
@@ -0,0 +1,125 @@
+package agent
+
+import (
+	"regexp"
+	"strconv"
+	"time"
+)
+
+// Parser turns lines from a doublezero-agent log stream into Events.
+//
+// It tracks two log lines from controlplane/agent/pkg/arista/eapi.go:
+//
+//   - "Committing config session due to diffs detected: <diff>"
+//     → emit one EventPreCommitLog per `+ interface Tunnel<ID>` in the diff,
+//     and remember those IDs as "pending".
+//   - "Configuration session finalized with command '... commit'"
+//     → emit one EventApplied per pending ID, then clear the buffer.
+//   - "Configuration session finalized with command '... abort'"
+//     → clear the buffer with no Applied events.
+//
+// A Parser keeps unsynchronized state and is not safe for concurrent use;
+// callers should funnel all lines through one Parse loop.
+type Parser struct {
+	pending []uint16         // tunnel IDs seen in a diff, awaiting commit/abort
+	now     func() time.Time // injectable for tests
+}
+
+// NewParser returns a Parser that stamps events with time.Now, then applies
+// opts in order; pass WithClock to substitute a deterministic clock in tests.
+func NewParser(opts ...ParserOption) *Parser {
+	p := &Parser{now: time.Now}
+	for _, opt := range opts {
+		opt(p)
+	}
+	return p
+}
+
+// ParserOption adjusts a Parser while NewParser is constructing it.
+type ParserOption func(*Parser)
+
+// WithClock makes the parser stamp events using now instead of time.Now; used by tests.
+func WithClock(now func() time.Time) ParserOption {
+	return func(p *Parser) { p.now = now }
+}
+
+// Parse consumes a single log line and reports the events it gives rise to,
+// or nil when the line is not one the parser tracks. Any returned slice is
+// newly allocated, so the caller may keep it.
+func (p *Parser) Parse(line string) []Event {
+	switch m := committingRE.FindStringSubmatch(line); {
+	case m != nil:
+		// Pre-commit diff: every added tunnel becomes pending and is reported.
+		added := extractAddedTunnelIDs(m[1])
+		if len(added) == 0 {
+			return nil
+		}
+		p.pending = append(p.pending, added...)
+		stamp := p.now()
+		events := make([]Event, len(added))
+		for i, id := range added {
+			events[i] = Event{Kind: EventPreCommitLog, TunnelID: id, At: stamp}
+		}
+		return events
+	case finalizedCommitRE.MatchString(line):
+		// Successful commit: every pending tunnel is now applied.
+		if len(p.pending) == 0 {
+			return nil
+		}
+		stamp := p.now()
+		events := make([]Event, len(p.pending))
+		for i, id := range p.pending {
+			events[i] = Event{Kind: EventApplied, TunnelID: id, At: stamp}
+		}
+		p.pending = p.pending[:0]
+		return events
+	case finalizedAbortRE.MatchString(line):
+		// Aborted session: forget pending tunnels without reporting Applied.
+		p.pending = p.pending[:0]
+	}
+	return nil
+}
+
+// Pending returns a copy of the tunnel IDs still awaiting an Applied event, so
+// tests can assert state transitions without aliasing the parser's buffer.
+func (p *Parser) Pending() []uint16 {
+	out := make([]uint16, len(p.pending))
+	copy(out, p.pending)
+	return out
+}
+
+var (
+	// committingRE captures the diff payload from the agent's pre-commit log.
+	// The diff is everything after the colon-space and runs to end of line —
+	// agents emit the diff inline (often multi-section but single-line).
+	committingRE = regexp.MustCompile(`Committing config session due to diffs detected:\s*(.*)$`)
+
+	// addedTunnelRE matches an additive interface-Tunnel diff line; `\d+` takes
+	// the whole digit run and the trailing `\b` rejects IDs like "Tunnel500x".
+	addedTunnelRE = regexp.MustCompile(`\+\s*interface Tunnel(\d+)\b`)
+
+	// finalizedCommitRE matches the post-commit log line on a successful
+	// commit. The quoted command always ends in "...commit" for actual commits
+	// and "...abort" for no-op sessions.
+	finalizedCommitRE = regexp.MustCompile(`Configuration session finalized with command '.*\s+commit'`)
+	finalizedAbortRE  = regexp.MustCompile(`Configuration session finalized with command '.*\s+abort'`)
+)
+
+// extractAddedTunnelIDs pulls every "+ interface Tunnel<ID>" out of a diff
+// payload, skipping IDs that overflow uint16. Returns nil when no additive
+// lines are present (e.g., pure deprovision diffs).
+func extractAddedTunnelIDs(diff string) []uint16 {
+	matches := addedTunnelRE.FindAllStringSubmatch(diff, -1)
+	if len(matches) == 0 {
+		return nil
+	}
+	out := make([]uint16, 0, len(matches))
+	for _, m := range matches {
+		id, err := strconv.ParseUint(m[1], 10, 16)
+		if err != nil {
+			continue
+		}
+		out = append(out, uint16(id))
+	}
+	return out
+}
diff --git a/tools/stress/device-orchestrator/pkg/agent/parser_test.go b/tools/stress/device-orchestrator/pkg/agent/parser_test.go
new file mode 100644
index 0000000000..e0a2bef78f
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/agent/parser_test.go
@@ -0,0 +1,134 @@
+package agent_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/malbeclabs/doublezero/tools/stress/device-orchestrator/pkg/agent"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// fixedClock returns a constant time for deterministic event timestamps.
+func fixedClock(at time.Time) func() time.Time {
+	return func() time.Time { return at }
+}
+
+func TestParser_SingleTunnelDiffThenCommit(t *testing.T) {
+	t.Parallel()
+
+	now := time.Unix(1_700_000_000, 0)
+	p := agent.NewParser(agent.WithClock(fixedClock(now)))
+
+	events := p.Parse(`2026/05/27 12:00:01 Committing config session due to diffs detected: + interface Tunnel500   ip address 169.254.0.1/30`)
+	require.Len(t, events, 1)
+	assert.Equal(t, agent.EventPreCommitLog, events[0].Kind)
+	assert.Equal(t, uint16(500), events[0].TunnelID)
+	assert.Equal(t, now, events[0].At)
+	assert.Equal(t, []uint16{500}, p.Pending())
+
+	events = p.Parse(`2026/05/27 12:00:02 Configuration session finalized with command 'configure session doublezero-agent-abc123 commit'`)
+	require.Len(t, events, 1)
+	assert.Equal(t, agent.EventApplied, events[0].Kind)
+	assert.Equal(t, uint16(500), events[0].TunnelID)
+	assert.Empty(t, p.Pending(), "pending should clear after commit-success")
+}
+
+func TestParser_MultiTunnelDiffEmitsOneEventPerTunnel(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+	diff := `Committing config session due to diffs detected: + interface Tunnel500 + interface Tunnel501 - interface Tunnel499 + interface Tunnel502`
+	events := p.Parse(diff)
+	require.Len(t, events, 3, "only + lines, not - lines, produce events")
+	assert.Equal(t, []uint16{500, 501, 502}, []uint16{events[0].TunnelID, events[1].TunnelID, events[2].TunnelID})
+	for _, e := range events {
+		assert.Equal(t, agent.EventPreCommitLog, e.Kind)
+	}
+
+	applied := p.Parse(`Configuration session finalized with command 'configure session foo commit'`)
+	require.Len(t, applied, 3, "Applied fires once per pending tunnel")
+	assert.Equal(t, []uint16{500, 501, 502}, []uint16{applied[0].TunnelID, applied[1].TunnelID, applied[2].TunnelID})
+}
+
+func TestParser_DeprovisionOnlyDiffEmitsNothing(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+	events := p.Parse(`Committing config session due to diffs detected: - interface Tunnel500 - interface Tunnel501`)
+	assert.Empty(t, events)
+	assert.Empty(t, p.Pending())
+}
+
+func TestParser_AbortClearsBufferWithoutAppliedEvents(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+	events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel500`)
+	require.Len(t, events, 1)
+	require.Equal(t, []uint16{500}, p.Pending())
+
+	events = p.Parse(`Configuration session finalized with command 'configure session foo abort'`)
+	assert.Empty(t, events, "abort emits no events")
+	assert.Empty(t, p.Pending(), "abort still clears pending")
+}
+
+func TestParser_CommitWithoutPendingDiffIsNoOp(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+	events := p.Parse(`Configuration session finalized with command 'configure session foo commit'`)
+	assert.Empty(t, events)
+}
+
+func TestParser_TwoConsecutiveProvisionCycles(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+
+	// Cycle 1
+	require.Len(t, p.Parse(`Committing config session due to diffs detected: + interface Tunnel500`), 1)
+	require.Len(t, p.Parse(`Configuration session finalized with command 'configure session foo commit'`), 1)
+	assert.Empty(t, p.Pending())
+
+	// Cycle 2
+	require.Len(t, p.Parse(`Committing config session due to diffs detected: + interface Tunnel501`), 1)
+	applied := p.Parse(`Configuration session finalized with command 'configure session bar commit'`)
+	require.Len(t, applied, 1)
+	assert.Equal(t, uint16(501), applied[0].TunnelID, "cycle 2 must not replay tunnel 500")
+}
+
+func TestParser_UnrelatedLinesIgnored(t *testing.T) {
+	t.Parallel()
+
+	p := agent.NewParser()
+	for _, line := range []string{
+		``,
+		`Received 42 lines of configuration from controller`,
+		`forced unlock of configuration lock (xyz)`,
+		`some random log noise`,
+	} {
+		assert.Empty(t, p.Parse(line), "line=%q", line)
+	}
+}
+
+func TestParser_RejectsOversizedTunnelID(t *testing.T) {
+	t.Parallel()
+
+	// uint16 max is 65535; 70000 should be silently skipped, not panic.
+	p := agent.NewParser()
+	events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel70000 + interface Tunnel500`)
+	require.Len(t, events, 1)
+	assert.Equal(t, uint16(500), events[0].TunnelID)
+}
+
+func TestParser_DoesNotConfuseInterfaceNamePrefixes(t *testing.T) {
+	t.Parallel()
+
+	// "Tunnel5000" must parse as ID 5000 in full, never as a truncated 500
+	// or 50: the regex captures the whole digit run up to a word boundary.
+	p := agent.NewParser()
+	events := p.Parse(`Committing config session due to diffs detected: + interface Tunnel5000`)
+	require.Len(t, events, 1)
+	assert.Equal(t, uint16(5000), events[0].TunnelID)
+}
diff --git a/tools/stress/device-orchestrator/pkg/agent/ssh.go b/tools/stress/device-orchestrator/pkg/agent/ssh.go
new file mode 100644
index 0000000000..510fad511e
--- /dev/null
+++ b/tools/stress/device-orchestrator/pkg/agent/ssh.go
@@ -0,0 +1,228 @@
+package agent
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"sync"
+
+	"golang.org/x/crypto/ssh"
+)
+
+// SSHConfig describes how to reach the DUT and what to run on it.
+type SSHConfig struct {
+	// Host is the dial target, e.g. "10.0.0.1:22". The dialer expects a
+	// host:port; callers should resolve hostnames upstream.
+	Host string
+	// User to authenticate as. Defaults to "admin" if empty.
+	User string
+	// KeyPath is the path to a PEM-encoded private key for public-key auth.
+	KeyPath string
+	// Command is the remote command to exec. Defaults to
+	// "doublezero-agent -verbose" if empty; callers can override with
+	// additional flags such as the controller address.
+	Command string
+	// LogPath, when non-empty, is the local file the SSH runner tees remote
+	// stdout/stderr into. The file is truncated on Start.
+	LogPath string
+	// Logger is used for diagnostic logs from the runner; pass nil for silent.
+	Logger *slog.Logger
+}
+
+// SSH is a Runner that dials the DUT over SSH, executes doublezero-agent in
+// verbose mode, and emits AgentEvents parsed from the remote log stream.
+//
+// Host key verification uses ssh.InsecureIgnoreHostKey because the
+// orchestrator targets ephemeral cEOS containers whose host keys regenerate
+// on every restart; the threat model is "operator on the same subnet" and
+// the SSH session carries no privileged credentials beyond what the keypair
+// already grants. Do not reuse this dialer for production workloads.
+type SSH struct {
+	cfg SSHConfig
+
+	events chan Event
+
+	mu      sync.Mutex
+	started bool
+	client  *ssh.Client
+	session *ssh.Session
+	logFile *os.File
+}
+
+// NewSSH builds a runner that has not dialed yet (Start does that), filling in
+// the default User and Command when the caller left them empty.
+func NewSSH(cfg SSHConfig) *SSH {
+	runner := &SSH{events: make(chan Event, 64)}
+	if cfg.User == "" {
+		cfg.User = "admin"
+	}
+	if cfg.Command == "" {
+		cfg.Command = "doublezero-agent -verbose"
+	}
+	runner.cfg = cfg
+	return runner
+}
+
+// Events returns the channel the runner emits AgentEvents on. It closes
+// when the runner exits (ctx cancel, process exit, or session error).
+func (s *SSH) Events() <-chan Event { return s.events }
+
+// Start dials the DUT, opens a session, launches the configured command, and
+// streams its stdout/stderr through one shared parser. It returns once the
+// command has started; the readers run in goroutines until ctx is cancelled
+// or both streams end. Start is one-shot: any second call returns an error.
+func (s *SSH) Start(ctx context.Context) error {
+	s.mu.Lock()
+	if s.started {
+		s.mu.Unlock()
+		return fmt.Errorf("ssh agent: already started")
+	}
+	s.started = true
+	s.mu.Unlock()
+
+	signer, err := loadSigner(s.cfg.KeyPath)
+	if err != nil {
+		return fmt.Errorf("ssh agent: load key %s: %w", s.cfg.KeyPath, err)
+	}
+
+	clientCfg := &ssh.ClientConfig{
+		User:            s.cfg.User,
+		Auth:            []ssh.AuthMethod{ssh.PublicKeys(signer)},
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
+	}
+	client, err := ssh.Dial("tcp", s.cfg.Host, clientCfg)
+	if err != nil {
+		return fmt.Errorf("ssh agent: dial %s: %w", s.cfg.Host, err)
+	}
+	session, err := client.NewSession()
+	if err != nil {
+		_ = client.Close()
+		return fmt.Errorf("ssh agent: new session: %w", err)
+	}
+	stdout, err := session.StdoutPipe()
+	if err != nil {
+		_ = session.Close()
+		_ = client.Close()
+		return fmt.Errorf("ssh agent: stdout pipe: %w", err)
+	}
+	stderr, err := session.StderrPipe()
+	if err != nil {
+		_ = session.Close()
+		_ = client.Close()
+		return fmt.Errorf("ssh agent: stderr pipe: %w", err)
+	}
+
+	var logFile *os.File
+	var tee io.Writer // nil interface unless a log opens; a nil *os.File would pass streamLines' nil check
+	if s.cfg.LogPath != "" {
+		logFile, err = os.Create(s.cfg.LogPath)
+		if err != nil {
+			_ = session.Close()
+			_ = client.Close()
+			return fmt.Errorf("ssh agent: open log %s: %w", s.cfg.LogPath, err)
+		}
+		tee = logFile
+	}
+
+	s.mu.Lock()
+	s.client = client
+	s.session = session
+	s.logFile = logFile
+	s.mu.Unlock()
+
+	if err := session.Start(s.cfg.Command); err != nil {
+		s.shutdown()
+		return fmt.Errorf("ssh agent: start %q: %w", s.cfg.Command, err)
+	}
+	if s.cfg.Logger != nil {
+		s.cfg.Logger.Info("ssh agent started", "host", s.cfg.Host, "command", s.cfg.Command, "log_path", s.cfg.LogPath)
+	}
+
+	parser := NewParser()
+	var wg sync.WaitGroup
+	wg.Add(2)
+	go func() {
+		defer wg.Done()
+		streamLines(ctx, stdout, tee, parser, s.events, s.cfg.Logger, "stdout")
+	}()
+	go func() {
+		defer wg.Done()
+		streamLines(ctx, stderr, tee, parser, s.events, s.cfg.Logger, "stderr")
+	}()
+
+	go func() {
+		done := make(chan struct{})
+		go func() {
+			wg.Wait()
+			close(done)
+		}()
+		select {
+		case <-ctx.Done():
+		case <-done:
+		}
+		// ctx was cancelled or both readers finished. Closing the session makes
+		// the read loops hit EOF; wait for them before closing the events channel.
+		s.shutdown()
+		<-done
+		close(s.events)
+	}()
+
+	return nil
+}
+
+// shutdown is idempotent; safe to call from Start error paths and from the
+// supervising goroutine.
+func (s *SSH) shutdown() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.session != nil {
+		_ = s.session.Close()
+		s.session = nil
+	}
+	if s.client != nil {
+		_ = s.client.Close()
+		s.client = nil
+	}
+	if s.logFile != nil {
+		_ = s.logFile.Close()
+		s.logFile = nil
+	}
+}
+
+// streamLines reads `src` line-by-line, optionally tees raw lines to `tee`,
+// runs each through `parser`, and pushes resulting events onto `events`.
+// Returns early when ctx cancels so a slow consumer can't deadlock shutdown.
+func streamLines(ctx context.Context, src io.Reader, tee io.Writer, parser *Parser, events chan<- Event, log *slog.Logger, label string) {
+	scanner := bufio.NewScanner(src)
+	scanner.Buffer(make([]byte, 1024*1024), 16*1024*1024) // large diffs can exceed default
+	for scanner.Scan() {
+		line := scanner.Text()
+		if tee != nil {
+			if _, err := tee.Write([]byte(line + "\n")); err != nil && log != nil {
+				log.Warn("ssh agent: log tee write failed", "err", err, "stream", label)
+			}
+		}
+		for _, ev := range parser.Parse(line) {
+			select {
+			case events <- ev:
+			case <-ctx.Done():
+				return
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil && log != nil {
+		log.Warn("ssh agent: stream ended with error", "err", err, "stream", label)
+	}
+}
+
+// loadSigner turns the PEM-encoded private key stored at path into an ssh.Signer.
+func loadSigner(path string) (ssh.Signer, error) {
+	pemBytes, readErr := os.ReadFile(path)
+	if readErr == nil {
+		return ssh.ParsePrivateKey(pemBytes)
+	}
+	return nil, readErr
+}
diff --git a/tools/stress/device-orchestrator/pkg/exec/exec.go b/tools/stress/device-orchestrator/pkg/exec/exec.go
index 86badb60f2..1df54630d2 100644
--- a/tools/stress/device-orchestrator/pkg/exec/exec.go
+++ b/tools/stress/device-orchestrator/pkg/exec/exec.go
@@ -18,6 +18,11 @@ import (
 type Config struct {
 	Client   *serviceability.Client
 	Executor *serviceability.Executor
+	// RPC is used to fetch individual User accounts post-create so the
+	// orchestrator can record the assigned TunnelId in the runlog. In
+	// production this is the same *solanarpc.Client the Client/Executor
+	// were built from.
+	RPC serviceability.RPCClient
 
 	DevicePubkey solana.PublicKey
 	TenantPubkey solana.PublicKey // zero pubkey = no tenant
@@ -42,7 +47,7 @@ type Live struct {
 }
 
 // New returns a Live executor with the given configuration. Callers must
-// supply a non-nil Client and Executor.
+// supply a non-nil Client, Executor, and RPC.
 func New(cfg Config) (*Live, error) {
 	if cfg.Client == nil {
 		return nil, fmt.Errorf("exec.New: Client is required")
@@ -50,6 +55,9 @@ func New(cfg Config) (*Live, error) {
 	if cfg.Executor == nil {
 		return nil, fmt.Errorf("exec.New: Executor is required")
 	}
+	if cfg.RPC == nil {
+		return nil, fmt.Errorf("exec.New: RPC is required")
+	}
 	if cfg.DzPrefixCount == 0 {
 		cfg.DzPrefixCount = 1
 	}
@@ -114,16 +122,24 @@ func (l *Live) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.
 	}, nil
 }
 
-// fetchTunnelID reads the user account and returns its assigned TunnelId.
-// Used so the runlog records the kernel interface identifier the part-3
-// agent runner will key on.
+// fetchTunnelID reads the account at userPDA and returns its TunnelId; it
+// errors when the account is missing, empty, or not tagged as a User. The
+// runlog uses the ID to tie `+ interface Tunnel<ID>` agent lines to a user.
 func (l *Live) fetchTunnelID(ctx context.Context, userPDA solana.PublicKey) (uint16, error) {
-	// We can't read the assigned tunnel_id without the User's on-chain bytes,
-	// which the SDK doesn't surface from CreateUser. Until a downstream
-	// helper is added, callers either skip this column (TunnelID = 0) or wire
-	// a per-account fetch in cmd/. The package signature is kept stable so
-	// part-3 can drop in the real fetch.
-	return 0, nil
+	info, err := l.cfg.RPC.GetAccountInfo(ctx, userPDA)
+	if err != nil {
+		return 0, fmt.Errorf("get user account info: %w", err)
+	}
+	if info == nil || info.Value == nil {
+		return 0, fmt.Errorf("user account %s not found", userPDA)
+	}
+	data := info.Value.Data.GetBinary()
+	if len(data) == 0 || data[0] != byte(serviceability.UserType) {
+		return 0, fmt.Errorf("user account %s empty or not a User account", userPDA)
+	}
+	var u serviceability.User
+	serviceability.DeserializeUser(serviceability.NewByteReader(data), &u)
+	return u.TunnelId, nil
 }
 
 // ipForIndex returns base shifted by idx, wrapping at the /16 boundary so the
diff --git a/tools/stress/device-orchestrator/pkg/exec/exec_test.go b/tools/stress/device-orchestrator/pkg/exec/exec_test.go
index c7b13ea30b..644e2d7f87 100644
--- a/tools/stress/device-orchestrator/pkg/exec/exec_test.go
+++ b/tools/stress/device-orchestrator/pkg/exec/exec_test.go
@@ -1,9 +1,15 @@
 package exec
 
 import (
+	"context"
+	"encoding/binary"
 	"testing"
 
+	"github.com/gagliardetto/solana-go"
+	solanarpc "github.com/gagliardetto/solana-go/rpc"
+	"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 func TestIPForIndex(t *testing.T) {
@@ -25,3 +31,82 @@ func TestIPForIndex(t *testing.T) {
 		assert.Equal(t, tc.want, got, "idx=%d", tc.idx)
 	}
 }
+
+// stubRPC implements serviceability.RPCClient for fetchTunnelID tests.
+type stubRPC struct {
+	accountInfo *solanarpc.GetAccountInfoResult
+	err         error
+}
+
+func (s *stubRPC) GetProgramAccounts(context.Context, solana.PublicKey) (solanarpc.GetProgramAccountsResult, error) {
+	return nil, nil
+}
+
+func (s *stubRPC) GetAccountInfo(context.Context, solana.PublicKey) (*solanarpc.GetAccountInfoResult, error) {
+	return s.accountInfo, s.err
+}
+
+func TestFetchTunnelID_ReadsFromUserAccount(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	device := solana.NewWallet().PublicKey()
+
+	// Hand-encode a User account body matching DeserializeUser's field order.
+	// All fields zero except TunnelId so the test pin-points that read path.
+	const tunnelID uint16 = 4242
+	body := makeUserAccountBytes(owner, device, [4]byte{10, 0, 0, 5}, tunnelID)
+
+	stub := &stubRPC{
+		accountInfo: &solanarpc.GetAccountInfoResult{
+			Value: &solanarpc.Account{
+				Data: solanarpc.DataBytesOrJSONFromBytes(body),
+			},
+		},
+	}
+	live := &Live{cfg: Config{RPC: stub}}
+
+	got, err := live.fetchTunnelID(context.Background(), solana.NewWallet().PublicKey())
+	require.NoError(t, err)
+	assert.Equal(t, tunnelID, got)
+}
+
+func TestFetchTunnelID_AccountMissing(t *testing.T) {
+	t.Parallel()
+
+	live := &Live{cfg: Config{RPC: &stubRPC{accountInfo: &solanarpc.GetAccountInfoResult{Value: nil}}}}
+	_, err := live.fetchTunnelID(context.Background(), solana.NewWallet().PublicKey())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "not found")
+}
+
+// makeUserAccountBytes serializes a User account body with the minimum fields
+// the test needs. Matches the field order in serviceability.DeserializeUser.
+func makeUserAccountBytes(owner, device solana.PublicKey, clientIP [4]byte, tunnelID uint16) []byte {
+	b := make([]byte, 0, 256)
+	b = append(b, byte(serviceability.UserType)) // AccountType
+	b = append(b, owner[:]...)                   // Owner [32]
+	b = append(b, make([]byte, 16)...)           // Index u128
+	b = append(b, 0)                             // BumpSeed
+	b = append(b, byte(serviceability.UserTypeIBRL))
+	b = append(b, make([]byte, 32)...) // TenantPubKey (zero)
+	b = append(b, device[:]...)        // DevicePubKey
+	b = append(b, byte(serviceability.CyoaTypeGREOverDIA))
+	b = append(b, clientIP[:]...)     // ClientIp [4]
+	b = append(b, make([]byte, 4)...) // DzIp [4]
+	var tidBuf [2]byte
+	binary.LittleEndian.PutUint16(tidBuf[:], tunnelID)
+	b = append(b, tidBuf[:]...)       // TunnelId u16 LE
+	b = append(b, make([]byte, 5)...) // TunnelNet
+	b = append(b, byte(serviceability.UserStatusActivated))
+	b = append(b, 0, 0, 0, 0)          // Publishers len
+	b = append(b, 0, 0, 0, 0)          // Subscribers len
+	b = append(b, make([]byte, 32)...) // ValidatorPubKey
+	b = append(b, make([]byte, 4)...)  // TunnelEndpoint
+	b = append(b, 0)                   // TunnelFlags
+	b = append(b, 0)                   // BgpStatus
+	b = append(b, make([]byte, 8)...)  // LastBgpUpAt
+	b = append(b, make([]byte, 8)...)  // LastBgpReportedAt
+	b = append(b, make([]byte, 8)...)  // BgpRttNs
+	return b
+}
diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep.go b/tools/stress/device-orchestrator/pkg/sweep/sweep.go
index cda03412c1..eda53f5cae 100644
--- a/tools/stress/device-orchestrator/pkg/sweep/sweep.go
+++ b/tools/stress/device-orchestrator/pkg/sweep/sweep.go
@@ -15,6 +15,7 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"sync"
 	"time"
 
 	"github.com/gagliardetto/solana-go"
@@ -114,37 +115,129 @@ type createdUser struct {
 	tunnelID uint16
 }
 
+// tunnelRegistry holds the orchestrator's tunnelID → user metadata mapping,
+// shared between the provision goroutine (which writes) and the agent-event
+// consumer goroutine (which reads). Lookups for unknown tunnel IDs return
+// `ok=false` so the consumer can log and drop the event.
+type tunnelRegistry struct {
+	mu  sync.RWMutex
+	idx map[uint16]createdUser
+}
+
+func newTunnelRegistry() *tunnelRegistry {
+	return &tunnelRegistry{idx: make(map[uint16]createdUser)}
+}
+
+func (r *tunnelRegistry) register(u createdUser) {
+	if u.tunnelID == 0 {
+		// TunnelId == 0 means the executor didn't surface a real ID; nothing
+		// in the agent log can match it, so don't take a map slot.
+		return
+	}
+	r.mu.Lock()
+	r.idx[u.tunnelID] = u
+	r.mu.Unlock()
+}
+
+func (r *tunnelRegistry) lookup(tunnelID uint16) (createdUser, bool) {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	u, ok := r.idx[tunnelID]
+	return u, ok
+}
+
 // Run drives the provision-then-deprovision sweep to completion. Returns the
 // number of users actually created/deleted alongside the error (if any), so
 // callers can report partial progress on abort.
+//
+// Run additionally starts a goroutine that consumes events from cfg.Agent and
+// writes pre_commit_log / applied runlog rows for tunnel IDs the sweep
+// registered. The consumer exits when the agent's Events channel closes; we
+// derive an agentCtx from ctx and cancel it after deprovision so the agent
+// stops cleanly even on a successful run.
 func Run(ctx context.Context, cfg Config) error {
 	if err := cfg.validate(); err != nil {
 		return err
 	}
-	if err := cfg.Agent.Start(ctx); err != nil {
+
+	registry := newTunnelRegistry()
+	agentCtx, agentCancel := context.WithCancel(ctx)
+	defer agentCancel()
+	if err := cfg.Agent.Start(agentCtx); err != nil {
 		return fmt.Errorf("start agent runner: %w", err)
 	}
 
-	created, err := provision(ctx, &cfg)
+	var consumerWG sync.WaitGroup
+	consumerWG.Add(1)
+	go func() {
+		defer consumerWG.Done()
+		consumeAgentEvents(&cfg, registry)
+	}()
+
+	created, err := provision(ctx, &cfg, registry)
 	if err != nil && !errors.Is(err, context.Canceled) {
-		return err
+		// Deprovision still runs below so whatever was created is cleaned up,
+		// and the consumer keeps draining straggling agent events meanwhile.
+		// Surface the failure now: err is only returned after tear-down ends.
+		cfg.Logger.Warn("sweep: provision failed; deprovisioning created users",
+			"err", err, "created", len(created))
 	}
-	// Always attempt deprovision so an abort during provision still cleans up
-	// what the sweep created. Use a fresh context for the deprovision phase if
-	// the original was cancelled, since the operator wants the tear-down to
-	// finish before exit. We respect the parent context's lifetime via the
-	// outer Run's error return — callers that want a hard stop pass a deadline.
 	depErr := deprovision(ctx, &cfg, created)
+
+	// Tell the agent to stop and wait for the consumer goroutine to drain so
+	// no events are dropped between deprovision-end and consumer-exit.
+	agentCancel()
+	consumerWG.Wait()
+
 	if err != nil {
 		return err
 	}
 	return depErr
 }
 
+// consumeAgentEvents reads from cfg.Agent.Events() until the channel closes
+// and writes pre_commit_log / applied rows for tunnel IDs the sweep has
+// registered. Events for unknown tunnel IDs are debug-logged and dropped — the
+// most likely cause is a tunnel that belongs to a non-orchestrator user.
+func consumeAgentEvents(cfg *Config, registry *tunnelRegistry) {
+	for ev := range cfg.Agent.Events() {
+		u, ok := registry.lookup(ev.TunnelID)
+		if !ok {
+			cfg.Logger.Debug("sweep: agent event for unregistered tunnel; dropping",
+				"tunnel_id", ev.TunnelID, "kind", ev.Kind)
+			continue
+		}
+		var runlogEvent runlog.Event
+		switch ev.Kind {
+		case agent.EventPreCommitLog:
+			runlogEvent = runlog.EventPreCommitLog
+		case agent.EventApplied:
+			runlogEvent = runlog.EventApplied
+		default:
+			continue
+		}
+		row := runlog.Row{
+			RunID:       cfg.RunID,
+			UserIndex:   u.idx,
+			UserPubkey:  u.pubkey.String(),
+			TunnelID:    u.tunnelID,
+			Event:       runlogEvent,
+			TNs:         ev.At.UnixNano(),
+			NAfterEvent: 0, // active-count state is owned by the sweep goroutine and not safe to read here
+		}
+		if err := cfg.Runlog.Append(row); err != nil {
+			cfg.Logger.Warn("sweep: runlog append failed for agent event",
+				"err", err, "kind", runlogEvent, "tunnel_id", ev.TunnelID)
+		}
+	}
+}
+
 // provision walks 0 → Target in batches, returning the slice of created users
 // so deprovision can iterate in reverse. Returns ctx.Err() if cancelled
-// between users.
-func provision(ctx context.Context, cfg *Config) ([]createdUser, error) {
+// between users. Each created user is also registered with the tunnel
+// registry so the agent-event consumer can attribute pre_commit_log /
+// applied events back to a user_index.
+func provision(ctx context.Context, cfg *Config, registry *tunnelRegistry) ([]createdUser, error) {
 	if cfg.Target == 0 {
 		return nil, nil
 	}
@@ -190,7 +283,9 @@ func provision(ctx context.Context, cfg *Config) ([]createdUser, error) {
 			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventConfirm, res.ConfirmedAt, activeCount); err != nil {
 				return created, err
 			}
-			created = append(created, createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID})
+			cu := createdUser{idx: idx, pubkey: res.UserPDA, tunnelID: res.TunnelID}
+			created = append(created, cu)
+			registry.register(cu)
 			activeCount++
 			if err := emit(cfg, idx, pkStr, res.TunnelID, runlog.EventActivate, res.ActivatedAt, activeCount); err != nil {
 				return created, err
diff --git a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
index 3402d5fe8c..a45f688a57 100644
--- a/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
+++ b/tools/stress/device-orchestrator/pkg/sweep/sweep_test.go
@@ -63,6 +63,11 @@ type fakeExecutor struct {
 	// Optional hook to fail on the Nth create (1-based) — used by the abort test.
 	failCreateOnCall int
 	failErr          error
+
+	// Optional gate: when non-nil, DeleteUser blocks on it after incrementing
+	// deleteN. Tests use this to interleave work between provision and
+	// deprovision (e.g., emitting agent events).
+	deleteGate <-chan struct{}
 }
 
 func newFakeExecutor(owner solana.PublicKey) *fakeExecutor {
@@ -107,6 +112,9 @@ func (f *fakeExecutor) CreateUser(ctx context.Context, idx int) (sweep.CreateRes
 
 func (f *fakeExecutor) DeleteUser(ctx context.Context, userPDA solana.PublicKey) (sweep.DeleteResult, error) {
 	calls := int(f.deleteN.Add(1))
+	if f.deleteGate != nil {
+		<-f.deleteGate
+	}
 	f.mu.Lock()
 	// Remove the matching user from the active set.
 	for i, u := range f.created {
@@ -290,6 +298,114 @@ func TestRun_RejectsInvalidConfig(t *testing.T) {
 	}
 }
 
+// scriptedAgent is an agent.Runner used to drive the sweep's agent-event
+// consumer from a test. Events are emitted via Emit() so the test can
+// control timing — in production the agent log lags the on-chain CreateUser
+// by far longer than registry registration takes, but in tests the executor
+// is instantaneous and we need to emit AFTER provision has registered the
+// tunnels.
+type scriptedAgent struct {
+	out chan agent.Event
+}
+
+func newScriptedAgent() *scriptedAgent {
+	return &scriptedAgent{out: make(chan agent.Event, 16)}
+}
+
+func (s *scriptedAgent) Start(ctx context.Context) error {
+	go func() {
+		<-ctx.Done()
+		close(s.out)
+	}()
+	return nil
+}
+
+func (s *scriptedAgent) Events() <-chan agent.Event { return s.out }
+
+func (s *scriptedAgent) Emit(e agent.Event) { s.out <- e }
+
+func TestRun_ConsumesAgentEventsForRegisteredTunnels(t *testing.T) {
+	t.Parallel()
+
+	owner := solana.NewWallet().PublicKey()
+	exec := newFakeExecutor(owner)
+	// Block deprovision so the test can emit agent events while all created
+	// tunnels are registered but before agentCancel() shuts the consumer down.
+	gate := make(chan struct{})
+	exec.deleteGate = gate
+
+	ag := newScriptedAgent()
+
+	path := filepath.Join(t.TempDir(), "orchestrator-runlog.json")
+	w, err := runlog.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = w.Close() })
+
+	cfg := sweep.Config{
+		RunID:         "run-events",
+		Target:        2,
+		UsersPerBatch: 2,
+		Hold:          0,
+		OwnerFilter:   owner,
+		Executor:      exec,
+		Agent:         ag,
+		Runlog:        w,
+		Clock:         &fakeClock{now: time.Unix(1_700_000_000, 0)},
+	}
+	done := make(chan error, 1)
+	go func() { done <- sweep.Run(context.Background(), cfg) }()
+
+	// Wait for deprovision to begin (deleteN >= 1) — this means provision is
+	// fully complete AND both tunnel registrations are in the registry.
+	deadline := time.Now().Add(time.Second)
+	for exec.deleteN.Load() == 0 {
+		if time.Now().After(deadline) {
+			t.Fatal("sweep did not reach deprovision within 1s")
+		}
+		time.Sleep(time.Millisecond)
+	}
+
+	// Emit events for both registered tunnels plus one unregistered one.
+	ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 500, At: time.Unix(1, 100)})
+	ag.Emit(agent.Event{Kind: agent.EventApplied, TunnelID: 500, At: time.Unix(1, 200)})
+	ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 999, At: time.Unix(1, 300)}) // unregistered; dropped
+	ag.Emit(agent.Event{Kind: agent.EventPreCommitLog, TunnelID: 501, At: time.Unix(1, 400)})
+	ag.Emit(agent.Event{Kind: agent.EventApplied, TunnelID: 501, At: time.Unix(1, 500)})
+
+	close(gate) // unblock deprovision
+
+	require.NoError(t, <-done)
+	require.NoError(t, w.Close())
+
+	rows := readRows(t, path)
+
+	// Filter for the agent-driven rows so we don't depend on exact interleaving
+	// with the submit/confirm/activate stream emitted by provision.
+	var preCommit, applied []runlog.Row
+	for _, r := range rows {
+		switch r.Event {
+		case runlog.EventPreCommitLog:
+			preCommit = append(preCommit, r)
+		case runlog.EventApplied:
+			applied = append(applied, r)
+		}
+	}
+	require.Len(t, preCommit, 2, "two registered tunnels → two pre_commit_log rows; the unregistered tunnel 999 is dropped")
+	require.Len(t, applied, 2)
+
+	// Tunnel 500 → user_index 0, Tunnel 501 → user_index 1 (fake executor assigns 500+idx).
+	for _, r := range preCommit {
+		switch r.TunnelID {
+		case 500:
+			assert.Equal(t, 0, r.UserIndex)
+		case 501:
+			assert.Equal(t, 1, r.UserIndex)
+		default:
+			t.Fatalf("unexpected tunnel id %d in pre_commit_log", r.TunnelID)
+		}
+	}
+}
+
 // Sanity: ctx cancellation between users is observed at the next iteration boundary.
 func TestRun_CancellationStopsBetweenUsers(t *testing.T) {
 	t.Parallel()