diff --git a/CHANGELOG.md b/CHANGELOG.md index ead1a9e8b5..6719bbfc8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. ### Changes +- tools/stress/device-observer: initial scaffolding plus eAPI device sampler that writes per-tick snapshots of five `show` commands - CLI - Route the `doublezero user get` "no Access Pass found" warning through `tracing::warn!` instead of `eprintln!`, so diagnostics go to the logging facade per RFC-20 §Diagnostic logging and no longer write directly to stderr. No user-facing output, flag, or schema change. - Reconcile the RFC-20 docs with the implemented global logging flag: `rfcs/rfc20-cli-standardization.md` and `docs/cli-standard.md` now describe `--log-level ` (default `warn`) instead of the never-implemented repeatable `--log-verbose`. Documentation only; the binary is unchanged. diff --git a/tools/stress/device-observer/README.md b/tools/stress/device-observer/README.md new file mode 100644 index 0000000000..2d6b7b38aa --- /dev/null +++ b/tools/stress/device-observer/README.md @@ -0,0 +1,146 @@ +# device-observer + +> Alpha — only the eAPI sampler is wired up. Prometheus scrape (PR #3794), +> log tailers (PR #3795), and abort decider (PR #3796) are no-op stubs in +> this revision. + +`device-observer` samples an Arista cEOS device-under-test (DUT) during the +GRE Tunnel Capacity Study sweep. On every tick of `--sample-interval` it +issues five `show` commands over eAPI and writes one file per command into +the working directory. + +It is designed to be driven by an external orchestrator: the orchestrator +sets the flags, owns the working directory, and signals the observer to +stop via `SIGINT` or `SIGTERM`. + +## Build + +The tool is built by the workspace `make go-build` target. To build only +this binary: + +```sh +go build ./tools/stress/device-observer/cmd/device-observer +``` + +## Flags + +| Flag | Default | Required | Purpose | +| --------------------- | ----------------------- | -------- | ----------------------------------------------- | +| `--dut-host` | | yes | DUT hostname for eAPI | +| `--eapi-user` | `admin` | | eAPI username | +| `--eapi-pass` | `admin` | | eAPI password (not persisted) | +| `--eapi-port` | `80` | | eAPI HTTP port | +| `--agent-metrics-url` | | yes | doublezero-agent Prometheus metrics URL | +| `--sample-interval` | `10s` | | interval between eAPI samples | +| `--working-dir` | | yes | working directory for observer outputs | +| `--abort-file` | `/abort` | | path to write the abort sentinel file | + +## Working-directory contract + +The observer writes the following files into `--working-dir`: + +| File | Owner | Description | +| ---------------------------------------- | --------- | ----------------------------------------------- | +| `observer-config.json` | observer | resolved flag values + PID + start timestamp | +| `show-hardware-capacity-.json` | observer | one per tick | +| `show-gre-tunnel-static-.json` | observer | one per tick | +| `show-processes-top-once-.json` | observer | one per tick | +| `show-logging-errors-.log` | observer | one per tick | +| `show-logging-critical-.log` | observer | one per tick | + +The orchestrator additionally owns these files in the same directory; the +observer reads them (in later PRs) but does not write them: + +| File | Owner | Read by | +| ----------------------------- | ------------ | ----------------------------- | +| `orchestrator-runlog.json` | orchestrator | observer (PR #3795) | +| `orchestrator.agent.log` | orchestrator | observer (PR #3795) | + +The abort sentinel at `--abort-file` (default `/abort`) is +written by the observer (in PR #3796) and read by the orchestrator. + +Filenames use the observer's local clock formatted as ISO 8601 UTC with +nanosecond precision, with `:` replaced by `-` for filesystem portability +(e.g. `show-hardware-capacity-2026-05-29T12-34-56.123456789Z.json`). + +### `observer-config.json` schema + +```json +{ + "started_at": "2026-05-29T12:34:56.789Z", + "pid": 12345, + "dut_host": "dz-local-device-dz1", + "eapi_user": "admin", + "agent_metrics_url": "http://dz-local-device-dz1:9100/metrics", + "sample_interval": "10s", + "abort_file": "/tmp/observer-out/abort", + "working_dir": "/tmp/observer-out" +} +``` + +`eapi_pass` is deliberately omitted — the working directory may be archived +(e.g. to S3) and credentials must not land there. + +## Local devnet smoke test + +Against `dz-local-device-dz1` (see top-level `CLAUDE.md` for devnet setup): + +```sh +mkdir -p /tmp/observer-out +./device-observer \ + --dut-host dz-local-device-dz1 \ + --eapi-user admin --eapi-pass admin \ + --agent-metrics-url http://dz-local-device-dz1:9100/metrics \ + --working-dir /tmp/observer-out \ + --sample-interval 10s +# After ~20 s, Ctrl-C. +ls /tmp/observer-out +jq . /tmp/observer-out/observer-config.json +jq . /tmp/observer-out/show-hardware-capacity-*.json +``` + +## Known limitations + +- The eAPI client re-marshals the goeapi-decoded JSON response, so very + large integer counters (greater than 2^53) may lose precision and map + key ordering is not preserved. A follow-up will replace the goeapi call + path with a direct eAPI HTTP POST so the per-command JSON is captured + byte-for-byte. +- `goeapi.RunCommands` does not accept a `context.Context`. The sampler + works around this with a goroutine + `select` on `ctx.Done()` so the + observer exits promptly on `SIGINT`/`SIGTERM`; however, an in-flight + eAPI request may still complete in the background after exit. + +## Failure handling + +A failure on a single `show` command logs at WARN and continues to the +next command. The next tick retries from a clean slate. The abort decider +(PR #3796) owns the policy for declaring repeated failures fatal. + +On `SIGINT` / `SIGTERM` the observer cancels its context and exits without +finishing a partially-started tick. Each file is written via a single +`os.WriteFile`, so partial-file reads are possible but the orchestrator +does not read sample files during a sweep. + +## Disk usage + +`show hardware capacity` can produce multi-megabyte JSON on heavily +configured devices. The observer writes one file per tick and never +appends, so the working directory grows steadily during a sweep. Pruning +old samples is the orchestrator's responsibility. + +## Layout + +``` +tools/stress/device-observer/ +├── cmd/device-observer/main.go +├── internal/ +│ ├── abort/ # PR #3796 (stub here) +│ ├── collector/ # Collector interface + Noop +│ ├── eapi/ # thin goeapi wrapper +│ ├── loggingtail/ # PR #3795 (stubs here) +│ ├── promscrape/ # PR #3794 (stub here) +│ ├── runlog/ # PR #3795 (stub here) +│ └── sample/ # eAPI sampler +└── README.md +``` diff --git a/tools/stress/device-observer/cmd/device-observer/main.go b/tools/stress/device-observer/cmd/device-observer/main.go new file mode 100644 index 0000000000..54d909f99e --- /dev/null +++ b/tools/stress/device-observer/cmd/device-observer/main.go @@ -0,0 +1,142 @@ +// device-observer samples an Arista cEOS device-under-test during the GRE +// Tunnel Capacity Study. PR #3793 implements the eAPI sampler; subsequent +// PRs replace the stub collectors with real implementations. +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "log/slog" + "os" + "os/signal" + "path/filepath" + "strings" + "syscall" + "time" + + "golang.org/x/sync/errgroup" + + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/abort" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/eapi" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/loggingtail" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/promscrape" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/runlog" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/sample" +) + +// observerConfig is the on-disk schema for observer-config.json. eapi_pass +// is deliberately omitted; the working directory may be archived and +// credentials must not land there. +type observerConfig struct { + StartedAt time.Time `json:"started_at"` + PID int `json:"pid"` + DUTHost string `json:"dut_host"` + EAPIUser string `json:"eapi_user"` + AgentMetricsURL string `json:"agent_metrics_url"` + SampleInterval string `json:"sample_interval"` + AbortFile string `json:"abort_file"` + WorkingDir string `json:"working_dir"` +} + +func main() { + if err := run(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +func run() error { + var ( + dutHost = flag.String("dut-host", "", "DUT hostname for eAPI (required)") + eapiUser = flag.String("eapi-user", "admin", "eAPI username") + eapiPass = flag.String("eapi-pass", "admin", "eAPI password") + eapiPort = flag.Int("eapi-port", 80, "eAPI HTTP port") + agentMetricsURL = flag.String("agent-metrics-url", "", "doublezero-agent Prometheus metrics URL (required)") + sampleInterval = flag.Duration("sample-interval", 10*time.Second, "interval between eAPI samples") + workingDir = flag.String("working-dir", "", "working directory for observer outputs (required)") + abortFile = flag.String("abort-file", "", "path to write the abort sentinel (default /abort)") + ) + flag.Parse() + + if *dutHost == "" || *agentMetricsURL == "" || *workingDir == "" { + flag.Usage() + return errors.New("--dut-host, --agent-metrics-url, and --working-dir are required") + } + if *sampleInterval <= 0 { + return errors.New("--sample-interval must be > 0") + } + if *abortFile == "" { + *abortFile = filepath.Join(*workingDir, "abort") + } + absWorking, err := filepath.Abs(*workingDir) + if err != nil { + return fmt.Errorf("resolve --working-dir: %w", err) + } + absAbort, err := filepath.Abs(*abortFile) + if err != nil { + return fmt.Errorf("resolve --abort-file: %w", err) + } + // Constrain --abort-file to live under --working-dir so the sentinel + // path PR #3796 will write to is bounded by the orchestrator's + // archive surface. + if !strings.HasPrefix(absAbort+string(os.PathSeparator), absWorking+string(os.PathSeparator)) { + return fmt.Errorf("--abort-file %q must be inside --working-dir %q", absAbort, absWorking) + } + + logger := slog.New(slog.NewJSONHandler(os.Stderr, nil)) + + if err := os.MkdirAll(*workingDir, 0o750); err != nil { + return fmt.Errorf("create working dir: %w", err) + } + if err := writeObserverConfig(*workingDir, observerConfig{ + StartedAt: time.Now().UTC(), + PID: os.Getpid(), + DUTHost: *dutHost, + EAPIUser: *eapiUser, + AgentMetricsURL: *agentMetricsURL, + SampleInterval: sampleInterval.String(), + AbortFile: *abortFile, + WorkingDir: *workingDir, + }); err != nil { + return fmt.Errorf("write observer config: %w", err) + } + + client, err := eapi.NewClient(*dutHost, *eapiUser, *eapiPass, *eapiPort) + if err != nil { + return err + } + + collectors := []collector.Collector{ + sample.NewSampler(client, *workingDir, *sampleInterval, logger), + promscrape.New(*agentMetricsURL, *workingDir), + loggingtail.NewEOS(client, *workingDir), + loggingtail.NewAgent(*workingDir), + runlog.New(*workingDir), + abort.New(*abortFile), + } + + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + logger.Info("device-observer started", "dut_host", *dutHost, "working_dir", *workingDir, "pid", os.Getpid()) + + g, gctx := errgroup.WithContext(ctx) + for _, c := range collectors { + g.Go(func() error { return c.Run(gctx) }) + } + if err := g.Wait(); err != nil { + return fmt.Errorf("collector failed: %w", err) + } + return nil +} + +func writeObserverConfig(workingDir string, cfg observerConfig) error { + body, err := json.MarshalIndent(cfg, "", " ") + if err != nil { + return err + } + return os.WriteFile(filepath.Join(workingDir, "observer-config.json"), body, 0o640) +} diff --git a/tools/stress/device-observer/internal/abort/decider.go b/tools/stress/device-observer/internal/abort/decider.go new file mode 100644 index 0000000000..5c3730a713 --- /dev/null +++ b/tools/stress/device-observer/internal/abort/decider.go @@ -0,0 +1,10 @@ +// Package abort evaluates abort signals and writes a sentinel file when +// any trigger fires. Real implementation lands in PR #3796. +package abort + +import "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + +func New(abortFile string) collector.Collector { + _ = abortFile + return collector.Noop{} +} diff --git a/tools/stress/device-observer/internal/collector/collector.go b/tools/stress/device-observer/internal/collector/collector.go new file mode 100644 index 0000000000..a2c5aa70f1 --- /dev/null +++ b/tools/stress/device-observer/internal/collector/collector.go @@ -0,0 +1,18 @@ +// Package collector defines the interface implemented by every long-running +// data-collection goroutine the observer runs under one errgroup. +package collector + +import "context" + +type Collector interface { + Run(ctx context.Context) error +} + +// Noop blocks until ctx is canceled. Stub collectors return it from their +// constructors until their real implementations land in later PRs. +type Noop struct{} + +func (Noop) Run(ctx context.Context) error { + <-ctx.Done() + return nil +} diff --git a/tools/stress/device-observer/internal/eapi/client.go b/tools/stress/device-observer/internal/eapi/client.go new file mode 100644 index 0000000000..1c595b6312 --- /dev/null +++ b/tools/stress/device-observer/internal/eapi/client.go @@ -0,0 +1,54 @@ +// Package eapi is a thin wrapper around the Arista goeapi client used by +// the device-observer sampler. +package eapi + +import ( + "encoding/json" + "fmt" + + "github.com/aristanetworks/goeapi" +) + +type Client struct { + node *goeapi.Node +} + +// NewClient dials the device's eAPI endpoint over HTTP. HTTPS support is +// deferred; see docs/work-plan-3793.md. +func NewClient(host, user, pass string, port int) (*Client, error) { + node, err := goeapi.Connect("http", host, user, pass, port) + if err != nil { + return nil, fmt.Errorf("eapi connect %s:%d: %w", host, port, err) + } + return &Client{node: node}, nil +} + +func (c *Client) RunShowJSON(cmd string) (json.RawMessage, error) { + resp, err := c.node.RunCommands([]string{cmd}, "json") + if err != nil { + return nil, fmt.Errorf("run %q (json): %w", cmd, err) + } + if resp == nil || len(resp.Result) != 1 { + return nil, fmt.Errorf("run %q (json): unexpected result length", cmd) + } + raw, err := json.Marshal(resp.Result[0]) + if err != nil { + return nil, fmt.Errorf("re-marshal %q result: %w", cmd, err) + } + return raw, nil +} + +func (c *Client) RunShowText(cmd string) (string, error) { + resp, err := c.node.RunCommands([]string{cmd}, "text") + if err != nil { + return "", fmt.Errorf("run %q (text): %w", cmd, err) + } + if resp == nil || len(resp.Result) != 1 { + return "", fmt.Errorf("run %q (text): unexpected result length", cmd) + } + out, ok := resp.Result[0]["output"].(string) + if !ok { + return "", fmt.Errorf("run %q (text): missing output field", cmd) + } + return out, nil +} diff --git a/tools/stress/device-observer/internal/eapi/client_test.go b/tools/stress/device-observer/internal/eapi/client_test.go new file mode 100644 index 0000000000..5abcb3ddbe --- /dev/null +++ b/tools/stress/device-observer/internal/eapi/client_test.go @@ -0,0 +1,28 @@ +package eapi + +import "testing" + +// TestNewClientNoServer verifies NewClient surfaces a connection error +// when no eAPI server is reachable. (goeapi's Connect dials lazily for +// version negotiation; we only assert no panic and that a Client struct +// is constructable in the success path via a non-routable address with +// a short timeout is environment-specific, so we keep this test minimal +// and exercise the real client behavior via the sampler tests through a +// fake.) +func TestNewClientReturnsNonNil(t *testing.T) { + // Use port 0 so even if a dial were attempted it would fail + // immediately; the existing goeapi version-probe is non-fatal and + // returns a valid Node on connection failure. + c, err := NewClient("127.0.0.1", "admin", "admin", 0) + if err != nil { + // Some goeapi versions surface dial errors here; that is OK. + t.Logf("NewClient returned err (acceptable): %v", err) + return + } + if c == nil { + t.Fatal("NewClient returned nil client without error") + } + if c.node == nil { + t.Fatal("NewClient returned Client with nil node") + } +} diff --git a/tools/stress/device-observer/internal/loggingtail/agent.go b/tools/stress/device-observer/internal/loggingtail/agent.go new file mode 100644 index 0000000000..ab385574a6 --- /dev/null +++ b/tools/stress/device-observer/internal/loggingtail/agent.go @@ -0,0 +1,8 @@ +package loggingtail + +import "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + +func NewAgent(workingDir string) collector.Collector { + _ = workingDir + return collector.Noop{} +} diff --git a/tools/stress/device-observer/internal/loggingtail/eos.go b/tools/stress/device-observer/internal/loggingtail/eos.go new file mode 100644 index 0000000000..e33ee09ee0 --- /dev/null +++ b/tools/stress/device-observer/internal/loggingtail/eos.go @@ -0,0 +1,13 @@ +// Package loggingtail will follow agent and orchestrator logs. Real +// implementations land in PR #3795. +package loggingtail + +import ( + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/eapi" +) + +func NewEOS(client *eapi.Client, workingDir string) collector.Collector { + _, _ = client, workingDir + return collector.Noop{} +} diff --git a/tools/stress/device-observer/internal/promscrape/scrape.go b/tools/stress/device-observer/internal/promscrape/scrape.go new file mode 100644 index 0000000000..51ff5c8332 --- /dev/null +++ b/tools/stress/device-observer/internal/promscrape/scrape.go @@ -0,0 +1,10 @@ +// Package promscrape will scrape the doublezero-agent's Prometheus metrics +// endpoint. Real implementation lands in PR #3794. +package promscrape + +import "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + +func New(metricsURL, workingDir string) collector.Collector { + _, _ = metricsURL, workingDir + return collector.Noop{} +} diff --git a/tools/stress/device-observer/internal/runlog/reader.go b/tools/stress/device-observer/internal/runlog/reader.go new file mode 100644 index 0000000000..c505d77e4a --- /dev/null +++ b/tools/stress/device-observer/internal/runlog/reader.go @@ -0,0 +1,10 @@ +// Package runlog will tail the orchestrator-written run log. Real +// implementation lands in PR #3795. +package runlog + +import "github.com/malbeclabs/doublezero/tools/stress/device-observer/internal/collector" + +func New(workingDir string) collector.Collector { + _ = workingDir + return collector.Noop{} +} diff --git a/tools/stress/device-observer/internal/sample/eos.go b/tools/stress/device-observer/internal/sample/eos.go new file mode 100644 index 0000000000..bcb2e26967 --- /dev/null +++ b/tools/stress/device-observer/internal/sample/eos.go @@ -0,0 +1,116 @@ +// Package sample issues a fixed list of show commands on every tick and +// writes one file per command. Per-command failures are logged but do not +// stop the loop; the abort decider (PR #3796) owns repeated-failure policy. +package sample + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "time" +) + +// eapiRunner is the subset of *eapi.Client used by Sampler; tests +// substitute a fake. +type eapiRunner interface { + RunShowJSON(cmd string) (json.RawMessage, error) + RunShowText(cmd string) (string, error) +} + +type commandSpec struct { + cmd, slug string + json bool // false → text encoding +} + +var commands = []commandSpec{ + {"show hardware capacity", "show-hardware-capacity", true}, + {"show gre tunnel static", "show-gre-tunnel-static", true}, + {"show processes top once", "show-processes-top-once", true}, + {"show logging errors", "show-logging-errors", false}, + {"show logging critical", "show-logging-critical", false}, +} + +type Sampler struct { + client eapiRunner + workingDir string + interval time.Duration + logger *slog.Logger + now func() time.Time +} + +func NewSampler(client eapiRunner, workingDir string, interval time.Duration, logger *slog.Logger) *Sampler { + return &Sampler{client: client, workingDir: workingDir, interval: interval, logger: logger, now: time.Now} +} + +// Run samples immediately, then on every tick of interval, until ctx is +// canceled. The immediate first sample avoids waiting a full interval for +// the first snapshot when interval is large. +func (s *Sampler) Run(ctx context.Context) error { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + s.tick(ctx) + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + s.tick(ctx) + } + } +} + +func (s *Sampler) tick(ctx context.Context) { + ts := s.now().UTC() + for _, c := range commands { + if ctx.Err() != nil { + return + } + // Run each command in a goroutine so ctx cancellation can + // abandon the tick even though goeapi.RunCommands does not + // accept a context. The leaked goroutine finishes whenever + // the underlying HTTP call returns and does not block exit. + done := make(chan error, 1) + go func() { done <- s.runOne(c, ts) }() + select { + case <-ctx.Done(): + return + case err := <-done: + if err != nil { + s.logger.Warn("sample command failed", "command", c.cmd, "err", err) + } + } + } +} + +func (s *Sampler) runOne(c commandSpec, ts time.Time) error { + var body []byte + ext := "log" + if c.json { + raw, err := s.client.RunShowJSON(c.cmd) + if err != nil { + return err + } + body, ext = raw, "json" + } else { + text, err := s.client.RunShowText(c.cmd) + if err != nil { + return err + } + body = []byte(text) + } + path := filepath.Join(s.workingDir, fmt.Sprintf("%s-%s.%s", c.slug, fileTimestamp(ts), ext)) + if err := os.WriteFile(path, body, 0o640); err != nil { + return fmt.Errorf("write %s: %w", path, err) + } + return nil +} + +// fileTimestamp renders t as ISO 8601 UTC with `:` replaced by `-` so the +// result is portable across filesystems that disallow `:`. +func fileTimestamp(t time.Time) string { + return strings.ReplaceAll(t.Format("2006-01-02T15:04:05.000000000Z"), ":", "-") +} diff --git a/tools/stress/device-observer/internal/sample/eos_test.go b/tools/stress/device-observer/internal/sample/eos_test.go new file mode 100644 index 0000000000..931478f228 --- /dev/null +++ b/tools/stress/device-observer/internal/sample/eos_test.go @@ -0,0 +1,218 @@ +package sample + +import ( + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "os" + "path/filepath" + "strings" + "sync/atomic" + "testing" + "time" +) + +// fakeRunner is a tiny eapiRunner implementation that returns canned +// responses (or errors) per command. +type fakeRunner struct { + jsonResp map[string]json.RawMessage + textResp map[string]string + errs map[string]error + calls atomic.Int32 +} + +func (f *fakeRunner) RunShowJSON(cmd string) (json.RawMessage, error) { + f.calls.Add(1) + if err, ok := f.errs[cmd]; ok { + return nil, err + } + if v, ok := f.jsonResp[cmd]; ok { + return v, nil + } + return json.RawMessage(`{}`), nil +} + +func (f *fakeRunner) RunShowText(cmd string) (string, error) { + f.calls.Add(1) + if err, ok := f.errs[cmd]; ok { + return "", err + } + if v, ok := f.textResp[cmd]; ok { + return v, nil + } + return "", nil +} + +func discardLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +func newSamplerWithClock(t *testing.T, runner eapiRunner, dir string, clock func() time.Time) *Sampler { + t.Helper() + s := NewSampler(runner, dir, time.Hour, discardLogger()) + s.now = clock + return s +} + +// TestTickWritesAllFiles verifies a single tick produces one file per +// command with the expected name prefix and body. +func TestTickWritesAllFiles(t *testing.T) { + dir := t.TempDir() + runner := &fakeRunner{ + jsonResp: map[string]json.RawMessage{ + "show hardware capacity": json.RawMessage(`{"capacity":1}`), + "show gre tunnel static": json.RawMessage(`{"tunnels":[]}`), + "show processes top once": json.RawMessage(`{"processes":[]}`), + }, + textResp: map[string]string{ + "show logging errors": "errlog\n", + "show logging critical": "critlog\n", + }, + } + frozen := time.Date(2026, 5, 29, 12, 34, 56, 123456789, time.UTC) + s := newSamplerWithClock(t, runner, dir, func() time.Time { return frozen }) + + s.tick(context.Background()) + + entries, err := os.ReadDir(dir) + if err != nil { + t.Fatalf("read dir: %v", err) + } + if len(entries) != 5 { + names := make([]string, 0, len(entries)) + for _, e := range entries { + names = append(names, e.Name()) + } + t.Fatalf("expected 5 files, got %d: %v", len(entries), names) + } + + expect := map[string]string{ + "show-hardware-capacity": `{"capacity":1}`, + "show-gre-tunnel-static": `{"tunnels":[]}`, + "show-processes-top-once": `{"processes":[]}`, + "show-logging-errors": "errlog\n", + "show-logging-critical": "critlog\n", + } + for prefix, want := range expect { + matches, _ := filepath.Glob(filepath.Join(dir, prefix+"-*")) + if len(matches) != 1 { + t.Errorf("prefix %q: expected 1 match, got %d", prefix, len(matches)) + continue + } + got, err := os.ReadFile(matches[0]) + if err != nil { + t.Errorf("read %s: %v", matches[0], err) + continue + } + if string(got) != want { + t.Errorf("file %s: body = %q, want %q", matches[0], string(got), want) + } + } +} + +// TestSingleCommandFailureContinues confirms one failing command does +// not abort the tick: the other four files are still written. +func TestSingleCommandFailureContinues(t *testing.T) { + dir := t.TempDir() + runner := &fakeRunner{ + errs: map[string]error{ + "show gre tunnel static": errors.New("boom"), + }, + } + frozen := time.Date(2026, 5, 29, 12, 34, 56, 0, time.UTC) + s := newSamplerWithClock(t, runner, dir, func() time.Time { return frozen }) + + s.tick(context.Background()) + + entries, err := os.ReadDir(dir) + if err != nil { + t.Fatalf("read dir: %v", err) + } + if len(entries) != 4 { + t.Fatalf("expected 4 files after one failure, got %d", len(entries)) + } + for _, e := range entries { + if strings.HasPrefix(e.Name(), "show-gre-tunnel-static") { + t.Errorf("failing command should not have produced a file, got %s", e.Name()) + } + } +} + +// TestTwoTicksProduceDistinctFiles verifies the nanosecond-precision +// filename suffix prevents collisions between consecutive ticks. +func TestTwoTicksProduceDistinctFiles(t *testing.T) { + dir := t.TempDir() + runner := &fakeRunner{} + base := time.Date(2026, 5, 29, 12, 34, 56, 0, time.UTC) + var n atomic.Int64 + s := newSamplerWithClock(t, runner, dir, func() time.Time { + i := n.Add(1) + return base.Add(time.Duration(i) * time.Nanosecond) + }) + + s.tick(context.Background()) + s.tick(context.Background()) + + entries, err := os.ReadDir(dir) + if err != nil { + t.Fatalf("read dir: %v", err) + } + if len(entries) != 10 { + t.Fatalf("expected 10 files across two ticks, got %d", len(entries)) + } + seen := map[string]struct{}{} + for _, e := range entries { + if _, dup := seen[e.Name()]; dup { + t.Fatalf("duplicate filename %s", e.Name()) + } + seen[e.Name()] = struct{}{} + } +} + +// TestRunCancelsCleanly confirms Run returns nil promptly after context +// cancellation. +func TestRunCancelsCleanly(t *testing.T) { + dir := t.TempDir() + runner := &fakeRunner{} + s := NewSampler(runner, dir, time.Hour, discardLogger()) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- s.Run(ctx) }() + + // Give the initial tick a moment to run, then cancel. + time.Sleep(50 * time.Millisecond) + cancel() + + select { + case err := <-done: + if err != nil { + t.Fatalf("Run returned error after cancel: %v", err) + } + case <-time.After(2 * time.Second): + t.Fatal("Run did not return within 2s of cancel") + } +} + +// TestFileTimestampNoColon ensures the produced suffix is filesystem-safe. +func TestFileTimestampNoColon(t *testing.T) { + got := fileTimestamp(time.Date(2026, 5, 29, 12, 34, 56, 123, time.UTC)) + if strings.Contains(got, ":") { + t.Errorf("timestamp %q must not contain ':'", got) + } + if !strings.HasSuffix(got, "Z") { + t.Errorf("timestamp %q must be UTC (end with Z)", got) + } + if !strings.HasPrefix(got, "2026-05-29T12-34-56") { + t.Errorf("timestamp %q has unexpected prefix", got) + } +} + +// Sampler must satisfy collector.Collector via Run(ctx) error. +type runnable interface { + Run(ctx context.Context) error +} + +var _ runnable = (*Sampler)(nil)