From 371195fc8ab98e1e53a5ec91a7c42d50df6918ea Mon Sep 17 00:00:00 2001 From: Gustavo Carvalho Date: Tue, 2 Jun 2026 08:33:00 -0300 Subject: [PATCH 1/5] fix: improve logging on nil-resource case Signed-off-by: Gustavo Carvalho --- main.go | 257 +++++++++++++++++++++++++++++++++++++++++++++++---- main_test.go | 216 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 451 insertions(+), 22 deletions(-) diff --git a/main.go b/main.go index 7426496..82d6462 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "io/fs" + "maps" "net" "net/http" "net/url" @@ -318,6 +319,7 @@ type CustodianExecutionResult struct { ArtifactPath string ResourcesPath string LogPaths []string + LogTail string DiagnosticWarnings []string } @@ -507,10 +509,12 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe e.Logger.Info("Custodian AWS API trace enabled", "check_name", req.Check.Name, "trace_log_path", traceLogPath, "pythonpath_dir", traceDir) } } - e.Logger.Debug("Executing custodian command", + e.Logger.Info("Executing custodian command", "check_name", req.Check.Name, + "resource", req.Check.Resource, "command", req.BinaryPath, "args", args, + "aws_regions", regions, ) stdoutBuf := &lockedBuffer{} stderrBuf := &lockedBuffer{} @@ -637,14 +641,14 @@ commandFinished: if resources != nil { result.Resources = resources } - var logTail string - var logErr error - if req.LogTailDuringRun || err != nil || runCtx.Err() != nil || resourcesErr != nil { - logPaths, tail, readErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes) - result.LogPaths = logPaths - logTail = tail - logErr = readErr - } + // Always capture the tail of custodian's own run log, regardless of exit + // code or LogTailDuringRun. Custodian frequently exits 0 with an empty + // resources.json after logging AccessDenied / EndpointConnectionError to + // custodian-run.log; capturing the tail here (before the temp dir is + // deleted) is the only non-invasive way to surface that cause to Eval. + logPaths, logTail, logErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes) + result.LogPaths = logPaths + result.LogTail = logTail if err != nil { result.Err = fmt.Errorf("custodian execution failed: %w", err) @@ -2455,8 +2459,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api successfulPolicyRuns := 0 hadCheckExecutionFailures := false defer func() { - if p.parsedConfig.PreserveArtifacts && hadCheckExecutionFailures { - p.Logger.Warn("Preserving cloud custodian execution artifacts after check execution failure", "execution_root", executionRoot) + zeroEvidence := successfulPolicyRuns == 0 && totalEvidenceCount == 0 + if p.parsedConfig.PreserveArtifacts && (hadCheckExecutionFailures || zeroEvidence) { + p.Logger.Warn("Preserving cloud custodian execution artifacts for troubleshooting", + "execution_root", executionRoot, + "had_check_execution_failures", hadCheckExecutionFailures, + "zero_evidence", zeroEvidence, + ) return } if err := os.RemoveAll(executionRoot); err != nil { @@ -2465,9 +2474,21 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api }() p.Logger.Debug("Created temporary execution root", "execution_root", executionRoot) + // Retain a compact diagnostic per execution (baselines + checks) and a + // per-check summary row so the zero-evidence failure branch and the + // consolidated end-of-run summary can explain why nothing was produced. + var executionDiagnostics []executionDiagnostic + var runSummaries []checkRunSummary + baselines := p.collectInventoryBaselines(ctx, executionRoot) - for resourceType, baseline := range baselines { - if baseline == nil || baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 { + for _, resourceType := range slices.Sorted(maps.Keys(baselines)) { + baseline := baselines[resourceType] + if baseline == nil { + continue + } + baselineName := fmt.Sprintf("inventory-%s", sanitizeIdentifier(resourceType)) + executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil)) + if baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 { continue } err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings) @@ -2481,6 +2502,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Warn("Skipping custodian execution due to check parse issues", "check_name", check.Name, "parse_errors", check.ParseErrors) accumulatedErrors = errors.Join(accumulatedErrors, fmt.Errorf("check %s has parse errors: %s", check.Name, strings.Join(check.ParseErrors, "; "))) hadCheckExecutionFailures = true + runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, HadError: true, Regions: p.parsedConfig.AWSRegions}) continue } @@ -2495,6 +2517,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Error("Skipping check due to unavailable inventory baseline", "check_name", check.Name, "resource", check.Resource, "error", err) accumulatedErrors = errors.Join(accumulatedErrors, err) hadCheckExecutionFailures = true + baselineRecordCount := 0 + if baseline != nil { + baselineRecordCount = len(baseline.Records) + } + runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, BaselineResourceCount: baselineRecordCount, HadError: true, Regions: p.parsedConfig.AWSRegions}) continue } @@ -2517,6 +2544,8 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Error("Skipping resource evaluation due to check execution error", "check_name", check.Name, "error", err) accumulatedErrors = errors.Join(accumulatedErrors, err) hadCheckExecutionFailures = true + executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, true)) + runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: execution.ExitCode, BaselineResourceCount: len(baseline.Records), MatchedResourceCount: len(execution.Resources), HadError: true, Regions: p.parsedConfig.AWSRegions}) continue } @@ -2547,10 +2576,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api } } + checkEvidenceCount := 0 + checkHadError := false for _, payload := range payloads { evidences, evalErr, successfulRuns := p.evaluateResourcePolicies(ctx, payload, req.GetPolicyPaths()) pendingEvidences = append(pendingEvidences, evidences...) totalEvidenceCount += len(evidences) + checkEvidenceCount += len(evidences) successfulPolicyRuns += successfulRuns p.Logger.Debug("Completed policy evaluations for resource", "check_name", payload.Check.Name, @@ -2561,6 +2593,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api "had_eval_error", evalErr != nil, ) if evalErr != nil { + checkHadError = true accumulatedErrors = errors.Join(accumulatedErrors, evalErr) } for len(pendingEvidences) >= evidenceBatchSize { @@ -2575,7 +2608,20 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Warn("Check completed with unavailable AWS service endpoints", "check_name", check.Name, "error", err) accumulatedErrors = errors.Join(accumulatedErrors, err) hadCheckExecutionFailures = true - } + checkHadError = true + } + executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, checkHadError)) + runSummaries = append(runSummaries, checkRunSummary{ + Check: check.Name, + Resource: check.Resource, + ExitCode: execution.ExitCode, + BaselineResourceCount: len(baseline.Records), + MatchedResourceCount: len(execution.Resources), + PayloadCount: len(payloads), + EvidenceCount: checkEvidenceCount, + HadError: checkHadError, + Regions: p.parsedConfig.AWSRegions, + }) } if len(pendingEvidences) > 0 { @@ -2596,11 +2642,25 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api p.Logger.Warn("No evidence generated by current evaluation run") } - if successfulPolicyRuns == 0 && totalEvidenceCount == 0 { - if accumulatedErrors == nil { - accumulatedErrors = errors.New("policy evaluation failed for all checks") - } - return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, accumulatedErrors + // Consolidated, one-glance picture across every resource type. Emitted at + // Warn when no evidence was produced, otherwise at Debug since a run that + // produced evidence is the healthy case. + zeroEvidenceOutcome := successfulPolicyRuns == 0 && totalEvidenceCount == 0 + summaryLog := p.Logger.Debug + if zeroEvidenceOutcome { + summaryLog = p.Logger.Warn + } + summaryLog("Cloud custodian evaluation summary", + "check_count", len(p.checks), + "successful_policy_runs", successfulPolicyRuns, + "total_evidence_count", totalEvidenceCount, + "had_check_execution_failures", hadCheckExecutionFailures, + "aws_regions", p.parsedConfig.AWSRegions, + "per_check", runSummaries, + ) + + if zeroEvidenceOutcome { + return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, composeZeroEvidenceError(accumulatedErrors, executionDiagnostics) } if hadCheckExecutionFailures { if accumulatedErrors == nil { @@ -2648,6 +2708,7 @@ func (p *CloudCustodianPlugin) collectInventoryBaselines(ctx context.Context, ex if execution.Err != nil || execution.Error != "" { baselineErr = formatExecutionFailure(check.Name, execution) } + p.warnZeroResourceBaseline(check.Name, resourceType, execution) records := make([]ResourceRecord, 0, len(execution.Resources)) for _, resource := range execution.Resources { records = append(records, p.buildResourceRecord(resourceType, resource)) @@ -3185,6 +3246,24 @@ func isReservedResourceLabel(label string) bool { } } +// warnZeroResourceBaseline emits a Warn naming the likely causes whenever an +// inventory baseline run exits 0 but returned no resources, so that an operator +// can distinguish a permissions problem from a network/endpoint problem without +// re-running with invasive tracing. This is not done for policy checks: a fully +// compliant estate legitimately matches zero resources. +func (p *CloudCustodianPlugin) warnZeroResourceBaseline(name, resource string, execution CustodianExecutionResult) { + if execution.ExitCode != 0 || len(execution.Resources) > 0 { + return + } + p.Logger.Warn("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints", + "name", name, + "resource", resource, + "exit_code", execution.ExitCode, + "stderr_tail", tailString(execution.Stderr, custodianOutputTailBytes), + "log_tail", execution.LogTail, + ) +} + func formatExecutionFailure(checkName string, execution CustodianExecutionResult) error { switch { case execution.Error != "" && execution.Err != nil: @@ -3210,6 +3289,146 @@ func formatExecutionDiagnosticWarnings(messages []string) error { return err } +// custodianDiagnosticDetailCap bounds the total size of the per-execution +// diagnostic detail appended to a zero-evidence failure error so that a policy +// pack with many resource types can never produce an unbounded gRPC error. +const custodianDiagnosticDetailCap = 32 * 1024 + +// tailString returns at most the last maxBytes bytes of s. +func tailString(s string, maxBytes int) string { + if maxBytes <= 0 || len(s) <= maxBytes { + return s + } + return s[len(s)-maxBytes:] +} + +// executionDiagnostic is a compact, bounded snapshot of a single custodian +// execution (an inventory baseline or a policy check) retained so that the +// zero-evidence failure branch can explain why nothing was produced. +type executionDiagnostic struct { + name string + resource string + isBaseline bool + exitCode int + resourceCount int + logTail string + stderrTail string + warnings []string + hadError bool +} + +func newExecutionDiagnostic(name, resource string, isBaseline bool, execution CustodianExecutionResult, hadError bool) executionDiagnostic { + return executionDiagnostic{ + name: name, + resource: resource, + isBaseline: isBaseline, + exitCode: execution.ExitCode, + resourceCount: len(execution.Resources), + logTail: execution.LogTail, + stderrTail: tailString(execution.Stderr, custodianOutputTailBytes), + warnings: execution.DiagnosticWarnings, + hadError: hadError, + } +} + +func (d executionDiagnostic) kind() string { + if d.isBaseline { + return "baseline" + } + return "check" +} + +// formatExecutionDiagnosticDetail renders the full (already capped) detail for +// one execution: its identity, exit code, resource count, diagnostic warnings, +// and the tails of stderr and custodian-run.log. +func formatExecutionDiagnosticDetail(d executionDiagnostic) string { + var sb strings.Builder + fmt.Fprintf(&sb, "\n--- %s %s [%s] exit=%d resources=%d ---\n", d.name, d.resource, d.kind(), d.exitCode, d.resourceCount) + if len(d.warnings) > 0 { + sb.WriteString("diagnostic warnings:\n") + for _, w := range d.warnings { + w = strings.TrimSpace(w) + if w == "" { + continue + } + sb.WriteString(" " + w + "\n") + } + } + if tail := strings.TrimSpace(d.stderrTail); tail != "" { + sb.WriteString("stderr tail:\n") + sb.WriteString(tail + "\n") + } + if tail := strings.TrimSpace(d.logTail); tail != "" { + sb.WriteString("custodian-run.log tail:\n") + sb.WriteString(tail + "\n") + } + return sb.String() +} + +// composeZeroEvidenceError builds the error returned when no evidence was +// produced for any check. It preserves any accumulated per-policy/exec errors +// and appends, for every custodian execution, a compact one-line summary plus +// full log/stderr tails for the suspicious (zero-resource) executions. The +// generic sentence survives only as a last-resort fallback when there is +// genuinely nothing else to report. +func composeZeroEvidenceError(accumulatedErrors error, diagnostics []executionDiagnostic) error { + var sb strings.Builder + + if len(diagnostics) > 0 { + fmt.Fprintf(&sb, "custodian execution summary (%d execution(s)):\n", len(diagnostics)) + for _, d := range diagnostics { + fmt.Fprintf(&sb, " - %s %s [%s] exit=%d resources=%d had_error=%t\n", + d.name, d.resource, d.kind(), d.exitCode, d.resourceCount, d.hadError) + } + } + + omitted := 0 + for _, d := range diagnostics { + // Only the zero-resource executions are suspicious; emit their full + // tails. Executions that returned resources get the one-line summary + // above only, keeping the total bounded. + if d.resourceCount > 0 { + continue + } + section := formatExecutionDiagnosticDetail(d) + if sb.Len()+len(section) > custodianDiagnosticDetailCap { + omitted++ + continue + } + sb.WriteString(section) + } + if omitted > 0 { + fmt.Fprintf(&sb, "\n(%d additional zero-resource diagnostic section(s) omitted to bound error size)\n", omitted) + } + + detail := strings.TrimSpace(sb.String()) + + switch { + case detail != "" && accumulatedErrors != nil: + return errors.Join(accumulatedErrors, errors.New(detail)) + case detail != "": + return errors.New(detail) + case accumulatedErrors != nil: + return accumulatedErrors + default: + return errors.New("policy evaluation failed for all checks") + } +} + +// checkRunSummary is one row of the consolidated end-of-run summary emitted by +// Eval, giving an operator a one-glance picture across every resource type. +type checkRunSummary struct { + Check string `json:"check"` + Resource string `json:"resource"` + ExitCode int `json:"exit_code"` + BaselineResourceCount int `json:"baseline_resource_count"` + MatchedResourceCount int `json:"matched_resource_count"` + PayloadCount int `json:"payload_count"` + EvidenceCount int `json:"evidence_count"` + HadError bool `json:"had_error"` + Regions []string `json:"regions,omitempty"` +} + func (p *CloudCustodianPlugin) logPolicyPayload(payload *StandardizedResourcePayload) { if payload == nil || !p.Logger.IsDebug() { return diff --git a/main_test.go b/main_test.go index 21109f9..fbcd891 100644 --- a/main_test.go +++ b/main_test.go @@ -1144,7 +1144,7 @@ exit 3 } }) - t.Run("does not read custodian log artifacts on success by default", func(t *testing.T) { + t.Run("captures custodian log tail on success without surfacing it as an error", func(t *testing.T) { script := `#!/bin/sh set -eu out="" @@ -1178,8 +1178,16 @@ printf '[]' > "$out/test-policy/resources.json" if result.Err != nil { t.Fatalf("expected successful execution, got error: %v", result.Err) } - if len(result.LogPaths) != 0 { - t.Fatalf("expected successful execution not to walk log artifacts by default, got %#v", result.LogPaths) + // The run-log tail is now always captured so Eval can surface it on the + // exit-0/empty path, but a successful run must not turn it into an error. + if len(result.LogPaths) != 1 || !strings.HasSuffix(result.LogPaths[0], "custodian-run.log") { + t.Fatalf("expected custodian log path to be captured on success, got %#v", result.LogPaths) + } + if !strings.Contains(result.LogTail, "success log detail") { + t.Fatalf("expected custodian log tail to be captured on success, got %q", result.LogTail) + } + if result.Error != "" { + t.Fatalf("expected no error string on successful execution, got %q", result.Error) } }) @@ -2053,6 +2061,208 @@ func TestEvalFailsWhenInventoryBaselineErrors(t *testing.T) { } } +func TestEvalZeroEvidenceErrorIncludesCustodianDiagnostics(t *testing.T) { + now := time.Now().UTC() + + t.Run("all checks exit zero with empty resources surfaces per-execution log tails", func(t *testing.T) { + executor := &fakeExecutor{results: map[string]CustodianExecutionResult{ + "inventory-aws-s3": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + Stderr: "boto3 access denied while listing buckets", + LogTail: "ERROR custodian.policy AccessDenied: not authorized to perform s3:ListAllMyBuckets", + }, + "inventory-aws-ec2": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy EndpointConnectionError: Could not connect to ec2.us-gov-west-1", + }, + "check-a": { + StartedAt: now, + EndedAt: now.Add(10 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy AccessDenied: not authorized to perform s3:GetBucketPolicy", + }, + "check-b": { + StartedAt: now, + EndedAt: now.Add(10 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy EndpointConnectionError: Could not connect to ec2.us-gov-west-1", + }, + }} + + apiHelper := &fakeAPIHelper{} + plugin := &CloudCustodianPlugin{ + Logger: hclog.NewNullLogger(), + parsedConfig: &ParsedConfig{ + CheckTimeout: 30 * time.Second, + }, + checks: []CustodianCheck{ + {Index: 0, Name: "check-a", Resource: "aws.s3", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-a", "resource": "aws.s3"}}, + {Index: 1, Name: "check-b", Resource: "aws.ec2", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-b", "resource": "aws.ec2"}}, + }, + executor: executor, + evaluator: &fakePolicyEvaluator{}, + } + + resp, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper) + if err == nil { + t.Fatal("expected eval failure when no evidence is produced") + } + if resp.GetStatus() != proto.ExecutionStatus_FAILURE { + t.Fatalf("expected failure status, got %s", resp.GetStatus().String()) + } + msg := err.Error() + if msg == "policy evaluation failed for all checks" { + t.Fatalf("expected enriched diagnostics, got bare generic error") + } + for _, want := range []string{ + "check-a", "check-b", "inventory-aws-s3", "inventory-aws-ec2", + "exit=0", "resources=0", + "AccessDenied: not authorized to perform s3:ListAllMyBuckets", + "AccessDenied: not authorized to perform s3:GetBucketPolicy", + "EndpointConnectionError: Could not connect to ec2.us-gov-west-1", + "boto3 access denied while listing buckets", + } { + if !strings.Contains(msg, want) { + t.Fatalf("expected error to contain %q, got:\n%s", want, msg) + } + } + }) + + t.Run("mixed errored and empty executions includes both per-policy errors and diagnostics", func(t *testing.T) { + executor := &fakeExecutor{results: map[string]CustodianExecutionResult{ + "inventory-aws-s3": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy AccessDenied on s3", + }, + "inventory-aws-ec2": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy AccessDenied on ec2", + }, + "check-a": { + StartedAt: now, + EndedAt: now.Add(10 * time.Millisecond), + ExitCode: 2, + Error: "custodian crashed during augmentation", + Err: errors.New("custodian crashed during augmentation"), + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy traceback during augment", + }, + "check-b": { + StartedAt: now, + EndedAt: now.Add(10 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: "ERROR custodian.policy AccessDenied on ec2 describe", + }, + }} + + apiHelper := &fakeAPIHelper{} + plugin := &CloudCustodianPlugin{ + Logger: hclog.NewNullLogger(), + parsedConfig: &ParsedConfig{ + CheckTimeout: 30 * time.Second, + }, + checks: []CustodianCheck{ + {Index: 0, Name: "check-a", Resource: "aws.s3", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-a", "resource": "aws.s3"}}, + {Index: 1, Name: "check-b", Resource: "aws.ec2", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-b", "resource": "aws.ec2"}}, + }, + executor: executor, + evaluator: &fakePolicyEvaluator{}, + } + + _, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper) + if err == nil { + t.Fatal("expected eval failure") + } + msg := err.Error() + // Per-policy error preserved from formatExecutionFailure. + if !strings.Contains(msg, "custodian crashed during augmentation") { + t.Fatalf("expected per-policy execution error, got:\n%s", msg) + } + // Per-execution diagnostics also present. + if !strings.Contains(msg, "check-b") || !strings.Contains(msg, "AccessDenied on ec2 describe") { + t.Fatalf("expected per-execution diagnostics for empty check, got:\n%s", msg) + } + if !strings.Contains(msg, "custodian execution summary") { + t.Fatalf("expected execution summary header, got:\n%s", msg) + } + }) + + t.Run("bounds total diagnostic size and reports omitted sections", func(t *testing.T) { + const checkCount = 16 + bigTail := strings.Repeat("AccessDenied X ", 320) // ~4.8KB per execution + results := map[string]CustodianExecutionResult{ + "inventory-aws-s3": { + StartedAt: now, + EndedAt: now.Add(5 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: bigTail, + }, + } + checks := make([]CustodianCheck, 0, checkCount) + for i := 0; i < checkCount; i++ { + name := fmt.Sprintf("check-%02d", i) + results[name] = CustodianExecutionResult{ + StartedAt: now, + EndedAt: now.Add(10 * time.Millisecond), + ExitCode: 0, + Resources: []interface{}{}, + LogTail: bigTail, + } + checks = append(checks, CustodianCheck{ + Index: i, + Name: name, + Resource: "aws.s3", + Provider: "aws", + RawPolicy: map[string]interface{}{"name": name, "resource": "aws.s3"}, + }) + } + + apiHelper := &fakeAPIHelper{} + plugin := &CloudCustodianPlugin{ + Logger: hclog.NewNullLogger(), + parsedConfig: &ParsedConfig{ + CheckTimeout: 30 * time.Second, + }, + checks: checks, + executor: &fakeExecutor{results: results}, + evaluator: &fakePolicyEvaluator{}, + } + + _, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper) + if err == nil { + t.Fatal("expected eval failure") + } + msg := err.Error() + if len(msg) > custodianDiagnosticDetailCap+8*1024 { + t.Fatalf("expected bounded error size, got %d bytes", len(msg)) + } + if !strings.Contains(msg, "omitted to bound error size") { + t.Fatalf("expected omitted-sections marker, got message of %d bytes:\n%s", len(msg), msg) + } + // Every execution still gets its compact one-line summary even when its + // full detail section was omitted. + if !strings.Contains(msg, "check-15") { + t.Fatalf("expected compact summary line for every execution, got:\n%s", msg) + } + }) +} + func TestConfigureLoadsChecks(t *testing.T) { stubLookPath(t, func(binary string) (string, error) { return "/usr/local/bin/" + binary, nil From c2a104267be44fe951bf5e997ecf03d89a2e048c Mon Sep 17 00:00:00 2001 From: Gustavo Carvalho Date: Tue, 2 Jun 2026 09:35:20 -0300 Subject: [PATCH 2/5] fix: copilot issues Signed-off-by: Gustavo Carvalho --- main.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 82d6462..662a635 100644 --- a/main.go +++ b/main.go @@ -3255,6 +3255,12 @@ func (p *CloudCustodianPlugin) warnZeroResourceBaseline(name, resource string, e if execution.ExitCode != 0 || len(execution.Resources) > 0 { return } + // A baseline can exit 0 yet still carry an error (e.g. resources.json + // missing or unparseable). That real cause is already reported via + // formatExecutionFailure, so don't emit a misleading IAM/network warning. + if execution.Err != nil || execution.Error != "" { + return + } p.Logger.Warn("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints", "name", name, "resource", resource, @@ -3374,11 +3380,22 @@ func formatExecutionDiagnosticDetail(d executionDiagnostic) string { func composeZeroEvidenceError(accumulatedErrors error, diagnostics []executionDiagnostic) error { var sb strings.Builder + summaryOmitted := 0 if len(diagnostics) > 0 { fmt.Fprintf(&sb, "custodian execution summary (%d execution(s)):\n", len(diagnostics)) for _, d := range diagnostics { - fmt.Fprintf(&sb, " - %s %s [%s] exit=%d resources=%d had_error=%t\n", + line := fmt.Sprintf(" - %s %s [%s] exit=%d resources=%d had_error=%t\n", d.name, d.resource, d.kind(), d.exitCode, d.resourceCount, d.hadError) + // Bound the summary too, so a very large policy pack can never grow + // the error past the cap. + if sb.Len()+len(line) > custodianDiagnosticDetailCap { + summaryOmitted++ + continue + } + sb.WriteString(line) + } + if summaryOmitted > 0 { + fmt.Fprintf(&sb, " ... %d more execution summary line(s) omitted to bound error size\n", summaryOmitted) } } From f4eee7cb42243cc1a8cdc0d67f5f68081d6e18a1 Mon Sep 17 00:00:00 2001 From: Gustavo Carvalho Date: Tue, 2 Jun 2026 09:42:33 -0300 Subject: [PATCH 3/5] fix: copilot issues Signed-off-by: Gustavo Carvalho --- main.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 662a635..59204be 100644 --- a/main.go +++ b/main.go @@ -2487,8 +2487,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api continue } baselineName := fmt.Sprintf("inventory-%s", sanitizeIdentifier(resourceType)) - executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil)) - if baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 { + // A baseline with only diagnostic warnings (e.g. unreachable endpoints) + // is still treated as a failure below, so reflect that in had_error. + baselineHadWarnings := len(baseline.Execution.DiagnosticWarnings) > 0 + executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil || baselineHadWarnings)) + if baseline.Err != nil || !baselineHadWarnings { continue } err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings) From c95fd23f2a626d8974fa4509ee16ca66601c90db Mon Sep 17 00:00:00 2001 From: Gustavo Carvalho Date: Tue, 2 Jun 2026 10:00:32 -0300 Subject: [PATCH 4/5] fix: comments Signed-off-by: Gustavo Carvalho --- main.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/main.go b/main.go index 59204be..6718b12 100644 --- a/main.go +++ b/main.go @@ -3298,9 +3298,11 @@ func formatExecutionDiagnosticWarnings(messages []string) error { return err } -// custodianDiagnosticDetailCap bounds the total size of the per-execution -// diagnostic detail appended to a zero-evidence failure error so that a policy -// pack with many resource types can never produce an unbounded gRPC error. +// custodianDiagnosticDetailCap bounds the size of the per-execution diagnostic +// detail (the one-line summaries plus log/stderr tails) that +// composeZeroEvidenceError appends. It caps only that appended detail; any +// accumulatedErrors joined ahead of it carry their own (per-policy) content and +// are intentionally not truncated here so genuine failures are never hidden. const custodianDiagnosticDetailCap = 32 * 1024 // tailString returns at most the last maxBytes bytes of s. From 1d379610ea4b8e5bb2bc29535208c47bebfe5d13 Mon Sep 17 00:00:00 2001 From: Gustavo Carvalho Date: Tue, 2 Jun 2026 10:50:27 -0300 Subject: [PATCH 5/5] fix: extra debugs just in case Signed-off-by: Gustavo Carvalho --- main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/main.go b/main.go index 6718b12..bf9fcea 100644 --- a/main.go +++ b/main.go @@ -569,6 +569,7 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe "stdout_len", stdoutBuf.Len(), "stderr_len", stderrBuf.Len(), ) + e.Logger.Debug("custodian outputs", "stdout", stdoutBuf.Tail(100), "stderr", stderrBuf.Tail(100)) goto commandFinished case <-ticker.C: elapsed := time.Since(result.StartedAt).Round(time.Second).String()