From 371195fc8ab98e1e53a5ec91a7c42d50df6918ea Mon Sep 17 00:00:00 2001
From: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
Date: Tue, 2 Jun 2026 08:33:00 -0300
Subject: [PATCH 1/5] fix: improve logging on nil-resource case

Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
---
 main.go      | 257 +++++++++++++++++++++++++++++++++++++++++++++++----
 main_test.go | 216 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 451 insertions(+), 22 deletions(-)

diff --git a/main.go b/main.go
index 7426496..82d6462 100644
--- a/main.go
+++ b/main.go
@@ -11,6 +11,7 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
+	"maps"
 	"net"
 	"net/http"
 	"net/url"
@@ -318,6 +319,7 @@ type CustodianExecutionResult struct {
 	ArtifactPath       string
 	ResourcesPath      string
 	LogPaths           []string
+	LogTail            string
 	DiagnosticWarnings []string
 }
 
@@ -507,10 +509,12 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe
 			e.Logger.Info("Custodian AWS API trace enabled", "check_name", req.Check.Name, "trace_log_path", traceLogPath, "pythonpath_dir", traceDir)
 		}
 	}
-	e.Logger.Debug("Executing custodian command",
+	e.Logger.Info("Executing custodian command",
 		"check_name", req.Check.Name,
+		"resource", req.Check.Resource,
 		"command", req.BinaryPath,
 		"args", args,
+		"aws_regions", regions,
 	)
 	stdoutBuf := &lockedBuffer{}
 	stderrBuf := &lockedBuffer{}
@@ -637,14 +641,14 @@ commandFinished:
 	if resources != nil {
 		result.Resources = resources
 	}
-	var logTail string
-	var logErr error
-	if req.LogTailDuringRun || err != nil || runCtx.Err() != nil || resourcesErr != nil {
-		logPaths, tail, readErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes)
-		result.LogPaths = logPaths
-		logTail = tail
-		logErr = readErr
-	}
+	// Always capture the tail of custodian's own run log, regardless of exit
+	// code or LogTailDuringRun. Custodian frequently exits 0 with an empty
+	// resources.json after logging AccessDenied / EndpointConnectionError to
+	// custodian-run.log; capturing the tail here (before the temp dir is
+	// deleted) is the only non-invasive way to surface that cause to Eval.
+	logPaths, logTail, logErr := readCustodianLogArtifacts(req.OutputDir, custodianOutputTailBytes)
+	result.LogPaths = logPaths
+	result.LogTail = logTail
 
 	if err != nil {
 		result.Err = fmt.Errorf("custodian execution failed: %w", err)
@@ -2455,8 +2459,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 	successfulPolicyRuns := 0
 	hadCheckExecutionFailures := false
 	defer func() {
-		if p.parsedConfig.PreserveArtifacts && hadCheckExecutionFailures {
-			p.Logger.Warn("Preserving cloud custodian execution artifacts after check execution failure", "execution_root", executionRoot)
+		zeroEvidence := successfulPolicyRuns == 0 && totalEvidenceCount == 0
+		if p.parsedConfig.PreserveArtifacts && (hadCheckExecutionFailures || zeroEvidence) {
+			p.Logger.Warn("Preserving cloud custodian execution artifacts for troubleshooting",
+				"execution_root", executionRoot,
+				"had_check_execution_failures", hadCheckExecutionFailures,
+				"zero_evidence", zeroEvidence,
+			)
 			return
 		}
 		if err := os.RemoveAll(executionRoot); err != nil {
@@ -2465,9 +2474,21 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 	}()
 	p.Logger.Debug("Created temporary execution root", "execution_root", executionRoot)
 
+	// Retain a compact diagnostic per execution (baselines + checks) and a
+	// per-check summary row so the zero-evidence failure branch and the
+	// consolidated end-of-run summary can explain why nothing was produced.
+	var executionDiagnostics []executionDiagnostic
+	var runSummaries []checkRunSummary
+
 	baselines := p.collectInventoryBaselines(ctx, executionRoot)
-	for resourceType, baseline := range baselines {
-		if baseline == nil || baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 {
+	for _, resourceType := range slices.Sorted(maps.Keys(baselines)) {
+		baseline := baselines[resourceType]
+		if baseline == nil {
+			continue
+		}
+		baselineName := fmt.Sprintf("inventory-%s", sanitizeIdentifier(resourceType))
+		executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil))
+		if baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 {
 			continue
 		}
 		err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings)
@@ -2481,6 +2502,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			p.Logger.Warn("Skipping custodian execution due to check parse issues", "check_name", check.Name, "parse_errors", check.ParseErrors)
 			accumulatedErrors = errors.Join(accumulatedErrors, fmt.Errorf("check %s has parse errors: %s", check.Name, strings.Join(check.ParseErrors, "; ")))
 			hadCheckExecutionFailures = true
+			runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, HadError: true, Regions: p.parsedConfig.AWSRegions})
 			continue
 		}
 
@@ -2495,6 +2517,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			p.Logger.Error("Skipping check due to unavailable inventory baseline", "check_name", check.Name, "resource", check.Resource, "error", err)
 			accumulatedErrors = errors.Join(accumulatedErrors, err)
 			hadCheckExecutionFailures = true
+			baselineRecordCount := 0
+			if baseline != nil {
+				baselineRecordCount = len(baseline.Records)
+			}
+			runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: -1, BaselineResourceCount: baselineRecordCount, HadError: true, Regions: p.parsedConfig.AWSRegions})
 			continue
 		}
 
@@ -2517,6 +2544,8 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			p.Logger.Error("Skipping resource evaluation due to check execution error", "check_name", check.Name, "error", err)
 			accumulatedErrors = errors.Join(accumulatedErrors, err)
 			hadCheckExecutionFailures = true
+			executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, true))
+			runSummaries = append(runSummaries, checkRunSummary{Check: check.Name, Resource: check.Resource, ExitCode: execution.ExitCode, BaselineResourceCount: len(baseline.Records), MatchedResourceCount: len(execution.Resources), HadError: true, Regions: p.parsedConfig.AWSRegions})
 			continue
 		}
 
@@ -2547,10 +2576,13 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			}
 		}
 
+		checkEvidenceCount := 0
+		checkHadError := false
 		for _, payload := range payloads {
 			evidences, evalErr, successfulRuns := p.evaluateResourcePolicies(ctx, payload, req.GetPolicyPaths())
 			pendingEvidences = append(pendingEvidences, evidences...)
 			totalEvidenceCount += len(evidences)
+			checkEvidenceCount += len(evidences)
 			successfulPolicyRuns += successfulRuns
 			p.Logger.Debug("Completed policy evaluations for resource",
 				"check_name", payload.Check.Name,
@@ -2561,6 +2593,7 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 				"had_eval_error", evalErr != nil,
 			)
 			if evalErr != nil {
+				checkHadError = true
 				accumulatedErrors = errors.Join(accumulatedErrors, evalErr)
 			}
 			for len(pendingEvidences) >= evidenceBatchSize {
@@ -2575,7 +2608,20 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			p.Logger.Warn("Check completed with unavailable AWS service endpoints", "check_name", check.Name, "error", err)
 			accumulatedErrors = errors.Join(accumulatedErrors, err)
 			hadCheckExecutionFailures = true
-		}
+			checkHadError = true
+		}
+		executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(check.Name, check.Resource, false, execution, checkHadError))
+		runSummaries = append(runSummaries, checkRunSummary{
+			Check:                 check.Name,
+			Resource:              check.Resource,
+			ExitCode:              execution.ExitCode,
+			BaselineResourceCount: len(baseline.Records),
+			MatchedResourceCount:  len(execution.Resources),
+			PayloadCount:          len(payloads),
+			EvidenceCount:         checkEvidenceCount,
+			HadError:              checkHadError,
+			Regions:               p.parsedConfig.AWSRegions,
+		})
 	}
 
 	if len(pendingEvidences) > 0 {
@@ -2596,11 +2642,25 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 		p.Logger.Warn("No evidence generated by current evaluation run")
 	}
 
-	if successfulPolicyRuns == 0 && totalEvidenceCount == 0 {
-		if accumulatedErrors == nil {
-			accumulatedErrors = errors.New("policy evaluation failed for all checks")
-		}
-		return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, accumulatedErrors
+	// Consolidated, one-glance picture across every resource type. Emitted at
+	// Warn when no evidence was produced, otherwise at Debug since a run that
+	// produced evidence is the healthy case.
+	zeroEvidenceOutcome := successfulPolicyRuns == 0 && totalEvidenceCount == 0
+	summaryLog := p.Logger.Debug
+	if zeroEvidenceOutcome {
+		summaryLog = p.Logger.Warn
+	}
+	summaryLog("Cloud custodian evaluation summary",
+		"check_count", len(p.checks),
+		"successful_policy_runs", successfulPolicyRuns,
+		"total_evidence_count", totalEvidenceCount,
+		"had_check_execution_failures", hadCheckExecutionFailures,
+		"aws_regions", p.parsedConfig.AWSRegions,
+		"per_check", runSummaries,
+	)
+
+	if zeroEvidenceOutcome {
+		return &proto.EvalResponse{Status: proto.ExecutionStatus_FAILURE}, composeZeroEvidenceError(accumulatedErrors, executionDiagnostics)
 	}
 	if hadCheckExecutionFailures {
 		if accumulatedErrors == nil {
@@ -2648,6 +2708,7 @@ func (p *CloudCustodianPlugin) collectInventoryBaselines(ctx context.Context, ex
 		if execution.Err != nil || execution.Error != "" {
 			baselineErr = formatExecutionFailure(check.Name, execution)
 		}
+		p.warnZeroResourceBaseline(check.Name, resourceType, execution)
 		records := make([]ResourceRecord, 0, len(execution.Resources))
 		for _, resource := range execution.Resources {
 			records = append(records, p.buildResourceRecord(resourceType, resource))
@@ -3185,6 +3246,24 @@ func isReservedResourceLabel(label string) bool {
 	}
 }
 
+// warnZeroResourceBaseline emits a Warn naming the likely causes whenever an
+// inventory baseline run exits 0 but returned no resources, so that an operator
+// can distinguish a permissions problem from a network/endpoint problem without
+// re-running with invasive tracing. This is not done for policy checks: a fully
+// compliant estate legitimately matches zero resources.
+func (p *CloudCustodianPlugin) warnZeroResourceBaseline(name, resource string, execution CustodianExecutionResult) {
+	if execution.ExitCode != 0 || len(execution.Resources) > 0 {
+		return
+	}
+	p.Logger.Warn("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints",
+		"name", name,
+		"resource", resource,
+		"exit_code", execution.ExitCode,
+		"stderr_tail", tailString(execution.Stderr, custodianOutputTailBytes),
+		"log_tail", execution.LogTail,
+	)
+}
+
 func formatExecutionFailure(checkName string, execution CustodianExecutionResult) error {
 	switch {
 	case execution.Error != "" && execution.Err != nil:
@@ -3210,6 +3289,146 @@ func formatExecutionDiagnosticWarnings(messages []string) error {
 	return err
 }
 
+// custodianDiagnosticDetailCap bounds the total size of the per-execution
+// diagnostic detail appended to a zero-evidence failure error so that a policy
+// pack with many resource types can never produce an unbounded gRPC error.
+const custodianDiagnosticDetailCap = 32 * 1024
+
+// tailString returns at most the last maxBytes bytes of s.
+func tailString(s string, maxBytes int) string {
+	if maxBytes <= 0 || len(s) <= maxBytes {
+		return s
+	}
+	return s[len(s)-maxBytes:]
+}
+
+// executionDiagnostic is a compact, bounded snapshot of a single custodian
+// execution (an inventory baseline or a policy check) retained so that the
+// zero-evidence failure branch can explain why nothing was produced.
+type executionDiagnostic struct {
+	name          string
+	resource      string
+	isBaseline    bool
+	exitCode      int
+	resourceCount int
+	logTail       string
+	stderrTail    string
+	warnings      []string
+	hadError      bool
+}
+
+func newExecutionDiagnostic(name, resource string, isBaseline bool, execution CustodianExecutionResult, hadError bool) executionDiagnostic {
+	return executionDiagnostic{
+		name:          name,
+		resource:      resource,
+		isBaseline:    isBaseline,
+		exitCode:      execution.ExitCode,
+		resourceCount: len(execution.Resources),
+		logTail:       execution.LogTail,
+		stderrTail:    tailString(execution.Stderr, custodianOutputTailBytes),
+		warnings:      execution.DiagnosticWarnings,
+		hadError:      hadError,
+	}
+}
+
+func (d executionDiagnostic) kind() string {
+	if d.isBaseline {
+		return "baseline"
+	}
+	return "check"
+}
+
+// formatExecutionDiagnosticDetail renders the full (already capped) detail for
+// one execution: its identity, exit code, resource count, diagnostic warnings,
+// and the tails of stderr and custodian-run.log.
+func formatExecutionDiagnosticDetail(d executionDiagnostic) string {
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "\n--- %s %s [%s] exit=%d resources=%d ---\n", d.name, d.resource, d.kind(), d.exitCode, d.resourceCount)
+	if len(d.warnings) > 0 {
+		sb.WriteString("diagnostic warnings:\n")
+		for _, w := range d.warnings {
+			w = strings.TrimSpace(w)
+			if w == "" {
+				continue
+			}
+			sb.WriteString("  " + w + "\n")
+		}
+	}
+	if tail := strings.TrimSpace(d.stderrTail); tail != "" {
+		sb.WriteString("stderr tail:\n")
+		sb.WriteString(tail + "\n")
+	}
+	if tail := strings.TrimSpace(d.logTail); tail != "" {
+		sb.WriteString("custodian-run.log tail:\n")
+		sb.WriteString(tail + "\n")
+	}
+	return sb.String()
+}
+
+// composeZeroEvidenceError builds the error returned when no evidence was
+// produced for any check. It preserves any accumulated per-policy/exec errors
+// and appends, for every custodian execution, a compact one-line summary plus
+// full log/stderr tails for the suspicious (zero-resource) executions. The
+// generic sentence survives only as a last-resort fallback when there is
+// genuinely nothing else to report.
+func composeZeroEvidenceError(accumulatedErrors error, diagnostics []executionDiagnostic) error {
+	var sb strings.Builder
+
+	if len(diagnostics) > 0 {
+		fmt.Fprintf(&sb, "custodian execution summary (%d execution(s)):\n", len(diagnostics))
+		for _, d := range diagnostics {
+			fmt.Fprintf(&sb, "  - %s %s [%s] exit=%d resources=%d had_error=%t\n",
+				d.name, d.resource, d.kind(), d.exitCode, d.resourceCount, d.hadError)
+		}
+	}
+
+	omitted := 0
+	for _, d := range diagnostics {
+		// Only the zero-resource executions are suspicious; emit their full
+		// tails. Executions that returned resources get the one-line summary
+		// above only, keeping the total bounded.
+		if d.resourceCount > 0 {
+			continue
+		}
+		section := formatExecutionDiagnosticDetail(d)
+		if sb.Len()+len(section) > custodianDiagnosticDetailCap {
+			omitted++
+			continue
+		}
+		sb.WriteString(section)
+	}
+	if omitted > 0 {
+		fmt.Fprintf(&sb, "\n(%d additional zero-resource diagnostic section(s) omitted to bound error size)\n", omitted)
+	}
+
+	detail := strings.TrimSpace(sb.String())
+
+	switch {
+	case detail != "" && accumulatedErrors != nil:
+		return errors.Join(accumulatedErrors, errors.New(detail))
+	case detail != "":
+		return errors.New(detail)
+	case accumulatedErrors != nil:
+		return accumulatedErrors
+	default:
+		return errors.New("policy evaluation failed for all checks")
+	}
+}
+
+// checkRunSummary is one row of the consolidated end-of-run summary emitted by
+// Eval, giving an operator a one-glance picture across every resource type.
+type checkRunSummary struct {
+	Check                 string   `json:"check"`
+	Resource              string   `json:"resource"`
+	ExitCode              int      `json:"exit_code"`
+	BaselineResourceCount int      `json:"baseline_resource_count"`
+	MatchedResourceCount  int      `json:"matched_resource_count"`
+	PayloadCount          int      `json:"payload_count"`
+	EvidenceCount         int      `json:"evidence_count"`
+	HadError              bool     `json:"had_error"`
+	Regions               []string `json:"regions,omitempty"`
+}
+
 func (p *CloudCustodianPlugin) logPolicyPayload(payload *StandardizedResourcePayload) {
 	if payload == nil || !p.Logger.IsDebug() {
 		return
diff --git a/main_test.go b/main_test.go
index 21109f9..fbcd891 100644
--- a/main_test.go
+++ b/main_test.go
@@ -1144,7 +1144,7 @@ exit 3
 		}
 	})
 
-	t.Run("does not read custodian log artifacts on success by default", func(t *testing.T) {
+	t.Run("captures custodian log tail on success without surfacing it as an error", func(t *testing.T) {
 		script := `#!/bin/sh
 set -eu
 out=""
@@ -1178,8 +1178,16 @@ printf '[]' > "$out/test-policy/resources.json"
 		if result.Err != nil {
 			t.Fatalf("expected successful execution, got error: %v", result.Err)
 		}
-		if len(result.LogPaths) != 0 {
-			t.Fatalf("expected successful execution not to walk log artifacts by default, got %#v", result.LogPaths)
+		// The run-log tail is now always captured so Eval can surface it on the
+		// exit-0/empty path, but a successful run must not turn it into an error.
+		if len(result.LogPaths) != 1 || !strings.HasSuffix(result.LogPaths[0], "custodian-run.log") {
+			t.Fatalf("expected custodian log path to be captured on success, got %#v", result.LogPaths)
+		}
+		if !strings.Contains(result.LogTail, "success log detail") {
+			t.Fatalf("expected custodian log tail to be captured on success, got %q", result.LogTail)
+		}
+		if result.Error != "" {
+			t.Fatalf("expected no error string on successful execution, got %q", result.Error)
 		}
 	})
 
@@ -2053,6 +2061,208 @@ func TestEvalFailsWhenInventoryBaselineErrors(t *testing.T) {
 	}
 }
 
+func TestEvalZeroEvidenceErrorIncludesCustodianDiagnostics(t *testing.T) {
+	now := time.Now().UTC()
+
+	t.Run("all checks exit zero with empty resources surfaces per-execution log tails", func(t *testing.T) {
+		executor := &fakeExecutor{results: map[string]CustodianExecutionResult{
+			"inventory-aws-s3": {
+				StartedAt: now,
+				EndedAt:   now.Add(5 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				Stderr:    "boto3 access denied while listing buckets",
+				LogTail:   "ERROR custodian.policy AccessDenied: not authorized to perform s3:ListAllMyBuckets",
+			},
+			"inventory-aws-ec2": {
+				StartedAt: now,
+				EndedAt:   now.Add(5 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy EndpointConnectionError: Could not connect to ec2.us-gov-west-1",
+			},
+			"check-a": {
+				StartedAt: now,
+				EndedAt:   now.Add(10 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy AccessDenied: not authorized to perform s3:GetBucketPolicy",
+			},
+			"check-b": {
+				StartedAt: now,
+				EndedAt:   now.Add(10 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy EndpointConnectionError: Could not connect to ec2.us-gov-west-1",
+			},
+		}}
+
+		apiHelper := &fakeAPIHelper{}
+		plugin := &CloudCustodianPlugin{
+			Logger: hclog.NewNullLogger(),
+			parsedConfig: &ParsedConfig{
+				CheckTimeout: 30 * time.Second,
+			},
+			checks: []CustodianCheck{
+				{Index: 0, Name: "check-a", Resource: "aws.s3", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-a", "resource": "aws.s3"}},
+				{Index: 1, Name: "check-b", Resource: "aws.ec2", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-b", "resource": "aws.ec2"}},
+			},
+			executor:  executor,
+			evaluator: &fakePolicyEvaluator{},
+		}
+
+		resp, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper)
+		if err == nil {
+			t.Fatal("expected eval failure when no evidence is produced")
+		}
+		if resp.GetStatus() != proto.ExecutionStatus_FAILURE {
+			t.Fatalf("expected failure status, got %s", resp.GetStatus().String())
+		}
+		msg := err.Error()
+		if msg == "policy evaluation failed for all checks" {
+			t.Fatalf("expected enriched diagnostics, got bare generic error")
+		}
+		for _, want := range []string{
+			"check-a", "check-b", "inventory-aws-s3", "inventory-aws-ec2",
+			"exit=0", "resources=0",
+			"AccessDenied: not authorized to perform s3:ListAllMyBuckets",
+			"AccessDenied: not authorized to perform s3:GetBucketPolicy",
+			"EndpointConnectionError: Could not connect to ec2.us-gov-west-1",
+			"boto3 access denied while listing buckets",
+		} {
+			if !strings.Contains(msg, want) {
+				t.Fatalf("expected error to contain %q, got:\n%s", want, msg)
+			}
+		}
+	})
+
+	t.Run("mixed errored and empty executions includes both per-policy errors and diagnostics", func(t *testing.T) {
+		executor := &fakeExecutor{results: map[string]CustodianExecutionResult{
+			"inventory-aws-s3": {
+				StartedAt: now,
+				EndedAt:   now.Add(5 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy AccessDenied on s3",
+			},
+			"inventory-aws-ec2": {
+				StartedAt: now,
+				EndedAt:   now.Add(5 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy AccessDenied on ec2",
+			},
+			"check-a": {
+				StartedAt: now,
+				EndedAt:   now.Add(10 * time.Millisecond),
+				ExitCode:  2,
+				Error:     "custodian crashed during augmentation",
+				Err:       errors.New("custodian crashed during augmentation"),
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy traceback during augment",
+			},
+			"check-b": {
+				StartedAt: now,
+				EndedAt:   now.Add(10 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   "ERROR custodian.policy AccessDenied on ec2 describe",
+			},
+		}}
+
+		apiHelper := &fakeAPIHelper{}
+		plugin := &CloudCustodianPlugin{
+			Logger: hclog.NewNullLogger(),
+			parsedConfig: &ParsedConfig{
+				CheckTimeout: 30 * time.Second,
+			},
+			checks: []CustodianCheck{
+				{Index: 0, Name: "check-a", Resource: "aws.s3", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-a", "resource": "aws.s3"}},
+				{Index: 1, Name: "check-b", Resource: "aws.ec2", Provider: "aws", RawPolicy: map[string]interface{}{"name": "check-b", "resource": "aws.ec2"}},
+			},
+			executor:  executor,
+			evaluator: &fakePolicyEvaluator{},
+		}
+
+		_, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper)
+		if err == nil {
+			t.Fatal("expected eval failure")
+		}
+		msg := err.Error()
+		// Per-policy error preserved from formatExecutionFailure.
+		if !strings.Contains(msg, "custodian crashed during augmentation") {
+			t.Fatalf("expected per-policy execution error, got:\n%s", msg)
+		}
+		// Per-execution diagnostics also present.
+		if !strings.Contains(msg, "check-b") || !strings.Contains(msg, "AccessDenied on ec2 describe") {
+			t.Fatalf("expected per-execution diagnostics for empty check, got:\n%s", msg)
+		}
+		if !strings.Contains(msg, "custodian execution summary") {
+			t.Fatalf("expected execution summary header, got:\n%s", msg)
+		}
+	})
+
+	t.Run("bounds total diagnostic size and reports omitted sections", func(t *testing.T) {
+		const checkCount = 16
+		bigTail := strings.Repeat("AccessDenied X ", 320) // ~4.8KB per execution
+		results := map[string]CustodianExecutionResult{
+			"inventory-aws-s3": {
+				StartedAt: now,
+				EndedAt:   now.Add(5 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   bigTail,
+			},
+		}
+		checks := make([]CustodianCheck, 0, checkCount)
+		for i := 0; i < checkCount; i++ {
+			name := fmt.Sprintf("check-%02d", i)
+			results[name] = CustodianExecutionResult{
+				StartedAt: now,
+				EndedAt:   now.Add(10 * time.Millisecond),
+				ExitCode:  0,
+				Resources: []interface{}{},
+				LogTail:   bigTail,
+			}
+			checks = append(checks, CustodianCheck{
+				Index:     i,
+				Name:      name,
+				Resource:  "aws.s3",
+				Provider:  "aws",
+				RawPolicy: map[string]interface{}{"name": name, "resource": "aws.s3"},
+			})
+		}
+
+		apiHelper := &fakeAPIHelper{}
+		plugin := &CloudCustodianPlugin{
+			Logger: hclog.NewNullLogger(),
+			parsedConfig: &ParsedConfig{
+				CheckTimeout: 30 * time.Second,
+			},
+			checks:    checks,
+			executor:  &fakeExecutor{results: results},
+			evaluator: &fakePolicyEvaluator{},
+		}
+
+		_, err := plugin.Eval(&proto.EvalRequest{PolicyPaths: []string{"bundle-a"}}, apiHelper)
+		if err == nil {
+			t.Fatal("expected eval failure")
+		}
+		msg := err.Error()
+		if len(msg) > custodianDiagnosticDetailCap+8*1024 {
+			t.Fatalf("expected bounded error size, got %d bytes", len(msg))
+		}
+		if !strings.Contains(msg, "omitted to bound error size") {
+			t.Fatalf("expected omitted-sections marker, got message of %d bytes:\n%s", len(msg), msg)
+		}
+		// Every execution still gets its compact one-line summary even when its
+		// full detail section was omitted.
+		if !strings.Contains(msg, "check-15") {
+			t.Fatalf("expected compact summary line for every execution, got:\n%s", msg)
+		}
+	})
+}
+
 func TestConfigureLoadsChecks(t *testing.T) {
 	stubLookPath(t, func(binary string) (string, error) {
 		return "/usr/local/bin/" + binary, nil

From c2a104267be44fe951bf5e997ecf03d89a2e048c Mon Sep 17 00:00:00 2001
From: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
Date: Tue, 2 Jun 2026 09:35:20 -0300
Subject: [PATCH 2/5] fix: copilot issues

Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
---
 main.go | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/main.go b/main.go
index 82d6462..662a635 100644
--- a/main.go
+++ b/main.go
@@ -3255,6 +3255,12 @@ func (p *CloudCustodianPlugin) warnZeroResourceBaseline(name, resource string, e
 	if execution.ExitCode != 0 || len(execution.Resources) > 0 {
 		return
 	}
+	// A baseline can exit 0 yet still carry an error (e.g. resources.json
+	// missing or unparseable). That real cause is already reported via
+	// formatExecutionFailure, so don't emit a misleading IAM/network warning.
+	if execution.Err != nil || execution.Error != "" {
+		return
+	}
 	p.Logger.Warn("Custodian inventory baseline exited successfully but returned zero resources; likely insufficient IAM read permissions or unreachable/cross-region service endpoints",
 		"name", name,
 		"resource", resource,
@@ -3374,11 +3380,22 @@ func formatExecutionDiagnosticDetail(d executionDiagnostic) string {
 func composeZeroEvidenceError(accumulatedErrors error, diagnostics []executionDiagnostic) error {
 	var sb strings.Builder
 
+	summaryOmitted := 0
 	if len(diagnostics) > 0 {
 		fmt.Fprintf(&sb, "custodian execution summary (%d execution(s)):\n", len(diagnostics))
 		for _, d := range diagnostics {
-			fmt.Fprintf(&sb, "  - %s %s [%s] exit=%d resources=%d had_error=%t\n",
+			line := fmt.Sprintf("  - %s %s [%s] exit=%d resources=%d had_error=%t\n",
 				d.name, d.resource, d.kind(), d.exitCode, d.resourceCount, d.hadError)
+			// Bound the summary too, so a very large policy pack can never grow
+			// the error past the cap.
+			if sb.Len()+len(line) > custodianDiagnosticDetailCap {
+				summaryOmitted++
+				continue
+			}
+			sb.WriteString(line)
+		}
+		if summaryOmitted > 0 {
+			fmt.Fprintf(&sb, "  ... %d more execution summary line(s) omitted to bound error size\n", summaryOmitted)
 		}
 	}
 

From f4eee7cb42243cc1a8cdc0d67f5f68081d6e18a1 Mon Sep 17 00:00:00 2001
From: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
Date: Tue, 2 Jun 2026 09:42:33 -0300
Subject: [PATCH 3/5] fix: copilot issues

Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
---
 main.go | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/main.go b/main.go
index 662a635..59204be 100644
--- a/main.go
+++ b/main.go
@@ -2487,8 +2487,11 @@ func (p *CloudCustodianPlugin) Eval(req *proto.EvalRequest, apiHelper runner.Api
 			continue
 		}
 		baselineName := fmt.Sprintf("inventory-%s", sanitizeIdentifier(resourceType))
-		executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil))
-		if baseline.Err != nil || len(baseline.Execution.DiagnosticWarnings) == 0 {
+		// A baseline with only diagnostic warnings (e.g. unreachable endpoints)
+		// is still treated as a failure below, so reflect that in had_error.
+		baselineHadWarnings := len(baseline.Execution.DiagnosticWarnings) > 0
+		executionDiagnostics = append(executionDiagnostics, newExecutionDiagnostic(baselineName, resourceType, true, baseline.Execution, baseline.Err != nil || baselineHadWarnings))
+		if baseline.Err != nil || !baselineHadWarnings {
 			continue
 		}
 		err := formatExecutionDiagnosticWarnings(baseline.Execution.DiagnosticWarnings)

From c95fd23f2a626d8974fa4509ee16ca66601c90db Mon Sep 17 00:00:00 2001
From: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
Date: Tue, 2 Jun 2026 10:00:32 -0300
Subject: [PATCH 4/5] fix: comments

Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
---
 main.go | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/main.go b/main.go
index 59204be..6718b12 100644
--- a/main.go
+++ b/main.go
@@ -3298,9 +3298,11 @@ func formatExecutionDiagnosticWarnings(messages []string) error {
 	return err
 }
 
-// custodianDiagnosticDetailCap bounds the total size of the per-execution
-// diagnostic detail appended to a zero-evidence failure error so that a policy
-// pack with many resource types can never produce an unbounded gRPC error.
+// custodianDiagnosticDetailCap bounds the size of the per-execution diagnostic
+// detail (the one-line summaries plus log/stderr tails) that
+// composeZeroEvidenceError appends. It caps only that appended detail; any
+// accumulatedErrors joined ahead of it carry their own (per-policy) content and
+// are intentionally not truncated here so genuine failures are never hidden.
 const custodianDiagnosticDetailCap = 32 * 1024
 
 // tailString returns at most the last maxBytes bytes of s.

From 1d379610ea4b8e5bb2bc29535208c47bebfe5d13 Mon Sep 17 00:00:00 2001
From: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
Date: Tue, 2 Jun 2026 10:50:27 -0300
Subject: [PATCH 5/5] fix: extra debugs just in case

Signed-off-by: Gustavo Carvalho <gustavo.carvalho@container-solutions.com>
---
 main.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.go b/main.go
index 6718b12..bf9fcea 100644
--- a/main.go
+++ b/main.go
@@ -569,6 +569,7 @@ func (e *CommandCustodianExecutor) Execute(ctx context.Context, req CustodianExe
 					"stdout_len", stdoutBuf.Len(),
 					"stderr_len", stderrBuf.Len(),
 				)
+				e.Logger.Debug("custodian outputs", "stdout", stdoutBuf.Tail(100), "stderr", stderrBuf.Tail(100))
 				goto commandFinished
 			case <-ticker.C:
 				elapsed := time.Since(result.StartedAt).Round(time.Second).String()