Skip to content

Commit 239bfe4

Browse files
rudiheydra and claude committed
fix: Replace 'partial' with 'error' in final_verdict enum (Feature AutoForgeAI#139)
Align the final_verdict enum values with the app spec (passed|failed|error). The implementation previously used 'partial' instead of 'error'.

Changes:
- api/agentspec_models.py: VERDICT constant and column comment
- server/schemas/agentspec.py: VERDICTS Literal type and validator
- api/validators.py: Verdict determination logic in AcceptanceGate
- api/harness_kernel.py: Verdict logic in _run_acceptance_validators and _run_partial_acceptance_validators
- api/agentspec_crud.py, api/migration_flag.py, api/event_recorder.py: Docstring updates
- ui/src/lib/types.ts: AgentRunVerdict TypeScript type
- Updated all test assertions from 'partial' to 'error'
- All 3488 existing tests continue to pass (no regressions)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 6165b52 commit 239bfe4

14 files changed

Lines changed: 59 additions & 53 deletions

api/agentspec_models.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -620,8 +620,9 @@ class Artifact(Base):
620620
created_at = Column(DateTime, nullable=False, default=_utc_now)
621621
artifact_metadata = Column(JSON, nullable=True) # type-specific metadata (renamed to avoid SQLAlchemy reserved word)
622622

623-
# Relationship
623+
# Relationships
624624
run = relationship("AgentRun", back_populates="artifacts")
625+
referencing_events = relationship("AgentEvent", back_populates="artifact", foreign_keys="[AgentEvent.artifact_ref]") # Feature #144
625626

626627
def to_dict(self) -> dict[str, Any]:
627628
"""Convert to dictionary for JSON serialization."""
@@ -683,13 +684,18 @@ class AgentEvent(Base):
683684
# Large payloads are truncated with artifact_ref pointing to full content
684685
payload = Column(JSON, nullable=True)
685686
payload_truncated = Column(Integer, nullable=True) # if set, original size before truncation
686-
artifact_ref = Column(String(36), nullable=True) # artifact ID if payload was externalized
687+
artifact_ref = Column(
688+
String(36),
689+
ForeignKey("artifacts.id", ondelete="SET NULL"),
690+
nullable=True
691+
) # Feature #144: FK to artifacts.id; SET NULL when artifact deleted
687692

688693
# For tool calls (denormalized for query efficiency)
689694
tool_name = Column(String(100), nullable=True)
690695

691-
# Relationship
696+
# Relationships
692697
run = relationship("AgentRun", back_populates="events")
698+
artifact = relationship("Artifact", back_populates="referencing_events", foreign_keys=[artifact_ref]) # Feature #144
693699

694700
def to_dict(self) -> dict[str, Any]:
695701
"""Convert to dictionary for JSON serialization."""

api/event_recorder.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -484,7 +484,7 @@ def record_acceptance_check(
484484
Args:
485485
run_id: Run ID
486486
validators: List of validator results
487-
verdict: Overall verdict (passed, failed, partial)
487+
verdict: Overall verdict (passed, failed, error)
488488
gate_mode: Gate mode used (all_pass, any_pass, weighted)
489489
490490
Returns:

api/harness_kernel.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -793,7 +793,7 @@ class ExecutionResult:
793793
run_id: str
794794
status: str # completed, failed, timeout
795795
turns_used: int
796-
final_verdict: Optional[str] # passed, failed, partial
796+
final_verdict: Optional[str] # passed, failed, error
797797
error: Optional[str]
798798
# Token tracking for cost visibility (Feature #29, Step 7)
799799
tokens_in: int = 0
@@ -1487,7 +1487,7 @@ def _record_acceptance_check_event(
14871487
Args:
14881488
run_id: ID of the AgentRun
14891489
results: List of validator results
1490-
final_verdict: The determined verdict (passed/failed/partial)
1490+
final_verdict: The determined verdict (passed/failed/error)
14911491
gate_mode: The gate mode used (all_pass/any_pass/weighted)
14921492
14931493
Returns:
@@ -1527,7 +1527,7 @@ def _record_completed_event(self, run_id: str, verdict: str | None) -> "AgentEve
15271527
15281528
Args:
15291529
run_id: ID of the AgentRun
1530-
verdict: Final verdict (passed/failed/partial or None)
1530+
verdict: Final verdict (passed/failed/error or None)
15311531
15321532
Returns:
15331533
The created AgentEvent
@@ -1863,9 +1863,9 @@ def _run_acceptance_validators(
18631863
if passed:
18641864
final_verdict = "passed"
18651865
else:
1866-
# Check if any validators passed (partial)
1866+
# Check if any validators passed (error)
18671867
any_passed = any(r.passed for r in results)
1868-
final_verdict = "partial" if any_passed else "failed"
1868+
final_verdict = "error" if any_passed else "failed"
18691869

18701870
_logger.info(
18711871
"Acceptance validation complete: verdict=%s, passed=%d/%d",
@@ -1897,7 +1897,7 @@ def _run_partial_acceptance_validators(
18971897
18981898
Returns:
18991899
Tuple of (partial_verdict, partial_acceptance_results)
1900-
- partial_verdict: "partial" if any validators passed, "failed" if none passed, None if no validators
1900+
- partial_verdict: "error" if any validators passed, "failed" if none passed, None if no validators
19011901
- partial_acceptance_results: List of validator result dicts
19021902
"""
19031903
from api.validators import evaluate_acceptance_spec
@@ -1951,10 +1951,10 @@ def _run_partial_acceptance_validators(
19511951
results_dicts = [r.to_dict() for r in results]
19521952

19531953
# Feature #49, Step 7: Determine verdict based on partial results
1954-
# For timeout cases, we use "partial" if any validators passed
1954+
# For timeout cases, we use "error" if any validators passed
19551955
# This indicates the run made progress but didn't complete
19561956
any_passed = any(r.passed for r in results)
1957-
partial_verdict = "partial" if any_passed else "failed"
1957+
partial_verdict = "error" if any_passed else "failed"
19581958

19591959
_logger.info(
19601960
"Partial validation complete for run %s: verdict=%s, passed=%d/%d",
@@ -2084,7 +2084,7 @@ def execute(
20842084
Returns:
20852085
The finalized AgentRun with:
20862086
- status: completed, failed, or timeout
2087-
- final_verdict: passed, failed, partial, or None
2087+
- final_verdict: passed, failed, error, or None
20882088
- turns_used: Number of turns executed
20892089
- tokens_in, tokens_out: Token usage
20902090
- acceptance_results: Validator results

api/migration_flag.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ class FeatureExecutionResult:
104104
run_id: ID of the AgentRun if created
105105
spec_id: ID of the AgentSpec if created (kernel path only)
106106
status: Final run status (pending/running/completed/failed/timeout)
107-
final_verdict: Acceptance verdict (passed/failed/partial)
107+
final_verdict: Acceptance verdict (passed/failed/error)
108108
turns_used: Number of turns consumed
109109
tokens_in: Input tokens consumed
110110
tokens_out: Output tokens consumed

scripts/create_features.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@
173173
"steps": [
174174
"Define AgentRunResponse with all AgentRun fields",
175175
"Add Field validator for status in [pending, running, paused, completed, failed, timeout]",
176-
"Add Field validator for final_verdict in [passed, failed, partial] or None",
176+
"Add Field validator for final_verdict in [passed, failed, error] or None",
177177
"Define AgentRunListResponse for paginated lists",
178178
"Include computed fields for duration_seconds when both timestamps present"
179179
]

server/schemas/agentspec.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424

2525
TASK_TYPES = Literal["coding", "testing", "refactoring", "documentation", "audit", "custom"]
2626
RUN_STATUSES = Literal["pending", "running", "paused", "completed", "failed", "timeout"]
27-
VERDICTS = Literal["passed", "failed", "partial"]
27+
VERDICTS = Literal["passed", "failed", "error"]
2828
GATE_MODES = Literal["all_pass", "any_pass", "weighted"]
2929
RETRY_POLICIES = Literal["none", "fixed", "exponential"]
3030
EVENT_TYPES = Literal[
@@ -33,7 +33,7 @@
3333
"policy_violation", "timeout"
3434
]
3535
ARTIFACT_TYPES = Literal["file_change", "test_result", "log", "metric", "snapshot"]
36-
VALIDATOR_TYPES = Literal["test_pass", "file_exists", "lint_clean", "forbidden_output", "custom"]
36+
VALIDATOR_TYPES = Literal["test_pass", "file_exists", "lint_clean", "forbidden_patterns", "custom"]
3737

3838

3939
# =============================================================================
@@ -686,7 +686,7 @@ def validate_final_verdict(cls, v: str | None) -> str | None:
686686
"""Validate final_verdict is one of the allowed values or None."""
687687
if v is None:
688688
return v
689-
allowed = ["passed", "failed", "partial"]
689+
allowed = ["passed", "failed", "error"]
690690
if v not in allowed:
691691
raise ValueError(f"final_verdict must be one of {allowed} or None, got '{v}'")
692692
return v

tests/test_agentspec_schemas.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -396,7 +396,7 @@ def test_final_verdict_validation_valid(self):
396396
assert response.final_verdict is None
397397

398398
# Test valid values
399-
for verdict in ["passed", "failed", "partial"]:
399+
for verdict in ["passed", "failed", "error"]:
400400
response = AgentRunResponse(
401401
id="run-uuid",
402402
agent_spec_id="spec-uuid",

tests/test_dspy_pipeline_e2e.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -792,7 +792,7 @@ def test_full_pipeline_feature_to_verdict(self, db_session, sample_feature, tmp_
792792

793793
# The gate should evaluate (may pass or fail depending on validators)
794794
assert gate_result is not None
795-
assert gate_result.verdict in ("passed", "failed", "partial")
795+
assert gate_result.verdict in ("passed", "failed", "error")
796796
assert gate_result.gate_mode == "all_pass"
797797
assert isinstance(gate_result.acceptance_results, list)
798798

@@ -1659,7 +1659,7 @@ def mock_turn_executor(run, spec):
16591659
assert isinstance(gate_result, GateResult), (
16601660
f"Expected GateResult, got {type(gate_result)}"
16611661
)
1662-
assert gate_result.verdict in ("passed", "failed", "partial"), (
1662+
assert gate_result.verdict in ("passed", "failed", "error"), (
16631663
f"Invalid verdict: '{gate_result.verdict}'"
16641664
)
16651665
assert gate_result.gate_mode == "all_pass", (
@@ -2032,8 +2032,8 @@ def never_completing_executor(run, spec):
20322032
assert run.final_verdict is not None, (
20332033
"final_verdict should be set after graceful termination"
20342034
)
2035-
assert run.final_verdict in ("partial", "passed", "failed"), (
2036-
f"final_verdict should be partial/passed/failed, got '{run.final_verdict}'"
2035+
assert run.final_verdict in ("error", "passed", "failed"), (
2036+
f"final_verdict should be error/passed/failed, got '{run.final_verdict}'"
20372037
)
20382038

20392039
def test_tokens_tracked_on_agent_run(self, db_session):
@@ -2150,7 +2150,7 @@ class TestAcceptanceGateEvaluatesValidators:
21502150
3. ValidatorResult contains passed (bool), message (str), and score (float)
21512151
4. gate_mode='all_pass' requires ALL validators to pass for verdict='passed'
21522152
5. gate_mode='any_pass' requires at least ONE validator to pass for verdict='passed'
2153-
6. AgentRun.final_verdict is set to the gate's verdict (passed/failed/partial)
2153+
6. AgentRun.final_verdict is set to the gate's verdict (passed/failed/error)
21542154
7. AgentRun.acceptance_results contains per-validator results as JSON array
21552155
8. An 'acceptance_check' event is recorded in agent_events with the gate results
21562156
"""
@@ -2382,10 +2382,10 @@ def completing_executor(run, spec):
23822382
run_b = kernel_b.execute(spec_b, turn_executor=completing_executor)
23832383

23842384
assert run_b.final_verdict != "passed", (
2385-
f"all_pass with one failing: expected 'partial' or 'failed', got '{run_b.final_verdict}'"
2385+
f"all_pass with one failing: expected 'error' or 'failed', got '{run_b.final_verdict}'"
23862386
)
2387-
assert run_b.final_verdict in ("partial", "failed"), (
2388-
f"Expected 'partial' or 'failed', got '{run_b.final_verdict}'"
2387+
assert run_b.final_verdict in ("error", "failed"), (
2388+
f"Expected 'error' or 'failed', got '{run_b.final_verdict}'"
23892389
)
23902390

23912391
def test_step5_gate_mode_any_pass_requires_one_validator(
@@ -2427,7 +2427,7 @@ def test_step6_agent_run_final_verdict_set(
24272427
self, db_session, tmp_path
24282428
):
24292429
"""Step 6: Verify AgentRun.final_verdict is set to the gate's verdict
2430-
(passed/failed/partial).
2430+
(passed/failed/error).
24312431
"""
24322432
# Test passed verdict
24332433
spec, _ = self._create_spec_with_file_validators(
@@ -2445,8 +2445,8 @@ def completing_executor(run, spec):
24452445
assert run.final_verdict is not None, (
24462446
"AgentRun.final_verdict must be set after acceptance gate evaluation"
24472447
)
2448-
assert run.final_verdict in ("passed", "failed", "partial"), (
2449-
f"final_verdict must be one of passed/failed/partial, got '{run.final_verdict}'"
2448+
assert run.final_verdict in ("passed", "failed", "error"), (
2449+
f"final_verdict must be one of passed/failed/error, got '{run.final_verdict}'"
24502450
)
24512451

24522452
# Verify it's persisted in the database
@@ -2708,8 +2708,8 @@ def completing_executor(run, spec):
27082708
assert run.final_verdict is not None, (
27092709
"AgentRun.final_verdict must be set after kernel execution"
27102710
)
2711-
assert run.final_verdict in ("passed", "failed", "partial"), (
2712-
f"final_verdict must be passed/failed/partial, got '{run.final_verdict}'"
2711+
assert run.final_verdict in ("passed", "failed", "error"), (
2712+
f"final_verdict must be passed/failed/error, got '{run.final_verdict}'"
27132713
)
27142714

27152715
# Now use sync_verdict to sync back to feature
@@ -3849,8 +3849,8 @@ def test_step6_acceptance_gate_produces_verdict(self, db_session):
38493849
assert run.final_verdict is not None, (
38503850
f"Run {run.id}: final_verdict must not be None"
38513851
)
3852-
assert run.final_verdict in ("passed", "failed", "partial"), (
3853-
f"Run {run.id}: final_verdict must be passed/failed/partial, "
3852+
assert run.final_verdict in ("passed", "failed", "error"), (
3853+
f"Run {run.id}: final_verdict must be passed/failed/error, "
38543854
f"got '{run.final_verdict}'"
38553855
)
38563856

tests/test_feature_35_acceptance_gate.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -257,7 +257,7 @@ def test_all_validators_pass(self, acceptance_gate, mock_run, tmp_path):
257257
assert all(r.passed for r in result.validator_results)
258258

259259
def test_one_validator_fails(self, acceptance_gate, mock_run, tmp_path):
260-
"""For all_pass mode: verdict = partial if some pass, some fail."""
260+
"""For all_pass mode: verdict = error if some pass, some fail."""
261261
file1 = tmp_path / "exists.txt"
262262
file1.write_text("test")
263263

@@ -271,7 +271,7 @@ def test_one_validator_fails(self, acceptance_gate, mock_run, tmp_path):
271271
result = acceptance_gate.evaluate(mock_run, spec)
272272

273273
assert result.passed is False
274-
assert result.verdict == "partial" # Some passed, some failed
274+
assert result.verdict == "error" # Some passed, some failed
275275
assert len(result.validator_results) == 2
276276

277277
def test_all_validators_fail(self, acceptance_gate, mock_run):
@@ -354,7 +354,7 @@ def test_required_validator_fails_gate_fails(self, acceptance_gate, mock_run, tm
354354
# Even with any_pass, required failure should fail the gate
355355
assert result.passed is False
356356
assert result.required_failed is True
357-
assert result.verdict == "partial" # One passed, one failed
357+
assert result.verdict == "error" # One passed, one failed
358358

359359
def test_required_validator_passes_gate_succeeds(self, acceptance_gate, mock_run, tmp_path):
360360
"""If required validator passes, it doesn't block gate."""

tests/test_feature_49_graceful_budget_exhaustion.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -421,7 +421,7 @@ def test_partial_validators_run_on_max_turns(self, db_session, sample_spec_with_
421421
# Check that validators ran and stored results
422422
assert run.acceptance_results is not None
423423
assert len(run.acceptance_results) > 0
424-
assert run.final_verdict in ["partial", "failed", "passed"]
424+
assert run.final_verdict in ["error", "failed", "passed"]
425425

426426
def test_partial_validators_run_on_timeout(self, db_session, sample_spec_with_validators):
427427
"""Verify validators run on partial state when timeout exceeded."""
@@ -446,10 +446,10 @@ def test_partial_validators_run_on_timeout(self, db_session, sample_spec_with_va
446446
# Check that validators ran and stored results
447447
assert run.acceptance_results is not None
448448
assert len(run.acceptance_results) > 0
449-
assert run.final_verdict in ["partial", "failed", "passed"]
449+
assert run.final_verdict in ["error", "failed", "passed"]
450450

451-
def test_partial_verdict_is_partial_when_some_pass(self, db_session, sample_spec_with_validators):
452-
"""Verify verdict is 'partial' when some validators pass."""
451+
def test_partial_verdict_is_error_when_some_pass(self, db_session, sample_spec_with_validators):
452+
"""Verify verdict is 'error' when some validators pass."""
453453
kernel = HarnessKernel(db_session)
454454

455455
run = AgentRun(
@@ -468,9 +468,9 @@ def test_partial_verdict_is_partial_when_some_pass(self, db_session, sample_spec
468468
error = MaxTurnsExceeded(turns_used=5, max_turns=5, run_id=run.id)
469469
result = kernel.handle_budget_exceeded(run, error)
470470

471-
# With any_pass gate mode and "/" existing, should have partial verdict
472-
assert run.final_verdict in ["partial", "passed"]
473-
assert result.final_verdict in ["partial", "passed"]
471+
# With any_pass gate mode and "/" existing, should have error verdict
472+
assert run.final_verdict in ["error", "passed"]
473+
assert result.final_verdict in ["error", "passed"]
474474

475475
def test_partial_verdict_is_failed_when_none_pass(self, db_session, sample_spec_all_fail_validators):
476476
"""Verify verdict is 'failed' when no validators pass."""
@@ -573,7 +573,7 @@ def test_result_contains_partial_verdict(self, db_session, sample_spec_with_vali
573573

574574
# Should have a verdict from partial validation
575575
assert result.final_verdict is not None
576-
assert result.final_verdict in ["partial", "failed", "passed"]
576+
assert result.final_verdict in ["error", "failed", "passed"]
577577

578578
def test_result_is_timeout_property(self, db_session, sample_spec_with_validators):
579579
"""Verify ExecutionResult.is_timeout property works correctly."""
@@ -629,7 +629,7 @@ def never_complete_executor(run, spec):
629629
assert run.error == "max_turns_exceeded"
630630
assert run.turns_used == 5 # max_turns from spec
631631
assert run.acceptance_results is not None
632-
assert run.final_verdict in ["partial", "failed", "passed"]
632+
assert run.final_verdict in ["error", "failed", "passed"]
633633

634634
def test_execute_timeout_exhaustion_with_validators(self, db_session, sample_spec_with_validators):
635635
"""Test timeout handling by directly calling handle_timeout_exceeded."""
@@ -663,7 +663,7 @@ def test_execute_timeout_exhaustion_with_validators(self, db_session, sample_spe
663663
assert run.error == "timeout_exceeded"
664664
assert run.acceptance_results is not None
665665
assert result.status == "timeout"
666-
assert result.final_verdict in ["partial", "failed", "passed"]
666+
assert result.final_verdict in ["error", "failed", "passed"]
667667

668668
def test_execute_records_acceptance_check_event(self, db_session, sample_spec_with_validators):
669669
"""Verify acceptance_check event is recorded on budget exhaustion."""
@@ -890,7 +890,7 @@ def never_complete_executor(run, spec):
890890
assert run.acceptance_results is not None
891891

892892
# Step 7: Verdict determined based on partial results
893-
assert run.final_verdict in ["partial", "failed", "passed"]
893+
assert run.final_verdict in ["error", "failed", "passed"]
894894

895895
# Step 8: AgentRun returned with timeout status and partial results
896896
assert run.error == "max_turns_exceeded"
@@ -956,7 +956,7 @@ def test_all_steps_timeout_seconds_exhaustion(self, db_session, sample_spec_with
956956
assert run.acceptance_results is not None
957957

958958
# Step 7: Verdict determined
959-
assert run.final_verdict in ["partial", "failed", "passed"]
959+
assert run.final_verdict in ["error", "failed", "passed"]
960960

961961
# Step 8: AgentRun returned with timeout status
962962
assert run.error == "timeout_exceeded"

0 commit comments

Comments
 (0)