From 9fcd5f385bbf43a88a56ffa33c3ea3eace4bb667 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sun, 19 Apr 2026 00:33:48 +0800 Subject: [PATCH 01/26] =?UTF-8?q?feat=EF=BC=9A=E5=9C=A8=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E4=B8=AD=E6=89=93=E5=8D=B0=E5=87=BA=E6=9D=A5=E4=BA=86=E5=8F=91?= =?UTF-8?q?=E9=80=81=E7=9A=84=E5=85=A8=E9=83=A8=E6=8F=90=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bun.lock | 2 +- src/query.ts | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/bun.lock b/bun.lock index 3ae85cfac6..c33d4e254c 100644 --- a/bun.lock +++ b/bun.lock @@ -210,7 +210,7 @@ "selfsigned": "^5.5.0", }, "devDependencies": { - "@types/selfsigned": "^2.1.0", + "@types/selfsigned": "^2.0.4", "@types/ws": "^8.18.1", }, }, diff --git a/src/query.ts b/src/query.ts index 8bfca61116..45af85c02d 100644 --- a/src/query.ts +++ b/src/query.ts @@ -113,6 +113,7 @@ import { createBudgetTracker, checkTokenBudget } from './query/tokenBudget.js' import { count } from './utils/array.js' import { createTrace, endTrace, isLangfuseEnabled } from './services/langfuse/index.js' import { getAPIProvider } from './utils/model/providers.js' +import { jsonStringify } from './utils/slowOperations.js' /* eslint-disable @typescript-eslint/no-require-imports */ const snipModule = feature('HISTORY_SNIP') @@ -696,8 +697,21 @@ async function* queryLoop( try { let streamingFallbackOccured = false queryCheckpoint('query_api_streaming_start') + const requestMessages = prependUserContext(messagesForQuery, userContext) + logForDebugging( + `[PromptDebug] full request snapshot before callModel: ${jsonStringify({ + provider: getAPIProvider(), + querySource, + model: currentModel, + systemPrompt: fullSystemPrompt, + messages: requestMessages, + thinkingConfig: toolUseContext.options.thinkingConfig, + toolNames: toolUseContext.options.tools.map(tool => tool.name), + })}`, + { level: 'info' }, + ) for await (const 
message of deps.callModel({ - messages: prependUserContext(messagesForQuery, userContext), + messages: requestMessages, systemPrompt: fullSystemPrompt, thinkingConfig: toolUseContext.options.thinkingConfig, tools: toolUseContext.options.tools, From ab2263ba86767bf2feedcf954bfe38441a0a0af8 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sun, 19 Apr 2026 19:02:43 +0800 Subject: [PATCH 02/26] =?UTF-8?q?feat=EF=BC=9A=20=E6=88=91=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E4=BA=86=E7=8B=AC=E7=AB=8B=E7=9A=84=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=E8=A7=82=E6=B5=8B=E5=9F=BA=E7=A1=80=E8=AE=BE=E6=96=BD=20/abs/p?= =?UTF-8?q?ath/E:/claude-code/src/observability/harness.ts:1=EF=BC=8C?= =?UTF-8?q?=E9=BB=98=E8=AE=A4=E6=8A=8A=E4=BA=8B=E4=BB=B6=E5=86=99=E5=88=B0?= =?UTF-8?q?=E9=A1=B9=E7=9B=AE=E6=A0=B9=E7=9B=AE=20=20=E5=BD=95=20.observab?= =?UTF-8?q?ility/events-YYYYMMDD.jsonl=EF=BC=8C=E5=AE=8C=E6=95=B4=E5=AF=B9?= =?UTF-8?q?=E8=B1=A1=E5=86=99=E5=88=B0=20.observability/snapshots/?= =?UTF-8?q?=E3=80=82=E4=BA=8B=E4=BB=B6=E5=8C=85=E5=90=AB=E7=BB=9F=E4=B8=80?= =?UTF-8?q?=E5=85=AC=E5=85=B1=E5=AD=97=E6=AE=B5=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?=20=20snapshot=5Fref=20+=20bytes=20+=20sha256=20+=20redaction?= =?UTF-8?q?=5Fstate=E3=80=82=20=20=E4=B8=BB=E7=BA=BF=E7=A8=8B=E7=AC=AC?= =?UTF-8?q?=E4=B8=80=E9=98=B6=E6=AE=B5=E5=B7=B2=E7=BB=8F=E6=8E=A5=E4=B8=8A?= =?UTF-8?q?=EF=BC=9A=20=20/abs/path/E:/claude-code/src/utils/processUserIn?= =?UTF-8?q?put/processUserInput.ts:142=20=E6=8E=A5=E4=BA=86=20input.proces?= =?UTF-8?q?s.started/completed=20=E5=92=8C=E9=98=BB=20=20=E6=96=AD?= =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BA=8B=E4=BB=B6=EF=BC=9B=20=20/abs/path/E:?= =?UTF-8?q?/claude-code/src/QueryEngine.ts:213=20=E6=8E=A5=E4=BA=86=20subm?= =?UTF-8?q?it.attempted=E3=80=81submit.blocked=E3=80=81file=5Fhistory.snap?= =?UTF-8?q?shot.created=EF=BC=9B=20=20/abs/path/E:/claude-code/src/query.t?= =?UTF-8?q?s:414=20=E6=8E=A5=E4=BA=86=20state.initialized=E3=80=81query.st?= 
=?UTF-8?q?arted=E3=80=81turn.started=E3=80=81messages=20=E9=A2=84?= =?UTF-8?q?=E5=A4=84=E7=90=86=E9=93=BE=E3=80=81=20=20prompt.build.*?= =?UTF-8?q?=E3=80=81api.request.started=E3=80=81api.stream.first=5Fchunk?= =?UTF-8?q?=E3=80=81assistant.block.received=E3=80=81assistant.tool=5Fuse.?= =?UTF-8?q?detected=E3=80=81=20=20api.stream.completed=E3=80=81tool.execut?= =?UTF-8?q?ion.mode.selected=E3=80=81token=5Fbudget.decision=E3=80=81query?= =?UTF-8?q?.terminated=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + ObservrityTask/Observersity.md | 637 ++++++++++++++++++ ...1\347\250\213\344\273\213\347\273\215.pdf" | Bin 0 -> 365795 bytes src/QueryEngine.ts | 31 + src/observability/harness.ts | 160 +++++ src/query.ts | 457 +++++++++++++ src/services/api/logging.ts | 55 ++ .../processUserInput/processUserInput.ts | 76 +++ 8 files changed, 1418 insertions(+) create mode 100644 ObservrityTask/Observersity.md create mode 100644 "ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" create mode 100644 src/observability/harness.ts diff --git a/.gitignore b/.gitignore index 6f0a4e069d..e5ce2615ac 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,8 @@ __pycache__/ *.pyc logs +#Observable data +.observability/ data .omc .codex/* diff --git a/ObservrityTask/Observersity.md b/ObservrityTask/Observersity.md new file mode 100644 index 0000000000..768287c5d2 --- /dev/null +++ b/ObservrityTask/Observersity.md @@ -0,0 +1,637 @@ +# 给 Codex 的正式任务书 + +## 1. 项目背景 + +我正在为一个基于还原源码运行的 harness 项目建设一套**高质量、可扩展、可维护的埋点与可观测基础设施**。 +我同时会提供两类材料: + +1. **当前项目源码**:这是你可以直接读取和修改的代码。 +2. 
**一份 PDF 文档**:这份 PDF 是基于“原始/上游源码分析”得到的 query loop 与 harness 运行流程讲解。它描述的是**理论主链与关键设计意图**,但**不保证与当前还原项目完全一致**。PDF 把 `queryLoop` 描述为一个“模型采样 → assistant/tool_use → tool 执行 → tool_result 回灌 → 下一轮”的状态机式主编排器。 + +因此,你在实现埋点时必须遵守这个前提: + +* **不能默认 PDF 与当前项目完全一致** +* **不能默认当前项目一定保留了 PDF 中所有功能** +* **必须主动核对 PDF 中的重要节点,在当前项目里是:** + + * 仍然存在 + * 被关闭 + * 被轻度改写 + * 被重写为不同语义 + * 或已经删除 + +--- + +## 2. 核心任务目标 + +你的任务分成两部分,必须同时完成。 + +### A. 结构化核对当前项目与 PDF 主链的一致性 + +请以 PDF 描述的主链为“理论蓝图”,逐段核对当前项目源码中对应实现是否仍存在,并形成清单: + +* 节点是否存在 +* 入口函数 / 文件位置 +* 当前语义是否与 PDF 一致 +* 是否被 feature flag / env / gate / 组织配置关闭 +* 是否仅保留壳子但内部行为已变化 +* 是否完全缺失 + +### B. 在“当前项目真实存在的运行链路”上实现统一埋点体系 + +请以**当前项目源码为准**完成埋点,不要把 PDF 当成绝对真相硬套。 +如果 PDF 某节点已经不存在,你应: + +* 保留该节点在埋点设计中的位置 +* 但将实现标记为 `disabled` / `not_present` / `rewritten` +* 并在最终报告中说明 + +--- + +## 3. 冲突处理原则(必须严格执行) + +如果你在核对或实现中发现 **PDF 与当前项目源码存在重要矛盾**,你必须**立即暂停相关推进并向我确认**。不要自行拍板做语义假设。 + +### 需要立即找我确认的典型场景 + +1. PDF 明确存在的关键节点,在当前项目中找不到。 +2. 节点名还在,但语义明显变了。 +3. PDF 说有某条恢复链 / 工具调度链,但当前项目走的是另一套机制。 +4. 代码里有多个可能对应 PDF 某节点的实现,且它们语义互斥。 +5. 当前项目中该节点被 flag / gate 关闭,而你不确定应只埋点保留现状,还是尝试恢复开启。 +6. 你发现当前项目只是“轻度还原”,有明显 stub / mock / no-op / placeholder 痕迹。 + +### 遇到冲突时你的行为要求 + +你必须输出一段这样的说明并等我确认: + +* **冲突点名称** +* **PDF 中的原意** +* **当前项目里的实际情况** +* **你认为可能的解释** +* **你建议的处理方案 A / B** +* **你当前暂停的位置** + +--- + +## 4. 
任务范围 + +请至少覆盖以下 harness / 子系统: + +### 4.1 用户输入与提交层 + +* 提交入口 +* `submitMessage` / 对应入口 +* `processUserInput` / 输入归一化 +* slash command / attachments / prompt augmentations +* file history snapshot + +### 4.2 query / queryLoop 主循环 + +* `query()` +* `queryLoop()` +* `State` 初始化与每轮 state 迁移 +* `turnCount / loop_iter / transition` +* `queryTracking` + +### 4.3 messages 预处理链 + +核对并埋点以下阶段是否存在、是否生效: + +* `getMessagesAfterCompactBoundary` +* `applyToolResultBudget` +* `HISTORY_SNIP` +* `microcompact` +* `contextCollapse` +* `autocompact` + +### 4.4 Prompt 构建层 + +* system prompt 构建 +* CLAUDE.md / rules / memory / skills / attachments 注入 +* tool names / companion / extra context +* 完整 request snapshot +* request 摘要统计 + +### 4.5 模型请求与流式响应层 + +* `callModel` +* request 发起 +* first chunk +* assistant blocks +* `tool_use` +* response 快照 +* usage / stop reason / fallback / withheld errors + +### 4.6 工具调度与执行层 + +* `StreamingToolExecutor` +* `runTools` +* 并发 / 串行 batch +* tool enqueue / start / progress / complete / fail +* normalize messages +* contextModifier / newContext + +### 4.7 恢复链 / stop hooks / token budget + +核对并埋点这些路径是否还存在: + +* prompt-too-long recover +* media-size recover +* max_output_tokens recover +* `handleStopHooks` +* token budget continuation +* terminal reason + +### 4.8 子 agent / 分叉链路 + +必须纳入统一观测模型: + +* `extract_memories` +* `session_memory` +* `away_summary` +* `side_query` +* 以及你在源码中发现的其他 fork / subagent 类型 + +日志已经证明至少 `extract_memories` 与 `session_memory` 会触发并发起自己的 prompt、工具调用、文件写入。 + +--- + +## 5. 
设计要求 + +### 5.1 不能只补零散 DEBUG + +请实现一套**统一结构化事件模型**,以 JSONL 作为事实源。 +控制台日志可以保留,但不是后续可观测系统的主数据源。 + +### 5.2 所有关键事件必须可关联 + +事件必须能串成: + +* 一次用户动作 +* 一个 query +* query 内多轮 turn +* 主线程与子 agent +* tool 调用链 +* 恢复链 +* 终止原因 + +### 5.3 必须兼顾“完整内容记录”与“可维护性” + +我要求能够记录: + +* 用户发送的完整内容 +* 每轮完整 system prompt +* 每轮完整 request / response +* 每轮 state +* 每次工具输入输出 + +但这些大对象不能全部直接塞进主事件里。 +请实现: + +* **主事件:结构化摘要** +* **sidecar snapshots:完整内容** +* 主事件里只存:`snapshot_ref + bytes + sha256 + redaction_state` + +### 5.4 必须可扩展 + +后续我要基于这套埋点继续建设可观测系统。 +因此你要保证: + +* schema 版本化 +* event 命名稳定 +* 字段命名规范 +* 后续容易接 trace / dashboard / metrics 聚合 + +--- + +## 6. 统一日志/事件规范 + +请实现统一函数,例如: + +* `emitHarnessEvent(...)` +* 或等价的统一埋点层 + +### 6.1 事件公共字段 + +每个事件至少包含: + +* `schema_version` +* `ts_wall` +* `ts_mono_ms` +* `level` +* `event` +* `component` +* `session_id` +* `conversation_id` +* `user_action_id` +* `query_id` +* `turn_id` +* `loop_iter` +* `parent_turn_id` +* `subagent_id` +* `subagent_type` +* `query_source` +* `request_id` +* `tool_call_id` +* `span_id` +* `parent_span_id` +* `cwd` +* `git_branch` +* `build_version` +* `payload` + +### 6.2 命名规范 + +事件名统一使用: + +* `domain.action.stage` + +例如: + +* `submit.attempted` +* `input.process.completed` +* `messages.preprocess.completed` +* `api.request.started` +* `assistant.tool_use.detected` +* `tool.execution.completed` +* `subagent.spawned` +* `state.transitioned` +* `query.terminated` + +### 6.3 文件组织 + +建议: + +```text id="h3ie7q" +.observability/events-YYYYMMDD.jsonl +.observability/snapshots/{id}-request.json +.observability/snapshots/{id}-response.json +.observability/snapshots/{id}-state-before.json +.observability/snapshots/{id}-state-after.json +.observability/snapshots/{tool_call_id}-input.json +.observability/snapshots/{tool_call_id}-output.json +``` + +--- + +## 7. 
必须实现的事件清单 + +请至少实现以下事件。 +如果某些节点在当前项目中已经不存在,请不要直接删除该事件定义,而要在实现或最终报告中标注 `not_present` / `disabled` / `rewritten`。 + +### 7.1 提交与输入层 + +* `submit.attempted` +* `submit.blocked` +* `input.process.started` +* `input.process.completed` +* `file_history.snapshot.created` + +### 7.2 query / state 初始化层 + +* `query.started` +* `state.initialized` +* `prefetch.memory.started` +* `turn.started` +* `query_tracking.assigned` + +### 7.3 messages 预处理链 + +* `messages.compact_boundary.applied` +* `messages.tool_result_budget.applied` +* `messages.history_snip.applied` +* `messages.microcompact.applied` +* `messages.context_collapse.applied` +* `messages.autocompact.checked` +* `messages.autocompact.completed` +* `messages.preprocess.completed` + +### 7.4 prompt / request 构建层 + +* `prompt.build.started` +* `prompt.build.completed` +* `prompt.snapshot.stored` + +### 7.5 API / streaming 层 + +* `api.request.started` +* `api.stream.first_chunk` +* `assistant.block.received` +* `assistant.tool_use.detected` +* `api.fallback.triggered` +* `api.error.withheld` +* `api.stream.completed` + +### 7.6 工具执行层 + +* `tool.execution.mode.selected` +* `tool.enqueued` +* `tool.batch.started` +* `tool.execution.started` +* `tool.progress` +* `tool.execution.completed` +* `tool.execution.failed` +* `tool.result.normalized` +* `tool.context.updated` + +### 7.7 恢复 / stop hooks / token budget + +* `recovery.prompt_too_long.attempted` +* `recovery.prompt_too_long.completed` +* `recovery.max_output_tokens.attempted` +* `recovery.max_output_tokens.completed` +* `stop_hooks.started` +* `stop_hooks.completed` +* `token_budget.decision` + +### 7.8 state 转移层 + +* `state.snapshot.before_turn` +* `state.snapshot.after_turn` +* `state.transitioned` + +### 7.9 子 agent 层 + +* `subagent.spawn.requested` +* `subagent.spawned` +* `subagent.message.received` +* `subagent.prompt.build.completed` +* `subagent.tool.summary` +* `subagent.completed` + +### 7.10 query 终止层 + +* `query.terminated` + +--- + +## 8. 
每个关键事件必须包含的重点信息 + +### 8.1 `input.process.completed` + +必须能回答: + +* 用户原始输入是什么 +* 最终生成了哪些 messages +* 附件如何归一化 +* slash command 如何被处理 +* 传给 `query()` 的 `QueryParams` 摘要是什么 + +### 8.2 `messages.*` + +每一级预处理必须记录: + +* `messages_before` +* `messages_after` +* `estimated_tokens_before` +* `estimated_tokens_after` +* `tokens_saved` +* `attachments_before/after` +* `tool_results_before/after` +* `snapshot_before_ref` +* `snapshot_after_ref` + +### 8.3 `prompt.build.completed` + +必须记录: + +* `provider` +* `query_source` +* `model` +* `system_prompt_segments_count` +* `system_prompt_chars` +* `claude_md_chars` +* `memory_chars` +* `skill_listing_chars` +* `tool_names_count` +* `tool_names_chars` +* `companion_intro_chars` +* `messages_chars_total` +* `attachments_chars_total` +* `serialized_request_bytes` +* `request_snapshot_ref` + +### 8.4 `assistant.block.received` + +必须能区分: + +* text +* tool_use +* thinking +* error + +### 8.5 `tool.execution.*` + +必须能回答: + +* 是 `StreamingToolExecutor` 还是 `runTools` +* 是串行还是并行 +* tool 输入是什么 +* tool 输出是什么 +* 有没有 `contextModifier` / `newContext` +* 执行耗时 +* 是否成功 +* 是否触发 synthetic error / sibling error + +### 8.6 `state.transitioned` + +必须能回答: + +* 为什么继续下一轮 +* 从哪个 state 到哪个 state +* messages 增加了什么 +* token 估计变化了多少 +* `ToolUseContext` 是否变化 + +### 8.7 `subagent.*` + +必须能回答: + +* 子 agent 由谁触发 +* 为什么触发 +* 继承了什么上下文 +* 跑了几轮 +* 调了哪些工具 +* 写了哪些文件 +* 总 usage 是多少 +* 为什么结束 + +--- + +## 9. 
PDF 与当前项目的一致性核对任务(必须单独产出) + +请单独产出一份“**PDF 主链核对报告**”,至少包含下表: + +* PDF 节点名 +* PDF 原意摘要 +* 当前项目对应文件 / 函数 / 类 +* 当前状态:`present` / `disabled` / `rewritten` / `deleted` / `uncertain` +* 证据 +* 处理建议 + +至少核对以下节点: + +* `QueryEngine.submitMessage` +* `processUserInput` +* `query` +* `queryLoop` +* `State` +* `getMessagesAfterCompactBoundary` +* `applyToolResultBudget` +* `HISTORY_SNIP` +* `microcompact` +* `contextCollapse` +* `autocompact` +* `callModel` +* `StreamingToolExecutor` +* `runTools` +* `handleStopHooks` +* prompt-too-long recover +* max_output_tokens recover +* token budget continuation +* subagent 触发链 + +如果你发现: + +* 某节点被删除 +* 某节点被不同语义替代 +* 某节点被 feature flag 彻底封住 +* 某节点只剩壳子 + +请立即找我确认,不要自行把 PDF 语义硬套到当前项目。 + +--- + +## 10. 与我沟通的强制要求 + +在以下情况必须立即找我确认: + +1. 你发现 PDF 与当前项目主链存在明显冲突。 +2. 某个关键节点存在多个候选实现,且意义不同。 +3. 你不确定某个功能是“关闭了”还是“重写了”。 +4. 你准备恢复开启一个当前默认关闭的节点。 +5. 你发现现有代码中的日志/埋点体系本身就有另一套设计,与本任务方案冲突。 +6. 你要改动的点会影响行为而不仅仅是加日志。 + +你找我确认时必须使用这种格式: + +* 冲突点: +* PDF 中的描述: +* 当前项目中的真实情况: +* 我目前的判断: +* 候选处理方案 A: +* 候选处理方案 B: +* 我暂停在这里等待确认: + +--- + +## 11. 实现顺序 + +请按下面顺序推进,不要一开始就全铺开。 + +### Phase 1:核对与骨架建立 + +* 阅读当前项目源码 +* 对照 PDF 做主链核对 +* 建立统一事件模型 +* 建立 JSONL + snapshot 基础设施 +* 先打通主线程核心链路 + +### Phase 2:主线程完整链路埋点 + +* 提交/输入 +* query/queryLoop/state +* preprocess 链 +* prompt build +* API request / stream +* query terminate + +### Phase 3:工具与 state 深化 + +* tool detection / mode / execution +* state snapshots +* state transitions +* tool result normalization +* context updates + +### Phase 4:子 agent 与恢复链 + +* subagent lifecycle +* stop hooks +* recovery +* token budget + +--- + +## 12. 验收标准 + +只有满足以下条件,任务才算完成: + +### A. 结构化一致性 + +* 所有新埋点使用统一事件模型 +* 事件字段命名一致 +* 有 schema version +* 有 clear event naming + +### B. 流程覆盖度 + +能够从日志中完整还原: + +* 一次用户提交 +* 主线程多轮 turn +* 每轮 state 变化 +* 每轮 preprocess/压缩动作与效果 +* 每轮 prompt build +* 每次 API request / response +* 每个 tool_use / tool_result +* 工具调度模式 +* 子 agent 的触发与行为 +* query 终止原因 + +### C. 
大对象可追溯 + +* request/response/state/tool input/tool output 均可通过 snapshot_ref 找到 +* snapshot 有 hash、bytes、redaction 标记 + +### D. 冲突显式化 + +* 已产出 PDF 主链核对报告 +* 所有 `disabled` / `rewritten` / `deleted` 节点都被明确标注 +* 所有重大冲突都已向我确认 + +### E. 不破坏主流程 + +* 默认行为不应因埋点而改变 +* 埋点层尽量旁路,不影响 query loop 语义 + +--- + +## 13. 最终交付物 + +请最终提交这些内容: + +1. **代码修改**:实现统一埋点体系 +2. **事件 schema 文档** +3. **PDF 主链核对报告** +4. **已实现事件清单** +5. **未实现/不存在/关闭节点清单** +6. **你在实现过程中发现并与我确认过的冲突清单** +7. **一份示例日志**:能展示一次完整用户动作跨主线程 + 子 agent 的全链路事件 + +--- + +## 14. 最后原则 + +请记住: + +* **以当前项目源码为实现真相** +* **以 PDF 为理论蓝图与核对清单** +* **发现矛盾时立即找我确认** +* **不要擅自把 PDF 语义硬套到当前项目** +* **不要用零散 DEBUG 代替统一埋点系统** + +--- + +如果你愿意,我下一条可以继续帮你把这份任务书再压缩成一个“更像 prompt、可以直接粘贴给 Codex 的简洁版”。 diff --git "a/ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" "b/ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" new file mode 100644 index 0000000000000000000000000000000000000000..4e75b8559c3b7b71e5dd5db461d9b2099bcec3b2 GIT binary patch literal 365795 zcmd?QWl)`M(=CX*ySw|w-QC^Yo#5^k+@0X=1b24{?(Pzt;DM9p%vUvU=EBl>L%z5FZ5{@|%0a|LDy^ofMXVxS%#tbUgh zA^w|!W9E=~Lx-gNH_0_Mb4(^v1~oZ#^z*4bq9TEhRlw8hRQ%6c72E4I?}pdiv~YjE zPxcU~?`3)qg-GLwOxqVlb=GgEkA#Hp52c&6*E~);!nwq80@VXR*FKoEApEAVgYaS1Mb(4u)48?RmT&DqkU)R7@!~HQ}#P^(oeNU40N_! 
zobenHjQ(74UJZhuz2jpMwbwjP0P@q1rYtYt=>BEvy=4KId?rGQDw7<%mkw`Y2;TX9QzIY zU*DHUMigEN-=Drce)v4!^!R@#Q217wwAk;v<@?^r!;23bNuHzVA~@v`a74*82#}*XzmbEVP&q~x zlzjJfojS`fQ)Tqml7-cKUw15eHGT`QnOn&Gp5IO}6*GNfNyT}|xtkC~o@<6Mk;v*f zZnZaBOj_uqAjj3lDaK?Ssm>XCfP5FN0tP*_gO(^1GE7Yt40S*(G$*sKZFw&hs3>Vo zo$Pnt_gRq>@{}n3-o9H!F#<0vsVE8r$t5f>m{f^U6c~#8TM$&e6vO4dEi}$>N27m8 zb-IahxB62aTXgYN8|V0ulLuH85Xfhs;xneDyiUdVr z_*Ci=58ps-30?p-7ABSZAXEtKj2w_(q6AoT1X~RvZ6Fm|o#6R#x@$07ET9@^6vr$x zCm~voC_8F$XJs*qFNik=tWh8N88S+@f57Eu~L7g$`Qk}|;a8G=&q7D|VvZb>az z6}g#VB{T1wWsL+OTAJ+`UXXlzHBs(?M&prkS0nkWnMJYf2026ynVFOQ-lk$-nzkNk z!HzBP&;pKcUv@`oH-vdd(|&(4EyAQ|sM9dPsc7X6sC)t$^B%#L@Vb977>a_UCNL5b zm$S|);_$$CDO=P5AJRXlC;_o8rVA5zzSM) z8RPGXyAKk)!i~im8m!s`7R<4%kP6R1WL7BSrrwF-_9ACndqgyiM4iA+RXRu;B#@#U zs_;0CiJyD`aJmQdw|g7IM)YA3LQ#oEP_ePwlN+CJ?^5+Hr{pda02}%V5*q zzZc(U+*cY_gsg&BlbM9}D~mNkaP?^oFvtZ3jS*4VN`avcVz$7uXJC_&NOfdHOukhR z_R8UiJFC&AJ0P_#P-UA7GN8({h#!%hLJ51lcu2}(vBMUjWtZpODs3Do$kuUoVy2BTMg-d3 z<8*(Sjkqt1t3fI!0I5G5$wSf;pNT3>0)9Gn*+VcIwMb+_11MA+g2^;eY8DvI<;oHiZNQ&WN}h|NAu_gQ<;cnRMX^sSqcV;qk;NL} zu+(UsCbV+SFnZk;9PAE=$Oo4C`pI;BAOU>Xs|V~%@jyLFVMgs?)`|mU(VAJ8U~@xG zm9}1oSYk091w7F-Fhod<;6#*{d&g@B?1s5MZ%uIqOr=OEJypkO+s_d6F&$k{v>`++L48?_?D5&#(ZPnUBPU5$;ZzjHfI6V4E_jxL$5;mp3944;YaCrbz=Aul z3#MQTilFo5Sgh|fmFmn{!6YBDf}9Z*2SmdX59V9du~V4+vhV%It#zBZU0M~h!?M@K9s z59kPRI5=owVXJCMF6Lcs7;8~sxZUQ{!|3HtmWj3cNIFjB z5`CmZ@W>^9vy9D#-pvO3H@zDlq9K)>UXC(MaC_9=u?U4Nk-hK^n@Lk_Ak#kmydBRgol}mC_u5WUHp$Xv_KG%8Mil&RQEcMDvquVs^bh35$u*8un`vh-a znT{bWe#@FbnzD?R6?*m2c%Kx)W!r?uy||85Ck~xU^NCH{=K^(-k=2RHJ4{C~SF=)2 zkS6!Byz*oD)5iMyt#ooYV0MXMU#ddOML2K7eG*ni<>9s>ffg)G1iz&hQe4a%lH<+R zR$NT?{XG*b%qt{t_koj{M~oshNxBfb_@|!?p{WO+kXWRPPl2^~v&3v{B5tJpll zK634%QU?1m_y%JWH!0^`&|V#Krmqz@cC`!5jB7Tx7{>$-sE-@+#f(QKmmv3gGc8y0 z;%t(ub56ADzHOnT>^n2y`*J^rl{r*i`fs`m=OK-?lG0-2+VQqJjo+ugoOJIzSg-PZ zcY;-!NOhw`f4S|Fq9@3VaWhDe4slrgYSrM4$ikMoxLHGiUk)psL=GO3HHlMD#L*&| zOgA`WsV|+oQL&+Dl!n!)MdUPrnbfcSf|ggehyJ4@<@JYVo85Miba+KdR{mL7U5jR$ 
z#@Tj#2=d$aVsz*wPn1Ph7N>+btu$VgWk@%+EYVDTLh*1OIfgXLQ+$EY^|b)$@l^G!W`={GB(een=6f>Bi*&I6zL|;qm8<_ z{dILoU~;DbGt90Fa-q4F{9EVPg+;9AlVCU>L%~{ z+`Ls|=gnql#@FP=F!%@su#{RIBw#L*WCI5a4jsJ=MlXpl%G}4B(soW)8qCsrT*23$ z_f5-i;;gjpx2YgMa5igP8N47=Ez`T^>SMydrdo4*rjgfI^+3D8JdK*J#hSi}x$|A%AzYc&#gpX1IM&_eb?ladLJ36au!|kV`>?4>2%A~PMdQaxKBkX zKhlcipF0}mG~?38bjadn@EdXiDb<&eM2vN5rQiTG+navDb=s!yYG==#)OwA(!;@e;(ISP*mBb% zk`4oSzUmN;tlpDpy_S!f9@EXB<(%PH0yC^_%WaiZ-A^BdkS}}$oRDZ_)92yS2_{t@ zi{N(+^Jo{kYvO8zMs%zRCVfwXW^@qWUA&>nL1u1|<gc8jas2FXpOD-T;mgp2J>wJ-VF5Z0uuN&>9eSOFb^U==f=9bCamXS|LUUQyLG=Y4HhV4{@k(?LZ+* zIg_;L#4*{o*({UYFL>yL7B$+F6*Jmc(ZuJ=jdqL+e`jnzLun}R-n|47vTQJnuHR#`aM{zk0=m|6Z)tg6KdAq+4 z6P68T7U_SKRyaw%`@_e_S76#S!bRm*$l<>ECajf!wl^b!aS$l)<|J=1JdN0NkKx?R z?b;Z+50Fr5K0#S!(GGL=FW&WGH=7`O4DX6>Lb2dZ9&Pg6!Y(s)(Quo3HasYOdD>h( zprv$t2jKka1^!BIv$FmE0xB}G=S*A_r5lk)dJJ)YvgBy;^>OXj*T$xLM zW=@5wL^mD){yK`hLj|7^KRW^DqVf2{_=FvdOrwTavJ*v2Uusjfil`)bi4C}9Z|+N4 zV8bU0Iv^&qcj?$1iSo3oht0$vdDF8iz4JR@=-8q}ux+p7C(#;_B5&wIm3y}q`z%I3gtqy z1U7W8_pSg$tI&^aW33=Gl0etlc9tqLNzd$M1dNJ*f^`b{QPy5=?LdDnCcKWn2;f** zHQSzGBBMrlS$%Cvqh_itY79AF*~ZyG%y`4Fy&6%p1zHOyM2+j>j3e~7Ty%d$ybf(m z{u34c!>Rc{J1@-aT)%NSLB@W72~PBpADodWz+d`U3fU!L!+$+LW%zp7gK2CRj(k#9 zcAB*!ITbc#J`T@XTEuD2yD)0@oavB99{e&47B2l*(w{woqe-#Qj?Y0>my@n&o(J6hA)@BVz(Dy zzA<78owZU3^ln0N0R?U^$Z7DO^56wdxI#mWl=sM) zGH&$vu{^2yt95W%$AQ3-@xGcHQX$YHx2ZTtWVCC*<&Pn*Rrj^`vB#T@(Bd$CZfJ|a zwePcX>$7{9w^a&))}Qhj6QLbi5uP^%OU4n2bt|aMA0@fcY+MUM9M(qby^O>jY^!OE zb3KJ`QCo|dE_rd%j2MG|hi5@H{>0~a}oSE1iO4kn{4YkFiBrYY&1;X9%xzvz!ndr77|%})C9!s#d^ z8rR{hd&U(bHR>8XW$(m&e0-HR`yLd}aOpnrcjwZB zGiMGxB>@X-45WL>vl8e%krTI6wVx#MH*h5jWk8EP-rc<}@xF|H=oJ_pC*X@-qv{{3 zAw!19ZZs-INlxW2beYiN91d>V#)xya5TB@k3Gjz-yJ_s(t@s{*7~PX3JZBy>;64wz zQ9J!}Cwf?d0OGLN$10J%=Q0y-S>Jd=MK!0^PJOHGNeIu#fkC4{k()kc zZ8&FbhrgtOp?&We9E8Ce@!wHn{ZFs<|6Bn8%#J2^KGGdk)j+)B}$ z=Ie&yRKGX9O8tks{mT*Pk{fN|1FQ@|{U}Yn<}#=@UV5S=xhde3L1I`j6Dh|9n6Va+ z`=@vKtCRzMILH!l6P1?LhB?mJ1+Gt!w-+7XHY zp?P_jnRF%mEpw$~)OdUCp=x=u3Notd(rO_^EIU>~!Yl0+A?KYZR^^dz2FLU7IQIt_ 
zR~(Pf$L%<)?rxAq&jbPObsj(MStIM%#kYo zDtXxh|Cypg-9u6*BO|>?uAM1amRnDSg3o%^$n%*xs7%Au1P$Xe(yE0at-!I&uW95DhruN5@^Qasg6z!}m-JQYL*;_R7}*j@cQC$HZJrhl47y;2;w z>O2VTfgbD}$t*pSjy+t~NdGYV#D`-PxK+Xl#?s^PQEoeZ06kb!W59z+JriAY0tUr} z<{JiLl*Ka$qMc&HA7a49^f$Q<_q`Za1)Jn8h@0+ii|1RQ3V6_M_~dym*1L|h!E#eiKk&VVREj-XWCH0>sP* z&}jLfnKG<~(QS9umX?{&;PAVh$xo{SkJA)8SF`89sIp5LpDVWu#IYQkV;AAED(BHt z$}!QgDDUcOq0_eHa-e~G1wQMkZ0x!K{04c3w+s3#H{nVdxsV8&k-8m)((;V$U?C`_ zy_9y<=%G4Srn`u@N`A`b2n-z)AY}%f7sfn5Ea$O`ADWvMRA?jVEi=0u5iaA!I<`w+ z@g(x&y}ZmZ6BC{?=326<(aQ|ha+V&(II#V|AZ4Sopk8#MKEHNr+Yxa)d<5;~cFzAd z;_QIGqx^rZ>|h7{72+%`zg1~DWf^;LVz_nZNL?W6kT=e6l@%@NTmCwb-19B{g)6ZD zmS1%mLq!3Pke*?fj@Q>jpS&(;T-etBj~m zHy+xOFEu_INRY$`jxR{RItZg3W)=SwSPwANw8m5rCNgoY2hYY>) zIslfiFiQP<)$cw_HmbGf=DrF5b)LjdvK+o_CNevoL@Fp!hrDc^6f;q_k5TRN)P+0b>4$-ZD{kzqsd|f& z0z(^8`Jk2naZROnm)fTx(0q?DQ=BA764)F*w}+-$*VmM1_g8Rbl8Dwn5sUe+GV%8g zKrdd%W{?Oe^y)p5%N{Xm)y&E}UQi19V8+gBjnL2~5oU@C)p@k1cC9Xy04?Xr_eFfw z- zzAttrokQf7!Btkm`4trqi_J}fUq3p(V2R%qYn7GAjfKCC$c$Z5GiL|6Tdpr=1hTWa=jeNZFJ@)T_ zo77yT8pOjN&rkG4QcG0#8j?R)=_oe=a~ENRrOJ86Y&%r>-Rr$mSd|Gy60y@k!apQw~YK%r$eOq_#qn1{92!tvNVjTA5JRpFy=n14E@QgYDuZ3x-+(!0|y zp&5O3NXavT8e)jL!n?k+GPGYKg=XQzXg4gh+U+=l9F+^SMN+d{%0;X1T2-wG57<>4 zM)Vh+jAT%T@K;erJ_e)^H}8RWz=+)b6!E`0CIG+xV7^!xgaD>r@+^B#FAq9*Ztl)9 z8n)|WNjip>;u`Cho-X8;_UJV=1al)+(yGpKDvOL7q?JAnI1{{4$JAtEmrkCIr(V`r zb3xEwOx7=Amz%pAuOh6v5JeOT^J04=v~`&iLQG^O?D^9V{yRPSdmjB4`Pl%@zv_&+ zSpQQ=@Wu4O_cB3&e#q{E(;)B0UnW(YmA?AJlAT?d`m~nQpt0@|;cw!Hv+Y?c=YNTG zCxv&>c_h&9OSU($AeXQaN_Uhs#Vu2pE6QLh?k2DKY56ltg`n*al(r8)_zyAW{Hxyg zU$v^wet-x`^vY+5AW0--09z{(MeH_TPOXJ7!03Wx5mtLY_@i5}TJ;RL$mpn#?sjdH z`xg2D7joSsXg0}2w|^YNsCO)hllIeAiQeMoGzCn?uAdIdG=7YbB|On~LkHQ`&L>Bs z-oki9x6vF5P9Oc(s#msQhwKCt)&je}-E;>pz|G<$)K}xuX|wIpi{y-+!4V8()y3m& zc6LtPg8I=g3bb}B?U_e!+V~9t2g{0(6Bo?m!j}wuV>92^$%KW?an}YL` zG5xdUw8WND`#EE=z=1U&Nsn@{D1%Ew{<4zn2{QxwN{K${2 z4bvp>fr({!mNLx+NN1)>rQy1{g+w@z1us1KK6Yc5Lj8s!c;;1Et75!#U`M>MnI?5$ zB<%}K-q_4LSMw<<%_8i>$?PZ#kKna4lOdeeg|P?psLvByNGI4=69_eWA~+?WjjU?S 
zSSV@di}_mH^&dF8#kgs>6Sm%am|Tv1%xzHVru{$`HV$IvPMqMhCCOkuF7m;z&%My7 z-Mq<2+<1FM?O-F5e5#+m46cha!-SgL&NI1r5EDWU!qfq}i@Bl&e4>bW<3c@p$BEN} zayOu5jH-#~wWM`=k#xY=Hoaz$X*R_t?wjEbdpbpRe)9MM#o4qB{hP6@On;TutQ@}s zt}>Qzzy$Zo14FSxqqIjnv|nBAc@I{%iYOT4y~#L$#wZxyyi^q7%4I?L)p$(xxiEY! zndLR{%EZ$CX>Dc`n06P~y?=R@(y#H?9?Szao$d>8xO=a(hoDXjbc27dF2v_G$h8rC zS^*5yZRAn=zl)>1w_(V-X;*!{@hU6cj9@%G0BGgWEh4F}$S3j! zgv)LyL1H9E;u;gJ4z^XK3xl2lK0|UJe0!`NZYqj!ZXYCRmED}6RG6qaBfBSVl#U!= zY!*#el!jA#r=M3NN3hRo_gzLb@W^JR@R)@V<=UGs0WLrdo5oAtf;qEtD~#<8y+R(B z6qtAomww=nsRVcMQHZ=|t(8gQ*hSV!kNi7F^Zi4~hS%7}>E#WsGdbcavusFfkQmn} z?Rjmf<@ccWz(;vQUw1LI_}Z)F6~8YTLZ8qb>tdySBgjf-S=vih*Y7*Sf0%ux*{V?9 z>ls08xtNmr6Gi^20b^zR&Gpbrl1cnk6GZ-*u#9fn@mP|Y%t2wRdel-RHQRKYOzvXPcVY;SEYH2Qd{c(GP z>`7oCVcWSR*H@zTl>I_zKjWKJtDA2=^UQDiPciX=&-?Zinjd4{o3JNT**3Hc@f}=e zI?OmZM2Nyi25eG7&P%>qG=(ju-?iyt_c7m05^jw-<)JhsxvP5av1l)h--iW!@2>wd z&_gJphu#ynTYf~-^Cy<@P>SaIdPT5pyV>J@1s~jNtGpr9T*GR6tEY#kg8DC5a z%^Bg>S`+eVLN+)JAnU&B|EwNHQ3xviL{KKsQ*uV_^W=FmFc&6i(1nEJ#jS@e2bX#R7TS)dfavsmsa*hei|wW!%JLR%4EG^ zIKC!sC{-OzBLHM&^R>Sb6voATi@}D@_8U9P9^hnPgMYwS8|_1%Y)^m*^d#jj;%h%oy-M%dAk`7@%>S zUM)M7t)LDx81am7dQqx!kmPnCVWu`(i5Tf&YLG=3gE;;=vkVrL$~X$N64`FlooAA( zTjyI7bZYom=N2G}?8AwezyX&-KAQ@+TNZWbLe`5pJInQU1#0P`*IBL%)6z{~QMH_l z!(Tiv0pDzofr%<5@0wJPlg+$d$>8U6{6c14mk@|Fe3Xo=OQn;st-SD){~gF;sX5|L z*#1@P&&>MUaiFIt^p7I#JV*DIMHFgI!oUn69t1qaH!oa(np#~zSxb%H_Vv_~g`;`J zryIG2trj0Q+c(_`3S=YWdtoJ~B3LwV2F~o4v5vvUOF!|*^rW|vv!-{rK5#a9F@*qt zUa%KdJJ8)It-O3l?l+}eq~q#d`P~EEqnZRb^B>|9DnKGcB-@={hbh4_ZC}+P^qyS_ zdHqohD#fT!zO}ITOJ>t!p0pAq)X`N2u*r!jPWZ1mI^PGkI!dhM!zG#N9itnKQ0&TJ zU*AoHi9mQyk#?h%CcC>2AGH1~{HmD^Yd0UEJf*2<<)b@`gR6J*6D0n`wfEn_V*~tE z@K{)X+e*<(oUjjOLK1s?hhdx-1H%Fl6_(@;8Z1Cec>w*)S0p*48AkW?>SdcLtrsx8 z;#_ON`%uwrf7q&gCHJi}I0c~)4JY-vhHHj)LtF|(d}wr;@2fxmk4O*k4Z8^#725C& zQ|#icj#WMNu2~V{BU)bMD>c}e!p_+Ni^YuA4%#2^sG}(_WZ%jI`Z)O>&x&&cywu%n zLstoxFDzO8u?e$4#%=Lht)J)&mG~exCvf8@ADuJVf@{cH9h1>(n2h@(>@m8 ze9}J1>LJS{qwxfBjvDg>@{$`h9c9gGSLJof$WQ))88We&dG4X{Z>@^0p|GjZ)gv_& 
z0xMnFI9Eu=Z&(ODHg-qGgu(bwvBt^Q9*oxNW&Kx>M55)L{|1MZ>#wR73;S;~N3}#b z`ynQ%kVh_X%+IEixm>@@dxp7J2cCUVg<4vFXpaB@_i#LMjS~aQsVa0DBcHU6`~`In zqPpZMn5oP|7au-StXR^EorUi5s zIbgPrE*4|rMnQ`fq#`wtH)K;LJ7+M@-GMJsfue|K&;sSA`ueyEuK+*OHNEC9^VETWhdf@L#BJJn@ob!)Q0vENxcI9%wjyc82mNUYvBr&{X9*SuP7l;Cl6x{V*TWoAiKrXW z%0kA5u&lCCL}S*10w@tmB@9Z?ZBc{}HqFJ^c`$Q|cg0kf>pe1Sqi59k8!SpZI3A(C zIkToLRF=gsNOqH7Vtu>CaXCV1^>oS8?(+d+t5c_|LW$@h5Llk?!8*_rG1sGPZYJY_ zd2Q$Ns>4VfniwW6b@h-=nvKM>5bO;5fskgIt%A5edJWA2I)GC;101<FgKZ#-pC2`Znz{Hha0kOOoZg{bbL(EF z*t#9W^{`&Gu0pXO3E-c(iI)*n#P9au>|O}`fyC*mPyR%OztYsdgv@`MBYH`4i8_E^ ziF<{n+p8x01~t9Ep!&ZhX)5oxq^dW*Kpaa`e3MZmXV2tfm*%Qe1FysE+uoa8T5mfq z{LIYW=8w9iFV{lhqvsU_gsO($cy@!!#Z6ZUJy3YhhC#wQLT=}nc1K8^$L_2zf@^~( zsL*5_=qzjSP~-yC5x`_XTTHt31PKrb*h;`)c7!9Hp(1%1v!1J4ZqV+daeeEWFf`b+ zDEKit%+YB{f`*$rW$6?LjCg{^8VqMnGe9|C%6w6W@JwEPaL=?La~xWl$8f)4M8m%g zcGV|c_jPS-FBkcF86C4&GAtFu@MCbRNhGmx0ne+mT~cW%P&N|(T9T#I!-{Csv9}r7 zSpZ$r57*DyNESXfP_3p%1c+JhVp8NQyePiLFn=!)K_iTo$A864vCBdIegRA#{0cQE zWV>aQ1uY!7n)t>1>Pnk8>4ChD3-_l`d0SG`2Ton$4@{W8uBKe9ni<4w=TM;i`To~c z2U4tbpC3^2`XMUoaB5~J>zJAMu0O#M<126e4J+H`OApe zYsZxrp1!(jLK8vZCWfhUZ0iCiSWobBUu;F9MB^_DhA{41RZ$e9&|WZD7*G#8vD1#Q zr~^{vWZ>;9F5BYLLS|8M<<@Y$3Lp#0n}ey6#!av|GI68~Z#8M5v`FMnc@N}~xrM=9 zdwfI~V(#i7+GM3%Q1-R3#o*K^4XL;F= z3V*2V)xn*?hp(FGvDp^9syY9pC7i0qY&GyldB{%^tv0>a-FQ?JvrjIpSUfz0(o{3b3!blLJ_Yb zg)4tOD#Mwq@f%2Dx{C#%wC)iye8@c+A{d7O(-%bp43wLISPo|+(^Nn6I*TsHW6`CP zW59eC%mJ5y10?H}LkOBO=yNZCVA8AtSG26C>f7^*Xw~xA15@q>OMGvDmrh^l?SMC_ z%`6z3?H5}uuPca#j+p$S(-uXdD%!Kz?FuTk#|Rk*ou<*giZPoF|ALTC@HwW$wO;b-mCl#tq81h5Ln0(sO&T-A(cFd zz;0JV<)QmALJSMPru5J;{7=~Yoo&PNy9QH{ar(a&pgRLB25Zai8t;ziH(niJNA1^aQ9|}0!H|EI#CRsI zbs6>Mg?I%%r}FG#1_KV+PF=*tL|u_-O^Q|UcolvGLc~t{lP*o>3vf^+ktoYSm$p_p zAQ^QDpfe=KLGB@oAfbI~p?y_EIFV6XN$f;D^f(o4&`0r$^4+7fLA zmg}>DV)hkJ^q&Kco?U(3xyC<0C)(Pi)fA(A1QSogRN+CFeZs3dxVMPJ{7rv?UWJrt zXtBCwt)U=YWD_L@8#-EYR=OT5J#rE08{PHW`Nn7CqO()(B9#hqVUVd`_EU6YIcox| zOjnWb`Z8YM*o2)w`D@-uz;O;ZCDAJpn3qIxe+g;hgeynWDtCCG;pLK+n>;C0lk{#{ 
z?@DS8;@RQum~|y3pIdcMl(=|EUr@>9IJ+>zDQro3o}4gY*S0Wpxpa^{TEx!yg}an= z3kHoQe;u$g%J?8Z0|b!EZp*~&7+E@8F89rH{?DU#*e}ylI1>$58w{p}Yul}w-(rkc z_*OlwBF!hQASty8S{jI6eM8eYRnN9>;9pW2?EZxOU%4?X%)cFnYKi}&zPZ5}4F!{* zkiYGdob#gCjB~bC!Wio(QwUJwDV$F z`>>wMz*QnoX0J-f_(R~IM;V;3YU+SmmwDIuNsm(B87GZvhe@gEatBtw7~)-Us_$u4 z{L<4$mR}#c13=5_NXnwfFFi52u4XU2YN@$yR=iEhEpVUs3Upln6KN-`P#wg4?qVXb z@}tTsjVA=QguyeIrdLiK?DyezJRV%lI`+zmOHciKw$^nlDlp*}GG>p``qYJi`7>NQ zg@uk6sJ<5^eOxjn+^#D)AIQw<;l8|>aScPt{Xu-On4tbKxv+;4lEfh-K9VC39^l{A z`72F(HI5zGW!5*B{79K0!_5uRh6P_VJ1IFuDJxiRZ$lvX)(Ej{LN5Oe11s0xB_QB8 zIU%P!;duCKm7)@ceFs|!O`u|gCj)Pb*094MK%Sionq3qMtlU{X<7=#LQ))J6qQv`2 zS+$dAai>Ie;>1zH9DN#rF7?JTYnudxWDGer^bQfnSUQ;oFOzal00Rglw%jGX5g1W_ zo~tGnP2zwWDNP#GLLq0QA>-c!AWX7B=ufAshJ z?Z6`WCsCw}36t6K=Wh&b5#o4N@iLz7=|XOrEjvkM#TOop`v<=4|Dm%L9jHSary2S2XdC^KZ+2YN|4cy8o8m#bnpT=v9(v3$9Q`7waeOxF?%FqLe*M_zwdAszM z465#!U=puB{St6rv*X3i z=@{606i%d2RJWrtEPj%Pz*j}m7SbS*LQuXq$q9kH_8#Jw@dz4KC2k4vYwHXt&e`uA zpREMagp_6+ZDWyt2#3Xjl`IOClJ0=;g-M5&^<^uQZ8<8qBJB6-Y`Ogmusv)8{^$cc z2SpsB98Ax}h<^jk379|kzfSC0EO5!XRHb@fWIuJG*K%o{-JEIZm;+97U8Pt&Xq`lL zlohn9d~n@5CnOvjhd+W)`h6@xTGixRh`cTOc7#qkzCR8qf2cMCr%1 zG19uieA0#Q${JJK^%Jrv4TV`4#&Mo=2qOU0kn=!afAS1P&+vv$pUatWM+Yx!42|bcilI47ZucHC`gR*1EuCUX10UN0nnfD(bGp z=zyyvtLj5WHm`CvqrI%9 zl@Banddhiha-7#FTOFYj-Gq@}#g?lQ0W4y34%M?(9Cn|3BI}{RFE$6}15kHnpW+>{ z7j6tZzA=^_djj0&X-U~k{N-^+Q=2d)lU#q7rkQwJ6uebn+XW+r?DR!cO}*3>FAfjk zW#BTVLH_-xVKps^UKDomqrL&zk>hn}X#gpXwlJKDT71NkBYoKDuy7%@Q7ANyoUP$4 zmy-U-GZy#W509U;)~lT+y3mh^QN_!cI=j|FsK|uvrq}lLHdtdS@zb6`HU!%8%a)v_ zSNn(xjZ39}V8!3W=sM83Kh>U-#uHuKWg9laCH3Z`BdFE~p zCeQ1m&%nS|2**=<>-Aoxqa%jthD=H7#uD`qDs0k-T513b8KqWcVlz;IVzd-zVvS2e zxRcO2ZxIBVZJpHD2y;onL-m8*`+~Y3t-Q#hX`G_=P~KS^R3f-Pa522y=i)ET%azWF zDx(c#L;@AUU^CwWv({k<4Uf{5BiT>ak@#$JI6`K1t!I_{fk_;vS05>SXRV{l zpBqg2M-aAK5kbFh6ykv_p-T3Zrq1I^4)KI*KW}tYvtg9diQ#Dq4C#yaPtYGgoxK01 z0xQ>F-D+g{-F#P5ma+!}Ky|*N`{a=~>_5t)9Wvd0*uZdOTQO{!fr=@mKW;TjA?T6I zQgR&TGwSCEX`pqfx59aCjZWZ}>?^Ym0e}>w)dTkLH6^}>eZxIctvrD2L=}D%a0!N4 
zCPKY$o+1O62fs0)cF}i2u(C-3nt`x~%JevfeikoQpB?4!A-IytIfwlE+74n;j824` zB8OU9LHs?_B)bbD~{qPYB(H8z&~xf28nt^}2Dzq3lFL|~(p z^V(1z60c(fQD&0sT*%#49khIg~L62fkKFbJHI>P_hO+xTdzfW zL?uPbp@*=;n?Tuf6c>RtPS9yKx%#9%AF4LEaEt9m!}!z1Us1?mSXz z?d~qVPBw*-3=A8m-VIINW3VtvZMV-uL;8Utbp~K`&v~wgvIfEL-Y_8;{uD##)j=3J zNPFWdUO|Iwt!7>F!!^{OYDM5sGzy-ew#f0c-2m^l^&d0eN5U`h8zolpuXWJ6-sb!h z`TpvnD;w+Yl3!IQ@qh`c;~4#2up+uIV0&Ii#dnm~(7y-e($-wQy15>Rn564*X&vuE zR?qI)a(<1F%e=;I4d>z&w4?Ey$#aLPw>~ol?2}bVdv0gW^70KoUQy{fm>R%MA5U|**Orwu3{9Tk#k9OLDigKrJ%BID3r8%pMg)$ zKrmw1m-;_Z*8|1t$77|9H9)iTt_j|C%cfH2P3~kA=7k<47U}h?`ZlsbDZUnj z;fzo8)u33cwJiC&&+9zE%dG0&!ippIrX#}LV_@01<$$B7V>K$KtgzH_4q4GPZc(!= zJf>%X6`HzdIdE%3EkKSg@46tAHF%p4ALcI`NmEIu$##b+rw(hQI6A6S)?~Hsd zgxnZbRKE|DA2_t-b}T8=nQBH=tog|iN-vLCQ$OA2K8XBT!+r&*0R-AuL1>5@dK^uh zzx>iTtAGZ(&E6pmb9jN7vjAi zGw6?1U3)UQyy`WxET~jMk}9Ve75$*{{Tu5Kj!&2&se$sif63MzbV^a#%!Vq z0P5G>`d@bYk;Szq^WtLE!TWkb_NDC9*{KndaGnjl)}DlU_}Ca}$6;J-gL27m!t#kO zvt7mlLqJTjb;c!qoKmc&rDW6CB>G&tnY|h>fujD?TcBKP6F-t^0}(KjlKRd@*d#Z- z=Uep|U=AwAQ;JQET&rp^xRdZ~r0y=?Z(u#Esz*`j%SoL~J`ejE@Kd$?8qzKbb~gDC zm!8=)%`TMH`1 zLS2df#oRkZ*THT5!?D%awr$(CZL48p+h}YxXlxseZKH7-H~IE+&KTeL_c^`D<$Iek zlI*?KTx-oaf0#m%&YG#%(6@?L09eJO0;|j?2B5|aNB1xLB~W%ITw!^9ke%gi^)oKP z$YG^TUR5$8f=F$18JQ!k_jsdw5O5v}LCW4J>0Phyt_Om) z3FJc$g)pKn8=hSG;Q99I^Ev~%(;!EYGhzVRj=)O}h68FaE9$0!h^v#({f~spYqmEK zRHHo8|J`Xe#t(vQe-tEoN&mS~z%IXl6F3GV5%zj}nV2Bvyc?9lJh(di7`1{XqLlsz z6xDYw$(Uzek$F3=yHA}f9bJmiR;i}29-~gH2kBkd8cwib)03FAr5@HeN`KF%L^5zdJJsz>}jb!%zALx++s8_GTlM82WXK`eMRG&Am$vl ziR5HMW9buQi?wz(Vt|0eqn-&0{bW3x{p*^2*6o#tJgmj1Pc7*}^Eez-;Ir+_XmBB- zQraNPm~wNd<-Jmx6xGW0UUs!hzhP-%U=8S-bz3H_lyt;Y>`;&;6>aTA(*U=3*Zjz}=jQgkYfNel`4?%u-QP>)P0x~irVo^lzWra}FNXlhP*8GS@k10$<<4ShXyw5+Qt_+=$}HIQqKaD3}ZDs%L#nC~QyL zsUufa%tRr-({Ke>j~>Pml^j=aW45;)))Ea4%3<*VZiS41oEC7(iv~M6DvIyHU3~dM z-NOV6uvvdW$k~`a#Aje*{j(3ODisUxy#UmWZ-PMMi5(pjK|+Wp?=>RQddz95lBAhM z`;s=_nyfqVWg6`b7K+~_SImE<+bvjVF{9|4H3D1eAMKvOGqURZV8+Rz!z%l#cjaZ3 zJ*GdcuqNf4I>YY+99Bn$(9uilfyz#Rkn#iBw7L+Y)1lmHUy+E$l|Y1D3-au+%A~IO 
zGte>nYnDM5sYFuWd_9~MbWJcGaW!pA?@mHrSD{bmF9F#K$`+v{%DhFMEG2*4Ek@jt z3L87^UehwGI|#^e%*^lJI}eWYV6C6Q@Z~A8pY@?-{re29d>QMBdj}DI;*91mP4W{p_B7G1-#->t|>*HnR|jO&}f+zE^y3Nh9{1aH5xZ;3$@YMUN`H?9qG^hkkA;`&P?o8Iacore~NUp_=0x^`>SrjXtMn<<1rg zPktd^*UiY8aHpyC#owt2(r8}2Xg&2Vdd7fRW)CuVX@GXFyl*d-xR4x1Oo@#fn6%FkzptNF+(CYxvyu9d>zS#h z&1LU~Tl_`1LTwBRlgc}rlwc|PX$k<`t(3M4G9y^Jp|*#fhqkW1j)x4 z+?1xz2xYB^nu%SFTqLK?*UA9baZY5+w3VXe#VQ(!V6s!H>8tgyv8vYnR@54aUCTGV zM}8j7TPp>-ORZ%4MxPbV?Z#5wO?7}BJ|oq5Rp=;Y8Bw=|`}0An*=6rNws)OY-plT0 zba#yACB$06OKfQ}e%}n^4CfNt9B#b;oFc0@X8oNq`qP69G2HAVlNl_O0xu6e5C)dz zCGKGWeb?yCIhYkMI6Pf-(ZetppH_AeS+Tqb?HB%6kH{s`daI#C+g$Rs09D;Iqd^Y# z@5R$~at>;gGfThVzuGqbMaIC&^&u6&|6R!yE7!+BGNwOYGbN!|P^SO9W`?9t(D4sH zD^^8b-p^L0n$M+420@1m8P(8InD#61zYBwJH8*MzEzx>5q}vp}2o>^dV?Pnldw<$w zqytJiJt1G9=+RVG11i5+7|Z?EdlpLhEN{}AwToW`cjZ%y4hMkMXh|s&6C(BqKrHk z;Np1jg)zt@^Fuk}B^)qrc~7k{g#ezFyyb2hms%;QC`crKuuWrAUftiQw#Ns6Cp7wx z9tFrR0vEwQXbzLt)|JwwqmyaL8d#?szLGm=?A3ENxYtO{FpWRwn1EH6a7HDW#aB%^e-jBd4YmT%~{q z^4N0-SAfrFFJ;^>)cGKpGol<3Gd>a0wj7Iw@fh`5$Vub_V9_Q7feb#vV{>b(y`__P z>lj6l_-}5WpNuk@jLaijb9TCVNfqJd(+jUPsj4R=OiI6rWZaW9H^M$BKpE$_9XpMj zlnOm^tZqWVj_mEgeyh`Y%(!fIZbv#4ilAGK;hAQCExgU4lW$R?gnl$%6DwMbA4u#S z3iN#J4|4T;7{ILSELY~$kC;sytjsO$%D&-?{mIH63MVo0`xxHa8VdYp%fnR@isKen z0C-ZF<(S^nbqKky?v}J}_Fts+tXv<$!B{x{JV}XwypRwMU==4!Ac=-7+~eKRm=bjR zel`_ewU=dS!el9K`Ax0DBWZX(9!;%pUqa@RK90Y=V{E$ddi~f9$h#S_V9{<2;yo@IA09Y50}9YO=#I^o{B%@{Pyj})s>L8A!LUR ztws|v1kV7`EzO>igLUQiH$NZo9#704;zzu+$BuML7w{AEYwP!MAO`W~NYdl|!c`D) zZJHu!N?d0$^o>FH)6FtXlXiikcGh8JG9-q@S+*Uz?1McePDq70Ra{}=Sh>qB(&905g1dDa zn8FhCl`$Md*uXwMKhuo;IZE&R6rWgv?J}DWu?ICj@Ysp?CN1tEw?_!Xf;tpVBiJd{ zXZ6mNp!i?t>l)IwKEew=Lb7~Ia@uLE(jj0lg+u0wb2&Y)2%&pj(3Q0y+(>wyo$A^; zl#m-Uq@ctQBq7g7v{YgU+zsk5LgBS%9i{isy}`3wU4-Af@bKTjIgDa1QL2qEp1~>h zY~CCiaKLm}xh*(tjgp!MG1>BSmQ-W8l~=%tEn>Oh)m3CQ&XLb7PXIt-JFX4Rlz&^n zfpMV?2^>mD>MqCpkQ8+;^cVC}O`j@9__d+tv0C9&%Kr4$nFdXQuo>WWsiw9loN(2P z1=HyTdY7@X#%WWrcouMaItj}w+;r?ET79V{g8{3m#Q7Lc2a$q_?ZB`E*t;fr!dUJj 
z!@!w=tHvFsEW;Xe%RQ``u39O9VZXq@jUWFTHSC-piW+A2KUrUj7OO-1b~bhNYS8n`??wOI_28@Cmiu7GKwxT<)t7;zbGj`#@N(yXlS`P z7|S(vu$0HJjmt=*eS--v^YaWuvn2K4L#?V@7y}HsvYNMuX?EvoH|*aCX7^_s_up+@?8?)C7+61)rEaWg+MSW|1G3-KUQ;3=6fs}t)%Ma@NS^jl-O2q<< z^u+$dIUlf`@T^cf75nwO8NMWWpq=7%hb0~96H@Ya)-F;|FW#49By2})oL9X5!t1%R zf(V(b9`x5HQ%?N|7u`f0nXA^u6wC#%xru=xEpRzo5hrQaIrw&QO8eu=b}|pc$*0CI zd0R=mwcu6XjbNSC5UtmKz4q8ELE0_v!L3t5D$LcwGX0Lgv-F@}U5r814>N=6r5Fho ztW97t4+d5qgQG)%`4J}kaCt+o8FxZsg-Zs-2xZJTnq?S}MWb48{yW^o+*Nw25JlYa zSBn37XV=Bt0^ZiJKu)#OU)~z_kEx(p*#1Zx0Qi^le@og|e~T8WV(4|m59K$FcYcW$7A43K%6gN4gk+5(0 z>@xTgE_wONp)4|y9Gg!443hNf+rgPtu~q>xeW$9xb_D+9s^S+|CX!$*ctE@IxwJSc zpT?#1h3rtPnI1D(Owp_)J$`!W_{qLNK9of{2CbzW%_QR!zHx^>1ODfUc${QQubhXy zv}Y$u`{GyUAW~hw^%PFdFbWiwV@98%gd9fnqPY}(j3nIH2ce*=B_7-uaRUeo!VY%Y zWhXqWXGHpP9!e{@D={n?R7sa>D(HtcX>mtyB=hcMTza}0ES(*whu$S{x2qLy`3^+` zoL(xv&0Bk^>{&meu`CG9m_XA^sa(k{NnuOj!ZYnHK8zSjtoSgFzVDQCWnNWwx|DGzQ(WkW>gN}$dJ0QhV^R0jD}KgEDxMwCoZwt_*on-DzP|mm z6J8iy6_{rRiJ{7aK>4tWb~IPr&HQ;mQ`G|~>*rs15 z56|;W%u`A4o-QfD*t6pU-E!~U%oD{Ua(usg-^VTTEsNjmpfGnW`boXL*gvmUSx~Sf zv<-cGB4XwRSYPzB#!OlIZEa|kRzHQr_8LV=A(U-#XP7tq)(nr-gqDb=&@E)ty(6Sqm6(BIVataUaf(p! z&NtyRSQdTQL!)XI-}5Hr4ZMUtmCO@5kh!Y+eNb|*d?1UCWHB;ppok}x;?#;ij<=Jj zP?2#dHn2?Qg6Ir<{!ItvGxQvkLLcNh=I!n5l)#nzh`@1)GWJtQz;iKOjNkYRZ9`;? z(HJ&eLxs9>v@EvjJ_(f=ibJRxo>29^Q^T3rIeaXHp$UiWW{lUyG!4u!*6pRw)NVd+ z>NRE*RaZ&YOMMX;*tv&P^?ELGOEVK)=|!WChxm{wc8z%!e{X0d=FoMf+%JhCs& z4b49u8MWef=Y&p6NXR}ZZ-dF+KGN&n?uVb8^5GzyZv6rs{@gtGw}AdwS@SQD3&5fU zBLY;YS$!CSeUCw03KBE4+z(qNa7hUD-qw+QYWDWult`E=h>}XE;Y%6l*{?@?1qKV0 z@OVcXK!ry#OS|UORI!!=FA5ZwjD&`!pKML>40It~v)YuXF$*$i5sv~3^4MzY@YmoK`(?=A&zI$-ZX}pm9og=TU2e5j&@>NX-h1N^<9Xs@i+Lq!=H|o~Mugj=M+O z!7(OT=|3LPn%KN8h_f>M^CqC0OM}&$(S(N^Dz0-f-9v!!K&_zpcka5`1g|(%|Lvuv ztO)~;O#;{ZEBXaw=NaV14yu#}I^s(52BR4+&kM^St}?h}}x`^H^99n=r)uDlo-#wf_OT>yF@8h{EMD1Exl? 
z%++#S)Leb(tvk70F=uB>c(S1u(Z^^22v(OvLdOS9Qz_?eOLI0!i%yJfAwoJnb+_z){ zkh3o_km!*13-7%hFCE}p+W~72%Xsg-rG&qDyvpt;`#}>bc?Ls0_+B9F_rOWCSYxJf zwfO>{TT?)qZ=guO>J*2pO<&eRCDcl+LcYrFHriwx!&MlC_>>UI*VH^YYEp3KYe#dL zN6M9f-N|Kr<g$e zoW;_`UGSZ^092<;yhsO!t>H+U{-AK(u1r~6Ql-dn$R^E1lJjk zD~s(16m5>sGP>fvTk`D`b&z+HpBk)H-2W~c8`p=hTV{?w_{n++llH-cNW+)BgSgFM zF=N_m;On+Y_=Me0kBBTHs*OkLQm`{&S{`BO;^o1; zZmrm{jN7DWQj}1m=aVoTx!!Wb6c_djV?^a;gKTR(~89+2PTv$6Z z<7+vm0st~&#R$K;G(D#1*LIFzvXt>6xy*xLrBhDtD!yEId*0HJBid>_3p2x0>-y3S zzneGXqluSsmu-vF7JHC1tK)^*Y$GkB8coua-M#A7k<6oKxe{ebho zEG*}T?xcVF$pac*1B8H@4d6{&BrOUyl~i7>wfwL(HC(hdN_+tO_AZ1{-Kb24!A;8z zj(8(3pbO@z$>wEH_LCL1l9v(O#m3^P^HdHe8n%5gZl9pU(B2MvP{~5}rfXVO$soy> z>Tz#1Ikj~It2ngHQ7kzVR85Kw=o7}B{s}F{v-IUn55So+xxCW;8;M(f1+CP7{^A_w z*%#s%ZHFw3LtGB_tUMm%KFOk8K>tF?mb4{chcq2A_lTAcJ2TPv%+Pfv0b|9*#!bE> z%<`2FkN-ALd`c+eJf$v^(+%4){vv=#yYN%2BQs^K6S@f2Hr}PMJtO0o5*kTOcS%g6 zX}b#nr@}k6pIC$IU;M<_J_aW+|5?}RB}h3B5W$9qG1Qza;Dt$`M%qADN;Cn z7ec%vV|BdD=GH1TsmeP@8Y)7x?+=RY`L!Ubv7^W7xDF_8p{v_kjz1gj@-VjjF7vAc z)A;&Kzq`O$a7EuBxgH3ax@~PQ57|!Q7|XF4LJSA^Nd=6f*_?vzip&VXsy)s-Bg@Ie zYK-btUwVNoc$K_GQt?xMygMiOFH4DcWqGzDL7!VaAWqiwD1=P;on@)& z{Q*+$*?32F!f7UBE|$}-#3y0`)al+VyB#e5d(@No5%LrL>)zD2-|e%4bXFv;Ybf7t z$#bo;t@~(T>vMq(&g*@F15<4OVxY_RG0_S0zucJ;Wd1Sc|HqvPLowx1GVhi&B{KxD zlGrSJoUm!RqI0b5aZHx+SZ;B7d>B-792}6p$fBmm*zftBUrE6N6=9zeqZ*|31yZ3y4uR&iSuW@8?_Qk2n6?79*hQqdg&VIZM# zB%zIkm1cWncY5QC6}|US=K*()b}s1(sSQ$TOk`d}P5U{Pr{@$qvUtX%6l&5#@Xsch zOS6&CBih+B5m|zFq?AG|ZhTdhZG=eUMx$HuRuZF-Iq7e^>VG4 z5Em$pQp8KrOt!wHa`#Fh8W0ND7g|pj7l|)b;aFXnl|rgwx!O^2NcnuyuSA78=!*ef z9fXSj2sixR_6dl>D!d3B@Xs}UM(>Q^m6UM&iq%PIp*Yv1a(gFoJE(*z4*~96IH+dG z{c)q@2$M5{F>gSp9dS_DSWWyY=IMkN6aCE8U%ozi{1AsY)Rfo<(wya$+y9&ojpF!3 z9e~+E{KG{!Xq`As6x1r+^aq-*+Py4WGbsz8l`-Tt&3k6s3mUgA?Ha?wbS;fLCvYmj z8~C~IwJQs&i&U z+gSO{*|h5k)shiZGnITwqN8YN(3Pd`@Q)o7%>^n>+-aE(4u9+2p|7KuGNyB;@p7c| zim$prbn(FDVFC9Eh3=-3!s(?55w^zEpGHB=JTT_ZRhEEb+$MikhEqXZvAWBJ)^d45Hx%(See=Qp5RRw6N`juu0v= zJ?h`Ze&ffUAI~?%0#C|Olg7pi^0^66#WDU)vX;j~6S$cIVO2(`PDuAS@(qK!<>%Qb 
z3O}941`>(mMlgm?PAUQ3!)HT-Wbne_{x7|9kG<=JzTD?j%iY_JMhrSZ#RiTj7d?^J zV91*RT&MkSh#Wgd=?FDd7$6wy8~f1?E!a5eX&meDw0*D_`VcVQA{VnE;2D5l>iEUb zGN`%-dod7o71}<+FyKB-zPKAOCN{`ec0I~Op({&|92!9q_(YQfFEU}w6~@h(iz%q{ zHN^YJ8qEZ>hF4UYv&GJ^CJ$_RxyP*3YJ)eG#e5FiTP`6BhGn$(PitojRDYXJ6ratI zxVxiANbRak%fCbn=ZA8g|GV7>HqH-Ko6LV2!>A?5Bn|>{ihrg%6iF-cS5xTKFxNf^ z*xCO~`$397dH~Ybs9u38t;1T9FLZdYyFpwFx$sWTUh@z5(Y@7{`=qoiUY?Kpz9e=@|4U$02!4v@f&BQr-?}A zg5f`Px(l}oWK0gKyn)XM5_byEZ`@OiFX-;(d37LU6pM4LhPmJ?Wj`!ttQsHlmQI`O z>&f2pIoc0$G2uDC@j9W`o(=U1x+PMf34%OFY(@f2{T}wy(?5i>?eaklR^W`@&%x?wp1>8LBd* z8##7?Xp&J5F0lw>q@n-Vs<;}{n<8o%qXFVl9`LzXNd)#YSSr++Y`Jdz!uJ@$xQc$* zTWC9aad@@EXSBtUD)%ChPDEY3Z>c@l(NyB*Fk96A{M5fQOP6ZVytkRVsUhaYC#KR3 zQc{vM4XpLp&hQa+!Ec`tb(j1IwuTw%ulxUI3c>lYV(?GZrCx%}KY$6KEfWYU%52c@ zIcxok81QcU^F@ev)_Mq+z@%A42Y{+wrRSs{a#D0b>4G9kzXFf9jucS;(96CK7UKqE zko?_u62kr{L=9UXAU?AfJ5-oApb-ArZ(~y94E$@-uZ6*Ke5*`HI;E+_*}h+Uiom8@ zL&ZzceDXWvjV;CrRM}CMA_T+|4^#0Rr%;1wnt}#(?&kN9v9RMsUCmM3(=yW2yd|&2 z&nCaPNm4^qC3aJug;wRM)GZgzjz{20+Eio~AAc?}4Hf+&>FqdudR+u3@aSidA=gy; z+lBa{ZYI;e=q<5AF#o}oa{6H5EHhPIa+S@EF1xc|Gp7++JdVBdBB5X}hX`DDy3p`l zD2-H77>$36~(1yP^ebOwCNYKQt(D5*WVPU;S1F=}1Py5}6Do)Ag*M+=r z9r4RA7%495s))e)iUI3?KQ_n5CVu>De8NBeSiy{dI>1LP_1qXNc-TKUzJyVd_F2(N zz+Y+twLF-T5%#UiUAR;kBhTdNc_?;I!)~L($?WZGh8QFbjyM@S?d>F#Ax2B=7%W#( zq2|NL+u*#-kKtW=9rbi=wh4VD#=~wCcdf}Ge0k@tp2DMUY-n>;jr%JtH_Xo*-@1)J z8o&3@$dDTemLBPM-=Q#WrQ&HvJUMM3Xx^4SF0(S)?@harm_tWhfT!#0_0k09nTpLN z<5NB{V3btvGl$n~=CO)UIe?&IWRXx9xe&|aE{!OOO!SjD<8n_$kPIsfM3~GDr_~?Vq_8qgJ4#up)@;l9|TDAiW?g#wP7~ zLqQT%1N#F%po&F^LaE-ex}0J}=-R_*{Y0XE#f?GCfE+Fe6_B|FVK?=m$u8zEDOw)| zS0qP2LZdG_#0cqE+dl>PkwPO5QMZq2(T!IqImUfEGGEGY_mYo&8*Qpy+Gqv9mo8|< zmY$2o>Vzs&2+k)NOEv-zkN5_A7(8`+e__9KFn{Q-Vd40f&f&jet{1dtLEuIEZi*Oj z0sw%8a3ak-mqy)}DjE5Gipy+_z153hrDB%OfV)7Lev8g86vKW)jW$hJovMx9znC{v zYqXg13&L823gcrY&^PC#N70Rd)zCJa#R0ZcRvBW+#C_N!GYH9h*x*gIxE-luvM=06 z5RR?)pZwvJT(kV-7k6(lt%{VB?##Qp1aoDC8A9rqpEtt|`{E{3xcux3P2q0UL`D6=AAvSO&3e5bcb0<(}pHubR$ 
zy=>n5Nepe?I;Ojtue1grpKZAfFkgJ@W76?*k!ic(7{C@vegUN@M;8ukt*ni!ug!eY zC&?Xjd4=0zjZSzJw?wSQd+Ek(Nnl!g5}(S*G9HC0TMd8@C9FbIV5>6P^v?N3k4ZR*7u{=_5rtCeIck&$lC9HMPi!!?JJ?Q13en*Pq zj54g|XvGpx{j5w#Xl{OmtC5Mq2=T4Q_aqNfB!Cr(% z8x(g8s|OlUFwB;>8|L7uSNw`2?T9-ByulhW@7{|K0WV%863!?)&#_)Xv|pjbM;TOT zqk1_Mh2oi`B?xU*z6ml1E1J?evOp|@;=e%7y693Y3m@#ftx`GsxQ88yaQ{n2#=-I- zYAEx+fHWl;`$0jdwI5VI0g|9_xmibQY8ajmTZ$QkN5be` zrs?#!!~qYrFt~NKeM(nWj{P<*G*~U6u)WT-N%l;d@_~wpZ)VlkET+no<7KV+^QL+? z&@S~F%b?O$am;=_=XJUEc5_gk=X5yjs;s54XPDV-J_p#=eF6`)!|3!A;1T4~~8Z?AlQrK!0`IShrVw z<~QcBx13p8QM428Y_+f`Ujq-)%oBldy3oXukV&%1&W`l)XVUF@%6)ZHIIY{-!?^!# zk@}ZAm*qoI%*y&l{}up9Rs&?;|D9kev-krr!E{;-5XFklTMdYm)>4iFA>sx}w@8%3qdN@j9YN~kS~(Bo+fR!O z{9L>CVHZ}Zt!3UJX=77|l$rHBnnW>aVJ)W%L)0ZOl$szKj8sCfy~d+UXFY5hH*i@G z_-2j{MB2+slq+zB9+#r7?)aqu-6cq+z;NYMSj#laQ&de+is94b@PIhDV2~)|fPQNX zM|ierYP28n&;^EXnBMf?SGI^Lf4VoRmn(hYCV0g48x|1dXpH(x2(W(4LWb*42Q+|! z17N)kklZ{2(62Qra^nL*W1>FsyIb5BhTr`M-=kWzu*qXZWGK3qyX zIcBNjL{ro7c3G+ZX6lt>0?MH8Kt^YUB*TX1{i15zS!D~QqZP<`(qhg!1oHUI6t8(X zmgulRji1xQ~3D|4lCKJzSHyI6*7d^J+T35>zX+^YT5RVS-^pA#_vzvdy# zPXtoEwVV?EEp=MZxV~B^_tW)%=Xn2*MedQV6qL zU;A`jyk0?Mbfo#18Z? z%WvB{+j_{K5qJZ{0(DN-kr2`OQtp1a^SH+&-8!p0s=vHZaUwRh!-&pn91!smlwG%! 
z2+T9#n-@qT--hj@Y`#5(zTx`Cz3;Uy|Ap7d!SW$l#=oaHB}&*xvkK__MPqtA z0YiyhFS;k+W6XGPo+GLwApg83$v}b;M45KidRNyYY z*vfsBziAwB9Ju&oj5}?#T{#1E(l(l9ipBuPsi>i}|G7o5#BTo_8CNd9?3k%z|1I-#_O_25g&o-z+Pa&PU zzT>qwQgyWmf{spYh4dAZ-4}{`Wtf_97F9c@G;QkT@)XrL#VQYP9|X=7YEnk89scda zOH8##BOYCp2o~K-C(bp67tr3Q1iil;Z{`ne)BoMB6$i`566Aj&vjE?B05ba@v_UdO zkZ{_QnJp#c=xxm^IGdBE6PbX6Zld$^X_0swW>6|+>kMKuQwY9F^joGJ9k0ADs3tVQ z#7*JoE<+U~WR(!A3l}vE8l#*~4MeI8cghwT1@?jYQbG29d9LmLg*y{ z4ipnnyuoti(AWg-;NSvq1$1}|b>AVFFLo`w2c#|j%;j@p|e zF%3+keuDLoJ!QTi(zcGkk`S#Gmh|Wmds{|uDkxS>$QB^nq?*j)<_WW5%(~*PEgW@& zBDg}j806CEf5aa|3wi*KAZcTX1yci*X!f=wV;3N2C`Kv z;V({49IPKolK;iLms6Gbz6MC{&QJmFjG_?)C%h-Fn|+<{O+YF8o+$mu@PUL_=bbVb zF0G5W-`U9Y-Yoks+4>F)RM-SNDsOE=j^+J3_ZJYd=s)AQ{f6B@s|CTIR)-{g0fm3p zfA2f#&=9%jN+aB0lZbTSbt8F@*xmS&nZYagYH$5EwD zrB0~G0jkjo88q#Xb{V7MO5e%I6-k?1Ae>L-|M?N3PD|l~%`YE`vm7*&m7%`1LvU5jjB7B#GSA+$4XR_$@Q;O3Cm|3Iw zy4;aozuYrOTsBHj7FVj_>1|ol$tnbPNilGBL(Zm%8726eF~OZBucO7+efh&eECa_A zV{Q=_hHN5v=PPbYCP6mvJ5oy2{ItNKXwcwuN{Fe~vv@p!_pO4*Sd~AdDbMoP{jnkl zFBG#nW(^^+x(8^F9+ay&_4AAEFz^&;#6T(;*+ooYyGj%9+D*$gur|AOqO5#o_K}k? z=igY`hAW9#mY&jbFGCR9;qS;MOF<`&@}lq$HMrIdQgdp5In=BldN5d+{#5h= z=+A&DPyj{mv!MBeanGex>72;3|63RFiI-#*6Jc=e1Q*jzV4Dk|j5H3OriCh@aOXiw=cFgcdIYWymRLrx1-3_d7CTV}7B1jQRV9fr>l2nN zGD*c{O+lSlyMtWpOFid?5Al7$c6Q?TCdWrERkJU68~VPl)%l-qxcPyDKD){NZSgN*%l znj5)pJ;-B40I4yn6yMHzj3nGmRHQLGi&!1h`56~VsC;-|`rDVEfv3}SwTNS+nF^97 zED880sv?$*{g`>0k>bkk3ma&wSCiUL#ylL(c~-Kl@n5@clJsk zdkpfMwNlgf>PC?saahSF=3(3~@c6OR*Pg7Z+)OIC3yRH%B%DWv5MLd+M`Ja-Wyfg5 z6U$a)j+&}3-!z`(CdYHGeJSy2220*aE|*lWDpv0=Qoi?;Jp2aLy^wgFzQFVgf@~Um zZVUNE^BZ`W)7ttkE6V&q)b7s;cbW+SkUoG_oE#^SlKfu$m#U2B>!o9}WROVB4nZ_~@JsVh=VA2tkuI&}!A{vFZ z%7&d=T{-pK%gO`T9^oSokXZ0@J1_2SZM6uyGvRsS5w;x*R44>#QQVHjFjp508LkwU zTZGlveT~7av-MpHArc``Cyu)!rwR(wUqdH-5NBI)UKoa3dIZWKVg+NzjG1R8>Cux! 
zKs)cC+8cLt8`zbBJA;nk46P-!TKiBDtGDMXe+zSH{CWsg?=%_r_c*x9@h0YSWNL`~ z8B`HX$aJPXlR%1x+JUHgR7C94E8@|FdpYin$2me7e4Ho%u0<}l2{vjv@2a&hPL!E$c7a z#{408gqiD)Z4*--wFAgcU)o{V8U4-l-1%_8JZ)hlABaAqkD3??L5fA1xX8-P7g;dh8{FSyB3 zD1)#;w8f#zbAUT85_=_QNiHF;UIUtdNIJ*GdQK3nC}Qts~4X7x)uHU5M^_HSi>D!@?bPEGPvB*cQiPGU@3q!X$f73Uo z44_wsomWSIDQ&{oW3I~fmh8p~y2)J}xCtH8ar7+zFmRN;c6kNSbaDR6SS}8hkF}4O z{aR#?`dh>lI=z%1g8Sw=u`i0aAP)1r^Y72_;ngOU=N9rl`;pCC|LXr;3o_($_Pe1!*GT_|8AE zO7qBr9iAosazC^(n5-My!?6_((nqR^aGdhxjo~3X_<=k!WJw*>7lVJ=r-rc1lXeHa zO-obk@4w+IyrL!F*9+%tOk1|rJI1@mq`S`8Ga#LfQ%xL!@pOgXF`duuN(Z*o_$na~ zMir9!J;2C(Ipjx3q*Knqt! zAG*p||GmloFGyuRdRZ<+5_b1qBbVyU>>~9kR5T&E0AMqnNTv}s#-1eoR7kRzOk*?p z5DwMD2<-xtjjVDIx$2C#A;x*YM+-DpvaApEZMW+{+p=5Yg8$0bupIa~=RFUF+XzKG z%0d_;wDmVkM;ICBnx=(ana%uUsZoRtE8~nuVZt_<5=YZpYBQngq+>cTW=mm6QeovW zRNoY)?ZU1N5{5jg-S#{Lv~eC%JtF1=E*(Gegb3Ab z(`br05K)t=wd5LGoy!Sgr#ocz6?)u0zC3+hesaX!jF?jTvah(d^a%XupCA2~O=kU= z*^i3XUHWEX_GdMTB7){t@L8F>Pd4-Jq+Fk zcPH{!Iql+Wj<(vS_A$UmE+;J8YXVh)-U*=6EsuU41$HnonYQzyUJX~hPZj<>l(%*_ zpB2o<3_2o~#n*iU96@ep%PDU!xX&>Np@D@Wf_r;scOFBN5EDQIU>)Q3B9c$wN3*!s zuskrh8rdxcy)8AsIxEvTb{${@ocUcJV6rbg*X3YaalgJxGY?Mcy{suS|wzFR>=}T1z&WfFf~`WNV~c2x8^>p zGMh9O=idwQ&&y=MoZ4)=F$-`w+p9ObgL?wlz4|i^&H}qftaLmX#GxD8l=$Fp1ROCl zE$P?*Z&nCzjz4(5@f(HfkdBC;)N+Dw+A&;HVqNqWkkbPEh@Rym1lt+i@9q2>dygkA z%UT@#p&eng69AF|1FEJ>Qo6-S+QipB5o1eq#mg8eW*JG=vFynkN~c3HB>5hk*EnQQ z-|A`D80iA#{QK2zHanD(WO z1-WdiS<$k@Z_5N*FE=J!9OCMmP`bmh4UKBfV`oMe8Foo6Dz$t6> zM*P|BA+pa#*m=!$n`$fu*Gzg>NL;V89CaqascMS9Y5)f;XR@v_L6Il*4GGDmc>
9b4U*I~nH1@Zd+{biCmX8&n>zJ=}su zTc-d1+m?Q)R%2oPgBtD zW&(Y^JeFElL$DfGW>`n0`I?8r&(t2Ys{o#DY9@GhfU@tf#9PVKf_8(T<+sLlWKgqc zrUvWvr`;4QYky#+9^9Bq3g@0vqH^$wnCCn|cB^1+D2x=_Wvn%w8CKQbk#Z-## z?nP~;bpJ8Rmx?taOhdw^Qn;$DXnYU37OpO}pFarJvdunOv~2A;HUX4fM9@+8f7v3| zk5x{8PPCR&>2(5>(=XmIwA(xh1)|nDcV50lpsp_<%FgFB;ZlKwhB`Lz@FY%Y9953{ z17UgzKFKEAFWSHJ{y*m4GN{gGOB==^c!1#U?(XjH?(R+?xCVE3w;;hixO;F29^Bn| zch1awGkFg66+h4aUA3!vb@#pcT9@pK)`poUzji6TwghvehAHKmVg-E-T6V(!wsFFA z*3^=9Ek1rlA^xd-1`T;i+}x^qB8~`k^QOoVPaekXLtLvN^G#nEXf`B#JfqHah^$7N z9r?v7=#Ge`REoo#B($1^L*9a|vnb^lDHW&N_fE*4-wOy>@a1=3{iD`~!N^mV**zy9 zu#Uuq%u8L8A1X;?R55<#D{(r6T^|>>$+$5PkI5jg5KnWYX*c;1dTJ|Sfn`IMwfR95 zWuSd%{$0C%**R6{WO)lSjU`%3zg@~A76HB%S3Aii3PZmsaH_aETFOQo6KUzmQc%%z z(_}f9-jooMm@mkS5e9ZkN=vc9Aq9^gzcrnfHrVsX+3iVZ43;8Yb2z{t^NU%CQZXbx zR6pNyNbT*RYi*suk`J$LWhq7dWi=+(g!{zYw(9D{BL- zxR`u2N>FMSy(o;rs6F%dUa7s$mVO9uv0}8t?6hQtn3`-t3oEfx4hk2Z)9E7%W-bv# zb+2VEXFJ18GAO@)DGV{K=N(*Pw-$IefgO}L=Ov{q?2NbN&-v`-vHiLNr$Hn8@>Ut% zb*vcw!Hh&P%ez3~v_;EX^eV?IwZ;+AJ+a!Q~eOr76PTg#?a@f`sCs5k{2hGRbC{elukQluo z9YQ<%&-LD&WCVPRgj{LsXI=;KC-aEj4x(Z5p$*{SMlJg~AqYG~gZ?;N>7HnfMusm) z^ECL(BJ;Pb{aV`otb=K!C}m}u^1Dt{F-QN3VSh45E%TNgD)P3FbmbdU&N@;U_%>Q9 zf$>9mFq*4-00A72F>xO5A<*Wn55zl7i`f5ukH(Gs0RV(fm4YzQJL9gM00;nw^}~%YrhZ1P_{g|%#E5b;nS&GaJNFn(uOFg!oq?iSB z4%JqeWOJV4t5ZKYf`MQ|n`T^tZUe%~BAFZ20%Wl+#$ii?n1tdSrRQ|bCAIbSNCQ+* zgvZ7QvWHw(>O7~ui_gLAetaR6f;8@aq{W)n7TYlUDC`HTjaG`OV>QA+66w-ZduxnS zY_~6psfFp!ejdw#ndz&e6f=Z_dxzFH@2A6#v=fs#bY|L+v$(a*-3Xn>!w8uoT_D}f zpiFvO-ax@|FiorZE_Uwa2LC-GB9EILz z0s&D`Cf;+K)q)aaL>D5@HJlNpMtQ+6GRs^@KEyH|fzJ z<|pIR5BqypbWHh;rz5qL5ZOztWyG}MC)GIBqn8_W@1{`Ho%kzj$rX@RD(!~8M$I7B zLZeW#TQgNIQs*DzlN`6*u#e{w+EtUe=w`lj+Ae*tbg4-Zug;|r`IWhbh`SR`8^dDEs89S3#i%rX33Z3pi3fdB$Re;loLM$EYy1NO7mD0!G&W@@Do;Hsa;{Hw6fvJnO z&4UxSCbqWoRSv>yPloT-pKJEttHb`j2*rO$r*!~UO+EOCZI^$-nmc5k+(E8g&jSc7;Jn zN}_&5u`e>9#zS~!xXha5oSaPK{zA%|{8b*UgR7CDYZIlutKivJS~(FcH^F%CnD6-J zXwT!a0vY+yEbZIuvjCE6Al_jskAVy2-F~jIcF}yD%a)Z4Q?W4Ih=yc+eZuAEc5PS( 
zmx@!gl#fny|GsQ&?~7^v58l)a@7vV$e<6P=<5KXx^iW`vJjLrEr1HErr$!Os9C=_GW1P|vD&wLB~ide7WaJEar{pARdeQlB^?(%|9B|Z{D8jI^oPzx^;*7_*@_>>9+(){{wM#1=L ziE83<+cOSF9<%w_w2Ar}w*$}!VmRd9`HlA9Fm4@)%MaH5NUgW)C{|+)y{wJDO%Nj@ zrR4JO5^jcTZL1wF+@D*cd~!4#WeZcoM<%`-+oG74nr|rP{&r&4j+ZHVWE1{qf3XuuC`bi$|GkpG5?238e8b?zU%Tyevv7<*X%5MquNu;T(oD@Gsg=#wVl%`wLFMoAty((qB5aGJhDTxkcf!ceY;f z_C0aLR+>+drpJYETyQBFW)f-qo^Ebb@Mlfe(#KJgrgW<5!~?^5?Hz}w_|!9fkR8o` zFLbf%ha^pP77+4UA4eSUlcelbWFeqwX1lNYu@ouRmqID7M%8(~5rO&l={cS=-f-_* zC+A)5{yz-SL=>fL!T+1)gXA;lUR1WadHu4ShSBc23ubSPY+!15QSd9tIj=Oe!?+&& zm|d2y9DH^%$8yD;UWF0}OQ-x`vH=<03JA6llb<3ChrERe=j!AvCx1`!M|e;xb?)tf zaVIrD-;Ezb1c^xQ=gJFtKvd~?T?k(sLmscNOcIY^48|K7mOb#~{M_M~(2IzenqquelK*aVCZ1AA7mg%R7G5qqf(#Gkn1MIHy|2G7 zv;Jf5<6S3)0O%MV0N96im4>NFiJ9L0GQ^!lG!Q#++)3~o*v9QUud41LARK7GKJYEl z55xOGcdq&lr+U!PKr-zDHFFriH{^YjrKN=-l<1mO2f;vZP^1i ztt5SGUdQmh)|%-r2?ZeH<6oHQpLBJ_JK|1vT`T5rw%fI+%mZ)5s^L$atg_A(BvOJV zromUT9JG3zj;Yei7j_x>Xqh;SJXv;0ZT(J$l&7%{Kum>(Bd}-n4WsQKUf{#cayeBf zby_OZkzUU-%ex`b-KSFqKMWinzUl2H*<>RqcV?0f$5uPtIu?iM)Ey|6u)EqW)a8|R zgQZ20Pqb;lA~VV6qh}#_M$7Y8q0qtap{|V^i%+as;8zO)@1-d+R0(qSZD}5f%~$s` zLw@K^S&HJ#*U_PjE{uaB;GhM%LKkfCqJ#1ezhpOT2+p>FU0lFx3TY0AUAb_5Xw=4A zfQ3#Q&y-D|FGwehe=vURd=TB^LG&*Tm}+8!5$kGSs$7Pct$U0p%Dcgg)F->$?_(-Z zija%%E0d8@)VK{*;g&nTcK1F#cYDHk1-|$7?tkN?Gro&4#=`g)qzRA%)L;WnIv{i< zr+jq&j$5LzCdKX5hIq$BQG>-M07zo4z0vxdNbbwX7}dabapeisghO)G>>`xA6KEHY zBVlY64d2{iRISr?y)5RXwx)v@b3y)gDevIpdi&hzZ}6Pnq4^8t;BRb?Xq-92D%3!F zzHB=E;cC) zQs}S*TD7+m$&IT6Eyi#%vDw(>nQt5E01=tAEm~CMpxD%N$rL<&WT$ zr;!vgcLGg$_azx`@ppePR}&uqM;E*)I`ox*$>k@Z6IiG82(HoD0Y{U;E7p#RRrQ+f10_*O#E>P>i;j7Ah>d!U$|d~ER8mEeH51=`O#@4!2Lg;9m6y&lJf9-_l&aj zu+{BnaaC|nx|s<_EoIEGcJyXZe75TVBt{o_}Lcr~-~OFu5PJtj0?;NzCQSxShBbC2*;B z-mDSG-M9FHWk5`u1mmnNC%&++3~aR=mXq}A_{St@Go7n@SY2bzUG<~u7|8B%b7)5K z?FGYy`!1oq4q3;HUwURXVWY@qi#U_rudS=5+*X?JoM_mEETmLZfzzMY>R4FIHt)gV z!@WrVeJz>aXN=4AcN?n{J7OC^51`$91|{zkZhjmT5)=;%>CHn(yaN5oQ;-z)J)(B~ z^zdA`bOwAcVrTXIj`Lefi_&z>Svo+k8d8pmGRru(Gm}sD&fFsklNK^%Q 
zW0C6THrjA)JWN+=?cOVCtHM#AyGSpeX>{g6!kfyoL0ynjs;@SP07*U__L0?Si@orm zll|h_KiBRL+Cp{+TS3gm9^l%9;JEaRib3f8;3(rMY#S;y@WFjzPiF5M^u$tP*5Rl_ z8shvQb8S~Dg$3q}ae;+=d|@vQRP13$49s8 zMZ+@1uFL)FZ&N{Qbh-rTx^Gl~^LG-YJw;Vxqz9_H#ya+0@eTUDD zI3Y>A#9-MG$D`7p$0g!Dr(;Kzbx{QIxRnE<&I(R*mG;#4kZqvIbX~5q14vp?uP`&9 zw}~D4zkQN0#5c`3UA@KB6GDPBS!f^XledL?HKxT+z3;qeMNfuS-PDgus);~HK*gez zKZf}YS0VQ~mKLdJkT9KxXE|WQiyqQq!0RgrGnKinZIGcM>%ppT_c>?y>pEylrOQTGe+w&zKng{`AC2 zMEy&Z{jXJ(b8yVEtoIQ3<+Hpd5#$fPxv}xZ->O1+2opyqAQ($z5&^9uGir{rJ-pKq zObyADAN@N*FU-$!dmM{HXH?ryb%h(%`iziOq?-FzVU~_b8n% zV%r2_D#UiQiAc_xUu}ebWBb5pcZSl^eL*Pu?SZ(*$t` zZxn&;eXqcO;8aQiae(hXfcxv2Qx3}tu?onWhHroUAzDMml&Vw?3`1Z9AdDwtI2IPo z9K`8(TU-`8_Q~H{Fl=`2@<+2}d_uAXq9Uolvu~v4JF_UHQ#T5TOZ$Kb>cWCi9)&KR z0ZsbA*%Vtxn4!q?LAquKcu(JZ{iot1)vr=`Fvy%w2T-+2=hV;s<-GDjUnDz*KG***jQnUZT}OYUW5(O> zSYRT4EcqU~aDf!=61a-T8;9kHmiCIhz22#O&X?9faf!7#3ez0MNlqZ>UJ9%5$%jRH#H6i2bNhF0ZG0uI6KgH?kSJTTd&H#~Z+4*6p3~*jRGlb=JOtujpKBC)_1u1x09RF-&I9xo^6_88((P5LOnb!xmki*ctkA zvp$ot+-h#2>7xf~xdkrvDKFDej>^C}llC~ka2_IbWVo7&o7*%5EC9=GvwGFOf%F3 zW)Q(i%$3mXHlpt94DoJ6vRt0rE@y3g_^sz0z%eQmxPWw^PHf z)8(IU0^>FEpf}BB_bE9At6$31RVt|kSx_2S)E7C#nh0XgviZ^zyH&I1>^D+E zs^m!Qtw-{cBY&H|3@puFm_b`Ubl;x>tRWK{apF_{hQ!t@e}jAiEe!O3Q!Zt{&Y!S?U8x)!-{yBRTYF~zKlA!PNd(4IsS$;dI+cZn6NGp^pPJx-j zxx;zQOP}Oo_$C=b#C#B7_mw&8c>J|>T5aB3^5og$yN=6zo&!$fabs6vsYkEHY~Nk8 z>83FyOD;|tmhQYoey+4;uk#uBtJG87#a21lx}!sf#5tlIDA4QmU4cc~4^>A($Onh| zn-_42COq1p;z}GuT=V50P_wvZVEa zP@5CQZ!`;If&MP=7jQsB@9}^<#ll}7{^Zwhe0CD^ZLEu9t%YIFP&G6iq2x7zAu`P87?w~ z1G7PZ=z)}|L0nY|5uNm~5V+_pJu|^9M`n~%O@tN0FN*?)7+Y?(xz%x2;}v^I=I!93 zrw`gGFl#D$pY=L;pF>*rro5MtTeuY8Fzse$eHZWR-@GR{Yv!+brk6YWtrj%xt z6)63IJ!g{GV}L0x9rSY_1%_2SJ;B4f_GjzQy`$$t7t^(jGC5JcK(93p@kq1fuzlui z22iUwkmuHls7uUPHj3%ASiZ%NsjhSq%qDiq6<_wC8sjpBh8Eam$D7uIvnmyE3gvTA zC04M9jxn}v8=^AKAWRZC@3Fnxx$AN#&S8F;=?T7dI^NY|F#aRcDRv|hK+Y0AE%S#% z$EWZv@Th@uw14<ck)C{97jbH95$){s4C zi#1CbiRsKo#ghV1zW8spL|EjngoT87=Y4p|2nMdj_U6Mo)`C zpoL1?Xgb=amPVy#Mh2ZiH0qhpzE%^Q^90_Ei5rV2{Z@@f&vldFeRuobY}PTJla&3+ 
z8_UY}u1kmUZ;Fvhtc)Yzc^UwCp3=sovcE`pX1T6917x{2Cgv2@>q1F59E0efpp|=C z5YuSX=M*ycvqZ3KHkFYb!A4R3>WGa$U#9{&Sq54|$sFKEn->nP$w`FwTX))*)zYur zVl{eTypnVYUWV@0M?LfCyXidWAra!>raaUz(TI&PW(bxoG0K`5;=g0VqzTZ)rn=TI z`c`7%e2Anxv20;fU1tH;p*oddASaXS$hX)qXfoI~C<$cK7E3>Q?)-^}iZC<0F6~H! zMv%4x8r?E?tW)5kcdbRoZsmhD>soaKij!ja)&_iEoCMQ9ZnwDqRhHX-!Ux{uW&$Y7 z0X}cciks}^ZD$rN`hd{}roh0Fw1w!$D=L|v`}T5x)b@O$&v7+Co^j~;x+&gi)vBX= z)`mDG(v-7S0-TfkNF}XT<2UAEX7GNSPWk)%ZPK3R{BUVMJLDJE%s=4K7i1Rn2q&d) z1pdICIY$<4;4X?GwU!Yy6x5zUJZ13XDSZS|PF69d?XinC`Ams>0P=$fkOQVXG9kHU z{~NEO!5!ADj;8L@?U5dOS?AX!8@Du2U6240HAYKc4W~IT`S}g{Nwbo^P4JyV?!7X< zBwaWSLiXddIWsZGoSQS_nl5k&+R!qW{-9-Lx7TjW^%Zco8SRj68@sP^W~aaXajP=5 z5nV=7FG1=2>LajD1}bsp1YeiQ_LK~m`7>Kqhf--Y#t_Sc@r0sJF$Vd@?M7x*_VkIez zK*@~Ts?}$nOZX?mongpLtZ8M#=g*;FFh1>~pC_dxrLRP(&v(j+x+n;~osn6tVgc29 zQg9u2_A`Sx;~aKa@TAL6HA~WlBhn%6^M;{eqG)hn>v;@nobxs;>O*m-Ulud7CGwu{ z*5-bB!m3Oe>|E{POce-ofP(?FCO@}Yl;u2N6~iZ0Mw`=C3{muk$+O(CZJB+uZXpXZ ziG6a}IjwK#77oMp#r9`J3`9~fR0MKuk-v4?hwSFSj*72VD_a%#gFS!+_0k#lCeX2f zEd11$)$>DHrU&8V34vLIs4@1u+t@Y=5JE^gE~(WXD<8o8ZbU~>iRUo;B2Rdj_dN!_ zpAt?)Ww$I6^mNKIEQxjXsJ}a5^D{f_3*%9~`$90*LKI*ak5`}Y;`Ri&m{E@NR@L7% zTwr4Vn=hcKWs41HY_pa2Kp?&q7hXz#ydr9UwHq2T{Rsk-au{nK7kAQbA{F@UCW(Vq z<3ftgEm^$kw=hIriTbP=Bu5}_ug&LA}z)pN+dmMq!v@%2I0eBmb3b=EK`ni@7H{8kHCbDT4h^vuq$A@?T@roP<|0`FenOjlR8rKkru1${bz zKYS`cNl7aIVQ$rN58@vqw(nED8PthXOL87iu|oG?Y$AsoK>!`^u*9&#AQ|6Ln2s-v z36W&TZoX@t+m?@)9ip+-wuZs&?t7(y4(1r4gcoWSr;fjyCQnoRBJCK%hXgOG^Hky) zHmv5)lAeWM1EK=Cax2oWB0atkDiLY3(+?HW3?4Wkq@@=gPMHsCE8@QoMv;@>6kME6 zdS`018!q+h^QHAFcb%PD-JQcTI3!jg+FNCOS5L^q@sBb}#1a2DMnnk9y8NZa$Pk|6 z^+(ipS>-{8F*ySi!6e{ya!wPiNs1%oIFcNtOAuWWGh(`>0Gv1Vy2Qjygg%45sAMKL;eO1%v~` z6=bkK+5rHa*2cK>!B__b_&b5+QnlDCWLHn3-h_J8JS?OHnHeRN18x0;q%f-d%qKST zAenN7!fKQ>Xj}>-3}bpAUjJJUs{&oO^?D1@^)Ktsfm)@XOZeIN?7DoLgDk`i%7T-N7`kjFe#QO2;HuJm~U0u@iY^A!hHx4Z9F>j z(d=`)&d+&nZx!@ics3L3-%Y+EAfWMo6nwCEJ?A5#+1M|Yn=p=N3}iap5m8RPIwpux5f-@@d2dZk#W+Z5lg1U97->9w&z zjVEK^$*11egLkYa-O*#|+elbMq1+nNMeT)%2n{@7iS|Z9&t6+Hlcl 
zZNWD6Stme8bLep`u#O_KHOhkdJ~n&W%Epb1f!hyf5tZG%S-^0KGuL_hX)YL)sRujz z3XMC^O%(R1_z99qGV2Yo7Bk!X;PQW1%PUI6{n3O2kXn2dYKZh(YHwvx_PN#lB zBzl#_rk=^CswFy~y(ZD4bFDq%PVJW#e`~EGzGDn>_|F z@klIJ6mBmFnO?h|@K1?>vp&qA<4}^2TG9_WPZLX2kWUm%e)XZDjqlnZ(DqC(K_PFL z0`?$-xk6pw;3Jf>=gb%ha}c-=W2??!%P`Nu@9nMc5}iR0$f+&d24vg)qCAPxS=%5D z^v;b?qx!Dq7kT=| zfqGvi9TWTC;Ay;+Ex=Jt7?34HhtH4F7d;-2pYq4CNO|ZN&KHBb)Vwe~D?lvWXo4!I z30xxV{ajC-RJi@UPlWb#O$e+CzpH}Jy${Y3OoO^3E+#0ubbqGf}P=>`&+zm;uFE5GJX^G z2Zyq5M5F}Z&}Jjd+!H1WauS#*p%;k`aQ!R@WO80WO3VT02+`+ryvU_WgCS=QKly|d z(LygR?x?usQU8qRDLTOMg8jo&xzzfN3bMY7@x=JIxhKFW9q`2eW5W?di^xASkTk(0 zG6nEOv?;A)$^`N|$CpCc=T{LSQHiRd;nHE;2JP`0Vn`mjwRhn_RUVD6R+OK5ri=!z zs!YuQtq)ByaMG74D|`Kmig%}bap3%%8)PRZ@*8n zE_+55@2j8R!(Hi$u(T_@Hp$R72FV|RxK;;WbT;;NVNOXd{`j+Od=Dy|9*VSA6y*xN z3vyL3+mV||K^d0fAvW$Q3QomqCye)LR_pS>?j11Ef+E~oEAy__or&dd7Nz3_z2>sK#OE)(ZIS0=q(c!R2RVpWD$3`yq%2Xzl`;K9- zy)YP3jQf6$T%fj**mNtb1*>7w zim$ex)J-rtSurQzmqN0}bdI&qFqY{)<)37Sf4xd(KF|VB2Y#SYNK*|1_6b)FE#ITn z5ZhcuI_=Ph4WmCnMhRcCDMH8gkQh$e=Ba6yu+j4DX%)>&!5-FMV>P$n@+eE04<4jK zl<<3S%qYyKKQqFB7nFmgK!gCJ9qk=1nL+NiWC$pP6(y(=_umF(r6ol`0RDaS~o!zQA!3Ct#jMTT5ig(YXH;iLFXb|Z3+xoujIN10NMEgs%~i*$7VuSn{u)r1{vFPfkA zJLOM*CTR6%)^|@Dyste=^6R6A6nP=Pnj1tz!A^&~{*W{{$h=j!NqVx)TIFfigiG}c zZP){Wi~FZ&Rq92FH_w*BQlLLrzkB0n1_kL9(Hw9r5W05wyUrsAv{7;&YKy6RFNl|n@?M4qU*z1QYb%KVbLIj%DG3J*6l_;*&eNp7Pu$OS-W$J*%8 zd(mh zfAfNLV!AAQ=n;iaAHsNwLhw0wwhduP2-by`5gYOFrL2fb)C_6xo^E4N*n)8D=tsF3 zS}$g@v;dmN#P)sq?)5>&ac_fw$*nzcWytFBUYSG-(m$HV#_^BENf2}gnEb18Tw=Ng zx29gvbb{VBBCgEcCyz5Ezi0@a)&o%lLW_XIg&zK;c_bM0Kh^?h9_PIPnnyh=es&Zc zo+WfU2lyp?D@=tMNfvoM+DQJI*S5&L5E_rsDWk;xazzya?{?T-@a&4w?hNQessz&J z03uIZ%Db4-r<2)JyEkSdEG+M959t3=%jv{U0JK1`0Hz=Kz==EwvBt8vLIEX(BK%{% z$roRg40sEIuaiQn-snrROTbsPM-!e45;_E!Cv*j z{GM*$f}A6I^GkJsp}J;x6>FP^p65SgEhoD+753?Y2|6@Zx2%DD%M zU;yi5V?7~wz;(=8K4=%M%pm0r{uImmoV@>Y?=Y5k+K>E$dlbuM`$sMN1#pR>cRq~l zF6ze`I=-?Rvo7Gz4%UeCIXIuvXgRlPduht1MvR11a+9i+4bKl7E|U0#mM;-|(&N!a z|IQ?L8ENm{P+YBQzKUbkh5(D^`|-t#76!WxdWX+sS3qy%HF4>j)<&3SoAcT~)3?}C 
z0LXI_ti$gEQR18cSsyD<*!JW?5jljtDdiM3(lWEj*a#as1x9e}IXT2S4|Gt!c|HWmCiBg{QRhEPQ1^YFFqamGjjrS>7N1Ny!}U?TfhAw?*JL%6;Vc>1Vo!?E25 z{6r>OdfM@f=0x`#_=c}&DMu66aQ)pH#Q%Onu`s`{llh0)LCk-N3{Ts-@%X!07tXR& zRS@<88pAE-F>pA8IqUT~Ocuo=P8EyP7z7W;)H_|sB)RrD_Z(uc?n5|}z-4nZ$PxkJT+UF#_Cgm2pGfLAfL#ah zzlwzpbmZ8cnu%+ z>4t{0M>}Dmn2_U{j*UcdLMZpOlCnI!pJ`ZfGsVQeS%mD9xJf-|Zs7m6SW=Z4I4jB{ z^aq|3L1MJ*Q~=t^f%=v+Fg=KNw1<7^8lmB&8}h+A-rDxRzseJD>b9YFn;>3luf)%oys@1 z4a>XUx=ieUeb@j&7E#pyt?p603(hbTm9v}j0+{$x$XKzepmyoQw#>zmbeJ?0XISo& ztZ|k}nI03J0&+ZTwG?xKa1<>^+3qnh3d)A9w&A&xB`d&g>7zxaV{c1uF<(Pw-5=UR zexb}L3sJXMCY^!G;v*H+OwBEWAMLAro(dx7qk#sg5up1TAtMpkIyH#+>YFMLBp#b6 zrj0gJldTj%nu4c;vIwaJme4MyNAmSF(zOGPxC^*(-8kJpxhIP>w-7dV1x2%CA@v5k z5Y7pYGc5lXeIdV=gk<2nRD2So??kt(Cj>LIp&N&~z%S+=d%!#yC)O@g z8X5BYX^Ff~!p^rFM! zZ#Ku}bJE2EY%L*T>n%_!d>BF#Y?vV!xsq3@wzT`tddutb^_cZB_2cS(Rv=r2ym1Mn66RX0 zV+u5BOIoWXSkt8~JoAIyfa{EkM^6+4dyXuP=C`8YxNZy9mRP~AP<^s1QGw(Bh5&fk zmEX^zUD_S}2pOD~c5f~9yGXK(e|Os|3LLgz^iW67(7gH|=tG{fBJ)sMU+tKTOppt) zK(j!lRYXj1Qn{36aTaOuSRbrYVeVKw$n@>p;jNw#>VcMz;HZR(#H4Izz6njE`r^7r zkk1)P?3p@^81}JDX*VXp$IM>uoE(0pYAMpCm|7a^|5Yh#15W&~POp}q6uujjl2Yy?D>VQ($+ z`z~>ozn(NI3V?J023P>T{p1TgG`DkGK<<(9^r~s;&w5M%L==WHrtr~EA8&NYFF#1y zb5Fh8qu?_<%|HY-`QrY_sf$+dz?lef=t>F~h=l1fLH=|;(jN3liBP2DWfIG1XCKoA z#Lo!}@#hF}YDOlRYIuSDybHE@mX3bXfu;$AZKz7D(S_rMZM@Gv6A~2ZR`XnzvA&%O zLmd(ON%D=llZ{eb{5s`>r{bk(w&wHVS@pJAy*`hgT=}VY=FznhUVX$ja~XgdM!;ri z<1Eyth%@93>h;AbiCc)1Wqq4`Tvw)uwB#fK;z>Ri8;`SkXuNm)1dx)vO2Bo#dk0~) zZScmhj^&*ONNj&y?jnjK-vKG7t>w_H>lmRvbG50`@Q(lwQsARoR?FITt{fOp zlqtgMB&UVq-u_fO@FQKx6b(|9H6NMaouD3558Gd;*Vvobf}I&d>^{uoKZ>KYhB$u@ zc*c^+0*m&Cm9@tXy@mW~jchCIo1+DUb=qZ4hml;R=tZ<;$zO$oX7)RCXH{s;ndk~aP99AqPMzsVsf$2^k@ep3Xjn^v*UAniV3Td) zUUqKYD0P*-jV$ILz&T`uqFfI}Rb|MiBMg&{8=BFm4K8hpNQ1wlrfGUfNdymS2}LmZjX2_EWk9HU=n zwwLE;ap7EBXSZv|YTC~^*-6o8arM}5*HJz7%Rm=9&Y-YQ+vyc5V7%P#rv*S;phOTM zMtx=J8HaC#bv1p%xImbUzcO22gpTJmaF8X@=V3}$o{dSOFF~$El{$^|JAISz&K-Y3 zelqGkoMCHA1U*GDUe^*mlKzbFqsA zc0yQ`ip=4^^0Mw&RJ$DnnEIm0;rTZ{{meDo_t$AxG2L*d8*p4qkwzwD$(~?zKss7s 
z7a9cmP+fkC!#Z2Y72Gfko=W3#S43cgZhe9Nx!7(g*QW14KI|B&9j}T#G#77KjgwxF zEs5_sflsXR^Ti`(?z+Vhv;4TQD>qUI#wSrjy2S`eM2MQ~jbZjmI-vAbQmCUWi}2or zSjfgSt=A!DOa?=~X$Po`;9Nkk6glkEYg=mpToIZxgfr^dEYtl{i_5375@K* zZTJEW0m3%M{>bpZ>pd*isp6t9=K#E<3@N>uN1_N+F7n45GmFibSoWKhl^d_@TStnE zxzivKvxo}0Bpo;L_@WzN%LB3Hz+`7fWp4ndD!V+US1f9;M@%*Q5?gw4qud{ zAfBM~brp0G?)W2^AG0Yg5=4Pr2UeD!!L?sn!VVIOz;~w!BOuqcGtL;(Dprn1@Gid; z9a1CLNm<(w#Djd5IX^Wjp+<~06ln?+vgA_EJ(FD7@AFW~7X0A`{-{Q2L!5{|OD+*R zWBpTxPmQ`VJ{evqcI_34AG^%E)aCm#O0?Ot{6gGXYC|`9Ort+hU!QGSJYhP~?uglR z*XcS5RbaT4SU=|h(e1T+galg*sd%G6Ebk&-GW`R)?_zn`mvI#Ze;g4afz*??VqNDzy`kHx%Zvl#4v_9w0UqR zj!V8@=YDc>1vaPXX8yXlLv;aB2|qTu*MbrkM9DoCwPUZ8_%2a`tAADgSw1U87j9aw z*y2#^lzvSL>zCsbj=Ftddd}R>g_yoD3g}|4S!HH2pgda`g38kRSafC`(F1=}2#WAK z$WOu9<&QWLGv!YZ8a}Zx;M6e&!bCQT?4KkMvx_g(DJ3@x&UX9^7=MfW&Qj4>hjJXn zki{KQ83bhI{XRrzSwU!_VRpTRw`)Y(fLraFKE%tok%J}Bl|N<6Wu9{j)Khg``U>&w zl>I{pH>j~?7vU%=|6wQh46oDCwP^> zxb^xMcNjn@hptQ*_8UcdZ}{uq;!S|e5MWmgV6=Cm@psp9$Tdi|-?_Zngyd2xwQmCB zc7s;co5wPYDdyXxGwvZom*rp>T=5LYo6_d5z{Muer}s%$ecYvq%!-9bPRR2=KBo~` zK@9rEwaWAY@Ao`(%Okv0kbpkFe;^ia(KT>xSfe>LgG9N)g3UJyvLfJF5yTE1LsR;Z zb^`GiUW)E&O6M7&Nr>jQk`==W;aZF4c7wG53?V|lVcVlF-Vx)^A-9kVm)u^>*1^8r z&3oSZ2GDsp{{$_cEu5o3`wcctHg&a_dZ^$+W2E_WxCDVP)LfM?+5%@wUV@=k692OH zFKbh}*?BG9N5k&C=T%(tpH!!e%LR$CS zjeCsYVgwF$6)eHUXsP=TxhI=kZb&T^~2yq8LPQ3UP~Q?O-G1Jff|!o#Yp z5#^wZrSxUG_CY^Y!ZQY-Z^(lp-KWi#Axf>-q`Y>S#DJ2Cy1B(!V#g6Tu#)dGslo00 zJ;q)GrK8WsU-N(OCbi6k)8BQ+3U|9O5sjx=JH5wXo$MSh-WbPiUnS`0whgptDHgfT_ZM(Pw?Tc?Z7bI*@iy1?IWL8u2yAp^v z3v8nkdWQU#LCQT?PI(^bFrc+EBm$R+r+IpK`iUWxUIRiN{bVwgnw#U$ z54iVJcJc-E+5HiB$6+`V=s6s-Nc7Yk5V~(cS$M^vXS5x27#NW&w-8vtjg@nynjJP3 z&jpltJe8$?fq_SEC<@dYx^6%|l?i=24cM}myRyFd_stV31Vp zw|4ESM<|7D9-xCo=mHQ0kPm#o@k*1~uU6Y%f9M}q9OIa<#&G;QiR|waSz=-S;|l+e zV(@P(v;X+W-$V5ReB_6_o8H*N3`RbO9>GsSY3u=Jw9|t7k-1*VU9uh)u;iZ@nV-kM`Oso}cJT+8^@UP_EFFmHgPaPOP3;~y zH2x4_k-)3MlA)a=H3h@a#E_^|jUxy8;SUw!~7szEWvMOxg?$GN21 zP}kF2m)mUhs{zl{4V!h9ex@>s_ZB^_?I#fz`@HW_7*_SesbO2~hbG}p5~>Y&VXgM6 
zIFeRHx$cE_X9_6Ru5@1J_{4E*px~~5*F#{wc8nCJY)Sch*%_u0|SB}*GLI2lAdZgSh9$LD20tn|t zYD;1`taF2DuZ@8;7TIa;eSN4ABDbAn0L$`rDkfL1Fx8jnR^*QZIc;HMmN_zQH3V>7 z7vHRMRK08@K|a+jwa!{;dRrw8b9GmQ4*z6@ZNZvszBa69j!RZ9jpVTJ5>m`|@ zgHd;pr@)|{@_KAC(;*O;mG8CS2kSIhgq@bNllP}tvnpNmH=XELCQDfjQEqzFPW0&` z>VR^Humfkf=IzU;pUI)W#VENFs{+!@ZX$OiG>Y2*;n0LaZb4TLVPcLr{vsD* z&|}xbBiG~~YNAs&W|A2xZm%f>eQ|!!wrMsxJgZ=nf)`Cg3i5H6Kn^(IFPAR+`#cE$ zAs{6iEf_9B1UUC^XqufOV89bs6fxS@D;_+4?k8()lzz(4KQ)S?DdXhxiQ-lc0~i{_ z@}Otd$%ogK`?t4^0vwe8?MX5Bm>~25Dv;@sM20B7c)EUxyz>%BucT^>HAc;!5XHJ&;yU% zPo~L(CLK=f!I|tvW@1{)Wl5iogGNH?5sfD7%N%ufJuQ+J1uPr}YOVJu=^;-fHJ!CQ z2ov32VSi;1Br6wW&WDwSX_ga?A86b(WtDQV%WS+sa#>w%+qJA7 zMotWx1yU^7^o39!Tu>g@C3~U^ElwWp@-CiX!$x_3f88Pfr9OrDv_gq+^Gw`oZkYf- z7{ogThkdfGBvXj)5nUOi9T+FpbjQ6ar#mUMLIq~X$_x)$t3;-PFqF$TcbJw4R+NMh zJ`}zTg^6#Yugu(4M%dN*V`1Jrd*Q>@x+h8;TiCHF9lD1VVmmllmhL39n)md*=%Bvt z3PTm_edE==_Hg|?ad-Qn%wcsZpY@XcIH=Diunk9^5C#eYSye-R%a;3`cq6Z$Ez?jQ z%7pB6+#)i;d$<3sM82;z`x6E!8zorYO9&-!m+lQK0|EoHe>jo*dtO2yvTT`*=|~7J zBqD(18^I2a3POILqv+y;IOX{j8$ZdBOw}RzA%NK}pYZjwA+0_Q1Kg(bN=s#w)21>= z!s=-KOFvbrS)@C_(;e-D?Zll9jIl!W17t^T_%Gf1Duv12Q}0afKu>v*+Af7s#jQ9_ z(qyR<4)%~Zy=%0w6Nu(aWv{+3@D4g7(K&F-eSpc`GA!AE&KWo4wj;eiAI#@fyHBYJ zK_!-nX4?C*gR^Tw<~v_rfm_N9ivJdwceP{x09^qzPk*C9T90XN6Y;&xV|w|Zgtv6g zvk;GNLCsB0U`!~ zSyc?;M?%(ieOi}+>{xyBMbr+jp3mtOEh9yk5@o{5B3GrcQ9Ju=%vZ+XGkCTMwZGU) z3Z7NCZtQxSaW7}nm2ktlt^v_Bhbi=RhVRlpCjT8mPR4gl`2TS!r~p1wz-Zd%KN#E2$C{n1T*~Ib zmbZEXZ+q^Pp@dM{m{|L9Dy8_Lsdht(h24bk$yog|s*i|EOSWMGVxyI!A=DMz$AsU?+hjlN_*oBRkr_+*@I4D3?D05{XH@ll5C`y$8IKHWVb zotA!zhwdUs^Qp(dbFy@5JPZXMgZTEeo%%8APd2R1s#^?EmX)09jRcOg93tXk(YQ!* z(*ty-iwySF&gee6xxQus$$g3^vuEs3)P!d^L*u}iFrhk>nxTb|lV}p=X61djX;BJv zGUH`hr^iY|EVk@C3Ljr_1S9@W#uu{rqpQ^IlV829Glm>CDbG7Aby2Ltx}5}w5Jx)_ z!Qe7-5~tG@SAbw>MaPSIHDlT}63Io%bcAHK6{I)gG;y^ewKXA~Po9>e%h8N6R&?uJ zszWhOf2K*m76in#`L(7k+}_>8elU&({#&x%b>3N6{sH^t0+{f^{pLJ(vhX^z4)9=O z4!tdd_mLi#!HobxccK`P=gV{VGi&fua=%F62p=5KUYqJ_pMSPK09x_Iq3u>-p#Q4d 
z-N0z>zQCdGTT(R2?ZI~Tb=Y5Y@%L}PB+9LIGuP=RE&cv!F5jiko$ccD`R7SqAdXFl z5bL^AnE;uIP?`gIEwoe}dfOhonarBSS>Ekh9}Pr5dRzerunl}~?ePvLE=>dE+4I>M zDD8#wxiGZ$DjEvue5DDHUfXdUl(o&f2{buH+2Jh9CZd?7^RUu=yLr^~4fZH!vDv=2 zjdDWdUmrxEwi9uwj+an3tcKfpd%)#Pz*Yb(6pLTGkvDOw3aS%&U?PL!{@n`e4)OpA zAa}mAFLCfiR3M??UcNJMtoSl^V7fPpm$zH6VA9BjAI4yy>5X5irpJtx$K7hbhQma) z{w*u-x~f^({+wqk0jO9+a4lCb9)u+4-tcEdLE&6G&u`o)v-1feCOax(E@#ty37U2O zrJKz8&r|t)y-KfavI)|%rJtINg8Vi;5_yv`WHh8t^98?YVT?mTC^O5&w!Bzuk|mI2 zP6%%J>8JN#TafhaTdtj8_Pz*uDOyzZdcqIn#PwNx79XitWIsXA-lug5 z`|BoF3$4VCOmLOj5N>M*^F9h9r+j3{Dgy?Gq7fr9PUP80O2EJy1ko>+o>=63#g0s8IJ77g>?M_3i<_~mzP9-xCZfulAT``K6pEqqkSZIyj8|ro{4hvnL9eWe)89UGXEM5~Ant(8A(pVeh~zp` z?p`eRrFe=)V8OFlbGUJ=g zCMZPl1LL@0Xm7Y`ER3VPkdk%#%A3tiRuk75i0@i5V@PCZPNg#;4=lQk)v0BS8A z*lM!Cm`dg{g$2r&3bGyOL>p4S%iHmaFY!`XtpN|lygkdVM`7W1PwYr+@x{^4DBSl~ zO%k1`v~6na6eWZG4*R(ULPzXDq7z5#Hfe@ms5tYtJQ-Iys*efB&&;bnx<-4LmjdG0TfmF@34G==GqImbk+S+Fmb@n+s|lr@A`r% zbrey>X(oTTpm-PO;o(TBhv|}wA5R}H+e;sPD^SU)wF@>rIl`p>oQ-Zdzig+z@~tS@ z6XKL!iabr6=3=t*<&CUm!FO;yfq->jEsZq)NEk6<=6>EC*iaBatufmhGO5%z7G(Ub z>O<;Q;Z;|&2zTIM^E6_ur8|yufTeOx-81y3E>sdPow`h>52>SyDD$f@eNCRlIh3@~ zR)V^qg%C9}acy)8_V7Cd9gbjXvP|H$(Jc&9ex*X;RQNJQ(ms^uD6U1Sc7yRcE(ryD zKo;FAk-#&{Se#i;uB{Dqz6jgl#?P=(9~FE#1HHcs2v5~>-+7}3>P_;2-Dq~N+GpBL zd6eR@)5XkNgtM@HLs(LcQ3Mam#SWk``GEm`6;Dp5_$@^lD22k(XVj^>!A=ww!=}Gn zUVecOVr75#&>~wSy}(U7Rf3}+JgPBDx5h6=-Ht%b->VDMHKIl zjHNwjG?L>Omtk;pFv}*|0@sKTx&2R;qt%`2wj$QAm7 zCR|b9TJ6#YKj+Yjs_)X4nbb?jymCO6LW~a#yoVLKIJnwDWAI3Xt5&?;4GbX2&)BLP9wO~Rq4s%2V10&kLH5O@$}%30w^(Zw zVNFl0jgecPp3}#OXV~=BF73jmcS(A+3R7k3J$3WgrBRxh%xt!SL=lovw&HPmepQ5P zomj>kKyB+aHj_&RK#zBn><_F|hNzk{QYwiq5y{r#3JOx+mKDRpiczXKn{Q*&1)C#P zpD%LbF+mT#1GQT-B<649H*VpC^lb(ixxa~VlT&p?uE53Ko(>r_ne$P^R=-Tjet%_( zq%Y!@5^M-*&N2L$hNPxRYc*GPHs&ucY3mLgmLDt%P3#SHidz>n29~<6b*Sf zL{r+CmeGJfci}fRM>3QYhCBTcY3q7BJ1cm2j{6M5x`6QO_R!pjXZX)X%c*zcwcQ2v}SG5^B)M4qYtYQ*sh@_@4t+8U;WR%}%GS57&entfa^Y;Eq z3jW~NH1ftE^sW@LIP%f1W!FU@4{Q$kdm%`P5K!jrb^?ke_qvYGh1ll5<>_4rU>24? 
ziz>ju5irsOJ8KmputpamJa?(CivxXqb1)Xctx3qGGa@C*;xZo6qaI*Ps8Knj|CroQ z72ps@Q-r}~YLRU?IK>^w$tyEWU{E?GK5qTfgKv zEBr*U2_8NwG7`{*Avb8~Xquii7zo4c<-n7Ot1{itU|V8wf^N6hx@}@T)LUJ9k&~39 zPKysPsCl6q9_C$5DEPzL$^avOq~X}!_w>{Ym@+85a8X8AD~HjQiR9Chnld%MKTQI_ zdj{*aKw!m1pG6pWE}cfn7AlR$uXmikn@4PP-5`DVUcZgfTmB=|K>B=x+C1{4F&VBa zT&i~Em56{jxpn;-GGT)U5@DC)1sD-1Gwm;VVS3*jobA4WL;vj=qk?*YLT z)87?+MNFbNe+lb90Y_qCnD|oqg1)(vE2q^8XAe|=U9RxSZdTRcf?7MFG_{|kqN9`B zHU}39X`7}E7Np)hje6^w#KMSd)FuxQ&TEOXK|peq|8D!;ON)L0BSZb%h)g5TWx)-d zt?$Ns6cM2i&zE*YKuXl+57HASfPRg^( z3&*lyn8Y-D`SrAwYDvzOBmQDXByNB)l(`=*n{$W3B zz`EXqlc1(HFJ@7OM@Fq>wf3>D!svW7kB6d_8CxzZf4QYtP4WC4EE%Kw!@pz7!T8P> z=^t+&t?2*iAiu&e%!rCO_D3P1M5^me1xP&s_u?z&>d^{gJi86B%jVGw7$-LU=-2md z=Hf6_|8{OfKGiMsBq#}bWzPX}fNtqStShoh7*A(w#q>QHjMDO_lUJ3(i^yf_^aftd zm7Z!d&91#41xo7&Xn>$`^Hb@U(B+o-bw+QE11~i2xXGLuyMnLH$lo>$p2YOr)#v$4 z0dZA@XSKCCC_s+!{rm$&rod9*vL`E(Yabk#>}|KZSGv6I{s?X;2Ss`Yb^>N6WFLh~ zj@HgzXq;DkQ1Tb{*wyIV)xtc>e$NcrgS~y~;^VYg1d(kVx6ynJw~NuB!wHrl=|{VP zyqg_3t}UArglbvbxABg&xJ9GThtqm}2chz3PTDH=c&A+My z`^}Z(h1Ld_qoyjrf)HxhMn6!X(a8sT0iiSzgou z__|*4O5HIvJ4`ye%5!+NLaGHDc*bY{l3uC>>pB+arp*r$XdRm$*3irsIz%l8!#*xA zT0`?5!7u-i;TcD(FraN0(SfxvAcn2R#Z--lj@gH3nDTw$;|_MopU1=tVb<5ELNpzf zAs?f^^`c}_y%jY$4Q0)3Q7c-`_Cdv7)7RygTIZ1v${ z|E8Lwqm8!ahdEuhu9`sXkxLF5O|$O&s$F^~+h$BdbEMIWilEc(mLNk`xUK8H&ob5i zHB+Ojem`_SI3;*>*grp+w>#xk50*@}qg}x(S-+XGFpWxIQkUMZT==VOi>Cw$A3RT| z2f3lG`T0=&fW}w%o``Q@B%ir;Vj&hESA(GVJ|y!~RZhiRv4i~yBN>PJYLipealDXw zd7W=qYGrk8(#;(fpOe-CcKj$P#3z%leZ3NFpaW8s>F15V!v!+$d8o5`GyXYl?nW4)`~mx};ZS+J$*8&gbdvCot(iOjGS7D^w&Jx<#2}OsXvcT@sMiZUlbM z0KyBtTVd*|bg_@>VJ3+p-Ht;nJJ*9p6Yy+gFc{FWgZb-<@1uG93Z)2{8re3Blhd5T^>!zD%m9NI4FTEeXMiF1-(tQy}h z4Pp6wsnQr!)0d$U`iEXPMbx=mX(gaXwCsKwG-$9OtA^}0>#^?^z4uP>6k#WriyHh) z>xid4iBNksL`}GjYqKO$k_w?$+U01-RDfj^4u+Z1B+4lBG)OlaSig#?xxOAieu=IY z@`wk6+LL#^mgcZXrG<8EAdI$$^#W_s%B_$rG-Qolpi#!qu%SqR)+O|U-$f2x0iR@R z(T%wRCPPtJonMnlF`}~MDySv95c-z6p2>(Ryi3`7u>l0E4964U(|2Yq&2AhV$BOG? 
zeZa_hLLg7{fs*}yyH#N zzM*U`Wv}StlOO@((h92JU>q}skIvaX(LT4_twtNN91Dh%$lJ{#nf zJ$293FM^uRJ-kEKcAV=kUp&{l9(yc*B0Yb@asE@o0pcY(0?*R>CJF{o0HbPbScjM} zHYCw;9E|}iW>Wvu8DZ6A`=JK>c(bo<2s80DG$LBNrA5(RGx4r?D)Pz9O5>?yBQ_W` z+>ivaRqVAX!8}@GIgXz^@FRMf?~v57({qz4KTJ%@?nNC)hJH6QNv-#16;p>SL*wP2 zjB`DqlfqvYl@M{(PH*7MNwuA_@@?#9$mtVK@`bjf7qVqB?tFC^oong9RHEt4ln_|e0t~N3eU;HrF%=}S`Z5O08@~jfYtybdVYg!qM<#B%jqJyt-VVHw-}!ua zrHXS1;Q-&;-*pJlS`L3Z`f$nV9{yE{$(!bHDR^Je{j*(EQDlk+Wr8{e5KJmrAj~V* zV^UJkQr;RU7Ke-~BxBV90$Um-%2o6O)(MLxJCOu*!Ybi`R29?y7LJyQVb;i&)~M-E z;&-=Hq^_9$w;?ZnDW2VesC6~+$Nu~NbTQ^AzyF}G+3K!B&!Jnnu`r8PEUw>(eOJL* zf0G)P- z6wivU2=9-^zLhP4*(8E)cy|Gv1a&sKdh&Uo4yGpOowoKziFVrsvU(;8PcMX=_WG(k8k;TgLP&%grMne2bb8|V94(m$WcS^%Fc zk^f=RZAfmSg6um1j3KUHmzT~=#8|ZYh;%}KYoi2PaSf|*3lHx~i_^ymf7$&}+GKg( zxTyh;l};dU0s8&fM{TkG-V?bC?7N$f^ghcbyd_B~G#8GWDNiS}2?x;1xB`#W1EnO= zNqTj4Qo`04)cpi9%TNJIFjF;NCXjr%xMoCYWSvh0Q3AM2_2q8A{xck853)*SzcI3{ zW*H`Hf%J0k(%|7%>KR7MvseUQ)B=Rc0>CMo zY7cRrQN97#Op#|2?6!

BZouh8Pj3CE8#sPBFEZ`z`MHtSo}aosnPm9roHk*>`}^ z%o2K=ZWn!L+ZKZ##xq6up)S|msQRRV)o{o63WVAhHvG3dy{p7Aas1iTL<*wx5JHLG zrEh&DWbveQPPga&EpCsIEqQcG^^8bBPK+egXVib7F^R<>*Yw?d*#E3|6l;SHRBC`f zc#mAX*c8_P(J?gqT5|KFmh9z?#}?A}d2~0bf!4{-@#V0lZj%LYgIkRIzo-<$g9H~_ zgakN}-(XNhT%|o61&Yd9ge`RxmBH;oP@QbID23>UYO7M9pj{N&cMSm2(Sr;+g!-w$ZyIYF=Tg4XC3jOw6RfW@ zZ1TBD+CaPLn|OI}wGoX)pl&(1SWnYFK9+yr{P3FaSPmw8MA{JVu}Iu$VZVoZ@tQxV ztk4}9;)e!DABN3Ah0`Sd2y2nMf2Ww^eV(MOf4YgtD#}hDGyccOijZY* zz;j}TW8FS#&Ev^8jdhik8Uh+=#ic;*$ez?aAAC2{W5stL!R()2C*ra)>`zJ3wtjQNh|0rJ%MN68f@ME|3C;}haMq^domDE2SZ zk^$R@$t7`8Fc9U>sZ0>Nv2LF$;tjQ=*zgqZ{0q98G92g}(1yJ0Wj@PWhnrdhKY(zQ z4%U9+6oMINEY2C?mi)4|_0IIf*>DS2L(JmM$o2L7FBcFuR92>$P` zz1TV5wZmop6WIG7`1AjAlEB#4uVl~+r|jM=V15`ymp&GCDJ%S@tD}#tgbc?K9}0_I zWMg+<<6?&u$A;EZ=2VKjQ9Fm~=on11Wf%C;4U@69MNeaG;JBSR{k;G1C2X%ha-Xpf zztwVfl`hZYkiJ6wZ89ZSAQH{%7QuU76QRw1`apHjcZptHH-;-Vg2=4F8?QVITY$AV zQ{%wWqv*hy$6`4z(Q>LDu^IZ6$9}EwE_!V@kl?_8Xkj{RVYY1qgvaJ%<%SE?;}KhV z(XN0l!!?5KCQABzgVf-*k8c%z=DB@e(+=7RWw;ts-q8)1RM4_a*ZkliSl}{w%!ZCc zNk#BSA$+`KdQ%?p*NYTE$V;wv;2jq8zmx)Yu6OMf|Djk0xRq`D0HpxnGPp5@#i%nr zBUcatxRs^GNndE0Rn199pz!nuBR^agv*m#{?rKG_#kb-Hw0 z5;$+hfD%wnvlul!~_??YV*(9Oss;~)4QsE79 zPJQDAv_V;7O2Qu)%u_*0*SY7)8yQoS;2RQ*5><{aZ~LUL1!j1^Me2d`Lhi`d$ywmE zv#G>v=t%_hK@7Y?shn=HnyxMT_IUGZIq@O?2AvVGPX(h~U}^nKa3(NzTOfR7Bf8GV zO(ObK{=ug5z((4zq20)e)o2IIv{(yCV?4ZeA7pZf8cT!tYU`s?VwUfxqWfiCK5`Pd zE+@05S8o%ug^-C(%k>C+8K&RViK_-}y~S?5x^LBUu#m~JQHImK>N)qbe#;AK;Ck@a zt4#sc$ir&mWHx|?hz@hQrhK(=1ubig$o#{D6oKqe~XJIbjnyZ z5}eGkMszq6LPCTYvupQLr$#N_;1R0#z0lECok6g1pshEiP}eS9%nlPaU<^-$g)#}w z1Ide(fW4qhAr;7^ZEQ1UF|+1BA*b)M5_wicwHMptKir*Xxh0gJ*nAG8(gYjLM1Li+T+M>VhA^x{^ z^-ch8Sq0XR^d~E&#ykVVoosyY?$O>I(HeD=fN5&yA;urcO`)#=IJ1SBb>|Jml#V%* znPy88AxdFq^(b+cH(2CM^Q> z@YvSNQh9-OB9nK!Y?|EVKS4BSakOifX`OG;MjHu%({K==ebKH*PAz{eP~{uPbX1Zb z%gMk&Jj4R67cdje{2FO0bGOFK)h;U_Ac`5Hw4oe5>LKG)suVY62w%=ijcpOFmlK$Z zisX@*aDhML$!rdo%lT}&oX+uRUT~Cp$6&kf5rJrnuk{yN0LS}~4=gPI0Ic%-=DLfx zqhoTTVcy1_NyIi-EmL=!eDi0~o#N_1^!^w>bNh?S#Di`1M6h_f 
z93sd-r{SX4zU>pw2@a;Wb68B`Pjc!UCwH?uZr#Dapk<_e=$lGpwJ65Pszse1*?;u5 zY?Qs;UHJfStiK{^m73q-G{p~X{YDdqo_BO#NzU|bc~Ws3I427<%NAvc@y7~Ku%O~H zDi&Vk#&AVS(qnc@q?h-;?RpKN!A{6l6}4&blGM4jyu0MMoyt%wXZHcwmqK!hu9;2w zNl2SMik@pG{};7xBM}JEpmCFdX%3;Cgjkwr^ ze>Q#rrb2f93#^Kr^IhvsX7)d==%%@BeFfmaCtE*d!OG{=C0uPJCj+g2yus9ih~fK| zRfY_h7aY`v_0_sg8$3p_cV9^HykN9o;mI2ECIS!Oxzuvt`?4Q{5{x_9K!c8$m`q~e z);9Ddi?l@{5yCO{Z`$`dvFl508nL>f-x?8QL*w+-;M^e^iLVR^oKNNRC)MD~t`J;f z6D*~Y9-FS)w)eV!O32x(XYZ3(h;qK5Oy*jcI7mU)!QiNJ?6U)ogS8n?8r2zHJP`~_ zSu4_kE}L~3p(!J#3Nruf)6tvWU`X0{DofEvI+hJex6utjly^SZ0h}VjB^c9snLcFe zmt<5?MXhgpU+2kC-eafork>WeT=D{Jdgk;O&qQ{vcU20eKjldP%xeSyCL8=N)f-lw zS%;thQ!#bO&g-&JbpDv7L3)6((it8PimW+N_@>wCJ#ipz?LjC{BLz1NHx>IxDVMGi zUZbaUIu)elMif_?H&J}kBBGYv&(YO(cvYl$ndCgpqZLPD#1cAhRx>SAj7-fXUOOE5 zd8q;->63IPK`ucl_a~`xTRx+JEKEy^@JU5U-l+R;ZzAj;=chqUjyTQ(a(=e*Uyw(c zIr=1!%6M4Eal*W9es=WZa0P-4{uK1@g2~SHu5Oiu@sFhyfYJ}pxcVOgP^AIm#|-0a zh2K)!N{u{xhpRDO)NtomJ`jhQ>CE4n6yzSa;*0lZKEbwL3j{x9pBfo>Wk%$x)%a!7 zkr=mrhH4`1#O7$Z698}cR*frA(TSG9UfDxVZUG4YWnyO8G(BJu=C%-hL>L zLIi-Yundzxc*P z8*56LSA_AjFqb(l3N$dzZebNFJQ)5K^>?kMnE&zK2SmL`0HBa*2cx9}^ulJBjvFj5BJ9+1%;DF0jEEKa??N+a!56nMeeO zOcr2~)!kaeo_nrq`{$g@4ZLSSwvFZzaLLx&nhoCQQ4c3D>H3cAjzQ^`T(K;%;yTMR z64$h7??FEa-+r{|u}gAT#7XZ8*bJX)UK@&#cjdrLU3$uIKQn#?hFrG)%MzBI>s^(S zh4mkR16kW&zo{17;F$90gh&UxSO4KqS$v>knOIZL5N&y=0qBs!=cV_2d&o|_{*>&T zT9T2kH>>|4i2Ei<#(!Z}v;x8{BM{P(mx2t9pN5!8DmK^gBrBNIaL8+8;Kg)Zwr9Ba zNAV@HD0P;lEj7;GJw${y25JHMk%-8J0JWFnIiU@GT_mZi*eM|roz`H}D}X!hcYZh? z(=Qj=-d~tfiooxn5~T{Vn;al(PhP$QBCsDW_PEYz${g1VY6wHVtzjD4Zz{%Jt>Kgs zceS*u&SgdWvt-P1ixq-C91>#xi*--$r02KeaB)Uve89}iSz8Bxew{>a8a1lOx*-(IE5~f!O`Qp;9iDv$dU^K zC1kjolteiDh31V6++?&rA=7OeHv9|fCR9yDzn);$k0n0-`|NYP4`t8%4=r+ljt)S{ z8nF2JO-iqd2k7YRMLxXM#rhyDNz16SN;0+1XNwlID)|iib{~d_W6R#;-sya19ZOV? 
z*;`9l5cn}!5tZ3w(JT(;Z&?d#9;u;d=5_@Iwr@^G@lT?fclCgy^Wt9$m!+En)?>~JQ{3aOTKb1t_xCQGw*OYb) zT;(!iCkjC#yyzTmCZgiwGP?!g*ZA}-2X;G71@ENrJ%CsSx{Ydss6g$fNYl1`r1ZFx z+v8z*`vBtVr*BVxiSBzPTK??ZqNKw`0W-Dza&P!VDB_Hp6}bY)yAzK`7`Ke;K-Af6 z7^hG{ALYV<)L7}kH1|fo97CqVDIrgKDS9|4GNm3j`*NckXG?pID8`nw13BxKSl#t|RDkoV+?t z+cKOPl^;JB70jdDix8`e$sFtzyJ=cLL=)D1C?+iTC&)TZ$|QJH zH!PLV#Ta*d6;37aUQz5Pp>@G5jjY+QbRyK{yi0Jn)sT^+mkFjt4MGg`TGT&56p-SC zud$$^z8QL|*1YvtM(wfGfBjn|-}Opo`%|b$MOpCoqWQO<0l;~70c$O$tc=Ux)k7<2 z+@QKZ6^E8Yx71E9samW2ngm^3ZkPZsk4&wUZapU-WG{Y2yu3z85tL9Ge;Z7l+ z&v=bAVv39%Pl52hHR8$h$FmvbaMLf^*erkZJ{vzWQ53vI9N#G0EV1%0!QE=~GtD%Vtns50p#8E6by$!L>vLd{&MJHAJ|^7R zzQ&%Hw;DdH!(dB7MT@$?wxCe9f!?6~=h6jQ^jcqHkS}1^I=!E576&L>jg7(X6OipPY z;j{ES6?7qGoDp29mTn~*CBd#uw#4oIoQPLZz}R!YHQR==ro_I^^*z}ciSpFv+RBb& zd*4AW#(k5Ait3s6&T?rEQZ?`UGjuJg>l!cPGq$JA?1)&i*wamqq{qD3Ai2(-;^PW7s>c zJdr|d-{?%l;VNB<%?Ick<|ild0>lX&2(f;UPxWSvU2so*W*QQHwdTaM>_Vhp&ZY2! z2v1kQ#`)VS1!1F1t{m3FU zSG%RrHC~mb23k`(dPPCHh9);7=XlPd4tW8g%wx(tW72GR=_Zo{#)A9)Z~xi&a9OZ$eO3jN;G`x-TIfV>sx&8}Xs^vH&2HYd>!JcR zKBh*L=yEcfxLsw2AF$RXB3e$DrNf_8dljT9yY0|`yZ)I%#O*-2W+ z8M_8)S9M#C)Sp=Hsz;rj-tYCJ2a?DNN91T)R!-=@WtuX40WbS%GwXJRQN>^v>`{fOa)(iP^~?pjn5{oKutPgEznQ>u|AK+ zo|~9Jec6ROoxM)hHQ&L?R}EoYt@Dww!|2uDVV&cBQo1axf8OhWnX&M1E(*+7PwY4NeaV8DzLA%PILVlNoOJxbYFZ>C za&p+@k<{;`aZ?F!%ThJ02TY~kh4airT(f&^;X`ix$5}xqMZP&Rn{dgN7c(d$5TU{Q(7HAw)t$7+PYtT9S;Lc6PjqaHoWzw)!Z=u3SlcHG+X?+=< zdC%%7>KOKPU|{ThgEiv5dE<-s%vMZt@A=95=Obj#w&=a*qnhYk7+AvsYL93aos!WF z%uG#>6tws8Ea^`CAZP;uJvl^SFC|^?@5z%*EvritAGyT|Sp!{^};_0oQ znZAEX49-jW1RD7L*BZUglTBb+!ra9N$CaE-ZzbpWUoz{?bgz8wKm|%Z%YQ3d@7iMi z!;w~0UeK~v0IFpl{Z)WeN5}K7R^uk}^{tt6s=nYtT?eNmP@s~1oLRVl7Q7?xE`>1KPcMIgM8OVvThMV5UqGF@Obl3g7>s@ zFOyb|#Qu%1SDFJ8QOvHnGL5DEhb&qyo=w@#vJg4+6I^V!v~^a&r%5S+t%NbevKb%w zf|p-Rg3X|fIcC+nR(i8pQx+{*zl(-|qoa(A)gGe5ghXAk@y+H7RwhO8wrLOG;8#f? 
z|1Bu*10MWifCwPd{I|&$05YrD{VkR$@#aByC}Si~C;*kjOf~#_a-MEGs!=UunD1&* zqD}dE@C-hnS#zxR&4*Nz;B?MiBW@^4!JKrbPE9G}O{erhAcEesQP5!u%G7uy5~_%* zNA!u^fn9V)f$YnYmK9Qobv=<`MFi*)cK?(PImL_Ap10!4o3hkNuwA|(LL+xpF)l>+ z4RPs(KuKA&9IAPVXC{LORQik8=1w>e2eY9fA>(MNO@K8!1Z0b3x=kQI=>hk~0!o+2 zFKEawyuIYWdv%gk=$zL&WhuVQTIQ7X-b@Yf*evz+T0hE%A?)GjD^fbmzRHJKhFY?+ z|76l_D>AieL?!OdETR&=pnDxx&4f>9;6?K6i<@&d(A*Ac&GK9^&B{)LMQr!!+UqJ* zf(cvrlI6%){!^%F`Ro?*mdE!mgc*)^s^I@gnE5^C>HM$GOCT6~zjeB@3g+QU1D^E^ zc?mBUC>+6WUs17v%gjW%kwf9+kxnJG$F*;5Gjq-Nuu9f1W)-({E0}n8A(3wiij_?8 z8>JyCFijogy&>ha8r~o+1s^;%Fep20U>{>KPrJz+H`o`emM~R?^{+9LS7Kdu2u$QwoNK8_fE$#ilXkw)kzxdy<~bZ^y_k&@5&Guo>e?5w@-1Nu~bqT z;Gsz<`+-n|%pjnnRnTDe)cia(@!Q2#omF_;ZBkC(X=0la3`dw$H<)$9b|TL>fovs7 z|MItw!gEPWW^2F{#d2wH@}b8F4Wd_@)`e}nN|JXx7&r=b)i5L;hT z+>_z6@W+y(9+tz=qME(dLz<0*}`m65lBSoWDX7+)~w@+yRO<@Ff^fg`Os;?7f6i z@8hmUtAB~_zjn}2C}B~GEX326OuL8&IA~UJGxHFilhCM{Dj9W>AuPi9RA903cd~o3=`f?8;OLRLCI@?+{pvAWbc# z4u5E{@O$2nc_VkIYQwkK!yugtJ+0dwtrTE_+K)PMYSretk&>YOBlln{rW%;8QDgAW z2a9IhoMY~#o^-jSMox0FSZ5o0{;93oAIvz?M~@+&>pj49^eX-m%J-QPG5?7qQvnF( zfD!#hD0#yI<_a55&eMm#jfq`Li;K$98O|SIhRfJYcjVX}a~Hd$K5F**7CRNJgBoI$ zbuSSldOMh@J;c(B&8bW#mP?w=#FiYBBS48QI2}s<#ZN9Vt_;f#y--K{6e=utrRwY+-RqQO{v)3*N0&e>b_V0pR+Nmonv#rN zAsqFT`k3eVtMfBG+6>fH`!E_&*bDj&RGeMz8{Ll7+Ag9sn)zQG5?R>ah4Nzl$E64W zvi`qCxOZY9l|p6C&jq0Lmm8L{$^ysuUvFpDtY4p>iDXF*12?bTax|t*w_dwZO3SoP zN|I!`Ng-euaPb3wddd!Yz_Y6iEzQkQ;~~O|mcXvUT_jq;Vdz{sZcmC=R(%JP^4O%a zp5nN~FmHv#iGPVOw5{Xc7W7kP8;Yw!ItsHP)jT!NKM!biLfIlBMZeW{;kwaAUp-7N zh>4=Xy=dtJm58s{Y_lY1t<#j1Iwb`)q+$!2t_r`nX<^$mXUW#g`dRrXZ_i-j^zm-- z7?FP&5jN}_-e1n!`}*BKog-AnTHreg;Y6p_lv=?f&Ec?agR;>oJ#hGuDOaN$+o4!| zh^VdegUy2vY@nUwth=!-nEJe>RJ$Ye+L%8(G^?q#^m*`FdMq(u;h(Ceef55nY7$iWIa>t?`FEn za@Wgy9=Y%q&zxVc^pESrU!X`H6PTq$^n06Vg;Rv?rlMEe+SSOqAXjf~M`f0Oz&maQ7*b>my znp!nRP-&1|hf+z`Oj2COflo@4P3F}X14QaEdwY$P8`4p`peLta&CmNcurKkt34clM z`wSkK|FjB~jRdd)8KJ<=ocmlmRY1Tb&Ms7`A@_lwp72VSNHJTgAbh_#pB)5tM$uH3 zl7N1kp2iJe7$`9r>^IXtB3|=iSn)DW$#$798A=6o748LRotUScHFK*0d6gIay68n? 
z{K^9>#5ZB%elpz3r~wFG-}el#$KBK&UvlQ_kF0)~Z@u-G8f8HssD6jo4yp_aLfXGF z=l8KPj>FQQTw3)1q#cNis;^6WA}P9<7=ED5uCJClOboy1q(nToFgSsY$L3(mu=Pth zPlH0Ku})(PL*>#VMbff0Sz#+{);Py^!L!MNWnbL5$*uO|b*`q)-{STz7&h~tBS~4n z=p2BA|Ls!l33M9!ThF!Y;f+mr+1i;2V$U4SFDZyBanCk?S2XULq^I^<^URZwhR^_| z>znXsM1WIh$JC<6>lr&;O;ykG2Z3{A!$>uj zh8U~o&&BdQ6(wBG_og|7b(u+f%4;UjOQ)KlB994ZLEfk<;L6_-IS%~=ZKKG%nSV6Y z;OK=gpT{efnp;Fy*j*jquT4~Wmo8z!!@U@7#6?EZb-q$kVLh>*KTkIk&Naz~)@6)J zI2XEP{7~3whq%r7AUc>4d(P7)5=~-NeWjhJ zw#xU`bBVzj2rHB0@gr#Ui?NO5ni8EbCmE`7sH@hoE9<1Z2srw>Majc~02YRLc^az1 z{oPJ@hcHB^_7_$e5qTj&(N+OaRJl!KrHba=tiAi~+J4joUddtVhAe@ZqMXfNLcsp6 zSL{C~EPzk)_c|OfVev);tiy4h0S^SkQ-D=ZYT=Lt*hn{VooczckMy_mU)}rvY=@C>AC>8k z&d6GOm)Q9N_3!uy?IoE^VFm7^GU*$A`OIDKXm`NQ&iXUR|0@(ThQdfk=Vs>G!p52- z)~5+fyj*WB-Bv_q3yRzs!H0nnd;RqGQ#9;D*^kdv5-NX-!219Jtba6%Dw9kIfDQ+I zGF{0U7!@pTlRsq)ZTw{!d^Kn((fBztF$fA;g-J{1n~DOAJ98z5%vi$zL)}}3Rk?Ou z!!*(%-QBeo-QC?tC@tM3p>%h5ODQeg-3=lj2!eEjgw(h6zW4KN_U3($_4EA>f9!Rv zed)T+n&+HjjydM*_`WJ}mIFhWU&C&VqT%*V>a0W>6Rx31$&o8?@~YF?uAi)EQg3Ts zm(4yS-0K?NC-RE|M6$nve=YKnICD(Oa2-$0Bc(U+fi*q#^%4L2*sCi@dUH|5r-lSp z3gaIQH$fbS|7{xE8UOLy8(pm5wDM%9=~0(U^l4^7_R+l6x?Ofg$G-OGc`1ArhCW$K^E2&X{hpFT!(qf!$=7_+6Xw4dHW`k$F@1px; z5gIo0cil)-0zXhwa#|oQR@l8n&tV)t5j5_iFXOQ+pbv1!c3z*vRaStq=tSSfwm&T+EzUrEH8`%p}ZA>`l#B z70m2jyI7F1|Fb*?)M^J*<}pFvC{lP#yOmb9n}2k_lrG_uGZvZP3xESHGnG4mndnH6 z(-iJ2D@k$69mljmzYcgQCXtCNUjkS>A>ma7op@5VIezAslh|zjtvL}v-ivWAuZrk& zr!zN^pA99~K9Br(9{W{$QP`kB-LByTPMJ_xdp3XVN}$3ft3r-C7nnJGQu4u?HJC(Q zTn*b)hj(uOtAkvCEqP=k?W=@{{hN#EE+EkbP zqj*F0#Q0M1(H>gw+vF#D6S9NZ7wmB(N=4N_Mi&GfdXuSi? 
zS9r2Gp|FcofH|9)3+4sn`Z-J>8F!D3r=sr*Cu=SQ*Y8sj*pu`XgDGAS^HL+#xz1xB z*jzlm;F^m!f@nRaFn1i=ugR@mpS{T%UW3E*YOX}VL?eout?B~!jeh6Wx{r-`L+oQ; z-Z+fWg10>P-XE9la6@=oXd%z3R%sYf8lDPNLa-2c==Gdrdujz+L~HbkGYTev1$?BE zuK2$dXsh_paQl}Vn>2l!4+pYJEV9K0RQNEz=jw2J9(e)3M&s?Lq#~CY@)%l|gbyzc z?0Zjt8ofMNu^|c=Acqi0BVk=3oTd1351G}whv(g8;%I*;?rGl5bNROW!w)JVdPv?NtB~*vNQu0wi-I?1vaH2#`lvPn+fPZyt09~RIs8%BRQ0(mG*3oR zG235yNaEUH#`T(`!Rd;k*GoHMdCQs{(QCipU3mXUq+2wC3LaLU>-eTkrdXCyQsnMBFW6_8@bj2|VgKj~X%X04h= zX9s7PYNpH}ofXj~%XVV6%3B{x$gE7HIRo=wW*#7_IqN(DSDcc>?pb0DU;i?NJAF62`kE_=J23iPW@Io9X1_L`$rM zw}pKgi@|QcGPm(RrKfuv;eXx(b#M5wT_t09+zHrg;}^zL(mSRj8hoa`YwpWi;+&bX zKUS`SDRhLun-NwD+C{4P(Y|nv#8KT}>R-kl6PKHk$oRI4?Tj2Bqq0~4* zj5i~{|DLPN^Ye#i1)P_!ub%k`j;BvIR$LR|dG4TXaC*KIN8Gjl>JVAUU26Pp zNw{@V^C|xk%;xNzk2a@fyli~$e^pMlpD|mk31+9JLf<{?$xKExrNYRmHQD^uB@GA} z@O215zsl<`I^Nzr{z@lq#HGzw1yftv>sO`*RSzAt!Q8lx7q)!cOTFp`e`qf6&?9)Y z?q1(v9&e?kbcPhhHbH&Wi8)$Xk4S8dY_?26&H(tyzx(07E&1(Rh7a=Z8^3#Vl_er> z_1m5F?@DIXzrTgZcQB)wd@m%r3)~F+05De?@6;Dk;nFB>WppXKkK=YP^uX{(gNMEx z+?xCP%qhomOQ`sBnk%f*D8yVRlXdc^7=IBsR_>h-AVd+l@g8SEUa_T}^(8$BUaI1D z+IwnSfdzrGKdz1f)tp3wa=(9c$U*qDFsKiXD(m#+H&ioa_e9NXXw9N35Nj){kdo8# zSFQ|*;i|ubmwdiL*YCoacIIc%c)0SREj^mgVjgpNHEO;)X7_`VhSq~(%iSCh8J>}_ z$EDNSXq5!FDeo}4d*ehz_2&CFmh}ZH^u~_p>xk8;DgMr(W%v07BFbm9Z*ZvQ8{n!D zh%s@g?FQTv-OtMk2f`EdY#wsM7iphEMVArde3_v;>fukYT3FTXnFL6MjUqoJPF6L0 zN0$rOKj$Gp_2_d~FXbVZYBTA~+zV2~55{Y=$!E~T)~|@}G&cLFE{DC1u={*9v^9>H zKj&93rwC6!f!fHkFL;U-JOna;>R08PrS;`oR@zw3?@QkECT5Ej{X~r87uEu#eSBF@ zHkcc@*-A`Pb@>iJ?6<9|IH z>~|vsVk^7mnT8_Nx=Q!A)0KEIDK zTa;VCSBz|S5sfMrjHmYDC*vY}<@Uj-;#Xhs`qd#0V$9&QvFiBT`R^aS_&@Wz#FZ>6 zoR}8#nPo_@-G?plX}w)@o@Blfc^h4Do1}pnIDFkp^ig?)wQ$wL`c1X-t6Q=ysdor= zC11K+?`YP^ZUJ^*a0Zs5KF6G*iI&4Jvu=jX{604qzn&!cW4P+En&nU&>Hu||5)+CzwBHnQn@I|N%Ua^pb3)cO z&hR=)npFJvv|EYdU$^uV^7=y+m-C`V1YUH(;Zk&!Y{+Amxn~)SBF?TUaVK@#aKyjL z9!#?*`QqPAE$F@0GtD3+f(Y!B#lO#JWRAe%G-+Y_Gp)VuBMHB}& zlQ05LilI)(0j@ubct^%6dQ{cPZZ^q$9p0^t!7g2XeI>_i=5?6lg0dr1%}am%bH5~3 
zWB+-a=RA6J>q#Fcgbhc}RzuChO66erH3&6wwYKgsP=_EYKamuFiEu0k#n3gvdZ8RG zF-mJ2_x?6y=~^b|9M-{D691`12Fr8-|G_u~nXVoYkm>S4WSLYu3pq2zGBf)2j?m(D ze_GHFT6HX}{yH2c^kDqVSejKtVV+E4n$n&(c`Q6|yzkUgV)Wib&UZ#~@igSFsu{m^ zdAF*APRFf4!(tP_`W2@VxPu)#oEnA5ECORhfV2%vZO0@LFX&M9$FCpmq^=wvu$_($ zcaszf$<R2`R&2 zZ$t$~UR_9{Ky`>3Yx?Yr1o<94_R#~7n$MW&fV!MQcMENDrMH}a&Z6y^y?trdp1nzywV@nyak4BXy`;O~3V#6&-0_Zmd-~HseNp9eUKymups)S-G zJ<)hj%cIPpj2N^uxh-g$G`V909^X|84+4?P7YX7crTF4CDjz8t*2;c<>iy&_dAwY! zJd#Hmo;{KkR=$E0j;cXb_bjOM*duzp@Ppxj9M-->8RxXQ^;g#f3hWN5Qm(}GKrg+R z)~vQ@RI3re@*`}=yE}o#g|8L5P35vWFomC!?ZnKjdOGPQb|rJ!##-MJ4XuFM=Me5Y z8$2=5++YDb|4?Exi2p0e7)}IAG9swxB9N=nVTSlv*r17YxyVpT_>p+rEFkO^5{JHx zzH`6zOVO5m9~|UmZ*FC+;~`u!-YKxGh;)nCAPOLV!~JQ*e4&<@a7+Meey_Dtr}V9{ zYErb z)Hk4BBS>=g4A_$Rlc1;V`M*S<6L)Yi+sv4S zof)(!k1_ZJlBhy)yL7wfxEAeNznn~!Fo^x5+b((eEBr-K z1lf}9Ge}A{R5;dOxpu-L$^Fr}$$?C)x0iF2g_A+OP3cVqBa`(V7rJk9NykGW1r_Av zLq05Iir_hFr6*#`P~RH7*JJvMNif>e?y2u7h_5yean7!xedNZ&F&%j&vE7oeotV9L zyo@S}mQg3%n0uju-Y@bUi>A%#rXLq3-DcoraP#R6z~i2;-7P|e+F+BTTSlK5ce^n= zZdS4@_mEFYW#nC*x?5yIqoefhR?6fX`XzQG@A0n^?JyP1n^;XlNArWWn*ujN)c2U& zC!wrdTVki5DzJEiH3>)w>-wd%1v(r%|7mp@xb z_ig9*D5zd#k1r%QqyjL>=YEDk>yAA`18V1`L4|uv{s8Gtyq&3NyM+Y!81vwWyvQ)l z9X^8!SsVe%UXv^ZZr=s`Lo8pvkOTcss=yb&q^O6Zx=q#<8-156?ozj5))N?ehK-lP z?ymNNsC^n8H0@I52 zH!x=i7u3wFY6sN9PWw@4Nd$|K4QyhMzEDIiN=b{4lAEYexhVFSyf0UZEEBagpbu1^%y!Ks_R%fE zf}_3h4UVF+ke3%QEQ4cR4Y{)-qek2s0kGP3_}}%(xlL2IloPGDuF;u;3|_mWm+#f1 z!?xXW_xjp~ZlyIAo<{R-Ut~m$N{v7>JfIRDQ%zs~d=8DL1rsaCmCR}b^h^_IEGK76 z5H;x~;U*o5cvntheS||O#QkY}|D!+Jc9fQRFYZ^^m){%`%KgzwmKI!B2vOg4ys{vU z5@)UYRy0q$NVRtjedyOQFmXZ(A-=WxgslSa2L5L>5GRTX{a*@&0tI6Dr6mFG%#HmA z@8Y@T=ij9LiA)M*tX$^IXC-7KUlOT}yOgr|_$MN^`gL(`KmK9!hHt1b`*!} zvVKi}jaX;T552GudMYx3Bs9d}i^JDQ_XD&Qa~~W-gS(Ot_baFYEB)tBjTo$y%0H^H z4L}X8{_57^k~FA1F{7^DHVBgLdIkX0%C-##kil4&tB7K~%tD7_tm;HdmO`%ljDL?Q z%SaS-*R+lmy(BnH`>HX5HW+yF34gD~9eGWC)Pb5Nm_kAs`NpZT*M>83pNdTn#i(b_ zimrqa&H8M8MxPGJtabdw1RM?O>D*B06doq*))t+ z`O3@@r^j>R^@ 
z-TeKUfIvcLrt6fisgx4J9)^+pK`5?DlJDa4Dvkm;QZ0~P%l>brk-pP`wq-G6NW)Sjls3-gF~3Z#CCapo`#4a>D{&y0BGJ)M)@e~@N96hWrQj~xDD8BMdh|xY zvCxOi@#YY?dBPF9FYm4(k-dYo{=MUPfZ)NCkNepf#E$*$f)jY;87@c@LPkrK2Wvz} z4@vqk+%EzzzO88ua@P+=!+r5KSS(!P4J0Ls zqtM8&=rYc+^PF1KjCodL*wRn)tYy(8ixnKgk*Nm-ewL1mpt z%|mO_w&z*-i;@8ox$N<+kaz=S_9WVtc>ue!m0CY5_UU@`@x6Fgh{6w7=!Ysh)Tbs2 zEVq;W9|RkajOL#}{deO(>}Ng`38ke>@i+F;QB?^GZFsUS+{4>#4J$?UGyb|I95WZ} zs7D1^-#@Q}pfY=76TJD(aMjJ=L1I^H-iBtx=#7o2s8O(r^kFrvwlu3}J}5Wwly7uh z_?Mb!M)ts`F9sZ&_qw0bcdwA#Q~H0{ZT!@`?XLNR*naGUn+d5fWurd`a{@<7NErnz zSWJ9=HmD_(l_3}MbBbh#yra5661L>?C(ovZea<7tV?Oz zSxCk?(Ni==x1vT2RmTlDm_v*q&D=e^wLL3z{nzzt+~YP~M{)7W#xJZT3?|btr8@b( zYctL9@|>Ewc*pPO{~T`dNnJh%@8{uz2#>n3Ch)yHorKc`7Zb>j60;HUK=h{vPSr8OAh_ELQ# zpcv4_s zNYkTPdm|Ti6lQpQR30B++w67?N}8(q7)A*v6WlfA(-Op=GXW( zJ2kGtZHTyTI+!`UcZ7uCgTp9&>Mg*kIRo%MLTEvHG5?i6kTmFw2MFH?5KF$G0_7cY zrc7r9xK&gdZJ4&ZTpX`-b1AR#MTb*uaNk7{OqlXczEdZsY*_2*MY@{GY3cJVqZu$U7hO=@U3Tr@j`x($=b zBAQDM2MNc>d@HE~CHZIel<#4%xM*#U>+y&YxZp8FKOPlnGxnEMrI?aaDv4v18_b)! 
zxZr%=h+iXGo@z^EL%)wL%NHsXY%Jl-4TJtrl?D}D8{_~$Y9FNF{$8>9;_@rhE4ftt zjlBtR$1RQ9_F0}WK2pJa8?|cZj@w{M7;TtC)@ChBaetxVY_8a7Cvgqge zeDP3hQnZca{KIob@l_Eem&_Bx1C!hISQJPwh8I;Fq;cV;1i-z*{9rr2PA3=lwF5=x znx08G8J|p}6TJu9?%7KfXMw=LAFCmWXR*<~f!$wL{S=pdOzoY92oOhXr# zNS{OMTV|-(7R!|Z%rlXWGPv8eWnT#ZIX?e;h=9Kfeg2p3@pnC3;2*!}_lf)c#}t70 z?|%YBI|heD0$_WjK`9vBf&$X~5%*H+f<@NJ7d{ohg`v;9Tk3UV!6p^;S35{b>z}3m zY)b++@6WAq`{6m`m#8pIw}SN4!g`ggA?qR=wil+q9Ewyux+u96Jw`gJwr7`rr8TmB zU%4aYu~%q4?uGYSB<*35b#2jr7cM*dg00{bk`{hpi$5EF>wz z^#lVfQZq#pw~Hkm@VLiN7k9-W-5I0FGPkM20(%IKbvA%6f6uC9{Kk35rw3O50AbW; zNt0lJ&JxSi#8lFfc?8yqT2vY@u%<873U!)@hR>X<9pIc@jf{6T+kYX&2Wn5}@ z_Zv~xl0GieRS!=iahz^^5gOV=ottKMsPDi!;y3w~lULoO;yj`)PmLNp2bb-U!D0r< z)_?@A{#y}zMHPFdpy>(#iDuI5gwuvb1x!eUAXEUL><|I1g{5z%8Wz5mr|Z||Fl09_ zNfASxiOK|whmBnQvRhnW&p@wS!Gmvw4}td)rFh1hYo;pl<1|3)UI<|dK}#-$pdzgJ zg@%OzawY8@7yT++Q~>3OP1Yjgi?}d0v~mujvQOGl-w&BI)>C*AlinR;+HF|vMXVsy zeMwpp<3>o*=JWsdd~TYJ677oPWyo7}sqa4b^KDDTZA)l0z0wg+%m)WpA?81VYRMRW zRH)x1>`V&WlG)o|yvi8HbeFFjbSBrE1XAJwE1MH_}TwvZ6f%3ZgZYumP)h9kM~ z{5R8{6Is%}j{?b0jTo#dg2x>V6r=c2K}7`O?x15DJIuW%q&Y|4_OKYsphhbkY14v4 zlIlMToGhaS&i}t0eG`w*-izYCsSMKP-t^pSsEJ3u-39W+R0qb}x9 z63@O?ks{pW^=Ow8qd8R3uc~cg$bO)QaZ`NiB{@=;VuQ1Yb@;eGY~2Oo&|Ynh-YW&;Qo+ztdh zvbiAZ^FL})7GWJHZ}5;VLy{qua z)>E2ngVWmzazcV&Tzw{l?Zeto`0f38r)hi^L?}E3Vsn=UEoTWZ0oe}vvL+sW8Ojh| zO9j@m=Ox-oKC!p^+b*Kf39^|&NYtP0Np7{N{9-3_8zfW<2}I)0@N_-B^w6m14)F1@ zn*vN}|LxH~}?Z((5R#4l$SBvs(@DEP!WygKIHeLf4b z2P!N)H9KHo`H!>x>~)`hyVP77|Mh5VAx_nhn<5pW{37*POrEooWkDCYjwvYmDZ~<( z(#Vji_>lvVC{TbfVF*ClKmPR|GxpIFF9QH;sUEkZ`*KcB+sY5sew{}Xx?MH@u$cQzg)?ip5C^@eHIN)9 zv|Z6eQau`v8sYri`EJuH-0OLk&?g4L0Um(D{@7=ZZ;AuCOE5lCelYdnS698Y+S;#W zu3NaFD@;ST-yb7GY&z`38&r(a#I;FX|leX~u|iG=TRg6fpt8R(CAxVhJriV7PC@6A@j_A6e- z1Gc+IC3HUVD_T3MIlAD_2s0wkHoF=l2yY=EH)%!7dNNe!|%%OGM? 
z&$->nAQM$nh7UGna6t&}1>UN!rHWXlqqc+gR%@J2W=U)iI=kt*nSCzY8afuqonVqJ ztl+J8>q&UNBUz6P_g@UD`5N5_ZD-Vqr};mBF=%smw(;C(PS@86q-TYRn;~7e0|T4(5_FU#Kl$3j0Vf8?AIh)96K&w46{asjpEe z@|LK{1Cle7ULe${`_XaM+#)yLJd`4K>=cp4P|!aWn1k1v>mOIABxn?%Is}Q!%zPHZ zZfS{$nS^ns{Tq9D&a4-zD#zjdkimI@_yqGdLSjG|N$COcGC#E-fUMxkekTCHYMM^WVz>3Yx(RC>~!H)X}1tpn*XDDJhXuarB;0ustxke3(b@2#bOV zt&~@MJrDP47qySehs+x14pub%Wt%G#ik>}vvp<*c4BmPtbCqkXHNog!)VPw^%RE26 zOu3uTGB<`Eo%<8>Dwv6-@z8BY9Mh^C?;n(`cqJ51NIwAqU^&_UP#YOfF@IIf%aIT&jgr2j3Neq8otj_ib{|1}{CzJhwPHapO zOw&^ffQrbYMu@Sg=0!y!G=l=BG=tGZERC+bDqc>Fz{crD2p%>9zke?`5CE1z1OOko ze1cJ;zxVB*3L*u6iqIoR_61NwpS^mXw17N|yyI;qeTl2xvV8^%;UG7P+X4c==0Vfk z!=n!t!nr~oYWea%1moig4mu09TS?aQS(Eaf5noU>i0ESod|BkNK#WTx=GnqTV%fk- zcEYf4U-4Nl64rV__tbsK4kiZjIIsdS*8<2v!hxBfW&{+kP-^i;Ty&0zXZ z?sI$a$9y_z0Y*9La)H0#$Nk-g1$B@&s|8_JY zTLDLH$8J%Q)N_VZiyinx3JH{A1M4h4rfo|`|F^vB22sdxs3n-{t#VljJm@UO_S&L_ ziq=AjeX_FA>duf{cJjn16I=`E9~5#3;|HwXu9-!o)0Vk2ezh28&`3>A{IG<|+)A&0 zsBV?z`1y4o>b<@91O8c@A2({|m=PKM6T68WytwXhdnmzJ{@-yf2(#&e)dHsGOw_6- z8aqF5ttvCb^c}-JL$etSPq~cFOYFdf1SPMJOD-WTQCKlpWFu2HUzN$%6}SwHEOrt~ zSPY$;nI$Q@1>E?5&BaKtHW_X3--1DFJc+|en`}uu=cpn`rwNQVjlW$(s3lQ@2WBqzR3F?ubhF}sM!&0$fu^0>nCnjX zm1F1ufTZ#hqvrtYi~|6VqK@Co<@?`iyjT%>wQuSs0^1LjP>Sc%#N9PZo57JMu`R|X zVLRww^tPda9Oo?5djxnHIZp0=+aimjr=2(53i-CjP$|*RancQ&YARWt(>!Yk`kBI( z^w~g7Dv7$MqZ|K#w)MFr)f3%30N`n*12`Tb%FMs1pT8;GzXf%_+usP3=Ce%S*dx&} zl0+K9xl4;t63x)?X_-1y4gfD$wgLDU8e@(fi2~_Jh#ysmq2RIotlJwht_oSW zuVj4D)Qg5i5f5b7?F})vh|Ly9|J7wUpWi{gC8k)U*TJJS(Kr~!5K+XFP(o!hz80bM zEe|<8{Fa-8To`xseKOWBxS`MEozcOs$yzb!P$EVd`$`vTdV@H8lgku%_{mehzK%{Jpx@1mj((Bjd;lvej&Iy`!vm(sD=L84zAk$4Kk zM8-|FlyML48ek8?xvAugYZ}>cxV2yJC4?MX$2Q{n$liU-9k%_>qpPq21B&vpgafmU zLCJrc>W}=~iftTZmM=X;nbi27DP)Ipr2XJ;tCD3O6Gf3ne=QRpis|TG#aEs-(Bkkz zfY@{>s1^Nf^{*G{R^iL6z==^qzp~59mK{a3JBaRj_P{5`2L$grdo0$#3A)Ze{`)=vu{HJrngIzst!V|$CP;5LQ}+QniLb=?V9%-9p;xysh+%glxT!q}k!g<{ zld5Emag6!jH@cH4i1N-n%oZtYk;Wt(G{^Uz53a$w=2&de54AP>vafjN^)WE6C2N-_ zVS5LJcI?1=j{r@c7zcP#?*G`koUyXeu>a!}(?D(I{rri$eEF>&T9eSj*uh2`#yz@= 
zS&&ykC-Pn3N2G?-Tj|Ala)d3KXn4D?F8KEGLE2n_*f|+v8+^i5Lq#uLP<(&E_SNj< z8dkRktoUWLZ&*7KmZl!^SkZT_9C&h!A>j$HpP)SS9NJAeYEltbNLNa{XZi7k9JVCa zR_MmnWzwmJ(3{^Aa4TFTOYW&c6KL=u=($fKN=TH>xK*1s(h`yd`FcK*C()#1rHhlU zN5DPB&yagaA{W{Gr{)B#K%M82r4sy)xCG4Pin<3R)h*hM1P1IvN0cnt^31go7yCy# zcpU3sDZilE6ROmO4HSEZd>$dIsErk-xMCiTiLdd)vIy5Zwzz?&<*7cFZwV&X{NYar z37@Ei0m&YKbyu@LF0q;b{5GVplUeExelo%K`yjC<)sXXhYw|)%{*#X}QGNv6a2_u* z*ll`u$P;ynkUE}O;4^%voO|DcNEkMj|EY7oob9_(pzPjifRnFL+iXb4_|-@i9ZNJySp>m#!T4BV`SS@WQWl`p>boY= zZ&WXpfCJP@-`gyIbW15i&XQBvs+es;TQ}p-^*mh6Gb|CuiY)c_i)dYM!&ot%la#`7#xkIlUUwIY=on3-)MRLArk0mr{9>x@lfn{i5h(ffP50y^k682fd zU{@My`^~2^I=Fc1;}E(%olqpm)|)!RmDlDGnpXtz=_6EPCareK(ow!?$H_~>G7o)A3&EfYAK^RaldV&q?riLo$9OpyCRwEx`qp@RTDs$mES zEo-y=Jjy)x(4wd}4|8d?le!7+SAbD-Yt*}<5xHo1O;|0n+LW6D0ao50aI5h6Pc+m4 z0>HYTc^{o;MWrE7yoUVC3y~FlNITOsr)-yU`9W_h{udOqR3q>2Xe~e74s)xI4XYcs z(b6y{Rf4+2si)9&@}q0>9`i>vwior}ah1a|aYVz+8b7@W-nqb!$G6kEc&rAa$sUlF zJhAHFk~AK@=`$+4kO*X-VKQ zih$JTCaak#v~;9XL!0H(H!xhUd*DBQbnxfV%%O1$Gh6s&FyGp$b|Ze^q|#lj_v@+# zMj3q=pFaUJN$94r-Sryd?sM8VL7G<3FsxFTo^XeNK(MA80C)s_X^yv`fMiHuChvneRp& zUll}r)o?y^)L#pD!?EQbMYluWrZ)IRtU6}7P50V%kHNTXVQ=1fEr_;VHHL=oiP;CY zboJ5f|29N_fR*18Ryw(%TRTG=tiQg!^(e$-42 zY9HKz_wp87xc5T0QbzIDmq$e;oEj{C6X+y)|72G|lq8yw7a7O|rj+G>}q% zqA&yiUL)SeU}~TyLv;e~U~0jyJ~3{RvT#udA_PX zV5ov)L-!n;N}zJLDi1dg)r3i1p_Th8c_U1i+;;$?b}hE#*~|?3SOPgsNP6vHY#dR~ z(2a1+c!jc8gD_ts6YIYZJ*a;RIQP{*Xii)JZBQBnpgzgT#i7Z@#ilmN2?S{V56#J8 zis;$%LSP6|5$*rdoZKDJJf8Y}@K_apCu?|Q3P$szg4ALEuqvdWf|Z?e<`^U|?hicu zX;3_CB5fy(*IY2|<>QWVjC4)2>w^MoEZ0K8Tj|6quY%c6FY zf!t8R;wxM>42+PoPnOM9pq`}Fvg_gSMrxW8pMq5+W4%7@o7RZ+iZhauh%?g+)eHs0 zz5k_7!|SgnN@vg+h7#pCX?!ZQyu)}Gg?6^Dh})L%K;rZ7*#&~vOX6`Z{CA87dPNZA zXe?})Pmz06ZRuGB5)9`M1W=8VQ4Cr@52cbf-MC>hP_Dk``?5tbj#BCdT}#1P?UO#s zM`MfZGeyc*NaBuoX*h>jpPRfd*f3Ebi?)0Y$=gO?_V1Sj@cs?V`Rfe-Ed{~;2PX#^ zI%aOrW@YE*CIf&DDu|;4`s(*5TSRW2KObxmfB(-vPk}!E^%p=V`9IJ7A5SuPJ<_vx)jv+AXpJD>VGq^?@KNRw2>vC{qq z{fuw;QYPZ%c_CT?t)SAsryR%*R?Gq5d~|Ja{s)Wt6TC|Rh1dU5S|lYmc!O;25zRru 
zG$ftw*xP$R9oW6f8pboOSQdcBtDTqSUBYY4B*I{wZw$PqF!1X0XEu{~N$yag{FK(m1;d&+sR$lE{JV*|&4!jX2%`no|V^u*>BQ;_HrRaroGup;+= z?B(C)6a)*y{)UB_(sKS#`Kp`V5wf*dj)sQZnoo8|kAEpb`vi!Cc?Cm}cn(_^7c1*r zJ`~~5BRUth8#rc@ufzJK^DSj9d=**w7YhMG3)%h6?hkAqSUspve~t#vWxf$Yjmk*H zCIl48Z7(+y3Xx%PQ_k@c2Jm#_*cL}z+2<~w(pHPiT*=9)s5L2QHmbb#ZNxo3M}97g zU8ns+J&V8|B;q9@a`5k!2JrmNeEgrqodA!OZ+_ef(hz8W}Wehn#uDy9|cdhln54f z!;iV~g!s`t0~K`sukmaM`Gh9L2MI+G%3&QLLew)iEX|N=lypzT{~*aYaBg4#+uuy0 zMqC?aUd{?pq!*4}3JtPt5i;i$v*y9&DZ_SO!bU$n+2LlI~H9S?OE(=~0YZQ=$C zWeZ>SNegeoSRFMB(M*MmxMpBg zLeW|mrp~C{Q55<>ju~&H=Qy{T!L@+(Hf~NB3p#iRt<0)`?$yi!G(B>iCjSb@ttkhx zu?&|E%Pf}^GD$nxu*55p#vysM;BHf02N?a{U1LNxN-tMq8KVw&`)}4GXof9=UH6jR zuF^|q87`e)7QbD$Zx+!CZbOIb^KU;fT6VDZ*JJH4b}kV68^l^so#brb&|m|CxY!)r zD!&~hL_Zx654`#}51a|)AVCfTA!P~E^Vm>On8~J}!F@?YuHaySpxMH){r9T?c)^?a ze_*bl(+z5J0OOU@jVsc9)w3=jp_&dN>RKm+W9%QU9)4ZGX|POagO50sw-|*#r%piV zlz(0g71fu4X+lSt;PVNSE9_;j8}dl4Cz%4dQ&FYW3~f2{uc_gh!c2^L|6ST&&WAt_ zYYL-;V64sJSh+@Xvd;{2*Q|lJio!;HK+1oQ0|@e~!O&v^UG9G;jcLaG36Lec6uV$y zLN|7}|6`pqD1TsCj?#{_dXaoClyxt4M%&?%G$7Hnq45dtf%3L5HHRYE^VNqHnQrmj zi(;>T$G-0IV;$`#-A|?Ti;|0wwL6Xr+@(<8XFdmPGjHQ7$k&S+FJTX8hN%{p>$Trt zFENoC*(oL@MsNxvS6RjpZJz_*V{1LP=W#emxx9-;!A_KM*WsWlt8{q{X-iyus!dEV zaSFS|Dbo4Wz`;U?{(aVe!&-k*{upWxM2e53b}p*#dA#;`UaHr31w|!-$eo ztT2bNG3W|pJxhoj#PJtA8RK7QB_Uit{CqDyxkTX+w&%(i*`!-Qmw8&b0&|uk=Pfp} zdznPupnnQ2P(fDn#2a&g6<|KjrfZJ=2hZx{uUv&z6v0t=Fo}e}ebBU0e6Bod2eEHx zS zQ4oNfkFx2UAgyQ+Y!7pp*^P^b@X(q0I-l-$drB8*d-gyOj z+SvWF&P(XbdVNv5n|WFv1s4g?nBlQTLW-0D|9a3KsmJX+o(?{@o?PA0ZXK`W=Nq8T zf?vZB>~{}O5Ov}BIi2wj0nYxY%WsQ*yWuIFqb8&9 zSTl}6jKOfDCmaA}{p|3VFOgDql^H+$WSHoYIcKr?8Dw>gJv~3XBXxAEu+W}E7M1oDRk8?QYNOj(}fka)-mhQP)c@bD zU<2@gwG5!Z?eA7!P)*)W;&+RlaD9j{Z@<5Y`CP5zhzY(~Jf)nbp(*G1>Sg+~KjbrX z<>s?az4G3N;dHS8vZj|)TGQvuoscRGQe?LwCZh^*jYs-W@(7I2vMYWiha?Ex;xMe6GhDKx|$vD$#;cvGi3@Fh& zUu;gchSzgyR?%^moo8V=k%y4xvA($#yJwP%ysKIjaiv1iB8p82dKH>x@m*6yq7*+w z3*JKTNC~gMSN+m4QO}=Kq@^lRTzmRWrWaja0Z7J*{^7qL zwS(l8siEoGX-hRwGag<+@6r?>!0wB-`4JeYaAFhmd_Nir3W4`V6hP9B8`l8v0?f;L 
zt31$B<$zgbyM@wIG=!D;-izTdD3i&UVZ7^2t;9DOUDg3~*SAJUFw^GjS{r~t0Z64} z>3sB>t_b7dTaLOM$GGy)r{VeL+(vvLn#cmLz#<_0DH{)0*W26+BdN!TL(=L-;!1A6Yy6QGZOJ^Y`K|2LfWZ@>M22@fCd zG*E%U@0|uh$h(mgl5T$r$iAue!PVVA15fb`TkaSnUkN~e4R47g;TX-0~dGCXV#8^y83?1aWYXsa_ca>^Bp&SRG zZ}?50yoS97PYkcMAR4O&N3p6!@RXE8vlGWl2ZChYkK|h^7GJXjCaO~}S&y(RC$S}m z$0fdw)=I)QPI##|dpBs`K+1YZ9X_$sz*B_%2isS3LLTu?7OG7t4+-zQ!G|PeTGI2u z){NTEg%iyb#YsiFd+p0ttSsmkCw*i}daqPYP!67Fz@%f=d$)IL$it_qx)D`19DQsi z3VVq2p{m>39aZDB!JzZeU_lSZyi_Q9BbMrvY{ogMQ$=^uTZ^1mFAXZ;fcCMVW!tiD?5X$PHqAk!lDu9?7q#39V+bEN< z6cg{}z_4;fZh=5#Iw`WpMXu+&HH>S^VY;Gj5p+2MtG&TY^sp`ZD!e;{Ng3KSE2MoY z($-ILIv*S%q5^)OJ0_*I&<}b2Qi!pBNDbSf1-Y z_{*Fzim0HJ4TwkyO0&_CZNFLMuwQF=s5c4waD6q@>48!q5G1d##S*=17bx{ZxGrz< zr{$ET$W&LKO-&#)g^ngBON~`n5piyE6*n7q#jG1Jg7zlXXgZ_~LVU7a)jz<4WTnG$wA}K4A zKkB$xG9p**z8J|~oeouT_Q~*YpHwcQg%GyYTdr1!L4S=b8R(k$Oxorrza@5e zo8USo=EN~J@%8iq=JXWz&}MC9h>C0ekw)<5h#V5ncQ5SmPjVP#vJzRPOx!FRq}R|r z#Q?*nh5=T54`BbB!Scw|{40?o_&br)jXQhxg~Q+zXosbx71gpmQ2Rojk0;eRpI9Qp zBVCafh6okTa6_1xwz<3vG>2U0CVvfIU{zR!YRaNkPM?+_->TpkqgHeM^=!=kQ?Mb| zts^V0#x0#E=RPd*`s_*4tYg7dqeZps-c8Y2za*>YH15y}5blYwu!FU?9^dl+#VR=1 zH2(uuaIvZW1y=l3wEk>l5^500A#J{sp3zyTnkw?#xp6gkt8;90ZFp5~9SWkn8~3Ri z5O%Pf4p0K%?}wK&R`OzEX~=3<*&K5BGXU3JLOh9voABb;t!NH?^qz!apI?j7t?^u9F1_r} z%vSYH_mC-Z+5iEvFP!b7(a`R-O{j}dqIbScDMN)hQw=v^Vr|YU^)oy&5{; zwD_M3D>~b%y|(F};wW{3cP!QT?{xur zAm9KVkIA}vM@%Trme*e3c>$J-C0G& z)vj9_cMI-Lp~2nV-Q5Z9ZVB%0?he7--Ccugf(Ca8bp5+W_u1JeJw`51Ush2Ui?zP> z$@|Xv%x{e#dhU|Wv#HowSw&jBy8EIv_sFQ*XQp&lcu{mE%UtpH3S0+JLv_STf4DHt zzuS)dUVQur)Y$-wk3gMG9Z~JU&|t4OJ$G^IYY$@cLNt_{QIudPBCz&2)#uVlou1j< zubQM^D8A+|w3iTF)6E>gZeF0=ag?BTTuhXolBMn|&k9e-iGgSN-Acy9SNMYMar?^w zS<5R3-U$)jf4^duzpH_0>(}Q%f0_%!m|uxuXmR6@bJ03f6;?&iFUTcAPp4c z)RvAXW&fVFOyQNddtEqk3owr*`C_jb*%`xhh9u;nx3O}ii)_n{1PH#Iy4M#tL!|G& z!LFcv*xu#mH$$a&LuGG8^19`zx{+zk%SVN2*fvCbL2)ZfpWKIHi={9MT;`wBv*<#q zUe_^+%Q~7Pj*3C>MP-boIRhWAmelHnZoJ+zs%)Y8WQsmKV zpnx~wrPi=rIgjjGScO!@TBg)QNFB6DRGO>O-4Z`b{5p!MyQ;HmbPMj+0Fn~eSzW)K 
z%_VLDNg-bZq?InZUgs#kV(z#Ko9v_yhG^vhZPuIbDCyKxr?;W$x;YipW|$9X63*T80|e)INc=wO=l)(!ZnU{OtHx8nTqaGuOq*@fCRkGgr>UXXmaz=CZ{1r0Ffx<(`zf zY`#<|_{vg3ZA=j!A{?LzF>)N{hX~3d$`qP2=X6Rz>REi^m6Ok~uz&9}_R@$KNZ>ve zj%yQ@5V2K!k|2Emk#QYcJFo6z;KjG~ZJA7Lw`Cej)WecNY5K-T}TohiMwUAQvfm!+Ut)v>4 z$>>FV1D2QFVh1R_!L~znOCMqv58~xJqL>KX}H$Z!FXl|0E%aK)Df6O!vik;INf6622K}$9v3ZjAo5Y7?=4%8A!oR; z0>|14?hX98;%=@5<=0bWm^od3^^a^ylA_cQVjN}SOhpz;pT33&-(n+cg>=QU$^sMU z9Z|yK)t68PE*4Rrhs46kv>pmLC-pC%0&$T1Z<{SelD^O4_7vfrKD1MOreWC8j(B3?U;gM<8J`wE&a5cCgXQkH*L zM*P3et}JZ7HQLk?|DF1u7VDdX6YUibG}jx>Z%)%Fw2mu|l88b;kZ_8-r&a90^(;c* zAfW|T&W`QcL_G^3#ZEw+YvCc#fd-;~#zmZjr*J8f84;X2W=vj^Ee)3SR+QxHA>Co= z=Dh)ftJ7{J*OL_d!8SYN-rUfITdLLX*~H&@Nw-Krl7&nMTF6@~gJj)|*3lKqvB3*< zkd7k>?GW=5nsGDq9vP8By6(lo zmi+*@%REqI;Oxa3=}>rkZ@RFemkhVxZ_gCY|TLthBR=%e;sW$j7B12p$qh73`f zz`&;I0BRKgFGX-a{6Z7#gQp>zegLAihmlX7q#fy(F#?&(u@s^AHG$I3)<10VSpLr5 z!~DCwM^EuzoZ`+?+WR!Z02>Z!HW(pEo6CIUlY4Me%b##oQlr;>Uks#Rsc(tv8X8z8 z>G|AKr;k8xH+5D`{FLy%JJwaE!hp-K#npoqo3-Y@PH1ro7q)Jv_&J-4%V2}&5yk5T zY#X{OyY)f~Z+8mUOhI(iI1@Sca6bgHWPm{K;-kmR9gDSUFpJ!L*_ z;0q@^ddiV~?H_^-*H43mrG-PPvgHygd~*z~G{*kkav8+O=2*|)h)8+XCVy8~WukZZ zRww#$AY|o8gM( zXXKf(+szEbWw`7u;3)02bpD<>Q$N(&rS*aco|b;0spJ63r|xxpX3YT-+iAN_HF@f} zT5zb~s84tKZG_b2zX)gFJM{W)+Uc>e?-(fW`<#xyYaA+O!*;a!jCn71hYB*3S~fwdL^4^8{3`7QsFh<1SAw>hqiP!?Fp6WCJ6hkgf;*{Qjg!_e5$1;5c6W*i4*l6)a9;wt5wZ=a9&1ou!I#08xW_|BYaUSpzep?|u>z3~sd zq~e<9n+g2n6dz;Sy{o_2;G3WK`o=)y;Kd-|^`87iMmY}fhgak8NN_Cdzm>q$l)3DH z?z*!(xX*%q9>*RE0fz7w@9UGJ);#6BD2J#YfsY35LfM>Ei@m7B$65+Y@0vb?Ms27> zW{9y-N-nVm6%iNmkcz!mi~0o6q!8B&>mZ+4*ckzJ3p>`Kndt|g18!be-b%5z9?(y` zDz~!+W^%OjicSaT9Kas%QJDGI>mHd^@MG5CQ3ANZql*iyNqQd|>*6N}5gX?RGZ&Kn zLv%z=uCt_UUFZdr4CqfW3j#$${YmHrz_BZ`3t*#wDgeweoRR1_Z86K=NC+%ufI zptS?vlt+v+96_^-caHrVc8n4IxkjDQ%Lx(+TfqTv_4JsFgK<$n)v!MAd%xlV!hpiJ z(Ymw3$+Pzz!~3Bknl?t%t>TfXO$V0ySE0klJCdC)A5){xN^gRwF`+WMc0%K&KgRg+ zY{#{3{3&m_cI#uhpJw-<{1%&1=&n9CTse>~H^;w`}<{9TScpunT>sC;}Fi*+MB>MW~K+wwU4jz>L0OnU78g&p? 
zBsfERkpb&(GlbRohs}ke$tCV490~?7SEW~94{y7=IBigT{DsD%J>kco)AB>Ox<#>a z@o%HN5}0gD6D5+os8q{`li{J%G(UiSDK$d;@OoG)nw+oVUoH&eleHFg2|)K*La9$S z^w3m$c#HdlC;ox+$b)S2r_r@7Z_Ca>6(@-eD0peV2V3WEHl0;Ggw_IsE*Y~St=X<| z#@j02We1}7x!C+SwrTA<{}u4fXsqKmsNA3cXM4rAYuoqyonw6LclexdELTMBnLaVG zS6DuLrRINn*nf9E_>VXj)&F}w@B=+HpEx|;+5NcoVjt?lY}o;aHc=ocHMiB_5>Ee> z*UW5O_qs3?DfLq^ko_v1F+<;sg1I#0D>XsmiO(L|XatxU2)VY75CanpUxiQ<$>K)t zAd`>J|2}`?vA5o>wT9DDt&GBfFv2bRVz@_QXpEtzt_S}zy?yxEC333L+Tt{G6 z-VLgASL$q`C;tb*WeC!W@$hOoHc%ddXMwVN6ObJv+HFKNC3M4WKOo#VwMs2P34S3d z5W<(G91HlSU9I2+!H{zrM_ z9}PRg`@L6yPfIiQliw-n0&;qjg0oQYCyqD(-C(e!wihHEP~0hXe!huFPMHy!+Yf{8M&;~nccob*_WU;6zpf}2 zz_c*;;EF@?2*;UeOz0>E>kPMtKHWQEy#Pa9+5p=f0BfPf1xu~Q3i9iYCF9}lQ2CR6 z*pur|NdQ3|C}b#G{)6GFNLjq8D4K0OEF0r>^m!WozRS02(GC480tt6|jKDMZkJFax zp6Wk*EuBe6)wBC`MtKU%`7|FTFU?lzGLXee_0xq*>~x{Vhr`JsS(8LYw9>mI?4Oh5 z!O6Z;A^viu4c=wEM4+xkqUtKA>k(=M#l}lMQfgEXr6G+k^pQ$3g7MIyY0#QM%CLp= zDmyEf%cXur%TY;_yh{&Q7|g;j;($RFV+2QV6rF{aMFG{4>Z)amRQ`-xk6?X0Yd4VM zt5*jiFVr8I079O$Jl2anJ7P@Uyw7WJji`HvpJ8Vab)s@|r;_~9$qoWB!CF0`aN=|K z@JHV&u8W1I9dwgc@4z2)Ux}Q3hKg*^#wm|PV-tI1LN?TYl?+pu8rGG@{D(%-Nm3&M} zVZw2~T|qxJ;n&*Y)_z?Fd){4L_algQyLxJ?R)Fb)x$=B~uHE;1NH~1BLtN7w{e$72 z_3yj~Y`;et%BjfIA27lIgF|$C)qa@N&akM|Y!v`&YVl_-?e5YbBoNEMB@;H4y>KET zf##_)RXz37!ojv#cadkh2hn^%iAO~vQ|N13XJ+FfbzZ&g&_R?)*t`f{*SFIM zUOk?gK`09+xjM+bRdCNw7VMkugyc!>Wv-Y(bc4f3lC<@fW4*bW&Cp?7A4trT;3%Ln z0Bo6cA#Hgp-6A9|^83^^XB(g>4hUT9{7hbsTUGXREjLE)) zROr9RzFxFk2-@w&b`ZmP_V{h9k`q0u0o@mXt*Rv!4*cEZnbXmoPt^fv|#ajcLn1~!~ zPndYAdH8u?HM=xW$g>b~`XI0>`_OY>#lNt%@*Cc3?}kin?Jl0v5K7l~-yITf7v&s>z2go%}1} z!m=IxV7xr=`s#R?rA-7d28mxpmle_MF8+pB&2em8i^bFS%9F@ zkGsduy~#IDwq>+OJ=qK)y(RAk&=Zp{$JAeTk>_?|GygEL|E0|TKTb0Lhl$kLPgFLT zQKbJ~`mc0G#JIQ$EbB&u$ovX9yjS1&wG#05W@HEsLgn3@q%Hc#5$o^R>D~w$<+@_ul_e$B1o2v)3l#Xvq zxPSNu{!Ze`@_Tlkp0bP+(5JB80}QO6Ns9%|rOcMcI{wv?{5n75k{CxEL|)}ApT4If z-Z2qFoqS!QQ8{s`ckTK0;@UH{4Qu>OW6LkL5>GcdybolZI@kSp_!{IbiDPGf9%`_H z$s#h2&e4FmK-jnEMy~ea#L%-nciW7%ppG6<^Iwp+l2KtmBc6Gvr~! 
zET`q^u(*~*T5fUL4|ce(1GG|XkD_wSin9JlnDP;M5HJ{&NlqSAuX2_aDQ{9Yl!bj1 z)CrZ}5&^X!B7zuLy$BH;)=yD4Sj!)=4WA8J7jju^Pi2Z8WVK{1&hZ#!s2MpXxH1;? zqrhkaZot81E$^^p1W$k9T}X& z3(acZ4MR#DXN2J476ge6Kw%GnA?@tf&uPke9zYD(k{_ChE z2y$wlTN>D>Gr95QMY1?s>;p*(b%be!^+&Be>WBM}rF!iLGBvDoDciITpUuZ-zEd8k zt0bw@KS(L9JFR>WSh(@per|iyiw{yIJ^&Ri0MBs)L}` z%Y^I-xq_rE+}&-$#g#NC*QtC9XUd(8B{bQ(1#OaaIo%^|t5GDPyi4cli!a)~m4;Ff zMpvqi$5xC7fU@`_x@Yl~N3if0q-M>zNzi@sJG5Eh zb;M}g*D{8|p06LRKF0V_-rz9J>rz9>$GT`LJNUNiGyksYl>c(~U?}OQ64pFJLQ=?yAIW8M)IEIvnH4rdWKZbf>xxDEhJTy; z{5fT>Dde%x=PfxZ0)kK*L=pX!d{0px{Bud(cf2i@r{jQ;gf0uBB*U0KH>5)%@RXlf z7hsP?-wF@5OM5T&UY?Qp>{eF(G}!)16U6c#Nx#ZMF+e6u$4!I}NH}>AFzB~wO$5m5 zTXR{oEsh@s8*$TZYy!x2%4Fx%`$!=@Ef=IK+lFKffz2)q_LcFa-u_TFL3*M&7dfd- zDs0{3P6_`C5{4pkE!(%S1+osNu=08t&_Fx%q2P!1PC8^E>-CPliij$8gX(1ae zYlu?Q1!Tco8Tj2Q#C?-{BIqX$ak(5n7@}7OrmSVcX&1Ed?u@gv7$!C_7TxI~KnMcW zxUr&F=r z^@HCZXg+uh45LrHB61kv3G7g~9hyDnUPFJ?$ovDfi1qKJf~@SnnOfCUq+-Z`mA4m+ zXF$Jz(xu+?;f6}^)G5u>F51QIF|$QTQY5SBN1jFImcdWHPBuQvnc= zC1v9aj2E4XqgE19CnOjeqrbO(giQ-2*O#pHTa0m!J{QHitZN%QRi$8~?FRJ*JA9Vi z8sP627+C>RPqE*iq{?X_8K-Q0JsXDV>0j78C2ZNjWi0`zEFq79 z6G$dMMo;*b8?vi&)e3K(ry>7+zjSOa63I3Tli($7g2Ij?#UsJ|xD7@76uaKz7 zI8prLJAgCrqj?2AyD_!0!en^Z77s9;w_mcNGu2_|9*!gVFu%~UsJu$^rO~p&MJ93P zleOdn-=MEltRCt>@eJE_L@230WAj6x(6tRd5 z;_|SRdHO4K$j?UKy}Q3Hy|J}%^2yba1B5?(`YJT2WJ=+2;7k3fwx6v9hD4%zljP-Uz5A>n1SXU(tvrtS||jLm*+ zwVx7YnrpvV_s{pGie}EF&Cg5(AUdOwLxs}@TYemdSFn($PD7{1(3G#hn7*7~i}cJf zS3rvBjB#aPLz<*$ld8aXOUXT^)Zk8wh`w|*bLs?)v1KPj%8WQKvXb{fuD4(JQhO-t$h+fAGj)vK!=;#O#o9 z#HIu$qbM3&6UuIBRUZw!R5P-g^*+Z`BrGV^W6TIEeF9Zfq3_i7<2iUK8!Gh%`>~9i zR!r9$?%uj_5x_CZ57NZHQ`q|xjeoR#2a!LBAknwCjq2J6z z|BcA zNC=v5d?#rl4B9~}r338;45EqC^3CmQ-#$vm;-jjxq27kyV+&QH`*Pl*H=Sqi?#YHj z$R@GCvIZ}M`7<+R74THcR?eXOgv6ycwW)KcZ#0+f4N4QU9z-*|JvHNPF-)45weyFf ziLw*7;A~XkL9e$=b>4^4PzozYWKa-0sm4O&dO@;^Z`eKi_Hp)d= z?da^@I3t$*EaNZ4_=Q`K*@LeE;?nlXAvS^T_FIgt&}w3F*$Mr;^0-!)!SB1&gj{QS zU3T~|QUKT#${tSBc72n3zJn2QN661{_~Ii)hRAj|-QkarKvAs(16nE`kB}-Bnv`=5 
z@Cx1udV=__DgH1){avyQ>+co?H5DE^5hjFnUpT!T(B+kTPi=IZgqIqMpX73>L2*g6Z2(#m@bLbG@B6l`ur!X%`0byo;4}jed>G z=g>IjNkpK}ha@8IZ9c zu~~_;mTA0D9X-bw!#NJ}Q@DlGbR=I=JX+_uWe{N#cfeU|2p$pzK%<1{#x1iks=t~D zmNAK=G9MZ74OVJSWnr0=ae>g)Tv^={^0xTN83EJd$~qV+`PZ?2)NBwI13IbkH!>Zk z*7O#$M~M5%F^~-_6&ZXxcGa!8j_$wVObcgAX$ zsGehWzg%+|N$|5iboYJ6n1Bl5W?M~M?F znYgf57qW{ZX1Y~?7`7&KqMzv}4V_1=`JjMDtvx-wH($sKZc|XG_ZH zBL&NuW`sm3`5J7fmDcR+tz@>mjoQsjf5&${B-Zt`j+}AR@C)=l`pD%EgX-UnS^sZ= z8QZ@jwSe>`RT(=1;LI~a?E(tEyfW{p3>4%7q2BX}P%z99AC-zkB_6iwDdg0>Ck{qo zpNbV8M!Co04>j<{6YGOjx8i`=cEy_ z9>E(~nMfX`H>#BFL0sgmZx}~WZ_CvRcSPY6wO``lq>H4``zQ#^h5qCLP6r7HKy`*1 zRekMfj4nbV^N-+bA8?P6$)sQk>FQ*^r!E2~kE4fynlNWKuc{brn$S(w~XD&F&MI6B6sB${ei<%&G&$OY`` zR`zm4DfPKnH4Dw@g>icpbaWxzF3^wfcjSp6Tu;Z{%{|m;ph4Kf`2eF1Ryz@ zjU8Cyh2XD>V`b2SsAOtK=_7p~&h#|8qbC|n+WktuzjcKgp`wa#Orsh*@qv{wx;66l zVhR=0i>Z0GZy+87>MYR51J#*s8b3PXrL1j2)jtEA)Z-|73>jcCZFp%Ya?u0I8`FWs zpGXFd2+v^1-gev_(!+zgwv2*g0{Y!o!$~q9IvnSd zd77R+$H?#z$m@X4s<9t1*V3`RAr*!IX|1*s1Sp%UznxIx93I=CSLv-|>M}Ef?G=#9 z4lpouR%{;N8Q$&!%H}{aYuYKcrl)`HgaHJHE8x)YlWu7?GO<5+)Z5~TwIfZW@P0u{ z9U?&wKeWcP#E_SlyzNIA9jB|z(ram!qU6?1?D5_8vfs3U-rKX_hS8> zlB$*{7X$P{1V3;=VuhLZe)|CoUur)B>O+w&HpIvGR3^6vW;V17#kr5Nst*bmT%LDD zR*y{yZ{065f@}Jx5hnc@s{9V~WiGelHCPX)Wzf~Rr_w}uvPFUiaoZc__7%VEpGH@L z%h@N5L@_b2-qz(0A~LsIj4NqTeZwl_5?Y}iuppLs9vp+qxF8?G#11c|*{4%{;(G{D zyjyz~wtg5LLirNR=g#P|sTgWI$O~kpxv1aot5|3>QI}=Z`D-iv514_10bF=#Nqy z`59Bs@f569W!+)K8UwN#7NxRQ^r84$mW|`ny`yHR5cV)5lHY23sM)w^nHSt)w+M0O zF1DSKY6CvW0Pnd<*@Fc#TL1n>S-xm=qG~w6&#AreV77QV)40gs^pQNHNq=)vd9)Fc z(O}w*D|^<6pMv_ls~p1_b}` zkpI#|^*^=@&;EBzD(2r^U~&l)(f?Q`E-C^LVRkffz3O4^cW%FI=BIIbGhPxqhKP*! zPY#*}KG#cSW|u6)vMz}ErY%%H;Rb8$nu60zL>miYDZBQ9rS&UD6UsrQJ}i;1%i=T! 
zdYyT$g)mouA(6J=t*Om7k;?WUbskso; zJ2rOPPs&<}*E@tDkC|x*n`C4fe~3A>oZ;f6PSPWnbdOMTF!Qf-b`yxiKLDqV&!Qqv z9ia42nst(b3;#~zYM!3oiU;JEceH_38J9@-pb}Mu%N}BBMC+rxAd;E45j|M-*yf30 zb^fSy$g+SRcpIT>?QWt*XnhEutqIgL$Jp?@C3=!eNqx%PbfBxQ(a{!IC#t&e{e%3B z{qJHrfNJG`KgH@L$~gVg5PSGnZZ$D^{tbC@MzHg{ZM)K$kNVB{-U4e>c|}>!M7r(h ziAczJ2LWw+mNVNE@80k!9-rT-fO_xgD5g_$x#SrWyFQ*bhau!^Kfxt;3aRNg(-AHX zeY{L{Jg9$tC9Xr2Hj0BbnwAJy7)(Uq>o|kiydx3hOc2k>aPxlJp23w&t0KTs(OK?o@X$X;86Bm3DGU++(*eO|XaVFF@_cd$&Px)wQ5L0{rpC{98Z=#+-U zopPf~V&|k{aqv215hKFCX~hF?F;FBn54PKv-IwTrIcYm%3YVI#3JYI9ppzocsVMV> zr@p#Oz8Mu^i4uPQmGZkc#Lt>+~p;qyHUPIZvda zzK~e>cA6bKV992{7lKC2swz7t1+q4k^J&KLzg=rpqe7&5zzHYamRiZMYMzkapBl?g8Xy; zr3~$r%@Q^H+ohur+w?Y~{iRiFtojE#$Akm+@37%D#S--HZd;Dke?zxEx~RwIAGxc-3yCp$U1auZ%(q` zsTN0Kc>GFe6loqu@;eoq|!?E}aV#W^Gg3j`Aa@tKp!;pHMbR+xq8e#b_sVQR8*EgNMFvdsCKrEc>{*U@@VO%EORa1*v3Nc;c*JKW!S31`1(bC*fKpmxc zF8OEKDSqM+_M^WAj4W#a)pLdj#iL+GN#N^a4X7S{1C_d#uw{&%LU^;2=*&nUqZy>; zT6%J}IT3NrOPc8(wmhqMPm;5!WCK~J!hx6?YCrsOE(#R^=1ZYnn}ISPKhu!9Fpo)fPo)fIzC38 z_{qW7uX0Ht?&J$iS>A|nT-)er@@hq5i!?R=8qtZCoz0)_@n7j}nOT4H7^@`!{h-7M z|1eJZcW4k%iMJBJE)piZ8)yXB*yL`67!`d`6eT{w2(=zzHI{v?ZG7Z8$@0KDt-rbx zS`(P3kqsZf6^A5<-9xJ&4xfJH91c^<#K4$Hw!x!ebL&pl|K7qW0b?q<8qDoop>O=` zWxpPO$+2ytaA*prPxsz!;C)3!MBU&-mOaVv8TnRm6l1y!=KmPCe~8 zAay5V(&ULk40OI8R)YA&}aT#0o7VFhOC;dUaZkt~yVI{oJ90 zb01JmLmi?XDyohyK#0hs4}ZKYM|E%*qyyANe87|pAVsZFwOZMkcOX~Anp7%a<9x2m46WBTJ_~t?M5Fe6amCyzSMKj)F7YKU{CL1RNv3)mr zR7Rj%n>_RM^T@y7a*NH~4IW0fi4nJIR7@#qN^mOZYLo}KFi$xK(yt0W>-}+=5(o(ETyW; z2KZFNjtwqQt$j{6tc-zOQ#?(ccoOdcc(*gTO(_)#A zPaN7`>05M3YSe=l5E8Fpk0^@iTc{GfBdY>uFlilmRh2zCbm#BFHUzw8?k)0BaFIjd4D}v}PfB$k%<@#R6EM~M`7v)ppV-&Jk z)6ZzO%kTEG^;nW@R?~N{!U@U?QNFQsDDS3P_PFo{4rM~Dx6|Hz%{yF z_qhvv5L!T_3x$4Obi%fugp8t}BQPY%XN{pYuqC~`-e5*6CJU`X(nPslVt(D(oR2r{ zPfEEIX~w6*9EB19gx8UK3yrR^ss|_k+Hvv~I>b6_K`kj6I*UqngE3cSe;xTR& zQj@&4FBM}rz==1!8jOD@~%5=fz z+RU(stz2;Etu}5^ToHtmQG(vYSP zg(eKmy$6yOL%w%b|KY&{`Qv{>JN+Mffam->At3AT_!7OOiLZD}2x1Q^kapUIW) 
zWG~}_RRr6S8MyAhtrAEcVD#)UK_N1iN*oo~%AVgwB5w)Bj%B{#Gn3CHnHH_;3iW5z zMo)_z+V}vh3}_E_@~E{)55GR}ekKAOA5id%;N?b7kk%pa&oYb%QvKBY2C!bkYQ9^? zj49qc)`T=T90nHTB}46uYbMua;f){(+odq5JSvqw zBzmbM2oKBSTAEvTfr0A#TkvV6e>TK}Si9<9Y)nFK1P>2}9VBC$I}7cL7`=R2G~Z;&iw&8pwXKta`L z;SOhc*Ehp@vRxjE>bNoZyJz*bR2<3bdm^6vZD()XOz@6w8EnAC*ZJ%EEd%kqnOFA( zT!_b=^NmL9e^#;OCALZ{vE&6B_$G(zAM`pUGey-;BA*#86S2hWZE+frEo!jiPDd2h zg;nZ!YB*N~Ja>z?yP@G0gvj-E4w4T=awXybRzBH`za2CJOlh8svf88r7cK9d<|ajD zzIQ@eBNu;;K{qgqALxy|yg3k16@Cn8WjBL+wNSGU0H_Dvq~Cl68_L}$TA4WX|I<+a zD=z?$QvUDN2eo*qf2gdY|EzTuu*il$UV^XL954XgY_vC3`vjVP9PMGNJQ1LmALvBx zIS<|9zd(_7yJY-4dZs>#@$r^M+PPo#NLTaO&oEXFoBdHxeFw|vR!Q-xs z&G6?e0+|V1Nw|^D)U#bw>yC+dutVdxwSO4ckmaoqnsLnYW?B{cVi;xqb7PTiN5?n= z4k8qnlb$@iZfVDq(n66^a>MMF!RoKC&z8AAm%2d@f0+vZ>7M-+f`^6mcilk(Ff;!D z-ZMO#OHb~IdLZGfq#`{$P3|U0K(Do`Nxgi&LbsvKG)Oj;;iF2ZZZ&<+P=~E9-q&Ni zrqV~jy0s*jO^4L*i@?gFZo7s$;&yUc9mIoTl5dUUvH|InyC9N6iT*qb^k`zKfTDEU zR++(#R_RWA@U~PnUr*%fCdf*1$39(C+%$<$-I~^MiM{5{|nifUHv7QrE zFnY<%Ol`#M?wQM9QUz{J(*DqP<@`GhG4t<8Exq_Ds~|>{kqgfVE<5DN%E%5lQdPY^ z5rMLw;1 zf|7O@DLqe%mY8v^o0yR*g(2x?ID(f#TSFO$NHfOgSK*NM;(75_K?54A37gg0?>j-` z{o&uaVDPCHWJ(P>U`a0PO~^(We!7uQ84Mnld?2+C_2%%wY(0u8JYEc=d!J>fPl z8x$~Tr+xPS;vQlqB^xh=p=^^i^ER#?1Vi zSWhpm&uV}XMfBo1g0Cc$m@?>|6oF!BMbMinCAiLZcAwr}&OmmY2fRAWz8~T-R_N z8mHbqsMr0Nt8*|Nxz4FBEkT!|Xv#s-i)@98M=7H!Ky*B}rQDq8iP@P)%K@?+2g+pvaH@=S z5bM^B%_n`|rUt27?5&;6q?rCnMaCp_OuAkAEdEpaEG~AXU#{Et0Tfw3!=^~jgT;v` z>i+kiqvX!}wQ1^7q>cJeO3=HEp^dI@8;Knj4^KV1~; z9Y8|Gh)`VM!aG~;HC-CM4WEv#0QID-tjZA;ytbe6b3%OQ5lYGS2kJ~x{KHO#T$dQR zPhu;YDMUvOt;4UOb#=y{jmIW-qV=ZPc+bG?!M}PG}dX5SXCkf&eXnQA(6v;{K0$rUMC`vp%+FdP<^ZB7h#te&69wa}F? 
z)J5%XyIN2o6n;-#3cw^830U^fP}mFFCMT;T48Y@>Aozig77`nV$CQ{!tP!@CFS^P( zt<>GaL?8*N2~*sD=o{vCZU#g0L$#9fLMr$*RJ274mISdQzTG{a(#Zxi$J;47_loZg z!~nU5-W2Ih;PF&5ik%R zTBSL-Atf%0CqAww1g_Ncwlc$Ui&=E8v=D5Z+NhtEq0`rd;POo0cTGkQm^d6VPSpkX zs`Tv&ygU2Ws1BJybg75bHVQWH;|}esI0QyX6*w95!mikz;dl9>WM*#;HSrBx@_GL+ z=FTxX@2*?-vDMgUY};&XHnwfsXxN~!ZQE?D2952cG0)xS-RF$mXLp>DkNaE3$ftX) z^`C3aIj{M<24h3GX(0kXBlJkV#7GHV#+s*1H2y7)v zBb_9}QICSNXo_Sin9)|>XeG%2^<_Xsr|t`66hCeIRSUk4?4H&%*EJiz;?j;{zm{tcUhL!x!`WS{F>rf6@>D zxHPce=yg6(?QUef?t31T=-QN3-*35^sS0eTi>O(>svrCX1{f@@abM4RlVb`~Q}3~A zjs_Q_Gw!i7UlGrP%4Q*#BCEGSnI~E$7jO&>J+N0%XD33S1*I-93vr7_QtAY>^?LLz zWbM&J6TsyTM}z7-Dh*Nd5EPJ?^tOJKXG#uzQMe1*xTm(TkL9&E6*({$F8f9kK@_ZR z%^Ov1yKQ9v7}$qBmGR%Ai^y%dtsSebo_~;Wy*9z>5uJ7w9p%|LWneAMp}3h=bfuvC z;L!19*ID}b{wY;w+mmOAUY=96A3HZmh z=s_``Sb#4eKTKf5zXhupm8K_9*iyfekgrcsQ?l1Q1&cg0r3QAf+G5s7(9uwF%SmB* z#E+{K2|-cLc1SMRwi&h%`@;zpJ-&dZ#aMY}f=CHbUB--XHjn<@;7Gf_gAawB+%c*?MOqAuH85BQi3DB_?$-$zeKZ+CL0%3@XDTJMYDy%cdUbBxa z1!6Ljh9XOk89KM}wWbt_7^%Why6pPK2US+<;&?>n)u&#)d_d>M?&za$%%v&qV<<^F zP;_04EZD+a(?_w{W|^E{ORi+*++U^VeyKp^VGs1=*i6FeV#0sFE86UFX!fwDU7i^LAw5;mH@7`>;c#NaJOXB_ zScc2FH6a<^ul4{^Dd4d5w>5iTcKWANly;n8d_UoDr={O!nIZxcvr5_>;sArw>}eyD zI~oFA0U;3qGkAn!1Uk6F3*myaz8>)lT2*=;a#cy(UeKb8)Cd*L;N>Gl{FGkzwzg_f z@?+7O3~xi2{nIlctO^8CJ9bj-uP^;&Dz@G+7qocJGRzFgAF$r+V6H-o?We*#Hrn4% zC+@;XJ!iXP1Zc}xTy>yzM4`9%T9v?>wBE{MUG=~#F%=MShx(M(+_5St;yBo3F8!^h z43kxI+WhPf9vvPLZLYNs(X{86RURuufL zA87jp!?%GI2%mD7KYK{@^kx~HQ229=g%MdO;(D#+m$Ho|W|3UX4k8i!=AN5Z+ysJI z0x6II4LTMOmKPFVlig;5NH-J%7|b|hE^l)xZ4|Eqxpy3lI4HL+Nz8cZMEKpDHQ4vS zd>lkt4QkdfN|#Yd)gee3`gwvJMd$mW2=IAz`<`C`pfR!znCzNew)XGU(Eh`&g2IDO`@MPwPJkSmLY`^k9caoLoxcZ@+P$#b@GDAcz)V zhHer~=j04xX<7Hu_=E<2O(cyvzuj17?1;wenpU02x`rP=@Ra}CvcKz@&;0K$rc?}o z!~2hH!z}y@@DtB}k^X{zW=OD&C@HDv8S0xDIOG^&&N>xJKgJcFIk~xbu+9s9N{JPj ziCdP)*Q4V7%*mXXDT9ewX9Rcb6#oH8YIx0l(uHlBgvFCpB5=Um8EMD7*TQPI`&^%m zdWjsbR1(EH?hpopL`_!IB~ns6NbD|pGTO3P<*i7YuPHmUyRj2P2RX}y+-n0}OANj( zIyiaS?n>&#U(>F!brXAMUU86hCYN_+=OK8d5saYOvFHNhZ0YL;=-tKaJ#6168Mf!a 
z6OEO!73sRqzq0#RIFQrATGog1z_-H!srkzDPQtIQ5F&`U&!<~g@P49v-N{FQh6A01 z_YPIosp=G#GOcq-lqFNIHf*7SI&91omieh7K8;j#mao9hNY#%SLMPbxH4Fj0LFkqW zgt_Xpo#=?HI`YZacFU^k6p>^(=`ZdXfWFy#O1J>y=RcDesoyrcK);#o(r8oxndC}J zw!cA)UYX%b{dDF?jKN{Cu{f{AFcW+VmHi9ga`K1tPMcn7ZL@I+TmJbifn-YYRN^(GBGlonpyPK+b*K)TO0*U4_Z*k!&LOcC$-OO z9OKN}PemSd-h^nBio5IMk69OeZ0|)}9@{~+xrf+R<)$kwe$6emT5|JK9rXVggz~iW z1BFRWKEj|WMMgTbEFXbNwa?6@-UBkq;EA#v2fSCStOXpRTu_^y27tq>BAbid;%><~0?$#F=<~0Xk1~JJ2TD5!$ zxm3lW0^~<>a;}39d08lxk-c_~4~`yF$23gN9$C^5^}*JDxkw;WAkFX_(O)e-#lg4N z!m1}LG7^$d;=+elE?#~Jm%b2!T+8-jhLatQ#Mz&2i{aMHq$+fSS)yfONnn0Xic)EN zT|(j+S~V(aga2`Vb?|djB1Q)-oCq0~G*!QG&o~OZHeMTdB>Y#o*-ChBqmZ>=+*!a! ziX#%j4Q+A$3!aaxo>Ye<0z_fvwUF)x-Pn1=nMN~t3BtObqw0n^`vd{ZLy_)81!2IuJ6?sgDEe zBYH92c^0pT-M!}OBZR7g9xTBunHz)j@0E+$J4NTTIB*OpHzWItIr*%d|y0u6~DVW1O`4g-YzRcJ2hLG@5Z{ zj`Z%JV_mR|?KO9%JYz@*vk2%el-|P(Sf~OGVa77D1yt{K2>!OiHWsYDvqb? z+x)0CacO_stan{h0H?w~B4I2yAfW?fC|Lm*4B6}ASMFp`iqdE6A(DRD`NUR7aq)mu zS>eZNh_FHcqISs-HO@|7mGs^^*&qCzZKh_2rBfHt?%{Vr>u$G(E*tBwx zn~^QJ{O_0wB5jIHzhOSd9@nqh9*K}ZKIqIwsWhhMC4EHI4uO;Xj!(j0)@I7AOjbcT zoR-cUM?a${JV+s76s&ZCpJ>kPv^%_Y=ad;__SUxjvIqa{_ySbb zExhr!m3UX({$Di%m3WEY!Xu#bb|C^gG$Fz%Po|`+o`*N*`XJJ#K?`fl@0`LRl6LUO zv4c~P&-%G(n?l$Z62=rUAjLhf)GPstn@Q>d+cSR943VUrih^KA0?L~ogGz^_)IrNO z63*sCa3rU%|HaidmxlO8oHEt&P!X|LxbrygLs&^`z1(=Q^oZAd%8eLE$<#k_S;`=q&v z(qQ=Q;9#X67e~0ctKr2VH;+0dr`c|!J-x@;#pz^NiE~P4qG{Wo=-vqu!57#k4l=~A zjqc3fX9RyUp+9SDFs#bl`r7po5^OsP=`RcW&XEANf7cBDBVpwN$4H?9&W)R3E}5g* z0o}PZKQY>}+A-z(31xn+-71~%=}MgoJQmTqQX8L-=#wKc0<66zn6y24P%DonomR6i z1Nn5{I#d#5;!i!4-u?vV?CU!Bs-04su^FS$!E9I@=>+1bbUHNZ{KyYUpIkibkc_|~ z%KVN{!9->B>iFQ)awOcUbs&mMNx4igdQ+q*n7_hM;R5F%9So|>br(k(T&@G=rHKwV zxGZWNLU*`S0>71ah!zyumrZ1v`oE|{}5BquwJ@*vD69tQP3FUFN^t3Ko0YtFPAu(-#Jh4@0>>+ z913dxu%uGM=&Z|n>Y7C3VPtYUCn(I10jYH?CASo@Q%?5M z+&+UFYqT^rw|Y4h@TaAWDybc_u5tT4@K01WER)SrDn-qn^Grevf&>a%j8tL>Ii@OD zC%JJFIZIsNf+JBlkK`5 zyG2_VQ{yHm#94<@5v7xesU(U+0VMmSV5oDETEa-|LOAWAT|d*W82)eAl~IQl--qos 
zI0Tqg<6)EtClSvqCdmT&fcEEqq$7OD>qqOf2qH&r#);NY*JMG4<_S*V2+Bvlfk1Pf zIFgOPe*U`nRSC@j+-n5y!U`WnAio723u5Esi5=ItF+vh!aOM9Jk)!rv+Nb zy>Ccnf(#$3J}`_Qkfi1#s5p-;#Ha?;_hR9f?U`hZ=dS;teW|D>1077nwb2|1aSSip ztwz=;aISdJD;d~v&`Pge7$T;0m@gkHt{Bb_qcQCLs<3~mo_e?V3aoBr{nyofS2^ha zi+Tpz5fl&-??(+6C7?Ql^yAI{8c`Efw{gzhIIDcryMQ&`am({@VWA@atJt~89-cav zBq!{?MdJFdxE)8E~Ek=ElCHCYAp=K}}y{Dnq$ zt|vmFni`=fZgRa@TD%|fo5PRbCOG1^BY4UgxPqab*u-I(4Y&sO^d1mUr(+l8%)1G2An7H9<|>ko+~7VCM)gv zxK{EcIzBc))L5=m>@mvuFhhJgyxTWDC?rSYQ7tz;)jE3y5};4@{-0-?>3w$D%uIhM z1^)zX_cMc$1oadD=4_k)cg}W=j8X~gcdV7WBagwdw!#F1g0*j%8i>?*!8_PrA1bk? zs8@qX1jSqQj&+ujWCB0pM?6_;=pXHHB=8i?c@E7Q&*ShWdUf`qTgww-v9se%4rN>?EfuR3ggx+O9= zV>kOFnrW>gefhfQ`_Z(X%ObX?C8WhSi)Bg@9+OHFW97Xk;Hvrbztp0b*xyxW`Bytf zB~I{nhyf)016qXfz&Izro~ueDuU}UXTr^2>ehFQ4>yR@gNg>nGfpQ_Kwv!eX1)??-RJ~xl)}fFbW^g)b)Kh|c z3AZGcN3drlS-T2r?YoYme>vmq?|PcCu>LVr_?rZ?D)0{p#?N2T+*dAuFf{W`SNQ8} zU}l2(=|`2{L{wFWrUL0k*28e;`QT*U!zg{mVJJw zgZ;pPqibql_)^^7Oew)i77?D%VkVq&Ch#aKKQ3cHV=!BtV+m_bkm`wK;EO?)as-IS zL5l{znj$!{Wh22oh;*w;Z!~mFL87R56nq=l`6x!HSufP*WaNbw_$iij90^mf)7gO! 
zMI7>wUF?Z0nOP3(XI^C-=_~y`VZS;-WE4S`nv#v=S$Q4OAr&R^obuBhajk+vnKn%M zC-+nhA2?GfIJ3_S+xspXCWuou`eOHldx2iS2&Oj$#q8}+>C!Q4DNQ<@%_Hn>YH~{W zCyy#9c3&J{K*{Ku{_>z?dY^MH3+tZ>8>N-#Z2`C^S2PR4ZgFhhLVN=ByEtmSHeH94 zF){%?W{=2R=O`lK_MPd>S^%W;dAbj_khAcan+CW;D-$D%Mki}% zJus2S3tZf0HzGnX!jr=TQ(Yy>4nLe!&rlfIZ)&6QJCASVNkqyCqoZ2Cli!YQ<)h#3 z$@;-iR{C z{(wwB?CNvyOE1Z;r8pI0YavOI(k_rYEv*(j8T2ZOGZBu3qlC0@rU*wDS^o-B8ij8C z{s$rkrZE(WzR-1S4z4m9V`z=)r5l+eOJ?cWlS~gv(WV6tA?iYsfL1+pwM<9rRvQ_4 zLfs)m65CGCzk~*kcg;L7ar_w?;{W^8E~@6z{ZZLDZq4@zBAw$bcS|Z|vuZP)**IP( zR3eW^tYemd+&m*DjoLdQQ7{xPhlvoykgC)#^DqenZ42e=Z6G}OB;}7Okfj~SqUuAgqB|dO#Mz|tv#*rtyfucvC0C<4P@0Z5lQ7AAZ*>a`SuSd+*~pX> zi6h&ZtGO-&1hAtOK-9Y1DK=q8;f9*OD23diH8WiBOgJwydn*pTd@^I$)K*pU^>V*}I@LvM{AH2Z-W7kD+5cE%8AVP9 zKpXS;4(+*nB5bSVjwHdp=G7NR0EgSg>IaA*Qhp4hk$6bDH68Wm(jCn&tQTravHR0Y ze&&i}f;g1MVx1$4ah4l3Qk8i^z&TSW?K6~kQWND6=M}DaUx@F8iHU3wC^MAmii|K-_F=&$khH(*F^)V3!J&E2 zx=d|DXrUwf9fe?IUuuvBZ1zcs{QMZ^(7waAeERVQ9+vb>$$(VLaICHDY&qMz|6rU^ z*{VeEiatNAK-ylmY%AvQPIV*cCq`Y+smM^$iU^cVZZ_oREs>_L`3D+A;0#E&(v_IP z<~1TIqMYV%o%SS?;_-e)Z?z4t71%7E)InFq2oZ7`80Q)ShDqrmQK2!U*#(%NpQ}V2 z%J_%qIpY&U|FV7T?<&YKa{SQ)Qi+wa5hg+cyGzfd_v`o4+u)h3{YQ~F#4~UUqKutA zA6&mMrHA6&YS5}rg|9dVIzMnJwNvtKQ^1xzs@oYM6w@XAGWx-zE70YEQv25^OX02E z7Rv!bd_e8V(eT;$Ag=ceS{wz$d+hwrLVV@0^HsW!lI) z;PVt%L&3CsHeqzznxL6Rs-hz+^VQ1jM7Lwg$dh>f+wh{EE>lVVF#=sv;fcTf{CDLi zEKL8pE;Ma+0JAgzCg+0h;w$!5 zNwVoMULOEatl;ou<&Kx4;g|UcE2}BjjYE^=*SbLO1~RiYowK}XI~90mfwXN?OP1N% zs6!ydSJ{vA%WKPe0Cc`9Hi9kzRn##YHJBkCLe5cg1I>-LKv6(#fC*|$Th4RXfpJP^ z!15P5QZ+X>qUs~Z>F4ta%P~v)JRRoC7972s^PbMiF11_yWW&)V_%H(;nz`jZtl&Yk13{%cWHMgoQ80?K4PQ-zzu(F3 zZ+W@@h}hXve)N~!=3sbFYloTPk9-JF#`}$W6#Xq(7U)IU_@!K?5_$d7^a7uhUmKqF z{SMcrLV*~*REb-ZfCb{kP9b#c;$E#ELS8!kT zm|MR>T;>Y>i=hn@+q<+20JigguOZ7QNjPi>KmkCFUfE@71^zyAg^G?B&u`czc;t?? 
zRs!OF&J&bMLdNy&ahZKHU0E!V3+v$2XyOn`4^4bEcB`zC_Cib{}O z1u)ti(oWF%^yt};7F7`1KF!5a&~DsZKmrB_8^13^fSBOq6r(Hws?LqHiYn;FAM0^O zP3uEcJf_~irh=F~`NcAr{{kZa%tE6^T@UE#rbzy6+%?;^Yia?Vb%q7<0JQ-~`!-R; ziYpWvVS+-@>UR4C$o7igPAN3+BsdxxyE$WfI(;jafra8{!aC2uA~4a}4G@@l+|-em zrh!i_+qdcZkA3Pb@N%t&!!?!u_y`&M8joUZO*Xxx(aRS@%VH$z6_{=`*ntV+T)eQp zdJeop+rAeX%e@8d3<<_`k;Dusa=5)y4lyjJ*O7xR2Oi{u69`8E^?t1B*5%=zhMj2q z^!e~ZACYjrIdUgcgLBQ z_8)>3*trKdKZBpRz4s(o2x7vU?(7_9qa(GrR_m~%Y+8L#3~B*Wf%xNQjlqmQ)8XCV zuJq!@)SJ5P$@1~6OZ2|{fO<(V77p#U_KV;ocp{4)=bj8JLj3&!fRM7B!+MYam@FA7l?5*qP|V4imRiw18r(#CDvR(_iv5nT5-v9i@(v! zKQKM~P9;q(5j~QGs>h0U^Ovc!i(>B3fB{#dc{l@PC%+(QdD-50(Aer?6)#z!03do6!OPH4TSb7=9Py{cGml=67+Zul0p z%~jg));uKr`kM+=Wgn0!4m!FeV4%l2wX8q4WT*4P`b^cP#g}WrCa-E&hs~2U1|9v= zt^TxwH`xz{mGjeXw}{#oKU*#j1R`q|FlQiaWo&i^RA50pAt%xwfrcAU-s?JGq_Tx;*MB?DY;+lW*NB3aXQkP zXL5S3S2?_ZSg&C5cF%)w_JlH@gSqwlGhE7U9z(c1auBkqj43y@JR0Ptl)SDQ}Ocuk`3#eoWN} zxh|+n_zcW-tQf;M@udzh7i#FjF^j_1#)0Qg7IXPUAb>o?ksWn_B3%VF;?WPwnFj>k z%WX+Bt;(m2(*{MibPsljMr!5?*4kLNMMb@<=kcN}bzpE`EaMl!Kt{O@1Xq1|SlUDi z%LPSeN09<$Rs|^2f1S{*;_K+)5h zWpRdcxx*SUF<)cg43Hby%Eph+kc*$ZC}v-0#KBx8HsP1cMOw1U=c?IEC?1)^olmV< zME@2wmWXnW8^ih;mhgjiwwT|95q7_=4CDNe>G(;)gdrdN?yEVkJnkLhEZdWcqM4rR zQmDQjlg&twmBNz-A>(Z` zL9C#oTz#3Dfub@0ap-m!g%BQuA5-5kK+zw2Z3GxC9HVHV5$H7gnWH5~QHD}ilfj0Z zIb@O5Jb&>*k>1=GpJgKmM~GmXaT!b-f1x7g9 zqmPvjbXOd>GabjYDFxD(>vq@F8xWMd_Y`m7uHEI_%LNO!@Mnfy@HF)h6Kr(FX3K(B zT+l(g96corW831mp0Y`-1}x#((w49HZvORaQ!b+SzIK#Npq+M7Z6IfMONNkMo40Zz zuK{>m#NB(~oU)%7hlWi<&t$QLIb-r}>LzzBhGOwB2DvMyfoo4ce(7`bbm7h8DL3u0 zZtj~I4knVNbfajS-m$mI9gAf^rLUSNQFCI9Qquxfn&qTNoDrK)kWV)MkS7&9VgtSH zNKCg8Nw)HKpp=8)WA^dF1e2aW6g*Clwg%P|0bgvoCf%yYh^1B2U5bz?(_aBNP|E*{ zQBsLh=XaX`!+S76`55NQuiXShO3^m*lsXXF8sJUoT2x)O}~(t@j;@tOQ@sqJaOQm%8n-^RObtW(`uNxRd4 z6t2u5E1?!PUl97BUW7Iyn27%{KTWK3G?UN4>+cx?$)+0SSEO@MuYVILuz9jm$V~7Yrqx3-#sDRqxYb)FrtKaQ|94Yj@kMp@=($U#}9@F z(Ld3cg-fK-$6#(s=hx{YhqN900k{3Yr`>N3chqw|4UmEBr6vie$?$_Ug5=Z8QB(Ut zZ~RKylKP7T$F|gnVM)Z!<1owdWPQK7 zC`(Bk0mVzS1Bc&XUppQ&usTwMWm 
zWWrI~>2NNwwaw4ftg`U7!wyF(&tHlldU`#-v3$`XirI!)mpjb~yu$dLju&vwn%})h zMKPpd$+DvfWrr+%+Vdj=$698iFcCs&RHVkQR zY&3tj8{t?3H%srfdNX^H0(1*&G4li_kgP&vDCWpu!PognHlUTy?c^Kkco{Oj>BS+u z!*o+qT|rw84dwoejumg$3_d8CQX6oEsSx?6Ii16*T3TxDUYk)4>G?k1~_$obt%zR4R zxio1ihyjrqxhJZHe z&+v;gwbVnN)-bc5m&m?Z4r@mJBSBe^H|*`I{F|Oc?8E$D@dL3iK zkmx3aX-1+xXKH;)=A!Qi(8FYE(+#N_{8=4er~@-Jd3y;Bm z<_B5m1k*Qwl4ld}=`+WYz8cPI18gh#QV`f60murHl8r>jp;TPncn#RyTBmTE@Bu|R zL?Yap#rdG7h#orxhDcC_%ycVhCCY42j|`h<8mA{&VPeXtx?L_D^a4yjsR@JEEh>tb zAx3OLebQTDLov6m0oq2~5YE}DIWqaDZ?VXH=G2nyT0fmjSmy$+I&K3Co@h?z{Rvo~ z5ImSXd-+@)f4Xl;HhvHC^v?7}ugA+8Lae9G?)^*Hrj&5^U{7`yIWeumNtu8ZI{4ZKUxJK!eU<($#yVD#u= z12~V11$a7K`ePK)x`DH-VP1YS#C!@95tTWM7KBsqW+!Rk`?Ykk6lkZzbg3u=#9q*B z|8={ZL|lN+LF7dLM|Mo5;N27iu~&1OqTZ*Iniq_y$sjn(CM}2int4wP@zi^?YCQ2qgTKG z+3_lS?{)bY#d1ODjOa-RqejpCK<^Kkj!H0Ayz9|Wc+&O7HvoP8?0=pp*7q@)S^o6m zRsj$kw*i3x5YXwfskT1(?O!TW!H&N)&B`p-nxJ$c5&kd}nDMmjFt4wxU&ie)Zsd1J z*`T0VUXLcu)eil(_5)JBTm$4I@N4JQ<639Tdf(5Lk}ZmYWdOTTCa56gqC4ZFP07pW zOyZt=+doX#mCRN&%d8N#(p`lcrG+n>@q?YP-A3194MS2DT?=A98pYAuwrt8{{^Dp>aE$z4_o zjX~rTh?)n?uvXIr_>8&NvbDewBhd&>BPrO9d>Fwt~9Q5BfYhsnxUMGqJUQ51Id=-{x-z;GNyGKgy#j31jwyfRJ*oEC`w@D9)q& zg*g>B{k3XI^!|(Y7IV44=MhE85z@Q(FPqGsw(e%H@%b~qrVDo0l~t~S5Tp`d4gwgv$HNIU963_(1=pmIF=5W%xg8pLSum6@aqvNVhv9Hyy<_kyu%n2uBc}>K(2C{+_@RCF&h&_t`)t} z41sG~leeQ;y<-?r{ukFo!~ zwF=gEqJI7~L6K2fi30>5fK?k54^(&Z~j7==G!x0$6(nn<$a zTr4z0D?lpYF?_hNvtOO#*VjK7%WM^fCY!5RNnCDw?=8#>EyZ&HZkM1%zkYX#!4CUU zFCg2>JZopOOAn@E8!k(6w+3`ZzW9`;%-MLA6}?bm-pO!wcf_7|bmoAU$NLg?i?nbQ zTr)J!Y3>Jdz%18fiu+Lta)yzg;Zn6vgpx1<0mst?12L5nX!+~PVoti2KEt@YQcb%A zX9a(L&W73n;wrD9B+HS)>Xk$YD*dfW&XbV{>7LNqZMgHAk(#irN$SQ2uC+yP+kkSf zASYEVE}WpY?oDy0A+@a9x)eQDiX71pZAnS&p7|e0Vn%g%GhK38QI#e;&Zh{CWFP)m&EYsGLtPaXuXxe6q_>35z%?7| zBcqigbG4=;8n*GUN}A>DmDCfCVqJM$1mAJ^<`~_5fkIGM10SD$T-CYow$0yn&-Dt* z8qst7w`2XTLNg1)pCV$wmDDHjpBzJe9;mN7eW*kkedML-P%LKGD&G`zTzWy7mM-Cc@DL9`-vggK#c|E`I%}_KG|AH~!M0=Y6U}1=YAEOnOXP(75<`~BQM-V-*k(D&r~0{3`6@OU 
zmCO-c^T#MEN6*hzOuIlzaC(uLiZDkl0S#L_XJ0c`!uBUW{TqaGK{lF(#Y<)*|$Y6$;R!L?>tDTZ@mDju6{ z>ql5cn^m?ep#?D0*00SU8Vhh z4I8TfY8QYJ8UQX#2m>%eqdTz;2rHRSwngCK>dzW--%BQj2SA#hc zp%a8`T2fXy#PB5%37Zl+5Sb+Rs+tWpf_-kwlzXJC^pJTG}0>T9o%gsjqXM< zw9IEbVi%-X1K()BtyG;{J?T9bI1#5X0F#|Bjc2=VnJhC^SW6J#GuLmNg7 zlg939I&ANq`y4=V$KUjHs~y%D)GOkdYRe=hl=gULV5ZQ#vE+{wMpkj{Mt0dp+`oHF zfB5N;+`xa)c=m@}x!MPwYhjc|=`ByhK^?LYMPgZVUxG{Bn4U3A3gw|eH3U7+(Aj3wZ- znsbTz$;yO**rf?3q!_C=`##Di_-g+QG?L=q@|WFsXLFJH&*q|{l%g=egQrXB9>jKVorF&gD@3&A4V}^- zg!^Vbo|Ln&G?z*0EQtt@jCp6A4oB%sfhcPchX2A{uLZcV($|g)(&D*Sviq|lLP>$J zsc*J0F1#Sf!P~AQZ{a?oxKc8PdDIGO96i|e`S52T$uS^ zevyj*&uI%_ie$2ME}?K8`~3EKMz-jbR8I_9VtVGcL46V%v5~Q)%)^wFqa=>XRcm1t zXcpuiRCmE8qAH0VLWh2K6BQ}EG74|W%P>!+U6eG=L8M;Gw|+Z>Vv&vq{6$=TCy`G! znopo#8mg#W$MSa-tS;#q2(bvWt$uNiv`k@i-feEx2#s>U*7c!8HBPzyid)6ogTT~H z&Z#i|da%QS{Z*l#ArG(D-`?1lP>Z)0RFv5D9@Uiz$6;vM`da7`Jv)uo$aF*{-C`}- zE^=?4loM5U5I5FQgz=tXEi`c<)$2JPSL_Evk@rV!)Y4ai+}Cz)PPE7}h@oaFM-kIO zk0n0rxg?_0+6-bWf1&K}HDauu!ge_As}V7C~#E02~(Pw9Nb9 zxA>7+1(F6XT9P^qNsQ|HSPoQgU03D;CwskXQ^Z-PrxfpY zUqyPo_^NDOi8B87O|ieLneo3k4|`vAn2F`j5Dl0>>tz6x6*+_HIj6S~{)Xyms zK#kUpI5wxn5qw`%>B&Nw!Tf4@ci5=-x6>MbVx0zXFB^&zr7s8>-9XO~2Hqk`F`etx zRba8kfk%%IY8XYa$&fn-FV=@S*EoAl zxbK0nno6+E_X^x!X0r9SkovB+>A#gm@cu@G|w~8INH26kbpfd0z+0eCFsIP5o@tD_pCw^ zi3eg$VG0C=DeHJ@DFR7Cx9M>A(!Lv%@Ww9?0QEm5}9fW;lLaizHzbhUT8coc|PR}#(Wm6%y zS3LQm%+%R1xyRAPO;_)e(97Z);38cDp3s++oD)t7N_NYjOb@XxlhpIm4d?k1nfuSM zQUyu0ng^S2Uk>0na?ZxtC^J%~b{dJ3;4w$}eEMwdWd*0xjoh`V2q~7#J5A1j9t_j^ zi==AVaALF7uXNb)nL`!pA(0O1%D8QWlS$_^?Y3EJ8Pdj_pBRo;WuJ73jW1I@ntPLg z7p7U#7IRPlt!-1bnZ(E7j)}viT$D^*okko(xdisw@^%4E+4A{GPRTmpA|_HP zs!tQnr=Yw@`&Nhb%U^vo^RIIXk42{=^?FYAzj&wVf7D*~>C(;f65h?t{VY!M6-gPUnta$poFj?F=q6SzG*7FcDwS7=cb&R<@Z2s` zv(|tBe;V_k4QGl>7915;qwSjPvPDP@Q;)lOKW^73gTyp$e%m!>&3m?CL={KsJAKrM zH$M;mCHgi0Q~$~_Uh{0%AV0uSu;Xvv_Pav+zsByh<78|C8IXbjgL5>qq9Q^;LL@<8 ziGTsdycv9;6MuE8K_-)))BjA59^mYCn0~=i8F$Rf4?g($y(qG>wLTk6fzx5 zoNY!!#?>=#dhiCWLavctrXMWeWKOG=zZ^p(!4OpC00u*CUVw5xhy6wN&d%_zh9v8s 
zl<(gLqqu_qF&KrAR5UNS5vRda&I2WJV6 zzT&8i^mwyGNygZ88OGZ<1G}t1y{fqH`G%i^oCxA`jfPS~clNhesfcm4RFdZe8E#Eb z&OQ_sAd*TX=XqO(5W1ETZp%_(i4Ly#^zbN`SLoRJ%L$bkcvAtnMp{(PJvYYWlakux zY^7L1g(L*Uh{)6)Bu|ueqb*r&t)0Q$`r z(PZV@JM25)ne_ZqII>G{Z9aUsYh7^3%0KRB>cE`nXt_St1PToY+DD{H`j#jfZAUwo zaZDCF2NiZn!}h3oPFSq1#Tcqy&V+za`7AL>WXdGqMVaodWajzHQ;ywr$(CohFTKn~iNYw$<2;Z8S|98*OYgIZxlczw_j^3TrCb&yXu74x zbERvX@az>h5n^2G^Fq%dcqA^S^i-=p+#`OZ-~J-_;5}5JSTou*C@neZPdwH*W$vgi zljrpaXJZ{conHp$fBQZI_;I&>|C#2TO#hD~>6}dO!!}s|W|1UH|533J|1+22fNq3# za0KY$9RRiY3Z(KL={;Zzgj=$(GN;ehv6Zt?r3>s^@X%rU+Pk3gNcK1w^DuE)N&{oe zS4c~G%^1_E8#w*~`Ce8awPXb0xvX%B1u>d^)+fX!!nMwH;02>s`wM)^r=LEJ*uFJM zHxRQEw@%|{>Xx2gS*U<3eMSgUwimo?r&1U1NHU5d6y}YKnX56YKS|U8GP!aQen~@V zVV|vW;v521VyfbX!VeFD*^b6h#pw2uAkPooG65 znOh7=xGb2Hd@xzHlUw#8h>Y2Eh=1z48tUYLy@`qS=8wi~8QlqVjJsc|L` zyG&|tr~VZ~Yss9;H)rRxBU9p&bfphE3M`Q*^fNdifM*(titkACY7|LAEuwl8O0PHp zmKq((GQz7l^#0!y09;q68g#({b)@(nM`3JqQk!l`wDHbU zPZ8KA;eY}+eJ%I$%ohHR{iu=CC;!RCa`M$F=9kTIfgl5!fWe$ZJr|z^QV_;wTjL0K zPXOYpc+n;nT(P-U90QF}S_V&X0%@WG1zul+E^7bk_!L$6YqCW_jvzNpz0u zHbVO%&IWqN9Y=WLrA2EW1Eds+3Q;Wm5(N(BH{?BhHW4yr0-^{mp=~{oGYyWfy!R`y zJqOUfb>B+Yqkk!>a`2>djlyIt+x5u_a!#T?Xj|olkCTHl2T3Av-)$o?SH)TD$CxOd zAcour!Mp)2c801|3H8g6Cmf%7hUQQhs*QCrfI1xl-=X)di-auk-!_CXW4?@vbZ0Iv z>QijZgW@lSDKTiFr|KH;9tfmfa)TKQ-?RM%u8Q<(Ihb|fXI$5Hb_JN;UXWh2AEYDRQmr7{h#3V#;L6hV}k3eVVqVh&P zCCgj$19%*y5XcH60^Z*```6Q`kwIgJXnqjjgQ+Uu3vu`mvA8UYTrp=>;DRlv4s>au^I&A@o41ryeWE zod?D_)<`y7I@i^wT@YesF2!X0xHt6qyP)} zC-g@l;MAr^eoMCfc!1!Pl!n!euTZAd3il)_GF{ND{48T_zPi-X$Y0dCQd-Pr;3%B8Q>jD|3&~tOd8h2D)slw+u)b%= z=X;dj9)uYJv{X`T8lUWh=j6rJ>2SE=rZKP#&ru~lW}@}$K?bi8qL~tBp;iG$5O3}u zklX;@A=q98G!Z*iC&OTs%xLii@Iazfb)wPy5P^DGXR_HETJcmd)wSP~PH~-{+}J_n z2moyYK_v1hhN+3roK8M>6~uxO_G8T0#{0ma}gY1FMx`S;x)`7A^s)QQ4emJ2E+0myxccI%H*572@0P< zsn;+xY31oi1jS{~w={=}-x2pV?S}BhZQyoG(x7dRpbsL%CU-i*D=?673&Hn7;oz+% zSIdpHo@%YZ1RVu~^dj)KWDWva6V#J46%3f6cKWg7`Ot(O{fNjh0upci%6Hv#tW5tZ zv8DegtBT*b!Y~*_jQUT{SAY1^x`wAHSCdwL7&N>0fkFl*#)w7=69+y$mko>Wn+ZAR 
zu~FV>P{yD<^@nxq*$F>l^01Q}Sz_psBaxRAY4^91A{`zbN=zWN}s+-bCdDO|&fafZDsAI7zfpyc=k zK&@Ix<7rP-zeFx}xvMZ#HoCtI$w+V5tkr_HTCu@6-&Cy0R-zaT&yDtU(0}H0q5(%A z)|@VR(7Ch#`PJbv^@)R8B}&VwsUuJMDB}8OBz`SryTPItp5L(Go_(y#8-ahH`6(;Q z-`gwUznzHxJb+W~VX^^Nf5OF{mQ zZ+VD*d}|fnsWbE!mtR~(_79M}3UD?6O2qeb&M7Eup}N~(QJ_8!9lApQW4tXA$5ObL zG#ea6glH@ZZoa@LJIJ@EwB>v@Wr)$jb<|o-s&1 z8d#qB@t%2*QUtaZA!T;!zI#d2$cCj?7RaY?`d=|PgW^quR@;9LU}eB?P#y(mM+LEY zb5$~Y!Gj6_`bi&5%qbkddw+DY%UdkOdpGD*<)4Cf7D6aRyQ#)_FuR;yR)|I7sTXX~ z;>#hZJE@PgEOr;yz^U+1E8E7pe;hFew!H47ZFceD11z!dhY8m_9Gl#u9Mt~{r60RK^N0L3&%`a(b$qA zTJBUT(e*IXE2Qw42M-gLRdx}n`A*q6npTdV3Jnx5wf8K$$US5oVpee$B1FcZj1j8lG@EX-WSa~~ zb~7&UGE;ctFL|H~e|~oCG;`CAy3Y|t*zNcY61|cChPpZ1|Ldp%+xvW80fg1R>NaJ0 z`yeK`uLm0UOZYT0G#0ePMF^n-hJ-jPhHVT<2#8VBe*A$-Hj8r|YuI%Eq;5 zA!jr3#e)!dii-y=v#Nz!4ZpBOaZ8b}VuWbJRU0CwXPP>3eQU@4m&f9WS2KNCk~~*`DBOBadD7y%qiVHY>H~pmgR%u!G8nEC;8H`-sxu z`C}rR_%pnz19us z*Vi)yt{<%`3tfr8h&id&)oRkXUprkhCP#38J6O8T$QI3(hClT=8_LS{hRUAnB{F>C z%vjI{SSw6P0@NKBKb{Ts$dX)f@Ar~>U?#O^`9v;vmk+S>g5$oG=1^5UB(wy{9Fq7vY73DF?IY8+r5hC5Fr3C}aG!xtTsc z5fQAO-NS7q7M1+jD|~8k+a6jT2n04YLnA4f9awIX5bDDwZe!7?!XJT+hVfsy5I1ECpJ(JQ4CNF1F)3@muc zAsDL&xwRLuDH$QGqUTJ+IIN+305dkd;U>dt+p5-ME}kcoYklO%LCS@Q(bFu4q71b& z!%R6Q^jiyDNveQGfTIWk=FLFn2wLkB)~7SsV}aO#ss_lL>Nrnqlz zK+dBv0pWr2M4OX{b z`b+b#@e_V_JVU5=C&&=pCif8LO*y1U$PFML=gv=Yc z8^O4Xq@R0NsuT3Y5I}bmShn_fQrkgm9FC|kvnv==1R112_7#4OmN+0glYex88)bhD zZexBVq`CZ5<40K%TldDQy^jO@uZGv3EerrH#r})}_@nuuP#zev{RDS@JyAn$&nE{{ zj5w)a(XvjVSvSSaVab#(dA5LN{h_1!BBN$h)T;2KK)w#SzNw=AbBG16jK}XjI|M#h z0x9ggWz3quQ$?-9s!w2S=#R@{Mw%W0kd3gMZidynv!kORkCfqmCm&+@m~lJ5VV5-4C~ zQkU`<0#LWuVW)q$%nrv&yCIhE*#(z>kBrp2$dJwNOGDLHYnAVO)VwD{AkHe*_LIcS zio2ydPPDwM;m=CjV@^+V5my%DY8yExuDcB;x4w5dR(#i7^UoKny;c6!JPLL*UVV?OcdRhZ zX&fP*d7v8pLwtYdW1jt0$1@NJt-~8ORqXE+@L~U(p`t3AXbiAO01OcJs1H3)czRh? zU;=7tQqM$%8;Upsy5QhHtwNtsKDQ!g>GbDWL+kNxJC}I8cw6fQ*eu40B`b6FQjv0c6G8ON}o* zc}nlS#@|BWBGlR--F+;abU0E(!TTZ8~`Z_rrS0e(sPj>jlXyyRKBE56qba_5W(2FtYab>p*Do4^)? 
zPe22&mxicL2B7+AxLU3lW>R4o&aiP3V?Kzjv~?~gDy32Vph+ndJVz3;!@4=(Mj%o< zc_{{d9>Yl-p_~a+mgy|)Iggfa?A)ZZXE2T5BCyCI)|%AQ*ovXZiqqc05|V8?b-__% zhk#!GWHeRGMcUF%9f|$K4#~=wTY}>D*i|fDz4_A@35k5#)HS%UJI&0Y5sdCYAzEpv z>!VDCAi_*+_YR@T_uAVxl8F6%AkzO@b8B|?ceTC$g%9Z_O8*&c`O~LkG!V)vzAA#M zmpbb{DFqvL`!>pp&z)?BWjcq*shpO~``eDq%!@P1QFIbE+Y)paqppxp zqrlz!gm8MSy)nf6dq~Ik5bbj-?j69SDY>0nfAx!nV1PFQCpWL7YMz{yVdMp;iEC#= z4Y7MQ>@>H89M1qmJp)`yLl~nR!e;PR=+afk!iaw3>Mf!>&8=rY4Q#+LWM%4REQK6o zM^XuB{Nb!xd^o+d1hBw1c}qqzMm(1>k?xlceN_=1WS%Z0^Cbv%9#}9>!nfS!iK#*Eogi%4*VHv5%=An zdy^@dyY;kLLflfaq$zCJHMxDJ;WseHSi~E)AMEV!1LnB?CY7iP{~7-NBRD|Bid74L zb~L4cu=c92(I=moQxQlv&N8c2sa8ibTqT;S5fM*FZzzwDtNS8@OS{EtmTD$%DK3i!$KS#2KM}JSTA1VkJz<%HPg``a)dY z**kmqa@obXcmw0l%Z)S8x20+;E%Z~#RB#yOHr;)!EYf;>Y*^BS76S5gb#R~rC_Q}P zK5&%dwWp3aH+3*JmwrYzGYT@=KHzMV7l%yfU7;KK+fnIRzL z>ONCoC_;SNPfcA~fcMM%*?R7o--fE#i>Vj5A3+j~+w}(S7i8yId^2-GN7nabFcDOzDy@nAwaX z?bO1dLkW+1J;k!#eWw?X_59JYA_z8J5eCxP$;`nLPTRcoGVU$)mEI8M{#Ux9I_Dw> zgJrTwIAQy+MSmAb8(}nNicUsRx58e2)VW8Z2Bhj5?oCi~xr(cwPSKogBR@W_Vd39^ zSBHvkaxc30yi^FD*8K+kI%#|LRzTk+IK}+0+yE$W{+B0s0fYZ8;?h61nTM->xE;!& z*O8QCH8ch!Ai|iBDxzsU`1-3Npt=VG(?i6t8ks*48&|uQoxBQunQt0A=-XoPJ0PB? z@9lIzBAgmzzLwLJO#DJ*FJ>^>^pP@8d~ysrw<^Cw zAHINqWz`Q`xb#iuO(B@<#)(mvlgeHcp)>^030`eqfkWBUnWj4iiQQXMxOfm(0%SJQ zPOzdnjg0RNap{a0+iI5fgT1a^y(&h@VeJ+`MCTJ(I2KbEY#K&AxhV++XtCf^20`ZZ zZH%l*7%8RF5^Uyx7pbAWls?xJ`4!dcQy6(3E11OjQ!3klUXlDTA=eFV>t7|88k?b~ zKv(2|E^etMT@smP$#M#mBcU{=aU{1ZAAOm>&^-5|+D*=p39@J{pnAQ(2!^<*9SStv zKr+D$WF3fQvDne71oAMj9R{6iFGYhV&%zb32Nos8qLtPcP(AMFw;v**KH|1x zrODMtB4kE=zy=c6-|0i*35yh%Q4e_Pg}lYu)I#heNb{Xru&V2fCq7YP-cEO^I&=kbHbIFc04@>iGkE|{`<=sDM3!9`dj5nBdTlQGC)%F8n_ZSlo* ziQPy@^PuYdnF2%|zm*Kz-D*2VFxsj)11|jZtIJ*pxXqzDr)-K5&87iMu};I+$~Ak^ zl0}s0|&he zMx4w67J;tUK2zbdqp`FEoM7Fz7LFK!f~nM@7~|ESp!6u6%SONZs6C(&CNOP4s1Zux0K4O{;ATjAvb>^vJQRNyN>nGt6*y_XT_(N8eS%2fY;t4N?!oH=Ti2h)t z(>=F=G`VV2U^!zk5spAh$J5tDB0UusSM!AErlstjnmt5e&KJzAA^z!kg9q`|#-0$! 
zNb;?%{I1s~3&-Eli64&v&;>=$YDD1S41QO8)kEFJpS-pwQ+BuUOL0&q9%QjeN5vKk zqsw4l_ZP|?DmgxS8-eqO{%SW!1BJ!eNPGTt9#Z{^h2hhC1w7rkN!?87vt@~QB~rgQ z!Kl`&R)SL6NA1S((FOO^h3OK#PL;Xbh^pQ>G1Tyv$cl8T;bPjCrw#u-z-i@NIFW;n zMx1{CVQyb7lqk)$Gz<(wwjH8Sa~jom!SnWH{?7OrLC@nbg-RccCKe9moK>If#JmBu zoc(8Pb?e55M6_c(%K_j3kCr&SDEG;GLL|lAkdneA3(Ue|-G^hvAHE?j-;c{BH!nGK z+Yu#Ll8Ypykj%k&SpX?p+Ea^!(_^ z#Pi%+!QI}VPp^oSsnB6*r)X&5dIHy?ts1anyB**3YCvJKaIyFl0o9yDSgzei*?U3s@sAdI zbR#IabPnJ|Sk`rOoEXcG@U4dkCH2)AWDWNz$#mJa&WeXn7>jP7v#`|0LRz2IAtETX z)Y5!f_;c>TSfU{WsKG-Y+Q0U6eSl0b-$76<#sRLK-<^H*Y9dt@K`ZN?@sJD~OVu8T zZ+K$B=QEc!R575hK`rZZ@QOcQ;T;E+`EEP2)2*@MC>dz{vKSM!ev9Q7|3M%><)t@C z2zbLHH*r}XE3|=5q^fxii7?-}HN?OcuTg}j+KHB*XADMlvjn!*!wj*kj$)hr zPd)x7Nm`Ey`Sq;+ga`hIkF&{*0mC#WHqzn;nDJlYZT>Ur98B+eVX^$J^`INSW)%#W zq(8etb8h>p`TWt*T^P)f)?=2?>IsM$+cs|qX(z;fLt3YK26hj3KPx>e;^*ymt34vG zmWIQb-n_=f)`?1VDE?TP1~TyS;0}!~MhZJkv3)yjJcVikvkz!MvyiTs=A2w=PNx@u zn13ix_X_UUSW{lB9(vvkUKXAhLm@+8I||9v5Fi%ob+aMANm+^AeNgqWgCK@?t?!nY zK*8(>*Q${;0k|+JvB{4rjr)DQIz5EEn53shRnfbGwT$9I^qCA-{M_TJ;G@8wYe_|0 zL3d6~-t|Clsmkv?<+kp=Uix(MA`+)*)Rc-6!Wv2?5IlF^+L#{W{&0qQK;$#~Ry5zW zhuQu$O_L<=bo|FN?4NN5AmBOAOMnp6!OO}by*J;e*>wycZ<_8HbW6sVg{|ev6aAKT z&MGyWp9z{&rX;$z>waQtRfvE? 
zTpcQv8oL2)_&&E3Dldo1`g= zfq=ygS}SAM3|zlX7_}!c72Y#R1Ze2?N$}tGdJ6jLR}{7}36MN?o_8`e-6riI>LEbw zem8N%gfJK5W5gRBwGY$x;C^O1$n9(YYR9&}@BRw8`&S&di zlSjHq!het@u(MC#f(Alp;t@WIktAWauZyQL&Bk{qa=W@-(qmdO2_$CqlC+6yCN2+i z)ustwF_Si`n>Fv9H>rBaTer|U7jrRA*iaA)pNhM-Q1B#v-2nb!%=+~kp-$jkNx$ir z9TvcUq8nNHZL{%wI%}no{H2(6l)4#2`-#na6Q>t#)p3fdkAN%s@OY;uK?rMgRw0?; zliglZlK?>&!_fijG_&g^eK^;FYDuE2%)T=;s-ChtA*5JCsWw%+zxGpV|W-s6=cC*vC#AGHOx+he%ziG zUZ#(=v9($luqNRxG3ZBd-5f*o;p)84PQt5_PJ15CNqHRT~tIa0t-e3=fpRGo4_fSt_My)>lE2 zL}2H&uyxdIIJRoKwI#9IP}$)l;W^zezlN-POam+9fdXkt?0>F;zvtbBbTbfam=?%=jP@=aGDU0+2 z3xXh+JBR#AsViO075dyaqe#@`^CE28jvufE<*PbIdEGXkOg|J#t`}hm-48qOP!huP zknwWbN_Xi~N4=LaJmH?RQ~!-dab$=sYx6hWnhgSkm-&33z0(6odO7EZ>?H^oUZg^$ z+=H@B$TEvYI#NrKKfa30<#lWJ$X>bb8{5wF%^z zvj71*dCP-uHnc?m0lS?vvy*G&6LV2j+ePHAmmod|-zmM}JH5~3P;_N&Ii%f0h&s6q zaMyb$6E6e6{E$M&&`&-2S0VMwjS5J%KZ(w7UFF&gw`J?_g!s1xMI047>@^x=2>WUjx{ECwl|sAY|^h(5G?HJUKC1jLUf>J-IN5}Lsj z_!2fMgIMLI!v^ecy!-{4YY%BUn@|+Ijv`W|MPJ-N(?=%*N5~ewj8KF&vPd<}!B`=AQI&}np{kL+fI!qF;wniW9rrW; zSlwhg?Cl{Jw@A|A`CMOTmDA?lqfYyma@SU|{Cp+k*58+V^wbX?_Zh*uRLQgI%16jly|dgXtKk=&4B zSQa*H;uu+_CwkbAX{gL%jaP@N2%*=LWChLza$oR@g^Z9ruR+Kjw%H%pljap|x6uB3 zcIWBd`x%tE6Z(xRlH+|N+U%TvAOFenH9JE8?f3d^W=8N})Z!Y$^KL@{oh+6=b5|Aj zFtG1CK*ckHI<773nAsHJp)l+*arjX76C$D3J!n4YJM_92FuY3L>=Tkl6f=S{c*x+368wxE=p8~U#BtGK z&V_y@{9eKrcl6u0IYn7M)!I~`TfE|dJR{pU`7mki6xNdC>*Xut=K?#)HhsRVknqg- zIrt|CT7uBS#K4^c=EKh3(^=fY_UOw|yG<)i@#RwIH8rbEpof zzz=wbj%{~*f^)gTJH1eN1{`u*FJ}ze=-M!>SqI6ZC)*;NZPZ@*_ySuPFQn6G6s+mE z&S&^=QL^|@jSMh!ueKD{v>`UIg3@|1$CPjpF;STkEj-GcqugV-ZGi6>;`_GX=T8 z6o*sx+H3#FLaD~jfcjN=xee6}C#Wt~Q&|)2Rix~#`xyhD9`1EnFOfrUU&XW>{rn4X zG=qDs2zj@irP+B7978tqQuO;DZ2EO$&sBZoZcX>}N^}EbL+Yjw{7!ba1(^``8V)NR zqgt!wlPSEIsVh%+_mDmN7z1ye`R{9^{!8?8Mi>Eb1%l6fE!_l>FQVG?rAT5}ezKP| z(KTI|M{ahx!zY`a{n~}{{NTtksnKQ}DZebXkYl>~V zW#h-j5KyEZ018ZXb90R)-Q90Hs8Q=xVwKnMZu;u7jvZK6L8MlY%(O7^Tr)+Vr#jt8 zMbJ#1Bvf?UTop6AW2YAY(~6WU97c&;nF@I)*YF<+hQW7Tb(y|h>c{9Ill>;@;vIL& 
zQL@fh2n*bUJbl0;41W}83G*pk#lbs|?u^zRoDoVh@z+)VqQ1DNM1Iq?g49aDq+K)-t zY}3t1-XiELLU=$Mo;ktP8%M_hj*S_iTLKI+Zl?R2hnLK<(y2h%hRtzBV_$A!q^GF0 z$FU%@43?x$(FZ&Yq>!DG&CigD33(6}8vqzn!^kEhydPRPRH*8@SEnU6d-%L9U?D8wTDA-6=ca`isoNeXr*p!vpEL;r4LrTR9Bz*`-+-iqqG3VLSFzo1(Fc;Q$uW+>1z&k#W*Y9yj9etxZ!{JYmh_Uv?S zvAXCni4iPNP>e%wo;m#Si^Npc*_g*ODfiwrogG@+yfDaBFkfVs`6MUysFqIZT*u7| z7$$AXU8{>2T6I^j@UMGvqKo)5C_byIRjuN%)CSdX)!AP-Vm3{>6jj={f)+}GST|Nb zI2pmG${Lsx(1;5YQIg(p3$<0I>V~F0H%+h{BjwJo0?bw#{Y;7Y^4|K@@2Z4Y*#G93 zCGY~q+Tp~!vjZ`idb-Q5xS&+H?(A6`-SBOSAuXw#LIIS4U17m->ku zw=JaF1oE@`*Flx^cJwK7GXJj0NCk9vgaM8_X`KaSAZGvw3+3KtOkZB^JE@gX3E$ci z?*W7X-_!W`exL0OUIjF)?%DGn#8_9iDGT06qPj3OWm)k{iF%{l6ZyyiC5n>kwkouz z0_XJ@@`fTnvb944zvslUtifx3uiN3K|A5UN8F(x$7+I{Je*LPyW9}0tD35Js_*NX> zMgL}F`g=r5Mfi^_{~wRD9(4$~rL!Y7t3PH}q(68wM~fwmgUC!=0G6{pQu;adq{XSv zbNc~>QlAAU`HK{Ne1KY0G!>5aWpm8#SLi<$Iy=+Xk=P9VH z#@c~LFJ+bAY~?kZU`63wFi4FI+h7}jOR`vFo^26vsa;Pi!PtvrlAQ_TP|N4wKvd)$ zVsCV9N`g!pAVOetAf6I(O$@`Jwa{jcq9KSLr9p8ybRB{PS$2@~so2kAkxrM0&Gk>` z$Zb;_%X_E{9>9&oE`o)cNsBFwL8wz^i{ONdQAG zW|t=Y99wGJ5>gL0Y5X48xBdyuN2tl(BoSF9u`49sd6hf>ycbiN`MQULB+_9R{1Zkl zXfaZ6Go>6UPy4_WUp1W)X*6hulZx_bkMLq2^f{?i*8soLuj^sacu+ zPLM=dyFVSAJLf;A?M=&cQo$if+iUZ#9ILQ2_P+vkb2EkgJh6-8XJ zI43Acr?P>?W-60d930UD^^t0#vxOEsVU0%^61!FrM#~5&c#=Op`an)J!q!FOLp`ab zz)sUX=8FCHba$eOuvoz=l>1$|VU*S#X1@pDYwsF`dht9H{vz-RSi#Bj%Udb+E|`Fg z?Jp{mJV1r^pU$E)3_~8x1#;dMSIT8y+bhRf#=z)Mi1?K^6VE|1H8RVxwp)qXXqa9N zxpDFM;-2#;6k{C+ww*RzGaPKnuqjOG9>xi@G2^UMnM;2;bmiTsbSY|0UK7%JPc1KHd+S-a7`WNIP&f722%MO~B*gq(*ipWZjMdlh=F-njCc5Z5%?%RKV#aN6X^} zoaDwEHUhmo?6R+C_e$Tc?&ts<1GJ!nu@4g9q|sD^4-`zxnn&S))n8xehTm)_|#zYBvNe}FKu@Bln0)+ zLFoO?pf+Rz8O9IBUI^osr(=jN9k0e2SM1w;f@*Bw!p)v+2p?oDs4MR+TbB!1w*!%< z6l4-h{N|>Ii?OcbvQ1h!=`476YbGL;yU00`_2ms6D$aMhH?#b;1;hbrt78AOPXpnx zYE9`s@FPHQKVSR7vrjn{a{QEbjt&6MKJ@0E!>_ww*8VB92&h+ftMkCdZWeUGd(f4ivAD~&sNbQ|6bowlObXdXhH71#rI$5trI6Ogp&5^@ z%xlUJE|SP)r8)J3dZD^cEg=>tQx5&bt-KX0#@sz8zr)PTDNad@z4|NFB#h({sSKRm zsq7cyFWw3bKT(I6$4y8N%Jg$m>({f6KR+3LP=H0buy^PxhYgsx<9~kkd$r6xS?B@| 
za8(q1BU9d|tNOneLE?O;F4n(%ul^@@6dj6qlP@V_>`$u4?8%LOys*)lR7^`x`*s8c zF9Qox%xGy0gP$rQC@-P!vA%?lm6YoNU6sX=<-n``BFg!KYNH9CRe2sN%Vb+>>RQaH zW7@3^Gto}1)$OoUM&Vj2V~qC;@!QB4qW30>k4|NdTtPUW0fCJ{Ur>R?oihziFJoVIFnE~^7ZW6z%1EDS;B@eP#2DK$TrR-DW zJm;-j#-ZpRI#DL;z9&0f``DK2vQ#y9)(G&gIO(ekT_}tqbY{~>+>v@2o%Vv>{!TU# zEx-8kt*s04B{|tqpK!50xz1H&K0`GU|giwne8J`8B!xIY?T3Gp>I?Zk`nBq|9CGwT<2 zOF4@BfLO0Z+f1u~wbH;H+0)dNBq-FIQw zIGuizbr11rb(E$gUeRYc%H|iJeruuh2s9Y;ZOKS39o7UuVYGNPko2wfc~@b~`mbIg zV12}h;Z~d@bvC7UML`ZM|7(3XCOR!SM@VApP#XcALZ1Hh4_f66wh{diXhI9DRX&Mi2sv1YGz zIHNZCEOQ9$8Whif;i>KNGo-4!MfffwA6N_LswMXGBKCPezznIe>w>Lm_B~QJ1W#_Z zMzs7^Z5xP3IJ)qh$_5YYeavmVTN;avUY((j$m{m9?VhJ=?c-e?a2iHX(BN7GA z5-FwE9YY9+0i%lsU7bTTx@H;}Y`-poT~zNJP|z16&hL3wRV>WpB42jDF?rG0u0P$y z20!?;=<}JSA-%PL@9Iae{*90DtH{Q#0q~J0w03{sUyQk>gL!|%D~VZyT8v2|3#fi^ zOU7QO@C!JJM!f#~DfJ_p;Z>spnZWZyqu-`&4}!F~^A0U>?XcQQ)yNC7PwGZX@j%a7 ziiR3-_F|8!Mh|yEgi(QP)F2m?7D{X+FWFcem-%_vkicGVdw7s&D}yPf3-#bheFn&(HbxMeVND>T zJne1ob4@r%KW~DruQy&sBtLwrwx7n1dIOjTh(o-~B<=6Pa^1x7KVpyKcP}uE^WtDw zB0?fke8qH+O7ow90*J+FAh7F4Hr$6qRm)*#m~*__a_UsG#*zp=9CfCb^Y_y{3A;n2 z9+_#yW40n_dC0hlcG%HA*$tavAVf~znTA{xJxP;ABv0|4GE}yW>wcHV^gx-aJnDga zq;bz3#9lmD;q&W<@=B?ZQvD$ur!YPFJQq!bPrH)Kvr72=iGbY?o$wC3sNOnmzEKr& ze6`U4#}_=I)s7~qT><>?bR);nBFGe&aOeH#RwIT4#4b~P z;;;}?I`Nux+TD=`pB^CTi5p2o?kIXmVrFlNqDiYMp$+Hn61z!+^ux)N2ajn#qb8i( z#A=qh^PbtQVduW*@6A!}zozEfj<{WJwjgRxqovabB;XdxsB@tE)k>D8;7mN zRLFw>U6}(taNKZ{2|xZX6+BXRf+vI0^NYxm&(VzM09(6=xdbD%3=;)HJ{KPR&O`5! 
zDtTH%4c+EQJ>#T;V>~S>4G()0$2`y9qS~-n=sk%sp)B-~cH|{!0m{MHC;ixn5UNx@ zvAV0cA#KW<44!X z{UW}|9-s}|-inqEvqntjZ!MhXuBW`%9g2{X{6H$<&>|UK-81w&e-2OI-jDHPW`F1$ zqwV!NHbeKj^|u8DYjj_J!&LpLUc+rS7Xl!y(RYJ3x`z7G?AbfkKCy%zHSTP0#QJ@bP+WhbOwqmY{!DP7ujzfT zYLP2ShXPPH$hWU^;UL@Y_0uI4BFgCH&4FfOW8l0YX6|kZMZKol-vtqR1s98DeUxpF z1D%4WPl_yl-xxWMfZs;oym92RzEh`w?JrULKa3E-ohg1$)raem_{#B1THQ4M)NlEh zq^hYK(31o69~4neLwo*vp4(&Mg7>_7;&lVb>v>awPc5XXKQUV5jv0UkQ^%KT4N|Oh zAh5Q!YYGJEg1=!Lv83&5>;-hzA$X3)_gU2AAxAD2@Fa*QUpC_XF(8FoX$BNBTh!qg z-GrS82}%)n@a^}Lgn;63mv9a}q<*DGZgU2*IGTC^b={TFS|0RICo{Ceap&ohT|Sgf zm8X-A*~ay<3bHgx3hrEk$V(s?L!&{mG+F_$*iZyJ)BBWP{zto9t(3#RBN6!k^t`)A zaN1s-#{XPSKu)|Tb(mTHYDCJ%3L}X88J^BEVh{rSa4&)wN9D8blw}{+yFOe9H8G-d zXad8uuZCQwGMQXX+pZ2P;u@zj-=n-1{i<&2dfXiJT>UN3aVU-G1$w1N)$pI+4{+*u&-b%%{Pq2RbfN*6 zxfsBa9@8S?|1kHKU3o1`wkYoI?(Xhx!QCx*aCdii2oT%}7BslKySoH;2@>o+yZf9o zI{W1V<9_PF4*+V_s#P^>)|{Q&eE2c7X={EN`S>wtrRxm9p$D}9q3DolHHNwAWZBQP z`RPB_pKPGCdIDDL9|KXz$Q~R#=<%ZWLYV z3wawIf^SBCNGcUil}W^%D7R7mBqqx&G>tHWw2Lfn>Co3+_$-+~u;#Qa<&TFs&=;Vf zyg=Nm<>z`E!kWwU)HlFZ;T{>k`uFSHL$ zziVSiALgbd2Y+EgjOu_dNh!`PZJML33*`pE-diX*&QhqD@u*H0OmSG^zxSq(l; z?Wo1hJhxzcfuuUka{)A>O^{X5YARr3I4;%FscARTC7ol;5Wn2=_Fb{^ppN082ii}- zPZ;OD|L#NE6&9Ai3IuOA)Ome`U^If31ACYPHH!LKu(!2^KAhDnf&sNDThu=r{Jg>DV-Vl6Cy6TR8{JqPhwiUdyTi2wB5gNu zX$NJ?z|S~@U-lgxOjUKxAHZDvhV3enCVwYH%pWQSEdS!RC;kuZieUJCr)+|sp4)%~ zh~!#NN~v{60Y}?&Hlv`(at_AY!wbGBX^hVm6+3L)+lna#-SQrZ9Ttg8=?Re<1}{(v z1`<<(8$`kv*)~^HhiTrL9<$lCR2PMTUlH}{1Zv2o&BUrsH-0wcqk{01+Xj;h>;Dco zAeKpCsOk4etShjtIl#7U*`Kc|mm6oS2*DhIOaJk7nQq^+E0{X3U&;II>=caVxxK=b ztD|GzOYi=31fiP@r)bXQdWEZwEd4IT?1sK+yFp-N9sVnAIl*ts<;}x_P#3zE0x>=1 z;fM-QPS^G({y59b0 zgju*hw50i${JLDiuoAd1;2Hxp1m%#1@vLY!AfL+{gvjgHcHP>>l8x5F_i&;3$a$PY z29q7Ns)`^}-%AHMr$dJ%GkCctLf`vMMw#9fuYn2>+CIbeV7R90Zwv0tE!hw5JKsDO zpT$=-_n_T~lTjlninx62ush>MGk3J$`GbAHb}c2k>iB<#jFtIg?=90`2&@`l zD+&-)0lR1mL=*!F0`8y)Q-HLJ%gpW|3BRuj|Y3XZGq?v2seB`m_42s6Ek z$`6bSB}GZU+kx6Gpv^JaCvW=*rOTsdwAgIFfcv7Fc^l2ZCkJ^$OQ^p 
zk2s-Y0rmskm=A70sBL6qtLMG2C!mHZwkCK@RGJk2bWqR2Gk;;(_fWzTZ5BiLnqjq}^Z9qw zB^z>S`29E87JwNDzOdENKwIDaHf>Qd6LHLq;*al@hUm{oXc&B4sIL?l=%=~gKAVg9 zts!74?(rUSoz>N2%6qk9+)}eJEJ~czo%&Nm=b|G>3M>zTkCklHK$cew;4TyiQW3GA zm_n>+19_$6TnsaZ9@Sm9FWjCX@^F2&7|=lCfy}0`E*H zgTXdvdm1#Zo2;LTB7ieej>o!3~+>JLk6Dy-IkMzpnos;0#&(EXd%A^ zql@e`5ig7sD}ml3JbD=>^>tf_#-2P6*Drv)JP(!wt{WSrR$HSFQs7woOH2qcmx^J4 zX#A;x0cw=fxE))!oyDN=SB%6Hi8!w_{Puyl*r47@1;?U+GwhKBo{2{cNBpI+qu8&v z$frx6lFkD|XlCz2$E&(tY_LFll#;J+V>d9qe#RngG3c!h(%o!Yv30kD{``wJ?zU%1 zK>G$?EhUxv1U|g0#~(%(WN9mD>Gdw*42m~-L*g!pN+&u%8lp}(WDu48rwgaSQ7(Ry zjw>2+QbbRa#spG1)^^kN0X0KDpPK-6^l#I~DqgNx&KFQ^n5=iiuUJ3kY|H$w(Iigh zpCl5!&m_SV8i7RNQYcr#oPDx8Nk-1H&8Ix=lK^a#a8&Tk*jg3A_=^tbe%~@!W|a++ zN$D(J{p2+EOtvLbijP9bh_smEjQ8A9ZSp}jyUMa%MK56(}n28d{y+o<&e-V;n3ZIY2Vg0jlBl0trNt5 zFN;1@?N~Ygl|}#4kWwSzH^9Xs?bdI+2%@cAT{?>qCEpK7F+! zgPUz6=o5pO?t@agBJFuhlK`<|WtKThMQTF$K&)GV zC?nr_P49Jkw6`(gpq0bhQ~}{B-E!)B)6d!ZW6-n537reb>G^ho5Y#C@xAbcq-z9CFAJ4u`DKtmtBx z&((#j7p;a3`m<~bnASo?biXE;hH|i+em+JDfYg>)l12)Vm`P^_@(cS3)T(^2`_ut% z>cJIhJ1$6)7XhUGh-k@xH%-b(KhbN2*xp zCYI_mP<4m0XAdem&ML^cTAg%jax7hnB1AdBl*<#%2xrVlYJ z0X61-%5JJDjoJVhmjB)2{n1Q3@DQICLR{PrvFkUH2%x$wJ_{I^=|*t_@*`sM4q z-u7QZY>xy#WMb1PQgDex$ACN=+CnM(?vkCB$bus)p>4-A;@zMB%<>K{hP!evFob^! 
zO70-%kvhgrrjW!eta1uo5!BSSO9O8>CI!_0}t1ncw3#VjTu59JI4*@CIp z#wtC8iy(&~J%UrzEUIGyvk#8+G^vrk*J^Axgzl=hs%gc| zCtdK(IzFhvxl8s+syL*EbqWelJy^yQNT4mQrR=y-j>l+qQ*=zU3J;kVC&0q}p(pS+Es>RtiAfRgQ=>QqFg91LW@A^GV&|w)m1O=E7_k?qMbwSQ z(8QMvoj4g#KDIrwCA|v*6yu9H|K6|v(AdNEcjGnw-#BZRU-2b~q!fCpGhwiM@=Is3 zGT2L&$tZ_fNMIkzup%ERx;4dIbbo}?5WJdEOavRpT~FM-BDwgj-pM0!IU zx??s{E}5JiT&tpTl=l~Z6au-1Y+YZC?QYcN`oZr`&Z_OWE6A({0oeb1EP#vf58;h} zd#Ccp^8)ZjI8m=009!S;v+~9osW5G=6aFiBLs|fFghC+zF@@^xcy{5C-DIMk!x&!RL zy61;qv{N8LuVg%8FeQ|6d@4zzB^(CCbN4#XgCGc3)aWbj=%TVasX=Ye+Nqh6RT@s^I(}sGNIiM zcP2@$Aw4WXOzAcqFsGU~YpXScsg4ff6pq&Z;_i7GxWKidV44jmFO&w}pGokC@`xUI zBMt8`_-s?7>*hfX$v55T4Eob_89BSwd*rLRADoMPr&0lDNYM6EO#4v6ntr=rAVP1sAk4D-VFS2 zFOw5eH}H0?pN+{GgPISfB8ET=sR!{4slD|Ds7)mf8kkIAZxCd(LxIVPAS4<0A7={O zm%k~5YtUPU-ma|GUTONXWvh*=CW-b_!r`f-nhv%PFYfIRb`tap4kF@DS+&0x+z){O zEPu}^e+kW+bPOCE0i68!0YbCl`Bd%KNuvhecep9#Sc*u=EdVk*FPBU?&P_9$TbIR? zQak01EGewx|#4gVQR7VZzt zLH~Q%Ypfr0K4AWrbXENS88rX}nF-|)m>W@Tz8^q1Krt`>hOQuex@1oc4!+?~e+;F< z)*VEB#6;_!q_gR6TS1KKHH%yPOqSx>+innJ_rQLlF|*>kOZ&<`R?Tz4HD&_CJjAsR z!B#+1=qL|49NmjP!<$K5Ljylk4R+IYTlan0GrnBj*o1ZZ_pj-i@^i8iMusNQ&O<4} z#yh@u*oCajAL>+DSpR|m`4zeUsUs&ZFuq;>E-Ri2f#Q+tZ;g3;!Z5(W2n2~gF)-Gc z#oV)k{F8ijK^j<=LXp_VJH)A|j z7^dqV$h4ye5v(2=^-V<@5ugHKSrnX!x1gm-K|-e5%Mm=qleab86{dcTRz|}!R$~m) zp&N#~1eH;3##{sYNx<4REs&8jY7;D0+h0ss<#@l>k8}@Kf4H19@V!6tLnVy)Z?XlT zk3azMt3~g*!7;4aWT|GEOa-Q*(?3Ph1}bx7 zSg|CH(z()?GIL=gSrOC)e%m~60}DXiG0TZdmK54L&@B9-=uFwucXg(PwEXjHLuZux zM!65L%Pm*+fBv&PgMYy(HxQ^)4+OvpW`>+aIu_=`Vq@8h}n{?<%}kE&f0B&PGk7Zsf9 z*f13^C{oTMWNM#f7!fc)_Ki&-F5$RiwziTSJQ!nG&&U+mgSGkXlgQU?Q=u>X+3#vK zIaU(4+k_UTgCSg9A9EX+a~tNi4HUXcXffj@SSvu~N;GM2k^?Hnc*s|DrHDU-l@I-9 z7+l2}J2IykrUT~B3hd5qXgzy87fKS;5;p!Lh=4b~Ox@N=U{Zw9Bca%Y5bAQ1f@D(U zp{gg210tC#o8_&os{YRSGO3y_cgRNHSJONXPJILlW_A1+a;&7LkniOD=TAOICOLHa zhJ9O{9PnW!o?8<0BZBY~$aQLo6)yT92pHxauRx&rAy1Utl%oj zAqan1kCrH#Mr)gI5dFZQxPvF~=TiM@jAc|5fZhegWIeWNc-|mr@aa>EeLy*W7S*!1 z=pJEBuVFPqP72tplR z1&h{QC9fv13PYy$Qh!vci;PqC6UwS0{%9_kE!hx)Rzc?VdL|?B 
zn`b8LjioKIdW@6H{^F(`Xz+*Qxmh!^CgF76aQ$@z8d=5unXN;XQit3LaSAAAbPZ+J zB)b8~ayn_xu!)z?_HoH1rg6?&g>wf}NGw^$A@GZ!rCkV#4`}igI#mi5~FJnF$7g0rYLr zZv1U5aPZfd@%nani2SU*9y3)w3zRrH5yA+}E;gp39);UWtAjyNc)K65PgGm}7 zGtW;C3CA6d1^gaqJoIst>GcRFXIiDMqU zG?EvQjq7}i-EaG$poHvT^gr343MD;(L2S^Ue}Rq&B=m=!)Z&ee*iZ(K6$McXxkm z$1y0Sc1Qyk%~}HS1U1ND+-UVIK31k)wQO)?oS8_bE8TTd(*kwlwC$H7Lasp6vhRjq zxE&s|MzF5)s;9AxKQKE-@-2r8s7LhJ!>Bmt%~nFU&OG89%?ol^aBdO|D^rmANpe3Q zBogItNU!y+^~)NOh4p#BorkrGGso9Wq45d|6sq`ZN=1Ugh+(FRESpOQn1!Mgk6oGa z1yQKZFg=;gN2W8qaacp%tu`S;3$xeD7Qx8vH4Pdm@+^a_N)e!oLa2Ko*H9|MWes@1 z5iRo`JSCoSJvqZbqHK)`9~pE|STty-o3P?e1Pj`IDmUV&R7Nc_zH;n`6+j%@2^RmY zUwyQ#seCIx2(%PO*0|(Lw9_OLCMAr7h38;d0(H2WeBp_!zPi6nXi&r^vn=mUBIfzm!CBG#Bj?V?haMn7w|W5g5fp z`V)nmWeJ)ygTSO&o+}sE<)%N*1N~MOT3o74H4%ku+uf7>!siKWqFdz@=~O8~t7-To zJJ_HJseGlF*6U0i@;0HFEBFPdbxy_-d7O!97p2NBUZ3ZC)bZa>e;{2RZwcPZ$PYm< zY=6_;)KuhVbN~kzohtSLoW`%w+zLEd3^Px5w9CVbCg|WnLV+d5aM&oK=X>N?GASdB z?ZqK$N-D4Vc{g1Lu*X2wzwMe(%@#2;31sb`0@(+6w?Q5K4=kR66#}as!4a3`AN)(M zaz?*@Q8#v1ar_=B-$$dDi~~bMs6idiX{nm@98(`MgPni&i=*&m%H~{Tjk1|v|I%aY zn__L@NRP@g};xos;}?lRl``-hg&5j zoTR(B(Y}lI6MOjo5?pW9n+gH->2lD6IEhke1RHYhE3Xezr z)XI~kM?=|zax>6)@_h8^U{32+!gI$fu&H~?d&ZpXA3|1GIsT?$0*pBUN?-qF%n4*w zcY1HabeQ+@=0a6IUKj=xEvJGa{*TukB99R@y-H**p)F}Jf>MzNF3-`H{dEYd`qUNv z=#lk4?5DB9SzwdIodbizOgZ*P5AA!ubQ_koLvv~df6AcCiNsC0tG)s+AM0{en*Fq+8CqEQE-&Y93I@^Qj?8ebYd$@u|HV$63m|_iOMJV|`WQGN-0; z=GnTz*LiGAxKaHtXPT@)JeK)&rWKYAF?Y45dR~~528Lh>_!Z)9NOb3+jfp+1E_G+} z#Yfern&SHj!B>f)C@5|mO~E_nV=(|zvAHf034yb4at4FE!aFQsRAEGxf+J{$mcoJn zXZ8rBn9M$+09egB(Tt*+`6YZ&3SGg74vfszR~tX4ll*tY`q(}u`p?4t*G5@Sar7Vi zKB zl+)DwJ?CrA0d7%0H8K^wh9uPj`iRNeou0ZW!P2dt)I*!EPXydw(FDuIsZb^xIeH{B z=`jzR2okF4%KfwC(S_;X#t||K2>U`>b1Q;s=}s0St4U;=_@+-i%tgiyFs}bf3l?3>5 z8)tO9_Rd&8YpcgNX#@1Pcr1HZ$|R1?m74bEa1(Qzbub}S%EP|d%UfIrEcY*e6;QM6 z=wg-Hr~u(TruF%Qf|K~yqVCz^5kQexG2bTRWmmWO7X`%o(NeO)4#>y<W{1|sbe#VG@^>1Mi7q2_pqP#TYj+v{N3}utK6!~;0FTTKkl>-Ty*CjWo-SN zGD&Mj6)L7#{yrzmRz$xqYh4I;GSbEkdsN(ms91Nl(op>b5YUIY1h*96>M=a(2D 
z_RNG0J_D7aR_sRWn;4frp5|*POBMp<;7U$THL0djHUC=9=YkG=k_6GB#u@V%rB`T; z^SB46Gu_=8z~3->xQfG)*Td_DTg+*hWS4e45<^Z2Ayd&oICs`i#S!fAz>uc*c}?cgPo<`3`&@*~#d5 z@BR7^#>UF_cZ&qXwjXz4dYe>9+WrW zRH4MaNpy84?s;-{beC^A%=uNb;KY0lmF?~LF_Y@uYZX2I8@f&v7jRg(Z{!nH0p88& zPHt1)>5|+AMyx5KB;Jo-MM8aAIQ5l8Lr%D}|52t$Gw!ucDO;pwsrV9&3!h^eTl+Og zw~MYR`esQp7F|}SJQA3wnRgOXid91A#%Rq(jyL!PxXQBkKD4PW&mC09vW0F%$#=%G zCE28nnl^50p$pb9)0~H%+?W!UWo|n!&yZ!fSPx`tE@1V=qY1Q3nT|13OrN(VN0}}; z(&hwnw4X4mzj%udJfn?@TVu6BIUP;Q4a7ssDU&yeH+1CJ(*yWba&qxIPl)A1vpFlv zUlmXd;7hs1ab*AR0{dh+~spUFk(db6?>TDuGCK zR;Q$$EOLHXF9|Ov_?p8+B91xF{i|_Te~8^;VIeQm>agt-wq$+@STLFIYI2Y&%63kK zb~#m7?7A7)Hh%(O+?Y7tdMR}6Y%yurx{&9F^&#<#PoF+VO1X)_gHQ9m1R(6NyDtiA z{t*ittY8c^qo zZ$V$qqe_86L@OkpcaA35Sv~|Zu>8H8{YU;t6aX^(xBOAcVy%uKJQdHyBT)XG8pGmuO>^#ytPhA}TlX?N+h3VZ&k$8a3!;dyZFhvXd znDxh~2GNICZC1yP`w8l21WUZo3Of;|`j4*XJlAy47A940KhZ z`JL_i!xXL#{hKlUQK+cGl*=*Kw;;929sNtRi^$E()4RHkMZskZ+4s_o^<(R)f5G|j z+z6t88yCRN8CJ}wtvd7IlTzfbSDP-A#{^xzZ-!Vvtf;d{E#J8xj|(L<*tQ>Vj&H3e zsJzi8`U7G{nruzk22r zT7;XL@JDtP7HA#rDijQ>{|fLr1Yub}$E(PD`cvewVMchg3`FaVCSQYyKn2WCc~&>T zYs8p$JAlwMq%{|y|82N=>DdExb7`jru{U;_rwW{qqSpmprCL%i{OBg;r zMhOdRXH!Q;32Q@VQ!!IxI}=kzSyS6D&gMic|MvKn`^N-1@XG;ZzDfD8Jb^7&E}-l`v2UCASdjO(R(YD^w}a; zjH4W(Z@Q{-cr78gBK!PD_%N)w78`~dVSVas7?QdzkBBz8H)$Tv?kw6XY7<)&B&C-} zKMEdEAXg9kEA$R|Y-h8x6IH5C)md&#^>1_bX$xEZLk?%!8K81_r4cX^59Ntiqn)*- z_<>*O;i`yM0X1K$`ryye_`N&Z$v+M3-8V5F7utDFGZO1*e0wKCEFa>n{O^Tu*;zi+ zi7+$&MSanmsDTGOV(_VKYE2>(EL`s=F(sJI*E8TR{ZcKINY9gE!NF@hwt8rSA=7AX z5GuXngP*B-O0QDw9rphWtmPox+g{fda$I|OF(*RIl_5&U*=d)(PwYKb!cIE(?V(LxxuXl}tLl1fYIGN0^9 zaCzm;`b9-LC04YPCt7dSxTUUddE3}tmbGerJLT@f@fIb>_y#jpYIK|8 z-gRsShn^GJgn-Z^qCq>Q9WF2Mr?eCr({8EiOC6E8p@3{h?be(fzD7M z6!c4&i?Ea@rcF;~S;cH5(l>txf&8Ed3HcdC29ASg~WL*-f@)1My`Q^5Tt^oxkP{j|;v08kI^o7aJS#(T@7P zf~H5uCm+qdc^CjqKS|z8LnAJRst&SF8az(+WggoaepF%^e;Jbr(}dD>%uUy&(>^om zru79_LnLPSy~p#R?>5`ts0qJ{l<{dKoO(}&HAejS#;)e*wh5N>vs&^5 zFvA3ot{bvgD3fk_&f7Ns)b_24PHder300tpL|DIeIcjQ(s7%o{!s+wX>)uENxv`O1 
zL8&I%68+=!(m4bA8-dE22FtUZI)NM`iU@p&Pp~Z$b zk`TD;%-^|d<^mnwc9WuGdduFAtOA;cB!!eCmjmlY9H-=~W=22xDukq78g6}d%+99( z#~_c3@PIWBTRCsU{b$d>y^RFPNK-IO_zBeldCHvF4rzgy`1}H0hM2;O7^D@c#M|`( zI*dOn3F0JuM%O>pX08suI!D}INZ%FJ!pSr@8^mW{UXz_jGyl4D;>O4L-W%r4z%1AC z1@uoQ!XJ*&ke5OMpn}fKe%hqE_@R~vLZ(=y^aL|$f{=c(z4j2#H?|OAYQ=)C6Bj!V8MOzM zm)vyv%bh2PpHk``-%E-QVf!rq+RG}+$b$<*t=&@}gCt#AAbHk@}FyR>v)M^W$Q zXHUdY`;EY}NB8rtj}NF|=_mDfg2M8#ps@VaeCfsW0*<+#_venie zdf%%Lv1+0LCYxMHAqS7p=^B>ylOFtHeGT7=hceIajb!#ES2^E3teCm16nV zeGFhG12n0vleD)1z!m)|&w$Mk;_TaJnH4F@W@HrVgxY^r`UA8>j*bt%Cl^XYVe1=L%`;)rV2z0=D z8Jk@ZvhIMfA;9`P!e;fqhoMn;{y5}dZd6>LsEdME$|O%*l?As#2YHBsP{KJjTw5mx zk&Cc33UU0>)U0dI{%4{lglfLp98B^}3fqezPAhzU+qv7S8z8JXiJQ5+L|27qZvQ(t zGJJ31E4er;7}TfT$gt7r?A!!>+3YAz>6t_g2>V)RT5kAQTFXV4nru5}T-(a+lU;+p zSK-mAUHO5u%|XHgN9U9GQt(4(1pu1v{{)=m5=L$Q(FMIIGs0madKGISfb1q{1224n zvoTIeA`o*3#TfylV1w}}&8e;AU~05xD1GskoU9R-5qbl=o0xo$BVMmt=M|ZDDs0E- z`c`tASZrR6EuWop#=7#Iu%)#na1Px;V5Df7j$9Wd4umBQq>Rl$VUz>Soiz=e#D>uz z)uZN|sx?YTMaOD_3C@)o53X23WtRx2RAZGMqqg(;jkN!~u^2T-H=;Hn+Zi_mK5NgBIX)^?wFhT`<(4XyPN$DSDl}lIym6;Lm3Hfk zmI%|C&D&s7o4pm(cBni|hn`?;3c7QC0%AHq>t zSpUY^5@Zm1nf_a_AUqg2$SGS@?VnvTyFtk?g(q|GWqCLh)VRTkY;}}Bb(I;5@b(R#CW@Tq-NYvprU zj&<<;XD_HK8lK7Yy>N%t@udR#t>APBN?Uwe0-AZm&H5xV=6gOJz}t zPlBk`5lgbu41 zpg(94>#_YnljnvQ+lA;*P#N50K5_pY{7XZ_!5mJ(!6Xo*wfD(zCneCB{W%a^s$0|+ zqQV`8t_L-_6bbD!h`6KF zN~Gk)cD{%wS#U40VnffA+;np)cD?Qb<8&E>5&sqm6V1*1GiUn#wUpW2Q7{ z+4U>O2iz~uEg(snv^mS2J7#yE7YR#Sn%~h(;`rF8%JDDmWwMkbK%z_x;2s9hwlyMi z&(4uH%>yW>*v{0})m(A2`7h@FqZi$?XPZ-7RuuL;LrYR!NbgrwO?{t9-%-sW*Kakf zmv!ANQln*hy(LYeOU>*7-06rG$nJhV@AX44^=Qi{tch5VE$q&#j3BsA^W`1NPW!pYLZ&PzP5z0T08V zxp-~Uk7t($PBuKicpQ>>%B}-!`cD+1vUvtp?BS&~wxF-dnh7IPIx&$8hw=q|fxzei zzZMqYeK+Bl=Ui}wG%HA-9*3)Ew*@tw(k1bkjHjl_S7X^dy6saN2Ko$H!C<`v4jsr? zn&z`v@2pOH52tZOCPn+rQz=)lr5Q4no$`-ufMc#iMCmq25osd<5QpZ`o{KRyV2I{Q5+5^o&lunMt zzpCPI%E+;cC1HvIO=h-$Nk@FYbtE<7~~FJCfC#55nBcxb&Z-84meCx zcF~HCl55n7dxQPf7_hUJr)5v$T~$iJm@T4R8`kl*N@5ON!%(LnWHxaWo`Zc5Xphw? 
zNdn6B9hlTyLWQ%CN_9QlkIuzx1i+`CD*_)eV{_NOGV$Ims!-33JHT2LfI#?M2bD$z ztqpWABW4YM(VfAo27Sxm+^lBl%5 z#M}1r?{T)@=4Z=mHy!DfBRDAM$+jo*M#wgzABJ&h@cx{}szX9EIZ}|+Xf(PYz=zOO z{DC$Ek@&Wu@MD-s^#}~ylBV{kNFFyb?92L3w{IC)_|IP|pY9}_c_;nO^P@xSpR9ya z+FRTQFBk}RF^vf|rQi8%uzqOvW99m*9|V9{j^e~nV*gx)YZiH=*bFm#XeZTPKcU8ry!a)vUhmrhK)&GGR!6pP$CO_xxsf=E%hd>m#Y z%Xf>{9bDIZYKwF|<&jC$Y_EB~8%u%(d%-$jIp^d2aFy!1 zWG45%Y#|IeP}yFwm41O%^~#i3Ut;BL**3Ys=5+TxkFay6c%|OlLqstvaDDGNeyFei zkI%o1BOPF4e-F^))%ccEGs#+|tYvcE>4TC@mL%6W0ugL7^$%?^*{eT3i`w*Bp$d<| zt$Fyyu?n=4p%EyPdg6q=cquaJPm?Ti;~4Un@D8h89t>1j>tpKcrubKyPi$5|?CZFE z_c^IN*YNOsjA}xa$su&}hs82l_C$-pS;4THc5Zm(D)UJ6nda$-8&qa1ZdEN|!Qe(o zV4GgCNTzn!U|dpEK6q*#U^v&P>9QGkbQuN&I~x2LlfW7kApN*JX%>aA0mv2UEpI|I zBnGRTaJA%oF~wLE$ltk>UcGAEZex;%Rg&$gnow8PuXQAzRbvTxWZpm(tgtV_N%R*l z6gv}oVuDjbG%g77MKfHYt9Gg59wWom+pw~QG@$omr=<++@w_3Hg?aUu?Nt4vPd9BU z8d#(w*|=tf$& z0#2fxP*V)&Ea}FK;&_XvxyF;CtsK>ojg?Nz7gC1nq@jv~=9A%#ZyD-)-DTx>+ON*> zxEYjGBg%jIn=(YK3!!Nk-ZMBWq__Y@@yq8-N2%>yR70u%VszJQ*y3~t^V26>^rHkF z`Tb6EuzV;v{`YPPI6k%|U}gJDV?_?2f%E?kq-+1tz=$n3J9+N~CS6bL; zb%stYHyeTZO7}R$0VzV->diS0{9Ps+)Cp`$`@%f;fk=;h7?p8`hU+{<%(j+iBCqyg z9g?LC;ybw|ssX}sCA#jtIxxRNhKWx=dDkUggk8L4&fR>QGQ%YF(fmPx!f2p#U9z)z z8mf1elUAMr7Mem3^2(z|cxwL4x945b$LlMSO{VB8y*t?L;jen+aB$e53~%H-WQJiNMPEci7La;y4Jyq8=bLdE{Q z22uVW_CTgD{}@)pZ}`#g-COX2(D&j4*o^y=BM zrJ?m}Y7gtaaE4le3yAtu=IbIN^<>9GVT@r3;x6j?_BO+w$s_f6-I|uR}}X z840}lNr_@ZpaUf_Gb9g%+vSNT*sr@@J>2ZWn>`#C^x~N%UAt}~nUJL!dodqG?XC@r z9}kM{p6ie;1tv44l@(fjSCq`#T62jq4HJSbq^yaUyTPj8G;x>(Y*nR%4G!@QZ<+jZ zJWGDrt0$XBfIQ@Tzs`Z8i|G1B7zr`FI4^= zL#*F7&JMYhig=F&s!RgfU&pFXbn1ZqJNVB^ImJ1;Ik86$&*AC>R{?Az?i5?07AH!} z`}{KhM&bFhG+#))B z<~wfghqiEynDlxE=xv7hnq?aq!F>3O{x&}(?QIY!twF}XgAvk>Ovn<@RFA=IpmlXh_|14}z@TsqjfGFFXF~fZ){wX4;o6obG&g)8$<)fCBv{NEW#SHHD_!Nx=Od+q)KmdIHR5hT0WGiCTO(ItLZjXX^UwRS3OQ z-ugL8-i#J_ z@2Fejg=4F+ZQD*`J87&YjcwbuZQHif#%5zXX`20X?fqTX>ihN}hwD3hPMl=1dpkkm94jkZH?WuL(yz{-%fkFy?RE;1pg7WupIkk@hMSblN4t>IKf_ZQa2_t{vL7i+F z9ZmP)?nQ?X>*#qk9aS?d1I6NY%K|ZGgpEr)qOo?fb9zoxn8=X7TLXq%V-`N$>_A#9 
zIgfs{RJ9YOWpvsMnEOJXS|l*PMTK@6$v5~t2nG@}EE4A>xeeY1Q5S?k1W?)DP#fBjB0Vs7Y>aw;}g*bh%EMAfr6r=^2%X#N#Bb3NC(cf-e1X`vrM2 zcJ|Mj`Iz4KcPX#C=5Et6zbH1rpvI>gA+y0d^psgPK9mnZK&6yGDtmC_DYQVMPhX6bUjt;#~hrC8wz zxqVxr`}UifUp*@Jg%jHPiPm-wPy9#Or~eY9nC z2TR20Rp-@hqflvQkLRv3YdlNu`$gbCFmvqeANzOz6Oy=L9>|Ckba{*F(AK3f-`Ydy z?}XNQgxKT-#DUe~hb zQKZ=~M7?vN+B!6gg&xppGe_+eFr~htv@t#W zYuk5KXz~V^^nvxbh^1!37YTf=Ic{FvdL{l){<;kl4FAMgO3q+<^hcr21KfJcv?>xV>J!rei?FG7ys6V{uq$ZGi z%h|zsmWEEKI^3k+a9yRL=#}>lWw(Z_^Z&n)M6b>mnL$gsApj(Ce;eh0Ac zJE?4Y5u_?*RiO`WVuZEd!Q(8PK2;iUGN~q!_fY<;ZH{0T>KO&~ce$T$I{Vl$L>2~p z!Qs-`u6%Rs$#>R@+~a^wi}fhIDi6X`vH?TwiEi8&=VWxm5bDnj*;tN!UOZxYyELA5 zNrRgsk)vKBYU_g~#$o=z>?&V=!ifvH&UL#wByw+Wh{ltJJp{>meUqoKum>tI5;gx7p0jWJFh1_#) z;2aI`$mBv6GcHmp$RgnobFD6Jt&~en*AiEss7(>z#p(5EI|1K*X6ws##z!6r_t*MI zMm3lYV9~`Gs)aO27iHR8M0{YLZIq%qJP+mh)4P-3= zfY@3uR)eg$5Fs zk3taHQY)d~@C=0!Ws4p1#6cEb;6Y~?`hvZPOEztxCtbee6?=%>OHJ_lnkT|@@R#da zWo3s95Px~eCRFJMsRR1~M`(7~W>}}IxDzfXcNUGJ%K`(SAc&0WOA*u@mF!QOuGmF8Gx-Iy4AG+h$44B(~hZAWA=IMx~u}qWDeuD!G zKjAl*4bc?MK5BTzv>T{}z_nUuoLeDVuC-XeO&yPx z)`x1_gJB29sUF0(AE3VBz0tW0TB>Dy#50jIfMMT%tO&lJe^Gi}w+LSpyZ*5qSe}e& z*t9Z>O-n5u|rO^Z%42oApRmF33 zD^@ir(qh%N#k)Xq-qSm27&m47a8wTEEV z@RJfvKqBV4)vUfc7)fE!=#Ql}ALs6!R0dWP!O28FXB46V3Y&3;{(vfmG^!wJXzZ#z z^hpkE;x?IsQT=Qr7zD%$~7zxP!sGDu=8IszAbKsjP5!ZOoFU1%Ez+a7d@pkhJU3FxdHu3{ic12M=?MYeUt?%_fn>=Ppt`^o#%;cV0fCu1yK zy`ItnEZNXHGsQLZIvn=O{jkNd);U3jQuvKGo#uk zQFi)<{+$6j>0in``^S{|zwPs?@v_x`7I@&F%>Gd0-r*-+ukv<@%O0;^lvFFEdS^_N z+xnMy+;Dy(n))-~7JUTl$x-r4l%Wdgb*C(u{ zkpjN0RvYjc%wd^13Ot~1Z!UDnZ(=FSO#JSA;1p&OK`ku>)Z@{k-t@JRs{_98#(V83 z(Up8UHZXTP7T^C_aB~kCweWZ}9Px9K^wq!tA^LT&$g%YoMtJY^cPcfssi*Khob0e; z{k0+$nHD0-)UoEMMniUJA)y1#VC}gJh8CMzm9wCA{XW*GZ0>2?8_FFtSNT9^`jshPLL~q`z7e| zQrtliq?5SfP!mgs9m_Z+bLmRc@o7f@HNW*Df9$>^z27nOftj3{YJ;IqOy3QOr53GG zLZmG>oNs-Gd@TN}2qVmcsrUMvTZU1B|2fo9^<&lg5U8@F%=HVKhs~$mIiN@Z)`y=C zwupCDcoe?}Q4~&m;viZ_0{oqCY&-_sIxLh&OkWYR0&3v?v8W$QxA;G-+OmDDxAY&$ z*h+vfcS5+1+i*{P(s&@E%d`b^gtPY=ivhV38r2~y+&#?3MQYW~38USlDKe$*#>kT! 
z%&sx{W0ZwBsxII`pi&f1Ww+!-AQBz5Z~H+7mhkXK`FS9z0WQZ?H`Ha}h-{(aEuiXQ zSS422LFr>>ps(|Ozf?(!`&2%LcNvN8cOcvChW{>4$zMT;@0ocR!r=`$FtY zycV!Ct?S$5RJVMs;|(0H5h?MnlKPlQtBKvL9r#W4PisqkLuR120x-( zk=bKlnP#3LK{vD-av(`nlia{i`o1W6!~6AhsKvkx$l8v>HT;$TYu?M?agGtWfns(L zNV_g$you?Z598t13eFikTK4*mTkl@9TzOV^Us+=H&v~|@tEP+d7S%bZCw-$1C<|F> zM>*fJdT2OrUis>Ka(Lt7Bc!Q5#P7Cg@7sM8s%Cg%-~?LW8-49MvPMXh=BlB6eU1Q{ zg&-79_TiyWFg%=CH~;kapHj|hjcxvhJcW^_ns}@D;a9g3BGTcLmNcX_7n**N;M62SVt$D4r<~9E_vX?y(s9PVHFEj1exY1oPu4LHy-X! z)I{;m^{VD;Y|}To<#kmgQH15y(Pl~S9|`d|CqrS;AzP~q%3Jl$syJ?8H3goCK(DCh z>1HU@e)`F%OdI@o@bGf?>g9>o5snN9+v$ATMhUWT$t`0(x`_#)n9mb$>3H?KN zmM)_ON6ruT*6*}-BCGbK{Qthc>ZnN9uK_mjDQc&$>0nLa zZR-!Wh^1E>SP41ZRZ-++VqoOL{6Dxz@};}@B-*6JuvVoPmVEb}v^OQ>NWd?3IS&=)Zg^wqaG2tM8(P?FqSQNCWB= zr0VOFJ4A)M*5go7*beM*QJxY%`>8vdCNc2G*KCVWJnM@GZduj7+>t3Kebyo8q;Ga_ zQ4CCrgsoKzF@bs4P?_Q+Jz{>1uoG#;v#`Q;>w2FC$ z)LcTj0zL$2KmfHNF6T($d=K%cE-#;`yw+l=jbrGwFs^j@7oN?4p2E2hbvmOSE6kRQ zvRN|a5<~`(5Ko7O?cw&33Rc~KK2^Qg7~@^~ zU6>d_98&Bvai$jktD-CqtyesWc$z}!*%i_AqM!8C#c_p%rEn zoJ%<9EpvH_YoE^y=6?6$i$G2pG9^+C?Sb#Pp1Ddzi1;`8?;&mli}U?N4BSf*qShZM zVr1mbvY)1P6-6`JFKV?FDHmxrbH`_g4#)lpcjyEgyo`jnND@7>O=$$rj{(5uI@j(S zL5~l4Ng|D9dNjIdwWbW8JOwLfj88-U+;|q~|JFpt1;M3CMqq9fz zkNRT!&_$A!>90nssW(59?sGHbigKjkZJB@EBBN-K=*@QK5}yWEU&C8(uZv8OGn{00B))?GW=lM-HrE3bOTK_m(0 z*cbG|!WN6b7E)#!v2~1l+c&c9R~e^%_nS>1-?HeI{wsPtLZjrWSFg^M-9fK>Z9!it z_XP@Q0O!cgbn8xO-?PTrS8JYe73ZICIfYm>RP~q*$z=7|iw8Ni1*Rj`t>x|T_0TPJ zL<9;p4c5zG&0$q(FyE4j7FGw-H<)tj7mCRWsazWw$R@ZZ*h3OT`H*dS(1EoVROVlj z=rRcIlPBE;4zyP-B`3m#ANE8z)kmfv@ce=-O!!zWgyyQYH;P3~2jAps4+$=>P){x+*%A?&d) z7D?3B_JuBKbO6gw;~n?GQP;RlSlAE6jL%47DYFP&MVUoGpcP{kje|+RA&j_>pO9sN z+4DSJyT;}12<$oW-{`D+)t)=JNcqo(k3f&I36>RqlDX>BtBhv1g41O;RxMtGcF7XI z+PfTA4m#88YWn1&+_r4aZUm)su+3daC>yBRi7VEG=W`_q9*^TH$(Xa|9Ns~#O@J-jL*tv191T4^N)Ei#owFjI{Um}oLU3f^jia8mc@GNDpnMghF)vcrSLINl-Kt~}!+rk(m1iy2Z5J(>oPl1iL9*qB;kT4c|f$g}duTommHt zZ@EeNEQvh#{896etzgkh9Dg(*Ft`iFg7ZAGKhs|v4*nwJfAF0})p>7b+T6u%6k230 
zP*7d)KWSGZT=CfyicEO^PCIXu5-*=pOuw?ns_a(pf!DxPLT>U)iAdlA>QVsJb5uoF z04&rR?_dAYhjhUIyRRpXk44f0GU)&Jr(QMTe~*_2dfR5p$W8$u_0C^L?aoB0XJ(In zfLK>4k}$1?fXQOOln`uIf}c;zT}~OI;#0^$1UFLD1B;)^Yx|e|>ScU`vgRBV=(E=F zPkIU_(-7UP1>1e|OKQb#-!VTzMD~O#celw$!OU`}Jm-dV5~UaMUsA(dNy8MFWwV2} zPW8se5$CN1qr4D9Gq=hyGt+LGLtTB{W+M>etDz&gHQSH&@iZ=4ZxH-CO;@13$urza zEG1zLGBERAG&h| zqfNL7s*lH)h6wICnJ8zMQ9K7jYBNmeq|Jh~a*&JX7}%8R{MyrjkK%e2ZVzQhDLsZx zr{H|~I!)R&9&|31y!wor$*%wE3xIrfZx#o!@v|u{R%4SM=7IY7-ie9sZR0oaENXAp zf0rc_`-j{z%*=nK8|%dOnGY}`eZG7N=OYgxq@cSah9er<7FktlAt02t5Lc)f{KEfq zNkc^*gine&#rsX|=6iQDVD}~V)dwu#_;ryAc7MxCoM6+j9Rn&>u>r{+w%YJ?ghjcec*{*`Ve&wQ#+l;-$C#HDaf z$gWLe{^K@!Gkcc`X(mf^`~KXZcGBHKeTSfZUfCu^O!rgXR;_!{#nz16R*cW^xsS_Z z=l5vT3zdtI@ip-9JZ{oDxmsBJ*l&SE4@BDAsBbcMei?*Do7?DGfw1ethV}tQRa?bg z2Du@?Ob#W(EMq6&nxMDl=0mCTvACR2bOSp{s5rh~i2eC|IV8ZDhG9^b=B@%DbZfrF zZ=+&E!fVVANzAm<`mv!ixIT_W(1RHaSZ#w$D$durYID_RqG}c6mWMpMU6bi-k0}`2yE%5%l>z%aePdmW@Y(%r2q}1 zJwmv32N<1RRVX6pKV$rwPa8fw8~U$Y8HKQ)O9a9fzZ8WR;TBZL*u)e0G_s!B4_4kN zV zAU{@w2)5hK(RTu*Kz8VB2)Vo6kWc|yQ3Vmiut=2@j%z`inGN}O=H-FdsTpvCBU^ErKSIK}RHzI9?;7E(%Kr?NGJMux`H8 z3u|pO17VN%p8Otr)GP9@mH*IV8j$7vzrV0L%98Pam`H$(B|;L$h`KFh6cn$~mv<-9 z4w?=lxgse+LcfmkaH&*n7L~8#(!+E#jRd+8H-as(6+j1q^p?4aft9-zuA7C(I28ljgwmZB?N&EUM@ zoESuCBmH&kv7OsGEytz9J%!V3?2x^KU#Fsrmp186VD`&W;SH9_^WE5bV4M2lXBP^P zT=A02Ca4@4t!u9dkA%OTt+FRu33o`b3v4y@OYF9+X!79h*F4bgbrUrD8@Y4_)bkBA zdIF@zkz*|L1Ecv^hP=WdJXiNO-8H~d+RC;%DF!_eJ#AI{ib|Q!)AUUIh+B`g`djAr z0-UXfZ_KzMJOyS{sXSeDZK9GYVft@`{U6FcLNJbSSNWj(e-o?=+{b2(}8odusO zWFR=sNzz{4n|MU%mQ9dG-uwTCH6@Y)f%0yR&CSsPq3u7S&IU7{ zM0bA>9#ye@+ZEB}N~VQRBl3C4d+IcFdd6)>hDykTp9QJiKd5yjFv6N?&m56xC+rUu z4IvwNMY!R&Y>GM@K88?vH|TXHE|%;6k{>pq64cV%1fDIPBbh?pH6NJtWCq<(77IRu zz&8tdgb*6mVT~GO?x9o%KWE+2ZqsyleR3&Dg-xq1yI} z$`;rnh_8$NCD{7g;fis~cy(DRq!+w<{7u;*PE4PO;ab@3R5WZ??ZJk2MbQ z6a`Y8BG&f(SUG)~lj35+A73r&-#R@5Agk9-VddJruqZGrKGPv>V`Sr4)wLRrA)f8! 
z6G6+RMfA9?9QQl7jLAn<{^-p(L5;?K!8p3dWgX5LgL36hU=0C{MgjGkM`e2mdEOLA zQf=2k30t+`yzYG>19^gz-S>F}#i z;F1h6!QoT-m_&!EeSZ9EFPFW|cHDd!_zxNPT%Q9B6hd}7abU-%L$(M{>g3u;m3Q2FsiP3Y?Tn3 zPFNH2YF~y*->l^+Im2DKc`W%mNE&?#?8(c)!7MXb`8$qk-<%1ZBepN(?zw#Yfe<)? zS;70zEZF)}FqadFx2vPHK>VYI5uF0=RM}pe8+gC&Obyr=?TPg?5aDofGr!mXlYD|1 z;PaBh-k51ne&f}G!bwFO3D6w(l~>OM$KFZ9``*Ag=}HF?eo8Y`DlfBMq9D(AiM3?a zhBYsqjwYIzgYeBmn?{?Merq9{Ik>7{uNA-WSL;Qa>Os4$CR!cj!g!N0MFFr9_vhHD zHv5Bgt0Ky4=1?bTsk<PG*v5`@6em#UvjwyMCMiUohf-olFX|0}H6d zr;@37q*UDDRo%Srqwgv-p=!DIYuB=P&AU~7@A?f2@#lZbFtxGuU$WiYzfptiZ$i$}tRDV|RM`~G} zV<032iAk#Le$Sx1V8!jltSjspsB$+Q8PV?=K0z&?BG;2{ z_!POJIr5n$xB-EV*A9UCzMq@V6^gu<>Xlc)2@%E0e8#N}>~jmlY2B^ZkE~yDY$x}u z+@(857faGkEjn^Cu>X>L9o4@WG>J%Dl9%UqRANw(FOp3~bEAus1FNniZfe7+N$|5F za0|07Sx&$f%c12m>WGW*95kh_-^WOYVzF5mXK;xTKeYtXxjf0f>1yD}DGD2Iv%lSz z^PNmi06K_H(8!cP<%yGg5pAqk^0pg+`?fPY<_@P#r$W|&4MDQw{3|~{%eLTe{C6O@ zhqi-%l^PK@oGPjPCocHvpqbp=PA>XPz4xpK&E>fC!^f@_nKVtEN3-Dbr~%N%OSG1sbB; zz*=4N9?ojZbGA0c=Q2AQry9JNXK-;V?2E zyt|uU*WBvI*0_E+Pb)EEtMyLNcp7TC4t2xFg~K7fn?{(_5$}kS2W30eYGzb{A{~PF z9KZJ+{D6Ag;T_nW-Oi{mP9L9cp$~7w5BXiQb5{e)^<(xsNnR&Cc0|-eQ>q+AH?zX9 z`xR%-GOx^a9tlT}?}WjFg{7JGQ@ep9n8^$5CivG^jK;>2ZOB1UPm*&RsI^|E>v-g{ zHqx76)!`FU;F-|yHX`R>ZEex70sXqR-e{0kZ$$Z)FiD26bgpwc1zI8=ufR54IeV8+ zkeqV;w*RV$4~c_pjDHLGDnfsDKj2RL3((>lx&>ZRjTjrFwQ)#oOXA>{S~Z*XnEHWbOTGrT?%clh{CDQIAG z9ihCxVdpeB;OlfSK`qBRq~`f)uShHNdjRmQS?Gw*m2G2mPHlu)6ryp@M#zz+=@Gg- zO>k)0g%@FQ`4L8Y1&+8AtmocyXETg;MR{&dxFG6z!ia39lc7|~;#CKeYRepAYXt4v zycQ_Z{(=Z8or;r1g&BcfJ3OeAbIsFtKJ54|j0y+HfCQP`V#aPL(7oMER|E5970;3S zvtN_DwoQy0G@7mOjqOKbBIuydpJ45L7~`KlCk02+J4d3tzVh+PF_+{7`siR~*@jB; zLC>sT2116(^`~j$_Ej}@i_*E`!D9TzQA^E_sG@ zvQ96Qb?3Iu4J9aAjorlB((?L0n)8zuf04Ys?%Dlm;aYt?!Z7t`i2Bpx5AuK14%>&^ z0sj$>^=F70kU0x5$N9rH4idcbhUIJC0YU=utHSC%g7YaWiTnFkJgXEoch35@q|&C< z>Gn9t!qv6x(?Qtx^Tppq4RFkcIYen2wYXrK&=1X?)Sw-Gyg9neCU^;b5)9u19{bL~ z`2Brc!5CnM43i`eaGZX8m7n(no@X>p7{^saPTz0ti@j-QM)x#_x^UjlOMylsj#(Ykr?)2;7m1 z`p2Kb{;`hO-+t0R{gSi)?^TdW?S~|Bm^NSYC+5$G1Dj|1>q!)NDR@-#VxgqGo2*TJ 
zzI3qXu|f4sIM@DREn{xTvLg_(rITBf#tr{Vs^BjHKlqXK?>u{ADb+?8ZUp>Bdu>c! zz*Q|ccx)^k%Yx3;sIa}PBF*$$ATuJM{m4wKrHu81A=Le1$1Hi%OseQjCPseK`o@&H zp(8-aI;$5g*&`h0gl&7}IV4MvGf`jNNiHUWSS_g1ECyajQ*&(8wuBAQWpE+xDeIN4 zEk(_!&C43%(yaF8VY%B&0zwjE)IQ5Bs!P!265@k&DSC(b?6PjxUqgn>&}{oz63w0s zn@e9ZyLt$YZi~3g7)Dl@GI6<#WUQ6EkF~`Vw(jo;PRzC%W9h^f*!URtnAW~@o}gJP z&&p!d=iQ0BuKk93vHEhj&BEXVSJDwY?u`3uXZ-s(XALY9zPRAZB1yBlt%e4C-gmv# zs){`dwTz@@uUxfc&DVgn&q)u0+;|oVxMhfWCAF&kkY>-|_uNyFd5YyfDj4l04o3|_ zs0*G}PU_k{QaQF`UBDhK?OUi6lXuaphnOp|ev2GiEIf#)$X1cl8$^zTtJB?5GE2hOzg2BgY2h&Pw*7og_*E1b*@OGI ziN$6k^ncG+*lEHEBcE(=N2QWZ7o(W*@Dc9WX#>@q`sXFTyFZ__KJ0chL|#T0%AC;B zVfs{uHQdPXXXEh_Bm^9qAB1zxfR=FCr9o_Z*tDHI>iUF(_6!al(%NPRXM6E z+YQfXN@V4=2|XI1ii#m%Ot_}+@dgIv^8bhbjhXXf$I;&)K%H12i$Oxb=*sVKE^EYy z6EU{1=+#nN^=Gw+yGU`@=D*?cvI^4GfG}79HBXxmIR|LW?dLbv zKh(S#QdPE$U5MJKptLYajQekr6G`@SIgLePd@M(6!Own6@NL^zPktWTKUjb6ys)G7 zMiMF(;bSr$1u}P~FTpmWEUPPC17H4{nlNbm9#;Z)Ey)a%^1-JCSxU54N^w7(KvC=ei1qy08hS!A%DdTc=B(K z<6#!98+6=oVL*@YmoQw;yQ)#JtVIt_78A?@XSP5z4eH5)QG|NC~g z$bOwX)b%NTPahx$c*^Wf6r)U`523UmGN(Pxw5qNy22mVXuGdpNzbMmIc2-qMGxIKiUd~{5e;NX z#pdcJW69D?BPt%T)qN4fEizM!i9@D2j$bYVU%*?N@M}kaHXcA1?fZ2QT@&8;%5$~8 zB7K9L{K9N*FIFL=&(C_`be7~RSw+Pu;dE^0+f~FS7$4BhK``bpXF)w$z?ieKi?dJL zihw!6xG!fP>ZZ6N+zG9s7)kliRtRidXk5_cnfIiYn|U5IWJzcWEuKZk?aD5p5qFMu zb&46RiNk`uocpPZipnLN>-2PSm7Tb)|U^FFQoXU)$HW zZ-cWtT?}?PBJ+cAy?Q(PrU+t<&8}X8?b3#PQDj1OepVj^>y%ziFBL_(ApH97#SA?J z%)kXJCcOM$at5zZ==XL1gs!2LH?H5H5Ex|<|EkXqO=$ergqth?F9k?h2TW*`8Z-Lv&qRv-F$GT#Sf`7HCo=3` z_v!=qQYB2saRcAVl{w$+6J*@MD7w;9L7b3H`%n$3x*T7=9YZ}A7Su7@l1h%~g~&Jp zbs7_N6!2wG^3S(+VoAd2iR}(-XZW{(+m}@*Gp~p~u-&fEy36zmFKw&6Dy4m$ z=Ra8Wz)Pm5;lKA?&CnelJi@JM2LR>{-8gGR)F&`~5q*Xz==HWOM=DnammAFj5gD0K zB>&`u`^8p!-{?m)xS;N?ke{iT7e#S*aSD`H$;qb!^@_R+;G8J_j3v29G|1)99Rj-3 zYR|lw69?#SM)xXv`_g1^aacJN3E3 z7x)$PByE-15%cZxV>LEqcCE z8Rz1bpNj>N0}BB_xLxYUdNQ77l`(=YkW-f^I=vJI&^x3@kY(5%?_GivJFDZ^LzAX! 
zuE%F>^IeE|u${fB$4>s5wkK2UQ3}^f7_=%2@-5 z$i>-$4x@G0=w!#;c*5S#)MjC#-NE7mzQR5F1k)&2!S^8gLd|4DC{n~SKyni|-aFJ9 zIv0g@KbU+nnb&c=>Jsa*=jVP`4LQG@2&O`Zd^>bfXSlpXp1ykz(78Cgw|fJPI=XBB zS9N~uz0Cgi$09+t z$a6f$(UO45l1Ey$^erJ`lY+~dKi`-Nj%#aogf#3(MA`a`*@hX z=Nu8Zf_v_eC&q7WVoVYfIXrXVo-m}DS|$)}njOU}tBuZLJKthOW(UaHsIKZC91sin zdX{ojt|3Sw`fS4|lTC|xDYh+=QVS9+v&$48pq-ZrOT_Gl8DRl`Zg51;e3F7(lhi?E za{Q4a2#i;9d0LVu4oAiEL~$MAwciyY z+4vs)UeeQ7@~_hP(EEgq^>3#I0M`pRxc)$~0THE_nE(Ys6Y!luIBu*_&Xrvj%O?$w z{*zJ4fHfg4m+`&i*Br?mAcAz?ZIBiV5IPS=Gem+_bHjO->DzF?t=;z>yjlN3bN7jf z8q}`GbLsj~!Kts{Fz*|)WmvX^YGE4501$r2gSXgMQ40|dX!97yupq(Ub>2c&P#=|8 z)FkmrWmLTAM&v}iG|QBLpKyh~u3!*KzXbY~9l3nv5bnuISgZ?(U_!5LS-&o=fg!*lxyo5v|Gy45;DQ3U=vJ zc=hj|)=QNZP9pJzT3j(Q;K09x=IA7={T=+1$G5+Sa4^tiTS`Dv2`)d+z%h6Tlb>>G zMEM&!B^cv&IMgf^Nd6hQM<>`;CyqVWlBvA*mj(`()fR({ol)!YQ>bOaW-g9N$v3s1 z7{(9xM8v1Rdkkg{Y1%JiU;N=(`Rio%aTtptt4&*jEa%K!c2jYttN$LVL8NG_Th%hbTfL1Nj8088`5lqlrf;=9Nmi`M` z*%+6=Zmux7<4?|dDfIIjhUb7l5kBL>a=~4sOx;$m0~8S1dM_Fgy=Nx^Rv(lc0Dw^` z%jR2P9H$ojtQ{Y*fu!*N+5y^hRa7r}sjD175sjmj0Qifx0@G%Tj-*%R0FuSW|z>xV9d|97(MO}yMt?XI}SGlWN)49R_Yoyn{RZN2j-q^Ovc%vNYakMR`X?${dlE6 zGAh>${Rl77wt6i{^(HjLsVhNszlBrdwD05Z;wPd}q)O#p1I57N&i5<|O;Mpl`XJa! 
z{KjM%iqvVFk_pF>_62h8Y9kiQ+p}|--2Xmqyf;Qo$}}N-;G3K!Og?L(7O!7Gm;$77 z6q3Cw9qPYl)G;JJ^EHLTx3zvoWowI2tb|}=>vSfa9Hc++b69*T=%-85&!5Hd=S12- zSeT~MqxFdcy;O|Iz3rgBsFXGXH;&RjCgG0g3z7#pvNtVJc-9m*T&S0e1;jJ^QYQBO z!u0xLPa#ShW@mcKn-7GXi#Y%@f{d4QUGpv@Q4s-q4*Es5XysjZsHukt2XvKfamb%a zcBx|6DN*L>cW?mq{u}%wr)Mrs6JsV^pySJF$-yjq3hgL5GkUffJ)Pek{UeJz=N=gZFFMKY-bzS;NBfk}M<+$_s0x8bk z-|^ok2|MeDE;Ik_*8Kx={e0;;gr6W1m>^9*8lSWnk0@IKI7#3vFv@giu?c=Xf}0m9 zAriu~AbyQ&xhNyS5bCbu;1k2d(;ObEu|$H%#0|RzuVk@PH{obN5GHTkN)B}pFTjuj z9(~-WkK^EET|{(6y-A|0FMvkKY2iQihTp+4Erd!cwW#GR&d_yiJ;f42Kbv3K?`X|J zzD;>r2I((4y0yXCqoWtXF_uMvQB`FtZLR{3e>*%MWK3t65^j=}(#^@R7`rG*8&BXn zNloHnquNV>dA1XOddk}tDZY6DMgq0|hmnq*?L$q+|K#1q%KkZUgC4p6sNk`Cq~yRx zP}U&2cOmV3zO+#~LUhoYx=#;yV?;spq}?TH9PV&68HdA1K)*J$Rt$Z0aSrV28&aU~ z5LRn-5Tqt-)sK@4Y$Frebc%hAjNB*2k$rs>oM62DrbJf|axi^vVY=8sExS*W9eH>j z14z<=uqvn(mVXpt?C1`K_9j@ZQI?<9T8`Z=UAJ)t@*Z!bi+u@7Ktb+e1g&SDpr*8G z3C~k}Hy>G;yT8$LRr~32GFfw9M@%0ofB1s*+-NW4sj*e_I(nEC!TQQi_8XbwI+Vfm zlFIFidN}TYc|c4Q_h~*&ww_P)zj(LotRK1={a-$9KlaIG{QGJb3&_>@(y9>J$hq5i1QH7<@OaaKzC-ct2g8~kj;Ty>Awz-k14PJfs09y{sW-;A0v*w z;4kIFVmwN?+If3U!67RCfoNT}5 zY8K4@5<;y<;#P3(Epa*?POTc@%t_5T(_3Z-eh#A`JQ3=cZ~HBjN-@^Hu0h1wI z%l(I2`ebLEdPkXLWFssaBcJoW+b7hdcow=mv(r|j$yQD^2IS8={)tNprTtk65bn3y zg$o529W!R`a$%_xj74_~v6}Jx>VdCwpU4V-duxmB;ONG+4DH@{#aZN&8d;-%1^CR+#LI9V@XLd_5Kc9F77Y;ikS zyuz0*FwpGVeiPTOg#9GFgPZZ)@LY$jd?AX_qL5+pzQWyAr5n#Z;x9+o+HxDsI?j6TTVo&=A{fVFx{U{_0!TyS~_(c%lZ&9wD`n|*S+8&AHb9LLRKbxHUVVxG+JVMRrqxH*5_^D8G z4VSrFt%M}lc*M2Z=pNX2Z;llRP=Ldx`|4gF*F*giD=1&D3?-`;Ljm5DmQTeG7BhHV?`nnLQ{*pwoq z(D&RMc>H>z!x$-y04bNY>v1t7%2{rr*|2d1;DVF+%7Z3w2~X$lfw$=+0@3Mhs5f#O z+bExJ3>3^JIlsik!Kis|XQ@%-_{7jEy%D6bfa^1?U1$V?xhaFhLz+&17lTzf?kT)4 zf8fNva}NYx6*zX`(nT%zI}dcog1X;A@N$o38)F@KhHpTQcK5Y&I~w2xeUiLOlttN# z{KeS3ASBh$7W@BDcUEC}2FbR@-QC^YCAhn5aCdiicMtCF8Z@}O1_=<{0zrcl&Yzjt z=S*g};BudbD=z=;{;IpGR=rDi`XDS>mE1Ta8@s~Z8HQh?3 zxNe$VxR6P{yD-PGNps^m;u#d|e9|;N7rtL&g82@FAs4nJ2)zC%hHA`DI(ZA}R~VX< zsCNE&27;`vEhxH2gNTIqm*ov6@Wokny 
zC+113XNsHjv}TmjgdgZAk;MQ}yN$fM)FsS(0*IouCR;-uoSvf>;hCq66kH#3aJJ6) zzKz)U0d;9e!+644zc88g0Pe{D!y>}ZcE~#)E<#OxO*DRxQ;REhlbXvKru~DF6ExX7 z{v`XHiQ`+&D+bXF)b##dr9W2lXROHcbJJr6br!FTr%w*{!@~>#6JSk=mL2ZK5bg}yAjQw)rB&CxuyV#LwxP_%-a(gB ziaPtArR>K}csBi7D8}Hq6OQ|UtcxbT(|&TFJjS-w=!*b%!x$On)9U_`71$I*>bWf= z17t&D^Gp@TpijDExJY?@wFmlG4xf&Va>P)FJ!LW+85Zush(pStan}@Q`>stRWsVoi z#;s`!&#i3gUGsibTBY}w!yHj`F3wH z=!8Ia!9ExT&lnA7c1?!mRAiwU_m1|&W#g^x-N2mhtJ0Nb^M({@YT|jb2pQK#2igo- zS$gyrhgICd#zl0V0$WGou{{Q4&1R%Pu1|gS$#u}}8YghL;7Rs2vfRKDmEhu9+mp6p zS}11Y^^#hTJ>9m<9#_No9E!B06PY6z(r914b{~k1T z73Bbm3$(5$+D%7qO<5ZiuLMDk4MBvLd`tg;xlgJVsxjPq-!(Q&Vfm=8(=+bE3T$83 zaq$DXy=fugm@JEUYu&C<3Fh3g<%1au#UIhgMAJfCyDtzad$#(nPZOjPH%^iGOQ+^Gwa}9^QMU zqc<;E<=`q@h-oI?A6*Es>B=rt^xJW1h*=)+itnE&_Tg<&nwX$5`l=>2h%R%#PWJ4Q z$94u9eLu-!H{SKsL9Qz$x4ttm^_0%LoJv|&zqVDQWUlK}atG_@>%RBa623DP`dfdQ zC}j5!kjg)BXG~aJ3YB#X^H26XD*^&%;|d{xhW%gMPX;GlJg`crLuWBLv8mwMA6Lt{ zkF7Ev^Fdt`Tt&=5G+Xz47B!yi2m+6FiZIsbUaTI1mmK)MkWVrnBQb9G^UW&rV?FDg z?b;4Mad>=Ackx#r^rPBFaX9XnY-22F5q0f@ z)|a58^4=x9r&z}kVz)dk%D(3WL<*fJGKmyPGV%H9$T8&v8nSM`>Avh=hG<4cZhQSl z@tPdtZiA0fXmQ{kNE68A-pBZ`#ly0h>#c#;QXNMAE&4z+CT}46I62={d17VzTT%rO zEPye?{dj@#H30h2SaL@OrEK&2d)N&FL>5{wu}KDr<#D!3>0nY7Z}F!whKEI^uTG|! z&oR2h8aF9?O0a4l8}_f@9K! 
zuFB7nTjPSuBQy)w;Q3aaEVT+YnM-c-_|&Z`oPb)MGaDTf-rH#Yip!;$xx{tJ3aC|K z81OX(!|^M$9_k&oAZ@tCru>Oi6S2SEYCyv1aVJYQ*UCFpKoZ5DdI@7 z8>@dE${!|zBe)mj8)+p3RUfOaQ)UY0n*Hqs%}_etA`nuiEL{3{49~K!+)f7436gmk zxP7OX=vujJTK+VEqtubM8vEC9rUAZkRNQe%KK6v*w09!Gt~XaCAHMd!&!b>8I0=0H zaaguXrV32tLG;loKbURG7L%n|(%2ZFi{3NL2*(xvhKC?0=etU4Z0vvSkeZ56+@HC* zKS+5gVBvtf8>^P^pRf8O@@Cd-WsOM&HX_FWg|Ww&?!mkaxytV&^r#}2)S1*1$Zo`# zOhh~iOZq$6R2;9~ACJN#SjvayqI}vhScTmpyp)h)F^g58kZnpMaDn5hG z8difA6hHRTnUH_IX~3xTRrCpIyM}x8=~Pr!P*!kR z{V3)itXA!F<*on9pn*mnbg%Q7h{I4i8d0%$GJbnNb%zSl$_3>SrOB1bcH2Z~{6~1) z`e!UxS*I+eglMlkw&d?C@>GuA%MFK2`3BxjtFD`>7xlJQQ3hZDS1WJY(#o)teGuFa zO8NC|G0n0A;hF)m*x1W`LCHQ5qnSf?qu5!Y1sinvecH?;Ln~-TiA3%q%+agSb;1;* zZu>vNLt^%`=eKIpsB13P%U;5se(i}O-mE#CgQ{?hprHAROXq*zkhOC%(nL`>QGzQB zL4mvcXrL{fP;X4=ETDmg2rZ%x>eog(Y2YfeOW2408~pU!12B?6B`z#5!*9RhTFp+i zT0WXVP>rd%LyW);IxVA)(4Na%lW0oFu&cXA5Zu@H);D4v$GbA6|Gm`(KMTIFLu3pKpFzz>6Aju-e_8y*rLOluYQ{<6 zN4fI=_L@u~u6J3;NJcoT!*LkbKsz`Zn_`*O7OUR%_*0-l5!WY#WENAAPxB9%ccs#s z;PSIVeVuq^+%D8VKbHWl7~O}L&V%Nq1=mqN6#?mrWOI7Mk4mFs-mQSJ$M5(mg1X8C zh2wBRxIAIkFh)F8wqbiIn4crP`7Kuxuq$LS|5-w-DwJErrZQ-(M$ z22Ous<0^ZW5Zw`+U9l_rlorqm%eNkUKJ1f&E?Rui%=`mB`h=lb&~+59-V>LR2%QKk z$8uUCw@Pab2`%JUnt(isho=}tjGvzGCp=cNo_>L+Z{Yzk6#uKfy zqR@RC-d1{+&)z5r;ec>=MnmKU>57hf2j_CEyq?)Bc!qSb{gt!QI%_yS^3Qml<`ab{ zh*j*4$TyBR=eyqAENp+X(Gvev+X7ta^@Ae@Tnh18O{0J}w^))LHLc!wTQ9nynGsD% zMyYBVnwS8KYI-T{i*7UT5!S>Vs4lEWs~>X6@ zsy0j*0yXjGwRa~l4F#;WG6fsxTgu$HE1@51-fCj6z(Zx7Jrw?^_cLgWlTP4&uN*Vy z|L0{*X3qDOHUFbytt4Xy{?8^75z2RtP;N&Fr4M6nS z(lKRS!@m7JIvr_t{NJoXR ztxG#Y#@(?z`jU&?D7*yv4R}H+I%Zy;wRRd`2K14IFSD5?FBjx3nALN< zH+pp1^6fF<2?8?3&tOg~ww7yz1};f3Q&gzVqdj$N^#J*H?z!)yxaYCn2Pfn`T<-dVsFS8M!xg|OdigAx&4cshpPZt|uDWH-29t?6;1*=}&c^P|@WF1X^T=^WPr{ z+dF$qTz@qx0sa?%MhF1@7k|3d5fTAZNug+x37R(r^L{}893hwt>Z!A}pm7a!6=%Jh z4$iN(Z$X0y;^b?9fWHxTioI^O2%Fs|B8xQI3d-QSV#L)J1EID1y1Jz~;YX4J0(Q>9 zgZrg_d%O%GPx7LY+lm;H%O@>1HcPC+9#07eN-7t0Wb*@G>q!YdG6*Bj*Y$(YkMZi) z8kY?;AuA(u9Pn9O)0L*>+T271hUuU+mD%}wuTF<@bH7fjmO@J1wLPWITeD~Be(8{> 
zF-~tMiG2;$Q^-|l*u5Uk*mc_Ncf$6R7y0h&9`EOUW{h4dl=v`Cwo2QYjk}0{?FjtY z!uHc0=ACw@Z|LBNf>N>*&D{%-*=1zuTg&^dbb^iP??Rx8Of4=m+_xEOmq6f~R^MC= zG+lrrOBGiD z-U`DF=2FBBw}v-5UCqh(pan*|BiXF2$49CA8r3>BURcm6o&-RKIl1#{#k!ZM1%K5y zuw`l`I%tssGu}Yqmh2nPM;PE`rd&FKrMq2csw4|@+5*-p{^Xd_E9l-gZDH~YY$$Vm z;O7Itp(e>iVE%jO4OWskbUu{06)tCjIzxplXF*>&7GtE?22s^nbURvU_W*Zqcy^BZ zj)al%3@9wdw$-WsXvoNt%P#DiPf?ni7txm-k>uf25HyY&Wog|Aj1)nAA@fTWmZa?# zA>->6@m-W#4|w;jCq+8SJ5250I=J}wgb&%K0idiHW*J=I*?kGC>A1YkYZExXjeqVL z*5zwjWbo=Ln1I)@Rve3nAADcd2YM}igL}>Pz9v5#(_fj0T=Ik?DKo$}J_ZC@7X6YK zc$joK4RQCm?weekP0j(d00_LK_xZdje7h6#4uqzGC2wEHdVNl)eb&HDng8IUJCV7D4?ecGCy$vp>ViACS#9FJULL%At!n$oy(GM=XHFn-={gzbz3?3bz40jA}8 zoA03Sdr>cZg;6>LB2k@qi3vvoo*3OI3u`6{a8V#zLQX#}vedhl?P&h0Dw*e2J&-@5 zK+W1D?a@*b@F1Mr@9h;9a2(89B(HBu60|i9?VYY4@5SF4?Z>o7J7@sQ1Sb=@-LqAm zZP!pQ`{dEBJ;hi0g5cwo$y*2M+20j{rxQ;X!`{9T>c{jNl>7BAZx61tQ|D57g3J*` z@(?=B5oz5I-F_}ZCNc^BccpaXYs>4XeEA#m&1|giyM0;zDo3d$3jML(_y?*`5b&ayAn`d$R{5(8epUJw zPG&2eycl?{)aaJYtW$5QvaQvR{w>9VCI|9EcX4n6eL+eLg>NA*a^=utRoag&=Mhda zRDlV?C;=n7HzFPDyV5W=mcL6+0B-|8%i~}DMv|fU>11(Zj4?pNQH*k7K@bU3wn^1z zG1GZG#i@Nz5&6lL!Ok z*X2q&?i(O5o@##4j z;H`hgbx~49%gF&oPTxSd6vGtI_J|W8t*@+NTCg6Z>6#%AjL`^EwBZtGl1Xv^`s%z_ z?TtE@RYi*ITK9OlE8%SS`%2_Gt+}6K!B5KX0&g1jGR?%$Y$sW9t6Xez${HBmPrDws ziGJ*iiEpL9zn2p-R0WARAh4YQz|l2%mW128IrEe zA*aRnWN}5rVjIjy!9X`qB-Q58ak4bcVSNj;eZz|bp&46L@%3m8rlZgbk7UGeVX5DN z>X67jpmix?cZWLjRh)C9>T<)=O&&o~>b72e$I$!X432wora2~-Ln@>&?#PPvWECnZ zV*zCcr3ri)pUQ;Ul7y!ItaIUOJU9xU=zWe@j8pPW;sX)y5ynMFfGjx6J*fS?Gy1Ln z_r>KM7Y{^=F|`8+^)xt*<5B;@IsPs$lGz&yEX*Wv=L=nrQiZcbX}YdDXWKhj=4~eH z2_^n=2Qb8aP`|t_$%Q-H@73|_%^x0|43|_~L%Id{TYntv$+WewIUBMg0gJVMbbyoXl=p<&4l_ z2^Od~K9td%II4Mv3bEgg16i2}#az9DHX)9~nw&W5ygEvZE5B8fj>on9BMnyJ<8|jx zKJK>b>@>*J;L3H@$Hp&u+f;QsqhHwW$%>429gVtsr~NHT^tF*rUDMi!$RGOA@CeUz zBev<*!**}&eDzKk*FiAC>V!74g8a9rrhW+M1ZxpIm!>7?Uxej!KF>dW{0+3)8D#X< z_w%kYJsZp4Ds>g1q&=p8puc+6AmCQck83KpZ=PR2BrW6|^NB$dg%EuhP9UZ&VN_V< zE|&gn>8y6#BIB1y3hpr8#C$F;>B9m41o|ar)@qYU*lQT2iJXsR2;%C1!;@_SM3hwf 
z{=iwM_p8Z;U;@rjXGU}h=urh6RD+VybpZ8&&=%Ypk?8&f?9nz(V7{Ht0(zl0s3P`= zivGE@>-;pPq$*8{QUXfQt?D#+E!?p9tkU3DL@C9WQA3M6hbzszY@C>)g(;!ZJLfao)0iE}Fl%s%=rJXm3kRq#Aa9k;mf zH2+>&;{M;k+fO+cz`w*9c*SxFhr`3hdz*pAmezkr~K11J(2D(dI@}YR@jjgm|Q@oFqSH+mJ$k<~(3#OEtt( zy`p4hlSkX;Y9Gvs-_t+Cl<}>5!-$5B^;$Oo>_>K|=}8AmYTO$?`)5|A*bU8Sf175LC^Ku6qwlshz?R)7tdJ}$#CjcePY|i)fOqE-y6-|s0LIhoO1dByf zLQ+4s(OJg!VeTMx-rC-ibB3t2Yfb9eNDgF^@W>SQ=qI6SYHo!Q)w+48CS5O6dnRse zNT^4C&Wz6%0!j^>IcNZ*gc)ZZ9L?P4dk5jDGsl6YYI%n!Ie(yx03#-%QSprr%}`J$ zynRk2RcE;CDfMwgB-I`W!jTJ&7{SaD-clAyHGK8dNBKp;O*Z#}9|w?b?gu^d6wgP` z+Lb|)f25+_tPWJYv=n3fJcbVm=>>GK*cCtE$$Nl5U4@!MH$!m5jOp9TSL)A%x@;?V z@R08vP<@t0&#_&NDrtNj7BuuT7M*zohS#8FJB=FbuQY%%#nf%fwK_^*^33N2@`UZ_ zCa^eaeF1I4^FVoH_1`xu&c^z81P5TQ01^BjH>N|#J+w=1X8MuX67a!o4pHkyleZ~~ zB+`&E98QX``APARJn~IAVwHlKRNg@8{IPM8W1rJJ_i=XevFj03KExNqRN-g!;v53F z98e@5{4R(S`p{Zt@e)ZEMG8y00N4x-rmE&XtxjP}u|quheky$8L3q6m@nr^G3+!ft z+0xluBcaf*WqmNqNE{dq`K>#wG&U)f2K%TvSicpFH3xdj#F|M#y$n#%7qz6=cdQ)j zPknt1&F>koPMQ*#k%wDe8^F#bv?GBR=pXQETRR+Ov@T#(8$ld5_AqBA0D@hWLe^Nkw{jLyja>ZITd{K93H#j75Qq$M-ukLD`s=ny-^=*y zyUN%^nj!Qrh%%`>8Qr)`BG`aZ6=?aK$gPxl%lT?^SI8~tj?*8Komr;%nY#wcClw1+ z?c;8Vc&;Df9KX6kBD>n_^`&<(i*ig_=}Sc1`o5U4X{3w+rL?M@4K&YEjVlYWyn?Ab zX)AY}4@~mQu8251A!&WV0%sz+?c=}&3-Oy~XJttQa?TNquRMHkM6ZCm#O(RAX|LV3{0&dX)K zY^BllL9PcaZ_Djhr&H|#Mbw6Rb(bJ!l4%SDyAA!o`bx}Bp*g#%v`8rW=#@ejotAxj zhXQ0%5~sfbpF~W-(ABqkm^82j$^g3}pMAhWppM&L?s%%GmY)^DFqCXT{2CXtW?r#Q z6Oo>iiobW(s4Heo8RU|imp(_5Qx+*mhkG4(jMsV0s+VF9E`G(Hk>2ags+=DmX#sd8=x7uymL zut~-!X%hzLvg*jPRb-d4)4en4I{3EB>8^?Ad=Aw5BZ%K9+ur=J-&L^tUwltcQyJ>_-ymilLY)*&3p|db;1DUbX)F(w<}eys1@$=`D2I zar}&D{t<4FrtRPowjR*S4 zEB#SkN47gNj_^4YFSTrPM{U;}MA@o%rM92W?2gN^^CmK!IUar=Sfui+G% zKFrcySiY#7vCagY%j!X_E#a@gCkW!mM%MfpOzWG*H$2wB=ESjJy)D~BI0jMNe0kS6 ztIRxig{SAB%cc;&L*4JvTb^hU>@(JxwaClm+GM(f@T{_kf~7<6OW0y+&Oiq`Yfzq6 z;f(xveHX3;9KKI%|L`nBLH;sM_({c9@vX4-z84tAx+{!;G+Dc=YB$p9Dh%+`2^ zqoXDMF;K=qsjcpXlf7D-#ZjtsD^R9Bc6E%Swv?i_^v{0|N_7vJN$8YjH=L25{L?`~ 
z7KS0wj|oZxds&(EomFhILq?H_Ap%(DapU0#*%g%We~j`PjC5IVXyh*bwzDUV*B+qS zW}l=TCqS;v`!s1uE{+58nQ^Or()?#!hbgFtCmI*I)1wH=(^eqeoO0glx5NRi3T^e? z0TBOXpAwDN<_CDX1@Ae=5Y6D!w-)_fWiyul5WOYL{Bg)0KKJ~C8IsX)cSR1W*91d+ z5wRsCq9}r2BpdX&ILjIYh~CJa!!pjZ;tS;N)-Cq~yLm{xZkTB3@fplqK@5APES<&y z4%z2wk3*C61w!lg(If2(MZ__{v+=XKUEj7GlFfQ4g}-Numfe7b*Oel=_Gmo?vsxiy zS6C--L6l+I=|98>CZDd1&1(4ZKumofJTybTQq8+%>zCA_!#rxsO{k}<3T&~JQ=IVs zVx$3Abbk<6-w2cJshw-xU;8lqsaFmAUP~vlx7B&4FlwEGA$rf(JyLszoZxckL`O1r zI+bLkUV3-8@asx>)H$YitNX@(FChRm@SeU)Rg0D&p;Cn4{oSl z59Xj*SNgp~#kV`R&Y;rU(D21r80cbVR6Y=6CA;N^!^Az0C7ChgijV$U#JANa+&BwR zwCYMhW5Q&;68~gUkeScAMWpuqjnTORL(4nJx7eSo0iY)NKcFT^0H}!u$yJxR`H|3F z6JNK^kBr%oPkdj+*QX-AT)~q+)zVMzP-METN=0k!UP)z2w84;HrZ!C)J34;C^DvqA zi+-`Nv+{qh89Vp8zV-jT*Bm?dyQ=eSoPPx?06jf!51_C=M1K?nvTh*np$zWBn}6-$ zu%!F~rznUVCzhB}4d7*?FqnIeol;v~NyN_*m$i?LY_~aOX0GFU!S&uFh7EgfL!OcX zPwir^W1JE>*B z$k^T6iZ`>IkK5`Fz7~Ie+D!a@!oa{D(-^ zIaW_^MIUm=u~}!2xj*4+9CBCEN-)1~fnEyQ6Jbswexm$Wgo~Y`0cO7DBpie5J~rtB z-WX4_bDoSoC2rxBX4)BWQdf8p|Gqf_6Oj+4QFDmyzXQ8ngPy2DV3``4?&T<|KSPQg z$yp5rqtleu@@W#_eNBPU<*%|mT-Jc#i9f9IA#(kRUs5!&$~xF&-?F@oY7X4qAWQ0n zGt!-0Uk5oS9Og8IYrJJgqav+uK0C~bq+-f(VED|HN;b<>L9IB%*|~{;$8-|sJ0%1Y zL(*Q%ogvTC-M>RIqHQ62>_Kj=ps&$EB@Hm{7-(CdM_z_ zIJT&Yk^5vH4)M)V-BT)4Ke(+$JXY4y9l!Z8pZhqR^HAuCsjR2Qoq-p|+z5t1GwAN% zXJXYyuuPe4rIk;D@O_{%7n`z$&~pTV<9tkw1ff=f*1NE=XDGb&h)`rE3Ut7CZo4t_ zN77m}siGLA={IXJEf-YTrx~O6r>yl7H}ybyWUSea+Ytx*`(~;iBlLn-ct9vF@ly9% zN{hrT z7ccA&H>EJg2VrCzKdyzBvmf&2*xET!4pnbj63 zD!-2n7xqrM9(lYV>fPHp)*SMu9zsRVOfLfYC*Y^^+Ta(TW1E1RqDvQ?B<%yZ(|D2i zIrVTwia=U$9aY+-T7gHwHL1!GJ!qvrEOglkS?bX-uwl)+qaJpE}*&Z;9%8^Hxe5zg~3>Y0n-5eSTUSi-^k$1K50tPtXRSl&#H9`@`1v|wiC=;a>;Qbc^>Ftd>fkqEZd1HQa9>^4rq0*GY{g!-=9Lnhw-qSYT_&v@q zeDmD?uEN%YWz{~eh)nEyXdFgcjt zm0$vBivK>Q0kBk$3t)m0zvBbPWTI|Xvj@jU+6u?49@M4+ENrL4(RnP5^t5PlMhxc4!vhb;@Q zBoX$DT2Hfu3@<8fI7=wF3YT(XfqMbb=6DoCcHI*h%e5|+2es~gq$hJ|5!OgN%VmG0 zXGPy*UrfW}(A*DtI;`u?LQGjhamI9zUzdGoF&g75o?QldbK#AOJ~4!_0$A?kMuH@H ziL*MaWWiq?0J`Zgftn!k4-ejp?be`@MGCgF9cBYxBDOZX74spftZq!0&Odw3>DkmO 
zlGnGu(NGm^#)ic!vI_Ni!!84l)!a zPM=iCHr#QW#JMB2Mv^@!h={m+(2qgp4(z<-G*Jd91-?8&{dC?>*8wdv;3Gz*ZJYrG zJHyEz4d+R*Lu+{ukg$3AoE@L#mpK@gan&_bB8`eSwvhhOeIKa+iK2gmmykz zcestd2wcUhFLZQogZ>?xR^|Hwuw|+~()*RLg-kKQpl19lxB!y-lnH0O>ly(>i%7B~ z%Fqj8gp+s2O@*#e9lzBZeP)k=CbBQj+!23tAeb|;)m(cN2X6zDWXK;6 zsjjb$XQ{CBG1lnI9t9=obqq)B0xky=y2rq?&Ku=P=q;XYARU7&py4xhBE*yUMzzf4 zx*2Pk(7L#<#@-f&S4ufC)SEH7<@zIBA&^~7i8{_BMdKu7;mzv2f@ z=zuPa5_D%`O$&McTE|0bGs0a0S|o%+u53-;lK#o}h^|QsnpU8ZeW`H*Z{NOvlX0YU zt=kdD0h*snnP{Wi0h+h_$nix8ntU8g82!09O%6uS7hpusU(P(6UgEj0OK*iA@PvsY zN-Otr5;cJL%{v-cigmywavuDU;|!LXi(d=lpdIEUt?gFkQt%6Yl459k|8!yf_2;3F z4kCK+ao&Bk>{A98yo`@vfl>95PhIAR7C!{k%Ii3D^TG6->e5Ux_Dm9zo`($)`?EED>s1UE zTe(D9lg)(WZ6tVb_~Ju6_z;yk4YJnF@5e?6Q6d%YX)ZBnY^Wive9=qZJ9=NR#|^We zM#dloL(m3x-Xv!d>)7V@nKfOF=u==iz@$uc@PUx$>k2coTd?mI<%anej8ey+KQ+s9 zI(G@LBV`|Q9cqf3io8{?6S+#vIenlNGMP$7af2+wu z@qYVU6W0a(gTw1LT3(d|73-$%H>xtnTV-M?9uibBr9g|!sDh=}wwI+j#1m~4)O}_e z#E}CXW=E;of#c?QXWy~wUFYexia&|2o(lc{OLXu4mFULzI=_KGWqzMa1NBoc`}M3&A0&Yu8$qrPReeCH~>OlYhG3FYJFgMA_q4fsv&{Lv@ob4n#E=K_9b>sRF7ang z(H=R^`(LZId*F%p4tWH+>6?j zC`to};ek0r)Tx23LW7}bDekE5VrPtWRvgwY*i3GYkCLtw3@lPQAUXK9aOGDOS)#j0 z@!9pR4ttVufn~Q_Vhl^aJP9mV9w5%Nyh9IQ(>f33KK=-k8)>U2N&iv7Vx`*5q7Vaf z0k(ES9Jg4mS-P7TPG1+TUb|>{i0dZqKkL+rU56{({z{?oq5Bx}JcjoT|66AE_r2f# zR`jyR2_Xy!LV=!l^%H7ZGPA*+OR4=z`2CtmUQQWb-^T=s+Q6jFaKTi*UDID7CP3U2 zkUKi!o-K?R!=xijt32D~IwIpPuFtpnNxV_b(iC0&JNfGY#l|-|{sJ5YXMd-04Spw3 zvR0y~x3=kB_rQNN1o-0H;Q@_%FlV(s2(XAe(XY;7+k!s@eq9kr00R6^i<5jI{_-a_ zCsq_CFGtof)Sff!pUEA@o}A7vMZokL1qZ>IEe5hyj`*L_G4W!{HbbduRep_EmQ#gd zebSWrT6t(u*(8YUioimjEUK70daLizs<05q1$Gs*Pm;@Id^efZ;{|? 
zV3FH6*P=SQ=3+Xyj$M0{9>gZT)86YIE`1Lv-jdO$jLz?{xfj5>9UK~O{6w~Ql{xemMZ9nV#beMq08JN=OX94 z@0oEQB_|O15jS9#!@<7U=?fT>ZCZuC(4l`*39yB>Opg1E3e!U?sE}W;A)n>TjLpnX+>BOtYsI4W}s=YLG{pFIx=hI%^9V87Mp~upc*G za=SnI?9uzfnX&f2u}SRjQrG=QoK#ETLjasZad-JXaFR>=C*F!mgyWZG5cx29Uf|yz zPE^DNs07CFF03X)n87hhH{eTZ`Lq{E!V7^CP`L);a+v8F!{EOjEAT&&e)K<*fn~?c zKB<{LF-y@B!Wxf(r*t?PzG7Xj{Sy?)rq+Ib=Gz99&fp`T3N=TSplG^=z@8h6?7$P_ zE;25hUw06$a(?IE|KKo(fr&%c7vow$mYP3O zMm^W=>-xR>RY7gjWdYNkj_M7&dce7RPiujhwp!7GJ`MeKIP35jY%|ZOA(&Cxh8u7Zd&Uhq<~= z?ag!1t>jvLF-%rX^K^ugGfF^GE`41uM}A`G4uUs~z0k9|zNssEw*b?zi2Wn5A}P(q z8(Z-{Pd4ViXWIW<)Bu){h<#I{B3oNWHE z2Q<3fnFzz~u$X$~a7Ou;U8K3#+kuEdrt&hW67$@RG^r~S1}~1%B2h-gFk6FRCdptu4-&M|F{#)iR7b}DSh(Ms{o?`p~ z&A_Nze5zG?kFVCdv#DN2qBilkH_Ec~8wDG>HOCXhy1_Td`N$sBRk^JQM`@xr6)`3` zB_Z@2r$l9}Y1{PzbLdwd_f2r-<HksVhy3;3E{yeSZ{>F@FN-wi1&6??b@K z{Vp{v3)f$X?thN3p%?@v1dyRb^Q&ee0(s}BMK#1UZJqOQDABkY{)jo)ocqO9p5{`yPJRH^7=>BsgVwL z$9-n=!Du^*D>`0`phFx0x%$y!P(IkBcT;}@`uY=2iZv-W$yp6a16Lm}k56BChwIXZ z9{&?A&(!e|S1jvD%VF>W<=C)%-ebmbV*lQ1-!(OPt@6yreG&fowl=$x89sokSTjhE z`>WfXRmH^vyVuw0{b|N3=&eCvxsAGkOR7B1I}>Jl9Q zD>4!PZLm|}lGT%>Z2j#Z8Av87ETTi(wx?{TLm@&xB3vigp7Ll2}PeFHBNKAeTvZa`+$7S{76J0D;}{8 z2fOfx_{_kxynjFG$=X;x#@~*UmpF)H?t3OsS!vuHV6|tIK+*;zBYv%i z)+#S0KV?}_NiOhj_0LI$8qbCX?l!Vfb#jD(u!}eKke_Su)u6NH1*fkmP5_AhTr?rW z36=#VGFI9hkQ`5n=@oKTo~gl&ZIJ8VF87MQyea7xebBHjosNC=?d=93HsaGweXhru zj(x6a;DurHK2{E{37K5wdWop<1>pUL-Hw48;X5G9D=sxUOSpFEW#;z@5AiMIzcx;S z1U%Ay)BHMo%%>I@lW1QD|CNlukN&kn&DDP!9!*X2%UkR7u4|d~?|B7X6S0E;n zZm11MRC4*9T!ay%`-kXt$&<$0k{a z7eH9JB~@gYPs1yW=%9BDCulT0t16NutX zqc?&BvtjdO0&sTpz`@K%Tl)D>%1!DZ?T36WFwCbjtsFB^?WWDr@dt*nZD3PW+DIa0 zxeUQogXrg(xBB~G>94|tPbvvkg^=#*kP~IQONkn159FzDUbQBJ^i<a&v|)Gx|tJN-5FKU5wG3H7k34$rVXB$gm*G-&0J;!8F_t*5az)WCUHqw;ITGZ1Xgr_!z>w7$H#0d)sh7nL~&lQsJ6^D))CV)k{yE zxO#p-p+btNWV`zqz#SjoTVX9R)*L@>X4cV0L8USIb=6}71VvL1lSR}|7wrL#ET5(D z2r}F?K6g_Bn~_EV5`Hz9uhxS0@cShq|6K6X8>{?2sUa&j`@cW5!zZd z-FtwM2PS=~Q8(m18nKRK&rdwrz+)p#R{aDW89A1_=M6R^o82U$0%v4EGepQ2wLB<6 
zlSBCkjCyGIAl!z3Oq_cf(uEn_c}SOye$~eKqcM}J2#)Nte3oQb%MtR&s!KQ_TGA4Z zK1)Q$~HRM!h*=^+9{~@T?ZbcTm|}zhXE{q98ulPbRWL>2f`VkgJ1>G1v`lUV#;N#c!U5;t{Ngu*gw_8v4;`#33RLY%G z3~Fg|CKYQq0uhk7z;ih+L?60-m`Ri-Lnt@hK)z^>QW_Ags?OU8qyFJB zv?v8}Fy6dCIby}C@oAwOC;q%MvGS1J4|tkJPw%ZIdRIQr`nT7l+W!U8hB6ok7T@zJ zL>2z2rXf)CSgQLWNuZdEE@V4THrQl0_!Q46V(YuqS3A-?-SeeQ#m>O8254Ul8-OSI z?_fmuDH66!*qdaA%I+3ND9MMSvFfCHLA_cb+aODK`$vohN(5735GMw zvJYr*j??CQKX?-egnvkB-cqw^U9OLjJy2GsdUJW1q(ESgAJLH<|H2p90FlQ{?;IzMn(=oRzhScR+JC-1Sx0k(mlHE!7 zBbCHHt&-Z46}E=t>tlZtKIwrx~w+h)afQn78@NyWBp+ew9A-v0WW({Fd3 zG3sZJvHw0lo;lZEd(FA-dEF}G4K<>Pn42_p4)Zv8+=B3K-D#;M->MmcCPCWeTEK%3&_i|Bl))*bxPNEIDF$LT?oJOSW%oO9HJkTLJ zW1d9Qo;kW$qkbG-yl;$ztvnX-$I{RbxAHgC!RFrVbV0DBkaw}?2PKoGC(+4*<7O}Z z-Am7Hg{ODxmY1dYgwpI=@P~1oKd2y>{&JRq<==(pvO&RTT zzqsyFi9Zu{w;xvczjCgvSAVH1SObeksR6@k0jY{mxSxuAIf>xL04!D;TyrtZn=*fT z)P5}3cv#U!Hx2+*bzetzUEF}ar-O!=J|~W5qWAVZK%M?Ncw2zLu;n$3{@x;?O+NEV zYfGg)Nzo`grjBv<}F7X`w$EhS`fD+1;CiO@ILui!$_!80C;OB~+xcw?HRej{GmPN*^r4`gUNMzM0itms~+ic?Qt zOlth5K)`niq05KtIjp1u2a{sM{Xdr|8^d3HI~duR*#EOmW58!)X82d7cA@Iyq_}|d zx%nDL+Av+lvAVJO3vX!B}%c?=kH=jQSlu)CgW+BYXN;OK%}cjYBvIVSMA%*SO~ytTja z`tIGuclqRf6_X4A=xZSE50IPmc$#K7!zBk0yMzGaYz;MJmfC=G06$9A?m|p_1efUp zK5Y$}Q^-?Gv$ktH09=_MFmewXwo}SR+|=LqBJYN(x8n#fIKB8$Mc*P1?0DJ7VA&Le7mdPg;fCZl zb^Y_Dcdc#U*TOsOo+bs%qBPkVGfRTl zR$JIh*F%@j_)1@C&`I#-QThr?$HnaIUK4u%I z7fd5Hp>dI;Rv|J#w1ctxQ++>PUo@cgCmwp!r-hEN9vVvc#W+slmSl_uR$a)*6;hRo ze9qEh<}opc>1lu8L|o~mIlu69q_IN+;&BM$A?O5yPIE(&tUOJPr=F!Ik91Y9he!7K zM|GzY!y_^04Ydj?u;-BF=vEbb#DAMPecfvL^v-LfN(#UUqoyu$SYd3lFb1xyRF^&> zQaS!*fag|4fX*`X#x;B(fuASey#Uv(7ko&yxf^$kWDQVK zk3$RaT4D|A4ek7sfN*_W4rDf#NPN+5W3CC5)ev=`+08+dY=U78Qe)>ak_gl{O8fr8 z$pLj0AO827i8JPR-$Vjw_B(ST;C!3 zNhtP@BV<7*vdeA7SU~UBq@Nnt4?P`^aM!C*LO-SUS=gcJ@(%RTdmhnsShw=>*<)1)jO2a z&&Y3q>tdML6S=kB{6={Afy7>}|ARCUEan>E9ivb+TVMzNvP}q{^w$tK9E4?0Ass;g zzzN4_dA~22>ip`Sa51;cU{zdhIY?DpZ<1WtJw@xR28DNtO8BROEz)P{>VOPVJ6VZ& zp0UAPe|P_MVApZ=6%Fd=Z53tOCdp{KMX8&)7Rpr2S3?{fE#YQ$ZNYwl@Z+Q^;)8FC 
zNC89QHAcKu2InQ_ktt?m!t+$tu!N%QJtYAE7LDg z6fsOO4bc}dml$k33`~ap;A}jTK7aERY;LJ#bpuYR$RegFQ_@KY371&;!NFxvQYAPR z*qI}_`Z=fb`=Ggo7o|N4k2exVm)l;6RYrDt@pAB2@F^G*aA!XDCSfZc(U+JwN3juk z=bz8#KdVFAW1z7|a1oO56kuoe2@tWwVW$ItjjXnN0jyVknfrWt@3dQ}aDRZfzty4` z5N?tW+Jn7&Js4!SNp_^Fz3EmqKEpKn{{lF|7Th59yi`;T#p2{KAbW(eIDCCjL(-1g zzCsI?Fhf2Lq&v$Gw_UJMzDZOqJ4%dJnXQn_12^1^nu?XAEkmM|DJy6=B=z~m@Oy%V zl#gVaj9++W-fY!q)p|BndZ7qa@t1kG(@X*j1|~u}{kM1ixI$sM>!8F-v!Qfe!{H|P z)Lds^q95w+T(&5>WK_#z-CL~$v84Ma+eqNB{BwkK<>qiT(|I|`BC+!9@xp^0u#XveAUwmzaA|3k0tr_9^YU2ofAu|4p%SE zn?KTcT+3~e>&+`?m+e&FGo39}w(MWaP{zuWl{qMFE6H^UlUIbtdJ*eO-E`!4u=8{I ztC6lDU0RG;h-<^)=ah$W+}E4oLZHN;Dki(y|28Vso$`3!?62wvAivA=Dh=?S8HDf)9Idc-9zuPCex zBwcz;`7>Raz^)vaxZp0Vv9d%z-4T47OHMbYa0a#Gyai7-CEVdw_-@k$l%ZVPQJO$0 zWkHz$X<9cjQD>B-c+%>q$iSw8t^s_UIP&3{oml!BFpeD_*Y;+l>}gDUZ4E)MQ6)J; zPxx$|U|n9>6t`Uyl3ID224>zWg9(gQ*qNpFJ&|Meh;7vxiTKkSG&4D|AuHT0MhSM_QR^oM3&#ksJBSZ$4F`q`4v{no=|zk2#Rr$xaAvgbY)#W6`16TR1`9T5 zcm0O+)3w9k%HDP+?@(_S)~I$!2)P(w9dO-Wp>3tKXFNeD-gKNc@(-W3?<>w z*vY}$0zC8egOZ~6>{zXZMKf^#ZQIWuoWWF%kVYun*tJ_<+vpv-)1U%0qcO&!6nr&; z@G;X6VwH^ci#)7SbJs&-Frcb(J`gTA;@@;GS7^mn}pI%03|HYoci7h>KeTen?~4#E;Q%y{SD6NOt2& zE5mk6WhpeSTS`_L-h7A6Ro2NfmpR5c?%AO~;7xAzmX5b7ye z(G)}REht(r_a${ThA>O9M+-BE}Xb8*E4R8pig-M*E)Ztt*b>kLA3Z)D3 z#Mth=T6d_PS?WKZ`%j^JnCo}&#;uptKb?aAu5$nHNZFn*pSiEjk2fUFnvngj6deIQ zCm2Ts*2E(~bfr=70BgA8@S3WN5IE8@{^uu|x0FuX(qucxZM^>pL(g#siB_-VKS)`3 zd8(Spf=;UDRA=P49dAv<$xOTeGfV(?LuZMGo}GbUz!*XASP=(r|GuZfqb^;;w*d?< zf5`?KHEj3U_Gs0Akg}PaV{KAvVX1sWOQ;A+Cud@qNYu3QSi~1($T9;9f_H-pkoS+- zU5b5X-psX{3#KmRlBK(wTTIEHPte9Hgp-O%-~IO?CifiW82Y6T9KnSqY=!?6r@zt6 zFtV}#t2=(7rEW{yjNrXgeS8FP7^ln)s1In)yjRRkj1X(uhX%hay35Y*?LBOxl3ii* zINnfDK?4|}9^RLKS{^R~***Qi%=qC=BKrBD5w0g#Gttb)_jV>gL1pzaBlCd{#0H!!-tz0Bz8}+nSQg5iX|TdtdU|_3 zJJr^O;pq5;YMQTv5C(pUhhhg@KoC>Hw zIX`dZSg#0gn!DL%`PjS04#9aEW$j2DsYt*sG9lN>mKW8@Q%Tw|A!>F~IPc}S^!m1{ z5WM1`weM&@Cfo%d2scYAihv^4J{7=Zq^jW6V-%9SUa^O-Fh>|QuDA;S!?JUxBK(Xu 
zq0qk1JY+M~Pvn7H#1UC;FM-!Jz5XfAv>km4CNQ>LtCD090qF*0vF&0vu)db(*_r#)ajl}^TE0^-rUSQ_)ld5&AbxqENbw4EKl!Y^5M><&n``8pyumE1zINAb zLeT;gW8F5TM&~|-%sq;KJ=TP=0s~Zr748q^1eB#HNc=K}T94cP2j)ZR+&YE(+r`{dJl+++!(cg&p(~JRnf^ z^{nWNpC5mQt>jJ_wii5)ORp(i*mYGU;+WiNO0<+})}VsGD{9ZC+KfGH)}1OA(lYt^ zM=L&p1$?82W!ZwyTd5n|&0#rRfbLQ!j^vCdZB>&zrHDVgogUIbR6IX8Olcm0plJ3p zkaZYs^K>K$SLXY+z=`41X$uJj6W)5^WF)mcMOF)P^@c1U_D`Kor)8%6kn2b(J{p z|COCW2pUN-+QG#v!H=F^^(B*q=cMn!KgOZ=l{vD32EO(x5?GqktI|9z!KN)|a;ugz zq$I)u;e)AXFicIZ$A|(wR~wB{oeb{V@l0MG!az{Zb`fo>(Q$9GODRhnFxU+{muD-exR?;c_z;iwrC2+6m?l?NHaTH05UL!lMTUI8;8LU z(9nC-$|TN8(@!=NX;j)hI{-f-0h}}kIkqr2lzK@dD@_(3y4)7Jl$bZdwWtqkAI{e0 zdGJ%LN=oEIuORVwx`YoQ4LF|<9w=$?N8GO^L0twr`+~!3Esh!^o|I&?w9ud| zxHa!-_c^%Jb~Ep}4_prwOR~B&XR9t7wIj6&iw%w<{&`jEhmAW*qsY2FyNsCncXO_W zA*Se|UFV50^z-wZv8;}vM9MG@Be2}H&{WDnInTH2 zw8$%`gtX~vERX9NMY#y#>qIp9u`d#F0*3J3NP^ z3IpZNzlp?PR<7zznOr1)%MJ08x0a4Wsmq>1nlFITpT^u91_nE!is=ys)HX$C&DGx- zcqwt8=WgaU?GayUot~5BNTW9n1|vBgo7hkC$6241i0P=rx+qDMC8m-Ttu+#XQqK~_ zSP@}cHJ&A|jqxazT(;r4E8)4gBvQrt4JKLXdKBFrGa|)Ea49?1OMIt-vWFLP_5*H- zDscS*w6-=l3rJ2xDGowS*ga^S#6n5>KR_02I<@ph3eL zQ{dLTX*q8;%SFaY_1c0V%ipKilC@{N5(kYnato)$iYr8`ADauli`hy%uJhH^NK}#) zlIjd`Z#fJw$f=NZ&b<(k2MbvVrKhHjMC6a^P$vW$F|2(2vinAD_djUupP2rqPMnl-;iccO?aquqa`K z55)8Or{2m7&JHz{R9wWCCWxT+qtP;Y>t67d+0M*3&|uKe9;Zt zFDHI}@Xa70o%m9EkdX5TTM4|bOSNI=q?>8XLJFqqHYAFR7%IF^PS}E|0XxMNLlUn&?X= zxtZpkNrxRyBH3|Qlx9wpvS`9Qp5-&@Q5n-JnGW|k1KN0ue(`ryG0H|e>T8W)WY35C z0R!fFf;nCX0SN}?(C8TJ!g9>%a`(sx1gv}N&5?6?Dtnb-<#4gs%Nq2X)S}v;NF`VH zQ#76s$a~~vY5RwW$WRx+jS5=3k6bm^#*T8d5Dc0o_1Uo9&B3q>IIj)WuC7))x}h=o zrWV`{rR5Y^2+Z7%mZTk!1y@~W%TZabMZ2vKwnr4Ciegf>D~dq$7H@*u@!MT_!sx5T-Qg?GEh-^f+te1U z5mo({!7-NnkN{LUsGr%RTf(QdUe;7S5r*yJ;k1-l=fI1V-g)~Vw>RA`4!yH1Fh)HM zfTKCGduZtQRz0itCzm%-K6?2Q2W%?<4G*qfY}-7eEkECw=4Y@^Dn0yo20#yffV=+K z25xO{_KLFx^&_V5pmJSojB~ftK}ZONwjvVO<+PNXg8qaXW2^eZd?x!}(&IV))kgmk zk=PvkKNPt$y6obByojYQK!sCLbv;C$S{->@cvYiOSkU?&O}l8@L5$8}A5r3!)Kgr0 zn=pz%UebX+1zs{*o^qVqkHmyM9IVvLP!r)O60?G^c(X%vphM@>2V1*j&OW11MXYY8 
zKz4vj`GJWiwd%98+n`Z(^chXLdyl;FiP)3R-uTb9L6~=jv!{jEoHC~zYZdVep;`dt zNqGN#_Hqrz_txJxYx_>vp7%2rn8~YPM;xzY6`-duI>ypO^3f^*)iG35csKSzy{vXb zJ$X4JLd9xg3&p^IjftnQrKkBAj{RaTq)8`0bmr|B z{7zy@`wOZ|A_`-AY7d8Y3QDkRP;F_G3Ec_^R*Go~aNtz*o6Ty;z*i>^0LFQhk0%C% zIO6*VfL~yjqWYt`YFAt$mGX=Zr?DCen=G@_KRt6OIvHwZ=1h{%@5Sj`Q7lCDQJrm+ zuH*9L?J!saxz z=)tcv$g_Ssu|MBPfXH6b%^Dv_`=7-X`(OHX{9B7F^AGkD{|mGJb$uKfcmp>XFJF#! z@@iWQ{o6W@Osi=Otg2Cg#D0~Nw=c5yBu47a-RPbwDv|X{a-Gu}Onp=t>_WsoFN%WF zdNzpgTI^J(MUdDKJ|ALU^c)60NdW%^yYL%dje0gZf3s{H3E^v4H?-AxOm)6z1mc8wp)X%0akcFg#`BkqU#YFZO=)ac_$~S4(~

AU#=?xzhZU_#fi z;ij3T2zK8jeKIUFv=;L-)6PzD4Ydc-s$f)){OrYX_f(dSS`L7^8-q!8(#bK4&R^n4 zx9XcU21_;$a@n_V>YAZYe+u|tNrS%J0{*S_`hP(~zJOy*O;7WB(;?%Z)k3Qqnb?pf z60D7}Gn?&B7(yDd$+rpOzw?*6`2v+bw9_ai5Efzeh#ntU6nK}k_*Vz)Wa;b&%#l*l z_zUxZg`uo0z+!{3^#j+J(9ZxToZ;if5OBg6Yo$%Zh=0CuTW>5aB@VMh2*yA+HcHl^ zoytZcXs@b4#rTi3Y6;HFGZJ$V9%hd4e*1ZzZVk|%`_7G`79HO#sg&+L?KO>kZd$Bs zcl4{O?%CWyNKEPp<;Ny48m2Lte#TNQLNRtg`fXkQxEF^;pN>|>UP@BR)dkn3`f)S7 z)~c{I6>O$-l@|CWxCZ*~mPoXF79ZLU{5636AT(|?Fx+1|1e$PH!Nu3-FTswUbXREn z%r-V_hXNxo9NeoW)b^B28+!E@Op^Q8JEVuKPpsHC7ofxe#CGXuo=hd&IOSdO_blGg zt(er-^&q|asr2+UJpm=g+pr!R1Y_rju`N;ll5SNgpsXi)A(ZB{?2twXuUOx?ll}7< z$oObU-huzV_ob>BNbfD85;{@^Uf{&!gP|GhW! z?96{9nfbc6{jY2HAFg}gKb{O6L3F;zJ_cMz`pl=GUyR^{_z!4Ae=)5pcF^@7#j?9j z1(*{My$^jdgio{eVe~jzlGFjwxrQSXoS8CY;t3sdx#s)F3hR@D8{ba@mRsf}u+5rJ zltF@L7W&_yLWE$YkGVN1`go?RRQhW2?wFPqa3{bjfYu}4VIy6wd-&G<#nK-L zDa^D=@`-U>8(7q3f9a1Sn%#csK*w8lSm(TlR?+?SZ5#S2L*`syG>Q`>B}ser1W*OI z#ZhXNU`dS&m`_JQ9ch#*;YnYS@%U%_a@Ao3Pz*Sv8G6}x)|9Mvm~k`?ALy`4GHI5S zp1-eHN~hde-ruDg4ZN&<060S1{$VPNo%!z?fvo>6tpCMj6$A{JSFY@o?)K^z4U(8d zTUgdLJv4dmcz=Pcrp0uFz{ow(1d%xu>4YlGaGVV>?-hvNbdmqaymzQwS?!%BMMsF8xYO`4?q$7u>>WU(Xxx@{6M zV2T;=BBIzCDV!B8EVM|0#3+s?hxrg=qKl6MfstfXqU59Ya^+~zo{SqWF3D=cEZ8KN zK!4$UhZ7r!G~=7!Eokp{P_1l{dh=42_+Hm_3q4sw@pe%0c0Fy{$A=^I1eh&mo#hL1}W4{K3&PgR_S zjoo!PSu-Hw3TlBJe%b2Y&aiKOVk@=E^Go_Xafi0Mmd+6B6>~gM4Unc}#2)&hMj~+u z-C2(g^iEj2<1jIyep0O@GdILsCxB>hjf=qw699jM)Uj0(A{DY7T{!fIYQc`6Q zK6RCUh_%#zASAI$Btmbt*VshcgK!aV?>@JTOw3lauqwHyT}_1}a6JkO`$ooP@txSo zWoDrZ5WQ7zP>GrY!CJIQV+`|u2%!V#;Pqs7Xl~p6Eb9ADf+RplbsbDwX65@JLSslE zGN|~uP$5=xwWn|n0v9Xe^C;a=_ZtZ>T12c!rp*Qfk$s(3&YlNYiF&XmXhQs6e9aBotXP8#%qp&5Rv~R&)w#C1Xdi4sQS83=7K8vm4p{P-!$UwUvi-}qQ4R%>WTC1pb_0k`_f+$8r(=Mzj-a)w? 
z6#`ZzjvRKZi$5^Yh>r8IF=f$vhGlGszxA6FJ2Q$6w(YW;4!QR1u=r#nGHDwL0cL?h zCE5w2kw}^34iC+#UTz{rJ)8IEsECi3(I1u!(_bl~m>B*Qapj+1a~Al}LH}39sb~1gjT6+ENiu_EIVT=xZn=pIjC+b-MqjMA36h zqv8Vopdy!Utfd2LX-~uBpwH+H<<#s@4ksBhmr>jzLBJ;G9@gdW`p6Rys%k>|B927Y zl*SOY^*z(tNu^d(R-|)NykQ2WLhWs^AuQ0AQRdg)XhvG2Z6$y*r2VVtd?1z{TR4BY zZ&NCG#&rlz#l}>!gQc5IUre4J)BLJ^_F60$w@V5v8Y-OqHE!AoWRjXgv26ZST5S~} z+iIv+dm6mJd`@7r;%{y;JuKr{$=4*t&iT7z60MN4oTzxr1?>c~lr#kjRQ#uGPWMTzHUT4IDk_;9_C+|*`#PoyDK}s-`YU~%Mcj)lhtdp0fIIv9PUP&LuNB9i zTOrv2LlK%I>2n6Gdw0rO$A#W3OUUMh_ttmaBt z^u)j$mS5(*rba1)cNVEYFa+?d){*AIVp&IiWc$dchDvuUEH8ZFSSt#n!vSXJb~A2CIRWd>X@)N(3Njy3Z82wW9Ap zsX|muD(C1P8Akp*o9xX8b8te{}%;A4WG4Hv9C@9cL(SzJT`0n{xv6 z;7^~`iUk|84z*Bu%1{z_8S$hTb3ZlBethTjx}itlqz>BbLsyQFIr?4?9t>o2rs%m{ zQQ!zz5>FhC8?PgTGElPX;tRtdQv#}rkVXR?x9)2RhQMd<+VDi4T^xf8xiOU2aPLu*WL=bY?Hs_xpbR6}N)ub#*F4!8eB(~lnsRsD{| z??jf=&s0sM)Ncm4$)yfDOJWquH_mloI(ziYrr;{0%sD?{^w`6{LA>-E?9*3p(#U`Nbw3 zD_-cl1zdcrKDr~QE!AyHH0nZ^Ci*>!j*KrQf4p|+eUi!*K|b?*01I6-a{eLZf9DPM z-+|WO|HDwUzCqX%GA$z_ID^{CZ|gvWK^b!;6VTDWg` zccINM42x&XTM5RiUTcWe{lu4#1G`#wS-X-+`Rau?StTs2WxKDSxhZa3e+c2^$m&#{nR;AW8rMl0O+!9GLYFZ- zD7r#~kH~aJzAT{&A60QQ`;(!UtN^)FR60@#tO}`qj3SC4&kOB}r7PsbDi5FvsRWY5 zTBwqcp+Fi2y0R%zPgH>*Q(_|>#s|)X(F(0<1@d#Jb_Fs{j?|tR!ud3nf7#HH}35jmwmye&L_{{lFLA6XmY1pR2g+%G^^<`|8EnaWif12o8yKCoIMxZ!FqX@~8&58SjI&veT>)~N7uyrWvp65w6ePOv{Cv5VH++@)qoEeeIPn_x!pHU+P+sHCDht#FNC(_RX5-vZXLU{as;<_LpmvKO{CZjC9~DeFM9Kgju}4i>@R@EkDU;Is zINM*k5nchnz<`@NU5g4hZ@{pC^k7khSOIv>va}<1y4h&^Bb45AaLjCmdN^|k+NHRJ z<@jjF$!ObkmBJLb7LSpN^NJMB1Kr+$2`%79QL%W7fJP%Ug&0TkF^m$v6`8{Fwxb$H zQ^ci^3doz^Th|bKGJE|@40s(gVo8z>#boq>xjjBl10#HZuzP2XdHsPLYFHkWsNXaB zqR;}3wni=FF~*Uz>*(Qw?F!Dd$H(-z=AXtI(ucAM8+7C15AFupHo|K>Y1Rk_GbMk) zj5$wc!zYX+W;IPuyrRa%Iz3&T)gG2oem#lBG&Qw5i;=Ba4q*|HfE-gqIlt2X+lPOI zCnm#A0oQ|Dx-TZ^>6cx9I@qNSfe(=wdhmkn;qcDVGZeYe?(&}&;9u2xU&ChqYKbX) zd8!jYuQ`Tm0fYs5ZgVRtYfE4Ks+?%>@}Qy?$6gcoGR^9jd8tdVCP2B(oluqBqTQk~ zk#J33CLotckJGG5VlgUOr}2`c3j>W4+~_YgA^%O(x~7_b>Q)6h2+Xp~W!crk2pXwu 
z``s{`;|wP5-2291Sh5&llsC2!gn}xmWYix4TUFCBG;RLYQQE6&EM!>}ObEZsU^i@` z2H27cA!cNia^$pCF{6b=Q zcSwoHEf+g3Q!2cezHu-vR8w~ht&I~J;^x$6+s8>Xm1hDvpasNvv@Z%@IhTrI%Ox** zk^PeQqBq!UFR6YN_-RcSuKfL@l&FCRDa{dS{*qvRVhRixz=vM=j@i5yP$4mljlm8b zG|4Y+VKj2SHMn^4RM9`>@bAi9_J0*$G716?Up$g8D>~0^Ca{30tuL@!$o=P%q%7MA z$=B@*V$K)kdzX~uHDN*dFoBMS15?m?#CC+YV6uBm&`R3Z4#3hOVLkz~d`#UzOd!!R zkKb@^A(iW@&owFA0NCC&F5S7?NnR|OAJ8sJza+P8IvCL!f-`|)xNHqx@%}hso(-!M z#b5}a_At(Z0&GV)KZ1t2MqGv6pi{nK7Ksm%i=J>!7|r1$Y*?a*BcQ;q;G4F|eH-$< z8@8Qy!lr){3=3qYRNkRRWDIx6A0%nlya&uF`44P5z?|qxT$R94WKBk7ZpA>Y%fp;UFq#`6e zy$nO3Q&6`Z0Oq%#)q#_hyw^52_XOT-g z`Y*4ZFNk7Ee_DagXK|T>_=gWnY-|~tWdn3XZkY_$o7d0PBH*?rI1n~mg`KQ~P1@%n z%dQd3o7o*pJ=Z=$GPP$2^K_~~_TGiN9*ibKIJdza(G3VDtntGQu3M-j`c6ttBlr48 z`ON2ym3=BQ`ws@p|M>^l{))83$i(`uZRA2r%J!@Yar0HxOgo2jA3B@g&Ud-C?KIEY zw+ikBD12>qa})J=>S}s++9OJMz#%;&%gB%lnFHJVZX&?QqTB6vDe}9w&Qljy{;8Rv z(}L?S><`!J)dkx3l(JWJA|(LQ#d%&Oh|L1Irk9J5zUFM@U$b0lmNm&)YjT~uyslui zqW$yjRkK%1x=I%zTi%=mZo+;}~_-R-!9%bcvC9 zWuNcCu()V-=E6|OT4L~+h{0=L42{+ZxvULO9%^ExBsad@YkFw6{dnD33JGzI*?RvN z3i;&zygs^G5OTD^6fx+i-uzV^EmwTf-lja6T+FP$C4+7$br^55+B92)lPh2&a8`K? zo_LYgvX=*cSWzW7EiQTrUIircVbaEcsX@8(P#jExlHT2F3AAg}ea}ED9hUUIoIQ&i zIx|l|xdd!0w*duE^a!X~Qx1#AA6yO`0F7>B&fteux)5C&^$i4bf2a&t2B@Ge+S!~l z)Le)PMtwj|n`{xG%un9{{T#BSaR{YmO?{4jT?1p*Wt(836=_-{Ek7jJx9gEOESx$_ zS(}VD&K$k|C^jjC*>ApV?>4q~zg~h$n)D*MnCRNy9CRK8KaZH+@+`;HlrBds?t2b7 zsumfogF6m&A7#|ZM0gb|3t1WzF&H(|g2-hbd+u99(Krm-N)f)o))dHi03TWvps`VpF486f zVOzLY@ZSx;CD3E)yMx}va`trZQ_zUj13Sm=#{XzE(Z;DJGaY6xPKXk&rGgiF)T8j2 z8H)<3jYt(b=}$+d@Yr`iy^pLkN|uflf5znA{F(lfB^->%xbZVV)YTpwX*=st&k?xE z(2?}Wft8F$ciB@@g#Cd5L#Wsq)0af2i471>sG*NyeONz*a5l8GdpJW$+8}|tOBoSq_MK&lL$J6X#9tj! 
z8vlB54z(UTso7}nygj2n0MN$0Jb zy0xIYd50%3L207HPh^KF*_Whe5a6m%9&`ocU-+R4@R{2v;3P9)Gxg4QSqdOm8&?@| z5qZy_hrLh-vinE93zmuP9BWAy4!7A#;(+tvJiDkvEKbx=)2QdaY1;Jkf9;Uu+WfRa zve1f^nNHtAWP!C5&Sos@jVv27s!pg6OpxS|<XI3Y@|T2*)T@0F}~i@ zA#+26A1@J5s2juvvo4E}MA0C)^#CUE!^UYt?mcw4S$zwUm8`Se+zDt)i^2D^dhIfM7&1- z$YmS?PY7^tQAkYuV^kZq`-U*7+9UMIPZypE!pYHJZSAGsFd0cpu1KaS>u!d*x_~sx zwfI2M6dRR|oOOo@3aXR)Fy2@eo7^^{I3T$w*iF5h`ROf)x>yz*m;7*A=O-${`WT#| z2D)Vm@LV8O zTF6W!elNT9)%jA>B9)?$MMcpQ8DfCMsM9f)-W5clJH5ex!*6*UI@Je3T>U6d&$5x{ zTs}6KS~d!FbxX;EQMnjmG?CZZ>i${$lL6|78#`2)ix*_J6jTubrpO+y#^+vNn|?n} zeoIiwrun5vQ}-Mt)_ytuEn-&mK+(tT8bkKF@dX8Ez!+34S2eTgamV z?#77fs;3o^3{Q(qgI!^i?(%O*EZ5g7|LKXP0~InJ3xN545sluvnT8mUtwBWz;&|a_@>*Sgdu6(fMCobilCg+f~M9()E zG&@Qj^TtA{z>OrU2zTeR#BMv^C<%RaXXc%UdmfLA%R|T3-wvw@SAP#>Snm5etQ%ze z&Vng74+XCfpk4(BvaiBWh>JXiVxTr_v(HcsgC2~!8XhTH#z#nkVo_MH%#WMXwp}hl zkgcdtE3mi=8fb){y^3?>YAviQY1dG8a~$P4oO)8rOGUbVALDUcL)mqu82s)|lt_p3 zIv=LfTDw`hQn^-zS-R!2mAJrVxLtqdw-0{>s7Us->xMtmH7SjB>u*lmOo;W&&d5Ee z%sHC4rz;UPV}bSN8LPXdX3wY~oC#4KPN-`G`;BW9~cnZFI`ib}BpG z4&)FVDja6wrWtyc=&I+m%i4RP^~$zv>YEoV>@3LYShTzHkskdR_zAVHNA60jN0*LC zCyLqrWx@DL#hRBu*wR6$IPo|cuK45GiVs#T+B&$Jz>?eVW zdH!PmOQ!Ds+5;r}-@()WTW9&(I~y@?>l1e5S3Ajg3@&Ag6PfL2Vt-*r?9P*a7!Rzg5yo< zeof3Y8Q)HurD{C?rMp)Sm`>7VSgqFUtl$@MasQ$nu0oV3Zo~)cz?>^z5L>QYG3n7& zKRq{zT6}z#iw5Hwiim4xeS^HEgLk+jCIaAZD4G^GG}^3E7Fd`I(Uw@Z-7KtRTIu>I z&gcq_vBl7+^X3_YumJIr93k4vCB!wcIhqG9=Z($es}BKHp4AV*O%VaX$ArF)>!oZ@ zHD=Q;`mE}D``b4BusVS)xrWX<4y&Ir>oZRe{oIQV>YI{VXG!dAlb~vjDJ*mz+D=Vl zpSHu%kRJ7p#NXE@UsLCSFo1n{PJpd)7$O9*KlY-(9~k&2Qo)s6PuFSEBOKCJE!qWb zLyW`rHo6&jP6?(V=jGOo6&%|T$(U~zJ+veck|>ch(NCVnWtHB>Lg+aM{gP{0ItrxY zjt^M_S7=Ee{Vna*Y){+Desy&C8_<)_2M}?f^iQpazaz&p|C@EKtoVODU7o1k0L!j{ zf}if^m0(jpb;bOhW;q%$JBI6~HOiH42#noE&xIHd#Ng7kCC_MmLZV1XtU`C1TU;2@ z6~@~#LX<(Rl$pOV3IoftV$nA4jn4^(d60)z6pN5rj5=k)V!UIUC9L1j`|KLbB2W}iv*12E#;E18GzLq_sKs%W|sfm zKPpa2wp5{^?qtye6z<$L36l%~nzn^GCEJSxO= z6G$;KP)tWu%PM|cT)b7pzP|B%0H#Zh6a8rq`>P_Dk?G%U{x97|A3ePA)k`=}Q3(E; 
zIUFDX1S}w4T2k9PfVxJzzXqb^@9kgJ1%b#OvD5T-Ni(-iT&|O%+gJO}F$kh(&^x=T zvk<_ti?wZlhRz-34rl71gGlSPQvI!t`KDjSl`;FYKUtSEBk``oZrB*b7iDT*EnW*= zSpxAgW<*L-67=k(<8_o<$}s%UthQ43*IF)+?vkI^fdmzuye#uOdq~Km3f2Rql@dYh zTg8JX-|o_|7$RRZ& z7!#-H>W%FFD471M*4_VNnZ|#mRsa7ZE?NGKn)yPg{a@|wmmClN+Z|H|S-w2omv6Xi zVy5hHUgFYs)*>tBLShMvL@(rc^$@j0ybaMo1#9&W+Q7?o4=UThZO?|I$FPSuN+95L zFA^$2J6;Vi6`ZYZe${6At>aV8F@D_`1>jEJTDv=Jy>*2I^ma5X5St~l|;9{S+k%VNy z+d9mS8U&}0<$9eLs_yWbP&7!x*rmeFW6n(p z+igZu$@#D&Z?9))JfuxhYpDH3j!{U@jb&3&0tASW0u>@Qv!h)e?$9+e#S4NFZHvnW zgtLXRciX%&|K%C{I<4ky2yu&Tt7IUu2?fOj?ehZq>elV-5L|GQFz#oWY5zgK0TlqW zlMj{MUd&NN#AAr;(1RS%+=5CbzEZ{u#Yx!tf2ezhFiW&3TQ_WE*mh=Q*tTsW!?tbP zwr$(CZQINjS#`g<_f(xmHtRK`u^Tb>T5C^?`HvE>ZF9;!Y1tiM@3;)Ks!A!4K2+{w z>!LL#865;IJjVb6jZ7z$oh9GP9;3I7Pe4Rg1kHbYK>kXELicYA$^Z3$kkv=~(ZqxJ z!HbG-czRP-;P7_*p9duM>jBYWxXV`^v2!yS5sbb%mkdGx6Hc^wgvR>v1QOo0204>h zS6G?I_lt(STx8vypWP0S1Gpt>vD-&+tXh0}Cf>(V*B2%_@0|P(coFmeg%@c$G@n57 zqMC25?zc5x{&_$&fO~6>`>g&6FDfhnjr};X5a3R3Gt8lq6wB0-nj&e-F#In4SgeP7 zIG-P`{e6E3XJ;?S7d~b53>*@`^2h&-6Z2oSi|Lq|{<)Bhy%I^(bmP&Pn+2RKUh53x z2P%X)BJHxQ@6CY~*}?M-Yh)1c_JyKU^*n4Li6JQA3ljFTs7^J@>M_0iktKO0DhwC84jZ00@U`c z1bnaU*x(yLbaol($LGmg;m7sGSzpmr&dY06(0%Dr?abzmA;j}h5cTuhO?61GFyjE2 zf0Oc#ua`n40u;w~msFttl6_6z&Gqf|2@J$EBQ^J)ES0%pgd;3D_{11Bd{ofg(cyvB z39VohE5Qx8KHtef)r1~PeboC49{2m}^Yh8c(AMWi-y*{R-itk#c*()h;n%0G z-33@e1BeztCr_e;Icfw$RXtviZpFn*x~FT$#cqT)_X`|MH4?|o-fs``1afHsCFIsKj#Xw38ShO8lPOZ*N{Nip7RZ%e^Pc5a%9-&}4xkk)1 zMEy0_?WeC;h znFX3CYz|JuX5mZ-v@yP!BGsHH=^?J3ay`atndAFXT)%#KUb-}5)kVXU=#l$V7E}~S zQH6JvY7Io`1GM{~xfRYp4-|`UzycB|g2{P7BwYy_#mvyIs@hCbXEyY`q~Ilebd~|* zxGp5n4;JH;_M#OV{uKNcS1)QX;vP3k9&vzDk|RDaORhfH*3fZ)-wDWOW>B$Y?iy=M z8K$uc(AL}Sb;=Q4Spdo=w4BtOiOzOa$aQ%-Cm-OXgI)RuD8Gk-s5n>i3O>GWwT4>V z*EtMiF-t13#MxsW((0Cj>uuHEC}GM&YeYe}-h1x!f6t6WNZuhDa5L+UBRQhWR1VoYoD z6h8(i)5$hWu~e)sj5z2(tDd+AvR{6RZf3dD8eU7Nbt^zOhccxI89P(yXSSzD1$BFt zVn!<|k}9WbNiB|Zr^+R(G)x-FYJ1jJctcdQ^u&4_&uT0`kRw22%HJ3gi!#e6SmqjX z=d345g}oflyh&`*do27MeJ+9!wqWZI(G`NNDitZ^8Y5=t2e_WL)Si1pOP5xZT;8EM 
z+x$9(N#a8!O(sq+Pw5h8H60^UsPKlyBt2P-E$$&nj z7?g*>GKfWAAxqW6Cfurf)(^*d*k{d@{m7xch`o0>0%H2vnt+It%uLjZMh~tPT*`@F zxtu6g^^b5tHZ-#u^AIuwXTgx60nDF8qt~xK7jFHO(4ZR*4{(JW-Q-ytzwm<|QktrC z_H!k>ow0w}$5GC@NKm!qJF8oSmSEl!`D<~plc!O~hkzMSvz8uesb@V1X@om8cF5~> zC<=q2Zb%{?DPEH*J^jAgI{&DAVDkL#^!uoFwSaYK3FY2K$!S8=(OF9SlBgBEq}^y$ zmJw>C7XMpAbbfLgh}x()!fru>+)N#oaaBUxO|Vna=I@&)GTexK1{W}luyxv<{=(0Z zM%2sqI1ExTuE)#*!awoPqwgc3w=qFDwAjvtdxFgDU*Aoc-+=Y|G^5k)rmr{e;c0yv$V zHv-M%iQWY@mn%q|afB|oqc%6w()mE{)3#KYV|cErzR;G^XAw9kHpV%9c^(3xqSY3!leY zBuNvhiriOP)j@p4EElD0VacGHB;n-7tTpLPf|oRU%(-w>)G^$GuG@{22kS=-4^9Da zSr5&jwly7?#PF#liMHtmabaDT7}mH{-yai@_CQ#lDi^XfWFl%-SEO8dGfx5@h#u+Z zt4ANgLNyzzi#4pStTn=uf2L={M=>Vfwq(iOc39GJnZ3idt?W=C&Y6WBazb+rb~aOQ zLT26~!z-_K$erXOO&d;Fj;RQLPvA{+Z6C;YY~Aub51_*@pfjgAR?fw!b)u_nI_U+Y z?{&+A)3=I@ML1wS-i`?CPgO~HS25Ye9=0Px0Ydk!Mr#yPudJj2)gGLp5C0`FczB>o zwmLmWH-MUh-L)CFs)E%#kff@6b`_mB?PbrJ@y^vJW5>5N(*vKKMsXd% z6WSs%sGo{X%&FO7=69v$3`H`|XXKG)K-c7+Pc>cBw<~ zw))E0_moqNGtTnHPavQbQj#Q4*Nrv4^dP9M|8Nr3-rV(~5_+xk{OnuLlcPeM)pXFhjr* zc%NF-hb6E?e0ojX*wOoZ$ufuW&seDi6@!UD4l|gKpFtFA{YPX9B^eZycvd;w#_s~t zfDYNg!?+(av(RXls$RD88S=rd#a$NLHj3|aCFYYv$e26{mCk6#4=V^%D*JL^Q2E$Y zKR7X~o0;r@zu~T?t>v&)R!g1p_yjvyG}ipI~tn# zjfjRJ=&M*u@P#tVLsTa}70>D$)l(|+DSU0NBHB>1dm%0=-IjHoO{7jv)2`gux6qU} z$xXr;eWM^L|5YNv(G91EMe;bLBuN6!BU`Iq0O`HxqfgdY^p|6OUVuWS!KeQ-C1&|s z>ghjHV$En?vwr+<0au@)td{U$BdD!tkn!;W*pBL{kKpJT`F!-F{n)oxWlZKd-~mT& zS)Hsu%}30*i+czu>yoY*i)oFy>T9~$_ANPR3i3T?O1fxj2G$+i zb}ric@6`Ufu^9di_Wa;_y}7g{zL1ZW&6tF*TW3JrdE;CRdTEwd40dHkZY}E&1}QRV z_!)?=c0cmt8pt`j+iopu+9B2{n7>~|ejvjK#hdb}0!KPr?K;QDz8B&Z(QQ|a5QW8r z+$do{0=p1Igq#LAYCYOUze4>!BZEJO78s@^NS#p;})Un^tmN zubiBQzLJa%haFN?^B6F3Dlzm(A6yTyrXHwS+tm1E1u7>dQ(N8*5ws6RWOpg(dC&_dWH1cHGrN5dd4Vy~79f2v(#Y;~9CnUS$kH4)(;To+F<9z} zi*x`QWx;f;+Rz+Hkf}FFGYBY3X9AHQX*(`5-=&^5;;nx`h@1AsrJ=WGBp~W9%V!^jMETz^c?c(Z;GFQrARQ6P=zq zeZ9C0CCXA{{d6&n!+lRwQpSKs9hYeBb0DXCV7`7*SjQtQFJyuBXSZ!@-Gs?3IP~n5 z9>o=GOX5`ZP&W;APP5;<1p7-5@HZ0s#{YJM{z@tHzqo0~_*bG~MuvZ28WiKjW7q!d 
z(6WJ2ZGYe8N92f-FHvQA+A73~ADkVX;@ji@mfg6RAJW!78&sZ2m-If11@>F8h_RVk!2fo!<`S>Kva9)1<4n$+%4IL%_Za$!nwd}@y7ro1u`j*cJ z6@DE^c*SWQXozP01_l+rw0`Y zPJ_U8h76I(1O+q5){s7X5lmzuy{zm{$0+a%6ueH{=B8E<=5_T6RnP5hEf!1%VRc4T z#~7R1d_7;r{HwvPnmc(yf45$um1PTz>OH_@4e5=t4Xi2VAm<^&Uukl-^YD-B>D4k6 zuH#FW2^Ei7!WXs48nn}8IdluISS(xQBuP-S#D|SkiSY2he_R2!yigV!s=ngpy7FM3 zX-^5?b8JlR{l@l+`n_2K&Hrx~<*$^{Uy`za2vfgA{~%vZsQS~X9sPgHVgB#@gfs|$Xu2L$*VqDxD<9V84Y0P}5u|~5@wUfHTNIMw9zRF- z@MqqE!&?0!$hOgcw?yy!V~Jko{>KuXXuZO&yQINS?`wV{Zhyq{!+f*;%x3JF%!F7^ zc-ehPbADs@<7;sk7`eQZXYU(XM*i<;B%rC~%7qzb-3`I7B&NaDpYH6VBElBah#Dof zo`_Bs86|kH=9lpUcU3#WM|NdwGWnL??LkST>P&m#(@x9CI%f=NzBX5BNdS4{XxXrX z9D?3W&Ce_M+}RhD&EYV7b5qI8kfgWC9BB-zqot{Mfla+G6b5HWC=)eW2g+ z5ZM!?N% z<;BUG9!NV$tabC4bSPNjsnVdPhK<@+?eDqV2+}BU%G1T= z#s2fN$;;!ZsR{4%L;Y@cqcxINCo63#D{bryrKldW2-)zNa__^)s;g_4C_DWs3SaHO zic+(=ro4;)JLCr4JEFwIRpOR!-KTad=(?q4U!4!`K52@LD=FKiOI43`kgCC8d$098!7mMDumM_{N*KNo1unf=4eg{=`ormXh% zwgN%F2TU33wDQbhYCziAG_BBtIAVg5M70+{HN~S2jBeci3<*;HwM<9aY&S^BJz3qY z0eIlyqtS?30i|z)XC1PW+5UWwbfF6*_M;v@DG4#47nG}mN)5+bHDyBPQo~&bE$w0*pT;`0wbE2mHb{g_|bDjWkkhR^LJA5|FjHNd=@ zx$D*;`tu}9)iTJFqY)4$C70h32W0mv*wRF#;-(fl;C{^VXcGF`^T;Q`V0q6zg2*>+ z*E2c|qW|U*u<8i0oyq2FoHbLM!oZMg07IFY<0oUiPq$%8wh;n-3Ctx9+yJqhJ#go6 zQ;fK;y$K`8xDO?oSQd0i*3p;+gDXcsQLMPioSa|=P>~`dQ^M+D4AO3<+T`6_b`DAz z>;c*NS&&@Cc3hc5)xz6hFc%3#^)Of^x)+5T(l}TL#Y~vw24@Cn+u{xs)enb{s2$oP zs$f5+9>(2Yd^NJTlPPKQ%upXtQSynNYfO~PX|IX=mva`m;PKy{zh_}55oVWj*p;sFMEczEzw5{7EfD=te%V8O3B(~ZSPhsjo=(d$(V$^G$ta+b&SL{COjZUb>0OP>&S%m79&TwNRLAgzYy<9+ile%kWzz_siUx_8bJoq2aW z+cTZhwdhB^QU`hTsE!)dD8iV{Y>gcuv=$2bJ3tSRF{iI-mX6*`ba@+ES)fJ$c$6Z4 zE5XP>W5B}Xmx6`{B#$Bp3(MrGKbRL)QvaM4TVh|ziJQEEZVaj>MFCoG@H_BG)YMA5{K(C4#L9Gg%Kb0u{)2<`y-JUY z35@)--`XQYsv&(Ju>ocftskyJR+~C8zf$6->8pT9WPYJ$5+^P3%eC3bHmes$I+cA{QJM_Fe(d>qM}=r>DWnXJ&*>v#w5jH! 
zO^U3+;0BQjC7T z7BBK`3ECe(N+!jQTXWKIM1%Qu5>2#8M0;u6d88}ME+_|pD+d{RKc3prmdxZRQQn`a zsF0HC58)K$Y6PD02>HRns39>;Ygx`QJ4x+q{)$d55Xd+AD{`l0(sfwC7B9kf)b1=# zg0L8g6;;94!mH`0(ZhVase(+j+@sreezP zHHoHt^dWu2wX_UI<3x|}by2K$)nSNn@W2=V5v|Kvh9#3acLuA1F1#9r|^k|x2 zauQ#t_g83FH=V~d4S3u7*`PJJZa-FamFSx4Bz|&R%>krSz}H0c%fLZx88!f~1^a;A zQ!?*8fTov(s%)3I_j7-%GZga!HTD6`I)g~B#Q9|!kRbGyq4CK*u2P0gAOvq2CpTQ+ z2c+z7J&OKWZD)-hx+t9lDhBC@vL%XO=vd%KE@beoMn2Q64%@Gp01j6b57c1IC;+rP zwVhrBFJF)YDvtE^_MPgZ`(4KLC-cb9vn1##YNcR}aT zN|;m{(3~BSx0PG??DLI~P)#QZoC0uf;_h)%ZRfz%N~~M52m@*z@ZAeZhUCAQ*GMsb zwVnb}ne%T#-UZEh0YK}Wxc*B`!u(gjga1*JOqllLBmbcpX--mt(fc$a3nUw24(D<+ zTmYkYhz1B93{u`cDjFs8p@jz7b&lQK7XG`YOAmJN)LHRhu?{Mn>^~Z-M?zEn0NZ#Y znuHeB>=@kJng6Bfdj0R3E(K>rdLZiM9Qu(a8+PKanxqBPzx8;pAmR}$DVrhI7Ku() z5m8`G?2o4F6#73kU7H2_9%~A*A&;ybc%?-{y_!fae4hsusahd&WwoJlWN0;TR%?Gc ztXbNfJ=Vql7O=m9dC>hgi?=V3N&pRv-xH@GmG}HR1ME@C%$$1nN0XjBtS@M9Li&_t zX@}DZB7fs8g_}eQMc3*f`AWImq+utu3nd0dbkYt%($3nbX5)@ekM3LSP;3g0?bRJg z(UVsnf9=G~uka)dnh94?c_Hl{_!gM0!qfIS$SY)M+5w1Ni(>Or5^l*bTWbZs9a8p| zb#WHhFy3_Vj$&sfJiys+(r7#{;=4crHHh|)$ODeB3uMNo64=%2$b`bg@B;}12K-a! z$r7TT_eKdIF`|B=Le{i2+l8(U?Gyr1TKqG6=5}37KLQ%9m(t5fw%96_X%@vOh%IQ6 zz^1^aA_(s0J^`4_O!5CEkW7E2a`|7}Xk-2>H4_ulKh6FWL~Z}Hs-J#511g5>l6DNq z%d@h;CvU;yH(cx^5}6`u0EoI>%?=dSn~?kvHyy$25Gt-erPx|R$C*}S_@)G%P2C?A zJ(LiVXL8%F;HOKX#xiC=$0mg+8U?>`_u3lhJEZ5WPL}KrHzPX>6pu7O2*YarUFO6^ z@|&v<#`%OrI^t%+I5i#JnLr9{*f>`{MPN!k?G1@v`OHh0<6FD}P&&S_J%NnJZrvWh zO%y!dNC&+P&OYS^m@^zrhmR96j*5iX2kZn(WKoCix5IBvoDDFyn%S&QM0f-=LI;ncDOp zpG8tHwaE9nUgoRLj(%Lg_NcAv=oWia{P_?|;i^0^ft&U5UC60T=47|f)RsABh zWKodfd3lphneJ)AeagkfC;!m3jTt7(PG%y%NzbzOt7oQxb;1&)? 
zYiGiy1qieDOS@iAyL?F!c@~qHV1C211VS|8N18|}Zry1hsH>7}?vZrHy#zjzj4>{D zcr|9BUXJ^4aqlh1`uga+7Y@`ajwN3|&V?LGCoeo^_9)ysh-7+YUAe=tJ2sBk_!!MJ ztUY7&&ol)RzrIDofYUT@WIAjirJ;b{A@u96AMLLtVBz>Y@0_Mwj1MoKabvES2ea3Z3%1);(^{|n0%E4A?aQ$T~0kg754gt`0Z&0 zn^iMtI1FpF`W>w5S>r2<8pLT9Z6^u)9y{zt`0*rUD^qFG7`Jp;NXAKFGn z=~IOI<+5YN8++(8rqU+lDx=G9Yi5|gJ>G#{6YC29?P&g$7@eN!zqgZ9eK}xT zqi_SVP36n|aln=&n$q8oc%W?f#sEQ&6bGGQz&qzxdn^uD$6F+sv&D}VIDOZP0Lie2 z1y(e>A-gZghbZCKPu3a5 zU#`6KN%~8-%y(XvGlG|%qQW2=I6quY#AP~pSxNFXzY-3zg79W`@4GTt6W(SD(e|C* z-rX}R(AkN_Nv==b(M;kMx_LLrGK~We(n5xb-~!mbOlioEly$~%^Eh@NY!8kYn=29x zJ0%_vd#YolT$ZF2p9l^0$+PPA8tXdVa3G-K*MI^to2v>tVx4M$R^Lk6J(Op>dhiN&cYHU` z!eekLW((;V9~=uwHSHDIj<(D}=J?$voxXz+H&R!w&M0m_#q#!l`_BGqw#N8R@S|e< zOw1QdMc_9MD5ONfH@b`xV$q$$}QyQwX^zHbMv-9a_$F&aEPb|2qCvCL!>mRAEEMvFGA@R3rMhB)IP zooYdD>9~P|IcEB8lg@cmfh#W>p;>9H7Bx$eQrd8Mk<216n=LQ<%97~(6qowumV=zO z-ekR6hIdGV<7`CYgr5e5%enZMFevX$7}cot<;2)W;uh*m)MnmB#sHmDh*1Y9Vm3Sq)k6#_%@mN z=GnT%zeIV2rUU|wrbq*hXoB`fmQFrc?>C*D{voiUNJ0GGIEwlKdeaOnCvl10Z2MED z7S!#>770CaEC?g^1RL=$@Fy9hd;5D`4^?L_1v%J)KHNz`mhKh|<{ggRKJ42rH3?3y z-95$2&#i2?q zQ-O!`PcI2AXPzyJ!O05E3HE_yC)3=KMJWU{{)A-5MS-8&P%)U24>|FX&PXTlvd`dza0*bdqhaI-A$yn&~BDj1q8pW zU#UjAgQa3jm7tKoWx$oO<+GKl*-o(*KmsEq%kM!Ca>L`_4#{5) zF&P>EnI9<1@lI@6BW0EsDd2eVVhI&P9#iSP$QY4y#O)sb*ynO2YeV_F`9R7xQ7rCI zZOi9CUug!Ca8D?!q`3^AtqN1Ocyt9+8Zb@nx(NIxz)!}j{@5flg~wGeOWDEA&K%q4 zojYGH1c%{z19u$&SW6pN0^YL#-Gr774_=_FixzU9N=!<4%POL>%LkMskTTM-B1V3sCExXIus)kDd@}>tU9c>mvYM*G+Uw_O{^8vp$8nsL zWnpqk@mdwX+h6T^vRRy6Gs8S`#_FR2s*F0C&Ax`_1JUoJ|90N~ZUp>4;L(3p=mMVD z{=lQ7_lPB-2X)TD+K=HK>;P0T|E$o})?Qz*$MQpIin#ZUyWOE{HA$M;1AOjOoYQn? 
zN8GZkl_f!NMy^(C!hO-B%Vz%%J$n2f^k|?ou@ziAbgw6<4kjXVDhFW)Cx=ULG6q08 zrNnG&{37L@&3^yxIP*|BT=tkPS97Oy^ZqvYn3vv7B@_&9uFAfPo^IOqznUZ zS|N{@X93^$jvap{|J#|?`2E#rg}im&4R$jRy4TAvlS0^SL`iuiQJ{-wP1AhZcj8zzFUvueQC zib{DaeAXtO)@Sg-n-|#^4qBBJ77LI4(7PSnHwmNmusgA`%Uwn>NLSZS1T0VD# zHE+$yw(7{<#*P7zsEzzaM_xmHW&%pGnBU{VMo?dZ+|t4Z9=N*OoWyB$Gmin4Y^D9mg2s4VV?1?aZ9k!7H&x z>^6M(2JYI{i{9Y#dM5+8H;Zk!1m&)YT~$c*G21W^s?xB-D#lkZuB=f7@Tus zI3ByG%cIK|x43jT*#aBeT0vy!&p&9<)Iw(j^M>M#+GnF)S9oZU1R5B+9dYRS>7ta^ z;|G7Me2F=b7}jskpd_ze#EFg*TuhrjgBz9*qZ({)4bf#C9j--(nGQ+J{hAA;Kfxt= zyt|8F%&P08Mo%KC4)KyPuJVbg`V@(u8yRP&d-W1xze*4H87*tr$t=? z`O5n;*~cVI*BjRg!liEZK`8a#>~za^^AuWgVGl4_T+^_#QuCJwDA) z7R^B;%BOSUOFebneg3)zp$)R|opT7ItO)DLqs(AVos6_BYU$UzsshnZv**Zn$w5wm zAQ{8Gb=Np|LzR;v!6rUm^r}3OR*@me8r!I9AZlQ0$as9v5t>rd(e+Y7Eo4)yD&l(3 zDg9_DTvFv<1q~;*W&NF_g0|XCBP<@g3@6ZMXO&9zba4TX{ezMQlXFFSslBZ?@w48# zdhf~IJO^apfoUttGqcB?z^dyl^6>B7Qt}hN(0r{7!F|2Z-FcNi8YaShBi^gi7e^`B7gK!la~EbX@@eOHw> z3&s`)P&VqS+{GFGlMnpAtwq)7VY+Pe4Qw|vIdZk9^YLzyW5C%^2zHYBiosdi?+R!{ zZFYz}f#alDh1Sn&&~gJ(cY|hKCcVor3Q=r_X<|0kBv;j(1GA$HLT&fZo{*fBDp#I8s&L?ZVAl7XeB3nr8D6YQ@6pjo2NhP+c^|^GzZtxqE1l)(3S_=$dEmWSP;y zxS{gE+y_<6jG9o0h*GDh315=HqhKYVL~Zo-Gb zAzu<-7X*+GZ`oh5mMl+FaZr5>pm_yd%;w+dQVY7Un*$6LYlXXY+^a}HyICwv=XErJ zZMM4Wo%UTnSQ@-54M?}5)zCMrDtME`X~fVC;*)?SXn9d+=Sn*6lfCO}ipVhU+oUTw z1t#YOr*l#DoxGNE%;NCwzv6p>$=q}GAmsU-4d#^qX-pIdZ#ZaTm)p9Zebv8A@ z225bej}Ne%I~6d57|35W;s>ltsGnX8<^vG+@{A2ZtY6bno=!WqXlQYZZbQTIx`_(@ zrf>2SpP}?Hyp2n6&DmxM-&$Txf4$S8Xn)sS-kAPptvIB|yk;W>x4FsqzO6Cq%TTL# zCOO}M=gDk1YqO#K+9JJao}dXG2IQp?oRFhxT*fH@*{q^;Xi5Vw>hkaEh$vQFwIPQT zF0L#@9hMEijsvmm=$s(Uei^WS^WYk^0HVoy3SKYacOpXE9J7y!XH9tLhw8r! 
z#h|PRO*08)CL|Ao?1O!7Hp-DCNZlJp0!tPZDmhNfM4d~r_H^tdpdpY*>WlV@Or5hg zAk}*!9HmN^r%-^jz`LBRLKV&?>V&hwFeL+WcS-XMTWz?J1zKzs&#P}9wP_bOn`6qE zVc<_eCx~|TjLmc6GNDY9k0ji!l{i#+Bd8MKX<@%RS6L3@G%!AaC-E>%S>d1Ls@6r-DDkgycVP=|JX-7DhIJ-`7fx z=sMFvF6kG$Bwui8CQ51^6AWC+4_m}Y8$_owY-`gA9aUAJYw6R;o~r7Sgo~Si8>gyi zGy2}cwv%9|Fgxb-X^>$RMW0?DCd~jhBqSi|T8pycs=ii|iLL&n({G{ucWqcED3ekm z_r2l6qm*UY?Hb<$g+9(Mu0lPdxqIjICF6qjP84eBtPm|H{hvy-jA}mBh}+i?EF&TD zPV3)lNVGykIWEQq+-(1oEp2g~Qd~#%hb^53>P-G`5&Ww%o}Ts}721DUMR0*q>+rmX z?7GMytL8s#IU17&W<-48!v=BRp!&3xTFi4l3tGkvl+AE&Co7vKl8PK+JpR*hUjW(c8qP07$!4^B4{PN=Tb2{oUP+pYhDWl&Ja~f+>}r z5ZFL%XYUtqV=fnabEnhuFpqXjbsiX@B@K3hgKXbc2Nn;{m#v>aoj23jFiVR$u< za3z{GT9qWkM+e5YVahtUC!q(Lz{3ufVw||TO-&M%}v}Dp$5VzH* zh4|wBSoEchi>8GCdyAsgYmWtOwhsQN}gmn_gQng-`&n9se?K@L}atpOVE7_%6`= z)RX5Nd6NYJqX=*QfE#cQ;0QOiH9=WKSmK|DKd{wG#xt%_1Th}BkXU}*!!ht$FqWEFE00{`DZ4m#G6Sp0k@SPBKN=IVe07QQXQ#p;ZI`i zfPqr&^06(toZLN9*H-R;47}*1%$Q#uwoHjK?w0G$Zh7EF3O1n3euuEJP^mVFwT8{y zbLdsyO-ETOV4*NYYflE=abL0&em8^=+7d>kRJnmWJv6TpqkMDu0Dd`s{I{Z+`R|Hm zhJWC56=TFBfoZ>5{T@(MIw4&*E{G9^v|lGM)N%IIyqt^|1r*hgw6MiE{>M*7+oB?D zL@^?d*)#Gl(xU~8=(gNPF9l$DJv2@sy+2T4-ZHt$Q}WObCXc~=3-qgLj4*We?aA7{ zjeWJn(!MyaQ9ahBahIE;3ZX6Oc}F8@OQPXknkC_@A=`iLGp_5mmkz^^vyls;4U2|kHe>V zo73&U(2lKpme>1M=Y9*YQax{W%;&R4wnfjZpMdgddjD1^Gym1->wod=j`^=9Jd8B| zY~X&O*RlR{zV1_rIG}iC^F6`u?zb7h$Xp65hNLVp1m7ZUAk0W&+lGOvNWuGTS<)z{ z^2pYYLAGm;0frU}I7o4Tb%-IwvDl@BJK!zwLNXn}cW;Wbz>~hNBynCP9 zmdyQ2t^Mtlb=R;Itf5c6D^$)QNq{#-ZDIqEM55cn!Zn^VL*w`Yuw0@=;wlvAX2fiW zRIPjm&O!;{qk9Nh{C3&F$%3Ub`J7|`XpwE8n39Ei(uXv6^?OcgXgvH`u7UBj%EEe~ z%U*y3qv7YF9vX;eoP$_HeLxhpb%v=>snUTN$ZyDKCNGzu_8=+@NQQJ_s*J3>Dm7Hf znLV0Ff_)s19l)IjBl~|DP2#|!WMs+$mt3%(rUZtjAmgLv8KVKpA}YZd#2gBxw~3ao z1Z7e}`)=q`(;d}>+)ybfnBzk=j&+EeZhw1)Q&}DncJlS#e!Tu_;Pl^oroQ|H zEeB~J{GV7s(Sr3iA6?WGHUH!e)ptiV!MBiLV0w0D5tyRWn+wLqGMFBBq!}hnnfFbq zSkO@cmjUwE-N?YDOxbKEY~5fOKk&mlnIhn@X1-&f;6$+G&q`Y<f8F9{|L;ICMVo>U7BT%XO#y@|wD+mCHCf=4)LeVR&zBrw=3AdadI<+^9ta zm2E+_Ajj?H;Tqsj@98$G?+%(7E&EBt>C>BK4m?^ZeXY+xuew*eI0crFF{vOTlj>)f 
z!2=Jt)E&dbBhYUO1l7CSr^_AeS>%Fd3RC>cKJDKReOB=Bc(`8gdtKb`cLznkKYX%0 zxv~PHclDlb^qkz;BH-9Hg}beXxwB+gz;@u-@_*9-=YSbd23V&C4j8c9GpVFY_PyF; zJaz!4U*N#5F`WObthl`fyG784bE>VCs#OYMy6OPnZ^K9n#F-@(%(R%b=D;gw57GHjy8@o2$Il*Zq{e) z_+g~8ejElST75_mJvw3m&C1{XJLNM9Lig5=uHUilizX zuu2F#ktE=<9-mD-RUaG@Hdl=ra_OBd>kF|DQ&;PfP@L^MNdY?nkFcaVQBd@vS56h1 z9L0@l+e3*~tBDYG5lok7p>$0w_?Bxab_gYr#ApXXLVc9zCtPJuBV@%u9xP0;bn_eu z=OnhHp3;6suq3%To|;hl<4HJU(%nz*Em!_0dQ~ZnvuYF?3okbb4!yGv{A6U1RXCM- zy@uMr|Hs@pb_v#{O}f&Vm9|-F+qP}nwr$(CZQH1{ZQGjccg4IkkMk3b_TDG% zh`8SIK_D+@gOmjv;~*kK;u>x)sAy0)g8O+ht>q`*2MIic`{_Gc)d{fzG~f>qRBLv3 zhF7zjk>ogclg^af510z31#2UFS(&Lqa-Ur?0sZ%`&FynWG8TaVxwjOyioa*?C z*(eUzZQ>gJjohI(9ec`R%8*wkJK>NRk#b9B9I{s+wenEDNG!-FyAXa7#T;7`cA12* zo_wSoqX%Nd-VK?qAzO0ss-=E8{WJ-89Tu-LDPS&KK&GsP;2Z3_5S;5lVJ-fE;hi%n zVFje9RXF$+`}$(`7`o)DlFQ#Pl#oCfZShJUS#)1->f>DK=9j_mH?~^)z!`tUMwcD` z2T8&cB>$(`#q(u?56NjdJ3ZXV{fN zI&s1K;-s}{AirfYywVoA^?Ar9Wy6^&JDpXK=aE$5yLiUK7ftd+0-i-gR=I$tp6buxZqBRfem+lh`NCm?Wyk%RbWOitNS49=yUROH_R z)Gs+KnYUvHi{0ax0FJqs+33tS6{RNc5aK8O;XBDki;L0P0hWyy?@8nj>`Bc)nS?06 zWs*K8CXsp?l_o~SfM=|^-duq_g#u5&jr+|8j|R!KbCBArT+lR<94TSkyrD{~!9C5b zB2PT1`?sO0u;r>exW78Fy1(K)xj3(qsIs7QPLd6!}kY66H#vN0So0!E+TV5>4#8O*aaw z`QoJ81Ruarh|Wo0YV*(u^rcn9QUpECpSI4(LQbJBEk`0JgOmE=m0aP3t(!8aj=cAs zrYTQo=BHu24ms6*QBMtosMS7nR#Yf?eG4`$)ng&w`LfC8 z+p^<$`7!A~1(QV2(K78T)3qeU+}HR<)7CCX2SnzpSC}-^V*IrYaJ*bBg?-&soNQO( z5RDm}B<&$>4~+S)#jJgh;&IP9U+-?vqozZPRY!!UT|m9{&RtrguBBNHN&BJ$nw2;~ zt3M9X@es3ur3p7x6Gy7O?D0|#ttI2teK--MD>VcC1ct<8`#QE`;M9ogq*DDiP@QW6m{#dUs_o|NJtMhAM)NR8P#dWiHEUqXd-FkvaO(|DX(dnv%ur&#y z2J+D~$=gy0eFMS)CP3P+s6MZvgf26Qx`_zJzvRfR^_J;D5M=~U-u+nPef&x&Zp$09 z2Y}5I1xitBFd+CFlu;bsWto>3E6x{z6gGRy&xVnW0pca24k_L=AY^rRS}xL|>V$&* zr4&q`nG)yZ7 zc@kc!J}HtQ6dM7c(krUcMQ-EMHuJeFa08|Wtxn+JFmobrBD8sgi;Qd^3^bnFSVC_- z5MeZ>A04RImJR8>bA0EUxIA1%z8?|iVzlee-WpbvSKF(IT|&-UPsj7(Iz*#KO;tnN zfB<>+G+lXHY<_-dxSZeKH)qbYENyzJyTDr^#AW`#UzSwa-wK&fl0%+f0ht57cFieD zoJM6rzJW+qldu2tn9KH$tlxBO%>O1y{A&um!3_5^h5p;dYib`X(+ghfZ7I$56HIo9 
zB?b*AHl0WMp{d9slxd0mk+T?Uifi)I6;ME8yPsmASp%e9&`B!1X4yOG(UyRh1aE%3 zaeH2tCZn+Ym|L6``GpNU^^EL4e~YNB!*k{RnfJy&lPjxxmXskQ0^1F6GY2V%ouZ=j z@%5^;==t*2+Tk9uxWC=;QVFEy*8s3Lp5x3j`wY1u4M$zN>nzK9-#GPGCaEfrKjfh% ziQC$eb;auLNYBNIwmKgIHs^%0$yf=b+5SY}@y5-i>R|nB_n8%Xo{@skD=*T+{iUkC z+3YvDb@;7W_))QFO7XAv{J+w9ISvXiX20L_1I&v*?vCGSmX}rDVYk2BzDEW|e0pYM zu3*8){HE`ptehNNFqida2s50nd^(As(dz+<1lX&qeGWv=l*!NMYs*5SYJa|lI_xD` zSdZO30(07Rkl49X#`R>IZAcq4F~o=i9nt@c#5zY}~&tk3i~Z6h(kO?vgZ>&y98t{>C>KdiB-sibF_A7DxW9oC!O;LEW4(bYN4K zL?D0jel1RF!(7%n%sS?dqK!I^{wrU%gC@U_Y~45tFt_B+ikBrcx;gfAUn;<0$;^h1PDshP>E}Mt@aB3!%Y$(tw&u4Mx6N!@o z*M!|B78RVRH*?&6qs1PGjny7ib&gWqG)lMA#C{+hhR;N8>)ASRnRXZy7} z@xXL^!mu>qpfJPUXpQ*yK_}F{G#Wp4Nx(8DHHFv_NnLXSVVV7GF!5w7645f1xO=4J zsYNRobNnCK=##jmjpAv#KEAt#>Jq}iUDS&1$lq-|x zdKm@E^<|EYLR4hO1You9YEqWy&&gcm1*z%2on?_}and!271bnv1JlSf{YWUOXWxrv zxeW`eWE^I-m}I81Ix|(etugxn5A)@E)F#UE7P_wHj=Rn0oUApvglIw$K0@)jDb4X6 z^|erd_(}X3M-N+GddVvK%00NB=Cx!`dIU&pe#&2+Ytf?;6I5@H`o#7+v-+UuA>wjT z@^!V`qRcwK4qu=Djp-Sf+GoSb3U89~mo5 zi6}M5dYFq%GF#6?NKw-3Y}~aM*f|L1MG>om`g4wr%tNixf}**(R=SfPbu(3zR+(k* zq^|+JQd5Ua7}bQ?S4TeZd$k5^Rkf?X{(^a-P+T$sVP#dZH93Ams1a z0QJ6xZ*M^{3^tDpOY{i7bT4%wgR|JP;*SlfC?R~5(0dQUlBR?R1N-<*E8mop7O9)! 
zi|p;-3nZ_4$h?YiLxACJB(=a-*wo)}G^{34Id?A)j z>FTI{&C`T9EeYy*?SQGs`ej3}%hvz1g z=l*E%HO_=lkUIH5!XM8{dglxBO^FhEzr16Zhnv z-yh{?J}(d6&v!|cTi;!fIZggb{TMX;_4hAMNkkho|kCMJ*3g4JX};?)IJ<9Gr~W z==>%4PL1Om1A%IzCfil@csUETpOOOjxg&cBd5Oyl^~kVU9JVGD%qLO{=&O;Wn4Y=z zA0{II?_nW~t0u_5`+(W;!A6->sRonntd(mC&1ho~66yo3@FVqoC~)u!bk9=2fdon0 zm2i8SMq#DMG-&(sS<1<0AzGJElz?Bfvw3wJ9us!1J(`|iI8f|~)rA6m5ZLJho!e{= z+VWhWCEy6{*&}4Z59~2|{v2~@w#ZW978#cYycI-kPEV9@#Lztu{f;Rr;@ag9fnts2 zcup@TsK+4`9%N@dMh|9aZ)16%%OX4mSokH#X{^>Pf6(U8$+fhLp%9{y4Fh@55Swq93+)o zjrT@Xz@nFG!Z=!H$)QYZ7_J_sA-JZ>X$&t4i*X_~7hr;kQcC@~PjIu{?}jH1rbeSX z<9LwYPj<_ScF6!+-!O?G8C$ z9L5Z9=JN09XigCAO@sNR-|^`?4Ag{gtC>!f+M?WlBqF}p_HGBc=NYJ8&>F_9yb@)> zmJOUcMTO-`z!F{_AoHjj{YnRb7Dz^?^0Z-+;UeF4UpH&dp*sd=i-~!L_F*jgJ#}4c zZvb8qTihRN@KhHBacf~rVFv~K> zW$6oaZu0L%q>Yp8L_;jDsoV5rUBwusG@w>fEBMWXrWSTf(B2?$M;UZS2O!f3KCO!3 zh&WUZdI+hVGkT2!fJ0w}S^`~z#SQ;=V19|EKCKa(=FAdBs&>UV`SDZ;w;4{zKZOpc zT;iM>b?^Z`l!P_1K#wigwn@ylh@zsCN=Z3JT!p*jfY2tAwv{+bp$ogM9Gk`0hGpOA zkW>^zq~8vUJqG~fyr8!EB(8sW6(b*1Mx)4{x9u+l2&e~GpLS{EcmsPw5#p*GGCHhb zGWG6=KCi{-yfUT@j=Gd|oU1Qg`rsL{APUS|(`EGgSRbkg@W|W80Qwp`;f>+|oT&mb zJ^gS%h<&PVDp>EAWC@n zPsJF%;bDv;f7B)I(5I+U?F;f!w)+l3dFt!Ao8s(m-g>HRz85+4LBDy25t(Z)NQNa6 zEQbyE9dt`p;I+HjRf1k}XfeakA5}}0tyP`79hM~f+HKx&M#WKh4E--^7e`y)sX_W? 
zSYX}NN4p$yMq?o~l22}Z_6}Z5<)GpTaTsFk%6_85^k1a3(_T&Jso+bp$> zc%G1HJ!neBwg@R?2r|07W_WSX4ady`=ZE`h!YzFAqv1=xfd?_Y zBVLoYjuX4H%w!>Wh>+h9fzYzER{1ZT0l6U%!IU_j;E!kL=(eMPhiL?{XOMJhBns@(Un>j@l* zo$1J*BDbBvPK)d_^ z&VtqX_j2fM%G=Rz`M6^i=Y<*fImu*lWasBVr(Z>D7|J z;93`m`~tN>64#LE)Ucj*;VNPb(NiP^BVe;45job1G5IZIbyX~7mNYD{Y(eZQc!iYg zLLn3FUwuonWciG>RxdzTAM_0-+8Y=#tlNr`I@5}l%|2073KRZz#~Z;L?Kmy!5#Z-^ zdUe0(n$lK|<#pb0V{TlEY5w0dy+ zyy@z>tT>?})9LnMa+yj&u%NIg;n0uDR%{<+6k2ZZ>X&|G;8V5(euLC1oj?6Vz1vbQ zcHUTpS%$8t9(gQ2_jtOGxYcNZCFexuu!XKRQN4a5N}(jv>4?&T7aL@Z*zIh6b8}`0 z2*jdPLIt~MHwL9qqO3x=xHZ(%bbdp3OGXt*5B>>aXCoHy0Io+f&ufk;@(D}i=Z017{3E34# zGIUq+NqHB#D)IU^#9XqqVAq}(^nA7GOS)m7z7U#sFqlOC zm#6Xk8fXh)3Vm#4kyH7|!mKz8ZedR~Z;y>j@6sxfSKR~o3I)EXA=7D-&s!q%cL)@- zyVPXEieK!8$%W0XOydU_5>4WNfzh-5qiH-nE&cyf#wHQ7{v&z3tDI=!X~Jv+qd;Cv zusy~u@y_sR^UeZWr2~GxVv84MV@1>-Ms2#Hf_Fv=LYs7_y&8)~K%AcEgzsNdEWOW@ zO?N(RDPD8ie69ULGwzQmPmfu`)Pcz395~k^3Q~IcvE84COxf@#>D=Iq@t(LJ9VysE ztMp4;rX~_d!YaLd9yATHSDadt!EhsX-*FpjEM>s*eKmv7I2~5TOPvjc^+lg&wosO|`{0`iZ8)({gWg9VLiw^y!-x4p zFI!^%UF#lBu>W>bXKl(g!O0!pfkMa5uiMBA=={3d_a{h#J&7KI)zvUeXB5WC>qHzV zK|~+#c?k-dVX4iO5MbKT_SdOHf%7B(@2;hsB=y5W(28A7reHJ>wn6W9vtD~iqdpl+3s*MNY*SVq<1Dg6TA z*C=!4s#P1%;}F(N!$?{wTDKH&3%FjwFBvxW!J^Q{JqXffi&E@XlU!)y3FQd3CKH7m z_`ikSjk=m>)OkbryA5t!jp_k#JyYP57GRxBR*cM(@UtZ%z@dX;gDdl}HFQ`N_zQ@_N?_6BxwNW!1Kd z!=0TC7g)V8iqRz1V?~ax+8f0Q#~X9?s1=yc;FbeaN)98e@-x);ze7KRbA!VSJ5J8D zm7TJ%fIz784!PdqaJ737c8Kd@5ETJ8T2%j>EZ`^c;=Q(0e=Ql}tpQXZ>Wy0*hRNz! 
z^jbi_^pv0m}XbMc?{Uqjm9qaL{Xu4y84=JAB`{?s|F+q|^2>fszEF5J#L@ zJf;HjmWpg@9%bg;2-@-`-ZWp&zOgNNo*X;KY-A)xY`Da?)yVB8%JI<$0LM(sE~c;I zwPsmdOzy#l)l4KOdIs}N_1DQr!20jc+JUHIE9)nc!cZtPf^hm}juu78auIKM$a3lS zc1At%Agz#)(L$W?GZmPG%ZelLD+#XhH`8FdjehC9WpE zA8QPdh|44KjSu(x`Gmo^RlKXj^kOgZZ13yzH<2gJCiSwYk1b(zb-QbmQFzGr5MsL{ zsBHH@*jhSK)>R`?KO8G$lU>bdz?v-NkenkDSe2J9;?qNXiQlFfQQ7nN2VmOze`INo?Rm#93peQZL=cCN(hcc-LIdbt}Mf^XAm7Xm?ec6%m`s zC##K6UXU5HtLUSL%>h1@F2E#KcBp(~$Do(QB}KWW7#N__EC!G&`N`C%|7O+ASpdU&Dopr ztq#@1ZMQ4mt_!BtGbop%-_SX-)XZ&$x&~0o@*c@-_x-Ch#PPq zvjiGhvu&#-_8if$%h{c=8i|x;)Gpn5944Ey%WR1Fh`SYz_QB094^nv3fjI%Vc8y7+ zGlTYTU7jT4bNX?!BW`XwlB&Hw{a!q(2}t9~s95W`yYA7~h`w!!1>9T%)Ms!9Vn#7E zcyb61eYZj|kD}LWhOFR^u^uL(+aA;Hg3Oxqn)Xwi2r;9yu$T3d0=F;uFgR}VBHAN{ zcV>MpO4`O5heVA^*hZ6+q` zPs;rTqymkiQZT)9Xr6VJNwgLl=5GK|9#0J>o{y*d-T4;T&yO5|j~?YaarJ#8&DW9vm)&O$!V!k^p;#w zmYn|Rv->C?aE-1{d)Hn0m6^qL9+;FCcd&>w#4o1GF~Ltcz19jcbf_oNx=tCf58;)v znlgCVc?8m6&KY1~TbLif>C^MsA!!Dg&*$=UPj{y$``698PtW)9dd&BGmwTIbwaPTn zx2D&ZW|sDh9yWfdQ>$t7=dr^|1B^U%qRe-p)2B>2`eEx%Y_QV(p)EcAWV4%Uh5HKq zq2`XUwqVw|-UyZ`j;X1zQ3GFJ8cIAE(?mpSV{_C17V50oio`TXDd)x9+7jzD6HEr9 zB_I)&qYH;0KpaBaGU%v9$dtqJTLdejGXyiN4n{w+!>(qL{k){yh`TBsavM$}A+*jt za}|VS+`4fGWvyHxI2-r}d(=3F^tBBam0gAY>Zp53%pEoqYD z@d!xMuAl`uoycN*v>M{upofZcw2ud(7zcHHa|Y~Cq5d!@V(`UDHzV5fme^FCgF?zX zk~%aI1{6?W0Ofiq;q(wC6Pr>K_k8cb6Gp~tw&6-!2MYLYWUlFZcSr+0 z?cx$t_G1eCsuGfHxc5k6!?y0I9R?u##Rck^lrf;IgUd3yZs5}DR9byvXfb_KH)I53 z%FXnzyI3E)Sph=ye@Su;mJ10kFc_&%c+C?>)$r!F*iuSjNevUUy34c6LF4Jjo&}; z#j5k|;rQ1>Oakp70$XuED~|~oRQv>W9ny_HR*%RYLhcfoV+n(Nqw(eN0uUu6-KL_$ z2urYzxy0SHzwp&7Wz3Ul8i>vO&#iBzAEc>+&J~a< zNxbCB?#yZ}s`V6re9XoU|`OtpO7`urB_ohf0j~2x@R=#V|x^-qlTcN zSQs=qJ77x$#KVNqr7?&ordUR3bHW(ex%pj8P-Ge@>4a%dcLMkFgSW#GWe~oRVMcME zptN_QIg(HT=9R9>!%Z@+t@2MJsTs4UkT>@_tJ#N7Wug}51Exn`21ATiY2K%q#_ztd zTYySCKdm@!l@Ys~+56e}VIwn22#FAj#OF0hz7c~4%-;d!?!vk1Xax7&0?C)RF`5-< z)jG%_?utYmN9WeX^3x*E1G>X`z&B-9^J!svM5(}pdhM}TfdjFjQB+r~diF-5>uM}MdiPwb6OIF-w|2WXD*&%7<<>c-p;mK%f>%-`h 
zYH*62)HoMDQFX@4Z*_kZ)+%+#}twE1hKB%6om{N(sTsb7pAmY}Itq-aH;7KE}?Y zAl1R|)dd4j1&(&+#M?PJ+MQE&$@u3}EpZy^L`^juh6I3>lbb>rA?s)QQrx(baK+aS-R1)UOX zaw7qSqM~W5}l-p6#3S)4tNOWRBTv&nE1j>TT0Yqc5 z^_<}nOPEsiu=f2)&iT;ASx5J*;a{@K4(y|iwIg!BjnbT)vOr;tU*+6a3EdTTt8}aq z)WDP;67Ko3hi6huTv$&imRF(#8n57*^zwb|w9fUirwr0CKB2wEfX%U=OpQ`{i(WUm zxW!%%1D~0@-O5+3JUQMQB@saA_}!VC_oI?$l7$m0xW%xHj~9LkCEtBb$gr2SAOB5x zhpFh_b5ZT6@sqe|`nyc^f>9_1DkjV(2KQ@n5MvOJZpVKPk_C|)*?8DJaUbTWkq(E6 z@|m_y9ITKm^6;)a^?9@Whh%_yJeQHfi>$MQmo++#&5X$FIu(S&P6Bqqd*y52w@n29 zoC=?ERobJU=eV_96alTOT===ENQ`zgMc)H8<`k3tso7?Skth@i2wA2UE-9F7K393Q zHQjmk1&|bqaZR5A6e`4mU+mzfV+AJYV^v2mweE629Ia9u32)M8w7=n*G(wSu#id=n zbn`}c%EMZ+NnzbJ@IhD^I_@;$w#_v1Tl8zNeqN)s$!rK9LFysE*un1MkAD9%1yiF7 zRefc^Gn`Rltm33|X59Zu@D`hmA^I%3_(J|a&v=JV1~%intd zb+&BAY&H;ORJca0MnQ9f*I!j0j-YTZIbVK2s_o+p5O2>xl}~nD-~iS z8pEnp-9!FMCP7{FWMIOEAGjWy>8-#&j-m2KI{c$^btY2kDF~iHOL~}&3zmmLEM!Jz ze6dE}iVQ9hnuAU1vjUKwKKQl4-zE8fqC`Yn8G!eRrRJ&@mGXd6&d$iizeBqlxu~$c z!Y)l8kta9a`B_piF48m4mvEo>jv6nds#dMI9ac4virq+uIs3Sn_du;ytn6pF>GNh1 zA*k3v-$2cz270+KryhosFSLqj_(@ZnD2`97(JO|W9;%JMuhPjTldX9FRnoN{Do#Ks;%vdZEUuH&Yp@+G%r@LY$zr??{p5KX~=%o?;;an zno$}uufmjV2t&`)=mY)n=Hvstwx0QXPZButjTILle#Jk_$?W{TNzy&6PhYJsl*Olc zQnq<}tlGGyb!2EhG?%(BTqa{SJztSu3g3Y^Pt>9Dz2>fb5gj10n)jM33PAGiHK%`^ zGP_?}7jCs~&*h{)IbiB+S`7T}G8Q@04Ma8tufS<^o@EtIItN5@jVoRWbQi0TBc_X7 zhZ59ZbFF%B6Gv!z<@9#F&W^}C#;CoeBy9{+$Va(lF3Wz&VhE17+W@d^bGAHB?i9}@ z=;A7cvDA~(wOZU{_gaZPWp)pi+&g*Yc9f`ay0{DsK;74@!Cfe$Zq;YFca*yXTuHS4 zmbl{GaGif$BB)4$YK+n3@1T`uG~d1{9nX)Wmr}Qw62Nh!-$Bk%%D)c6aar))ZOsY$ z3>yK2cx-#-@H@o_G`S^?>fLdEJG8iqjt_(O2MmE&j&E;mI=nsi1?=P5No8s^NeCr$ zq~M8x-xfhMhB+Vp#2B~HIOc=N_BnmFbhfdxRCYDGPuMN(_ToA182AR*OagQLFQ@Q- z)Uip=_U{>*e@&P7Z5`dK`^TFl3xJ@ z3=!eLm_E#a3|R+!TbKFMf6a1v;**6?^>YmWAA$nAn3+1FkK+8NS=TMJ@12$F78AH~ zP*0}yzX%G;Jd8boGrAOf+oUc&QKp^&YmmzX{Dd?aFYm`T8CNknKI~kc5mPeX+(R?! 
zaL}sz`0_shg_4_h%Zg7H9R>uNpEwVU!R5=7J+bl+WBrjG=$P2tJRiaO#Gy?l*J~U=3P58 z-)v$6gnfI0e47bapVwoT9!_rV%%1N{p6$>5$B^%@B~ERfY88CE&clh6qtiQ2`YK*H z1kZ0Ze48Av*L?@LSQuhbQb5(@Ci_NQ5s>y-7nq#zL}3c6>-E+FN?>h22wGY=JHyg`_ZnzAx1y4y z$FMj<*X9B)rl(pgXCP&1psdFnXbns*h^^wdG)#tp2kr%dFO$3W|=J7`@ zFI1ImC*|A0w3k$;b`jg*Kg~YxRiZy|8-M;T%Ca%%g&l~jUpm@0@R4>6H%lc4#znLx zQC3;}6(Y0N2i{yx%-rm&LnHKrFh_`=9GCan@HRa|KA;sYkjn7q{z_|1xWgEjex;P> zOI8Qca;F6eRWgI#GK18H>MiVWpX1Ije3nIuJ8a_az1x?eR{olMB)pa`z!_@q^6tZU zSf{-|t8r}RhBGl?BlRc_QMJRZTJy)ZkU#q9dv<8}gxFSr;R*<--S*upxCjV^IvHns z+`1C0dF{~)#B1ZGi1B-=v!bwprnd_lzci@&yp!V_RZ5^?IDdWpglZ1^yGJYN?@x!m ztO3~H^+FY=@Dp72&Fey3TcXk?WOA`xo&|>znL?M75EI2ZVN8-J{X`@>0{6b__#;8u z@#eBg=SoSCE5F_O?ZyMAzmLvmA~;yqR^#eO4^DUl;sHe_EIhDh0heyHtAVO03~LnA z3tewNtDAO7-Q0m!Q`DNyWPd%ol-5VknrX5HRY_c>X09qCqaadjT!8nVx3Kn8n(Mej z1FrIAo8czbv76Ri)4}Q6zy>az)|8CMrk&6Zx5e8UtLqaut(xYspqYBr0-RoQK^bdM z(=X10HHIrtECt4#xCLp_r)p-eoQj9LS`rp&l)aoM4k3eSdeT?~DX^)OS(b_hE3;{q zKG2?IjTkxXB~Pf$B-$xj9Nyz-Ou<{pGZQ{?qM}S#J*F#t$`g2qTIB^o>&H$<{j%pM zWRKFaiW6jcEVEZ4Jm<$cF2>)GjnDR|v`b8_HS6q_pG4UBBa>WOE4C>MRjZgyLBF2W z7SX?zs|A<5;4qQ(i}V=zFtjjRtO?#uo^yLXY9)0eLia%4GAvILuLfA4SelLa3haF@ zGA=ugGM0%ZOyq@C)iQ{l1(zhI>c(!FNL^kQMt?wCt6w`@$H4N;1Dat3l`Gn4jM2a? z(tTn!G@>9v6*p9{s^h=_xKTZXOFfmo^|A6LRK>oD84*=w?uSf5X%iCRletanP8wo5 z?0!bSav0FI#=O&9*j70iwqz^1Jpp#U0WYz7MUww}@S2z7nHPFU%YNuuJYj5wbY*ZS ztrFUwXuZ-PyejN3+xbcR&coc;VONZk^UE? 
ztoYbbgy&>X|bZm1m5;fp^eb1lZ$+p&wH%`@hunY$UKXV~ebhjfh7 zDP@g+0YE>^v9-PUWvMX~z`T-U5~`6bkYuvs>c#L+(><Jq#h`UE<%z{+#a*qor$jy zmehT4<@IMK3-Tc~^>CiE1+t3?H23-!29-)6CW4$;)Uo7OjU;7AtNgVE?O)sNsB}e`kfxzruTym=jypo|^J_(FO#ZABKZycQH~DU1HpoG)^5yarm-;nx zSV_$06Lgb_gloZR|dY zt9PnHKZpQfOJ{;X^FVwG&j5&30aFjyw}tg>2x=?$e5hC2U-8 znm|(7;)<_Aj}PB32$KC8NO^V zxNo;@z*4#MDt1}(y(4g{m{+Wd-l4fj3>~}cb%cSOO@72cDpFSer3s-yvb^p88QGh5 zzdw!`*3NIrSdy!cXpzz-1;f{y1V%YnzKBC#VjD{Hhad`%WM<=PViv(Bht<#!AfO|B zJ@=Z|m4l)k6Z$AJKYu8D`%N?4ThjV|)$S!WdZ02*HtW{%4DG?IoF%=(vHP-0>)zlp zvgy6r64S_?52TfutPYfRzL`%5)g;+;0ao;?&7k4$9O>{HWjBzU{EDL}{j(AVMfG&1 zDXIrh!!H$<6*0kLCgSnNiCXFeFCaSZEPKk7LZu%BM-m)I<_ouzuH8+;ieuO>y)I1R zN?1OC#P&;dx`TNKq*AVxG+vNiN`Y-bTa>e_yClf<`-Ap0lThKyv7&27p>PK(YeF=T zB%->Lki-CcYF{$Fm;`9%xV#6tdH)k42rMaW~fBko}b8h_eBXtC>bdKWGhtZ&kgBlvKisEskeYvYZ z*wm19TqToofd!+T(4}Gc$QUZPU_{EwDI7Tr)*InnNH*Z3?W4_JHIs@h)&rVh8-p|| zn$Cso(*Rp8rji=`F@@KJr@pv=LK^7v6Icv8IEPz!Pj+^?9HzQ-i{O53BabI!02dG( zrS@+;?1P^=t)mN`m}e}0cu#0$v}-83w7a{PkOc{J8|GTKr}7262v=>7V25;O_35s9 zXlm*1oSJ$)#@;ErGVs~UkI?)PQgJ~ea;z`bt6X;4_i$5<&&)ju06!}pcH_(m zAEX7pgrUHpQK4Z>YQB7dr1Kvgmun3*R_~QGjIF=OZA`G!q&SvjE5&MjTZ3r!9Fm&O z$8RwcdNfAvH^ z1$V8UooM5K(%WO5VcRlLQt06VDz+p`+$S;lKkfp{YL@+ zCyjRhALsO0{}B;^j+x>ArnD)=^1%S%g9|-p?&;_v$RcsdWE4ZXe_JiuWis2{5?ucw zorDTr2p?z|fE>V*$t#Q$z|Ki1R02*r-f~>{4so?SE$9 zvc($Mx(UBP%o;fo5!7X1OMhw&$BDA{N+ z08Lf}&-{SM=yT=2s5`?9>NShn(R27q%O zQDGVk8-EfKsY@Q6$6XSXW6br8rfAG(e3go71R};e3ugShn1N&f<1wD-kd=K|dzCcm zEEy~*)JhF5aw+ZJ?7}b}w_BG$=gN$|-)9=S@}|r5AOx!{zk!&0X#UtYtJE^d4WIyj~f8B za0InGd^4pMTwlFQyhY!omz^G(Skff-Uv%ki6?(5_|HBQzJ#dPF7e0sD-BXzZ2a;Z@ z3;1Q=_Sfq@M;h#5ZVa zbhQ8I<)Npe`?t^J-y=iu10=d=sx>PA+d%Vw{=@~s7CcDJR4_fVR(N-8+ZJqJrdVL( z^$|QD6>LAehS!jl0kyfF{rG1N&rx$WnhnzSQhegEL!O26g&uY$p2pXYAbqO&6)F$%!iLICN6u z@LXCawNv1HuWbI*ORmqfw6MIp=zM#J<>0`DN#LPk3T&QpG1rJ58kC94hze~tjXD3 zaIqmHC}z4)z*{;9BLj>`z4wYGkYm;OdQQ0#A(F$WhkRRiT<)bV1GScqEfNkPbBT$+ zM1k!gYpL;rSEC9taRgQlw$Q8CQ7p4~^hb=kU$-*ne%>m7HbJSe=)9nq>jcB#3c0wy 
zVZVis0uwF)f>A%0Iol+qnN^MptnfsA0KCxN>P5$efIZJef|px)g!J$qvAedG6tL!~ zh&?!_`o&}aq$KIcbh`GtCCRGF8xyJv1mpz?7{KBA;U@dQ64PYyc!@=4{4&`jjo6NQ zjN_TOLXGllyl03SxY14hP{lY7J<~AnNP%X5&=B{w;1I{#0J@rNx)e<1fub}eR5uqM zti>QuIv&crcVhb_ERlL>sQTUOw%ZgGNtZ~gAh&dO^?7&~dtMdbcyQLED9=;Che`5= zUmGItihT3tzuItxzdEBlO?%!C(a@`1r$u2iR#N7qyb~ymgQDDYI(l(?J3)1|L!M8E zMmNS28p|w%REuxg7@7RI^~)1Er09{J;Vcg0uc5T%VLNx1;)seq zk&9rXOkor$P)`&_N**o76WNChZTPb~Bhg@&#+z{Rr!mj7vKSJTkdsxlgeV^P2HWc2 z+gA~1F3?pBkVC?Th4IDJwo}*BmAx`CoMl+5L(NcgjWBfr|IqhwvG<78)CDozc_v0? zpGRiDy%)r&`?3O)QJQnJg!{tXw~|+;%ia=GfE6H_SWF51VOK!aTOU&dSPUFx05D!* zs|K7zzI_cJY#6A;FT^$EFoafyb!TVEfB2RPc(N!nZtLq0kcJU0(70jfTp94UbUKSC!~!y~dYclA7haS2 z)sf)f_wAT&oA2}}F1EeU?iz*JS0Rs!+f3uW@iGx{7TBuTVD_YNSp?fMmO~ik*?pQ0 zio^U;{P941h09!n$^H18qgKvbv?rxJ-Tgx7k@1_XPN|7{sqFx#dnee>AA@pVX%mpr zUq?fmhv<$t-B%8ysf}3KDtcP@t4oyD+L00xC0K-=8S=B{4;T9~Ws_AcE2vcWFfjhx z=|bvs({DfV@cfme;~O}DC;I$CKN3MXdvzp@b;tII*tXf)l(GpF5oHO1=)JikoZ!@bXJ%^-95_$PV@@jEL$(-FoP73Q?FzJ~ z&=&Ic=Kx_+(G+Z5+b-TyVx^d*QzDUeA~!KUBDE6_sE{LFR> zx2`&r@7XhAq=42NsY`m7Fm%6)ZtBv4tI7)qziBZ~k|0(c1>FuB>U7!Z;>1dl8Fa^y zHE=Et!_^=3v5+DXHE&j5OC;JP5cuBSn$+tU9LsNeTK^87qaOEUG$A6ez59@+6?oC95RO3ytPVERsp zFLZb6O`1V2q^91^(P+vlrOUaE48|&4YoKvNhAU`Y%2<7MPng+|sOb)*UjUYk>yDCV zj`s){B%r2p7I?aA@p%t5^Mc3m$F^L7_noRlWq%JA8Tb;$dfne4U~F2~ z0cdt3l=Oc|tgx7q8KC+1tbW42=w#df=Q)Ix^&k0o{}*%b6dY^2etXBZZQJSCw#^wQ zGtP{i8QV@~Y}>XqW7{@%)_;Eo-`Z={uJ>?Pb#>Qs(A`yiqn~SB;~u~Nm&=WRlsqqzVu&cNf-hhExzeA0AQSP8O2!pG;IQK<%-`aZL}(6T z2=Hmlw{M>`Ic!@%Y%bpVumd?x@)&HoK;zO?{fHmN z%-n+q)vrI4<8|8DzDXN*dFM-PK=bmovlwK+59%hyuZm#=ZRU%d-y_B*Uv=-2q(5suSTgx6`V}EhH zIMc6HVi?ugq)bqxaPb{-DGpx3ESGV-`E~h|npI?Tu%VSa$KoAWP00X>nl1rHjwYRx z$7?rZX4zFmxVc3OK$p!?<|+zqu~=-xkF`a;vejbN*Xp7ccX#%I<%&?E469017eMoswI=Jf@NqsHKHIy$1%s z%}XV5zsmYk&Ft8Md|GN))zk_Z75(`w{n`0|rK<7I0X*TdK89w5u7C2q9^G?ipH$l! 
z!}yuNhbt#VFZ`i1h#X#vekjS;N);jbxQJKfqxQQ{z~V$P@s=5}!d1-Fppt508(o5q z@+0ZcqfTGiU_y`jC6WcyNr&4Py^v2L^PIl8A$}&#Emh_wEP|{);F%BhveZs)`( z^8s!W%NF}@L-mjHNZHs}{%xpKglxgSi5=75#Et?W;@GE#GLO{HFKwQc%U^}F(VV@8 zaQWZ-#a)U@UehRs!rRi6p~OS}1IQX671O}kVNkb`;a0fJjg&>NK$ER%(WeLRDJQDB z?Bg_^f=wdOOAWd(0(_Wu|CaMyPtr2L$hC@LwK0#y{vHms8;eP@Y8@*S$ACpRLV-rX zqJ8YVeX|S-f7uz~sj=y9Uwdy)sT=NDXcc+Jx{T~i0a5vhv05;Of4{l&aVXU5VZSMy zk-;y0IqItws&)oZHs|X=`6cluq1q3;&QB52utds{i>O!F20!MbTY>d7rE1b+Vp;)v6w_IW@-aj7Wfy4A|yI? z44Xi>lDxFpv;r0N9%_3%V;4s?ioc6dgPO(Icu{=viP*eK)mda*BC13}JQakx*&+pd z(Zql9%{;kv^Y%B%!itDbCb&=b^&wAv(nW_U8mDob;mjIaTRu@I8=R@KTI8Qui7kAM zN-~i&$HE47!<__nlw>>ymQ+<70reAgmji9rX9yQgoU%&~B5$ldn34C6(k z3kR@wjJ|8CVa=3i<_8M9s;L?z_A^An48bo7^-zueCS^7L?*i_|63d4 zn{oFWS;y>g6KvlYvqVV1N_C2bhl(c}HSv3SQAw1v`VnT*yx$OL4(i`u%zz$Ya4~3$ z-Ta{-vi%hC>$Y1wc1hY=?e2oHcuOI%OudzG7S7kI+@?RZhMw0BXR5)kJJRO!`SbqD zm#w?c+)XC!oEp(r&ccg0HMZy=lk2v@q1(WF1VX7A)gX7l|yC*iHatiVWH>yYZp z{uT3`pNOYvF}Z4Gr!x788T3n9ZC<&G>splde+%kwovAXHRx+{!u z!xhm}OuH+z`>jDjgdvb6-aR2beK z^P*Scf1mmV+r5^T1__b%R8RFR-o{qf|DY{&G-gXPy$qv5g4v-E(Q%V6@jDMKC1s3rQ?3?N^8}9`WB=~@N0pI66=+CYpby}a8nH1+_WjZE%RzEV z!w3`cPYwV$`vZ{Z#SHzfgDobS$}lfYY!Nwr1z5W83uA#QN5^D1G9+_%gENW>4=-yD zpb+T{k;QQYt`$@2kY`FdJ|YN-udcHC2En~kRIEkrl&n+q{kdsq1ObmV{*Vrhj+C4| z~d()v)=BcIQu-5Wg>!|estY+9M7)LOWbyxD>wK6{(>>Vi8hd}G8gBE6Jwyk?g~em z2$GiSu0RZUBz2;Ofz8j|t(m|w#s3uv3Lo1uvN5GkZ)gO6&bi9aOj{kbNVDGHWg=Gd z$Iy4Joy6nKLS8S3vY?1wKx$}LI3IuoUoYb?fDfY8oc#r5DKJbre5Tn;E0xmFk_sxi zZmLEQ8LGI9^N~>;hx;l~ePCi_FhoBCQs8QryJVJTKi+7RAN0<5f|@Y$%)nT^b1!rp zr=>E+S^Zc&8!?e%$srq%%Yp=SFLs1QF}oqnR0SePLiMbWF|qCq8-bA;x$hdthIJ~v zop&UGy+*XEjCb30FD{dxD^r80*tt}9ha~3XH7i_LXa(jU$8nnCMVHP5?&i^t`2}|S z#j%{0DUi5X%EAj7h8=A|E;^l*+#1=w-#d^eF9q$$T^%0PX&PndbJ&$i*fa|g%hRWK zoN76p8CDWpC9;u58@)qQ0Tm@)m3XfZn7VEZB`SI(2r57Xd!el$WRn;&m_MN_1qH@? 
zc_VzF#Wl-e?nN#g1zx2pH05Uy=ghcF5zOm>9xZ|z*=u{aN0o|_R?`>tCR&in)Eq+t z0+gzl_JRpj*x-XyDu7H-JD^CsFPZQTRUm0DazMTP3Hc!&HWgif;P#OZ#xJt+vwQ~1 zhg2xxA4g!J8RM5ptGRiFXEDRBNuuj;^}&Q-Z$qIolEJ){+=Bk73KLc(lpOCyln5F) zPugm`9zeLl6p?4;t0cN&anL+?rB0`14k&|}AtZ7OOOp`~Z8OA928f(G(>jwZ6V@iE~b53VSN59t@~EX(Fab&BznBpX@JRbcm?c zN{+^aUBz`rBEoGgwAN_2SS*=UEe2$qEKmWP7VpRnJ97iw+zD#@YJu>%H;Pe?;mCnX zTM>v_5m4WmRb(U);r;E&rlb_zd6gPRr^w(~*GBMR#$}a6R9K0jej1K}ye+;$lGWy_ z?BZx66ktmr{D9z1y zTAX^(-Oya{ zlVo#DyDuuE?}YhBZTzPhg#ix#G{0v*ez5Gd8k-aHMfO}30bc&ki!eHhcqA#OZl|^q zrpurr`}DAW79L3W?s_X%xV zFX`5q*zw;KmtOQ2__kGav+vJ8#v%3CxNdDHcnr=xTh{PB-xl&c_D3FKKO%e!1KOJh zslaD0LLBmq%9(!qh!AKsi()(9jC?}jC2y%dP4ejHDU};st=EmNrVZ* zFD~@MWdroBE*`Q>l8mdgy{9*z;xG)_tN`W|uDo?5Ny^h=%~X;N7!QBB^Qasa%m6X= z;$soq_xA-_S<=bI1xdq0$r|K)SIF>mx`6c9@0PGO9cP!PV*WAYP5jQAWF=P3o+#k? zhHRKJ#59+v?|u0uzv``?WOgRDSSxP9S3{`6<50rtrs@wKfmSp0UMdN9sx-}lL&~2n zz+XQwoWy4=|5lmC8&BTdp6Fsq~NMl2cmtcw)40UYXWYB7*kG?;AeY^$6^M)9HfF`0WrT;at4+z17z z*znhjs2S7V^{FEj0DMXQ!iIZ$;gQc)>gh7M*3(ar<7)^d^KoQTC7S9Z7|}m--9t^p zW8qR?(f@580LK+%&y3tkY&K_VA94z?dS$dHLKMYXPu(HJE_WlN6 zGAA`2i7uLyU>MPN@>K-zXUKH2!z$tEQwHa$+aDd&hTg*uj#(#ZYKAId7Y_hku&KcI z86O+c?~*OCRBxFhMJTVV63-^PwWDJXD0k5!L)3EQk*gau+vlTI655LVsMp@!(a7FN z0nmova3sXs8TS)*pEO)nRv(}QgvQ&m3y?<|ptK*K1DABrdW}M|ar>Ty;}eoh)4)$0 z(p}1tQs+0M=ub(&Ez5O!-jR{^tBsZvI(7asF!}U#hnt@I&%0Rw*FWq%{#zgRUnRf* z(f^bHT=iINQfaoLSVdrU$ALv3K>3`dOiIjVaoyGFMlgC*Pxb_vo+pi-Clz6W0s$&~ z;#g6Oz?jVtznQj8=t`|-#0Pn3y3-3O+0x#Ah5=lEIR`lby{0Xb2g4&^UO@1oyb`cM z{GbGQ8N3C(ExzGSS&%Z3fLY+F%$48UIf~~F{yKHum0On|y(@lRf`JPIc#^xZr%n{Fw0(%yVlQmG^^ z2tJ@I)MhZH<7YU^nyz~H0rov{JpV7h8Q>o|f&U+t{r*|xA{*_l))r)q(@w-C0F^h&}llr**q{9FNEYr$UO8D zdkzS(7a(N*TONC@l{&9~&gcd;E!=!VL*|7Me3J3uA7XEZtI%=?gHeY6444rI!PtQ% z=h9l;a5Arf1 zxyZq|)*@WJx3R;Iq$z7fjWL50R2jtsIH)B;;;ZaUq}suMIN8g4ug0_mY67_3*+ZeR z6uld^J&%_H?+{Z^MuqfymBu4$eRs+j#GVC%4`%6J5qX1j1xgAg4q;FGj!~>t5lEcX zUcKO+b0wyc3ice~;3N9Z^TZdWPV;cj1*KeM4}xJFSz@MRt$LS9&8GaL)J80&j<^DK zXic(8>yiYR&ckX{hWq}EJ?{uo*+{vhX&VNI?`*@O5w|yn5rQDa7#+3@_okGq6&;$9 
znaa`|;K5I?t+9SgD+S{rYElNNs2j&i4r_;^JinV(!yK6f%@|Mm2y6R>acS9+kA439 z0_N-OPW&$um+K$7OW0WdU7Ayoi9h)N2<~OqkhoI`rB(VSm)-c#w{v{Bc`(VbrZY?F zY@-ZEp|K}DBA>^9mI^svJS_@aQj;?ajs2xvlFnv!{Jh7!?fDu@ON`ej9{j3e#=N}< z(P@fPgy!ZrJ2>HNHrlr_*TE5IJ-kAL5N9X1P6=CJyJG$T_l}~Xp9MSxFKOcMFxs5w zy1ALu3{r0>g`G|j9$#}SQ^=AvcL{H|j?X~QwcjAsX!j+z+YDf~JYtelr` z@_+D@8p5D4ypxW%NfKc6_H?WL7q^1V;>2V{f9* zL6#et!jsF%^w_>_JIZhw#;OAm1*k+a?o7fiOkslzU`w{ckgS9^;aE3}vu{B-w@u{R zzaPi91`FDNGwqF4g!OW#Tjl*b_y$?@$Y4ExL`)JILLQkP?Bn=n3xadQyC5w$VP-x& z_i*ge@k)SG08cXdp!DyOYDKQVE&Ho(EvD4!Jwm#fPW%5h^#8~*!Or$?`@GsWdgzP^ zZtV=M!_RymM)HK$eg1@`^Xm~fKB1wUhy~i8$mUSG{_uOTCF{TqIczrdR}jQXnm3&YXscq` z?on&OLZ!j~Hrf2j4vv&hhelM#kC&)lG4R3_=WJSj2K`xuc+LX%OS90%DnJ7E845r) z)CTs5=Ql`SQ3_~q%5ZLo`SS~Z{;9C#z97$Vh}|xT28b0 ze4k>X>6#qlY{J%)mo(ol#xU}ZTSERjKB}PX3BJSj!3WEiX?vHRLM;Ipk-7ja>7EQ5 zX{(poJRZ5Yv;&DcGcbZRzWYVe)}CNcV6qPIXE^{ycKlKXvT${||Ej={Fr_Xs=0<`z z=xTJbK1#xb8w*rox!s?0DrPb()EoKkq+t)Up-l;3LruKfsL zIzYM$1hAt4rYH7<*_iZm_Iivsl!Mto;?Rd@e!9=d-dx$ahmqmesL8P<4rH^Nw^#fY z(vK3wx|5Wj@SLcY-K)OFgeS@WIn$V#{*kE6!o~IP*a|(Z7&3MgpK61+Y1+7pw^(gb zH5av;0nnK_eo~BBbwADIq{k!Fsk$ul=i>?PqGC_7g*^xE{v81Sa9l&=+GR2E^-=k+ zYYkFK>%zvemz(Ad-(UY*Ak+0(R9^{ZHquG`AAxM^TOfP4e473i$og433T1!M8u;K= z+$6=O-z7U8CKpG&-EI7xJV+7nUE%Yd%rN+Dcbc{m$0mhg(olr>$)EAlDlR{!61i?-sB+X+sKxi8h zi!aak6~w6BXj{NkP|_A~K0`(H`-~~_I^TGY$PF(0fTY{OgT?@Zirv=lFa>?pt`Evh z6tWBYkV^<_Vu`oYl)BAm1ICRpAGIq>pXWvxD&&KZWJiWYGAZbeDHo-a3V~x^V>n17 zFx|PYosQ7wO-wh5eg>x9%)Kf;f=J2@*IrF@7RqaG3>kSvg10_Ntws_F!U16pYQ8i* zl#OX{G#W#Nco7fduaIQnLv+pd34Ob4;eYnyguij_G$X`0 zeUI<}_@!!;DLJL<=MRU+UFZl2)15iH5$1@&!8liv{&Or0rFEQf)*!@(kooP@AZTOB zhDK%jQo4-sm?kBIa`7jb@k{H9spC=f?Ak!fb@fo{uyt&BCo!2pG_rmxc8vKqPjM3I z*ddm{yjuXQl=A{alJSf*CKY!3cIWXF4PI_T%pMT(_`HZDXU32bSP&TN&m<8pUecM+ zq@~X!xEe0rIE-WM?o{@)8txPNIh{x9UdW!Bu%?@S(9I!G6k7A)6Zuw7@T*bK=qG;A zjJUQANjO_|H;$u7xvHBgF@vX@D%)=aJpTnHNOr&PSsJmUm%Gn>s9PW{ObL#@qSID zw$(xz70EvvQST(lnz^|^;Su6siNd5rA|Jwqq;uG3b@oVGyB$t>Nc2E%tBGYitM_oi zmJ|r}j8xFSI2LZFUoiVU9Uoxz%)nYbODfuA_P3qX^R 
z{_rNeq#SwnXVmcca2k|=IWdXAEs)&7% zY}_Pu@!2vfo!T2l8q4&h8Lju$oP`%o$Mo!C)wIfILA5e$mqJFer$E5Py>gN)2|!m& zv;Cx2VLqq${O>gmaLJlv=vO5VmCIsue;_^JnMI5^2U;&W0R@kgQF6;+DuC05E?KCwlYP!pnEhN zaUO`qfqb9060j2PK?`Jp&N`0rFJAA_Z>x@xAT&F>1hb;;@1bxT07TecC>E;NgC!wI zFP`23UQ;wKPPMF`)*3E{;9GZu34g~D9pMU95bG4>*)>>{!a*mjrPXmvny#E*g^ zGVG`1X9LS=dXOum$W)su{Gc-A?pE%62(UDSQ;8^A`79YTZ}E#ej8(0xhyDXOSc9;J zRJBcQq_&iDuqs7QXIu;n^s>W;;n-xIT|?D{V8@T02?W+T+V7s3YGNfkYx=NIG+kyC zSpM^{^PBa*N5w>^Ms6iKNAkG~E5mWAe7LRy$k-3RdxZ|-STUNk} zzi~#x?#JrG7YNoxpQo!wOm;EUas(MR$jJS&rvrq*xaJX*R($je>8=Q&<#msJ4x6O) zHUtpqS!m*vlwdj%u-D}#*%0sy%=N)SK{ue7tmv_16!QY&ChZBDY1(AhLkfSI%eQf0f=~k(q z>-ZbHtLYE%W}~9y$0a{ehPmlLc+o(!?;hme<+GAV6z|_rh|OZR%yDTeH1VV**rFVW zgR`6e{5o;(CY(VRIu~ar4$nv~ts3X{IJo6WROT;%7YJ%qJ=k>sXX1RDt!{XWr*}ElIAEz8xgq))KLWW}Jz{tMg&lhk z#)`=PRg7cScdnC4U^i@l527=<7sV!uDED@ZJ?9$8)htru#*wrO*4v^;IBJ zaN2c#`lZ5IlwE;p_BSEbuL0)EvF+vTsQ+n;&3D_}54v32jd~R(`&ObPIN>3@|q<=r=?^@f)po&_c2abNX9%FnCbJ6=O7DN^#nh__Dpe8XnKtQpCDis-#j-A# zvL87&yAhr4%4UMxqe%%WS#k!G#9DaE<4U1~)9k^OP%9TlNse-v?!@nl-(~)t#^e(u zm-{BQ>ZHaUjOV?NI6Kdffu^)N-Pa%04c{rYTE-hX;AF}qh~m0+*c0aKf*dcUtAfeV zuqPXv+ar%CzvG!(>-Os;1fwRK7=z&`vjKsqQn$FK!&TC;w$)>gf~eX+74pyw5x7L$l<2Tf-UpX6%R&^IZ```r6NV}7pmuJ`&|p2_p7L=j;Qm24dUzg7 zNF4J)(1iV!E2>x|#}(2s9g*scLL7%6(2H9qG9r-^v|XpyogoeeCVddTvDwn-B1@ZT zH@BO+XWt4%t`1gxhRFcU^fDrh8axSdpJ&4rymm9?n=ObKNp&RzIlb=L>Wmj!F9t<) z%Xe*UTv5j_<7Af0?q=zc=M+`H$(0HjbS_WoR_>IWBhS>-;&FL*B9_TN_qX3OR=$G7 zVS@IQz}ZE{k1f);FDt{KV1{%(^<|vwT)k**hgpo@w3cI;O}FHnZd<~`zHo?&pDUkn zsy$UP464S%5aBPb;&eMxlrm<_Y(h}syNB6uB-)7J2y=~}m}z@d-z!t$HYS@)>%p(o z$8%qR;%{^dL%36a0ymWsTbEQBURo^1cB@`!VGZ|<5+lRWZ3g!ZaJ)qbMvDp?gF=BP zGw?J05}wH(NQJWSQN3JS%GtR~#j7kJ-da8T`=%FP*JlV<*2i z*!#tmAU!gw6%JYOL=jT1gHOFRj7Te0plZj$x}5 z>Zi?wnNbD$@ukWntWxMOz1T0OLOh@n*KnZ%DB)0~rhPc=UAL2cjST~U1_UhZ?tXH@ zlWyl|LQW&mTpA6FUBQ*Vp`Xkl&8A=%)@R4&c8u^8*H_zAzQ?h2qbuYdM{5vR)ZL%1(E^`N|#Mamgg$m5S^zBJG_!kfs*lO;f(B2!i zJ+hKy&$auBebNOHo`J!Uu3Xs2T-}dErf9;1LB?z6*IAeG4kSmZwEFX2p^?e7ZFW_c 
zh&HV6YuCD4b+MEF8TRY_PK_L?>9O+13SHS6hJ?!UQ?PbK;15qfB*S1eB>lOmNtXK5 zXux3PlcVC7CsoP48?ZIRmacVa?try4qz91jZg#y0>_y`WWv;!SXEAjs=N%i9s&@49 z?^sH;4u$FUh-Jk@+Z3(k9m2_IJ&#mH3nF|si>cf1;bu&gQeMRL9WM=9${kN*A{sK2 zIRjqe)7!3v#pb~fWuD@BxxFfJ*rV`Mis+gOkVNxo2&FSA$p4N2&yM?1HnD2!hk(f6 zj^KSia|)B;Q7f~!%}Ga#<^#?eaX34Q=e*Va}R(86Z?e3&kFX8D7 z?8|+ek`1S$SawBFk&f-s6{jO{=M$f|JBjGMrGaP zAW3lq{Kga|_Vn`X9-gTp;zD-ZmHx-4rS;>m=>n|OrgFLeB7vxWD@CwNpN`D=vwlRr z8&+~`aM&YH-lk{QM2eddDc?B}6W{J*CQj(N<>{{CqK)GP2@A=!7KQ#Kdko?k%U7@&_*f1-Jd73Y z!&KScYbIb++(Qb$5%pf6KkHvJV0^IKQ>6S;kOvO%B(pn1I!EK6;>4^aw;_GwrU$2s zTJoe03gsbD@c^dghdo7?@t@#8d78qxM?);8iz}Ks^{Wl!4rzmx{tJydxNgy+G@a3J z0ctJJpi2ZYxyg$v7&SN}XTsvJd@?3B6xXRU6v(5$RO`!*m^B`6;T9%{xai+|9uZI? zhF_V^{}?BU<1#BIH*z9XmFWX>6YdRQ?me#GbSO{=$5jFA3k?x0GCi`GZ8d!vi1bDY;5>y=r$*S6UN0zoC<^7k-fk*mdsi9yLzu;TNbjaCy}<0n@lqE zVVH-K_jw%r_tU8=DNkpj%Md*cdWYF=*7LR%&P>yJ*d$6Tcgd(C+u}80DI6*SJq=xU zJ>Oe57cQW%xgh^i?q;m&rK81P2dIn-=9#DVrHz2hYIAU=mY;L-4=;a!2W@GT5`Umo zPZu?^d`C>H22!LMig7LJ#yx$2VT(8CO1wU3N&r&%=ihZ-p(YPqf3!CY;<~xFvIp{7qSBy@IL{vj>xbzqmdhM&5EU0_ zy1|Q_Xb4*w)M}F`HdU_k8t@yq#7jkaPyP0ax-$QJ!bthj4jlmfVY>T-b8#mp2>bG+ zh9Xv&|45UxZ_`ttMdJ#cFt{&cm7F)O%^=tM;@%%ffdG0SUs{&)!d ziW5Cv&7f*)y>8a;%M;4w*XkwU*9)@2x50l^|Nznuk8C3&SM8lt^kX%~?ECr$zT!9kybn5$pd9`!M8G{jwf*n6m{P zf@_!R{OE)g`FyP=^zn3maelvv`TD$`ZqE69l6Z3L)U6J8Y|oy~X`eq^nTrF<%V+Gb z`|^UYgckXsEiBRyR5H)KXDuru*Z|13(1Qkev%n>UYBQ51f9%OFKVz{PM=EERoD*6X zkPj#G_^@3O(hf8>1sJ{`8%1#KUS_Cw4bM^v;8@e9a~J+}Kcvj?6~`6m^Wz!$Ktf>o zY@Bcta%t#=Ln9I^pNLN!jHE=%Bd#xdcllI$`XcAINGx8LC^ za!GAm^e9dNk_x|}CgxJn&&MYK#KLe#O38l6!1gKi#TqI2+glg1`V0M38!JZCX=s~CQOvK^M;;5Re&%A5p?5iO_{1uzOgaQtTOrcOrG zJiDC|j!5z03eqNKI#7r%8pQ@{-&%+%>X(HQ?^7~jr|DsN0XV!tA!;uQ@V`B@*p1yf zUK`?xnyJwkri6Z-Z%FNv2FA;DH&e2@<@hO|I6wtXKGc-8nj(FwL3es^gQA9(oIUC- z!+;pC`<^vwXLYD;{>(H?G=DiBw}4$)pAu)3VoETP5=!dT-<)eRGi=S!-|l$so+W{$`eZt@5#jM5iDN!M}U?)g{$0#XRL7pQ)LzUw2ze&+3r@HfwA!KzFTgf>g_<;z^r)>|cE zq5Fv;VRa6I&|u&MG+1TP_RziLP!%|Qbzf{ssvvy%`4?&v_9>cKx#aw3F^nYyu$$OT 
zhHAcAy!6{!MEnJ*Im7Ih!~}!E(%>wM$BK;D=8{KYo-1x7D$6kxC^PmJUaa`cc4-Y zNX_B*lDnke7veK-EMberuDWuMYs*X=GEQPfeW}yB`HK`qt`f1|IWoht0!g4LVVx(q z6#^z2hDcxP2ox$ET;X!!u;4(2v9L6YQ2=miZYKv6(@9TYU(ceq+HWA#8i)nEqP5y! zKM-PX9B_uetc>m`RdDV!Ol*!Zns{K15t|~n6`uZym#E4@(dv@f1o3vbAIGA2oZNHN z-0XlkCDEphO7hd4?ECvsmr@$2bpL`C`0V>c|F+(guGSx=+cJND?vyEjOz2{Nu>s~2 z)+u#N!?yf?Rr!Z;@>Qb1$ZxfBUa(rFt-vxE;}$kS{Qvr0eQX$R_SJ@&Y-zp_oLb%P%*bzfriN>Yi+pF3XfhNC=-K=nsI-t8Cg}^JdGd<}@GFd9w+)ak}D+ z2sjyfVur_7T6R+it%iJ zk_QT5l@fz4DG56)!JRCj-)B~yJqT~>EF`inu_-q9u5^%?`MlU(k?xB z6`OZDF8lK?F!$hbY~?LN99Qj9VQv;57CD_(}`P1W2=Ms{YX+( zr&6|o0vKmX%TEAI^OaJ|+us=69qiL_7DO6F4=G5gFD zvBqA0Yr;Bjj>V+KBHNnO)PhHsN2QB^LXsYeX+`}Y9gFI&7qtNJ(g;SlzKsGJXAzAz zXP8EjLws?7d!;a^xCkjj>b*_^c?fI~-F=r|1ph0Hy?+v-z5id}B}c_f)z!RL1?F%a zqbDr)BHojQyByMYe)_Mag8a?qn~}Lqmr3}wxvOrPS&ic-Ohx(bPB~=$fqp?%w}(i7 z8Q?$jc^9Fs@l*TOgdD(A76}K#jxRS1?1~K3IIA_h;Y}f3eQdwG6px12 z0YjC9JXs^b)iG5DUM{z)RL9ygNDX~TMCCqEA;(y{v(B5HK3o%18sP5l(yfVCSeNbbd~HY#&|0!0nsgGz9$ZxRO`w%$A5!QL1?4H zpiJ=5&feJ<2L0BCauX@GX+bG7{oIZ_2@`M{6Ojh=CQ!y^4(_8b?V|fv?W6&z$%C$f zEim9Zw1G8wwAszh=g!4=iq@Skb6VVqE0l{C(|S~C&oh<4yfci-k^E0xiEanx(<{o#O#|MO-6!9!f_2A@}ud2w#aqsVQUJo4OObDTz3m#4VqJPh$rY- z&@;PYnNekSF zOy6H^Z_21(Y^CD-^Z$Ru^So2EbRfo7Us$S1z$7GTEqQOv30eMm@-}nDQ`}gRt;z$YK+~ zZi%1~rusgn~pV{OBwGU~Qw&t%z;|jn6`c2BOyjZ}vdI zfY})ZA|mwFgav?-$4V^``;82M;R+=V_~VC?;sQXRDM1y8t~vzasI-wo{i4D^%Kc*5 z?U5&zaNQseM-Y|*^zNZediik>9k)kzcklq@M?2V4DDH0l`BVf;{j~spa)1$~VF0WN z*!fOg1V}B?k0?JZ<1{!;9(s=pq0qZ2L6QL-+8P8DV=LwMTP?Ss`A$h z`#Y%d(6b~|Wt!F7#7b-!{&AVJIlS>LCdef58t2f!)S9SB@XBbDNXm88L^Ne-NYLTU z!J0d9@UN|*ScXxg;5?QL+^}a*OHj2Ub1A>m;?Tm#UqTBQJ)2g^Dxilb3QD-_BAdXi zbmb9xEd$d_ri^9_3_YJf7$14Q2^R6 zPhzI8?7BO=+cm$=J{7Jm?Jn)My7b<|dS82kB61jlD8V z2b(y`x8_X6l&U9fi=;B(xxv?L7P!{U5|q>Ak|T}PGa9+k*XXG6E$b`hOD4Iq@_wqL zxvy1dpe&jNPUUgnqARW0C|lKy)+wpl)vHUXW{_tM{7iOvZCrsbwXDx$ZJG4QI{L{$ zC1qW&WG{lqp6Jn}RjyJ#iPAP}iFamVS1&)Rkb(aF-la1(+j^lBp^PZ$ zzpFON!tJ9lZ7Bm(Vm9^arOMx*-ug2+X}d|Q*ravR_Sih*LE325M)7slEI+=QBO{UG 
zXC1*tf^>G9S*P0ErkT`ml4%yoJ_oI|Dd&n`RbgmVR?xzqlb9TVPG48gule0! zwfg6`i|$KIf%jiduI)Bo3l}+rKA)d&gY9qAz8&3mlzo_~gIix8Zx_468SQIV-M(+X zYTw6qt8@6je%+nze!!m}s(8dUf6x)#&AG=~J_T4@*S5jQp$r zQSeTBS9HOAeinv6^E{IRw4N)!*QaB3J)BuR1ieerd?yJunccNscV{6_ zi^J^(c19TAv(xv@Zi9zEcNx@!@zd_FAAq2PrPVgj9`l{1&&3mrp1j32!yYr>IH5F= zPPbjct?_JOU+S&;qS;C0mY!^I1Oruq4t|rwJ=?*OsmRsRenb2H)8`TRUK^Pt?9 zKvvq@6{yQe4ej|6&Wk+uuQRq(ts_ybY|(~9!h{r^2xIL%{`~O+^0?gxjs)}1;TcQn zqMz>E8sYA8DRJV86URE?qu!a20*r$V_^9{N%m!;NOd;Eh&D~zld(9hgAn~JEU>u6B z=dP;#5DbOrHZ98Mc6`*@4aZEes;WU)F{<`#(uKeP?<7t0%rDgIj0&}1t#1Kdbwbnsuq0l_p3QrKeV=Ym`Ge)k3})k; zFs%>3UBs8*)MdeE-`19CN+i+EQ`YV$pDI-IllfoNy;W>v!LluAW@ct)W@c{NZMT`3 zsm#pG%xz|9Gc#kGnVFes+*eQYbmm=+?&r+UQtZrmMp%4|4`J$n9f zwTQu#4O{!Bs?_;=#ow6nob}a}<;BU8m1=EU#Q0Oh=ifP|kMlQJaUXil_OX}lH*%iA z$d+JvT9t0CvkCjNn%nh>5g%UBXTMU0ZVS(+qP>t&VkH{mFUNofb9+=uP2AxsuDtryPErk25wjLg z$=P|R?2GCYoS-SMr6ciD&0Z++IOX}oVlswjhg;|)q`ojg~z`xk{oAH#f zJ=!v_#vy~O#I;GvV%s3FXNlk#CC(V?UP9X-k$NO8Adxl5+WzF~U~2kh4J!I&P`&8t zKqf-{2T3+?uA>lxh??tplAjC;BwVU+cpox&Z)5X1fAGp_6YxPsv#cfwq6dZaYKZT| zz4{b3QlPu_i(~Q10#-D{Hl%;Lw^*~$%O`L1NV;ii_B)&YcAcfeuD$~)Blo|{O9IJ? 
zEoun3D@(#@(RVB}ZPAww@mq z*i-ki;#7;7U{TdO5=%`#OVaRPoA>@G2Di!+wRI;@b~2axH5c?pnm-rHL%6~Ux`u?8 z6;7DiqWV}Ct*PN5vPt?{`3@W9IeZpRu`0@^?82FSkBg5#gmJrUP#fYYwA>GzyC9B& zmtN4hv{R&eo8z)-II{%-jRh}N{8blbP7W=2MrBTN^?%B3#hHFsK~e{}Yr+;n9a>xv ze?On0f172{aU^Lartl6L(ta01fxguA#4B?Y%odM`;n@yXy1=$&4CB2Pl_5X>xe{)fDHh|J3q72%WOX5KV>q34{&%(+eB)1lwo zyhiHp_(iUCwX1Sfzv&JSHFl6(_Xk@GOSiH1MsjSov9-pc`fC978~aBTHmx1Sj*OKr z!NYfSMVV?_d$$vFN6dOkrLlD7wLI71*-KiH9_YY}Y>t==M~Th6R#eX#m3A?^qVW+e zTP8f=a&qn#{O{R4^dEvp{zFx(0qfMMYJ{*uS|x>Eb73+!G2Ma=tSmk%K; zH}O0tA+AAA0h1|0H7$O1*`lAc=z=9FGTo_d%dM$gUpL-%flgX{a-WUE-ji8N!qKto z6GI)Zim2NKv_<4ze;$mCb|R|?aYGoJe5@N&I$oL1JT<(72YWHK1Tp--2kC&jmS6uY zQL@tEVF`Ff8pMF*!qCn1mw?F5Yr@d|3HSk?lC=HPzo;Bp&c6sFsE)=7A8D+7!da}v zjZp19%|D;?@g19k!z`q_zze3mFI9j0Xv@oke9(UwUwq5~K3HinAYwT-vTN|T zir1TDEeB?<_qBpYu=xCr=g9?sm6|=NYVIZA!A#SnKb$|JbA*C>`WA3K%L>12MBp~@ zUC;XF;D(7uW(yVkvBd_RSNoM&EHZNL%dtpg{3wWv5;XB?)*lqN@@4H7{tsJ(|4qgq zlhu?;@QI91cFaG-a9C2UVh$%#IGux(YzW+s;ui<5bu$oRW=e{?)?TzCoc#SMFKO2w zb-3dadsX=tJdhv0lV8lpdwy?kbn_BiA1xUtPf#&TPF;0d#JEqhM|=ot5n0M6*?_58j4$)T1p)kQ!b!Bbiw-aurly5UWVElRqiM*9MNX)lh41^L`1ziwaPp7;il^LCVlp0pr#R=N%&)1&`iZ3^pJ~iBQ z8C_3$j;3GH+x@V5qsA-ZQa#{v)rc8sVRKcjIxDdR&h|VtxSzjEQA`eYv>JP4H^#ct z?)M09X){^ZeQX#IHn&|E#`>cYesxR!U0o5?76ZC2DVe(on5gU*O0Gd-lnC9)IF#ZU zF#-jiXfa&lcX-^Jc!k^QW1n3lm5rgZ#=zA8%>-v(>@#JKCSRs2`~#OM&bHWV3LK5D zh%v2>IM#=aE2P0@L`qd5y9_oqhSCILq6*y(W+D#V4q?I$z2;Y!0M=`|f`J`J9ntms zygwH!iT^nf)a2`fj7W>m&fl8*GQgT!KcMapC%GcWLorNJ@^C4H$pW*cf88!tm9U0Q z6b=%)4*`{@h!Jc)Ng1e8e6)Y*ahP8nBpZg}qwf$wQfICMkja)Ap76>YHYE}!W7hBU zhbSw>NB5y`a65#fdgFBh8{3@13#_YQJrdH$TZN{o_{obnHa5@TS4Yzr`MXl;0)?tV zsr~!QOLgGKXy$E;cwZU1AxsuStc2445JM!b;1~+0ml1=F(ZL%inwp1!nz)2OQL(ZP z1hUYIf>5!tkOi>dilR`lG8clFV1&>Kr~4DLmtr|q3w<}t!;e))0%d~NAals-pjuu~Z401NG09}E~u4cZ(!|%+{G>&j`KW9-gCts4Tz?l7u zCmVp%7`wpZ1Q`0JV8G626*UW*0?P)Dc!{`)IEmPaSc#a)TxvaNv~OQJUr*stu2i*F zrBgjusZ_C6p;I}xBCJ5)L7PGAMf-!6hZc|KkLG}8fTn~djK+*cjE0N`4A^$=asCa- zP_w0>D0P<+Q|LoMe-TYy(GbSjP}b?i_8X~p{nJUnItrfS2@rF(C6?(a`Z+U%wVdQ= 
zrE)IsmkKoPP!LVTY#T-BW!=d?3o+MeHOZP^6n)}t9~9j@G^yub2qbN>_e(T@Cb~C!{;6rXYnFl z-yuA4`wSF-+V=L`i9p*k!DFHty`o^T@ zQWY!el!TrGqcBDM?OS27(inS{rV;0W@lfQvWdHGD z?Ll=^af)w=er{^j!D$0=AqEO;HMX5FX9(r!pv%y+mOmchLfVhdL>Ug>0)wqaX)nhu z;~fRp2B>VET+L$z!ZEIBH)c`KV|N}x9&I}|Mq}pGhjR@-wPSR2(}5{j_McpUhpX)w`@Cb8*acXNltgr}XINqE}R<|c&jKU*}~To&q4ow&*KN=-_< zd!^9rwgtd`31 zXq-v@J}TM0AG3rs3u)FL{(Ko~% zg-eM#yaH`F=}fC-l)0@lu9(IKT9rDl`p?eK=sn%45wvJv?I^U@j_9Zg!$huE;Yu0A zM>@pykzmfu4s^jaeuSI81e0C#lUgCMjxEfn#i6why^>|jwOGWOEAeDk-k`naOFb_Y zCvYZw9`N(Cqj0tzwk8p||M(#fh*RVP7XPSn1BpNO5!0I82^?bFu82gOJa+HS*)Id>H>ax_ zJ%7P(B^TlcDbjQ%Grx-Mr{_yKJ!EM97*Yp;G|A=Cq-zM5iyav^&^0D)XgeXa?Qe4z zEEWDSUBqvk;!SR}#{^!Wcz?SNg77B_msXaJ+Uw>C*4nGTXs$Ja%bQz3VS&ST$~luJRcY&qx9vrc$L^r(-TBSb={->q^( zO2;8xXE0jtS-L*>Q(*D!E2qJ?ugP08Cy(U63?hUu1Rmkfb`C+D52W}n%YX;-=0^-6!b zUItL^w5-pl#hywrt@}z#!YKKN4N|-0nlB$GdI-1lT(5Nd-a1nhsGG$hY=t|XKBfc8 zT`Z=s_}ZV89D^w%xCbqdOA}W7iU4lrsyCR7>_6$0uv1V|p7G0z(V|o+Ystt#B!6_$ z!xYJyXzL~Z4ZuT*8@AWU{u@>j8_`T_Ao-w|ja8F_8uUD%G0((J%G&a;eXi(IJOt#5FXeE^{KK%E&LdrHqxzNf5S$vuBKh*HATazX+RCIxo7t@rbBloUxGd6{tvds(U@|X7yrmdx}Z2{r{ z$H43@?%BN`1&P%99u~*>9tN|i=|O3n0+Frk9Ta36L~n5mD_-`Cj5YtTWzfy1KJolM zI|3q0%**Lpo+FAit*C1&@miiEi&cSsRKa=@#Wze6aAfaFMdH#c>}NVhLJT)6+6GKZLxrYdXWthT|@`y3cHKFk^Dm2kOjN1}?|{G#)9DY0ed!*o@bejKSl zTZ3fR#5WSYt8DUc6FjE#^f>Cp&Yh~yv+M9r6 zrfCMPimMj48&09p6uDkYP5;mgB75;~s+3mhMhw-`yRN6s@SpVyxvz#=EC8X8*bd)h zPN0jMpj7pD8cV*W`^_49Cj#VK@N%{1R`d_ioe<f59XqB4BV z0~lRm09m{pS-Iu`*=LjmdTwvV3MvYNXOtCs@BrVi3ygOP5M{N9@19OqSyfOyl8z!1 zT+)+=O0OE`6QZZ}Sgn_mF6G&s^2i8+J0GRr!-M`!i0$POB}=L^7T0llv&+8^oW?+( z+=?FFFSrEOgzY7nP#_mw!Gubz06#FJ)Ex@l-ZdrNs5gffW&dFYyVv1@1CtA?FJmfJ z7u6MH!BsL<0Y@L^!3PqYm(nJp1GHY!lc?jhxVxYV@b?EoX?ovSYa~_$wO#^M$r~AE z1+X$%y#X4Y0R71l{vjPlWtGgX?+Y6uG=IdJRwsetG=Y0J<0H-%^3zz@(+_RgDfN@d zJI+@yDa!nTk`@Gr7cSUYBSs~s!C#YuJixTSzotFsR2 zK5MELCj@bBtF|cce<5v<*P&x{1=4N$I6Dc{%ftcB%#-8a1PaX{xk2qjl0NRzG@lb_ z$`Y4@R_FmDu1x$KZb0Rc<)y{;cX$-GKjGeJhUQvQMhTCh6a2Fl^zDJ8Z$If(<+ixB 
z^i<`do9sH{WKTkkS9woRD_;6y@wc@&MI++x@miAh;z;=jwx5fKkhBK5L@3K$VJmDE z=!*{?9{82-d=#e|o78I|JsO#l@i*I*lZPBlQs1^x-8RZ*Y>MXi8NyWwq3^_*F+$4g z)Tc$w3g*VVPW+)F@mjercT!{T4O*Aa>juv?IG8hEw8ES__IaOGaqC!S~*F_ zn+vF^&Kx`c4<7mu29Pc-Fmr`qy*b7iml{f}#fOY9oOUEM*&)qs9V&7i3`1(wYleea ziM_F;SrlCcDXwb0y5v}HRy`>Ci}F8SHBOOjmG^B9+CasB zC_Lh0pZzB&4m68r4~rO*%ei=Vt9Xl)T4?%m;?&H!s}SGK7RyDrkvHs#{ou1KgeEez z|1#?hZxxk>6t8QVdmhH@^^7dK`!#z@ioa-gMbQf!x7tHSPjyxapGI?Z$LsZdO)k%B z|M3_nxT1Q-U`eW*e&u$fWT@9<+%-3Y9M7zI7Tuj|D&Cz|!9GjI0{lphfjhuyaIjIU zuXT5Kb-}*6r6mQ()g%=^9x&o@nY$6&oIkO1C4e$R6~gbW|IPR|*oLvDYbzBJkzw>W zkZ_2`{xm20lzdw}^xT!vQUr|H)v?y-`H>J^IJk0FFIz~iCIijj>FqLFwO!X0BdgqN z;*Fo$5!!xwHjxgne*E;1IjJM*|4V5CwWPpoPS)(R=@X4O{AFFo@NuyMZe0QkTa&cyuCC#C!FIPSXJ{IL0Z zT=SHn8N;qzr`b`)m`Uz+Q>-fqsn=1qROa7|S?*dEmz$}s^HWb*E~9H|UOVBoV1=}+ zsYKbQBjE#oNr^*))R(+Gn97n1lr}8yHzdpQNrMAldRJPFL7x;nXowFm%H6Qm(^~Q@V_fX_#{c6!_@`0_p zY=X;{(3m0i0L1A1#TP}~ZlO;w&ssYE6$M=bb2`_jC3;OvWuOz_IJ5#@)`$Za;X~)E zRbB`S^v4CO2*K_;)LQ}yXticl4>^Q(Y$VBSti`kwl(OK2JJ z)2i!A^`>ML*ODqi>m{F>W?9@OU9USH=G$K)XU!vseC*rXH8A)|5ooqDellm^wzE;f zfIr;K;OWz5OsOAXW%S^0B7tCZzJ+-Mb;X`Yb&S6&cD;?EYi@xCWb|Pp|D|L^k_=#D z;vzrpZz5j0k0uk3$jhXi1NT4Q(2*C;_3DNcXm<5g&UnWU578Z{(1ix4vflox|Pe z?q1*eeC!|g_40Ugx%u+_Jb1hdjdlHs*Ls~E+_JqA@M`^fzxpcmdAaXy{%Uo58ouiO z^7u}34tu6ub#KX2seR4mJz*e(GK?8ObbJ~V{h-Qfh;|c9&s`qJH5~rae~?Q|C-^6g zlLGweZlj5l_M*X)jMR>*)@h?A^7ZgZaLcFV?TFtC&Q3Whsp5?1m0&Br(JsN6Lao)@ zr?d49BICP}0#vC`dWFm-7adc*7aAqA;zIo9K- zhO>J^06&57($44Oy6e^L>5P72cjM#2?hBsM&M=}#l%VY_&v#ArY;O2%E%r(8Z`t3k zgU3`v&ev~39kM41X&La#pWwj(6s_>$iq)KXcTQWi!bkgEGonA1 zz)*Rf9^qMewjSYm0WlZ27yukQct`+@ExaUP)drptP;L`Fcc+_54(MFtk|^n4s##Y2rIlLNF;hj0O( zoj7(g^q|p{`*$5?0NYL+#^Vk*c7c2+xcTRxsk8!Uh$`~o@4^Z|?)~NCXUSk}h3U=H zn(xMU^8e!S(q-kdY)w$`{r%%9()B6#%guKye-m>Vqt&;gwH?0$IZ{JD&aF{m!ZKjM zO5`7&LChTV*X*I^kB90}%8GF&9n?%(SKY5p)$;rI*ew4_H_1R?cnJnQ7xbbjmJ~&rkDm!kN9fw3A{L#+<;~by{yaq)RI~ zmc9~X`*~#g)TsaA{mhB*=*|!~NRwQt&y@Un59%aKozV4&j|Z?lnS-RnOE#b0+sEpf zVsir~qK4c73`WKJ*!*cvUcn*lDnrqS43=DK}Veb~>Am)xWI_RzNUsXfG8 
zPt214>tz4qc6?K8q|nM;VEk7=#Iky#tmbyr`B^&E>)BoUw=JqQk5H3OhtS6{=}eQ5 z_ZeP;^Rv4Wz;-4Z83HdHEL*9M%l{RN`ZeX4j|lJ(5{4AZCvYP?T!Qx~%SEKSfX&Eq zvK_^zE0udxB@MRqp&5KruG(V9kIhD2uc4x;F<^+L8VsSN>;Hp4_-&m@doGwvd@ZlA z!ikf}$@chyARC{%ajwTMjUs1v_0M z6rTRQK-VWmn-kB+8{Q-xO!yVwYrDQt7?RmkzkvHlVm**a3oLYoU^Ky0dKPiG|>&auC6Fm8FY z2msRAWmnZMShBz1S$|z}Xt_X^yujd-x2)$rJa2pJX}e2Wby{ENQMU~baLJ_rulkuo z8WZO@f?PMSs7Y@%5Ql!K_JG4|9Q@KavsKsko`G*T0&{m(D+V{r1HW{Lm_G+|(z+B8 znCNo8l5tq$cvSSvH7_xr>$he7Sh;491$*m1K9{cc2Roy?#o2vn6?eMj(Uv+4Z^USG zQoW1mBbMIwb}{0eB-Bl&DE1=Qi^CP_^F$;kzVfccH66FZhIuI z|Em?OUQQG{KIbAIzcBY^@VNGE#m2yqI7@wIlRBVU{B;UQ%hJ>cYAmu#y=lV6#X~0@ ztKEn(-67TAqkVJ&L-$-FB-G$a1KcHczW-WIxc{f!5rMl@Zn+cU92tMERq@3Mt7yRaimRR0-<_PG8C&DT~Ofl=3ng*g>*zK=!;SUPhc_kffUE%UBRx2 z=UOWglkQw&ppwbtC}PTx$X6rmoo!6tWx;`eileQj!% zaj)QruV{=MakT}KlQ9gk74knE54%BM7RYica)2Ca_}ev=-m43zu&p(c!B69G^8oow zNIAVCJr)ZXbpoPx<)NFFeu0iLEqS=?NX%r4SiBOb#b@+)lVkrTNUsW3L2Wewj~c+c`g%XPg5K+|bTvw5t@*z3e`d{n3uXP(*^qet0tW*6`a&m`k^VnRSGlbn2D6uC2 zaLh2{uA!q*^ZCe}H#I;TL4m8&QH$|U*vleW8KZKlVt!%OSF~sC%A%qo=?@ne&{E8$$ zzN@Zt=d@dM2ms~&{)>zM)Q~f#McE?Y8IB1gItwuy!VGncE$(M%f-aU@aSZsfpy*Wb zCw_BSf;vNC{APYE27A=ZON3Ob2sIQmUja$l8F_w8#SC`oumBq9Y*5)Zh2(Gz5QBRA z?K7+dk1IZB5WL3Wc^{mghHx8%8|04e)HH9`#BK8jsQjlV)PLv3{}JQf|Cjk_|7YYO z|0zSI6F2#fggE#~U{Gg@4uX|Bn1M|35V9_!S}MZ+Ir2Yf(QtJd>t99e~8ryTK!wrHRv_2qV-*=^E3~ zce(OP@FdbzC+@=5cX>BYzClmywHJhzhawM(5ek@-55?ps9tptL%=E%s5;gopA+!=I zjTHE7V&2@vYY<*!%U4ug@N3h>^*8=<$DKhul&x1pOil7CXN)Cp&o-s`;ZkFSaX4QX zrAM>bIb*wxK;tH*3j z@q^^-yXldSVk8)oK;o`Obx7>&-I{!Q3gT+4BMevk&2g!&G`*7Zvd8d z#HXR^^h>5FwgxYf(z*wRF_2F4r@L*Unp?nZ;@IV>R8-l2uQnk|J?1u}_M3OH!8*4V z8gB@wlDFr6yXmpi=-2;t)87Ab6G*Tv=6~m=|G&$!asSUGX8x0>S6)dvrvF=(?G63? 
zTb8Z9>ItAYk$3lHMcsb9pTLA79v^YPIb#rRg&@A#2lpt zEDIu|tH~VsS^kcf<>1V14k}*&v(hrcAZz0I87h|@3w}d&R z>rYqp7(KnKq3e(7{>$!uaEAnkGq%QuaC_Qy&MM2spS`|rw1!nAFpLH>ffny zZ~c;ea1wpKK2QUUL$(Bi%*=cS1R<1hUC?iU>1#=*>D(j2Hcx{R`h=faN!ZOuY6emD zlNI{_j3dlW5nP~?V&8<+om|Dg60Cpu=*|pQ9)IoG8o-QOlI+H{6B%=L;ibqkR+-@S zNjkD3E4APsj@N~UxwxTVhm8yyXgtc0+RW!{9HfW!6oj?#-Qu{}Iq3L7v0z<+{-w>s zma$(&5cUurds&cM&Z8T=XU%N-je`8mD7-9U87{D;3qCO*LrZT&Q4W-=KFW2;k>B%N zRGGe5*5}oF-BY6Ns=#t?Ux?j8lDSW9P0UTy4+*Vo9Q*n>v3W?-+hweQC1pp8TzW0% zXktgyK`%KIwOHm6eG5(jbE^Xa5|}hZJVnI$cf_xqofBgyj#VNnkU%P;3Q0&c-s5^8 zQUddSJ47S~k24G>3sRL5sT#@)tA-1eK0^NZq;OzPjQEU*L&$GVQSmoSJzPhQJFKwC z)9Qf;>~U9IPbcp|?k`D!SM8r%6!v1&|F z>tnKOVts9T!yYNIh$0!b&|?L z@Kiuycn848y*Kr(QE5nFR@x!lZP3SsY_B-DrL`p%>BK5MaNlVyqhcUUhZPnw5^V)7 z>U%h+FeRAXSu2#ha{id3gLw4O z-Nr)Wu=XTbw3J_MxKz-0Y;0eBi30aU9*EL)H7DWWPzKZyLEPtw`cbRkS^mD}xS%LH1 zRCls+Rc%4$YI&Cw`mA)~6N-Ja3!!KudF8xGmbjs0>hJ2njdN2c{e+mMVbjaSqUddW z%D4(b>n3Go25~<5#X1Q3R)bO{t0d4@D!i>{o+f?f2^7$MWmwbLzne^aO3(#G-1# zFm=*c9@KzDrbp$mJ6tfqeN@@kFhGTIJ%5|64nN-0CFi+3<90KHsl{fd?tvzJ#+>jn zw;4bs)bVE}-#9ZT|I#e?hW6MzogCUa6Fnshnx;wexbh?jLh85^ZiXY3SY*3M4(|ea z5Rk;kT^>oW7jdSrjS)5Q2>LkJjK**QgYMspM!AeRQ8Y)v;G~G&HIT_H z>Z|k|?rI)`yq3x7YkZtVpLGUJlcZFj^!GLV+UrALZ=?+)W>E)~-n{#X~5z3EtVFJ{3~F@?Mt-BTh$ z>vlwX^zzcCCJ$u_t03j*x!)!)etp9QVoKhP4m38>QP@Czgx0+yZp{cLA@rw^abIi= zGqiw*9_)7(J-%^wrDrWbQ1w@J6j#IGPd^--HDbR~DDH_gLglXJUFVwhd)IYc!@rR( zJG`(Yzix!Dsn=0sNu84wvMiSs&)O|2Px&#Je`3wKALy6mr>G*+k<$kZqM}2aRXcc^ z1czcQj&eh24h5($uR5HmOyFh2thMPNl-^jNhGQ(?s ztL7MAtD6;CDpxXS_(ufj!(Z7JU={&{Jw@~%O_Vw5uG_qfjO1uNAV0l7F=JGc>bf$~ zfB!_z+b)$DJ zOxoH%Ln%bc>Ftklzy6IbpeE8!vQBSi>kMnkEntjDT5oJKsGPp$H7!o&)nQjuRC=W~ zQ*#(;A=}!iu)e4%_{6-GlJ&ury^D@Hnf}n^=z6IAxf`H%JjHgCKJ)snr51%r6dxhy z0;@IoO(;9t<4%Vwh)I8k!$F^cPdeN(`TOwH6{^?Qdn#@N?2Q+J078@G;;e~xGrMw& zYp+|$GYAUZC|liVV06cXJP)^QNYcMsn{`$pHm%@q8%2MlGmhv0cMANI+^`X*Xc?Bk zaDZ(hmqt+PAf8O(it0&|uJv7eANe1XJsUzj>-0Yp3!7w!X_2(!g$1UK(!e?ni?9dp zPqAXDDmw#(9+Keqh?gn92#)wuU4(CSHMtSuAlXY5@mL=%QTRZVsNT9Gds)j>MDLkg 
zN$ag35p;z^)g--Oc0%v?wO#KU(~SC0;XS9-e2M)pFVHq1ZTPFVK3k_Z2WjU^Y4vct zd7URLw#w13!xg{*eQt9Yr)PFd*mI6OLC+wugIdI@dLGz%Z>gUTBkpb{D}hIbMg)kc zh`Np&nND(}YcK28+uO*qYf7qTDn~*v*MF4pcPe#*=fz}(`3px=MOJ%>%hJn6pSK?E z)i=&ncq>G!#W%Z&nytJ7IPjsX+$P{fg?=J~NZ(1?kOx=IKa4Ue&V{i9O$^Tx_PEv3UJ zGtPgvfd7&H2rK7*EI?kF+i`#i?(74`r-v2AuG5S)9c}7s4d~ZnDTWm?5VT_qvvC+B z_nPuAy0U~C_at6{$&{@Ez=%e2f*X%j3K?O&CV?b;A5?1$OX)t2aGV0YLm(|%s(H;teoiG^+#&runwm*=sJXnO5@ZQ$wtUAu)#Z0vG58JB*7Yh= z(q}r6N65Wjk#N_EjcJ@F47FBBn@iMLwQ_%txa5q!#EVWdM+b*obICZQaq-gSG3aGL z#E*s|uL(-F;>$D#jy0i#_;YI}xjl8_$US2JQS!7KX?(VEatL(5mZKJgh<8VUfdcGZ z4l3%0Z7K6|(KMx{4qh4* zc*%*#KIJpP&|zMb!#nSHU`Lh|m;Y|-{@=UI|5tB03(J2<2I<6g+71vQ1$pKej>|gq zxlXc=DEFO^VZJD3@d1T22BI;VN5gh|`>A7ocjxs_K-Nob=x@Wqmy7FdH8Hc>tNOQTeLFA$Nc}-s?xCQnXz=8@uy^JBO%z=ge*f86u zVfNXJO8Sj#1B)>1HCG?bXetqh&g;VPCSWr>@lGz#6lP!jdcQUV5h}}uvGO-agk2u? z)Wr%s;qB;RfN7PnU&JCi5!2hfeC4A-(WK53zn$fX!uee@kM9!tN)SzyPDMiyQO#EP z|MU_w%m3^L&d&ay+OYVeF&iy6pIstHK%`E3K>b3DK2BrA=@5c1(IXqx;*%tay`OQgyuO$)x_JgAYC1Xl+kd3`>Tz)v^fIys z#isC9KoTbr8g$?m*1`x-vwD3V+%^lsclWV6?u$T9%$0x)Bd&5&OH$+IAA$OOJq#PX zUcdAoEW|oKU$24ZHU_}=;V>fIy^ZT_zu}S%$1dKnb2|q4>WRU1(| z_N0%zm5f)=LiSBw-xDmNkc~qcKwJXdT;HGEYIb|G@_pRhK2k@0em94Iy}mwFw7YhH z9UaB~=zf2C+Ie`mc%Z>*_hs#HXI&Q*U{jJ2pfQbmj!L_MzR4v-xvg-KSME663HQXv zadS@QiDo4YWE_nX{NYQeE4a!PL~9wg=Ma{#9~Y`aXJ|MM<8!oE(5Do;S;i%nIL<_Z zOjJsY=m15GJOpMXV;o+B6o|*3#-b+hDcx=nZ3s|i8Z`;3Xa`awQho+`i0O9*>IY-W zhzt^!#{;f90ts6M%>;5OU=@mJfPTj+nGXTBP3?K*LhV@>mTy%#3l3~R@lD~g*T~7) z{jG4TQdm3Tjlx~#Om^ zN5?%NaDF#JPAF{I(LuqHFm5KTn^hc~yMvkh#|R3jKGo`ZuqpB}>g_rxE-|}Ksepd8 zu>IU)ou?-)8T2gf%~n`^sYC+uMy%92ibRvG2df!6F^6s>X0r!uv2cbCr%jDX)6B>a zIs69VP;RJPeohOCLGq5Pt?5~S49$Pe99|eNPODMA*&|F!k!8ZfRl=>+isUZ{N+Ny* z4}&Bx%~9I6k6w)Cp|jJ~1%VAs1hN!p$ zZ&6`+Ce;~aj6{A$#DLGja{ju$&LgWA< zY7%Ud+y?dcsKyUNsFYrQ$8NMD(1N2FyP8gW)~fgHKIJEsEQ+JOHAL_d=_7F9$jP59&!&-jE6}6 zDjqb}VX9eG6n*y*@ytCn6i0AB9>K`fQY2vN*-*A$1KOaPu{snBf~rYcBI|iCWSN7? 
zpSUlxo{0<)JrS#uiy#`2p=Cj-{TmT?x}v4Vp)ATxBHB%Z>rRW$YSX-33^ql!P;6yo z!^=0UD|l4&N$H@LfXdu0JI?N0%0*9>y$TM{Bbfw<23GgMfVH%pafkCV<&_96d7acUClr1Z$>Oq!gbR zBAtAK(K$sc@=vzwoCk9t*$Z!lA!8X5PN)eg00|fpQ(1<1vvGBk0z;pD3jUoOC&f#+ zb-HF`93LyQ*{98Vx^+y}*l1dogp&Y|dgp}tfM~woD_75C{_0hatb57x3MYUjPi_39$+DBRuf?J~GBQ6V zNPPnGdC#6h=TgpQWs~k{#;?kHX;v${2_CXWpXYZ0eVy(_7m9h3)K-&WI}I7hAKNK! zI3>-GWLwllR>e%Y?$23G85&%kAXyiD*5_kJRYe8FUHMkDd2VafKRJ6P!%K5EXnux; z2vPTOFRkoP%fMUDcdQaCci2roqj$n+^)-v4rEgCV!*n>nDN%0&f=U~b=}6R=x9?Vh zkreVaa0(_mDx#&>+19p~@s$gNViTLOcw8Z^Nfdi_!S!~lGv}NksP_M;AO?Y<-A>S2 zIO>8fG)ZA7n>nwS>|FG$ZR#PHQ$x_mWwDqf>tK8zXFIAM+*Y{$S&j$nu-xO7HfA#k zCC-j=Gr6H)vzsFfoGkiRS)f}2D>}w4=hc+nbR{c}>xd8Hrc6jN()`m7?%sr^&=kX`~Ojf(mQ&&H$P`{Nqc4e>Pe5a$-GB?NL zq*)I3CzeI1M#A+pfFlEWe~(kYJT?i7E(G!ZMYHnQ9XptMbJ=Pa+){or!$#uOQTl`4 za^MABLX?=U7A$^}qaQFy6I7}VtwnuY`e+wEQ>3S4nHi_spN|b`m?M20qiO7u52(MT zYKh|u>BXovZ~ATOpvb#Fy}be($!YXXRXk_KeRVRDq%u*Z&C2F0XtEgAuSdL%0`|Dj z?R~L!L8{~}eM|Fou^fLg@(hr?T#w(+ae5}WLaSAuF|OdXEm=&VWQ!p{Bu$E{1u=~E z7pimD-433(H3VZvR71a=jb)W}@JZA{#lG3mLM0%TxT=DF#gdU*;h4@DXG0Drw@bIp zn(F%K45?{|pD61mq{-&zvgh+>bsT&l=mY-Db>t1+4)bg%3WJlLEJ$y4l6Cl*E~e0K z&p3^5HG`)Ts2Ln{$v)ADnXF@%QXgurwb&7gJ?!=4oDt=xA+^}KEA>c$&1s^RZt$BX zJqKYM@$?G6@fmSwR)d4x#0Y)){`pYsFj3VM+;XP(L?DhD?q@yWP0?<8_Jor72+xy77zbmTK#l%@izQ7$i`GTc_#630+3i=LA7lbT``C%Yj5f+tzPz3cN{hTfX&wyv``i)>9 zGj53v<CP;4jn(8(- z)v^+FHkgnm3!&WSNI7Ha3?i+c6JDDJfg9k08p=Wm3^8Jr z0}3|+^#gMjmUG?eDGlcNJ$6w;Q<%q|1HT`3lfsbZ4V_&=mXoN z{I%Gd$L}mFYjsrK1qh1$%d|+tdf$8~Tpd%#ib1>qfddPB+}qUOjK6V z*LT*9Q~e(EDIkKil8n8XbJF`_(Zd<1SImr!E_sK2yZ#9?t~}efkJFbL*Alw^f|dx$ zBv?sg3$8Iqe))8S;sef1JF@YaMtU~JyQAZlwK_bpl(v**_BLx#31Bb&Q{C9H$k#y- zx0KPxewf^`Z&}SAY(&}j@_9C z?YeGkbZpzUZ6_Vuwr$(CosQG7pV+o-Yxi7ZuQAqKdwzI7?!Qp?sJgGJ6Gzv^aGy9; zay1{otk^{}E>D&43C}?T@Aj;0)#UiOqQ?^Y@FO!RClXHgqSJRId(u@b;5J?Vq zSCQloy54#SF5?6DAs{~%rmPTKM>geeKf_x6B4J&9C1iU6W(zLIYv#=@Zq*1|=mYGo zUF{vRInJIvcJ&DbcENr3Z)ehNk^gW$TM(pOFCO00L;3aV`s8}QR>a~gp0{hyaB>L@ zeHrrBI)CZuK@jRyxhb0r9Q*O+T6gULTW>8Yd3=i@5>spnU~a#lTNt^lKz!GFNw#$q 
zzSfA)W}^*W<1qKq?)2|__wUQb96OlzzsZMu1W+a`bLN7 zQc>CCA+<)oZ0+3Iuie$UUelI-264FR-hwng^VPl~hqQh>|9@|aT#WybUdhD8#rXfe z>o5_ru(2`!pDk@i7}~PhCvP7=#x{0DOe5@=J`FRC@eHqiHgLn3HkUwBeVCH%DKfZ( zM9^TU^EJGpnN`hzbw1IvFcD{?Vy3fqNN8^p}TC&T$NSC{n^S<6zXXi=WZ zIdtzGe;ZIk&(n8S%afIxn~(3GF8yg>LLfy@eJ})&OeTMe_bq}vNJ5Ic$HM_DyIlN3 zX2~DiGZ}LyoQqHpCy-At_oK|zsf9N2o)0;oDY$_5^?PNz#IB#Qh)?;y_WC2BPo_=( z&@7f4_vfW2!FcESz^ZUSvtg+%p_|q9K2P97gJUXPS>ArRe$j4(H}wwM^oPIGbx7}! zfL5Y;Q!lJ>(qRNZux`Eq*N!NU;ULj_Vw}L?255^g;EM2p92R`0hy(GBm zg%5_qi%||CR12=t&q(Jbq7x>CA`HNq%Zj2@98;>Ez0)Mg7G|TB?B;th7AHlD_@EEf zhWC1B>nX7e5}9P+*eM&CD}s_?cta?t5ATH_#0{aR2e^45voly$xpV_eIT10|Ecf_x zC_j-`6ecdHI^Hj5SO8i+=U#Z*Bujw%WVpINhh+B%Zvp3Lzv^HgKOW;B^@oQ&zr)p$ z4;K4aC;nE!+@(j>hM??5c7~`_1S`mJLXPfkkmjT7+)t zYgOidb7PB?3t}EuaMpgl)4Z}kRp7Y#IQ2T+jq*J4X?%pgrEY?NZMxiWku+(zhl<`Uf`t&oNW>PHyk8`Z`H_WUpGi2DZ=w3a1(K~%z4h?PY?>Yn*4~{r-P4v7=r)c1Hlgl$o^DUQ|NYG5jRpBP* zqO8q1Od7OFy(@XWOovtb5~cspAuaf_K~xj{;GqM{vwH@AWB1}|Et-nq73n4r@41yV z3h~Je6u&^RZCo9rOA#a9lRI75_S>L~{n0eY%Ml@Sd@M=xEJ}^6nTte-)#K%;B0lfn zA(-6U7Toa6>tndrCaBjnX4Xt&OW`URqdZbXCKGEmdRAFGfV-%j;SB$?Q{YbUQ@3Rw zvtmU-F~*k?Q^ZxEIATGgGa7!?*S_1mN&eXWML%;1d~_?(dZyz7kolnqj5r*w1Y^ zMr+IRZeY!niqh;XSaE;Vu$*y3B3B>C!6m!;f92_*nP(mSkW;5GD^KwkPm^X9%izl{ zimO*umGxFNooIaXFE9phRf)jsO)8GFjqe>0f9xPakSG9{YPyb-##qcaW#TAlbXqJ+nvW_nA~PMcc+{H=dr$A{tA@&8rUZ*2}gL_%B(# zv@;QH!oal?NkDD6ooMtNzr1$B zmkSDNYvY!zrV>f?%ABEggyY9RKR>W)N-+@G-5XaniaE*S-&v_YL{CiIWUt-Ib&5wf z8iFGje#%|+v-3?d;|%&0@G6SItt$`9!X298C}`)DawnG@i*_U8)N!m2x8H+bMtc-j zycAsvI-WOgZ!T;KLnfAsd?Qs{J#7yMKS$F@ia0IEAo1-t_VrX;L7waj z%q=(Prs#SJU)S&xKK-Qb-*$I6MOg6vr^(GE=9x1V zqyCq?2m5am%Jp*eeO8G@xrnqWki>KbtFtu_$U)Q{<2aVh^=2~uVnMc!#zwf^0O zpLdLGM~rV9mnGSUH2dZ?SNq9sVsJceO{7!W&EXSstG-{G?A|iB>m9#athm^f7Pcwv zcG;PDTSeLz=+#y7!vD0={}Hw0|G&OjF2?`J=*Y(L-$>j41$8%Jg4*a+aR^k+tiO7A zRIS1o|7s)Id$dpf{V51RRAUT>wXf2XR(`3;towb|Vmqe3mb7Di11C#-!4SMGs zx>=XWJ3%F|vWeq>x_{f@GftgzC_lva=2!P|WXcIScXNdD#-)<8cBCx~Ye19aLyaYM zjmAlb7EeeKkcdmRrRPJsV$_kHdSRq&GY3Dd1KoEqkZ5AV&5~e(UoOw0qhT>Fo|A3Q 
z&_k}!%oKe1`=Ku%)H$IFbbA-NUEk1kdah}iq+}2JEP%rU{Xpexewv6rphkcqxhTRP1SJ{RAapMDTB~G@B=%o<6?1ufsm@pyE zZ0!(sp3{~=zqgaZss;MDJt$ar6@f-n$Q=+Zy8bpk=Z5cF$G5RncuhKj7^CY#Ny@?e z9z>N=UQ1(T{13;HT=wPkP2~U~*P)oG(x0*mp)teD`JIPsdqtTgH^_wx-FZ_1iDJo- z8r>nZ!RYPsfHw~~-3|DHl$Kg@c*o#ZjoU)j|5U-yo9`$t zJH+UuZWInRHl5yaw*PTs^nWQwSk!tG(^LuFgy3n~YB#a3THs?Z${lwDT;4=0b$9 zni}Z2lOBDbLt8r zWV#*I|974K9U=dTa?i}c`QLWxe|M+e`T@s)UaoKRJ`vRh@vBCP*mGl1&jqi2o?K3x-3B@u_(X94gaC$xx&K9=pzL)ayDEdV5n%r{(boM9%!AGYMK4T5l~NQx>gpPwmZjFo&(_; zIo)|8HEaQ~CNx=ifYwH=$yq_(0i#973;K+9Omf9G=S6O^+L=kD-TvL!B)MoaDY@lsl>Pvr6P=uq6uNxRiWRVL~@_ zz{yFCOTi=WFx=2KOGYFAI1`4X-X6V^CIS1Cp*P~8n)qY?yga2={}z=bokRMcIrjWH z#Do&iS6*Zph>ql&MjtQ{Qz?FJP7u67o(8B3NZnXpFk!KiRk_2?6?azAU(>9k@<0aDR2n?h!@3r>>= z2+;9cVp#TP^{Ki9fg^!%%EQ?y!zklAe3sA=gahurdQ~ z8C~;?C_=V;%={6cAI{B;j-bX&9lB*7(Jn*}+E5~gP!ScjkiS7`giEw!U^&PGC|@e; z)KJEWqz;CRf?Yk94W%-1&1#fx86%HnOvjZz&`tMbx9ezO+l=;@UL`&9bfOXuXwNs; zygFqZMgzjd5r0DhxAN7Z6Z!-ke!wVXIuSRMVBjBL3VRHY%F>0NOo(cM@W8GaT8(^= z!)f@mQi3oJ1a2J=>z^U{9U8^Y4BMGMVLLTyKDK0)cTrhy{urzM$ z-87qANBNi-Ii^eh#_OMYKk*`8ega;6O%WplGcsy?p0*JU0FPb@3H{rd3!x9Poa3gS z?^Dj;s57uejMp;hg#I<1->&dz4UE7_@b&AFn%Xad)m309F7nJ-R#A@>>kE+LX$>VI z@{)NT%2BvSOPk>bEO#?V1T@%g`d6Ju1Rooa0WJfvCwPkY5EWfmXv4&5O}SvyL)S1d zhlQeo{*!&6z401^jKusS9SaTJtB2)IAMr3`)?|L@l9iJnz}f$%Gmtr%yD%8DD0cXtk-1pI<7JkY(ajvm#L()VBh*#`5nqU!2aa{g!6lI7W= z)&vb0kR@-tv>ayG)0eGiZRBRyD)SQ_q|;%mI~Q2ouu*5ww@zsD6zn*pQ?KwQNJ{U*e`FwamU`eNMh)7x zUdsnC@+-5I+d!R#;l&1*u>k{GVa8}y`wJBmvX*akLkgizF*yL9&z~X>p!!9#BpU0e zj9R3Vi>|wM#m7dI%jE?*uYYlH<7Alh1-pW2R7|#{Dq3& z6h?h3ZG>#4&AB)(*z@`-0Y5XOU8_tZte!}~z~gUbOEJtVQQVCN?w^qyE~QF_a5I0I za?94-i~H<9gih%Nuyzv8`yjiqLUmDbHM`&EuMB?wx4`t?dUb7E-1gJj(XJ zbdkCPwYU71Xk|#y)0BR)-s)G{Ba|mQEje3fI-MM7{vFoBk~T+G{1+o`H9>Xr$eIJl zd%8R1IKnTM@$s~y`M=LN*4Jbw&nPb{EfQqiNht`xqE+KhE^0*lw6aFC^T?WO+=xs< z0k};n_PDN~t$zAx8OAO2K3y@Wy6bOpEAaGH80@wGR5~hCqos%a5g;c--C$@Z*qJ_~ zRFpaGh1tJXh`DrH$jWK?j2z-0W)$AMUOSDKOVrpMBEma?)0LaX4B)!f zu76JHCtkdEbeF*SsL^-69*Z=CT<&1Vo#cd42&1j)-A$Vx#9 
z!R&yDl`FC$f$gE4kytx+)y0@H33^m6jvNJkF$qri4iJMSXhSDmkN{nqo%>pYC6rbj zdvlkozCB+sp45w2*lkcbu~lPrTVaR)Zd%r!P}l$XT}7b%SEpszdTQcjb^7k43#t3s zo*^X`ianw|#sg(cEp(q}^c8YTWP1&_^CkreV?tO&wxMg};4gVUvw(V{9+TEKre84J zxNviMbmnrGs`Q(3-4o!~8#Kq`#H&@WtVhfJ53mXLjMx7(BmZZ22~K9t|5F6{HzUVi z{O<&pg=a6&2=L#Y!+!xTj8o?X97R^4q%uK&KdDF&C1eRwThEz1LOsIKLd&R}w_|>n znRYxqJV$^1r|}3gDz78`BAjh2tL95glvei#xzSAY>s-Yx`yGPs zxAbfC7rbp1^6=DA_i=1}s&V)mE*3w}^|#($|E*YizmL1yd$t?D&zXLIb(VRiTG|Ji zABWfMkB@+j#_m>^ZoKU-e}eUxkCdvqavkRveX*FGRWFD2!;6i*Bk`>fwU_5$o{2uq zb*3dIqn4{}KW^ONGnJR;`ruP=M+SHF`EK78yioiw{qMU84grFAhH`uVpBs)JzmNYK zi$^z}+(<$a6}HM1wl0o*68N0;OW}HCPH(p^^k=SZ<_p7NU$Iv-h6gC+W>fZ{SCN?^z5vKC7Y zOXO)gTOhGm{QKRddPQqVIoFJFg2(g-tw;pgqWz4f*OG2HJ$sxovk5BELsLiQ!5;U# zevj-HpeBwKuJw+0H2}s%Xs&YJIoxrP!Sjf^!t0bljv`9s>v8jA-h}RZFd|qg=I&?6 zqUyDE8rbexw3gLu=MqXFZxJtoI?e^4XR>*Zsno36gBldC4x6PQ8mtjZB5`DuIUMue zRTI?v{oqq)*(s8$RiM_t)M?*-)9%K^b@L^?2mA8RHA{q7_h82nTv2~2!035`LXu-n3Tvkf zN;sn&ti}C{RP8cGMP@EZvrqlwi}d%%*;K9_8c&}{Mvq}k|JC>G2bX}Qj3kLAeQ3P8c0_(^ z&v#PS6h;O}@VuZ{iLh)v?yThXO4(M{w6k;})6Qz+^_l$vQIH4p)9q~qvgF*RC+!op zAD8Xc*>g^HOV&QkqEF3K{dgz_6S=%Hl38>fx=rLyGc|hE(K-IDXZ;j_XOda?abF@X zJe!FVL4-5cII30r!+@ZWwPkHrtXgiwKd~hEsdE8WRx;hoWX`lYazj93H)l-nMu{?He5IXt;aP_=4ye zPmhitl`>AoZuEG~7Qo7|u+A*gFH#n8S};YE@{*2QUdRFzz3|3< zQ_a`Rd=pi>J5FqmBvO3=b*PqY^?S;s^d=dEqoH87llDu6V67X_F89YuH+1C;;HZA* zT2Pw}$FRVbDb|UB5RuwgnR|$+Rj$+IYIae-PZIj|s%IT;XI=VNt81D6l)vBWAo3QN zNS0W~CK-nPHmqZ3NPm&m&ZWTM#Z(K7qZwn2cth68x;6wOn`1hK$WChv;%l+ zC!JGkrRt8}w}Zi9>KB{z<>wdmk*D2jRKT8OiiW(Wm2v=yvlnE8q3q`q2*ora3M6DF zRkaT)gxYEqcz2N((0@suRbyTvr)&lh z9pyWq<}z{Ej0573z@CGMCYaffOA6Cd7%!vch1V%*fF_){6HfEszh9vu&&G$%jz)Ij zoif_Si5EAq1AD@J%(rVvUHFeI~MJ9fab^$zHj+ zSj5Z(D3pu1yKtA%gITqbIe^Wa{avSa{bgmCA~<`Y%@0=)1uQ=wrPJ_4gC_(u^3X}F z94&Dh6;fn7?vo+sgFvRG4v@V+f=(93{9i#2_S%kPzMElw1s9EW+7z>{D!kP&h6t`LFa!XPQJo_>$g8|Gv0~i1H71b_6tV%1 zu}KOj>md?am=iN75raARhW-h;sB>_|^~0@~a&cE!CGqy(TCN)NEBW|av{t)H-(Pq& z&8>m-$390hIHGcn&A%8r(`GktNDNJhuQ?YYdO2YpIO)kgDy*qZSs3G~*b0!q-z*n6 
zZ8@u4DE(1Ww-tr|OiBPod!omtAvnOI2ATvA@?+cc{cMf3)~+tOKF=DHbdV1Ft;_zD zNjxgi8bSJ10Ke*?megI5k#H-uBDAj6u(7`H-Ycqk+3VB$;i5lJC;bzQx1O~xbk8UA zq8ES~$3p7lTjr|KSf!oxlzHWWaA&q2$<%!k_tK#X0JA6$-l{ zqxOjVegd7C`-1Ot=0fJ8s)B2)3Z;x4KfZE723~+(=OxX`>k&Hm5O2cOI%1!HNl+MN zd7yP$J-D(>=pLCMP~yEcE7!gx@>$vZ_N8lQ`epX#nq}2f_JQ12&4x!TI#oJhUQ!t* zQXxV&L#dw>2K?~P;+OFm7g|_P+fhVBV%X~fzHUWOxL=MF( z^trm>6L~AzXk-1RNadDF^&=FazfbxvmNqI`7)QHW{C3p4G3mru)s#1vF~yr=Q#$k_ z%Ru2ji$JsuvpAJ1G0EMX^Y97LaXJ&^ta~tt9FFC$Uq9%#%>NA`PLg-JiCY zuX>mtf=Za)&;Mzp{*Q`bnA!jDQUsZJ(l+>iSo#WQAZt-21%n1b_nBq|E>0{$^FfBb z2Kdqc!qO|MkuAy8sH>Y(7|-aKwOBm?$9+ux7kys}9_sYwDBKsrsx;92D7eNM ziSa^z-1x-C*%`!8d+l~ldG}&gsI=z3gjU72A3vY)B1O-`B*$!@^tkKZf3ug8k0d$;q@7G0Ys|}QDfyjrxR!g-Qd4St|-SLH?;sCbH-~LY*-@5RYf>=o@E6d6ZI$jx7 zWL1r0&9L7N+}FzGx>mY~JbN}K12YpUJL`rL3;kwkC`f&rmjqoX#waKS5u*5EZk&`< zz$_DGyql*(uo#wu5vC&SLJPWiwktP7u=bIpooGCUC6~+fW>78745hzoesANLbHmv) z&{;t5HQOZNu!Dg73L_Lm|G>-@gz_w}l^@xk)>zz@aT$j=@{E=(j;%>NZF#G2TX$tD zv5Zca?L6F3Tw70cldo$tk@?pc-Op;*cqxM)LOPP}!#|j_x=9x);*C9x|I#8*+x*!F5s!Lpu3(4*qC z4x_TYMI;P+z4yJBXoyU{P(oI^V@Ymm-$$F-c5vXqp;I;`7<|W?qi=M{iE7T$KSX~i z|B0MjKWP?yAe!K67o%?!2SL8kMXCwE(8Nr(S2G=6ig89wf@=t z!kU{)|9w^2LnKZ-P#t?~vB6LJ#=lLkn%;_=$p56266%}&F`8pqCqy5l*^wKelWN#{ z_-)CxwmEkr)dsP)0C%sdq%S-D7Zp@X;^Vs6+m3^iHrZ6*7!E7^w+OLWi3EA@5?yRz z!Xw)q$1HvNOlso63T)PDQ4VJYTM$2K9V-kt=WUMkax1BMQ&oE`$G#EIB*n#!e9Yo#Lc zL0UHQI@Cj1K<<53S%a!ekAFn0lvV{fMKtovPr;wxLpoT*UDGgf2ZeU*MR3aMDaWHd zMFm1jQ{u_}N`DfSjE4a*>oqcg7Y`4strX(v$45(#E^Aa&2aLok%^0e?-Y~gyzm>r$ z-2gsfqaUXXJfaVZd@r2B=&u=EM&&vvLvZ%|R^2~WA3;pH9V7MeZZlX53^_%UG! 
z4!LyP^`~%%*NFB}S1l*r9AfC7*eDWl*&sx6iL{yM`wAQsn#`7DQ}>t03Y{dh?=zbo zL>%m>rE&8q-oEk)q8Jd*O`|6s3Q}3`zxc~ZOSO1ykNo7CmM*|nlWpYQ3p3n;*?hTJrmm_WKGd?r{9_`UF;$;me0my zL$?wYc$hCy%`wmt@UJE#=`)LsspUvC_mu%c^Riai4ng)DfOG$aOd^vi#+x(w7gQo(_X& z3xQu@t=M=#GKxsnZpnb%z^xe8OWP%v^JCXL5X3hFK531bt4{iL)ju2qB zR+fFtDgwm`zY;znQ8_-TD`{={#DA!0IFTE17@r<$61s|tSPFy0t@S~gp$OKC(Y}G# z|K&VIjpa||Gnaf==ZMiNSIN#gco|bD)aq7a^nUa7Yu9kL5i-3gI|({-&iU|y_!>Rh zlFux!nt6@H87bRr!s4MLS-uQk0p;c}2b;N_-Ivoys%{m4$#ejXxuU&|aN^U-7Q=$E z59gJJ09gj2>yydTW3P7}O@Obmk);TsC1NVAbmZIu3_YXZoD*B4aQ0*cTIjyP?Rgwe#}6 zdLFWHA6JCKcV#^`)V33Q8Ehm*?&a!BTk(r{oB!ylu5d@-bYhvWBaADf$la#a z>-@o1|J6s@*rj^Q=tIY_LgZ2Wm9I6ib#f#9iKvilXaukx!*mx-{?-!7TDoMQuzYQ(q{ zxxn7qJitGe%e}&FhM)2=!P$ zy6t{>0%G#hcaX?2rbQ8eC<@0d6_hZz(rdDMZpSz}ghb3zG&<#A{F{%S?sz!tyD>UK z{eEe@yJpX*@rhjn=RYfXt3C>%1sQeL*VQvWlJR~1s4rnW`ak6TnVA1GD;p=%e+&Fy z>Bz)iwZd=b>+iV+dWd#|!_hOIq^hBFU=>^t%7LKD&6ZKk<`V$YOx}dX`Rgs^Zzj#^ z>SR?&pAw0w+I0REvKYR8sAYdGNH23A<=FYXn!eQI1=zLxB+5NgoXody?yv{pMEYaR zeLf#t*ZW)g4V#O?>bRzS^fp2EGVg|1gr1Ix9QgNsb)NmaUY%@BtQ5Vy?eP56?A7dv z^ITK?G=IeO{sdpPFLP%6A<-(XGluxVa9IfI8`Ykj^cp!122K8W*_yygaNPR)6JmYF z3JbpikI|>|@I7&@cCRq?vEyE6s>3WuT;utp{XngT>6XLvtBIHP-Ec!XOn@@MPot4h zn`cT8aNoe|HrS+wZRv~?mU$6|Hs2aE&ic6w-|O=-bK~#p>+AWk_wwWO6aPZ+^WpvA z#h*J9~Jiz6HX;45ACH_TC^!& z?!xRU=Sph&qF=_gj+0lbvC0L9lY@{IAGwNd&TJ=3T6AEuD{B!kScJ1o_rh*^>5>8M zkXEjKF1TTJsGUJ3k130_*A7m2Rn*29#n{&#oxDbM30vyGP&TeaIIl%6TO@nwgkm8Z z<~mCkk1l-c?{&i%rpVaTqOa-zM!fe6RY{L{uvm62kUP?#q7tFs&Ja|}j_}igxM(@M zL7h*cr320E1HHg5D81vJb7<`^0mO19F8!w$%?;o5B!J5qr)IOXwrK=_!C5$kE9z66 z5E#bc%;*y=R18G{(JdakAcndKRW4^zOAx@V39eK>1_i37=cEiR3AZLYoJJ`}_u-0v zrRsegT6IaJLCW03SoY{)P2^{JkP{)w3F{MYIxy`xDyb8)LRGP6s-#q( z!XYF|A#Lb6q8_g5YVy)5q}*d}L>jPhu3r zxM37*&qaUd3pT&Kv+VWyiCbylucNsY1DYWR$vvRvn@o^!al5UQ@VKML2yN868{lZ}KbEof!_l(o%fQYzUb26YTaZiqIJ6`&X+9G0_U6lD|)M znmw(3<-KPGu1dCaF1S;;I`zYHQWD*kDuDM>I&lPGod8^&=yp=n%rFV` zOqc2tui}?5%AFsi+}w)qgueBP9avB=P8y#?WU@&zBaBVo_+r!ISdx@$;+iu9^7Ig;1bwXH88WCwyiNpWFu`e 
zh7(vCF_Ws`cB=8$&AR9*&J{)9M9ckjSXml+E!Tq(0_0;4=lrQxQm?8^5Y&V=1K#32 zQetNKxD>j$V2NYw&ba!5SH8L@qkMm8hLf7>&3CbKR=jNOl2j9Bjh6!`V5z_xbCeZ- zADOSpT*a6dT%iu#h)SLaJrmW2!If|Lp|EQ#DMI2r?2m5-^Mg_+ zD;?@T4upqiZLmtuF!^f|kf9j6o%G>RrF4j%(AtBy5-t&nQEK*EE{v7g)crc=6AkW+ zd+utr{fA!i+i6$#eua1V4&JQBXP=p7y$t0mBm4;MER3%%>NI4MB76+@+1xrzvX1G1hVnKr}j%s{VCNl|Vjd_M%E>Cx0xKX@{+;3@M{R#_n7j`-vRjGjyCX~mP)We^+W ze4Q=jUdbY$FZr)8V5o)SMk@MbcC$3+PxMxU&Abrn#i+9;E$1UMJbc13r?pt9Bs1-fe794p3zT4VfdDX*DkW%6Sv-chl zAVoDtJfUR*+)oCKf}wkN%mq?*yO?>e5XwtX+7mBQLqda3krADwNCbb_?Hz zT-W8s2A+u2ZfeOiuz>9M#*YNZXLRYF8ZI-%XlhK3R#Yp^_MbD<+2A^~sTO?Fs7RaV zyJ%31sC#|E)u>2soPCZJ5us0>6dt&16k1Xlxlc5`LUAwpElfDnSZM7Y^#_s9e3Lh9 z;@u32(1%NV{#L&6!B*}nLpCsT5mZbZzH(9b6g+HE2Taj$`v|5rc%oz$-wOtfCJoqwD&D^zFE3qjPF3*Bc;$DYrz?`WXsV6 zsNL5bx8xoyu|{z@2(K&>J#Q){GUzR=dQZ&fuUIOGu*$f{gPx3ulogqDKmg|N!0l+{ zmur6pr~#zR4)c&I4d{J`39nb5z$Gu-9kNbKr4w9QTCZvDIa72pa<< zWeOuAv?E$$EIvRKswesOi`l@0eOKFT=7$mFPSjAmlCqXwA~;U%EVG?vS|FmWXHp>E zwO<@r=xy}5>Sx}rE&q8$XL`BBbtK3+tLka|Cq z>3tuSvj1^KH+;6}^rVI{!MIhg^A!fNPy)%^6y(XHT~;h1&M9xatSRxp@mM%Ln-8HNomnu1l4Dhzu?4#1$qVGP`Cb zOOh@uot#6$R*Vo|CktBYX8EM@nO-P2Gw-*!i|x?K(18-H<*CxMKIJomRQUYV=nV9} z3)&hFO0OfN7l6AfBp>u1H#M5b`*zsm_`3v_n9O%iz02)0WVio~1jk`-)V^+h{rdrW z4T#J9AKq|T{-a7EW_Hg1yV%zHk`vi)rvCO};Puom(4nAffluM1p)1fA{}^I$vVjL< zHNTfkn{1+0x`{{r_*YV2VLb7Ka^-oof?DZy=;NTm=n)U)dW9Gp%3DhE=8!??Kt4N?S|m{tL<9dL;P9i)bQ|5<#|{AH|((&RrsDKhTqSd zE5|?1$Y{8N{>1g@4#B{m-wpB_2V$6S;o<&$HyE6FJK8fV3yIxJ4B7wHDO7iYjDOQ% zNH1%%#VL3H`FVS0rVELe=O54l$Mavd;llRTtp^6g_~-i$?{DaL$LKHFnO%t{#>$hM zPKh%k43C7+L)eUV?-5o*O#I`*<{Jx9`^acI(<+6N;+J{X?GU(~OIz?zxg>eZb_+*w| zins}K$eEEKdG!p~VK-zEF`g+Ui%Ln$=J-<8fnY}YAmCjFSR(w7!i>9 z8m1`kXv64=iIbffA5|A5+i(|higcN+NY)VTtdxojDPCi4%DSQEX10D*aj^zp&?PDoV!nR@(^bQ zra@%zi?Q<@Og|t-w)EJzVNROUfYM{lWX0L0BtxyBPO#{OxjLK3b$ZJIRSbF8`mzXX zNo+To^o=-F*J8GFwMt5byaW3r)<(Gy5Nr(5;sW1Q`F@9XS{SIID3it+^n91fZ9((r zK`6~cFa~E7Jg1{zYhwh?)dqoOPnoS|W+YO?+@sv$u@0M;5*&Xat<_(-!sqa8qL_$s z9j_?OS$SgV&34N$XzU-U=q6T^2BJZ9&Y<9+46 
zI%Pa|ovs7wmE`FLQ9@T6%R4Hh@F;Jsx`V|ZK2k=s&PFm|Ur_n`SyDQqMSRt(F zl7yRH)>3h|Ld6>^Tll!tO_E|Ar7&bl#~9xt=^bui$|Kn%Qu1F8Le=GuVW^fzUx=9u zbr;`FQmLvO$%Hz+iLJW4WKK`RdV6I3ESb&b)zY0DO|^T>i59Pptt!q)l{A8W$_`Lh zHDAbnB(9;k?g;$a^E_a_HMI;h?kxEIrla?RR&34M_VsTDF z&xE{OfC@NkZ6+SJ0v8?3=dsumusj0*IG%WfAqN=S5m5K6fP>QI)DnFN zDL(!)lD~nEWgKZd>3ib+IKL!|hff7K2MuDtFBC%L492M9u^ybU{4l-l*8+oy2Sj(v zaE}Xx^@oViz5n#R>oE6Q=j;;mw#1L%-boYti4PxRLx+o*75>Gg-eWXA6mEGA#86kT znCaP+K!!owoMr_XPN{h=)dpm9ZLo2X`Q7U#r={zii5>uPqM*Tj!x%HrDMs&8n_H!> zS(9kNYJdjGq3{;ALeCzSP$U+PdES!P{;!Uym6g1HS+~u+L|pbpSY5m0BQtlfdukFd zOV_X>vQ0MA%_kCAj#DWMEt*6%tE}-ws=TGoR;n~BpXmtp5I;#os&f|!-8E6}6~q}M z)e;7OX_xFnPTt(oar)LgtUr2rW?F)fE6zb1{Q8$A3m2tVhYO<)!*r%hY(0iX5FyeX z1W3^7rzNtCUKi=c_R<|UG4UwD?XS{!%FYmTcWf%oNOYVxYPZAOvC3t*Qk2~Pa`eJP z%$}(5#gaqvPD-v!I5YLpllXYu`zs=G!_@_tb{+8R;a@D?tVx+@KTz0ICQNOUQ7}|0aILuJ2|FV9fb&-jTKQPE4jWhT2Lm1Vbsql}n8f zj~!j9G)@{)?R=az(ZQHhO z+qP}nHdfoVZQHipZ|!r=d-q_+z5C~VKW0T$)U2qetc=VtMtI@b2{Ju|-tM+4;k^fQH4bx_cw7Mw96 za1_o&H8Za&oU})8#4Sp~OOm5p$94EaiF8gB!V*4us;hwzj#BzC(PjQ4H|x~=!^IgDpqq!nIOG=)Fp#8^lOJ0NDb{JhhPGr2)qwb9|M^gd47FZOSreIl{#Qx!+ z?7dkDomm8+pZjFZzWA8b2y;&n`OGr4R~+=f!P>}VZ6nc^CPkb`RQnJ42L79;tV)eo zIgv$fs4Bw>8Cc#pVptik{RoNdef?2#zI1?=%x=>vPt^rU8vx=q&DHzMZ%_v!x;s{= zvN+V+WA*xXf1hBRC8%FdTyxrg;>rC2OS&T5X71x$fuodq!-vZ$R4dEZoeGh|ih}Gr zy}?;EBtr-6vL64&y*4MM8@2QC%~o$)fOFJDc)?Oy@sfO~&j{jn57Per%|O59D4280 zLEF{;N88p>xd$9FOOLzd1{2aieqo!MxV?q#J~Igte(yNjImC^f9PQWTQhf`gzUH6Q z_Rkd6173iAVSqkQ^5Iz!j6QC(y^)(=9wTV^>YpC)6n>t4(lABUXkFvw)#a%_tm=dJ zYZ=0qH5Jw{?#p3i13+t{{-wPlg7l+hSO zv0iKY&rKiq6)Y!jYwVhnM3*Z?mJ6gfEHVU=FaC_ns|-=4Sqo9?j<*Kg_7Ad?+Zu3*rOM6Nv;uU~$LtLmEVq@gn!-O*^Ty5%M54zFG%5on^T-S))y zZ%=y*!N7YK9$yH{4#`y6X&Grn88S7zyj;KT+&#WN21Qx(KSnO# ze!i?dIGm|EqP+xtE zBE9dl(}vr#(r*f=mV>#2!^3*tA(XW{*pLz`vkMOBGAgqREn7-`$|<9k{kh4&hm7-2 zoaX>EmH}#SE5PI&uBuGHeo05)2ELj^u?N*Avv1grjl=Uu-r_2)Y61Eg!Rpq|QlUHn zZsbFYGHM%wz^O(`JkW{qWXhEMcV!Jn}A7?&UNt3y;OWLh&)0dHzvN^|8VE7LWP zdcKsauqZ|w1=+UyvOqSlgViCNnH%dtJL@J7p}0b3RQeA@9I24t|LzNDY!xp{>ksBn 
zx{w~gBUt^dPuC1PPStdC#RanZ>D`u8iTOwufv_;(@X&4pz^x+MhuDKi3BtBfaE`fb z@PugdMAa*4CoR|@frY5rC?gJ0;QgG)xt@UeUMj@s zL}7tgKpB3{UX%uXy_ zacv6%EG#XFM<%fm*(e)9$HAY&SL9y&bP75_j+mN0sMcxRbzQ-Z+q^jk?Y@K1U%yq@ z)!rAG4+Rjg-HwhIDEYAOszx7!tMECxZP!l^GgHP0!@J1wdukLH#hhce-X+S~)=1VA zI=L3X$)+T7+!J^4@f)6>&fmS?ooC_3P4KzYdp?8c;bTt6hwCaa=TH1VNMDSkMW>3< zi!0dlGJ^EreC}#|JVc)4mq%RlFRC(}(xt9Z53@zAjHzzQ1!Mc0{^x}sH@^lzqE8SL z;CHF^j2Rs3zockgjIU zsyPq8v#s!Jh^5}eQz`IQ!g!M+Z<;*!yTAV`=ViGAP8^4v^kf*O$HB8XDai>wqr?TE zj*@Lu20+1y@rZr2|ChXCpX+sGpH&@kb|p)n@@-;&ahrqb5@wHX3WlbKqQg^v)B+fF zOSbu|AKioN+$KTBX7+~`yx%W8JiC*+IQxm`PnL%ndw{y3ea(G2n*qQ!%ZGGIyCLLJ zkO}a3_bALwCoa(UTVUf#s!8YJd-MQ9t{lHIDz77!*BZcB(v)BQYC}$w5TYIHnT>s#!i0+(Nbj0b3z2IHWhBuEQrbR>d` zueFAHg+M**6lywOdGz{gs?4XIie$X?qf3^vy0H)r1MFfQ=ti?R*x<%js)Jf5 z+@Sr?UH!p4GF_R&JKTvwT@PxRZV5Lo*+&7Wqb4So00P-z(DH7DwGZ*MM5oBf--EX^ zwMy;%YAzlX#NeA;nVkL?p=M;y*U;5x<)No-?`pJ>lWKZr*bI`=_ z(~{XDniO08ljE`}<6cv~EH_OrQZLG|QBln;pH?Ghed)AEFBVencFDSW=!Yz+d&6-! zX}tEC24ivA+>~yGEn|kJq2elZUFqvjw2Nl0pkLkk=#fDOVak^!qB4;x__Tws8+>OM zbs+d*`Et+j1e}nWd6y|DrtPW{37+tA9SxU_>Z{w0fMk?``=_07a*8H*VX1Dj&h19k zy>-6$gdEYE!%P$;LA!?-kxQsYw$4dBmOI=QR<(98IEn5SfIPufxZ)=_uv}DQMN<%! 
zm=RU-@K*pub|7tpN*jgz`@O-oyBf2{b{sEuFt5e??3*^5Sq0J zZr<^)(pbwXJ1kJKT+-kqm}OfW=p~sMZ-_g3f-6Y0@Ty6aIxiwsLc{WM+vCku&YN$( zD;)~D&o~dN_v3Trni~Q(O4QgMT<2SfFc;YEuR6noYw8wmpJqvWRW zPhN**He%@(fnP1tJx}<-^TF?l;tINNd2Wejum2=ANqU`w_|hMrsW~kV?`o&=Z86sU z0AJ!0kNz`2`!AK&GP2YE7n`0dZOPb!R)oK$wYTiO7Oy@4!vM`4U8Ptg=6EMVCa!*X z9bzPyc)Ywx!YN|n!V2|9E#TMNf^ez@Vs$vW-pWN7!&gV9qMJ8x(vPx)Vur;{7B3IB zv_C%iO3dY72py-g!$H=YPbU9pSEHSB-SKgM+sJk4>F{(DOz(W${6gM(4b0cb2ziiv zaa#HQJ{r+}8EL7Z&Gk9#sQGa>zgdd9OjAi|;{JKRANum?-CGsWRr12Qe{n@7@x;hyyhi&yZ&lP?Q;9*^!yotLXNg~ zWz^qfzVY3;3T|)A946z}y6IMrkZiCCq7RbKv_-j`x58YZ&$kKcUVH6sVh9qeJB^q6 zs3{G>gm!&*{}I|KYRK5Uf4U~{r*_s-SK_f!R`2ZyYey#!s5N*@!iJOl})S=PBtel z)uk8cu=(-&rJakMu47n8>pE+*x(XkbxpxS!B)AC|>WazzwxF)zi*8%g0I%dnmDN$& z_D3HZyQwD#^D^OKPIrG0xGkt|sgyYjilCRVDK8OtF6_B&S_`_mWK+ES7XkTEu{w(c z8tF=Bp2P`WS^HH`advCELn#)t(=a8cBg<%$W|!E5P2Q#IIdo!C{=B(V5y+Fp0Q#1U zbs68{u~hE-XSRgnQU)w|oCM53k%$cqb8XOimvKQ~Awi5wyLzX`pZJJYVvr>MPtS&a zZUvN0+GZsS13$o{c+fO`#?951dUP?}`zL7;Z~B-p2=Z}Q?93|E_nJaf4lhNuL zq30gaY59V~HpPO3vTRpoyF->IgJB`4wqKE{Rknm2-pS?pRd)urtB!VYRTBrh>^Q`; z?aP|cmou8)EjD5$)_KSt>@#ztdFC=<DMV(~2Plhf5iu`(5}O;rTI% z%S9_SmROV!mtm3;BoqAZ>Kcn3VgX1%IAnn35UNS24H)RgS!-$(IPjO8*(}KBaeQI( ziCMDIzfH=z9`uj6Dw9q?YnEQ-;^If$Q|gi!PdD@BCYyxIIG5a`$Y)4#$YG`jvygs4 zI^Skg1Lm?7MiXF%!&`}2IBVp{TxO}Yu**`Cbr1=NM(=bRpwFfpI93-9sB_>vDgjl) zz#2P+!rgB8E14Q~bPtWn4S$6|_of$y%q~Q7><{P~86jw9 zJ^t(etJJg%eMt#(k$nmF>xnpkXZzKuqkc32$~;6ehhCU#6vYEl5R6Y{@1?Ca`jBa^ z6_Rd1gk!xQ-JJ7rARuJdEC)D+e`u%of&vT|(*lB6r#0OrW)1}3fL~AuDxkkfly;rG z^sFNe)tW$|cwY}}@N}e4o?y2uMVz2H@6Uxm3t()B8Xys1$e_}~LSOH15h210K2lS- zdBM%za7fisw07D?{TjfVHl5A=(BoSvQZkUi2v=a&7*^uVbm;7sA{+$u_-HdX>kroi zDWBSYN;K_gw5@|@c9^J2E>Lp2*HlG-U3$d{iQn0_)fnHm+Yw46C~Glzw2|A)Ho9l{ zLRn?r%Kj;RTk#qE2=q)<9ze150M2|SonSL${XqdqA~IgS5Yk*+7QO)_tr4L~Xf}Wfq?dF0=-g|Zk86{?A70Y{IdND=TFs>t8M6a z7I^ns7LC?ME#z+@0@2;DgOMR@sE=mrCMd@Q+`+5*9lKUZ^Q(((M4}OGPhlIv2U0n( z-x{8kQa41x-Fg%2S`CXea7^;xc=m>Vm8R3-NJ-zVpMhM7NS5E^_j_I-bTed)4`5)_ zF#Mwuj!j&rR2sr0mkkJU+!B_z0)BLMQAj 
z?XbKucuOEu7)+O#Qxp~osOh?%xgzViTM)%@})e+*B? zY+Q6zdTey3N$#(V^nMN)QHH@GWWqHD)EuE$_z5EWTzy9tDXnz;n&agIdtx-dk>ZL? z3^(9jO2DIf)xKUCetj@${$P(+j^@cO;ussR3cCifTc~bPq&0{!^Gj&bu%bWZkI%4M z?#IH!^fX>J`oFQYvrC*+C60{x)q#jm%ZaUuEHICt$c$~i#|>e+&~N7!fpd$*h`_!2 zu`PjfXK$wb;Dkci)KKikD2A|)YOxSdsnf#;g9~Hl=~LzW8|)fgl$LJXOvqo*(0nd6 zt}$98Omp*O07Bz>*LM%ghi_DsH*YNkWY!R1T>t`^;(gW<(a#Z z1}>mQfjR5a~;N6t|;UpPks4!|H5`}5)0gu)5=WPUx>SeqF zH-LIWFj{s__A7w%)C6btA-lLTG!TDm+MD)Z)y>wg8%+=Dgn2C3CI_N8j*4v!bU?UD zJF)6XQxy?k51SDceszk}>W2-P+U=!v5E8k#B6>$thzO#iha?HMku-xP6&%p-BG|L3 zmpDM*n!QH(9a5`KLqnnIljpbc^TmK>(bH*o8=K+YDhP@i%}D6SBd&wS3k}8CMwX2d zWfog{mG>Ar=bnFDc~e+C;Wvd!qE4A|u?k+Y=`iqoL?dTy+IMJazG2rXZzXA~sl(3>ty#4Z61L~Itz9$N2AlVS+&=e2iR)({gbz7pK|L-}I4zQmZ{l>!AFs|L z&}~A9P8FrdN%!QEDDSj zpeoaOUU2<{5ryglcqIiX zKvT{R8d=c`N_kweJb)d!Cy;w7vnz1wDl^Q9#Nm(4&mXGJZmY9Q*69PwHW9&wM!UEG zPf^IYzk$2)q&Bi<)l-`82_g!PX+ReNvDU9#ovUdIM^_5th>#Gj?EOVoyd}U)i~o0Z zvT!|ae_dpI3oSjV+BM(XotfbDp`RYlF{l>E`|d@(u+84^N-88H~!mZK^P^w<{Yn85RnUDc+GGmr7mH2}i(`vsBnCiFP{s1vyrX z+|M&v?2>$Pjh+3$H|sthg;=i0yn9&7I^5D^*|;uS?sIk|T7NEAKcNFI zKcMMe-xu#aKgs!O3}Kh6c(+w|VnIs))WoCf89Lc2c$vdb5W~yF2G4e@X(Trw&Ln!` zz8!8G;oc#WzC`p>{GPtDHV%z_vcM6xN8X9;L^QfykW8;c2lpS^gk_j8|0AoDeBl+? z9r~_nNJwa{Z9te^Nl!Y0N4h=sAks8J7f_p+X3O?6Ix581m%8lIoBmjQW7Y1STvIH0 zG6*6cexSaa+7{`l>2cQaJmI7G8knoVSr38v;T4l@2KdN=6H(@{)dxPh>=Raaf+tEB zpn^{}n7Du9a3$w!v!LyV#z=8xzydG%0E^5=b?(VdLBKkqZ=9>+KR^JU`7|$!#R3-! zS(Yd?O|1=^3_{ykR;=x432;O-j+#wAL2{DDJ9087mp19&Kq;R_GHp=-VoU|758K;D z{3^q@Y9!uM@;#e@>4SK`puU%b#4{J6niXsy6pTt}0P28%7!VOaioh^{>4QZ;^e%F- z0LU92M~aZ(*r$IEtXol^Oc zZagBmOG5ETXH)#>f|C5;zVc;o_)?au^K` zQ3xMns16QISs<`sV+L*DmeX6y>oeuS@sT!Z6#v#MZbKCV9J)G`jzU?*EW5rE9g=Dq zi2_?ym$IC>!0w@n7C|JJS=O$WW^Oyb&JqSttv1(PrBF%gZqq-?X5JK6a~wRx)v?1h91!?~iW1sMemC6y57Ju^AalNsK=g@OiHxYse_Okcn$r&J$@(?-4VWK9cKtD2>R!#+}R&vUajiIrg(3N!5zLY0c0y+=&Ns z#h|lE9wq((4RgKV-zxy`le>qv=(7|Y_BC$fr|4)dz}yV7dIEHH#HZ-03~!) 
zQqUssb;{v)WeIlqEUqTNN>vXGmL>-tWl(pAvd6LYtv)!Z#alo!X*&y92{GnR!$v5P zXk4`kLI>c1_qVdE4@AvBOm+)UT^bzQFcDtUm4~`6S53jhqI^+s3Z!Mdh;bqFY~jlX zRB0bAgox+^)|i zu~7HJv^&m&VZH<$av0a@E-SzDxtO?P5x#jmf@%phpkI zx`=XWAAb5ZYnfOPX=U?QSncYn8f8*}1}hS*Lh`xKhDol#?6HU3C|yQZtBrkW7WRgO z=AjqIO)CfW?C#CdiIHZ}#U!wlIG%Wk>6OZfx~Fz|u~Dk9)=>e%;i))ZJ&WT{B=*xe zlH5gYv^qFy5w~G$yVS6`{Ae3O^~CDhrHp(2v&sFhIY|BYjLQGzMthBe(%`>IRBNR^ zn!ok$X0*r|@+8M`zPir;%Z#?k^K4W&?&eZD7C}Tb#r}>GGYF7WbYCmbndG|s(qx%` z6y#+C(f-=(QGguaEq;^u9xOES2^c51vq9K}sc-LMoph@P|8qC$cRr#u6v~LxLItd? zhe#YKr!&hNjZNv`CoxZppB$>UPb2;IK4K&PGE=^s@=`$imWIF4k2@!Q)UjrR+!Qf! zW{Ho{%YxL=SdR1LcveQzon-K5E1}o7+%3MM`xg)>nGN1Q8>N5gDE|Lq*K7KJ$@Q9% z?!PFn+J7aN0X%4-mn;EPF=PQ=HrbMrPP^+KY;S?f$?=(4Y7EHBEK(Zaf2{Ysmta(>hK^9^T&a`uv=kR$etydP+|R|}51 zTdGAisu$Ywt3r^U9+U&++;JV&$UX zA3AH|*4p!u-H>EzP}a)Z$YIoSqq=0wGIvEDUbWV~}9R4D8rOg-MH z#f3UdG6RNkTZctMoZg&u7#)IjDs6)X91H0a;5Fu*wVjN?IiC&Ox0;xg|2b*snf|3^ zmw}#@?eXH(B7jZ)#@S_>3H0!JvV+n^;S6<}ps$#S#I8ks8f^J)XD{#s3zz z7rGK*!7^O|uQ89nn!fm8JG6o7f2s-Do2g)V`;Sh(ZSVE3t^=)>cF!>qcPKpV$S-Tc zUNJ9Ip@*Zd>z3S%nb#;Y_A;NtJ?;Bl%8E4jD)_J4&NCk$_bq7KZm&)4E|jSeUze>0 zFg0Ki&Qp=^TF*dUXZzOLlH_TzH>LZ({d_#BqKzWS#qrc^)>C+LpMimv=!r3rA&d(a zfm6+ie~gF6!H9*BTVxzg4>MU~>aTHKR7F8yR;C8PwG?dku2EU121>V>fo*IuLU~w?KOqP7BzsuohCY(djjrY_V=ETa3xw&w8DdDrM}gvM^Edh5=il%(FTk z6ZbXI*$6fu%JaqO>r*I>cds>yxK=B+oyuH7lKX0x0?tH)9e)rT&J>7k~8I_f09v_#i?O@x9F>%A9gXnqLW#z(|sUbj9 zY6Oc1PULNG@Z((S2cF4ezamP#(TJXO_?5Je7Tz#$$ZZGH0h-ltk5NHM?0@Xa&hJMI zIzcRXLMVDzpl&#XQvxgB!QUb=^lp@vkJy{B zlDA|j1d|sxNkx4t^|KhKje)a5(-d-O?A-C z&HbWaL!O4zqsYK0%_?FtxH4AEF>4T}Jk;TpUrKZ7*0)lJ=FLAqi=$HbrF#FT?nV6# zP8)`fQ;1bxupPmB{D`KdXejI?g_f3kjDA>{^q3{gR6X=I|E+oR5{Sk1RGi3sah!+b z3~{i4fR2TPFZ0j_?o*inNGlIC%Z1i26h33|Jcl?#=!85TxzCN3fj8>41&lYfm$FV2 zHZSqBPfF^y&^Mj~kH7L{NA`t*)*l74Xf@GC z+QGIEwfLQjXLUfnnKa(=X3UHstB%REWyA!?fWXFy_Ka7CRrURm!J|rK$A`RMfXyx% z0R865c-;|B>tD}7A;3G#i84ly0^R&u>IbG{*v-sphm9t-r+ygV%GO z$LW8P4{Qp^*}*2n6=kH@X+}0C8;Hx7@p@M<#?3^EH<7K_#x^ixvPF-AU}Jrxu4rdU 
z2S)QHoY?r|lHeM|D8hymJ#EC&XwT`G9vm9f&%dI%(ltX!^fZWc9Khs7xLfXn;g1UC zj_~Gz!Jn%=_j-7Nezs=eD-I{Cn(#EG98c59{yt7?O1WyRT)aBwgp%9hMa&5O$&ed? zG!X=pYPs`ExY05@4G!T^#CShkQa!4yw))8dglR&>c*mZVQ@2?@GD+0%LBg<@DajvS zM^&uhc2I{VfCPpK9w3Ms_hjimg>KdB?;BMd??5u!CR9dqKB=d|i2zS!-oooth~11T zmPjZr=Oct}a*4}ijK)pLCZK~6&*BGkfu>l@A_$|9vThoc zB`@bmEkM}Ms(Cr3jJ`jdIJr#!Y^(Y6<+kxH>x{Ojp(@s^VfIyuExRhjHEy4>K-Wdt+~=MR=hR>QhO z!*)P1kX|z?iSq=xI5t{QHh1YAWboVqcbzE?0m#_KwbUtrxCF}i@2q(5$=XwQKtilm z0CkD>=mB7LFi2B|tlxSA1*m^0T_FtOk`g=0In=rI2s*;J^QEtGnGf%h*;SN@15mwQx{dH(MO*yn4n*<0K)GmOLM=-MrwWGNJ#wnEmLA1 z!jdSV2*9qrddfvGn7AZXP>o2l!5*zSR`*+EvrbE%m_p|-5&#t<52av6V0}*e0jF7n zIKomQ3*vf3CXeYq9BNSaCfQnSxFj=Ae!Bz}oTc0B<5mqWve-j_C;8%4EMQ4k)Ea-3UP-Y%@lMY45>by3L5 z86!D)dcVe@jzST?nKOV_DEfp26?l*%#{h)mHTZ1dEeQ;P`dyJcul=cK`=! zQY~iWPiGn`4qdOAUjdy`nt#@0d2UI&d^i=&(^#Nc+Fm1p&W+Ez;oIgpq$Rgl0cGiuP}7!_FaMv%pGof&B7vK<73lpt0`<7X!4ZRFnMkEHA5E zT?rT7e^0BZ{3mG@r8X4*Y?S`BY(j?r;@Y4b!w)+^2LW=^*^gT`m{FwjQ(EpI_U#K3 z7gvsI*$5q(UoMUL=^ejb0^HsV2f~V@u$PmxP4hZp**%7FH@|18=Q=<@s`d(Do=G{( z*1z=7hu&-m=Qg}2vIW72IeEOrc@MQh*G0i$}Dp*`|5(0^2U zUF$N9R1*yNUoLG#j?i z;Te$!J=t|wV@Th%*-iT<1dGFA(kf3sj+SGa((XZ$Wc z0t3aTyv}v4rqL8?4Lzi`mqNJrWoz+&hA+%;;IslPuKXvcKsk?xnIHWt#(rOxz0t^b z8b(_oo?W(rxWYmo?c70)Ys~&rY3pAy8~MMO*E0M|+8pzLHA_)WyJ>3|GL4t5E3OA9&Xf=94rz&<+AP{yNx3(M&u`L#ksN+4W3KHz#PX$G+n{K*gSdv;vldwElr~%oqVbGc2E|ulo59Ur z{T<!G69K$dGrH^);a@)b_yv)ep`OAl2yBKWP|dXZY8sIs@~6X6o8;eKrGs_qZFc zA)E!lpu{q6d`V>rNuI_(=g+`&mD)uf{FYv~JE&pFV(flQlU!-SIU6l0S|D#@+fHcu zc1Gx?J+=I&_D)2wqP5e5(#cy=91!WdHrao{4vY>NC|*wjr>Ktpi-mWrtbooc0Vjt3 zi{o{|bqf5qoj`%HbXuy?brWfjpx50Jz4Kp5AB#+hxnyo$TI-Vtl zd|7!pKeIG-++|=3!k3;8Gt`rp#8_%cyGhGGWH1}&$^QRO2J8I#cNq*A)Z2#OpKNmf zhe#$7a|cH!yuX6Rf5h+N(X%qJ(#qg5FwrWx+Zof!8dxYgS^wuDywuRP-C#rcqCBxb z#uwD_2?v1&4(GmhYe|RMzkbFa4Ya5z_ZNaS0yX>j3P8lBtozJgwLyxaWH$JHpFjXN z)0Rqv;2$W~CyXJ4QS=2DfmiwHYab_pC}t;SOpMhM7h=hV!!I)JKPu2378vBm5Dx?i zN}WY(oTqKX*WYhO$(KNwAmN`Ba1;+B4k($zr_d+D)39hK^2Q1_7|lvlWm~+6cjjd% z1)|U`ar8j`lfwi~uB#Py<)mi2`or`=cd0uJXT>I7Z=mt1*JW$(lRPv0AQM`QwM=fO 
z)u8lr1X{W9-C~pal^$)HXrAK2QOW!D&ODZAl{IP0CMo-}(fI|iT`$XvlTg}s;j*Te zF$2YWylX`_5fK@=_LBLF-)???Qo~2{yEu1piDAtV~7=XW=Ew|s(k#usa5jAnq0 zdu@k}tM@ami3lRfW4!YA8-{c#L*5*oT$gjW(riY3K0RdL=lvAzA@>Ai;=9-Vl`*g9 zv{F3T0jJeoX+R52$Bk25Hh}CuK3_0(v~C=zz7lH5Cp%GPz>ZJ`CBQ-AsN)-e*L{L7?>11GGN7_(5c8^ zhXfV)f@|auDZGN~!@T8?#+iCdZ={D?gg#TCtHpTrp^7R6%n~0UOa*84Fm0D#3Q#Rgr zS6~<|8Wa!j^G&CzGV<%i46C;{wVQlhu)4Z`&}cPAW?K+@1h>=5-mD+qaGj~XvsPmN70;eis5b=0%im+kn=j9J zR0GWpYDOOF8Jfkbikdkz%asbfRSm!nzdrrPa|rgVwevT=I;fTE^4Hb_gs4eKNP}mcWVBJDXSo zzf^hRji`PzC+4l;Tf4yc&N*pO8N5SNZ)}^{*upOFUygCstl{; z;*{f-c3VGJkG6w3YLZscUHUvCT4yv`0M8T}agce=5wIkgp!7bdTCa5uvC4$nh^LaP zvc3e?ZOTb~d34nU{IddXdZeB%EJY&io@N5-PTk*ph2xM(9MmFtw$W)`3=i^L<<9vu z8$WOx{a!l-UM^c(%;^O0JzKNZRbv!bTwgP@ciQOr6WjV_cGJ#H6WhZrld(+exnS=3 zNvCQJ=tF7(fhpE69#FUdrqHqBx)GyT$u&B$w}=2I!_loD(sdJE48!K@K);le|5I0p zayE{%N*PnMf8>6-taId|dY_6NX;oLgWU=!{__wdw%tYt-Lpcxk(x(A{YV;oe+Y;us zv!y0zJn#N8OBHY16_Azg0H3vT6_7(Yk$tO0OqNpB*)#Xzyr|NC7%AbQ$18}BXAh7{ zYUN0!A9SP#1J(9etsCipf0cEl^L9vB00MH9Qe%hOh)&r0Rm1!b900)2k40)g`~Tb^ z{}YN3bTHO;GPku6(swc@6XIZ?V_>0UqGzXPW@2GxqoiXarK2Myr~S9PY#qo1%^Ym4 z&7G~uX{BwA{`Vh|bFejXHZ*o1vokU=v~$LDp`&4=p~o{ab|L?}vl`nN*&0~u8}(=)NM)3LHJ>a!W?vof*jGqC708ym5*8L%=Kv$FBv{hx2qrWLm_GIrCX z!((S){>KXwTfBb^g2Mlc02tUA>1Y*fZJqG`e%Zghp=j=5{Exf;9TKQ17@N?lYSQB| z;xTCdXJo*X=GrOi{5isVfaSS8(N*nzHPytrhVQ9eKQ=x+DMe(In9j|Bm`wC1EKF!z zHiBsF&0NFY_is{?z0VI9Zte*CiRbi5=g7quhpu6ixOm;3CHXMhfvQlWfi-=ynobn* zLfFz^qXAA`oSHlpu?iwo1pAOx0n$a7`;e0XJ$T?!g$3i3l_@(_gqJAG;z|0F&NaxOwcMEW2|U4WW06*)24Q;aUo&!kS!377$#D;w zr3pOEXtzwNfg8*SD?B4@E&a_ttd{t~n)ZridT>4P>P`1y9`i#7q*Dek!x{*x+AE-~ zw+CI@OICeTR7xZpko|H&B`SpwlU5Ku=h)R`365rXH9K`X^~X*dvegGl>lu2}9-gMG z&{>}?pCjvs-0J|fwG2n=uAtfrM3)@46!f-uy!{wH>loW zeP9mVIW)merqaLxNoEcf)fYhb2B4)2F!iG9#9Vd5aiG|-sRh%{SeVTK#w)sV&3^rD znTRbJYb}2Lt$cp{L1RnWbEEu}+8qg26L!Qlbl{|~g*P*)qDn5CJ1x%&qfPOb-4eB;y`xS9;fu42E!?A zoPEoa@80pWoi^5Hld|bg&<2*5E030Rr5EDd^O0^aa$9K)6=+#IrV1~lr932!Gs+#b zx?2%dk#Y~$5~qnmHWyDn2exM|c|>6Ta~AOj&6i*d@7M&#;)jXS<|Zp0`jI 
zJ=K)1j@%))m4>Lh=oYIweCby_^|miB0VAsYNQ>dzd`GLYIlVPKp*xcSxpT44wZW#b z1CCF7?qBC&PREje7DA)LdjZO~;FOX)KEc87e(yxS+o!_@xq#>>7wk~9?Vdp1 z=H_1@{TQK(dCh$7qXU1pa*&XPMNoE4Ge3=q3K|i3MON(fe#htXPkD#&-W~2IXtIFK z4baVVIEHYx>@LNaYID9N#9oVK$tZN)9n0MXygEd?v60DVKW=q#9v%3L&mG^1U#S%b zu(BX|FTiFH9UI_a9tTeCc|W@MBBXPc}r$x{z*V^Og?8wWpV|xxQ3qh zpp$hHRJP?tj@G}@ziHVOBlpE6KPNMH@HMy=DGaq?v+3)~ds~(D<`!Ad&m!2ggRCqz zUD6M&Hm6<{v(-)=w~uCq$rQ`InB%YRCNW`j8jey2)y)AU&BHgLavX$KlnPnkGy#z? z7AdbDrVcIy>&MT?H}vr{{6ox()AZOLi#t`bNjd(+G*u5-mYx~Z+%c9xwQyHpQ(`ix z^ga}w$~Y+uleSsj&UGK1SX`%ACdHfV@vsoqlgpE7`JD-tONJ_Y$92rZGtMP+H(BUN zC+zruv%RXl^!N#k?c!@;p<9TwwG%M|b9JKoxU)s`>*mqVHL?$!EVfHZ}OB+{<1*Pj}0lx6&nZ}hkA_s4Eqzt|@jdFmZEE-#dw{mhA(6+#=HA_i}YF2j<1 zx=4?cy6t_)>eW*7J`Parr?Xp%ZH1rhy%8_4Eo!oU=GgQ43Y*2ut7`H!!R~6-UsfMx z-_OJ$&6Ktl@A-8lsLB>#6&)XYJCgt<3FtwpOkKsK-tO&Z;!UeU;#4lKe?Z;9CD<-sat63cH<7KqV4L&VNj-1=Qosz8S+^u*Gfjb%5 z?-6T+ytmy-U)aKiH53*oP1YpzZIc95qz^IwPXNjSHT`A@F3Y@0Cz`1^Ju{8UG0XPA z)%1&*Nxz#lddlMLaJKYz1#ZF2k@t929XeMUK115Fis~}+RqYFr zs%Te}YpNmFl%yDzms_iZ3$-=n&nwf#ww4@w9eKq>W`Qq;gY^aQY`zfw#1~<*_ylH+ z&xI!V;;O(ray9w%eOD4k;oG6BzCC&O9oeUEDV1oYS=I(E@-3K#Zn1P}iIi}u*30Be zm(yQv7Qb~X<=AHvF}Fy+nj>#H7jNL^-Gd8w3->xI?PJwzu}owiqj+w}Co)0IM!qP` z+i~Lmz)fGnEeqkc_lZL)eMP3C`>c^;bW8!8Mm{qg6=E}>fi{p|%~Ts^rXK7)slhcl zy6f^nH|TMDYlhCs{oYYV{?eD7)W+V`m*YRG{{i~K?Z@8i3##x3>eh$(;eDK{KPC-1 zE)6*$CvcKo`BU_`|5EC4fj;tQn1KFUblZPNkN9&azdz6Z{b}5hKlBtH=BwEsk)!xT z`Tdlb_|I`H{-|o>7T=iuHk#vqM@jruzWDrae4F^|`hVc2zCoVSK|Y1w29^p59ADP{ zvHop%EOmNFPJ`@JZxF}S3*v9V1nwN<#uEm4sk$IP4mQY2zXUNq35GEZgW>qDUQl^NXUn>u)){X8P?VGQ0ZjAibG zaeP&UX}d=M0;LvH*)S{L7vTWDp~8XWH|&pAgoCM#a0oRSzKGWf$I;iJ=4Ln+4;D_K z-@}QhM>rXO8+=T)gj1M_;mn!F{9gb70RR8^$a!`OWo~41baG{3Z3<;>WN%_>3UhQ} Xa&&ldWo8O9HZnFdG72RnMNdWwMn2f( literal 0 HcmV?d00001 diff --git a/src/QueryEngine.ts b/src/QueryEngine.ts index feeb372724..78617af710 100644 --- a/src/QueryEngine.ts +++ b/src/QueryEngine.ts @@ -30,6 +30,7 @@ import { getTotalAPIDuration, getTotalCost, } from './cost-tracker.js' +import { emitHarnessEvent } 
from './observability/harness.js' import type { CanUseToolFn } from './hooks/useCanUseTool.js' import { loadMemoryPrompt } from './memdir/memdir.js' import { hasAutoMemPathOverride } from './memdir/paths.js' @@ -212,6 +213,17 @@ export class QueryEngine { prompt: string | ContentBlockParam[], options?: { uuid?: string; isMeta?: boolean }, ): AsyncGenerator { + await emitHarnessEvent({ + event: 'submit.attempted', + component: 'query_engine', + user_action_id: options?.uuid ?? null, + payload: { + is_meta: options?.isMeta ?? false, + prompt_kind: typeof prompt === 'string' ? 'string' : 'content_blocks', + prompt_chars: typeof prompt === 'string' ? prompt.length : null, + prompt_blocks: Array.isArray(prompt) ? prompt.length : null, + }, + }) const { cwd, commands, @@ -557,6 +569,17 @@ export class QueryEngine { headlessProfilerCheckpoint('system_message_yielded') if (!shouldQuery) { + await emitHarnessEvent({ + event: 'submit.blocked', + component: 'query_engine', + user_action_id: options?.uuid ?? null, + query_source: 'sdk', + payload: { + reason: 'process_user_input_returned_should_query_false', + messages_count: messagesFromUserInput.length, + result_text_chars: resultText?.length ?? null, + }, + }) // Return the results of local slash commands. // Use messagesFromUserInput (not replayableMessages) for command output // because selectableUserMessagesFilter excludes local-command-stdout tags. @@ -655,6 +678,14 @@ export class QueryEngine { }, message.uuid, ) + void emitHarnessEvent({ + event: 'file_history.snapshot.created', + component: 'query_engine', + user_action_id: options?.uuid ?? 
null, + payload: { + message_uuid: message.uuid, + }, + }) }) } diff --git a/src/observability/harness.ts b/src/observability/harness.ts new file mode 100644 index 0000000000..d60e0dfe4f --- /dev/null +++ b/src/observability/harness.ts @@ -0,0 +1,160 @@ +import { appendFile, mkdir, writeFile } from 'fs/promises' +import { createHash, randomUUID } from 'crypto' +import { join, relative } from 'path' +import { + getCwdState, + getOriginalCwd, + getSessionId, +} from '../bootstrap/state.js' +import { jsonStringify } from '../utils/slowOperations.js' + +export const HARNESS_SCHEMA_VERSION = '2026-04-19' + +type HarnessLevel = 'debug' | 'info' | 'warning' | 'error' + +export type HarnessSnapshotRef = { + snapshot_ref: string + bytes: number + sha256: string + redaction_state: 'raw' | 'redacted' | 'unknown' +} + +export type HarnessEventInput = { + event: string + component: string + level?: HarnessLevel + session_id?: string | null + conversation_id?: string | null + user_action_id?: string | null + query_id?: string | null + turn_id?: string | null + loop_iter?: number | null + parent_turn_id?: string | null + subagent_id?: string | null + subagent_type?: string | null + query_source?: string | null + request_id?: string | null + tool_call_id?: string | null + span_id?: string | null + parent_span_id?: string | null + cwd?: string | null + git_branch?: string | null + build_version?: string | null + payload?: Record +} + +let writeChain: Promise = Promise.resolve() +let ensuredDirs: Promise | null = null + +function getObservabilityDir(): string { + return join(getOriginalCwd(), '.observability') +} + +function getSnapshotsDir(): string { + return join(getObservabilityDir(), 'snapshots') +} + +async function ensureObservabilityDirs(): Promise { + if (!ensuredDirs) { + ensuredDirs = Promise.all([ + mkdir(getObservabilityDir(), { recursive: true }), + mkdir(getSnapshotsDir(), { recursive: true }), + ]).then(() => undefined) + } + await ensuredDirs +} + +function 
getEventLogPath(now: Date): string { + const yyyymmdd = now.toISOString().slice(0, 10).replaceAll('-', '') + return join(getObservabilityDir(), `events-${yyyymmdd}.jsonl`) +} + +function enqueueWrite(task: () => Promise): Promise { + writeChain = writeChain.then(task, task) + return writeChain +} + +function stableStringify(value: unknown): string { + const result = jsonStringify(value, null, 2) + return result === undefined ? 'null' : result +} + +function digestSha256(content: string): string { + return createHash('sha256').update(content).digest('hex') +} + +function toSnapshotRef(absolutePath: string): string { + const rel = relative(getOriginalCwd(), absolutePath).replaceAll('\\', '/') + return rel.startsWith('.') ? rel : `./${rel}` +} + +export async function storeHarnessSnapshot( + label: string, + data: unknown, + options?: { + ext?: 'json' | 'txt' + redaction_state?: HarnessSnapshotRef['redaction_state'] + }, +): Promise { + await ensureObservabilityDirs() + const ext = options?.ext ?? 'json' + const redaction_state = options?.redaction_state ?? 'raw' + const id = `${Date.now()}-${randomUUID()}-${label}.${ext}` + const absolutePath = join(getSnapshotsDir(), id) + const content = + ext === 'json' + ? stableStringify(data) + : typeof data === 'string' + ? data + : stableStringify(data) + const bytes = Buffer.byteLength(content, 'utf8') + const sha256 = digestSha256(content) + + await enqueueWrite(async () => { + await writeFile(absolutePath, content, 'utf8') + }) + + return { + snapshot_ref: toSnapshotRef(absolutePath), + bytes, + sha256, + redaction_state, + } +} + +export async function emitHarnessEvent( + input: HarnessEventInput, +): Promise { + const now = new Date() + const line = stableStringify({ + schema_version: HARNESS_SCHEMA_VERSION, + ts_wall: now.toISOString(), + ts_mono_ms: Math.round(performance.now()), + level: input.level ?? 'info', + event: input.event, + component: input.component, + session_id: input.session_id ?? 
getSessionId(), + conversation_id: input.conversation_id ?? input.session_id ?? getSessionId(), + user_action_id: input.user_action_id ?? null, + query_id: input.query_id ?? null, + turn_id: input.turn_id ?? null, + loop_iter: input.loop_iter ?? null, + parent_turn_id: input.parent_turn_id ?? null, + subagent_id: input.subagent_id ?? null, + subagent_type: input.subagent_type ?? null, + query_source: input.query_source ?? null, + request_id: input.request_id ?? null, + tool_call_id: input.tool_call_id ?? null, + span_id: input.span_id ?? null, + parent_span_id: input.parent_span_id ?? null, + cwd: input.cwd ?? getCwdState(), + git_branch: input.git_branch ?? null, + build_version: input.build_version ?? (MACRO.VERSION ?? 'unknown'), + payload: input.payload ?? {}, + }) + + await ensureObservabilityDirs() + await enqueueWrite(async () => { + await appendFile(getEventLogPath(now), `${line}\n`, 'utf8') + }) +} diff --git a/src/query.ts b/src/query.ts index 45af85c02d..67e18d8c4d 100644 --- a/src/query.ts +++ b/src/query.ts @@ -23,6 +23,10 @@ import { logEvent, type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, } from 'src/services/analytics/index.js' +import { + emitHarnessEvent, + storeHarnessSnapshot, +} from 'src/observability/harness.js' import { ImageSizeError } from './utils/imageValidation.js' import { ImageResizeError } from './utils/imageResizer.js' import { findToolByName, type ToolUseContext } from './Tool.js' @@ -182,6 +186,86 @@ function isWithheldMaxOutputTokens( return msg?.type === 'assistant' && msg.apiError === 'max_output_tokens' } +function countMessagesByType(messages: Message[]): Record { + return messages.reduce>((acc, message) => { + acc[message.type] = (acc[message.type] ?? 
0) + 1 + return acc + }, {}) +} + +function countToolResultBlocks(messages: Message[]): number { + return messages.reduce((total, message) => { + if (message.type !== 'user' || !Array.isArray(message.message?.content)) { + return total + } + return ( + total + + message.message.content.filter(block => block.type === 'tool_result').length + ) + }, 0) +} + +function countAttachments(messages: Message[]): number { + return messages.filter(message => message.type === 'attachment').length +} + +function asOptionalString(value: unknown): string | null { + return typeof value === 'string' ? value : null +} + +async function emitMessageStageEvent({ + event, + component, + before, + after, + queryId, + turnId, + loopIter, + querySource, + extraPayload, +}: { + event: string + component: string + before: Message[] + after: Message[] + queryId: string + turnId: string + loopIter: number + querySource: string + extraPayload?: Record +}): Promise { + const [snapshotBefore, snapshotAfter] = await Promise.all([ + storeHarnessSnapshot(`${event}-before`, before), + storeHarnessSnapshot(`${event}-after`, after), + ]) + const estimated_tokens_before = tokenCountWithEstimation(before) + const estimated_tokens_after = tokenCountWithEstimation(after) + await emitHarnessEvent({ + event, + component, + query_id: queryId, + turn_id: turnId, + loop_iter: loopIter, + query_source: querySource, + payload: { + messages_before: before.length, + messages_after: after.length, + message_types_before: countMessagesByType(before), + message_types_after: countMessagesByType(after), + estimated_tokens_before, + estimated_tokens_after, + tokens_saved: estimated_tokens_before - estimated_tokens_after, + attachments_before: countAttachments(before), + attachments_after: countAttachments(after), + tool_results_before: countToolResultBlocks(before), + tool_results_after: countToolResultBlocks(after), + snapshot_before_ref: snapshotBefore.snapshot_ref, + snapshot_after_ref: snapshotAfter.snapshot_ref, + 
...extraPayload, + }, + }) +} + export type QueryParams = { messages: Message[] systemPrompt: SystemPrompt @@ -334,6 +418,20 @@ async function* queryLoop( // Snapshot immutable env/statsig/session state once at entry. See QueryConfig // for what's included and why feature() gates are intentionally excluded. const config = buildQueryConfig() + await emitHarnessEvent({ + event: 'state.initialized', + component: 'query_loop', + query_source: querySource, + turn_id: 'turn-1', + loop_iter: 1, + payload: { + initial_message_count: state.messages.length, + initial_turn_count: state.turnCount, + streaming_tool_execution: config.gates.streamingToolExecution, + emit_tool_use_summaries: config.gates.emitToolUseSummaries, + is_subagent: Boolean(state.toolUseContext.agentId), + }, + }) // Fired once per user turn — the prompt is invariant across loop iterations, // so per-iteration firing would ask sideQuery the same question N times. @@ -344,6 +442,28 @@ async function* queryLoop( state.toolUseContext, ) + async function emitQueryTerminated( + reason: string, + extraPayload?: Record, + ): Promise { + await emitHarnessEvent({ + event: 'query.terminated', + component: 'query_loop', + query_source: querySource, + query_id: state.toolUseContext.queryTracking?.chainId ?? null, + turn_id: `turn-${state.turnCount}`, + loop_iter: state.turnCount, + subagent_id: state.toolUseContext.agentId ?? null, + subagent_type: state.toolUseContext.agentType ?? null, + payload: { + reason, + final_message_count: state.messages.length, + transition: state.transition?.reason ?? null, + ...extraPayload, + }, + }) + } + // eslint-disable-next-line no-constant-condition while (true) { // Destructure state at the top of each iteration. 
toolUseContext alone @@ -397,13 +517,67 @@ async function* queryLoop( const queryChainIdForAnalytics = queryTracking.chainId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + const turnId = `turn-${turnCount}` toolUseContext = { ...toolUseContext, queryTracking, } + if (queryTracking.depth === 0) { + await emitHarnessEvent({ + event: 'query.started', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + message_count: messages.length, + has_fallback_model: Boolean(fallbackModel), + max_turns: maxTurns ?? null, + task_budget_total: params.taskBudget?.total ?? null, + }, + }) + } + await emitHarnessEvent({ + event: 'query_tracking.assigned', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + depth: queryTracking.depth, + chain_id: queryTracking.chainId, + }, + }) + await emitHarnessEvent({ + event: 'turn.started', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + turn_count: turnCount, + transition: state.transition?.reason ?? 
null, + message_count: messages.length, + }, + }) let messagesForQuery = [...getMessagesAfterCompactBoundary(messages)] + await emitMessageStageEvent({ + event: 'messages.compact_boundary.applied', + component: 'query_loop', + before: messages, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) let tracking = autoCompactTracking @@ -417,6 +591,7 @@ async function* queryLoop( const persistReplacements = querySource.startsWith('agent:') || querySource.startsWith('repl_main_thread') + const beforeToolResultBudget = messagesForQuery messagesForQuery = await applyToolResultBudget( messagesForQuery, toolUseContext.contentReplacementState, @@ -433,6 +608,16 @@ async function* queryLoop( .map(t => t.name), ), ) + await emitMessageStageEvent({ + event: 'messages.tool_result_budget.applied', + component: 'query_loop', + before: beforeToolResultBudget, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) // Apply snip before microcompact (both may run — they are not mutually exclusive). 
// snipTokensFreed is plumbed to autocompact so its threshold check reflects @@ -441,17 +626,33 @@ async function* queryLoop( let snipTokensFreed = 0 if (feature('HISTORY_SNIP')) { queryCheckpoint('query_snip_start') + const beforeSnip = messagesForQuery const snipResult = snipModule!.snipCompactIfNeeded(messagesForQuery) messagesForQuery = snipResult.messages snipTokensFreed = snipResult.tokensFreed if (snipResult.boundaryMessage) { yield snipResult.boundaryMessage } + await emitMessageStageEvent({ + event: 'messages.history_snip.applied', + component: 'query_loop', + before: beforeSnip, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + extraPayload: { + tokens_freed: snipTokensFreed, + boundary_emitted: Boolean(snipResult.boundaryMessage), + }, + }) queryCheckpoint('query_snip_end') } // Apply microcompact before autocompact queryCheckpoint('query_microcompact_start') + const beforeMicrocompact = messagesForQuery const microcompactResult = await deps.microcompact( messagesForQuery, toolUseContext, @@ -464,6 +665,19 @@ async function* queryLoop( const pendingCacheEdits = feature('CACHED_MICROCOMPACT') ? microcompactResult.compactionInfo?.pendingCacheEdits : undefined + await emitMessageStageEvent({ + event: 'messages.microcompact.applied', + component: 'query_loop', + before: beforeMicrocompact, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + extraPayload: { + pending_cache_edits: Boolean(pendingCacheEdits), + }, + }) queryCheckpoint('query_microcompact_end') // Project the collapsed context view and maybe commit more collapses. @@ -479,12 +693,23 @@ async function* queryLoop( // continue site (query.ts:1192), and the next projectView() no-ops // because the archived messages are already gone from its input. 
if (feature('CONTEXT_COLLAPSE') && contextCollapse) { + const beforeCollapse = messagesForQuery const collapseResult = await contextCollapse.applyCollapsesIfNeeded( messagesForQuery, toolUseContext, querySource, ) messagesForQuery = collapseResult.messages + await emitMessageStageEvent({ + event: 'messages.context_collapse.applied', + component: 'query_loop', + before: beforeCollapse, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) } const fullSystemPrompt = asSystemPrompt( @@ -492,6 +717,20 @@ async function* queryLoop( ) queryCheckpoint('query_autocompact_start') + const beforeAutocompact = messagesForQuery + await emitHarnessEvent({ + event: 'messages.autoconpact.checked', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + message_count: messagesForQuery.length, + token_estimate: tokenCountWithEstimation(messagesForQuery), + snip_tokens_freed: snipTokensFreed, + }, + }) const { compactionResult, consecutiveFailures } = await deps.autocompact( messagesForQuery, toolUseContext, @@ -506,6 +745,19 @@ async function* queryLoop( tracking, snipTokensFreed, ) + await emitHarnessEvent({ + event: 'messages.autoconpact.completed', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + compacted: Boolean(compactionResult), + consecutive_failures: consecutiveFailures ?? 
0, + token_estimate_before: tokenCountWithEstimation(beforeAutocompact), + }, + }) queryCheckpoint('query_autocompact_end') if (compactionResult) { @@ -583,6 +835,20 @@ async function* queryLoop( } } + await emitMessageStageEvent({ + event: 'messages.preprocess.completed', + component: 'query_loop', + before: messages, + after: messagesForQuery, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + extraPayload: { + autocompact_applied: Boolean(compactionResult), + }, + }) + //TODO: no need to set toolUseContext.messages during set-up since it is updated here toolUseContext = { ...toolUseContext, @@ -684,6 +950,7 @@ async function* queryLoop( content: PROMPT_TOO_LONG_ERROR_MESSAGE, error: 'invalid_request', }) + await emitQueryTerminated('blocking_limit') return { reason: 'blocking_limit' } } } @@ -696,8 +963,81 @@ async function* queryLoop( attemptWithFallback = false try { let streamingFallbackOccured = false + let firstStreamChunkSeen = false queryCheckpoint('query_api_streaming_start') const requestMessages = prependUserContext(messagesForQuery, userContext) + await emitHarnessEvent({ + event: 'prompt.build.started', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + provider: getAPIProvider(), + model: currentModel, + tool_names_count: toolUseContext.options.tools.length, + }, + }) + const requestSnapshot = await storeHarnessSnapshot('request', { + provider: getAPIProvider(), + querySource, + model: currentModel, + systemPrompt: fullSystemPrompt, + messages: requestMessages, + thinkingConfig: toolUseContext.options.thinkingConfig, + toolNames: toolUseContext.options.tools.map(tool => tool.name), + }) + await emitHarnessEvent({ + event: 'prompt.snapshot.stored', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + 
request_snapshot_ref: requestSnapshot.snapshot_ref, + serialized_request_bytes: requestSnapshot.bytes, + }, + }) + await emitHarnessEvent({ + event: 'prompt.build.completed', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + provider: getAPIProvider(), + query_source: querySource, + model: currentModel, + system_prompt_segments_count: fullSystemPrompt.length, + system_prompt_chars: jsonStringify(fullSystemPrompt).length, + tool_names_count: toolUseContext.options.tools.length, + tool_names_chars: toolUseContext.options.tools + .map(tool => tool.name) + .join(',').length, + messages_chars_total: jsonStringify(requestMessages).length, + attachments_chars_total: jsonStringify( + requestMessages.filter(message => message.type === 'attachment'), + ).length, + serialized_request_bytes: requestSnapshot.bytes, + request_snapshot_ref: requestSnapshot.snapshot_ref, + }, + }) + await emitHarnessEvent({ + event: 'api.request.started', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + provider: getAPIProvider(), + model: currentModel, + request_snapshot_ref: requestSnapshot.snapshot_ref, + }, + }) logForDebugging( `[PromptDebug] full request snapshot before callModel: ${jsonStringify({ provider: getAPIProvider(), @@ -761,6 +1101,23 @@ async function* queryLoop( langfuseTrace: toolUseContext.langfuseTrace, }, })) { + if ( + !streamingFallbackOccured && + !firstStreamChunkSeen + ) { + firstStreamChunkSeen = true + await emitHarnessEvent({ + event: 'api.stream.first_chunk', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + chunk_type: message.type, + }, + }) + } // We won't use the tool_calls from the first attempt // We could.. 
but then we'd have to merge assistant messages // with different ids and double up on full the tool_results @@ -802,6 +1159,38 @@ async function* queryLoop( let yieldMessage: typeof message = message if (message.type === 'assistant') { const assistantMsg = message as AssistantMessage + const blocks = Array.isArray(assistantMsg.message?.content) + ? assistantMsg.message.content + : [] + for (const block of blocks) { + await emitHarnessEvent({ + event: 'assistant.block.received', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + request_id: asOptionalString(assistantMsg.requestId), + payload: { + block_type: block.type, + }, + }) + if (block.type === 'tool_use') { + await emitHarnessEvent({ + event: 'assistant.tool_use.detected', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + request_id: asOptionalString(assistantMsg.requestId), + tool_call_id: block.id, + payload: { + tool_name: block.name, + }, + }) + } + } const contentArr = Array.isArray(assistantMsg.message?.content) ? 
assistantMsg.message.content as unknown as Array<{ type: string; input?: unknown; name?: string; [key: string]: unknown }> : [] let clonedContent: typeof contentArr | undefined for (let i = 0; i < contentArr.length; i++) { @@ -920,6 +1309,28 @@ async function* queryLoop( } } queryCheckpoint('query_api_streaming_end') + const responseSnapshot = await storeHarnessSnapshot('response', { + querySource, + model: currentModel, + assistantMessages, + toolUseBlocks, + }) + const lastAssistantMessage = assistantMessages.at(-1) + await emitHarnessEvent({ + event: 'api.stream.completed', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + request_id: asOptionalString(lastAssistantMessage?.requestId), + payload: { + assistant_message_count: assistantMessages.length, + tool_use_count: toolUseBlocks.length, + response_snapshot_ref: responseSnapshot.snapshot_ref, + stop_reason: lastAssistantMessage?.message?.stop_reason ?? null, + }, + }) // Yield deferred microcompact boundary message using actual API-reported // token deletion count instead of client-side estimates. @@ -1032,6 +1443,9 @@ async function* queryLoop( yield createAssistantAPIErrorMessage({ content: error.message, }) + await emitQueryTerminated('image_error', { + error_message: error.message, + }) return { reason: 'image_error' } } @@ -1051,6 +1465,7 @@ async function* queryLoop( // To help track down bugs, log loudly for ants logAntError('Query error', error) + await emitQueryTerminated('model_error', { error_message: errorMessage }) return { reason: 'model_error', error } } @@ -1106,6 +1521,7 @@ async function* queryLoop( toolUse: false, }) } + await emitQueryTerminated('aborted_streaming') return { reason: 'aborted_streaming' } } @@ -1230,6 +1646,9 @@ async function* queryLoop( // → retry → error → … (the hook injects more tokens each cycle). yield lastMessage! 
void executeStopFailureHooks(lastMessage!, toolUseContext) + await emitQueryTerminated( + isWithheldMedia ? 'image_error' : 'prompt_too_long', + ) return { reason: isWithheldMedia ? 'image_error' : 'prompt_too_long' } } else if (feature('CONTEXT_COLLAPSE') && isWithheld413) { // reactiveCompact compiled out but contextCollapse withheld and @@ -1237,6 +1656,7 @@ async function* queryLoop( // early-return rationale — don't fall through to stop hooks. yield lastMessage void executeStopFailureHooks(lastMessage, toolUseContext) + await emitQueryTerminated('prompt_too_long') return { reason: 'prompt_too_long' } } @@ -1319,6 +1739,9 @@ async function* queryLoop( // error → hook blocking → retry → error → … if (lastMessage?.isApiErrorMessage) { void executeStopFailureHooks(lastMessage, toolUseContext) + await emitQueryTerminated('completed', { + last_message_api_error: true, + }) return { reason: 'completed' } } @@ -1334,6 +1757,7 @@ async function* queryLoop( ) if (stopHookResult.preventContinuation) { + await emitQueryTerminated('stop_hook_prevented') return { reason: 'stop_hook_prevented' } } @@ -1370,6 +1794,21 @@ async function* queryLoop( getCurrentTurnTokenBudget(), getTurnOutputTokens(), ) + await emitHarnessEvent({ + event: 'token_budget.decision', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + action: decision.action, + continuation_count: + 'continuationCount' in decision + ? 
decision.continuationCount + : null, + }, + }) if (decision.action === 'continue') { incrementBudgetContinuationCount() @@ -1412,6 +1851,7 @@ async function* queryLoop( } } + await emitQueryTerminated('completed') return { reason: 'completed' } } @@ -1434,6 +1874,18 @@ async function* queryLoop( queryDepth: queryTracking.depth, }) } + await emitHarnessEvent({ + event: 'tool.execution.mode.selected', + component: 'query_loop', + query_id: queryTracking.chainId, + turn_id: turnId, + loop_iter: turnCount, + query_source: querySource, + payload: { + mode: streamingToolExecutor ? 'streaming' : 'runTools', + tool_count: toolUseBlocks.length, + }, + }) const toolUpdates = streamingToolExecutor ? streamingToolExecutor.getRemainingResults() @@ -1570,11 +2022,13 @@ async function* queryLoop( turnCount: nextTurnCountOnAbort, }) } + await emitQueryTerminated('aborted_tools') return { reason: 'aborted_tools' } } // If a hook indicated to prevent continuation, stop here if (shouldPreventContinuation) { + await emitQueryTerminated('hook_stopped') return { reason: 'hook_stopped' } } @@ -1766,6 +2220,9 @@ async function* queryLoop( maxTurns, turnCount: nextTurnCount, }) + await emitQueryTerminated('max_turns', { + turn_count: nextTurnCount, + }) return { reason: 'max_turns', turnCount: nextTurnCount } } diff --git a/src/services/api/logging.ts b/src/services/api/logging.ts index 821ce688a7..ee7300a0b5 100644 --- a/src/services/api/logging.ts +++ b/src/services/api/logging.ts @@ -168,6 +168,50 @@ function getBuildAgeMinutes(): number | undefined { return Math.floor((Date.now() - buildTime) / 60000) } +function logAPIResponseSnapshot({ + model, + preNormalizedModel, + requestId, + stopReason, + usage, + didFallBackToNonStreaming, + querySource, + newMessages, +}: { + model: string + preNormalizedModel: string + requestId: string | null + stopReason: BetaStopReason | null + usage: NonNullableUsage + didFallBackToNonStreaming: boolean + querySource: string + newMessages?: 
AssistantMessage[] +}): void { + logForDebugging( + `[PromptDebug] full response snapshot after callModel: ${jsonStringify({ + model, + preNormalizedModel, + requestId, + stopReason, + usage, + didFallBackToNonStreaming, + querySource, + messages: + newMessages?.map(msg => ({ + type: msg.type, + uuid: msg.uuid, + timestamp: msg.timestamp, + requestId: msg.requestId ?? null, + parentToolUseId: msg.parent_tool_use_id ?? null, + advisorModel: msg.advisorModel ?? null, + research: msg.research, + message: msg.message, + })) ?? [], + })}`, + { level: 'info' }, + ) +} + export function logAPIQuery({ model, messagesLength, @@ -638,6 +682,17 @@ export function logAPISuccessAndDuration({ previousRequestId?: string | null betas?: string[] }): void { + logAPIResponseSnapshot({ + model, + preNormalizedModel, + requestId, + stopReason, + usage, + didFallBackToNonStreaming, + querySource, + newMessages, + }) + const gateway = detectGateway({ headers, baseUrl: process.env.ANTHROPIC_BASE_URL, diff --git a/src/utils/processUserInput/processUserInput.ts b/src/utils/processUserInput/processUserInput.ts index 94682aebfb..e3b487b732 100644 --- a/src/utils/processUserInput/processUserInput.ts +++ b/src/utils/processUserInput/processUserInput.ts @@ -6,6 +6,10 @@ import type { } from '@anthropic-ai/sdk/resources/messages.mjs' import { randomUUID } from 'crypto' import type { QuerySource } from 'src/constants/querySource.js' +import { + emitHarnessEvent, + storeHarnessSnapshot, +} from 'src/observability/harness.js' import { logEvent } from 'src/services/analytics/index.js' import { getContentText } from 'src/utils/messages.js' import { @@ -138,6 +142,28 @@ export async function processUserInput({ isMeta?: boolean skipAttachments?: boolean }): Promise { + const rawInputSnapshot = await storeHarnessSnapshot('input-raw', { + input, + preExpansionInput: preExpansionInput ?? null, + mode, + querySource: querySource ?? null, + isMeta: isMeta ?? false, + skipSlashCommands: skipSlashCommands ?? 
false, + skipAttachments: skipAttachments ?? false, + }) + await emitHarnessEvent({ + event: 'input.process.started', + component: 'process_user_input', + user_action_id: uuid ?? null, + query_source: querySource ?? null, + payload: { + mode, + has_string_input: typeof input === 'string', + input_chars: typeof input === 'string' ? input.length : null, + input_blocks: Array.isArray(input) ? input.length : null, + raw_input_snapshot_ref: rawInputSnapshot.snapshot_ref, + }, + }) const inputString = typeof input === 'string' ? input : null // Immediately show the user input prompt while we are still processing the input. // Skip for isMeta (system-generated prompts like scheduled tasks) — those @@ -172,6 +198,23 @@ export async function processUserInput({ queryCheckpoint('query_process_user_input_base_end') if (!result.shouldQuery) { + const blockedMessagesSnapshot = await storeHarnessSnapshot( + 'input-messages', + result.messages, + ) + await emitHarnessEvent({ + event: 'submit.blocked', + component: 'process_user_input', + user_action_id: uuid ?? null, + query_source: querySource ?? null, + payload: { + mode, + should_query: false, + result_text_chars: result.resultText?.length ?? null, + messages_count: result.messages.length, + messages_snapshot_ref: blockedMessagesSnapshot.snapshot_ref, + }, + }) return result } @@ -266,6 +309,39 @@ export async function processUserInput({ // Happy path: onQuery will clear userInputOnProcessing via startTransition // so it resolves in the same frame as deferredMessages (no flicker gap). // Error paths are handled by handlePromptSubmit's finally block. + const completedMessagesSnapshot = await storeHarnessSnapshot( + 'input-messages', + result.messages, + ) + const attachmentMessages = result.messages.filter( + message => message.type === 'attachment', + ) + await emitHarnessEvent({ + event: 'input.process.completed', + component: 'process_user_input', + user_action_id: uuid ?? null, + query_source: querySource ?? 
null, + payload: { + mode, + should_query: result.shouldQuery, + result_text_chars: result.resultText?.length ?? null, + final_messages_count: result.messages.length, + attachment_count: attachmentMessages.length, + slash_command_detected: + typeof input === 'string' && input.trimStart().startsWith('/'), + allowed_tools_count: result.allowedTools?.length ?? 0, + model_override: result.model ?? null, + raw_input_snapshot_ref: rawInputSnapshot.snapshot_ref, + messages_snapshot_ref: completedMessagesSnapshot.snapshot_ref, + query_params_summary: { + query_source: querySource ?? null, + message_count: result.messages.length, + allowed_tools_count: result.allowedTools?.length ?? 0, + model: result.model ?? null, + should_query: result.shouldQuery, + }, + }, + }) return result } From e7b8f7771e9edfe988555f643f1065345f24c91e Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Mon, 20 Apr 2026 00:56:22 +0800 Subject: [PATCH 03/26] =?UTF-8?q?feat=EF=BC=9A=E6=97=A5=E5=BF=97=E7=B3=BB?= =?UTF-8?q?=E7=BB=9FV1=EF=BC=88=E6=97=A5=E5=BF=97=E9=98=85=E8=AF=BB?= =?UTF-8?q?=E7=94=A8=E4=BE=8B=EF=BC=89=EF=BC=8C=E5=9F=8B=E7=82=B9=E5=BB=BA?= =?UTF-8?q?=E8=AE=BE=E5=AE=8C=E5=85=A8=EF=BC=8C=E4=B8=8ECC=E6=BA=90?= =?UTF-8?q?=E7=A0=81=E5=AF=B9=E7=85=A7=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...70\345\257\271\346\212\245\345\221\212.md" | 143 +++++++ ...1\346\217\220\347\244\272\350\257\215.pdf" | Bin 0 -> 126451 bytes ...\273\266Schema\346\226\207\346\241\243.md" | 284 ++++++++++++++ ...5\205\245Token\345\210\206\346\236\220.md" | 358 ++++++++++++++++++ ...05\350\257\273\346\225\231\345\255\246.md" | 340 +++++++++++++++++ src/query.ts | 332 +++++++++++++++- src/query/stopHooks.ts | 84 ++++ src/services/tools/toolExecution.ts | 87 +++++ src/services/tools/toolOrchestration.ts | 52 +++ src/utils/forkedAgent.ts | 55 +++ 10 files changed, 1733 insertions(+), 2 deletions(-) create mode 100644 
"ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" create mode 100644 "ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" create mode 100644 "ObservrityTask/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" create mode 100644 "ObservrityTask/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" create mode 100644 "ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" diff --git "a/ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" "b/ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" new file mode 100644 index 0000000000..73da5634af --- /dev/null +++ "b/ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" @@ -0,0 +1,143 @@ +# PDF 主链核对报告 + +本文是基于当前源码的第一版主链核对报告。 + +核对原则: + +- 以当前项目源码为实现真相 +- 以 PDF/任务书为理论蓝图与检查清单 +- 对无法从当前源码证明的能力标为 `uncertain` +- 对存在但被 gate / stub / no-op 处理的节点标为 `disabled` 或 `rewritten` + +状态含义: + +- `present`:存在且主语义仍然成立 +- `disabled`:代码在,但默认不生效或被 gate/stub 封住 +- `rewritten`:入口仍在,但内部语义已和蓝图有明显差异 +- `deleted`:当前源码中找不到 +- `uncertain`:需要更多 PDF 正文证据或运行证据确认 + +--- + +## 核对表 + +| PDF 节点 | 当前文件 / 位置 | 当前状态 | 证据 | 处理建议 | +| --- | --- | --- | --- | --- | +| `QueryEngine.submitMessage` | `src/QueryEngine.ts` | `present` | `QueryEngine` 持有会话级状态;`submitMessage()` 负责输入处理、写 transcript、触发 `query()` | 作为非交互/SDK 提交主入口接入统一埋点 | +| `processUserInput` | `src/utils/processUserInput/processUserInput.ts` | `present` | 负责 slash command、附件、图片、文本 prompt 归一化 | 已接入输入层埋点 | +| `query` | `src/query.ts` | `present` | `query()` 为导出的 AsyncGenerator,委托给 `queryLoop()` | 作为 query 生命周期起点接入 | +| `queryLoop` | `src/query.ts` | `present` | `while(true)` 主循环,维护 `State` 并处理 request/tool/recovery | 作为核心主编排器埋点主战场 | +| `State` | `src/query.ts` | `present` | 本地 `type State` 持有 
messages、toolUseContext、turnCount、transition 等 | 已补 state snapshot/transition 埋点 | +| `getMessagesAfterCompactBoundary` | `src/utils/messages.ts` | `present` | 按 compact boundary 切片,并在 `HISTORY_SNIP` 下投影 snipped view | 已接入预处理链埋点 | +| `applyToolResultBudget` | `src/utils/toolResultStorage.ts` | `present` | 对过大 tool_result 做持久化/替换,query loop 中显式调用 | 已接入预处理链埋点 | +| `HISTORY_SNIP` | `src/query.ts` + `src/utils/messages.ts` | `present` | `feature('HISTORY_SNIP')` 下执行 `snipCompactIfNeeded()` 与 snip 投影 | 属于 feature-gated present,需要在报告中明确受 gate 控制 | +| `microcompact` | `src/services/compact/microCompact.ts` | `present` | query loop 中通过 `deps.microcompact()` 调用 | 已接入预处理链埋点 | +| `contextCollapse` | `src/services/contextCollapse/index.ts` | `disabled` | 当前文件为自动生成 stub;`isContextCollapseEnabled()` 硬编码返回 `false` | 视为已定义但默认关闭,不应按 PDF 的完整能力强套 | +| `autocompact` | `src/services/compact/autoCompact.ts` | `present` | `autoCompactIfNeeded()`、阈值判断、circuit breaker、querySource 保护均存在 | 已接入 checked/completed 埋点 | +| `callModel` | `src/query.ts` + `src/services/api/claude.ts` | `present` | `deps.callModel()` 驱动流式 API 调用,query loop 中处理 yielded message | 已接入 request/build/stream 事件 | +| `StreamingToolExecutor` | `src/services/tools/StreamingToolExecutor.ts` | `present` | 流式期间并发执行工具,支持 queued/executing/completed/yielded | 已接入 mode 选择;后续继续补 streaming executor 内部更细颗粒事件 | +| `runTools` | `src/services/tools/toolOrchestration.ts` | `present` | 串/并行分批执行工具,支持 context modifier 合并 | 已接入 batch/mode/context 事件 | +| `handleStopHooks` | `src/query/stopHooks.ts` | `present` | 主线程/子 agent 结束后执行 stop hooks、teammate hooks、background bookkeeping | 已接入 started/completed 事件 | +| prompt-too-long recover | `src/query.ts` | `present` | 先尝试 collapse drain,再尝试 reactive compact,最后才终止 | 需要继续细化专门 recovery 事件 | +| max_output_tokens recover | `src/query.ts` | `present` | 先 8k→64k 提升,再 meta-message 续写恢复,带次数上限 | 需要继续细化专门 recovery 事件 | +| token budget continuation | `src/query.ts` + `src/query/tokenBudget.ts` | `present` | 
达阈值后可注入 nudge message 继续下一轮 | 已接入 `token_budget.decision` | +| subagent 触发链 | `src/utils/forkedAgent.ts` + `extractMemories` + `SessionMemory` + `awaySummary` | `present` | forked agent 基础设施存在;`extract_memories`、`session_memory`、`away_summary` 均有真实调用点 | 已接入子 agent 生命周期基础事件 | + +--- + +## 重点发现 + +### 1. 主链与任务书描述总体一致 + +当前代码确实存在: + +- 提交层 +- 输入归一化 +- `query/queryLoop` +- 预处理链 +- API 流式调用 +- 工具调度 +- 恢复链 +- stop hooks +- subagent/forked agent + +这意味着任务书所要求的统一埋点体系可以直接落在真实运行链路上,而不是靠推测拼装。 + +### 2. `contextCollapse` 不是“完整实现”,而是明确 stub + +这是当前最需要持续警惕的节点。 + +证据: + +- `src/services/contextCollapse/index.ts` +- `isContextCollapseEnabled()` 返回 `false` +- `applyCollapsesIfNeeded()` 返回原消息 +- `recoverFromOverflow()` 返回 `committed: 0` + +因此这个节点应标为 `disabled`,不能假设 PDF 中描述的 collapse 语义在当前项目里真实生效。 + +### 3. `HISTORY_SNIP` 仍然存在,但受 gate 控制 + +这类节点不是 `deleted`,也不是完全 `rewritten`,更准确的是: + +- 结构存在 +- 代码路径存在 +- 是否实际生效取决于 feature gate / build 形态 + +### 4. subagent 链路是真实能力,不是伪实现 + +当前源码可以证明: + +- `runForkedAgent()` 真的调用 `query()` +- 会积累 usage +- 可写 sidechain transcript +- `extract_memories` / `session_memory` 会以 forked 模式发起自己的 prompt 与工具调用 + +这部分必须纳入统一观测模型。 + +--- + +## 当前处理建议 + +### 立即按真实链路埋点 + +优先级最高的真实链路: + +1. `submitMessage` +2. `processUserInput` +3. `query/queryLoop` +4. preprocess +5. prompt build +6. API streaming +7. tools +8. stop hooks +9. subagent +10. 
termination + +### 对 stub / gate 节点做显式状态化 + +不要删定义,要明确标注: + +- `disabled` +- `present_but_gated` +- `rewritten` + +### 后续继续补证据 + +本报告仍需补强: + +- PDF 正文页级证据 +- 运行时样例日志 +- `StreamingToolExecutor` 内部更细粒度状态 +- recovery 专项事件与状态说明 + +--- + +## 当前结论 + +就当前源码而言: + +- 主编排器、工具调度器、恢复链、forked subagent 都真实存在 +- `contextCollapse` 当前是 disabled/stub +- `HISTORY_SNIP`、`autocompact`、`microcompact`、`toolResultBudget` 都存在 +- 统一埋点应围绕当前真实主链实现,而不是把 PDF 描述硬覆盖到所有节点 diff --git "a/ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" "b/ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" new file mode 100644 index 0000000000000000000000000000000000000000..279e5343c7abd15a1ed6ffec1200472b9933d523 GIT binary patch literal 126451 zcmd3Nby$_%*CySa(%p5=Ar2tjNH9GfcYXeOIct;5 z=YC^#qW1K3(3f=Y@Ku0(kdmq1%}J-*=Hcx(2?gb~ed{giOB*NPeR#J;hpO_4o_|8- z%-Jfax0nq*dw9>7_F}`VDsHf5UX9-S>Ug))tfz&?`?5FiHl!$x{bd-NVAjovg4uzu zn!NU_Z-qp^L>3`d;S??L3lUH3++Col0TBGd(06B_4}Gt1E+^K|eQ(aMeSI#seSI?o zvh_yHK71hf^$lEHoNRW22iqk6VNgr=-VokGAA{y3hYjBmf{Yr0U!3gw=zDSj%y!t{ zUq!jS=9@f3lq)90*wS+dXp=MN_&8oV&0)QlHy!zLK7nnzcWjs;3uebzlO~WqUK^W> zwlj;q{N{dZTG^zQ6>*)fOJ?MlT3D|UhL|ddmhWv% z7cnvAEe50yJxc6oOtPdPjHoCMBNhflfFTs+7C&b8SQrt{`_!}h0=D}ux)2-@*~`n3 zc%yj5O(1G$njR@ps<}JKguQb2Mnzfz;IQZ}sH%#HrBFx{<2+@A4H#blWAMYQNM4ji96drr2UHUCd(BOk%m<}(Ug6?S=4Www7%I^%RdkQE=+9NW`6^fO+ zAuL59zRL-THlthSk92w!XX2eqSa|~Y8=-zgzliZrm6Jp2{gnIcAlfW%Ze*#{sd7qq zdl}>C&0mPB5mL1}Yi>#JDTIlOSW~{yR5O|5zKM>7-mFdGB2$LbAZKN0@k=+wB@86W zn+fgSE6<3=YX@Zw>!*m<7H1OK)3Z9Q77TdFr={cjuX`W8i3{irOZ=elT4vsmXI*IAPx{H!Yc9a{@vcT(pJ0lmr zL*sJ_Lob$cip>VUADfrUsI+5fwgO%egOU->yvqSI_8?wU-;L5MUlUBm?-oH%d|=qj-8n*R(tkF=iI0zbzlV{x^%b|-p>|L zAaAGS$9g~-3J)=YU!-bKRfB+O^xy>w)qtOwM|&c(XNsvVFDPRN*0O#J2d+^WobmGu zMWV!KRd!TT&_=6fC+=nW6^Qld~#zSL;)9xE1vI@*Ht zj9?3iQVxb*wC3hIStKW0Oe@UP-YqnJmFw?R!t3Q<4|N?KQiTxJhR_hLt&su?q#-It zGftR-@LJ&ecGLXHP1EEB2muJH+7TNRk9hAd$ycL&y!4lQ z(>M;Fl}O$Zcqls53uUB7x4>~{Qg$lIH}HKev%E|wx^&6csMX6z|5>o6PY4-zoSB-9 
z8N>@QT)8LMQ=h1Di&0Me)qBS2`wwT`7p!kCN)TsMn>bl2WZlTxJp6Q88`A|3@rlIA zArv8gVRLf|9GJ2k+w&g(bU??O??;BoZhQfww81F=!iodE&i$^tM9TdNJGZM_s`t&N z4MR%flO!R$0Tu~b zeTevu!i6|9d_&?~ggC!#y=(^f3un0cBCdS8fQ3Bp#bUL2YaZuL({0saI=xwbDtrKm zS7^nUKd~hfW(A^LR&Ekx$6rtyL!z^+7Fi}y78*AcSbBt|yc#W2)WJOZ{nP}!0LTxZ;O72#n6aOi^OvD_H=SlpfA_rQK zw!VB%wsnduKZpW0-?DwEOn|^8QlSVPySgCHdgUXh^CzRCO%le?1lMxEz`eupk9^*O z`blhGyy0NAT7ZW=L`1b8(lPY$<&+4pS zgrYD2c6ns$4QgHF|PH-=?aMb26FTIkMAUmRL)lUk4*jl5eJ~f?y+Fgp4|> z*|{L>D91|N*|m&@i{!1BAQ6&lGzEn_>KW^Yl8DdkyPl}Jq9l@6uR`I~MtxBES}lCz zo1sS_hy$4sa<-tic=5;2gElmmk7@e14wnUuTjN6@y(~x?;|CO%ugorDy}|HLI;WvJH0RR!4noGYaCq=6cHqoul{N zg;~e`I9S%^A}{->{sAH^U8mHM3*dTw%?ymBZ5{dXDM&llZuPvq64D;}_t(H10XqU8 zewds^am+zsB^|9Tja_YL^TVjdU)?TB*x_%X2|fG$!<4#n4nl;cEtyNDK?-)Ikg@`n`=AS zZip#3c8PJskh8j2j$r7ompjntwP~xVcpLBBP_+upKV;{`ntfPt4C)x`Z~5VZ;dN*s zWD@1QZM1Pl5C2;{P>d^wkjW+EqhqtM+Sky{>Od05D-6OIVT8J$a2qYICN5_R0b6*V zDvefU^-5PuUMW_&*CUKtL78LC!KR^JLL60Dlh7BJw4&iO6?MecxgSXoWO$(#F51w4 zEAFx|^vTaF=c>>3Hewc5m% zhq3^Vl^HCuZ2Y@P6!}hcIu&Z4{j3~NC$Q)$l|dU8%(u}G*JZ&=i|LCy5tdrj=dQkI z=V}%0+7gybamaIQRlp^!TBex~jS4^_&at+d;!yS>lXE^fNMcoj4;!0-!_MnB>txkf zg3b88RlTbHT|tl)L27@&ux359{H%&ibUlFSs#(9eo83eXEm-N)idUbC`_ zwA@zVlOI$E=xh5yn{iPVFFWfO_rc_Hhopq_>P~c3%{gvn8!atW9uqJ-HI+v8wJc7R zY%SRt-?3!hAC6E{YpG(3Z$P|6txL-{?}Z=@(Q=?%bE*gl_5KD>`j(6c(^tJyuH3}~ zhfCliQTgEMTXYDFL2wPR3z>S&4W0;s4@C;W5i^+GwyF(Yyvx-M(ffDW**2lMEkvRp zxq;qHVtJO7UHnKoM1+^{RYfi5Z*j7C_Rlpl*(BqOsG;9yWs*wG@ZRV5njfV(kA9=S zP!sz~mL64~yTfXGw)4pzPP;GDH6D_nlx~ChV}E;{q(btBH%de|zURp0&5Zs+5AXUB zu(%6-!9G;0+6N}*Hhq$5?|_+Y8hzjXLx=UQXV+r*s10Q>$#Pgt`0eZ)rx>(-W;e2| zofanVXti^W@m*?!OtD3_=*)&?%i~7#t_daS_%jBrxAmv4s>jDIJrpXe?7@G;m_AJNUlJ-=(9hoEvoNs_l@ig1*N zdCB7IsiLOqykdM9lUqZ@LJk;Cg9%c3DuQKnJ&1Xb_12d+W;i2IR6NiVjL8ZNIS zf(ODIt{&Z<1|-U4(hobiSyN^^L6|t@EQYwbnY1xXn^3GT^?<$co0n~azn&G^x2)li za0GFc%cY#H55}h3_#9FPg(IuJ>TjLR=0YjLKxQL*;l6;fIn_Bb@J75@fOe96rP!Q# zlAhkmYl$8Du+_uZ2O(vgxye01P>es4fS%u5c{{a@3@P<{j^DXjZxJE1NNQRM>Wr5kA#@gYZx2LTpmyFss<0?{%LQtg 
zv#^xxm>@V}X&Hs5RdxB;iv(edK?owQKKE@Sbv*`H_hQk#H8*glmXXlL)PJAim)F>v z6w&wLB58!SbP~Y%9@qTg_sZ~DsVreb49b1NDd!r#;~tCR8)%(1U&$aMBKzU2q%oou zwZL?&k;GnZkX%Psy=-A}WH6-PN|m;P@@?3jgKwaewS>+Y+&P*-a&_k2YQ zjym*7_*C#w8J7ImCl$7T(yz@kRsejpQsDnR{B>~*e1V(*H?qD?I|{W{Q-W-#@aVT4 zAuN2qI==|2aIVp@OIbP6R;9k3H0udLcC7wrM-8pU$-=^;zz^z;C!!z9U&7o0VaxYr%X<-@hz+|2+cFzeQQ*G)GO`Maf)lHtl4jsDNtZKQme5wMcCcK)F52UlQ26Za+; zL}LLS&H^7wgy<(iQZ-)pQ|%Yr<^yH>)42zfiZa;bYsgPdkOw!hVhL`k>$=i-9 zxyBgu^UcFzc?svaB#x>(qXUbvSZ+X4}!|`*e)sVMzgb=6e0NJ%^ zY*B?JG5qo)sUJY4g-Lfm#W7gATTN@Q;JBSOH*5>j@* z;+mA3TF)$v!0_*N5zEp{Gx8ksF3IC1H7_pTF;XWrInBp)S*ygSI_~{e@Wa<1KHL_5-bwA_MvZzCVY`&h|7i4D-v5d=oJU5Ch@r3XI|~!LFC9a zb~eX4+UH@HGG677xvW>e7JA$xoJQeZPrMURQ-j`dw1A1b{b?g6n38Uq8|Cs!pgDs~ z#`Wbl)vP_;Ugn_>(gdxlebFyFuF+x1K#4Qx@X59ePx*=KIer3y*#GS(wJ2e+q8snn`6B3s8gv*isq$)8!V>F6uT@Jw z;nDc2mA+diC7tF4f<-!04|m7ZNjpJ}5xRTNs>C7!JJHbQ;>%;?89*~5udzqfnNsG( z;@mnDox>fF@F@V`=K$aY10MlUm2or%A}qA3*!!!de%?PhRV~HuzpbTOJ++HhzY&6? z_+*4a@I$2~sd(=-Nat?5+NMuqE^gUsAC^1x6GRGv7W9!*@b~gmzCi}TneRmQI6r>X zd-XGBtjl-tU;30^ZA*UI2JHpOB0XY_0g+ z!oQN9vj3emRf(=yir!ZMP3up(vyfq>ov5gN%fCp1vTEx;-FiFfRSQ-Q;V?K+ig_RVOxv)f#7eU>8_{XCm>t2cXJz zxM{%zu{PNj=ji$hJ#6A;;QtIjAmCX62m*2bI{Dlq5*jzLf>@XOsk60#wXlzYe9Hy?c#MTYk)E>sS&w`-`SC>2oWDWo|0* z60r%tZs4Nx%&PbZLZyE3Fzs^penLuXS1%sMPTokX&HLIq7@HVQIeJw!&YbxgOsTW% zTI^2q77k>W`U2xc0X3S&Y*R*svl1iq&B=oIdW`G6B&%zo0qCg%yYNuKw)juEiSs%8 z^#9*P0|Y!PQ-U}iCmJOl+b#gg-tX{BKS*7y?sIAcW$t@B_OBH^wuPA8JW?Ot39VpD zmWy~$ATRT}PU>6{@%|3^iK;=nI`s?gFl5=ll-uhKHaIs}G*QT6T!86_la_!zM}%Ld z$$FpYO5>|;bd8!$|ZGThMi4X&2_KTWjLw5twR z8*4O|SNCGs=f>-aSg|s|Q~*DW0JZ=-vLlH7PnY&rP!2r6Z2!9B6=z-JX= zAb|Vd+k-Po7zrAPAby_pLytVh&U*5s07Xaku+`>=B8J!2&+QlfQL10{g`5Jm-Bp~8 zS2ZlRNNUX$%>CF$&As$vG?mSTGi1|~HY{7debnA)4c_$Ruaa+R(D20xmFf)JyAu4e zPL+HEVc9~K_=Lsm&$5{Nk-=)i^{8zC1lZ9m2F*6C7r!bLa0ny--?qO zKVcllvy20R!2foq_%|7}PI45O(Po8JWb(^U{}V-VozZJM=#&B`L1y+fMyWorlyJsP z%acn3O`;Am`w5=jENMao#syKP&|N*{XupJd!%#Uk-1Dhd8kYWOc5B>f&@D=N2gP8I 
zawoPj0S*RXD-Z?VyB0Ykah0eSG>E6Zr#tm4aAq^C@16j``LFixf8WmqgZ_63HHr_$Z~)XsIcqso0bBqj00&r63b+mx=ATlp z4QMAu7#xM_193>2Zl|TU6)2?$VLR2Wg#2v#Hncjh@@fqVqO1e|Dc^AYX@vi$HU8iG z1_*vc$oQjPqCM;vSWo$OSmxAho8dM3R~7ZXM1V$%p?9G-rlTEE!?Q)`UxBeOdN7m; zr?AzrG15+D{So$^!c$>8LEo(NG}&g`e5k76zmS*Bn)4BvOYd!Vd<1-CbEii8JrpvP z;URz)nVy161T1>Bwb(!;K#l{Zn&Ke}v}$9^I?+~fw1<1QMD z(F7=_6CV3A<_dY9Q(Dw}T$YAr+G;JWy*F!uYr2&!b&>zcf*r^lkT40? zva~v1&zGaV4l6Xqnq6F7ro1jJRjp;!m2+Pb&`!Y`?vSq{4el*T?@O6YgJvkVJyC;Z ziF7xTqGV9~WNXPFHAA-vgq8D~KD->K^do`7gk^*whH*yF&T?LGBxG*-QD&j%9Bj}| zc|by9k}a|O?RC)Yy^0gHb@=zu%}%>Z$YnHX{U^Zvdz$0_7P375-a`B_GFZ?Q2x5Q0 z$tK?X=94b0lsagYqmozW6HW{1UXs-N&Q`Uhg7}$^lUm}QD=pw+?kKKM2rp-Tix?SW zELgHVzn_oC1nM2$J3x3CgK5{jyV$qrl4yF!8J7sSZyjHh@`zq>0J{e~x@A=DRJo&h ztoYH{+&PMs%tOW`>>C_8bnOX~cs^15Og8_X$v0Awa?W;$V=;;=J}@lyeW*$twr<2z zrNOIK;~G-hUE<&=;5^S-RnX&7F(QN13HXzY<#Ul=oYs1iCXb81y|*!;@ptA#H%4_- zl$Sa0g;pF2a! zoYt81-m0t^;aKOgMIIt4j_Q1kA%fjbx=6!XyT~)Q$O6AY$XHSt=bu9Hyx0Uk+J2nT z^{CJQ6lhO{Tv(K?-|W$WdSM6cu-Nls%X6={pgCaGwjoIHXZMM+e9n%xM{%quxSalST{;<2|`;f;ver<Q+#bp}jzO2rS`#B8#=}l+l5JGI0AzMMcM1J`j6I=Ta*YJ@q zOw_w>_{m5J&Ud28E;>+sJIC1#o`A;wtojOk9GCnLhkB8RgnFJ~$odPL-f)ADvnTR= zdES2~Jh^SiXb2u>!XcO`1a9bv1E`>#6&^kNUc&Zs`i8$0GXo0eB<@Q_X-&hoZ$J8y zIk=XaYfK>y-gnR(Rvrq@H|H|r@vq&TY+9E^&Uti(#GVQo#!@3 zicKZq{)vUmx{j0Nh-ufdUS~nS{>`_L}S2+y0?VZ@Lrt>monmyQXJ@B*tupuOW%?CDAK@33t#2kuT1UUeLlqO2c1)!p`kdGt4%_g9fbFf0t z{=~C+!W7`YJouj(Kj6QT5D4@qasAgnV<6HKe**Np4E>l*sl`;F{-qX0`u=SG=tO+c z`Sjf*cQ$SE;i`NNTyCSDW~+E!JZ(A&xHpQHMic26%{d8!Ym*`U5nO2q>0e>&cqGu6 zEnh5Wk>?h9@Pi9VfR}{ZUh$Dfp@~0e6E8daXp2;_`5mmC72fx4D-pDnUHglF@ffv4 zl0{d|6DO*cgJTr+?kZ2U?4q4I9sAy|qEt0coA`xesl@~1gm>UvDnj?P06m_LU;YWm z?9WT*?2opMc$CbaZS%1C^H}veB*jK1;)j{{uI=DCGhZty{tEAb49N{5Ae^xl5COr6 zOZJME`YQ(mrTNcy0OspMq6S0s-(Ip-HZ3~FS?JZYYY+R23&up0&I*tC4ivEQ)DHP< z-Nay=pG*!+Y0RHV>`}6ODCIP%J{y33)0gf;I?eF=DIm{N47eXjxZ=^maoyy9Xhi^W z0TPVZNc;yq-`M(lS3+qJQ&yyFZD(LgP->K7Aq7bMm7*DSmO4 zjcU$IK8^(Ig31?zL?m$KBB9bT@CS1VP;Q;gkw%*fC6tU+E7o@PyS6UXcJYPw+X3N3 
zy;3?JeCCWS*=$BXZcW7K>XVK`R=7xKQ@EV~xo+d*NL5`0=fB+uXEQ|Y zis^UzHyYd+P9@|pNdL}gQrUD19TTMsJ7n{0enBNiVdJIg*H?aK#im85 z4gQ@BNSxMDmPn-OZ0#V}r)$C%r_OxB=D+jI|L#zbpa5d{}eCP4*7AK!~W78 zirae^itQ{uIAl>f%lQwz#n}T$;$v(u2h!0PvgDl$NE{?_XWZ$-vvlGRE+2mvijFQ& zx`*w$FukvTlTV*<{CNrHB>9z>$l%U-B1NtCF*IK}dF4|UKaUSRCJmeqc>|gPMf99U zoYhav2WFUkswCpvj=YG$jMs6DWYsq&|l^V0tMzgqXYj1;(c3& zSss1F`rU#vv>kf_ifFZ*>8oTZ57QuHy*yUEd28D%tK9guOtZ79os8p}NVxY9b?NS_ zq-4nH@Me8>$epgIarE#xH54iiwrej+etUp4=lpB8>#CKB12U~l4I*$ZN>CvL-jHYZxi`FjJ-g21f>IHAIq`b&vz@mIBXisTP%x#^3f)Wsb zx+OT%zIAF&()U6Y+sHsaJ|D5Iw;#fFK1gqd6sLi?OTF<6?{6`p_g%gQ0Chf#6|0<( zinK*Z%B^)Tv@;3;N4~#xN5M2EC11R6#r3|*g^1)Dk$|r zlyFG(+963jLcH*@yM*Z}kf}vIMSZd{%wj}AaOu!$!Cf3cP#{5IR}j6QRyK4sIF#wH z)O3?a690s0oX-z4?gEG#eR)<(i+CQc$Cq*>6>34R~#QWnZM^-cH3O=RVl_WovTrDzy&q zW9gQR+fKIWR~NkJJXz}#_HsRo3j#SGRUo-&;KLCK1=_7ygbbkHpJd!hCq9~cd;jJY zYdGsF4XvC+XPr}eyjGX~{s1q$U{DBtv-B_7IL_BUN|^K#Mp5Y4=GUU_H5nV+UGSDa zsXN9db1Bh}hr10P)(g2%&90~=WRD+)M`Te$3xfxxf#{n{oAe9_srIBCtBxCbY9Pq; zYwj05vW1B<10M(ClJAkKZi~?4?ZIVF0ObA`!~DM}7o3l>1%DJ64O;l&==%UrQ^=vy zbC|o1`@WSEJew%tk7MMFH@zOWO1wDMoB+VMiz&?$DB!Ov;miqx{`e&oDx@md9*Er1 zM+yE>zWMt6JCxq56z%oh5mARyL$0<_NJ71l>kPPMVuZ`?S!;?}OJEOu*u70eQiZNeqT!d zRksM3v~>6lp-B_$YxHyBtYRs&>cahXp@xI^q0Evr-~0yP;Of}=mC@u@hD^RwL9>zG zeD5I8dcru4XO`zja)sK1Tp>z#3G(vrlV-D5C>MQA(l zO1E||j7x%kq4G&g*IQ1Ep{F4q>sv1#d%$p+b;5u8WFNq-mQCv#W-@oBGt<4NQQ*3+#pM$7b~<&LpSEcw1l=;3`6+1v{H%}!f*+^AXlW#v zzwMw7WCVu+`z*^e-GYNw8#zW;NHWID5k^?#2FiM{vjsiqSEv3P{3ZJ|C(L?_ocdOC z+Q!T-CV{i?r#l5@j3{EXz3C(uN&B*et@b; zJn}w|UUZO&atk*;v3Q_o6?Gu+5o($!VXOx#|CbGFiYq#JjmX;U8=T+IAi9S|!o2$| zBWE@Pr?-RF&E-vQg?9@poTl6C!i=hA$IuYKB{=b7t&Qc2ONvyHDjGdZyN2Xt3Miz$OZ8_i>f64aRpkjMbLV5Ru|7w2&Ci&fEBmA%yn+0D8P8r(gRgCf4=E#9El35nPAYvNI?mOeap{7>9Ff>~ zRA`rrLd);x0KN@}@sf&p@cAID#ySh}{g$v^ct{vXvpz=pU)_bZKvVo_fzO>RrZ7@yvg~z=;wSC2b}+Cj19IlWj5OO~+(gFX{aojWhoYnr$ z6I~+-==V2qfh?^SUQEiS-AgWqdOre_{aYK?HpW z)c^NE?Xg@UH~hDG0W->9yoIb8OwQ|+RiOMj<)NcWC0}{WkUc%rh9N;v3Q%h0XPUC4 
z2ceL~i7LvX#3xM6F{wiR^s5OU_dXV$w;sEmSK}(lLfc2&+(Pczxs*vgE$SQYqxEjp zBewc@PeE{>$f`yB#?#Ib?T}rDX(KxIE{_dL7{vF4=Q*B5E_oi+2RZrvKh^8X6NwF@ zUn9dfyL5+Y@m}j^+!YkGL>RDXLB`Z~i8$F3y=2C+y^WR)l~a=R%ek8QX^n@UW|4 z@y)|pk34t@zfsoTbHOeWUH7w=1aPOPcuiD0h`k6Ki)J>myV4?U%=Inz&eD<}ZHCkO z8SID*!L))Pfa}dxTd-U>+-4k?3Goh|5~-nGm5#Fn6pvyHeO?&{zX+ududWFDwo?~< zDUZa<&8MAsXoP&C021wYOQLI4%1e=>sdZB7xp-nxgcon#dTOE5Ozjg8;MVi&f51SO z`VE1O1ab^z0Y%kU1WIdC`{Qp1pZUPR(u{L} zsdwCu+R^+mf7`zgz0h`8bx=z7NR?88q0>FriUMPF_b)gEbnCwG1Fr+V=U~QL84cA2{DeWTJBr0i z9c@fF<@iFFMjI4o9Cx#dSVgK*7-4~*&fFNnA~uY7>*v{H+fOAAC;5?9yd1L+^|iB^ zJqs6miZ}i)hDmW?o&QB^Z~b;v^kp0T(l6*?H)0mB;%P;3Es?j4NVTEIsIsrCpyIIb!8 zc;qkO8dB{yCMx_+u>*Y6S%yv!@`N4{ysWlJAKFamgvN65KUwZzk!%y+(-jT78pNJn zMWDqlf6y5wOAZq#GxN906=%ch?WP`;r)>5Q83VTb4-R1Fic9u@GfYkRxensmx*P*@6LuLdV%));y=IR@eADMJMlc zcehc6B02~`8?D6(*=P(i6!9RJ1O!Rz-!Cn2JDT~|u&_pc;gVdA&tZ*^f%`YB-iIo> zpIybbZ0T}sTMHDvorp3EC+21OU!7gO z9j+W^)9mw40da=7+ggG>4i8*OK%3BTcL(s2kxd>|eNB)iy=q%d;3EZAoPn0Z)Ji(n z9F9-ylpqdtNI!b1Wj?*$#2hpOO10X{C5{_o4irNvE=|@+E(sLU7V-fxJ<_^<8T612 z<2Uv&+Fd(_NB}BN7egt7RyVt4t3h=A#=iDD@eUWsg_CsWw%PgR%a+X|I{q!_aP5~{ zPb?buvm^!}_|c+?#|Zyvc*7jF1fXluK-0I-eNaV9zt@A_p)KBr-4ky1Cp1vx#3k(f zao8Um3h#x)hpZ7htP11-nwEBbaYs*i1d1HVKGj7VRz7Fi;gOpE_%)_W`k1!m#;i%4bh%I}!IB8cg%Le^nj-XFw^P7Q< z6}b;6CMyxL&jREMN4QMz{tSQlE7=L-;W4A$38T3A6Tms1m7TaA=>hDVK;^%)GXK$5 zI62_{m$ri8PYfutvPCo|c#~J@*Wg&U0=F*YvT1K(U+4e{!3&4^;R!2%&(eecO;g=7 z_y^lj{T`CUSYz=osMsClLIbiO?t*JQ;RMI)%HNjgcn87LjV1PPA|gY{6X&3h?d0hR zzws}fe!vi?J%yt<^)1WVbi5#^1M zhj3lnCG@}xa+eolb5Z*nJL@ScaEWCX&Lv|5gvVb3 z+PIRLn&y2I$jZA7DFR9C^ptte$}rrIVwL|}BWl~9fw3hOcS!0R^lLFp*tyn7Y4XL1 zOL;E#PxSD3Rv{Ap!IMYSDRb+nc8q!40+x~MOU$vw+#Dm{;|8P=FSl**gN(~kye)hP zM^^*jkNb#rpRkVWd0UC|(Ja`XE_@&bLeQU9wLYc7?TFEY^VpKFmLm9<2~5w?T}~&# zH}5vuKwLK3eWg<>5-pG#M-u-S-EGf@WKl9TjIF=p5A`3yy5$>J$* z+s=!Vy`3OaX9qpi!r^+}q5UzrduU-jpbJ1wp(o$+nCGxtZM?5B3j26@(ck8d zTFw_NBm0vzYR5K6VneVxcl5W#n1#?-d$)CE5HzKx8WwA%WmpkOPSO`J0LHe8FYe9c zUe4+}Sd_P+A7+9gjNS9L4mf{uL2O2?p1+QE$okHJm&wNz$b2;>$|h#WEd9hX0iP8_ 
zJdas@WW}GENEpu|eMlL7;1(394}+OVB@h%IJA#%aucL5TtKR$ z61OdVDY+7foc7yOD80_Og#;tK9uTuS=;oQ&b)w%nEL(CTN6kJ6!2i5P<@kh^T+d2R zK=wyws*U{7D<4KTGaoJkNQKz#*=MOLg`VD7Q50CbJo*$J=}!oTbAO%AZr!y_k)TE1;Kj);XKmII+|k$_DFA3Vlt=f0PvX;)=4X4Nf7 zkU*UXO9c*x4PIQ?&d;%@V|-qE0pA243hymUiNKR%q9n9&9H@RLfH;Pz{wkTUJgoS& zs<{rzXS!Q1h84KThYK*)ne$AqW5WRGWgNt%Uo|9t95Ab{C37bv`u@VQUANqgSVF9x zOXFQM3|TVe|dweX&5QqFO&NNr}4YwQv#Jz?a2+9^f#|&GLJoT3&Zf*S%I13 zi^)xCQ>Ko{pLLR@w;)C%3-ApGyHRrq4NtN@7g>=wn{ce>pIZukQ!V^iaN!?(#jI$= zf=l)m^5Vo_0I`D%K<}XcHatI{)Kfbg!7`;Ad*bJmWIXIBe2GZwka;9v;yjMSzT{=r z22^MyOLq#z>#%p(v~guvEn0AkP~MDnC<5ZK#^Zv^H-n6*QQWiEM?VMFvMX0G@D`mW zVL4Iyk!Nt0U5ni^D1>~!tO&VKBdZg~#0TrWOEv$YCM5QOAXFXRoc~b%MNgN?xkG|h z0H;vnmjqFbagPj7`dTBY(ewVJ0`x1&Fv`YPxhE&9%ZuQQJ6m7HW!G;{IREdk#WT*y zkIEe9!(8eEQHAl$?!Xu7xaeHt6H`ii7}B(g569d)SULDUhgWM6-wYpdEMqMgk}g=5iY%L%$NmaDOW%7P5cnSdR-E2jex zhZUtkde zV{MFRoR*Rgl0z9XK#EB*Eh-YC?c+11?TaF2t#jc~^mMEP)s5Fda03YapRkMLdD75h zZY3Tght)}r5d5c9aCf$D@@+dee0k8iXe{v1vRBsp17Q(U5wtYATRyXeBvRZ!p$~Gq zSxX%Ca>sb~ZVF)GXuVrOd|7KBQ1bKTx!68KKYKd|>+v01=nf0gKKiR_^dk^+ABDVT z-Dasj(*dfo8(X|>Y)KC?3;#Mfhcx^cbYxYZkujKxZAhtCZCaj2amLayfRFX2Zc&_5hmc{GY{v8@I$BhAx$?IuRzg4>5ZPq{w?|ohcT~@V5pWA6Dbx7 zMp8j6?r$xOjQT!rJ5C?!G}^^%%J6>a2ECJYsO^{nCBHhh+Y6ARzyvjROBZ*&ulYr>(SP3iZ{<z;9?NS7!R z8L21>ZGUBd2wdTn3N257mN30eTNrAj!>Hl0z~VO3Z6x~1o*aI2{5`4nQ!aXD1$(4- zvcwEVKHQlRJuDYNeF6Agz1wg2C3NoYGt=w(E_58qI1u}ru256+m@H9@t9JdFyjF^z zB63Aqt=#e>b$wxs5k=hbCDhi#s}b_cxhQGjH((A5LHwb@B%CX_{^{Yis1P&qCQN$N zh#{u#4|A1W!5n@`rLx>FDN<(hgqYq*S;f&#@DL7_c8Et#?m!{Yb=lINi;b0%r>^Fe zfBiv%%AHsMybe>x_gLI0H;3(0)0BX`#u^_QIe#_a8!<7AKa6+c+HNm&=cW> zJ*8oXUipbh0=p9drg=+*LU-tp_(j?f&+uhoog2BVFweyOWRZe8S#1 zBiU9cW!1-7bg^Y}Xi6adxkx60W3}kgdKA^qlo8 z`x#sdY^7I;rUqcY?z28PS-+oVhD7f8<+(<(N9od6Xgq5(rIXy=0&xg9)5`- zAeB`T43MW7(uW@GVb4qTl7n#x<^Cv=J)Us1a0jX15;ptPl02)Z^E^s3a*;359*!rX zZV#eq%m8>*Ky8Ym`F=izQJGHgy>%lpX-dE(Xtb<6oI@|bz34gE94_Rx%9 zftC}(lf5^34}Wv7W+{iQFk}m{r*Gh7i)KX5Nb*VzVVT3JrE1DFuE8rnEwqLEApQrP 
z7~5P?gn%t=&sv21TZlK>FP<7y20iNt&iP0nVCMpAJcv3#l~GPE4z-8-Ps*d5AfVd+ zyZ~iCMvQ=60D2htrGqaA{bwr1%>mv03BPbYi`D~yk5nbjDBzz_IT%mfc6eqIWZF&s zFnJC8dpps+y5ko~Q_i?|_Sm+ejBet$tQ1l>H_{sf>s~|>Y+fMt4C5W5snWboGp@Qh zNC{2Ipw+J*I7VQ*+6?8-UB@z4W68EV$JC`y82(VdpVc~g zT!H_K$MFB*F&+4mC$k)XZuQ`$rWREJ?rHoWPlpq$osKL};E^vV3PJ@6CcqFT#?8*U zL8g#vU?g}((K=aQDrd9q{81}Ie2~HElcffwBO((G42m; z!tklMdYxt&zQ_g|8PfMJJ^$ReEr)&!(NV)5^u$^LpOqp0w`-^@4?*^DmVOAb3bHTj z-Cb>-th|MFH?62O6OnMP2JeMJQ6FxhertDbVn|JI=}=A3&6k+>7yc0!`MCuY&SjRT z+_)8|wp~b;o0OsMEzf~W%8g58v0*7=By~sPlqY*J}qI)L(g){iS(J+lL-sUZE(-Q`Wqmu0ZwY^1FvUo+oFUV zT@!x)|Df(HyXxGwEseXoyX(T;1A*Y~E`i{|o#5^e+}$m>y95vJF2M;7p;q>}x9a5V z)Ga>lYK=d@eCHZ-Nbk=mQ$DEY>0RBg%xh}Zc#I^+LoPALQ{Yx8+F$GVK`x4BQl_S; z`)DejDl{-?gnEhcekw8@W4z(>fxCl*Vvz;-^~C}V`;q<$DV?1E(-9#Hn^7U{^P3C- zNF9FGHHkLEwrJPEJDiJ807@Kn?SbHsTf@CCkJfWl6@F5t275du(;VBfGAijCii^Bd zu0&JWdLYXeCNBVTTe9g((^EbTZ2|F)xLBu zAD)2O@Rps#jvsGWrXrLjIJw# z=#wdrw}Tt2X1qrS$)STWtnCGj-|UPCns(%#jMdPJma*Bwxxr$=8LfKQCeu5LtiE;E z%R&{!_f!~sPcua_#+cm5Ddfq4tN2u)Qhp)F!b$E*Llu6IIZFBiq`)rl`nlZs8@Ag5 z5rF5jvr18t``OCg_#>m&uyp1n=u;(!``_;wE8ule?4L^jVR_x`iTk(jBpWRq<Alm}~+i2c%|s#VFs7KsXYmx=dYPjm>uW z`Re|4Cs);G^aXTAgCP7LzTLk&Wd84?2NurX8lJ#^{l7Ydkk}z_&;&R6D@vJoZ`bF> zB~{hkx(#|2R24NTy{Z!6J|nnAnXyQ!J0yT+fv;MF(*E3xT_5AH`CNv}qGT$EBr61K zMisqBk#L*P@j)7QO@?L$U@ABnk@_}_-cN}K%aS#Il);&?goPUy#geGbFJ=9-4dN5} zmRVMGf~4@qzDE9g1xMels*~vMe4KPy5FLsbisnv?QiCJS@yv*rs-x*@RC#sZ= zN~KD|ClUJ!;@;2}MCEgn;IIJ1o>jowjHSi6K0VCv4bF!+^UN`Yy!B7$pyZW0!Uw7t zvGc1O0+xg{v}}_h8;;1q)j$8RjFE9%8%l@h^UB69B%%!B3I^MNtJlIQIY0=dgd;yh zP|zr-13=gXgt?p4yt~zIaDx;tU7DvW>6?$}si7t9*G>TRh~zcnN=~O{CF)=#GczpM zC58mcuS>9k3~{Qdg8GDSHPI#uC?`qxX6-sbgpkQ`A?3q%)mzrI;^&gjd1oB$SK-|x zc*K_<&;nV@pH%i22?S&53u~!6u9RZ1T3r-A##x30P9s$~L3;h5`N`uAs(|jY{iiSR zswau%_hF6d-TrZ7s0bowJdx!(yyXaje(Io{>bW=2)sSPfA2m5x zzRek*p{Pvpo`H;L5oMIOnhIJQj!Ct2YS@l+y~{FYidk%VDORZ1uVcFFf%Ose5&2`h ze^p`5`n&W(HBLGTn1}u)ObH1<+;hkz`r^I`{O`iJSL2gvUC|&hb{);As4^S_aCQj- z&Wajivc=vUH12LhmVj?~jzkTMMknU+*wiUNI*F;K-z}~#&#Go0*QRA}wx&#W 
zjJZ4zdjui1^6@;?u4h~5^Q>M4-;<+50EYYK5_h(?wp}ly>z5IvZVJ4DZ%&s>+;tSG zH-TnXj7^&jf+MSliS%Ve*B=(w5AuSX7+Uh)=qe3|p5X}Zr#S|TAByBc4!Nt%86{p1 z`VboHydeBU*GT=sI|1^6?Hw_eyd{TktNQ!gDZ5 zDa_A~!~w1e&z%WM5YHJVqnxUNetl@e3jE}lR#e|2`a>Ot5LJyLL)kfRdZKR?Fy&dM zJjRhvibl9S0c2iILrmBOxkP{HZ?gisZLdh?S%04$$;J*VLW%%eRp^0e2XxFQ1>2u; zxjn&1Ja#tgR@N4*HRp;$gk!>H@eY_wwp6Ri{Y|~k?PVPf?BmT4Wgm&XZ`YZndzU;0 z%E9RS3|9jXnyORgT$@`m?_9T1+~%Lemo;`_T}Ttq!>9_ly=-wiqes)XG!X>?ydbte zxQ({4Xq4AX5TK|~d{WHu>4Kmk5so9_J-d8h5O&0NF%v;L`u&YpSZW~ReIkZL@htzIB%D?NOBsTp) zt359~C2E{MhMemuD3?U3!dr=!+PNm)XgJKQ^PmWPSas<+e_qZ@)<27}cUfJ3s(3pc z<%>N)n5MPp=MrML5X4(q}98)+;@I@kZXjAY;1SjpANay zN9&#~_V&>bHnwx1=Sl*}N$|>k;p`IJ!`A+<{TBbht$Q6Vhn4HMVi=h9+xP=efLZ@d zG7(t_yehe-q_AIK4szlic$Z#rff0i-(vz39m~siUcHL6S#W*kBcj=1!z=KtpHWB!C zm7WcL#vpP>Zb@-DjDQ{Is-CeywfLdXG^Ymcxza2cW00xf@8fgjc0>K&Ho-&JB816j zL4vgxn_4vMe-5A~1}2-}!W?JJNSEm^qmsha8Z+wXFI^w7`$SBrl20mRHCWtH*v&D% zohb!{4c|rpt}lQ{yqlNi5}&`SnnRI*AC#%I&qPhKN_I~#1Qnt}+TZuh$|7K@1kDQH zXm6d=pLljVlB!=yXI>lQE7~_7DdfBVyTIG^$@m*OA8!!P z!jQ~A#Gco6%US;;_{7OZ0ZoJww*rvZ0NQ#5?&$bX`2C;DpSI+6?~RB(RlBt4Rm`E% zI6>o)RyUyGxq=XuXJ%-X?1r)-B<$ia0a7U zFM2}VO1dAqBWfP{r@k{X;9RxlV#^pxFeiguUvy(csE;hTwHfG51=lKgMi9db;kZy5 z36o{TPU^5x__y21;Nk`eRrpEp-g|s&wgi{VleeC1G3#v)*;w^Z$cCo*@WTXf*5Ed! 
zXH}^{S=~ezH~!_uDb#m`bOP_fY3GV~?9}%?4NK1YCGr(Zvhbe{`Bhy64#01{Bvlou zIAF=tdIjf`U4}5Q&_=DS%tP_g83iGbF;0rfG1Mxfs;XF%K<*L0Sd>2^&&VSdg+q15 z`qAbNOr=W(7kpk5bfxhy&lRbQv-@gTx&I3iUQ@(UWBl8il$W2JZ{VufHV~P5>b{zN zbmgQ#Q{?hYlx)Huv5K8KP^~4~fq@E%$_Cf;dvC|VPe~VMfR^LMC(^^<@p$F?Wdd^0 zsIZC4H)m@gUej=z95=9%4+#aAU%PmL;;44XX4YLN)81s=ibHgm+8xnlm00d&8 z2Q`W%GW{$*<}*{5;O&=g?xOMABc>u(==RIiMhzShuq3v{sap~cc0^Mq)*4F>5|B+0 zl1$kn0p-<|(X1slp2q6k=|kXTUG~AqZJ@;vB6nB>mRUo5F5J+SzkBs4#}8u+Ix^r|fXpUa73Wqn;5 z$@Y5$j~V;W3qT1ta}N`s3MLsOrGO@tR7Mc|5!~VhqN&yDr-f#DztLEg56s|V43h7Y z7I)`wnw>boe5kAVA^;liVcvJf06A>+Q74uFwZT@p&P){p{_a)Yx9M73u2c{pFF^FD zMc`Qf9Z_RMXRJuAO~Bav7wHyl(YtO~-GDGDSg2t3L-9a$4$^bK6Rmhp{CQoYpJ-W@ z!o@(a@{+l&9nw2;S}rnlRZ0=|NRxxFLRB58&)J=rEQ(o5m#!a};zEmL2q*T%HcvzO zNp0_o3$b&*m8R!@8x}9`9sKf(40wj!kZl+F_b<-&I$**71JzocfJLF8g!S>Df&By- zz$+DKmJrIbv#xHvAc_n(aVzMRXVtp@CxD{d{|Hd5_3q*$)n6pzOWinM{7-=5kKA6> zUjW6yzW|EK{|!)-{||s7(`N~iMV~b}guMG{1{JTYIOOP)W|P|)ncZO`UFsRKR}h2~WOfAE7KhBy#$&c--nSDRk=Ad`KSi?Kx08ArxqmX_Fo$Wo&$y+2ZHrZJ zEs=_-hvxtYIPbWjVkE|~G3J60-aS+6z@WGnF+z2OB?PikD)UkhUf@lN(F;u>x zQ#Jvw>iIc^RpP~(77og4bk7q%wZS;$3K-YXN*)VFZh*JTJFA37x81pOBI!Is&F+oo zw>stUsM4DRY12k0x*VmmkpRnl=C$iGcUP4TLXXwCFX8&z#1Dn}7k~PEubY?tWBY7u zuQOKvb0ag`zkAvKr+VeL8HaAHbX35v9}uun=!@|ZfAzz>&#&Xf`55QGjPu}=u4+T~ z8lU4N(sIaP0;tWoe^=q@lDs~~x58dA2deEBu&kFsrLu05hutrBp*NgCJ$t1rwWZh+;eA7b z5bT`GP>SjEM;%pE%@YPAEEE*rd?AW(D18ModWiNq-tg9YiE10zYQ1Y#12(+#>tQTh z@S*Qp#DlI7?2y$*^QX0m24uQvMlpOfsHz4&uTL$o2_zYO!GPF-wKDn*-V{A5e^{&C*L2F4*>rRxwYF^g2>v$JSKtpf`*qHv|CxXrPxMhFDYrh>Qxg3* z0xt6ZA>i_JRUtORT#m2R(M}x z2*vS<>lrxIcJx-na5W4?K`nS@Eaxxt1P~L%lBwiYb{l2cv>csRzJ<`N)-{el+p3lR zs=ws`3lp4s`zwHRLjno7h(H4FY&O$ptzQBBon!m`9K$zhIU;#m)#1vbl#zIW@5w1V z9JJe?;qL9<-aX}Ph!tMCgMj({F@C=eWAe9S8Vl3#`9TCF31eM} z&A5T&ap4+%OvJAnT2{9{=wiT#+V^y?yj;=K`iQ!N?J;8*&>d@<{@eqVGdLB?P4Gt~ zc^j2WR9nyK*aMM%z>`V%v3bDiZS!AwK|d_5DCH3z1JYrk3!&hT-;m@hbG~*A59j8_ zZT&pv{^b^La!N~~X4$_8*S@qx9zb8^X%^2bzsoG$Q>tCID7-Y8;59RplO4G$5cOU` 
z74)IGiz#V4g}$_*`@?PGd|i|ApBpD%C;Db*|4m{De5XHd|JOU6O}F*dcmG0#4u0&p zaYlBj)(pK9g=E28c-q6Z-J-s(eu<#nv_a@&@&*mV(t0F$u5NJJ+5)sfxh5DH=+BPJ zhqaEV_1@Lxk}aBoB`!UD*?@xN^RA5ZR%K7`Q>i=Z?N1^4E>yPSS>}cCm2Rqh=*@!J zEU0#3b{n1ZHGsq_re@@POq!#YZMo!!{F8n-Av>4TdkSJ{^ZUqe7BguyaJcEa7|9?n zC`u-Xy;Z8xCdGJU4yWjfj``-Yq9C*(sIko_O`zmUYO5$#w37#jMJGR|kyEofT>4dL ziL5m`T3rcQ9*V*P&zNLw7N**Y?9SvQ*{XU^S+~BumX1~ zSSHUairJ{hVfow*;R5Inb9>9?6)k%WHl68g3opyJ{S=w6&&6`Ud7ALrF_(9kzYI?Mg=UJl#rPf0 z=UeN{OOd*|KcL})U`%xw!IPR*M#F_`5R^V1Wk_SJEo02}fj^B*=yWf8k8kbYLFc3P z+5G(mv%GE~^|v2-jSX=3z#ESaqfu%5o_$Lt66sKR>gFNJuz2LIYUh|O$O`+-PGpqt zWCi~sf?Kspzh(0CG`4k*0PxmEgzGPoAaL`b72%b$*{hp!)gg&eHLWLwIK0ineg`^q zyTcUA#m%oNwBi5XL~yIe{6~%e+w@=Y(4uQA*~Wcpv+zAI9vZ&~#zSWrU_3O2hjE%J zTIU?l*74&UN~OXmEyYuqw>=tU3_L&cB}OutYa;rGJR4a z6*`4E6Tj1mbUNZ9;L4 zrgRZ5@LV^Y4+(b_8K$bRFsPB#`z*F{*CJEvUb2r|jhhb4l`fL;@ZxOTKV!&Z29GY9 zK;+T)L0jeY_Rr55_VA?e8XfvtZGFOZM5GX+I^>QmGenH?S_~Zw&6~Ld4jk$>6mx@B zMug1e4h3$M~}wlv~kQm z$A1=UTJA%uhE37VP?twSdTgdoeeN=Al(D74UTTN(8Q7Lg4W{j&^=nWVydk(&LV z&I|L-hZz@*`l+3teVnp=QT}xB|G!Us|6O_eKO}N&|E9)uP1jWbN+U=xRv$54cytlW z3TYQZP|(0Me~8R@`%a}rOcvwR4Xvw>0Rov4Jh+j^+v^XP*A75_^lI}M_)6jL+J8y= z3`0J?6f=sMkhB?f$&9@T6S-%o56;w{!eEHt>n-7o=pC$FaU3Ea(BToNmxe31K8A8p z&2#Gyp9%uGNq`~8a;VzGRYNjq#dk}0$$PwiaIfrng0z`>v1?9V(K8Dhs+WulOAm)s z{+(MS@2CAluyOj1tZPpxE1mvG?OwpFYGxVd_$mvR^>|a6vewd_qO#4}V9-57vld+e zq9D_Lxpd_ZZ_vmXhd)%0tgjTn{9~V0*4HsKfO3X^FMnlY{|liER4B27s(ib0WIgOz z^Wh`TsJBT~zxTW^ChO_ZyE8BOD4&sb^|oYhB$#s7!w`$e#D-7Yt!+dE+rrY&P9D^N zHvY*8xgNpXjF~_x7Y7PoE<8Mpp6Lc^JJg19$>^BS1B|IVJl0ZSK%9^1(`?b$xd5FJ z1*R~j92Vmp-!R;Gs(7Qt!8gtAul-GAJ|f#vt0INvbD&^D)k6XwculJTp+;j?8aD(l zPtol0G7-i8WHZtw(EacyO-B>nMwqY9_d9VKsn_B{H9!!{Q+uxMzq{=MX39#~RImBS*^BC-bJ$wX*iq zC|>hng&WU0}w*ehwM z*#Dr8;ZRUqz>|so?mqBggTOzu5gwNT)BL)U9Y8 z+!UvxXNfr3&&g8#K7mPMR&_GDQp$2Bq3k;q67*>jGrzlFKP1{ol3HjFxSQoo7e={u zE4~0?#R*@&3G1ct5P0+T=r;aTgH3S#XoX50ET1wT%7@VlJLEK)RyMaZjXXem*e*e{ z;(>-@loYsw`dhcV!&|zgrQOd5ZrL_v=%pNx10m40>3tYt778@+-2f9ZcL*LmC8^k} 
zDXD}NlCQq+gbG}byEK_FW?-&5MleVtpD}k}5*_ki7O@8fK@oh5%2k?=6GQ`Z%dH&CCm+2cN*j?2;AYJ0-Xk{7-!$7mwZ+H(4qSGaLZG}8KYbuYZYbDwhcf>`}Wc>T6JEFzP)i-}q85x%OeNzPDrP*jJ zOk=?FBa|fFCX*M`WH0k8kzgyxE~dBX)TtYZ*HXwgu8g;@?s&kyA6g&geyZD_N;<=_ z>Y`i#IDw(hW>|>NLhUG&hSI03a70+e^L--l;*YgIqttOw6NYePI#A2y{=E1pQb&9I zdv#llU7bh-7+CQ`V4%7Z)>V)?sC4z_hs@wp3Lf>r`=Fy6EGzSxLhy<7AUVW4B(|y1 zqm7j@f|Ha^N4vp`vkC_agKCf#ze<*E%sb+G=n79y5)bdf186xeYx${9gx0cJ`iqL~ zt(gN{30sW@y}QF6RPKTu(L(;JOi9iKcSagHMt&; z0S&2QymBXCpDr~EB|mWg>Or#>s2}B?q#4aH6CKG^Y5hc)&Hrpa?GYk-yw^6V4stD| zGxP(t$*>gM?F2d#RPfBP@MxY(rBM`>L=;@M#ICWD{}2bBqNU39kQ3j{D;|ZJoXWY| z9iic2zqvYBr682#+Ym_r#ZC-TIG7x{No?rnPu7axxJthd->VlyMa$GdHamJouannY zqrryYt*XnwIs0-~wD7_a_0~0FHLFQrG%)UWZ|g_Ng^aeD;G>yg#_77~-M<+NTVp_u zm}4^$w9A?H$iGLYcIiT2J%($_S?qvY2Y9rEUw?Y<>n1qKqRef04?dr+-bL6dm8HP; zfx*8W5$h$TPd(VmTKnC5_wfqZvg$B+PiYpOBB z+Et)=rsPhPXv1jc-BDp&Y|?#;pqlDa0+zCulcEic;rTH)Tr1-;alGoxx#U*D_NaJD z@eY1`$$J>;Rw46-T0|X)x|0vuhAmG&;tmi_tlupV>-ENlx}X)T%ia!Ey1MQvDM zs+PP!*L!Jqc?YpQ`11d>Q@ys7MGHNlCsb*cTmPQp#|A2?{~s4 z9G*Me{b!s$dGp(?F~)P(DW&X!j8iQ`*>QvwYHm>pOrmN{OH@mCsDV95tSyb4lkk80 zJ`KyF2k!>j-1wG+nG|_5ju#TMiAUTRV#6|IM9|Zqy-wzX-KT&h(Pu)Bjyo`lEc#Tp zgA^Wy1to-}IOgEV34uA6!eDzpgEAiJHzAwlSK{hCyQvL- zL=u&vdv_OEf5#U9W9;nRXO&9h2OJay#HRB}%U|&KvOM_4fj>5mT^pw82!iaJ^a>cQ zH^2fyso^<_*~eG07bH>bGTG=4LA-06Z~9Wx$5M7jkt!TcvUg1g_}B}mt=J^B-crgU z)FBE_h4vd{upLbfhC23Ji*X4|y19`I9md5lxcW{7s8T2dGs|GSEr4r*)-e3xOP#ib zZs=>(a&u>P(G$qmEdzN)`l)ky{9c6Q=aY}5rz$VFyu`&Q`kS6 zhIXn^#sgQY&#Q!ULjBZ&*FiQbtl^E2pHZ%{j}PA*|HcHhuz)iJlAH50!)o7l#0V*o z1!gsx0zLkxJW{ddZ=jM}os-PnPn(txMJ7Xnq>>HvoApVu5GXKg_4fV;H+SB;s(R_WVTq5gxHy7_vvv7Q< z{Y_R8g3rt(-4}qfeMsPJf0hF$eL7zqbM2d77rgVa?4|We^slv|C!iu2UM*jy);91G zB$R9H>$XX-vj2Ww3ct<^rQ;F9VTA}Q-pY_xGJBOhM$E^2 z3HsM)U&7P~$?dUJvtiPom2Dk)9ih`v_Ps2~PdT}390PVWbS4|hK){9x13mquM4;to z9}ejWSB%?bG`rK|gJcwtC-PkBY(%8!B}AaR1mA+fn3~&D_hI3->{f_74^g4I1uD|m z(LMI3>NpJlXjDd26YMg}ho3lEX>W@^>YrL9%rv={#N=nhgyt5D)LV~D9{DUhSK>%q zGlqs$)05eaX*%ssaq#p 
z(mOcrG$SH=aaaeqEsXK5QOg+^+M+tx4nJ6PRUTMcRd$f)Yqr$e>wP6N+mL&w6G*r9pgl zq7U2zoFmXyBhj>%Fm#I4gW}*P9w^q!zo8?G&GVE>Fobc{rmNGOKuWcSb1y#5oywwp zLeErAmb^~!o9Ro(G~|Lo69<4J*uR;Cmq7v5lxnYL30LyQt3|N9oV4jp^3tsVkrU|& zj{~7dm>cfGnH)5tXxQf0zeLozCP=k0k2q33x>inP{AdG#7-y*#mpAI&F~I0m&T}?% z->JEtltbGEB0A?;-HCzDSNX*T>f5ZH?qczm1aBrk1Fj@7TpKmI3 z#iA)V>(^F_k51iu$vN>3U;BkX@vyqN*9`!BBdj>bVP-+R^CopQi7Nx0mFx5(fNHRU z*mr||av^mWxnRL#vUM{)5#J6HjaGsz8XtG~*(ng{zIJrs)^MS1{aB=rg3~5gv{r(o zfO?61BmE(80tU-pqt<~m$O$UgfgS?3CSxAc?h3S;2lTD{spSwrh?*6MGqfPK#^8k( z{E8tubJ;cjiKs0q;~Y}H=Pcv$o5d2}HvB{Iu}WkkE{(Q539Fsrelis^WKrxloJXa> zj_QoJ8g9yhU5~4$p#5gyQ0?8a3O_%s;Azh;jPb$PS~tD9ZEOIKw-ngw+>pC#4?FvE zUe1Fd4+fpiUj-We)tI8ja-^r8$D$+KDEC|aJw2q+Q zbMw<~N*KCrKu+0oJQNIT7^7nTn_syhrL3CA=*|%AeS!S#5bx>b?V(jbTZ%9Mnl{wa z`rduqIX#0sYaeAdUz)x$I><4h{K&tJxNYKUUy*>(61)i$@LbHjRCT~u0~%kaxKpEO z8%f5sOM5BoY8r%-TB+hk&$lnPlX9eO4cV58r5?5%#vfi5PZ6Ozq?qAKlJB8s5JOpO z&C>*TKind5AtmHScJj_f~4k&asn1FVEesh zlEL&{Z4}y9S)>kL^5li-s26cwNRo&Jl2yL-j}aB2FqgKzJsyWHhC+4wWa-+9R`KB= zF-B@5ImDB~GZZ32s!^QW_u6;L5)n2wfFQY066bE(Z${M-Icb^SF>Hc9d>8^*flnC? z7YLHwZoo~NgvlzR;XmzwV*uPI&{e`Jm1fh1&)`%xah4!TsZB+znrn>DjM%baV2J3n zgEGEcQ$PZqNb(&<1A~M=!7)O^PYZMh?g5znknBx|cP#OP2o@!s z+zhXoi2D*YE10&|F3`f1RS*v8RH8>w*#VJF;XlsC(6HQD8Ya7YV?m&f{XJdL92mw4 zBEuewn@3eXrGk^W9%Kw7Q<;w&oEwOpb%OFowPFj#lY=9|GaGO=A9e(F@?mvYJu$KG z%qaeJlAN#kxl&bPC*lS-NyLlQWYzG5v_B}u{_Sj^*raad58xsogXlE|6^{QH;3!Ld zBw#^ky`{ecRSj+Zktx;u5$ELvPxvFj{x>T$9S|kn{RG-IR<-(Lgc91$A5+e_4Kjkn z(uteaTMedfY%g?KKU%v9oYLl%?>slq8z->@A?MYd-nXc-B~y-aio3-- zrH{!Cze@v^Zu8A@HFqH3%TE0mMEN49%~OPL8Uz@nzC6ZzM=AHo`WXDZuZDd^G&H@E zEaX(UGvZNY;)DdN-#MKaA*S3k@-R~I%pnV;YI9ll=qPgWVd^I_zgoB}Ou#^$<)vWq z@LtyxA_pmhK&>XMf!FPbkG{3*sm+vdyAg1m&)^K!?c)_{@^p~@k~D^7+$oMjOJCS zf8`}xdFTZy^c5b&c=r2?XBf2u>0nz7_?@912wI6NDPxK2FznHM<7sVE1x&-rqtMm# z_dByC{%XGl%UlPM_}y;f-7zlY+=)mcok!I7C!J=p?m1=`QtZgRF|}41re7ZyBz+BC z$?w9d4TR|JZ06?O_?f!rQE}K1GC}0apGO7>sn;>($Y6x72+?)d279!?8iQW?t_diy z;4+d8Tpr;=Qo&5{@L57PK@-b6im2l~? 
z%4yT3n^}XK69n!HE^oTAoM82}p-~OI(AHs#;u_$E{zu$^STqaf#&@}vz8XA6xajsFCQ+li2AKw28DK5epD_YObK&iWfS!#Z z)^Tq`|H7&yi0J=IB7%&VZ1V@EInS%?B>&v3f#+2z zH2d#NH(<}S5~u|GSI_jj(;y*dYb+}66Nt~oPitC@d_3HVaUK7hyU^}j?Aqp=_fy+uplRKLBDzU79 zAK3SGsS4NOyDvZv-Q0y_=jQV-hxz6505|-ws(N-i=K16#qnV+yK<1x+!`OU7c$U$0 zSi(zz4{E*qp6lXN6xs{3rGE9?P0XQ-eiQyQpcNpX7Z~7Tn9yF^xsiD8!B9|dvKhpI zlf}C>nO!B!Zeg`PG3u#oI}}8-3Fh4`&>N`` zk->9KmKRvSu8THX9X()s1z_P$-x%t%9kHSkW9umCIo)cBoAb6)3qjZA zU2~pHTl{)pO?Q@SKHJZcF@3k-%+jfkl=Xc$!_nf-o{rm@iR=6#hwN$3l{^oc1KL(M)U=r#;Q4ySX-d({z6&jIAq%Gf~RP|{I-ks3XQhg$#z!~$LF5k)F z;oKn`V!kQthiNAt<-Sir6b<$#uUYnYqN*%CHv1+r1kpH(VQ*D@D@qG;pRq!F36{q7 zozD~PGM=$E50tYrnDfvXX)}ymDd62V4vpg(*)v|x15^<#tL25A)+Q_TE6FDYP(%G6 zr*i|Y;mVUxnlrwls;Y=bo|TYs&-)c`=F`H>a-t?^??1X%ymJ;9Q^`Em8~pmUL<+Z4 zWhk1j=Mfo_m^E>p7)rwC4+#(J>u9cjm(c+MC_ovVSAWvyt(X%k+6Yn_f~Fzeka*;n zf-k6j^fgKFFJ2V=&BAbu2yF6SD^;GQ<^Ph=K}7&~Y9q?l7f2$w`Ouue=TcmYS;uny zV_=DHIJTx|cEp##fHJzuSQl;qZJ{myzhuQ`++L-mhlcQ*@YwkO%fMsGLcjWaw*W4x?D#wbJSNLu#}tuUQHr!QT7V9WkS-|Oj&Ej-!%BtD38%c1lWezHAF1L55dXnM&Zpl%d-_tzWUai7^In+x9ADEOozqVB&4U zaNF`5C%*-(;9LyNr0)^RNem5CDe}~jl=xN6Pi}ti7i^!;>NSed zkSa4^^gFun(_~YO0BswDw85SM&e0e9a7c^T1FX5*fxWC2MqfsVFZI4R#x($jh*&JD z6OAV<H;2f-%INpk7m$|--LR&hormI5_Um4 z444^W$4wDVdZ9xSjVcvS`tG7+h4%p_`QIejrs9Ntq&a=sM}29KcbHcO%T)zy-u4fm zi^ZYbh^9p#Y7Dq&&nQKC&mnmU=B%^X==StU8Pe}E`0Mv9oS)k7^yj~H_K?2V_#mdq z{sC{q`bv!UKQ@^UUi(V$pGe5SR`{wjZ+e91O8Cd7JtJHg+V_$ARfkD=-~$0;WUH z4R6x<@Y;dtPzSY3Xj}K@@B`>sfTt-?9PiireujFriqvyE0wV{_2@z}1aV8u|U4VSZ zkB~3T6T8tEXqqF|7UstGdawqH<0*fM<3;}x$2a^Xj?ci6xGK;rE|eV(ejJn@KK*4P zSUpNk{NE-5*T=t11P`Dv6@OC4v;P0Cj%R(HI-c!!4S_5W;tbqx`4_~QHsFbSqO7zr z8Q4gH=ACKm$yHK}K zKJ5bZ^Qswsds;C`w@wVKxj<4(1c)%1A%cLPUWFf43)~{%A*e2zWRXv9?`PNixaDMv zj}>rvv|scv@G~he%N`CgBQ%)2H)eI-ANsFs9jwk*G$s~}mZgZHl@CErJSYWbN%$2V zG}DhvLUO6D_=!~Y0jBWwD8iLW;~e^l!F$;S&Odx98cj~%Tg|3nqYd|MF-F@3`MAKt z68mKeNtYV!M)Bl9nBe1z)m+Wc$neY%7GzinV)0~HdJLJAUzj#!@iIAfoJ=>f>gk~) zZ@}HP{(C3COabScFI7&DiT5s# zkV<5sO@9xeY;3Q}Re?LV|6N`QEJuFXR^OaG{AF9^E5eE(<%aN-RNA@$H?cT@`yf@m 
z>;EE1Kbt@450RHbJ@$_C1n64zdm{PWwJMkJ7U)`)-PM(+gBiAK97iphL)25j^!2Rf zd#h#W!uqQoabcC5i>!Y9z&i)5N08+X$S{B=o{cUpUDc)%SP`@1NxU0t9ptZ1l!EiV zc3)ebtn=Etd1xZ@S3+{>!thFg^$5_lYD@kg;zPe>GCX`cqa9zjY>_){QpM{|x<%#o zq(drTL)hUBG^Z5TXF^cf5dG+QKc!B&HcpEB4lCB*GelIIMlrr?{kL!78K7?N*G+5Lcv(Jsdk9dsCu~dczWlUB~p9pTSiW2xK_qIZ(ne>4vLo_L^0^XvF;;I%b2wJJ( zqw--CK?_d2)M%Zdmw#!YG;(Igu+(`*FlV-`T?3nEL*1pKc<2m_F*;iOR|fr}2%}M( zyDEM}p#^?9qjhz@5JVYk|3-Zc(|tdDAkW3a+s;&U(QeoTq(*p2kP`XFhV}5!*D-~*s?g#UzMzO^QT?kKM|T`* zSRBJIs?6qYVON#=58ARV*{%ZEU)d;Jcxs%Dh~=2hy`Di4QO14${*z$+3v>Pd1jDm( za|4+FMagC%;b#4Pi}Z=Qmz~NirqFH2GufwODAAD0yS2*&l$C%OhAaz;=;Kr4Ucccu zhE+tcR+Fm&VI?kw0<9!1Oe}zJ!qjxwOii8$jf%v^&D@;5?Z-+23Fc! z*i2N<%}QfhGT?ytgm|2rC(Y%Ir_WhdByAV_wg_3DYXQPn5{>`z34Z;fw_Ruv%P0XJ}h zNjGE2^_Yk|$34J0m!&z61MPXExnr?7c&f7){1bnc1@Y3X`NZkMv_ogf=VTDLsjn&R z&1IBEdbZwp2Ge{X4}OfXkFAJ4TpdB<(HF4|5)JiXJ|#UmkdQnNt*u4^hkP3_M~(JA zw_X*ieBL#iVqdLU_s*e=z>`2uFtQLWljii=^dkkVxBrzFB4W30xy z+ys=Y9*D=k=i=2E#m**)r@i5&>0a{{0HqtNeZf@+P5F5_+t|Uy+nOyM3*JOF>7#wt zcUY04=Zd&=pc0?Chv%3OE+-9|KSfKQ#ML6a8SyY*^%y$S6!BwQjj;6x)u~&e_h&2_ zdm33$DTlag~WykpO(@mtl#b(lw$VhBdelS=pZ73~Z6b$a2udOe&R>c+2n{8;R zcjU;ON<fB4}Bj*s+*n}@I3=zq7m9(Fz(X}H0pSLXq)WOC%CZ(D<-_A=kp z;&VT?#BOj?uhf|u7m%B+E6z1SRqG3R59J~`IPpxvpE3)(g2Y$L+mGMCQwCoRl4Rnv znv5f`5|kBT1LEziE<$bPr`Ci_HtFw0cTK#w{4Km z+N7o8QVKy6V+;9;9UKQ1!Pd#95afbxzVH&P`r*Ceo%T|=!I$fjCpbz-7w$jIpfmx!jN(c z@`R%2r+oeuC}zClZJ-`7+~3$g_bVg}e9jW_z>0eRrS}|z|E9$LwySsOiedaEGyv5L zME)FBvfJTqEm=IB`+K|Q+1W;7nsV&u4kkCmgz*k3PS*`hM_Y%)da#%yqE&${TO5jTEH}vd;4WpgTDuJA>ATn z;mz&;Jj__`7wV`8+iMKHY|x_7BU0*z7yBF(6bxZH)GSG;6yCSK?nZka%oo^2Nd~co zHl2Hqc!5~TF~S^PeiWD$`x&aNKZ_G1oVi1XtZ*VONX3ON6HVz*gfb<0WAZ1N<6X z7n~0|3NalI;Cw4IGw}fFAjD%ARL$1s8I=6B$WjhD3v25bBrgK7He7~ANYx=s&>=1( z-=CT}Gk=%*gWRMsb4>R(cJ$0c$C{wl+Wf3bCG%?0v?JeDB( zgC#9S8&&ME#<<)r#_?R=DOw1Graiy|=axgG7HOsBP1x9id|5i)q_M1HqG+}>iCK(V``5jX4(-jOE0!H5a%qtqA&6;#Oe7bgxzu-oDOMl)Y*Q0h+6M+B5WnLr zjvYs~sXmQ;txglkcGCq&4Q=PVy*UeBN)c3B#4*`SAtesa&9$S93NtmUMfo5mP?Kqm 
zbmGC7Cmh|28g3tHHyV3kfoI=08EIIPiGwGwOtKep9ixGuA0ZGjI0W_0m<4GNCaQ~i zZ)lPn9L-i3%*vhhTMf>@pq*#Ad2dk%2$hK_@Xv-Ullf_1u^$}{Div@34*%#(t9_)g zcDz;ex=HuTXkx0Vi<(c#hr_u3a4fa2%vtJcTC7D2g4&s;=;3gejC*ag%{=ik68?JB z8|pG8Z`FFy=#kiTYmTpz?6spvP;kKmQUYRl1%spv8D;Q@OAJhw5o6y`Dd9#^V3h!q z;dhkL3^i9an(t!Gd{C~RZFneZZrQ!Q6n+omFowh9#E**M7bmm{jji4>Ji3Vrpgxx9 zFtLe}cK))LkSNMZjXkULO(S$AgMxB2q;L{VUNz3o0xl~F)>)on1n)^Cdig_L;K;l0 z8kxs)Ivdk8Dc$`MK+xc@a&Ws+ts|g@+4PgF;owd&LF5Xdi73P%6@)z51Xe&jmW)*LnoVp$pK90c(3vi7^qbm{ZkuK}9o3q)&;UB= z4`k^f`O_D&wI;FX!^83Dt3>4hb_jbFy65e7wZc<)7U?E`6;tf8EqOnSpH3nYC#By| zMIc^2m@K1}F$%Gx&751))*8K{Ep? zK@*ZYa7O*|29Ubqdx9w#Qe$ro3q2;n&@uMq6O%?Hd)@eJsnegA*AxRFlr(9tcv-dA z1BConifLkxiQM9@M(y8FL}-6IoNI_q99=CJt|VNQ&rB++V<|=)MlyzEg4#?vrcpb! zze=H$Cp#X`?`k&jIcMOW5gUe#yRJunxk7Aqv&(^>P_3axjc8zDv5MT1wT$;u!&eq< z>L%q`z}tdGq-S@5DXOzzV?Fu+_kd@b)ydODktR9Nedy>sfyn-GGBu~0d{D+Y9wT3k zG~g2!1O+B$a|52XS$*1hjX^g@0`ct92U^K;09vF)b#q+NY)0hv4Y!hjkQ~z)OxmZk zq3ug8Yf^2Ak$1E$1;>5dJ$>Ovg!Zt`p6DbJvHy#@w~ETTYuCO>>F)0Ci%#k820>c7 zySt=QN;;%LK)Sn2knWc5626OPt@V!O{qVhm&F3A%EjC2wod0>oJkH-C9_Yppahv)j zz@ZdBo|n9k4dQOW&Apb^G$Ln)uUSKGvy7#;3*e8iO!kh6QiWi)y4Qq`&GYBw%31}U zy$npkp{1;y1GfxxiaPc)<_kpIsadr%^o?ogT)IT%TEac0&JS4!*FVErgS%Sv8a6&%Nn;j;ctcXAdJ<%LrB21FM` z(1E-8ut+9$_r!;NAQ>d`7%4)c4)F|4b`B_uZlRa@W&jwD`(UY6p)-7j8~w}J4!l3g z4IoKEL$@4P<+fESW^${hQ0ez@@zlsdf(H2SKYI8%yPePOyUZldSre;*hwZJJyk)QD z<4t--e4CL1LKF*_^wE6#YW!Yr_^vX5nWSMA2MMi$lkbg%1;*4R~3xsMs*#Ya{SB&*-KCD2v*Wc}rzaM4t{r#A0 zX+VkYMV18o=*xA5TK}8cmGVOEV!M#4iQnBRAi)yysp4c1-y_iK8yvSn=E27g2i{n$ z_G%`ac`rlzx`kmzBCRh&`<4zx18`QEWhbXX1Fru+5EGA?b-5TIg=(>na<$ha9Y~?t zMMb_AJF)q{K};@;q%)&KMgEIovi!d(CT`FF)qsQqGqb^bZK2+z@n`$nyh%+(C>|&a zY<|G->QH+JKXSGN>K2F z%z@XbS`DW_)}QF-svdOBa;Dw%`JiE3$&@FB?sel#W`}+>-?szQpg|KLO4o>hDZ}`d zb}{tLeh(WN1-Ro|F5~ra3tmbs4>m)(G_ny0RP8NGIY}c`J<#Hoq|Yx9;VXHDX(jvj zWKy>NxGpC^$<<%{6@Nd(E;$8Z_9feJx` zpG+nVl+9Wk_tyZDdyZlp-13;}GGx)zw(ItzS`c#5;q zMI1d0`*xT{Iz9xlctrG<*$&T7AIo^46-l0@E0DR%&xhTaom#c zA`YB$LKHi)*chHRu#-w2k})l!p5`=IPnc|}*NQluTtg?SyIocs??0XFMWCjb22dHx 
zpcuBtq~n1ilB?#ZP5;cRE1u`3;7b_O-E@PjiIbV1MvXf@A-5B^Cl%>pX`b;TgiTx% zsvMn9U?R!uh(VfgRZLRU<6~w$S6+t0q-{_`7gJuZNNNy_4WqCFt7*rR^T+wd^jZWo*+n|yrT z|A+D>5ci2D$vJ`(a%XqjIYM_a`uPC&oXaO#PlY9b?zJs=Q=Y^AkHLG@57A^S(2plT zzt`kQ7Njhy(n%lLsV4Iz(_e(lbfA-38&h!7*i2_Z~znTtly9Y=Lu32*(Lt!v67v84riDq zpqMX>`IRoyd5lY#YY`96L7AJ4(<5ag8hRwks@Gr9_=`M@yy_W~pu6Q&;}^Kvd@g9D zV~NTBy5yfdng|hb!E2(a{Yg54frI9?Mh+6hlcrGnNlYUH4P3y6){uY?j%2;>K5mGT zxWm*lNu_j0r8qLRqL4(4vU+m$d)6kH=CK3EB+j&kpW;~UjK+7+k#a7mL(7PJ! z>{u{a*EQWL>2xFX2?t0o@WXP^7T&?{bVg)ku<;!xv$;UogU}Iv3!IGD<%-yCs8Chd_xPb6v(Dt)V~Q)IMdu#tio{9~MWLo$3z+SNxl zEJVU(hK*3{3!IebVXq8r9xQ>hKE{^xieO7#HZ`7X0sGOPa{)r^9of+D-=`^Z5sR$F z3d|E3p~fN@R3tbBXaHK^Wm0f7he|>kYYq<9TAS`>41h&h7|!rHC!LOES7G%wqq{>B zmWdVe$XR9haaVJ=2^;*zWv`a(_mG;&k>y}l16=nn{_ReW4_0mN1>XlsC)&_25LE1* zEm&DbWX@^JZkNtN)HWs3!$tNUr!vF6(N}^4XbVghvEs^S7;5LG-sn zsk8jL3UNL|;X94=&}dD;dujWC1lCfx{>njoo0lBx-{w#+#1SAp0pi3Rk|@l$qI6`q6)p4S2{!%|ASkO=Qvo^iMWDw1}OLDR>)UkC^B>?3J{B9Yv@*yf7Iftkz9iK^N zr@~5$aHYU^aMQ%HIkBY{*UEOni!FPlaHh47PKP0|pS2qS`Kh123l3mwubCPWc*1vv zsMxWmEk}iG7)2C(qGqb-M$AO{33*q3%5u=ht9X(I)(F*+o6%5i1__%G;1%i z6&C~gDAE>%CRcm#PFA%6qZ15AA|p+TMl*8zoJLiwrJVmM_@C?6!6WU*5Ut-cE@>j- z?%n3t{Fz)`a-Cg`1KxV>?(O~WVU_Y<(H;Z1-Xu*0u>TeOfvT%$@Rtt>=zUPtGwwG& zt;80Q`CzNkP+)Abl}iZOHPW*sR-6_w!qJ-D!OZwOigqwj~7q zEn1VrsW6dO^h&DT{Etx2KW;a-c#Uf}4jnT3-z7^rvS3v(VAq3+H)X5+%$o&&AmG?7 zN*9FHiB8;Xv+M9w`~uVYiTAN~k>2APeWZaH^eY}BOc46z@UhL|98IBVLVFeEZ+Thx zcXx>(t3?2k>BHg1avwXa{Ey{DL?j7=)b`XP2i+80Dpe8|Owo%48HvqeKjZ}`<6?Pb z$2}2Fc(a>AXY#Zx7Be~Tt%?pZZkQZ)ePfUu2z5DMJI8PGZUX$n6cLz1LHy4$LU62J z$fBD}IzC&&x_*i;yD#okReFJdV2mzB3?-57i-@1);&ff&<3yDtM*g7As}g;t^@!Bh zH=bh8Eec{7qF`%{pTb_l^FDF^b&LBUYNs!Dm-!oEi_Onvh63Mx#!AiSiS&HISaknu z#DG~H#8#)NJ@xsZ1x5*j1n$Hb63hAkf{Gj*LAH`??L8ac;yn*u>&1c;o5>$YO)yWq zPAlJT;#aoAi1v(0=BA?Nezp#S@!G$yTJwayKVYvY-WD`qxcX9xBn!WJnL*7PMyD?h&%CL~B>wBVI^I!hOVI_GW! 
z-D0)6d1ZmQ-*n;s8&+r3rkfZ@Bj@WSoECwwFS!ngol=eS+KV~324ybO&BvX@?;qoE>-&Q+3K zi8%VB*=U(7OWoe0#tNroqo$$sJCSs3)A|ETk8sijT;UhH53ej2*PFDt|H%RVO&JP+ z>#tH5h}E%y0Gig_WOQMf!T2)Y@`dNpe*$84aD>6=K8n=oCa!DKsR8m#zp2#96J8Ed zjOo$Cr>+~ zkIA8{*sUVWd4}MIwZG5wOAF?Xp86(?rtAh$T%)ZI{C@X4c*FXA2)Of4{x$F~#6`{< zv>cF7(w~y#p9aa(H(;JSc@_nxFEs8zgU@0bUpXV(Z@RNs|8Zvlb0}c>`@&(zvP%L1 z-QY`28|ni3MBxq&a^XEBnVp#%9k;4bvGWx26u^j0AaQ;b?%$KDEDYSAaEh1hz$aHn zmX7%`v9xc6`xG2qwu`w`HIxxm<#vX7tBfmwT&m^<=?~DsK|gwI$8ADu~_oAB+`_s-C)rf2eJ~ zJ&slF&G%_09LlU~s6IHagOiw0c|3OHeC(`t*$lL2Pas&0)ZpB}i;@u1(Ki_o4z)JD zP~y@zUUqf^4Cf;I+t3D}u=^&4f>zV@g=O+)8PQdGsa!43Ay}7>cCCw(4PE9K*Fiyk z>uN+nJ6}rq4MzJb%igVWX0YV&l!o`@dbbR-R^8B00TJyma+lESxbs(wt=>@n>NS-1 z_hO=d-$a(TsaF1TKXhPslb_~X1jPx!Q#gslDa9v+c)zHaux~MW2RD@Ahq$ou(RO}@FnuHERCD6= zOzO^0tYKvd)#mfa7sVYpoV5B=aGS0?6X+FIl93$0Zt4e!J#o+*k>!0WxR7Ks+4dZk zpP&I!U?aqI35oIW>AYl;W#9lq6@0=JZm#;l`5?VnxMODJHsnfs=o6U!4s&>kYFIZ6 zQdrKDe4*kksf5z>8X8`Mm5%^3EdQDD<^*24+q<3+#b2#DKHKBQ>oYuzd={UtbfV`V zR}m8O4t2yqB{C=dBk3(avqljV4alqQ5R?e|k=^AEaPeOv?$BY5?%4lVK%1a-7t3qE5f(O}_}YJp_t zi6soPPxPjRG8SJc_1gce9O*@blaC5sC@}*xGI)`73a=%R#Gw`*l3OkbPAS}PmLosu zokN&iNoZ3UEZN|ZfM}wU(65XttwEuf63VHI$fB%1NE^W)NN zc@4@fulq94K!8Vt>mis;I3~}ztgJ(mppyF8L)rh#_;{9k<7ZPRD+$Q0gyF6v-JyKn z%?nKjhx|8iX7p=wP%}jkQO1NAbxNYXoum{KZA%G^zv*Hddw9M5GSzt2h9d&BTHkg7 zkde9j!pI~8D50Z@PD_T2J#HdmNxa{)6_ZU2%`ZzU(gr9IO+U`@k!QKJjbBuc;b>eX z|FHHdEe0A40m&vBi_%)5^}~=v9KWu8Kr>+-CH{Tg0B;+e|2J*>U(_b4Y+Wh+6p~o6 ze^H1$5@1Kw{zYx7Gl12l#vi}C)Y-0UzXA^2abADwfKDJAZn%#^=nAVLQ{n{9mW0wa zYrTxxcDIS^c%;3I+5%@wqFTE54gZ883AcS0b7AY=zu+O~w*tj~yb+hTyeTdRw4l{%f52r8oF8M0Fz&%(zAywR* z*J=;gABxeAPwY*D*_11hu`Gl_$YH@25v`O#9*xPLj2fa7Qpvvysipc3D)C!EC!qMT zgZU4%4Oh?mP8-dDq}vb*O|g|Kh>bt~Ne#G7>lS-Rs9w+|9=5q?*k3)Uk9eS*#TZ2b zeyL;6r((aKV7G^{N%0}Xli;g@pgtDIu&~ZDGt;S?L|=GVV$F%R6C(t`uU)$9;<%$2fi7tTSP30DV?XUdn?@JFq^d~p8Nc`-Y2+_|YsDY^k zjGytE!%)xO6coV0lpe$jeV4leZ;Nu6j@7|pL~!^%i>`Twd}I&&D3fN?TBY@n^GeEn zoR}S(ui*0K1o$stU{M#u+OC~^-54aGotikn1!$*M5Yh0V<+&5a+Am;+-3;GwD&n%z 
zL3nNchqd#ToM!#xMbWJezi{f=@q##aRLvJysRfi&>d&~M(?8+MpZbp~qkR-t<5Q>= zl-%8{GA${iAJ{ZPX~%YH$NL?}8j4?l?;lkdM5_%1WGa%SaUyPp-FFk{TJ@qx9Z_D} z_cz6g|KNRRf77EqEB9ZW3?P5%e}JQYpda)e_!!>l8Xvba6m!CS>enO{QyEKh97@y$ z9HeBJC{y_UDyTClW4>$i;?7R$k;f?_ze$Xv^sP9UT5!;7{2WAayr=yT#*mZ)B$Jga z|7ss*HZUWQhVo!nBXukMa4w=vT!2T6enRJ>qMXsOgP8>(eWp>_>pBp&glsE7GE&WJ zB~4!T023ZQn7QQ%F)o`?BccLts88EV#Q*tMj|A;ks(VnhU7I`M27-N(%)_7-tcd1a z`V5To2BzQN)2Q>{$$!YFONP|#p=gz!!3)um6>uc<$h9P6SN5c*oY^dVT1PWQ+h;@T zfEj6m*5YLxT7Ne$#&97V_kjC}R=QAb9-DwW{qE3TMoZ{79Iv@5 zF4qe76&|kM8R%%c9z}_d?lUMvQ}%1U-Z!!Ux&9`| z0v%X^eG{Mx187!n-0}FeC~uN};lUQlZkjEEI_t0>{5{@o2D3jnB2};=DBNvlC#jtx za?88&u7cAgYU6N*+rHD7HDM|#(dJFefv*b|wtA9tlJHO=Kqcs?$4AwWeno{{>Zmdu zGAL!bIB>i|ArJnFX}Kv1~B7wt1s{n;CAIM)}Mz_0qh9oc<`0(0y|F#AoH~0V6x~NUb0m0o~xp%UHeSN5Zcy(|&|H#E-K@)UOve>|4B~c4zv-))-sRgo;EF$b6_%1%!c-1Kk zhvzE!A~1dw+xeE|4@GxsUW?gMn5ri$jfk&w*J<9}Sc9ZJ_6<6GTvGh5^o$S3zf>+K z!3D}@w$#*WL+XS4p%(z$%)#zXn$xmkEyv35TJN1V5z|M4*t{Q58i$#lUU?XLogm(y zd`Ly1itlj@dRhp2`9DEudpP(kY=mk z`847B@$e-cZkP`9mI1iF^?Iii>B6rpIOm(vCII{2NqOx578e;4%S{Cbqknr57ZvdP zRYO@6&n2_K*{pT~!S!&9t7bd=?-42&cW2X}ANjdvfN8lVft`ztA~b(=-}e>}Ytage#H2b-DaY{ zp*XptK?;~n7Xsz_$orimT5>N8j}zN# zD9UyE8m+WjI$YLu)|?s01{ltipc*!3Zl|iMO37PUE}G>BziAQ%9^6o?wo*QUq{Mkh z?>d>nyna?1*MW*f1P>HV-uI(kr@9V(`)oNp)GuL7_YUfKskf&PBsFa2HeitBxucpSE4ep1 zi(x`0Jpw9HU-VY7gp*~b5YVBl&788!B3{#hG=fCPWb`T87JLzI6q=AiUs+^FQOGaC zHJ|BH_)rJk>@oNH)G^m)z-j;l39w@G$yAv@IKdViAj4gxd`JMHa>wxOrCamr*Zihz zljR@R=l}Lo>FIwQo6b{b0s#F~exNCMBo%>(VNi-f1dZ7RVdtYXbRyLyB8HZ9oCq6* z))e<(MW@zWrFm>vRJl#X@}zUE4D8!UIuSFuf6Ao%hSrkyNhvDiXFYsjvVexai7J>- zNTHeFNfWxBSK;Sf2xI^??~1^lr2*1M|(qA&@B zHGHkooD`T`OB|fdwroV4B%bwecJCbRtX8_NR}r2D5N0_ZvI8-Q~hDC?6N_vw?0>5= zm8ERRfqcMUbeBLv<)^`uLE?d&i|605*Vzf&PFN(CBsBXODur%CW=!*h$_&1j)ebmX zG^;T;ZKuxbuw4nKaNqIm4t!53^Pk1*la7fRK)UOx2jDIFy`1X3`277BRMgk;80t>z zg0a|*;v>#Sl{_dzXbEBBJQ`a;L)g@q$D4aeTpN>2wy9kT z;-+ui4?fjYgpNRwfhS{z7W{%c?wsK1D^`lm_yR%DGKZd3{{%dFtn_w!Rwkk-d 
z416>JrGGgmF}8F2{@3k5hS+tDZme76K6YPJjk7N-xYGR0A$*@H?~Zxgvkezs{fz(R%pKQ7q>H*uRJy3K zZS+fj}Hc=hh!!gHIQTh8ZKeHHx}l;^oxdCmq_S74yyKMbD?GBmjJh zV~JdYVu)Y`jA<*0z>IA$UR$KM#p}NpN$o!fV#)HR82fjKt*K&DVpe2kQJG-pU{kMR zu2!622UM#{vTT8c`=>PO*iGU?qvZoa;R>msDgdE$fpk%DAc4b3uRL#ZzbSS7&utU2 zFu&;lu>QlB26)#Av%rhpR0t!q;QVI#B|_O9fF677XaX+9VHbPVipsZ}FE!+3!l2zm z9Vp;7pxmSx(Dsh${VkW7ir?Zo5@G0rolwa>0hTkeeFzw#G#yHQQFfOb-X~&2ST7mb z1LDaJnVe)d)Vy992NYI%9t7Jci}C0cJFSGmr#znVGhvi1M^04mAbQlB>3!=x?aD$V z;iP_?gD=+?-o_-YRr=~iytiBfQ)B=2yQt?ED8OWL!bGAS=19NNUIVn#4^zKe^xInA z*it^D+;@;XMj;=;*2cX{S11S)ZC^`8VQ70x!<%5rA5HDqBpdSd z!vA5KL-l^PDfxBH?E~WQDfRf@KVTN-H=X}~qapqU{u)%CBP3V+(Htw%Tww0&Hz`GH zC7zlh^5i`e!f=^=>UA>BIi&J16(2#oI-ib61|Lnod5FaUte&Gqy+MAQ6~y&}ea8<_PFG~hkQhMI`!?8{vB)N-)qB`{(u)H2K5$Vwex}YFe-dU(wcCafyrz?|;z{ z>Hm#}hy-$K^GZLL<&8o086$7B%dv-E%e)p8{P^c@8LF zTu;f`Tw)Y=8=H!hz8W!`Z*k#>WW%Ol7EqQkm6l zg60pGRnLueW$=!b=87+kq7E68+peW4s&-4<|Ef^o3ZAb*MIQ!_EtgS zUnLlSR5JoN3%rP@Tn;3B%2P(3J3fT@cVE0X0i9mc2%?|bt-xGs0ehwY9PTNEr0?z? z$1l*yKzGWlE$Y-^g|$a?=*N$Mj*1rPEld`LK-jD}YdwZshe{}trY`~%pV4A5_^`HQ z%!TFU@vT^5WU@1;BlZRtJVPRg9>8q_nb2R`hc|_pe~6VvaWvciqN)kUr z>r9Kb0Gand;~%F9c`SlTe9>3_eQ(qB{oTNk8)-)H0~*_{uc$16#U1KMj_?tSqFmd; zxJx=^9ZuVpz(S%06v1=2^_h1$dl(nXYb8*SCf1SxNUg>|n>J$Q=^(vLuD6#LjDD7$ z#>}^E$7316B%Z+6F*CQQp894LAWw4`(szLY zl$sk0#{lxS6NdI^jP`RS%qD*a3`UHh9CBW=gLa;E)Ekm;wy8!NDEYD4nI<)SkR7`s zk!=)>CYiQrMUcA)afA%_YgX@P?QNt|h3=yvw<5s6h4eL!h5Q3kkDv`>i_qMAN9Pp% zjIw}O=gNIZ3`0FXwZNTrNYpdt>vv=$57SjK|?Euy{1s_(9zf}CdF^ZRF zx!8X_Nx)+}W~!=Tr9Rj`bRLnDjc!E!xqCuxxRL)_J6-LhI7jMwzrk^zZ8RoUiD`jX z8O4ZLt7)r)#uaDvhNyZ9?cxZ8H$c8Z~~(NWKMUdhB8fMk}Qw% zl=Bpo?N$G6+8g#d-*v?fZVHyuy=ron7A=8_Jy0|7T|H4$o%zfEp}s#m=FEW|=L@ zzsQanY{Z}#W&4mq4uh^N2lh-h4Lb3uLh8=+-McjKX&wUqw2Pn{|4*%zbkBnIq-^4X z%Z|I?Crfu}2ML_J$n>VH)A@z`7gue0T2tFa<&bn{A+xuU*@BibH)S#^BqIe)DKQ8f z>0_OE2s?OFeMw@$T_0`->k&k|byb`Z?gp#O5@LmpE!)dD-Oe-VJUgdH1x^ufhCiH8 zk2h;E*-_-m8u>Y->h#j>F29k7d_o* z*J{9GA(=qut*5=`DceoCBNI>$?T8<{vB_8C;w71cVV_=KDc)FXRKG 
z;P!+qHCr`sen8T8lF3+4hD^zTn3NI5o8Qm)92XO*{Lan==B~-j*(~{oM(UlZo#oh8Ekc2tde}Rurh9(yatk z-MqK9Is7tCPB^#?)>Wrof-e1mtuRMu=d!y%Duh1L2g?O6E%+)5dE^7{aP`JvTDXkv z9i2Nd#KkOWbyQ2k57|iHG^#QEl17mo1R8wm9dT4j|~v5zpzTK zD%bpvNRPM7Cs z6(?`Ep)M5ez>lXaUSZ$@HAHXH)N}lmYy;DQ_#WWGy`a?dr9~me5{fId=5AKU`tuhD zNw?vio`ujV>Xa$4c<6Z`kp(bZvU})#J7j|^}0Yq9S z7MI$TlR4*<&`bU6Z=A;d?L!)+ZJU9$gU`v4kUKmwp-0VoirSMGaD^}~|^2*1=_NHJB_`AQQ8z<$^Lkw)sWC&x~ zVc_~-a3!60c|TjXDN@;g7eczEV6!{<%A-|eRF=J+(3g*7(;XPyaWx~Vv98Nyw?yHl z4^z?9u;;43!OPtA=bLXeghu*3JXAB8T{P!PFeyCnCnd0U zCesQ!^D+ZO3pRMq%&b6SOO)zqS4yrdWSP8aLcv{5tP2<9m6^n|vOIgfAV}eqM6zX1 z2xr1lAQxLo1Y(-}`kYk7Ru4JPRIHsk@dS$@H|s%L{2eI~`b1}j^*WBsbm3cZ@bo6|2muhkiivQ zNnua1y(7BMws|j@JX=AW{9+n$LN`bfOi1pM0#!3Sek5ZFbkY=(cgEIdR!1ANAqk$? zG^6KD%#MQPO!Q@!0+#=-zwzxXj@DkBG+DnpW@ETe9eH@^KYnG4uO>Te8`tew-uQz- zRSOEM{wv!)OsaCKeehq}-#V#iVB`y(k5=+KCwj=b5f1ew=#NUdIufxPNG_aA$fdAPS3V+cx zLi-4h)=$s!4N8c63)zZBMG>MlO5EQh>MaXbF3Nm91~uB2@6^gcQqhfaE;hfmFP^Yq@~ zpWn`(Rx?Tz#NP*buX0O9nJ#|bXUk#%w~Pn-(_9{Xfg8t3GnE$iejX*&g-MFd#8$Q3 za}T;c{;SB~9GCo9!)i!og%Ux5Vh*O%68_*3&X&!O=#0~MCW$Vn)&Zc(nlvM?#GIzUtBVV|wmr0PV^JxYtukkJ|-u%zMXl`$*)J|QeB zrYadJm|@dBp`=l14Ug}ysrGW=e1HB@<^HJdCq_pui{7XOkH&F4ud^Woul}CsfD{eKA z>eq3=BCAY8jALi4EAuR$3MkKIca(P?WL5wYLUv*|dw$#jzlkGpCua4y(U7LlX7w{} z>7ULAq2GbrT4W^P7sn*w-g=p3@nyrm{5T@QNa0MF7#~q4pQu_DOx>_-JGd}3hYEx$ zXH(myH(@@G^0W0`vV9npQMq+dowB zKnrj>pwZ(4dQ%4w>g(;5$%n3+f3uo=$!1Q3+!4v3gkCVoT_mgZq2fA`fMn)eA=Gj> zsra0|Or|1@v9Eaub5yV;gfGNmqoR;6lu~m@b>u9=z%aZ&BdURO)zJ$PyuU!5oC^L! zLs6JsZ7kAQ>rK=a>oxS3Zx=smSicBaNiV@Ug<1j4%KMDTg5VAEleBbYSIL#k=!+cM zbBB|9rO_6`yl1X-dLV<&y{y8#T+(u&sLm-)flLlXT+$)4LgxghbFCw@(0~h^s6mEt zdYM2(b?kR|RNNX-SA1UsHACFGhM>0M)=e?*wPhK@!94M1fcmgO?Y?_+}G|izjUEfd^{%*yi9QWO{QLnQ|!e? 
zeO6BuS+DUj*xn@4`%lj4-z4W|{l{Fl z@&_BB`{P5F${yIevZ8_uS(Hn%)@R$n!B5j*a4Emz%;MvZTTNtw%C3{S8MV%3m|VZ8 zrJnD8m+HV_;r}jAFbQFX(8#2=tvj?^5UQv`f7%F^Efl}k;(IGlx%#DAxjrF-nZZs0 zl;@c`mu7Yh_G1y-gcW6k$spynB=uT*NlWp&8plZx}dcm z!bH06DlSdKkM(i?m3)qDr;KDh)3Yn6{RxA~vbzD$-E)Ges;ZV0%|ORsj042qkNM~+ z$))XPmTh-n0kIPMzEvB+9Y7G^Sz!4R)4j2Y0$L;qY=Zp)%MOclLR&#Pu`KSpWJ4~q zU9+5)Jff^nt<9EI96ooyOD${&he#EoaMO5A;@vbwhN34~huHV1h~m1B-@HB#X#}vR zXA*x0Q-fN%5$jWv9h(c6in8A6gAQ$k3>p{tpcfHAn+I+q7O)RNlbzQPQj|`4`^jiG zLT1_R!fJ)D)>fnT*8UG96kZb6D+vkPn?&vZz~igM$vFfw!w24Q2Q$W|ay?0TWxA|5 zoq0T4o0!p9t%)QPa1UUEgIDZ*csGSbe?}vBH$w)$YF!c47Ge|~pov^R{X7}O!`|N< zPGyfc(m40qikd=nzj>#1Q6v4*HBPG=&O7-d$3!IWhmV|12xHKWAxKx+AdA|x=0+=Y;V~Zw$`Wic=F5QU~3pJHOTb}v4L4(1T z!M7kz9f|bgKONJ^m`Fc|)?^*Xut?H&z+;+bk96~$^{zfJaasD}&A3!vgA=3}z7{9h z-&Dl1a{cw*rIsM%00gW+oOnPI&C&XUNS#R0Zg=lI{~D3}eYIogs!vSTp>3H&yRwZf zXCf|1bY5QInnc#0xH@GtG-)kqi@~hpA?84QuJ)aTyJS!cfVPhvKdr<|RXY}(o=Qfl-?D~>L)H@E?%MEZ(BP63@o>;pePHA&N-h3JhepGeYR06UrUz4!| zDC4nva>(LZ_@}BA(6tKI4;#;xQiho?h!*yLVY+c7mUErO_oQ8*^4Y;v2nXhr#=2qr zz_GmouU}JhBVY?iIIV9KTbwF;BGQ=0^s>9r4S=$-l{xLO_Ho!m+BzRcJ>dM|ETUpa z03%?ix`JGfdDUcXlX+80V*Q;AD?GQmTKaI95pVQN+Lnf4uP<@Xzb|^&{$uSX)kYsT zU4p3SHMFg!LU+f?s;5rnxylnr+xoYzSI#>7o2qFxw!a2Afb(RioBwB?4C3DYWuA-_ znCUc;L?5Wvo?JLElJ{wwuW^l zv>^CK(%Om1QGf)=7P=^L?EV;-J;-e>71%O8f^PGDyTs(ZrZb*cYO=i2mNx*y8G{^$ z0z8Y&pR26yshJcE9ZdsiPJ)=*nx-31KL|dMS&b+QKGk}|)1~FUPN|f2FpMa_`{LSU z?Yqd$mCLccyVrV{i~&mSL~KjIInU7%!KpvK-)H-7?xraMFujxyvo$p|kn00;w@YSl zU%Ihup!l*{e_%A&Et;OZPZ@fMH}N#eT@Q~N2){OFr!mP&6O#!Q0va6Dl*(|+&+!-|jn zue|ZQz&x(Fr6k)W#4Lwumk*6J7M#q*C^WAr$(rKTXSBHlfJN_KH~i==C2a zeQP*5m%;y4rgk5`JW&tqw@(xlCX~(0b7#v#3*_pW|KiOY!E8Zj0N8MXS#knn8!?;UuclV}o zcc*az2@u=~PS6nC-Q9z`yGw8hBtRfo@Sq`RfUnu--1{B&X1s%+=MMfr!eI1MYdy7U z&8j&kIUD8x9GftEUs#t!OT$N3vN8E#F-nb(c$D&#cW=_;tzUnpia4`@C({#ng`T)4 z&j%8EG53dLtYs%!*Xo|{4g$Tl-tMVQ{H!yZljCudP+g&xiX8#G2Xu4FBCZy;hs#Qs z!6V^OCgyS8=?O-n2x8Vpdad&@)*i`;tcj=j6<^}4QR2JcTZhY}q6LgDe~)`Lm^(p> z<%A$Y&^cWS96^We)(zt68ncRGF0B^2HCD$lIr+MsJCKI}AwA_7dNF7TrZ9yzr!l+S 
z_A!T{Gy)kQfTa4uag*KIwV*?#lvz~M`!yKNp0w5ChA9~V6e52O=3G}h^$|tl4^_kp z5lB=x?%dNtUSX~;;IZ4#!dzfn#SD(w`o#-8s?1kypE#sMRqwAkof}4B~^^Yfs&w!U!It zO(X3oQ>AZj6~(s}8ttl33~~}_kff(giZIm^-(v@1$&)4?LDLVtL?J1Ht;3}qb?S?5 zd{?~@X-pIqk&RD})YGznq~Die9{SO}K! zhnSZBeG8Y6NBAG8{7sf&f~y*pTlFO_dkFLb!sHO)hB$Yn0=2lPj={Es@z>n8L;ZjF zgqwU+@ZK+ewy?8Dl&50gp}^9l5z}Rj%jLND_^pJJfA_M#|FeXnF!x2IqD$S^w`-MA zf)sY{;e}dx38fi%{ljAAecu}V&>uw9Yf$Z<2p0$NS=sWxH{EjppY^J<|4kkO zte)kWZNyrQdFMA-r0P%C8+BwEYewD6@ z4e!mLg1;_atB%5W#W=!iS#8@B?s490rPp8i0iWO(&k+E?GbrUO~LV}=wGSMp!AY^-2D$@ zqLkzPTWU|UxTbi^n1Je%(&GAI`z;D!<7#LHZJFGJ`j+4f(iF~n0+l`0;U!}G;Z2@l_c4!YjAh5<29e{&bN6y_g|nc|O&?($ zD~XxDkHmgoBo2^f>uHWdhY|o&yn~ZedAz%roETn9PM@>7kP?hvDz_lhzWiYOD!Sok zII~e8>uv}(Q#9)Sale!>;Q0Ql=Nk%1!keBMq3hR{kZ}A5iPr)ag?A2rAcj+XpAZ`3 zcwUPc$n$uN|9=FgALute`k{i+;d$X_ci+$|s3WyJxIgse0yxurs7~n#fX< za71{^yngFtgYu>uvujNaUOHK5nht;)h}8Ovx@ysQVy)Gu^pn#c-j@Vd39JaGrftJ~ zz&8hJ=gkU>@V1sBR4784+cXQ@!dcpp0&8SKNX_zg_=BQ2ZEH%+)ZRuHK;C!dQncFI z%(8eICCF#i=bx}={Ji^U2aD?lNNliBhEsYF6lmngBbZhxUw`WH>sCyQhiMvN#r7(W zmG|`fx>hDS%2&RhAs9~kxqNPtPreSa2uP#AC^O-504?-l+h(_ZG09t};4HUrUurts zpy$9a+T+{u_Yct`I(D|UPwet%j#Kgd4T2Q_L)i~+xgQ>Y{7!ODkO)SH_ri`Nfq38> z{^XY!j)JiYE~^4D^R$^M>DaRY_zP)s<&W7^Alx^|@2TzWy$rM=mfz;H1Nd8V=odbv zv!#DEj1x87vDq6WPj8?wS=NgG8r$X+TjXO_4_|3IZv zpOSQuPpC76Mz&EXRwZN4A6-Hzqe_Y2^1Apqk}sV4EZ^`^dtMSUl9l z0TLzT`NeD*uQjMtB=Nzb>-t^9rQFW3r+p8DU;nc>Qd){ElsRf^h~lLcuKeTl*o(YA zD0qW^N?}-l+tA7PHGg_*@2780eeGXH>L|7*q5Du>{VU%a(=e1BYG;al_N)}Gy63Yh zKH9)@w+OI*D%#Jw1_B?K(0VE$>i__D`?gm5#(8yrVq^IW7#AkD9r%V@NOt$d?z8}W z^t&5chXrwTM2@j^!|(YZ*WLP4QIQM`ArJh-Bs9D_eqX?90Vm%kdEzlok~h1Pf(78> z+q%QynFWF|#2H6%xGNDcqa~ad`PqZ?zOwzL?REFf+#=INv5-jtv)DR*+?$s78yrxe zMq@hv%0KT=vbsX6#wmY5x>4;MwRs9&QAxiKzaMa8GhTIR4^*RQ#qAM!cH9U6@8G zI@1LFU{0RhiO|HT1m-+gR>z6dR@ceeTce=X3#PVvyC`-HwG_wMlQN?V1miW@=0 zt)89c(VrDJhe=(=1~bwUP80(FG*fx~Q8}hicfW-~+qM;bB1#<3g6x4jkDwxuib5rL zitpPtwk|nlHpH;|mL+HS#BI|gwI)~kI$U*RAlUvJ@3*DgXrkkEcKt_bA&#AIL07(C zQCn6CE`2>$KXY!aU324fN06(2B*h2p1OPkIz9Khb&2d=z^-4^|dLWS$@KslezqF3W 
zOGEdRBdFp)Ea{oYqhPoB7WO`_XTA(|3SU`1f?iQkOi0aC1ux!hmNTQ05Uq-qytcV< z3;G(Lrt>4b2};-UNsZ*^ul-vaWr{BgmKO|WzMxh?!?(X-p}0ZDn2bc(?FJ7IFt`~r z5(J7SdhyO7qHW85jji3_!Zlqcj)ld9)Ho@i59+U&;0y=2?yA+;It$u#i5y?v*1O)o z!-rbrJr&buRlfhdr5ropS*_py<_CCI!=K}CavAYrwqOACe`ZtX?X$o-4uuyh=l%k_ z5!eH4=kOcoYu^Q2;>6gJrP%#nPT@7-h*h-&c&7!u735(>G(fa%h8?;yEY&ht*I)f{ zr+UAEOFX`;DydHp^10V3pA|Z3HAu&vlU;+&m{LV$Qi170rKd6)d7PM@5Iw`i9jwCi zG9z-FtQnU7uwH%c1a*KhgaBbzSKF{gBUAjytl1<)_nQ3x>w4h_xe^Icjbp{p;7qs? z+3Z!nFeYz>T$r7ghdc(1!8WsjGu`ExtwcgU!^*Juw~x#v7LwP$**TXpV|gafTyJ}w zj0^djR)fI$&O2C%>g4L+3D-Ii)n8_H87bt(%|n8+R3Lk)iZmw{OnJFwab zf{H!xWTV_e+RN^w*ffHF+3*qmcwc3$hpLkjzF4x&vdHS#c)GHg|Gr7T2vqg^pYIYo z;91QlAn*|o`k&U?g~8->fmsn0)G%HaYX6pP+e`dQ!APrAE*BnAnnO7)QMf5N$qVXs zu;*Qh!SN6L?doc&9fz|5G#UHvh2Z9$5-lSO*jgL0C@Cmsk5IHLT!8M4p{rx=5Z%Y4 zpD{vUYi&W-yf&IayNno2jZ(|*HFBPgG&c?`Ki2t9m-U_#fi;0R2<*i zHGCnS1I7y1Wz2$yhpE~&qoeB73I}7q?&t#Frp~hZL?&3J9xYQN2pw;&OOmS$ZbWQeTBMPsix4 z*1-&mSBM9)E3lKwekm+t9LsE#U&7$aV7KrQ7q1n#{T?O0pEDljrJl^6oLc}LD`A?g zDKvBeIJC&`vz{kuQ?D1*J_hkwiyax;nA5A|>&l3QIcg^nBK}f24nA}=E+r`Tiqt=A z55e^*{w-3xFJ5Y~vzeLUY56lc*1^I{IEKa}I~gBmJ$z#A2%$nMXrnv<9ZjOTshGx| z*>kkvoj+s_7FpuPF!fo_`*k<8*T(P{-E`vUx?Cinl7WNO#p@+DT!#Uk0!MUToHdb~ zEId2x08Z@h`V!6#V&{{wM<-l&P}qh!CjWe+!Nz&d85I0G!CFlriUz#W;EE%s;21IB zSNpUD?%c&nqn>J)cU_nSF*IBzSe#1R@h#J?X6@^{o%sq!(`2v6OgEP7+HJUb#ze?Y zR`fZp{^DwBDSIr z@x12`J%OjcIUI4zzB~8Dwxk*NB*I`=Y*^j9eB~Su#5(xoD))BjbEoY?L#P8e%%ZS6 zb^bJAk()5knm5hYyaMd4oVASWjN?{i>gGfPUWGBb2Bjzxt~JFfYk^Kk zif<{fPp#s!&>jxpqv@~*t@Qt2-Xh@=V{(IX2P}Bl$N`MwOy3zGzLS0s_B~-8s%5~T zP(96O`I{HO3tjvU*#&w}3yxbZdW*s8^S zx$IrHv8v`!M+!brFujm}!;tTrJH@w}pYL#h_5Pr+hfkr9-({Zg9s`&X{9X_*x^6(N z*H0)45Bfu7Hu9#7EJXmNm+a8F8|=@Q!eu7iTH6lX3g4E$cvAu4gUt-E2Su;7+v#{E z=}I|HPXpkD7Rd-I1D;@j0y&;lY6AaXlZ_cN3P`;%gZN&uI*(ON~NWR^LCwY^hFM* zY7}w;(WYNR4f?yg5zal&(j6&)cKU^Cy zcSRIvpp8ZI`LrK>M5<5a`YDl=eDY0*@)sh;O6EfCniq3Ps1w!jD(lu3U{+xFYF(J^ zr{Z~|W9Yb0!8G}V?^G&8;h$*NUW!Xp^hLr_&!-u2mcA_(a!0bWN{2lTFuAr+#i@tA 
zQ5w!8pQW_0!KXmuN%n*IESK27q=7p)6HjI@iY{7vk!0hnV*IrnS_f#L3jIe~?xPpy z3>7C24;FUA5u^rGaLf430GH>ISODg)fCFZ{p0m|luUSj zHj(RqHjj0Ugmvk>*Pmrdt&y8%+}TbUDWPNU9PXj_n0GyNYu5D}MGovo@V%S2{skmo z3UCIe><;3(R{=nV^01n{-ldPLWM$gw?wzs&nvgMYMGN?!2n;27-VHMiIbXI!ocLcO zTqjDr&E6ntT}ynTW6lA1Rsq4m@kmNcQ|;SZa657?A zAadTb)j@A*4NB7J?;$?;>m`WUqe6(@Rs2aA{l)QSobgD2HCx-s4v zwM}bk*EB18`t=k1kM7@D?=e_V`nkm2uynqECbLHuM6U{^Ru^+NHgay|9%Hnszd|o% zb>e)T8cf;2T18uwJpWoHOu37xQ_vy>HA02ZtWP#!6pC*PZs~9XVV~#?(dh*gwPdyq z%z7;N2Q>4}ofAK#xE>dnZepVa(@mgcZ)h_bA`U$4=GxMDXn%Ip)ec(=uIJ55qp zi79gtr9DyF8yns7u^ zOUnd(lqE0Tq!WBq8GrHXJ^#qX(Uj5q{5L;ZUH4#(X__{-Z++L_(rqH9+b=KKYFrB& z@z`2Se_c>{E8fv!ZeH>Y)?YhE0?y%3n?Ob(-a0cg%*%&exAQ9dkBjnN&B_Me`K@`< zKQ{u$^WX%I$11dci6j5p^g}jLlzhq^$dwqlefP~WAd{D=1Cxx0Wvs(>uRuB)-#>}I zWg5MSJ&;sA;w}xuBB-Pbt&K=FcA7uD&Q=bC=DCq}X73P}<>TFpaQ*tsrrqUuBEH?U zXihbZL&A*Ym6MMKgTR7;(O|G5HIb~wmqQcwP`c#HZ3c_Un&|>)uEPRat;-ov> zJ%Lg90Kd@fR^HsisY1K31pRf_T4tTlfvLl=h5hET7idejR4wSuH7R4CIiI^rM;h1y zCquJTjroR!8Be~}e&bLp89S^b!RSYusFS@g@=wq%XN^p>m(5W)?8ht1vF#G zh0#>XRfSa#!)Kt%pfxjtUg_WAN{k>@GW0{f!uzM+-V{qU`UrKJvaB&jEa|R>gax1vKSw)E zTJ7E3GR|TKP@P`=Z5b2aKvNmsRs`qZ;u38H?E-(Gu4?QCgmgBTUk9#S04Cjg?1tTGycQ z(X5mA^rTD!h4)lPU)2dp7taq9Yr6|CZlQG#W&TVZ;(zdmZydO84gAe?3pvDVW%X1< zpLG-G;Cs|~0;%^nfFpY2iNw4r5Su4ym38Lvz5v?!21?EGfGI@IkK91V8WokwA(^M# z{zreXUaVJw@#@X%+njCbx-g66`yTcC)*!yrF!fwBqQH>AZ%*XpYlj@i&8=C#Whc%U zWN|uX@vx?4Ev&02m(Vu>Y-jcFX zNOAZgkECVkkT-AZEX_Dh%f#q;8XOM>Nmb$*Ai-v>qQAXj$kaS%Ee^$cH{ZnH0?m##VyOg4OJyDwZ}1|DL|X5H$q z?&*(CRu{-KMACJDA0=Ee5wI(9byEV;$bfaRBem~iB|OwIkyc(@1r>eg%~oI1n-hW) z^TmY;!eB0FSsB*&6wt`yH)hh(KK0%3_j<9Jz<{Oj_J`O(%bXI7YQgatCvR;#mo;>L z8gdWZcILkXdR)0uGP4c<9EX|=Gue#OfB#DM7r$!vhLqvY$a!i#pLJaYazCmo9&?9m z=olCf{A-B**EaycO77E0&j)?Ecj{o@fTV@gK*~hTFZg0ZLe-3 zv8>Pu7!+c7VT$l<^{)a%<84+(x|zCu`fpWjIMpCXmHu)eGNhhJLUz`Tmeo#5>}E}l z%SpO#-YDIHci4=omcZdxmU@#AK=KPm1lG$#>GBxg{=HK3NZd-{*oXIxu^{4waU0S7?ft%6Zu$?&VOBec^Y6LPMD2wXDZr3!Hf~ZBwmpO+ zX;SxGv}PpO;aJn3Zc|!rD&yk#jcc=_3&k_M?tNA`T*#YK@**LR 
zqHeIAmR;8vBFeWLav6U{wYEn>zK+V0(UFBZ6lu(H%uZkL>^esu4vc()EXo0RUe^iu zsEhn}R5#$95`|`9RU%!jiHuyqT1%zsFGe0!39T|hHoFfvoo?n z;&>k62~1l4&zX|P$>(PRCo6)O{r}6fq}?|AwK>(X>uD=EOfEt8=g5{|1Hyf6cL-yb z_Pjs)ad2v)R29_%L@OI48+Il!IVYPHG>e)&=YZk>F*4P4c1CHmTysr=pLwxzbc@jABjT-E!=ED3%H2UID&uf6 z%Cf(dIN~vlt32yfEf#w98$6KMUz>8N8_aMwq8OE6WmD8b8Rmp^sM6wXWyxS|z}L=> zL|lu?uhp$asaE;p$!7L=^eF8f!?T3`?&Vq9c-e#6)NmhNqVLg!ZX#v!gr7N>_4gcH z83=rQP{;lU0Ub$FbmFtRUWe?;2Z5Yv6SE6uGJR2TNl^<_w0$&Il)__)k7<29vd1hh z*#zlc$`kiO7agaDY3c+n?x~Tc^t!jTRZG$BOVwn)H%8h%I20$SK$EiPq0vja?Jv`` zdmnYoLgE7A03>4&+}k6a1{XU_g?OyC-(yc)gwS|??v4^=DdTi~gZM@Yab37o9i~b5 zzAVPo0Jaie6^(ePPaW*}UP&Lz%MCj5wV5(b*3515aoD-ao^>kDl1o)mUC$9uHNLt_ zj0-(FMJBs2eM0_)v@uzJa4 z`2gg*&Kj22bY44JT|IXNay>I6=#iRs zl^PLTJpgc(=F|KfV(nM5GK zqwc9DSl1Y=fYx?{bL|6Jk#w7dD}}y$Uqxe{M%e-rAtgu6Ralyb$>KB=r%rKrYMGDQ zGg%Fs?PyY5iL(MSZ5sE>rEW1quOT1TYZs5XP~S#7O}a%K8NQztfwPFE;b@~l z&3Wf!<=0fb!w%I|7p1TRRgAep{1YyVW+g$8hX%cs$e-vPl*0BGs7a@l!S4Js?oVe1 z{v&IC>x+dDZY9>GdJaEc72Bh8;Wno-VrMHkfs~q4P7z6>L^$^2nx=8J4MQ!U-fwA( z8InOKy%>lakWEHr@E)emo3EWZ0L<7As=7wA8wIUbCyg)+SHd`*!ih;lQCB8c=fs&G zw~K#EWR0??B3@%hGwteD%3{7H`5usXjaJp#K}of0eujRy=)~g!Zm`WtynE^ZdLDb{ zZ@QX#V1Rj9^k2*e-)Yx^U)1!iZY}rRKk(%~mn-fosO5uunV1TP=Lsb=Et`5(Hq)`u z3MZar^TkVFFoQ-=+}p&V5jouT8g&lkREJ!RU+2=Q{z7rr`@j|{X2V5GNaU2HZTxZ^ z*LCRHbf-Ea7_pFXMn?hL^XMmr6e676tb=iyLRO6jOh2|e1!h};DDMka8)PcuN@Si0 z20s>D#F(gzyo+}+Gm_cC;wU4_*m7It3gickKtye$DLCITQ5SY7LZb1-0(NKfYXakW z`~+hI_^|Lza$FN537-%qk|`xyPp`l?LpDRe5b?~o+yS+L6E!?*h zS9aU(K}L{&?jf8+=&`UI27a+{6aPr3v$%m>qgp-LBU?&0@ejK9;rcaq4lNyH|MQki z^46c6?dvgOQelPy4=!YTWepa=j<2d$NfW8SP_D`eJv^gbp#E~i>UWFaSG?tCQTa^` zlNSfykgrd-uVL~&#Tq|Rc0E)2;4zl07Xy0miVM8pA7GA`iZP~NM(~xhI?fZBJ%BW^ z{E1{F7qJ;=r!R-b;7q|D?Y?qscKNaG-GuZi*$*2oAh)7wH|v$*63vp6iEswSr7H+isG-2R*Zd&IR22G)VP zqi`2zgW_+Os`TDv5_j?Mrx>6rNd`?msnB?T?ciu@STbPZ3fn}#{lXK`;(2B<|L9Nw z7xND`*^)Zv5fb7o@!T;VLz)aQ?=vyQRILafEh*q6zAi!Gna9uKo+$7n0N4E8rSc z*AHByfG}{>yv7-x8UBO{r2uZ*`kV^RJ{h+m67WF1n?O$5UO=$We*j}GB 
z-Y7%Q+Y&t$r2p$zU!Lb(eIE-WfnpRMj*`ESLV!1Cq%E;Ep0_mfP%Jlww9ZTNACn)@wDL82xCH4r=)B+qdE4V&04LFp z2v?{++V_7CsLJ^{PCsC%MA2_?7S}W>Xeb?1h4|LlE{jYSi|L0{`)1w+;oAM&=d#)r zM;5hEZn`<@wTavj>hBSk!$ge4*FU3P!XpA4X(?;8nq1IO0~yP&5e=cFQ@<-$4g)lH zVPLHoYj-~?5rqYtFE_ZRre~7;F|W8emv$DI{BW2T5mbL;Etb^L>Na%5O1gn(Os4%r zw~_C8cp%^r!m1ZLY3mO_89Z_iBW{zBLMk|*4ww~Va+oKJJb<(?(k{Triy?Kp;X~Xn zAo0oWx~%ih)Br78Y+9U}(M|j1cg)Q63NDvJcm#@B)+&@}^j4_4h1YOUi#fH8p$$r3 zU@;)c$GZwzlq1V(-7u>p2}2?=1hCHg@bJwffKRu zE!Bx`i)naPS48vx1W2Miya^*dV8I#6rU;%K+&f?agGfF}_Y+am4hI8T=Ilw_uoN>5?q9W2sLqnK8+NG{N_;~P%o`Cr0o$_?4$K*J0Ys$lrYIr3 z8Auxg`dHwql9HyXdp`G<5)HIo>%zwvo%#D_cv7&2NS(~gGqt}H3C1Ye?8aeG{gQd$ z{nqXZiw!#iFa2WP1bVvSf($#~&Y6fV{;ObP6x1h4POo_xN?cbE^bNM>56j<4Y`{Yq zYcXLM5f7i}(7-!-t_54mLl5&2@T9g3Q|{ zkX-@fYG+g^OaMjOnWns!Uz8l1C2Y??=8FFz25H;zq;!yh&A_M?eyoBlz%alAcQJU=QKW#>owO`I?pwM8}m0VRQy?gMHJ6SmS-G zc?JXFa(rQ(1x~mr9E2S0$Z34suVh_lS%Gr8P?;oeQHoDX(j_-uL}ygH*#D4?@+~@F zm13Ry?M`X|yn;J;?HRa*oYb}rdbJ+}8fvblOxZ61T4~e+l_QXjVHVFQqGpS=3pZke zS*pTcR?b>(6TOrEFy+`pP(>u)a!;$p-?*9LfTD?wfZ%W=DwJEC;MB#ijM^`;GL9 z`K@v0Fh8mNx%KQJQo4lVO5!AZ;$fbbA0IXLnuH!0)Os)4v5Ji+Z_+o2eM=7fkaaz z$$_|3@e3g$w@*+;GD06-aYheU5Ib3nei0Z*Yfa}n{^#w;vpZQobCf~w4osuiw)w50ZEZXmsr zgqiPN>8_51I80k7G*hO^G1J_j*XryIoob{+g|%Mb^6+^rx>jP9eLZ_>{s(@lC0y{I z3&Z!kT?Tm6$dZa`Lj^N^Bs`URU{Q9EhkpIAm(BgwB_t3Md#FlHZ$=%sXOr6NoVk|| z;G_MsT7wp7x)dtDGryB$lT@Q(nuqXqj5RXn_XKwUUHJ_NcJwZ#W}|IApi|^L%OePz z)$0VlH-a4DpC9&lTyr4%qYujxBlj@U15Wl}FY$gGudKVUTG_*{+rVX}tQ`8FAC$D{ zX-p}U@%=kgeQsR0=#y4EBjr4d$U+-ACxr&k1+mD$iHJNz=H>< zicraxn+0JJnFC`}X1&!-k>$OE9TexV3rNdSAj{p2DywAeu z*a45|;9L)mBccySQ7@UcB|9hbk(}o5n7jKlnqs=>>K*_zRt?}K>oK5st)jO?N`#`$ zC+pLYYlb*_1VC4uNo}g#Wk|tQT3=`(S-Mu!(iG?A3eEi%?eaH}a4w;;(+9^9En!D! 
z>L&8ACqD19V#od%F9St_k4_2{=p%P2VV`gHG~+zniU_W()@1fAz`Czz1Xnj~cDzNsS|UC ziWsI_-n*=H-YaWtl5^Z~!&oI(Nj+yKKkmcK8 zJrirp(c9KJ66)OGxP$a|mFa#W7|*MG{zf;RWPd0EV5c(G?R{mBd${?c%St6Y?i=vw zB8og#o;3A<}Z6)MK#e?A1y zf1ELjrnctJ7Ub;zu>--h4lt(wKVK&7NKgEQXWgrL9*rS$aB!-Cvm3X@6c-P-I-mjK zL=B}SLUGJDLYLK;!k5 z?e6ihwUJf%4Je43E4++jLn_A;l$_EE^t1Z7gMi$jOR3&|pU04c3>L*y+&$HqC@5S4nE7|j?3XzIbK!c$` zfPE;;Q;w~te$A(jfoJ01X)K#@yTKcWNV8Y`qiJ-X`|M4dqrHKzse( zejw~knthF7aO4}ri?57YxdCWHJZhY)=}4&Vvb_rdWdX+(p87$4K0p+gKiz?Ob;QLZvl@GI>kWzQR1hhkwDKV?R z_ilF8JCxa4ms}E5$!gp~5O+`(#!-CLz0|T)E}ebfUV7P!`hnS$`3b|6=a~xs?WofP z+rEG&u)uf19$1(V5!DPj-&pa5E-r>XX|-i6x7O{vd3NIT(Qb;YDjqQnZ(q;N#RbV* zX;Yc7(D4u~UPC1>$T=0mbKSb+edLI(H!WYPmRbc*MC3pDQ(L%Oj)fKOlCuQQ9reQU zdz^dlr3?yT-$`uVbVJe|;xIqa`G2PO&f{bPdbrmCU))dWtjeJ(cM={Q?&Q89X10Nnzg0u_%dNJyp(D6+r?sO!?2 z`2la8gZGY+xRxLb--!>M1i1yW7Ekpn7H@sT3n#(mITp4SN>@9!NcxC1cN+Hs`zv7v zh_29_ytat9ZjWaCK=mw=0WRZe`cpFgK65*fN6-!<*B$-l@|y!>ok--!msEtR~{s@kIkDPF~L-b2?Li zEw8krQT}zv_wg}-dH~pTl=YVtFL*3OgIZ2Z*49qd?W&(MG^6DkZn%0`P^t>ji}$=P zHD>-23|pkxvS(&6geB6vUYeQVZJPUJ87i+r^{ zdf5_EpgPnl_nG^Kn-Qa`Q(+Pg!A=!2ATXXNkGjf;9NU>8faH_iOVgdiEYf~@xn#^` z5z%3#ZTn_(3?;j5kNjdd=6k1W=KHH-yTZfq$c1baFN`f$A#)k%I*z^d($72e>r=6P790lTe3Y9sAK! 
zcfI|f@My*q6}d}_2w0TEa>y(Rz2qkgF=x#gLs|Fotztai+m4s5*U|^HepE5JEz*=} ze%%#ywH?Mud1fV!f zLO5+sLnrbY`RI}Wd4!6U$K&$&i||WPB~GJ*PUUjmxf6e|{uuY~&+@dV33Q_ihLawA z_lD%?H+_Q64SXgA_3yF34^&fSfTgLw~ z!-T=gE23cIV>wml*d?&>apL@hJo>u*8Nn&OKO87m=wq=UL#nr1x~lbl!}xH$C+&&G zBC02gmZpadW)6XgT$@QO(U+9s;}rTx$}f-r3N-Au#8TuF?zZZ;e#;c_w9ceTGw{K1 zq35Q&Q|{mQTW;U-8!iug6|d;-)b!3x$O<`xt@P1ejvg)6`)u>O3%Bxpv;zfOFzm6e z$}JUqVdt@+ihmw%Inr0Jrg2>m=;^T&nRu z80y!;IvSnN1rA?pABspq2q?&u=$Wvo9l+tL*>+P4e)qO|8wt5c@XC4;#n`NVDm6IR z#Z578$9WHOHA)KZJm4Dk@@=db-)q(|v)Z+QVU8n=pV>owQ@8*~oFJ~nf}F;jTEq?G z-j+G{08w`Sm(=t7NY&ekxwayL^q+1?mN(>UsBV|jcD`rj+yCBri=FRT z^$L*hQR`7k70l$KK>6krrVA1J;#5GntTgfB zO!^2|BQtU@A$4^Q&4+#%g5Ko{zl6xO*j+l$t!qkQGCv3zueva^KfKmgGn~H^Erh7u{dzSPz-M88=TD#j2@3xuj`BL^ zOMjSAlEZ2hYnaWD_Ry{M8AU*CqPj$D5Io%3uxtCX#;bTAzt&(4 zdl&%1&s!v614KaZZB}F+cH6xjhp{QvMI}kqrSpz=+;dCi993L&!@+;{FIEE#;Xw50OJaf z%G!Sw*-%(fX2rU@iV{TazLR&F9{r<4MPwS!J}eZJ#$xL~fQ- zAo90~VQC9GtC_^kNpi@E@DL9LmDLk)6YzPABaX+aRZ?oBHDJliwlXB{Rf5nDU+Pj7 z(Qc4rFCc$jaO0(8%K1RJoPVz!cwNaIKOAdy<&sn=tfj9o8Ox-zNk0PR%IvD(ULiVy zo~FpUPt|dvEV1=djF)D+bV~^6$IF=$VyYo(K@5t);>tauxN_h0S5Si%#60t83dyR! 
zmR_&of>yalQn?w%yimgdN%B3O$;5pZ4VXzA{T*_gglKFOI1Py59mf)~aVU89+bADC zN>bPq8&GH>!IZQFf2jE4KlFJpS8O3;@QAAcJ(JTNiu&$u1y?H>6^qER>m`A%vS6*> zTJmHpU&34gKBJS*BkqRH_!Z{vN=Ebx;MkmKY*CSi);A8g7Ce6CR5-uSx*0n zZ@6WE;^dSh*Bt63>cK#Mb{-PqOq605di;u5-Ps%$WkOBg$#DN1r4Zj0Ig zR;YCbxr&@}>4FOLdWh|c2EN2DP}oTjF{*KfFXv@la=q+z5aSq#rbiQ9!}*+RwCktW zp=q|}&Nk$@kfMl1$(20`bin+%#4GU$0pzQyrH>d-?aDv`aDS@zY|1cD&02=O5%+@AfFfv%+LquR>A>c110Qm{mvIAGg>Rfz+D0qxv4>u)3To~;av-W z;^-4pcWajyPc6ZJz)8}UHqNGwY|=J{&Zd&4#&#yAZ2v}>{>EkkmmYCoOic2E9~Ms~ z@eEcgOW$QKIkw$#sZ0;Ab2V)%c5lqc}--3?A}&ivR#3(cGSB3#XNR^FC> z*P>B0Bfv=y4{bxID6fyohoJw3Em!_`R3uK{V|Q-Vww%eHK&<4kRO-~(zLfe1JY*&B zEsw^PBs87uAPOF(X2EaHEo&w36fYu+M2ATBfe4JNp|~&EGwLbB6yt_xU)2SbG)JVN zT>@BDzNeRuBb*V-O?;{ozs0s(_8i6iR=cQ2t3DAc;_)hT zdvC=w%%UO8YE-^SK&btPi#x=w?7DM7!Wy%T2@(dUmX?nwSEbgOSj2HaJDbEUe0c_~ z;HNUilSqdlouy$Z)I_q^NBc)O>=~7-_(46~tlnx#c|}woaw&NEngikVMJ|PPV(_G zBs5$RkMDvEVP4c= z{$gBbJ;+yA^i!Tx{0!K`g*`rhakD-Fcr|6HzNrFoM;qt~ibif_T(x!c*Xnx@!!PgO zK7#DGT|^v9%g;XibrI`r!%`6ZlpswqF_Xfv0PeM2evkfdN`ipWER&OnsV@pMY`oq6 zq;F1wO1-0bbop~>HIl^K{5t;7{O0}{K{kg6bL7ymhkmeeQ;hY8%Mvql{@6i`Yhne} z?%1k1`sE_Tx2*g~qa=mTA>}kAiuv?~yEYBBcP5F~%eNvKo)AbPfHn2rO{8`vd%2ou zXgjKylOQjJsrZ->17VujlsMb5K)=ry*dBQ71J=Rc*QaHBt%?fpydusAq%3v^78COP zy=1w9WD^y{u!X;@BCks$-(f%u&>X_@aKJ#)V$Yc{p#*cly37an8vS%M-aek?#tX5K z+-O@WGDuD|s^+4LLRUCkwA5|f>)81gP}RkZKel}dHwvA={?x)f>*CGH{^%{yj0b_a zykOP{A1q#oN%#DgAW3BF&e8_~bc-eFk#+U4mA=WP*ZI=?I~kQf@;^IYuSqQInvkBk z9;XFX^-iFS`LLDyZ09N*uf%F`Zciv+zvLfJkrc?32<#_pt()FZ`LMAcSq2TVOBjk^ zXX8Ap$?Zo4wpxs+zM`MUE9MjX%Ft&)u5jHyjFfgnJ%EcJTEw`)ta2rE5v1~a@mk#a z;?uy34}n6i0Ow_u;J5vPuv0?LFV8pBEwmaKiqmR*wdVTwe)k}`k&b>zOwp#OyFz91 z7&`2qVgJ@jJG_Hm50Q@-ko!BqD2RoS2=&DRjF0mfHV#+k4%)$D_=5m6pM}<7(-FyH52Wo*3G(zcd|N|} zI%0}G!9(@p3k!6t-VGK-p-619N+jBY@%7L^u2^Q1nDD^Vp&Y|8Uvo+&tRaa{flNy$ zcAQ}cqHhE>6p@HFnr0G`7O2P^HGN;BB?qFNH%nD$3JUXJeLeI11Kt0sPxPrpe%6&8 z$oZ(9|CkYGQ}p)ri=Z!{?qK2nkGivp%5zJ)H7>#3A$V}t;O_43&IiHW3Bldno#5{7 z9^4_g1a~-OcmI;!IcH>?tL{r~xLDt+dTUk9dS=lKLR5YoRS{`b!W$A)LP9JXG8k{S 
z>-eI;)4UwIlXtKf9u&6UqI;1h577-b+}Id=smPZ9UN!bAOw_(%9Jp`}J^~uJlb84- zq=x=wo^~W#SVyb~qf@|`>WuIT&6|^b04rbDAvQBS!%ruB4#FQ+%k`WWWM#|y_ljvM zk#fdb89#w9H$QpNkgoca0TUs=8hpyK@82`i4Pa%oF*=4qOK1P`pTd;QJ1vHa2V_22j#$ofb9TjqcD zbonQ6>94lhmh7yn0;=}groNL%NUTHjjxbr!Y}9Evi#g-xG()Fk9wVvy!yw8CyQC+; ztXNY7;2Z(Vw$1SC_THofhdUe@a*%V{R}i`qSD4mdiusSJ3K_J8*FJFw;6(F~<12t9 zX?#&3>sFsAXBKrmET0vk$c8m!(-%1%dILafy;1*gFiuE*A?Gq^`izKcyORo3EabMaa^-K<`SwEJdO5MPb)XES(ug}Ri+5SBzK2Boe5 zf{zU?cZ};mNSa}v`^#~BEl^lFt!86$UtmA!)AT_ zymbT#*uvD_i0JaN8#3sPl8#S9c6k@?lfl6GXta1eQ$_3!Rk}FO#h$kKu(q?FTd=mL z{u1ne#L{@krl(a~U#$ZLGtrxCFvN$V#Jvhb8VX~m7LSd~cTMi#R#}-SPMR7h(^u_c zT&>1NGC2S*C}|IhZwK0Zffhy{b{$onn*{V#l^}|sRkuGq@xvF_T+>-WFq3d{E8xtM z>zAS4WiNF5UCL#F0wntH;7jHQB3hP|s!0AHDuCI*Qu>7ZIi)ybB-N4|yEsToh@c8_8uQubaz1rAo?l z8G5;7%^?^~G|4_Rl)%r_l_70OH=&fhDKbRgoy`l=rQ8|7+uRQN3EMw{1lXn4S1k{Bqb)tUM6{q z*3fmRB8v7tKk+(7-hE_!WZ0-SC~32-A!G~JBXJ<#cZ5YTBk`t}WsntA59CAa`@QYh zGuP*)RwNsFvC}|%Dv1p&C?4Py9R4Rf9m=(Lm6{;xq6@pmTa0p-(D*Sz>;^D;&6rd! zirH0uU(qG2f9Q;ecapFvi6Sq{;f-X00Z0bn)pOG>R@3MRC%!?cvu)vgXDvAJ63ktY zkcZ)+h@42E9tGm`8GxIiKrQ~_mrHM-L`XSUfvi2b>2?eu6_7-_~2)Jq?0 zb8Ng72Hf5)&iCF{?)Pbmx;%yUtdqA!rH=WiT-YT+12qofw1_ewj=B(Nz(m@$QjpuVCv& z4x61H9f;FJS|7&=?>wJDhb@*YeiIgKfAo#{8*KF#WVZ7+v;@+5&Jxv7Yyh zLN)6Rj0d01IHB&+omUA5*iRL%v$C>lJQ}OTl#|dU z-c%Xg{)DZbIvx-IH*9sy`=78?s06@oH0N*a*FR~6{~KqIKhvo&|BA8Dj+Knr{wMWk zgEE*Pm`k>xz%9D{)d`4GrrGq?K>R%vqkR~mpvQ1>eZI;N{5@0T57NSdky|8XyCtn^ z10WDN_N(j*_WE=BAJnl8^?8ri7w1T>_`0H2t#i^{1?qQc4+MZo&)Bb8*(Q_sJXT>a znHX+xSrd1G+2JOg(OdkbO?~qa$KaadpAF&y_{lsZ!A3=;-DKK@;#tDG9qUduZ?lcX zV3!!MKk<$R}%(>LeUf|5Fa93TTml z;7pvE8(L0*iXY*(*y^{gaW8q6QWhNys-}FYuCd?T+V15{`yf`9ea&=r1(mFRQ1<(- zMFC4=0%SY{4l!Rj978y^Qc*gelhX}zP3Yj38l$r*16J5|AuFqOp3D1@W&H|mNeUqv zd>Xgw`fMuQd)6xtsF8l)yzev%E&ouUsl`{4$ci}|>Rw)?JLS&yKKZM6ZlrS;UET!Z3=~)IyUcdQYhng=XQTdN<)af{y|MClOj)7u;3Ikg2y@AeCDp0& zkc22Jw2Hk|*=sQQKs%;(g@Z6wI^}0S#0RM~jYL7M+9R!}PjI{GBE*m=<>&RPWZBw| zY^R9*s2A$1hMYewlD!3f9awjt8F!32HXHEVz0;+JD-;)mLU(8Iry3%))U~({Y1-hQfBCL0< 
zx8RI276%siZ^p*hV845k<;)}-Mhki!7fRL7$G9cr@$|a0E!Y8L^LT-7->EK2JbPcm z>G|?~li2;q;o&I~l3`2(rF-lH&+GG5bej&XB7~rE0IfyK_<`G-#G~6W`^2$Yr!e+2 zg?=|r(B+7u8iWW`pz9Lt$cQobfqJL$qeFjv3mR8ibe|5-a*!l6oieq@Fo?oCS>5B4 zs{_0H9^KdL=dT33UM^3Yx36BW6BE2IXSXv0mq+(f8$+2gnLOzkIvJZSfs^zg?a#F8 z9cR}Q6O|v3g)&0&x!BLErL)pDVUp7ki%-DDfAC`F-#9&@CXw8sKI+^zN<-~2!47Z> z*LsJffKe0XN%-SfBg%uJC+u5tj5ox<=SU*oi!!tRt(P$nE(DowV~gPB~DOt1gqMvZ5E-m?C#!d0aluIjkoPxGORTsjcWr)Ep`6+AYbf%1{0!t0@)xoLs@ZGq@8$xLD zPCemd23Z<|?cqH@$>UOB{`w!8V%RtrWYmJ59`3iS>tVU}V#cv`3hBnbI!d>lE#gNs zR>O3xL}H#)Th;;au}4GN@Nxgy&BHecgm&)TCA&gMw8y2ga) zqe#DeT%nGJA5z|!MzNjUeOM?$%SAgu*?gZu$Yjj-ZO2&8f8WG^>$cUUD~(`CUw!rStdrzK-f6pnQpXI$@M}dZAp_Xz=)q)1 z&47!2I4Q-V+u`@y=2AjjAd^+*im(MS1<{a}|FW8iZ2luX5IU8_KUmUMG zmeB27X>CK6`_`wbZXUx1E=~QteI0Mw)ckFSe!p?}WdPE?Gxxj4gMF9UGD>p5K)1Z| zk1f=9?McJ!P;JI@-!Qn34iy86LV9DLi9S=3HYs7~F_n{|6&I~qlVOmPL5wG9#9E#b zY#?>n1cRgN6?kZ{`wtH6%y&MbkpHliqO?Y;gp~54X~O(Izkqzkao1}d#8~pQiJQ6~ zJG9I8Rga#BDS^Bt0y$mhAOM3!!L$X0sX4;blZyBo<{hS_h=v{pEvqBJE>>TSV@ens zcTjZ_`{5VOb<-!m^)`cEo;jF#`(0}PVNpalWIKmZPu#Zngj!;#1GVX~NF8k>kytPX zR5r&uRffa2_7xQw!8fBP^sT3})kVg}K6smw6VPPF#2P(}8OYA@E@$G40;@DwXOtij z!^M$y2e=Ew5y8oRmfUz#Tz@mY#qHLkML$`l0%Qs(ql4t(AQOD@*sYAj zd+t|0V_uo|tNE%`gLP0mCqNEVZ){Pm9Be`pd?<%$$@Ln34pglo2xT{tcY zr43PLgn)X3YKX+t?F>NeLori`t+L4Bw?64JT*>4d5?0FMMs+ODb})cLKblY50izsx z>=rPO^bJ8oEDC@pndzA)6IG>#{Y$aF59BB5et1sMD{o6H%S|O)jqv z?|>Q8V7b9A7Td^->|%~!cb(WKSARK~-H#7c*-{z$K366}fCV6U$L`r|pw+rk`ZY^J zb&=G*M-7#<6)9&si;qhikus8uSpdc2B2pI|J6a$bL%3LV?4em07rbb5CcKSBjx zBuZNe-YOxA2Rw^OP#voF3J3#Ka}{d{NN#aUriO!we^@3K^z`K zSl@a5e8I2GP*G5f*wdj(M#AS(%*Ec@@5CjbaTKFLy+t`77w(07KrRJ|zA%Cg!l8HN3yL^PraFiO3JGIE<&3@7P*n#8hED~L0T2a z7X;PD9yFQOmGRn7hLJ#0y$?P~jFdbp@nf-y^g7V>!4bd@^c~ERM)*pZ0Zr4M$I!mUR=`}-m%oEpmgjcm%}U{Ur-*j zw4P}tapvuN%olBF%r)cc2QL$uv8@!o60bG#^E86I5P9}|c5iI}pRoiw`+|Dc0@6D0 z2r2IRnk_^Q5mRMh=g2Z2tnl_lFe>DtQi}REuD^=PZ~=MdqZO+X#?Q~%a9k2VUbW?r zPb6d^+4ec^7*D)nc>eCno(D4`)-BH0Sz~lfX;DyVBnAVXDg z-v(Om9Kl4NIDHEGdNV(PIPy+Lz 
zC+R`CHIOR2@T&$N$AFHo;$(GkXHA@;iag=CGFz)dv;D)2j`m;Y;3Y;3-oexPHt}&_ zY=J4DgI!z(50~S|@!D+9%A zgC)mRR1fkPw>^BVBNVGAmre@moI9cwKum;UaOtfmj^wD^;NkQFi|ScB2c?QX{EWN* z{VMlimMu<=BrkSr)-`Rn0++U-95IKo*I%Qyxh8t8Q1~c0cMQ(@rd}?3%v#$yn>Wo; z60>cPW_JcXAU(N!u^zK3G3>5!VHj{LK7S*6ZEB{HN@q<@H;)jnPIyH})F^~;s_^yx z2gib#R5bTM&M4^whE;3&PHT(i$;+Zj)))S?^qb0QeUO8VbQ*5)={rH}mc#GeU>$eW zflE@!zNdTPoE1W?95K=ZTCdWzihf)JEwt7{dJVK|b=Ti`z|X)btxuz~+X&)LqK797 zxt1~7is5SQhM_!%Lm$mrTMiUFqelvKvN78*)W_J%fGLTQ!(vS?RigqkYt-*t)t3ow zrRdg_66ISWTrAJwV`ee>~~JDcuRA6PQ` zav0t|ku{VN69$=A4ymNJkb1*ud;}|xJ&&yPK0Nh`vgN!fV*biesu#a_llrCxbLRSG zcx8cr1=Ctt*aHV*RSa%nabPI1tHyyS<*za+LC*C_a@82KIk#)z#qQ)1v#B7 z570j2=XibIK{0VO!lB6EZf8RTF(5|U(COPl5PO^+IzFkkN*Ih>$f9W!olQT0f1WGb zL^hLbNhv|zRKQt|Zf@*E*;GOipzDqrdpd&teKViAOi~{V-u$3S8{A0LwZzj{Ron4= zpd?Djh6~{LO;&M{E%2kjtkHAv$a^&&@NiA@JzkHnB|Ry@MjH(UPL|0s)+>#fnuF;8 zG7sg@J(83IA~&|cJol;zj+cukcH9X*!ekZ8lMJaQO%Q_<}{keeXhh!JNKvtvHbn7Fq)@g+N}cheJmg8w8{w zTO~TS#W-J$A9WcfK2fjse46)g5*eygtfmQRD}ww)#aWGbUeTJfV6?%5rQ;J}dvdWU zPorvW>n1_j#?de|j{>YHUb|eknqHm+=t@P}m{_s|s9kX~KIA@2*Lvz;U@M(Cqs5Ks z;kETg4e+LMh+c?IOq|&KfQw9GM-r-)3qSNk_B`Ng1|ob$LEuQVtbt?f@1#=d64nCi zq1Ar5YwA4xf$gyJt<^&h{zIqBr65inX=>*wed8iR82sqSc%DAGJFMXugA{hLw9NRi z{GJ_4i&^!`_cUj#+l3A)3g*xpT3D`txsqWkej@`K(aPY+{&U=K^ufiPNuSwY_ok4- z<@$qiWi~~XEeApc4kT)F6J{qnaTr+Xgm zpqY#c8ea_$2%k5P1tdi&<#t>k;W_IOp!`nYjv5ynTF&|Z#OAaVMO4<56(G@?n-|!Nk*XpRsEp5lc^FZSP#e5y zH-jhltd}<*bTr=mcKx_)@-?(0ka2nBxoxMJ4p3(3Ro_zwy(ZRuL@UST{5XcEf!86IwQ>oW;j=(2-<0Mzp=)6LrBxteIXL%jryss{Fd6zNfOIs z)bU81X)L~{xtd3-dLk6kcfE+pIvGaCN@@GVBTv@P;x#63)6y_p*nml-A?d=d$E)tj z1QA7(Ef2avHM?zoJ3(^;o^JUDu5Z(LxkYeexE{~v{(MoTYHV7Y60-B+yYfSZZ$7tI z-V%Yov~m;Y(dDsDb0T-&w*?zV36dHqvLnOVkE9-wpgTGmvzzO62xMe02%nVJOqh-k!DW~!-TBJ9Y#sr{GT7_*X`CV71y=f{o~2Xt zTA<+)uvUL-W6lA=VcxX3go)OK!yi1Nj-Q9&5`s2^bK6IX zkJbb{soI(NkrpzK`4vcK@;5DC=?sykIALq!O+r;;_4=k;=khqzRKmkgBo6|7%qt*R zI=2sSGaA3wyE9Uk#)OW)+ZvzfivXN);a;z7=+13&8eiBJ$PC|yj=hlX`ZE5_y6Za-K{10!zv;WZy 
zfr;rCEY!~}nf~{%0k>)07+(U;V=wbcn$6BSPB3GhhKaT$JGajfPump9RXG)6$TFZF zw$OJS=w%;T=##xtgIjHPfIg36guI&!;w^%Zy87e;Pbll4)S5oV;j_@&q@x=@GzkJO zvg;X|COy}XU#Q@wL)zRt%D?0~FA=BX5^t-LSWTs|%udZxs;4OcIM{TRq+6*eio`f= z3hKv+p7Pw_52eb6tQaGoTfdp1Jn7+%I`5A1bmM`N6zz;wF6>oKCXB>IDPVbMueIQ? z{3woh;oCK99Qt0j?R!WG-n-%caNAQkr9kjflG0Mbx|&bmW>lR5c@RlYzJjLTV^?$s zpNR^OMZ!X0w}6LytkFYwEI&DRGc*PeC;h-?9b^!QyRQT%RSt#Mc$NB0gd*`oSe$3{e-|WFzz>7^-O=bG|lnr?-IoU|F-P))880I z$e^_{ZBJA_Waq{DE&oMpaV6_x0x_Y?AT}3q+`su%s4{QfVIN*ujeqb!9q;qiv=xX( zY7}|xL3ZdONLWNqllR-O%c-*4zamg7pMEw=T}fTYl2jEk8;f_#%rpODY{VVXnU|}+ zzb%_Pq_X4dz7N8X z&0PKl$S#h7HvJG&8u{pZi$mygm9{zzZVDy9?}F?P1!cd`r-cnSAv#y+j)Q!4HqogJjz5>d{qFE{Ce41S;0~-~>N-t*~-KbEP zA1FFPWd{3$q6@F|pqmwJO0R0c0{$CgztAu%?znO=+_lg zn-AdSOX!ctO6XO412t_NM?n`KMVh@Y$4+rL9Q4%kJq?QvvYd zpy7XbwTR0&Hv=S6Y%DCSF%SaXnHm-H!IVWTIo@^OeKaRAgjDm)T>i4W_(*^|1@S>m z*|(pEOgrB@miUb>?wCt#(_J0$Mi-yoLSn^Ayk#9WPis{Yn}&a*iz~f9Y&I~v<^{Mr zJG3>68*D|zJ)XSO8En;#TyCQ7JtXZ(CNtq5#wDI2+ky2=2V^(}DfzdDNNVI8FaJny$l1U<{2vdcIfm&N3&M55= zo#la1u{T4uXsMVwyJ(H+Z5Unna@XnWtZ2?gLsc4jV{GUFML%G z5-dKjLHSV~e45lKig)zmiRtv-9h-u)v@*lZ7n}@?vPU+fT22W@3rEbwR7jpqE3t@` z46yO}LFhfk;$*A_95I9=c}sJCTB$Q`WH>4XpN9BYH6$oXtU{|?`;8&Yt^Dv3%JqLT6?-p05ivOJNmfQCh=Xd>_?~2;(4N@J`&fPX*T!hn9=OQ`j z&eI+Mpf^ANkKFeRlHxxgtw1?S$p=_`H=I%B+=;Wy4-GI!KTPPj>J^R5rPk`G|h2hfFGlc zwtWGi7F@+0W3}+DI*7cgmlte8>q*|hM*#c^^e&@ZR(32EGvQlpuFPm?unlU=E9W%a@d2|Z$5sqK#%RQU#x^6gbeDH!w{Q-=y*}{Qf zd<^d14SQF^Nn16NbK;NFU41i9tM_%-Zjw)%3vX@b5j}(ZINdm4CJ=zre)-Qmyd8u|1$|!R}}7?fc9$jFc$R5WhpDBReFr67uFp z*)q0g4M6pds5=@kI>Do$@@aG*s%~xZ4wGUZNlH)JUW)JDjnby=WJD=FaHx*ILk|;ZCQn-%P?wU6r=JEUTDV2a zwpAhP$e(2;lPVU#@Y3_XImcb^Ar}St`c*%a+9i|--$6?LM!}RrxCH11ci5!yJ=29I zWkPBXXUjt;(jyQA1ZaJmb3q>GIRp-f9z2>LD-iclhIZ&?CmU^VsM2#5o*7`Ei!+O` zMe?JNEHCXSIcq-(2X3yw&|Hly{mIRqE*o@wA7yHL45JcBdw$H4QkM5{n3`+y! 
z^}EI&6dGYMmZ=5ZMmVyz?A^R^ogqJN^D^Bm`=zi3^`Nc62cA3mLAtza9^x?`H>*Q{ zn~*FswXn=w!^kJ72;Hf(3@Bk^Yb^G-NDWIJD3kfTMNI7pg;Ux{sN-a zj^elMeUBJ;_7cKzijN*^O~Eauw`fclWGy-K0+bu8WG0fFB>3X_?K90))(?-f8Mfj7 zo80{OHctO7H$M&ZFb+L73s~K2l5Ex#xa)_e5Fkx~poTvvNQI~SDL4OE`%`Wn^iR1t z_5WLLoOvk;ZxQj9w}LZLUWEW5ZY2%!pf~y&JG>AHqJL-n7(n?ja&qyMr=M zrRrzz{&v}gRd0yk(!VLP3dMvoy1V-E0%o4Rjgr&IwdO`X{cd$}hl>2u3&U?1y-e(X zwpQ>fLRvY7Kjw`K26on-!!wEG=Xa5@mbBo1`FiAcB_o?DRafMrP*RtZLlKc3!ts%8 zJ1E4Vzstmh+zG9WF&H1y&(Tu#o@VqycaZB7^Q{SM&=j(pavJF;O~-TusnSUK##4Ki zPxIl^Mv}+AglY|}@0?Q!n|8&n`{S&-FbXpE<1j1hl869Qkn(*1(I)bLNuQc70^?$h8m|sco1bQPEoNcQ@dV zXpMiXEPsiVn(SH(=r|f*+xILljQSuqSS`B_>1G)agM`A21`O=M&1Qt#E78)ah^jrV3K(fpU8 z#GA5OxJYQmr;lOP;oGM21XT5#Ndnn=-(MyPd+3OVITIb$VGWI7)=@0dAC#0?vX<(R zehMs6ddvVRfo>&=)c^tl2`An$rHo^pfwIwJajXud!N`x@j|e}lB<2x-DSzj8#?1If z0`!05$2H?01q$=8m{ny($q1u2AWetj4u~ufufVus?i{tnn{yXViDfhEpdTXc@;;{E z=f>f-N{4vrcwRThlofokS9n;wi&lww$Wb((@_S%%+SYeo{5Re@Ogx(ugGFpor}Yvt zK6Z)JSJc#zRFH=j(zAUA+JQnB*Bqn}uJyUJ{k<7*g!3PBfpN3Y;oBM;u%)?f5y8zg zY|O&vHq2z?6vLE8oQ^Odd2}l|U>=_Y9@R?M>91{Pkv6_?Q=YOeB3@|=l9j;ao+rd_ z3cpJ#pb~l z{0QB+V1kOf^kcl-B!_vpkk39p;e)Rl9=-X5N=wR&fJY(IJ4)ZE)vCV8^!I8w{0-Ba z1f;<%3!0GhN8_duwqLI z>_riEm8+8Xob#o-9A)j8%(B&t+B5J04nPU7*6OqB2tq9VQd`=#!0&ePX`L5ts(Z-d zLV#p0YYj@o81E+Xb^V9yb6b0b4M4r}?(c!k%=m|a{eS%g0uVwMx#RFjq6(;YDwn5g zc{}$>&{E{dcB8y?>-1r`KX9!-UWHem=y2L0?cli27zRv58*4ocL+E!7MYE@OaiTaY zJ24%-H}ZS54rX^|cFRWf@sK@5cJ&uDeD*;QXs;G_V(!|kIr-^+(*8*#L@u})BJ`*G zNssi;z#jN3u#?{0PX^Et$AvS!mN9KUK`*_xOpz-pDdE?qt>dfyYh~gxU%jYad}v^= zu;9OTCPs(TQsT}f1D_g+TVLgV5GuMp1Ht0~ephVM|ECw*e@KC2{k3ft{XhGc?|r-_ z_Qa9?i8bs0(eFY(v-At7it?VcMtbwvODqYU`zXBe0iGoLv) z!W&L5fI|Z5$!ATAZ zUd@R$tPw%BIFJ77y9DA}d^Os<`42+YXU&7r?nMq}nc704wJG3=0-xBey1_DK z!<6s5^ubzz^#OTc@p|t;$-hMs_O2i-t+A@`j;!Rl;A_ccx7IDjs5r!MQ887p}|Xq_>SAwjCX1%VkOusCeN2&bIV>_MV4 zwb~^=gnkXw&eIUA-lmG$NXG-X2IZyj@q))Y|3eApg`=cy)(}OAlJpvT-0j_n`5^4? 
zukK6%u%b3XQRo4Lg zS3+Sj;*q!1@sp=kHV9GPi)UYQco2hKFNN5~X~98?<&y38j(nF~lsY~z^{y`Vi1rvk zjYsIZ6(B~RrPfL_oI4(hNa@(0dPTROuS`*atKc59jfbcB1n59^4U+iX!T-@<=Wo?W z*Ux^S(Ah%>cfngfkRuP$5Q+lgZQq0Q=e`FHe;cyp{YFP*UL*8XR-pcF@M8SlZZ07X zpI6lk4a7bTR{zjg84@Qhb?_>F*f)J;*F$+%LgKf5kIuJ!kKk&ssP(CI5V}eZtohB< zC{@ZoXOj@r;9mlK-76}q|C48rT%?D#SWTN9GD8b1~#CPu5|+64eHoNC>B z*{zqs5@+z|-)(=h=f%wMM@yEBOuryAwWG!?df%f4-tzQm%V7A6n^AxxnJ9p$Uj-F- z0_Cz5PZq5t39q#u!Yb|yoZF5&bLgzyOtjS?-F|-? zA6=_|L zlS1-SWkJ6<9k0>)KYsd>p7`w?+hrwp(@)VY;aFyH+;8Gm)L z_z9(XL;wE+O2cg>o>3?~g|H5^+HPcja?OK`@=jRZ;f+8@ss;F_SYNC(+!F2(pDH@? z>{mc&Ds&A)vdPtC>Jf2F({ON1e9vQz;1 zh+IuhKnT?>0=#gmd|x&aMz5U8nBWp@Yr@tIOrD>P1iyh~Ibw79Msnq3%p~kAvdR0H z5#w_x3UcMxL^a!?-1Dea6_BCNGilS7!+Fd76%=i;7%fIKS4t7B?-jo5p2zObGii}M zGqeLYgjxND|H;JhN3WINB)I=I7aBl~@k0c|_+N9Ogr7FtC;ylLzwP^un7PihIsDG& z(C{BVhjahxb2y|;WL??v=5v_w51+%zN5DhgBTE3)_&$-6lmtC{f%sR-)?zGQjN{4V zo#jTnpL3xl5CQG0^EnZBe@QuP@h0fxB1-5SYf?ee7uV}>Op)i@FTwVyF@+BY)X8Qg z7d+e;rx}AN7jy*Qt|a0#;+5k#o4z>SEvUQ|vG-3PwQr!Nzu!WpKQqzz7d)eKj5L6n z@J+Pi`A!!I`nAWEEM$$S>GdXFzhK77gl9^)$*QN#`Orwf+HBe(7xrOGGDu-2>B@Zw zDof^Jaleai%vG<*0}KHdJJ1Re0KDuOzUsRejAyz?{wd2#;Jg_VNmC>UV7Ce-;cLb0|pHp4bl+|9YvHe02dnUTt~Gt z)6}tBY|%)#{OeXNZtvAe>Ul;toOF(yXyB2hLngzk)+=o(SpXW-*In$O=y|@VMaU2r zVffl8zBJ;XzX^L}r#JQjLnxc~1M7zb z_^RVKjwNjPwp5OzP?1Ok;1=x~s|dPgm37f}f!;@c;1>UIE@eRxjE5yG@thOvhDUg( zT4MTz4HJ_r?frM1D0%$28dP?Czi@iEqA8tcmSdnjH`Ao+5`xCdw}0SlbE{z?x<2on z2~)=Yo8ON8k21)=@5hOW{g0}Ve?jpn$BbBmy%{i`DF$L=cE()WbK`(LK$kc%vd4MhYLq81UWqYuvuAulcY$hE<6X`2 zL9|1R?KxV75+*l+0s94C74}o2#84dVfiKfFUwH_b6ib1bJ{cr$MX}qCy(F_h){*K8 zlT6lAY-KW<9T*7iJCn3)&>dG#t#IZBReu?ri*eZEKqkQ$JE=JxGYKs{T{3@VRH3o) zfy^PNm0FLfDzQ13J){>RNl2saM_Qa|EzwO=Bq3i|EsSDpZOah`(s0Mdnp-2hLYo5- zY)x!G_VZ{C>~tS(#i$`X{5y=kd0%aIX!;ldsdzx{wV&+FM^nWNBkA>YY~szVKm^@IIZ6nG}hl9!NiWFC;sLZ{vk#8SL?R7 zlJX~?CE$$fFR#>-k`r*kd~urlNrG=Dpn7U8Vk|$+Qcqw=eww9*nY4eae40)FXiEGh zPHrB6%60r~droB@08D{ScnF5+J1%4>naFlg;$nx+?&i z`=Zh-q*%D)`S?VV+~_WK7F1FK<1D=ceV^E1;YKP^K1?Z*uGP!gYE5+)RqWvfTn%dL 
z`MVSQ4=*a2{tU9s#Q2Lv&70Em4a@e^I5m(Kk*}Jkemu4->!)2`_OZ;;DZW8S4i?NC zK;DV70DicOInXPQbS69c%8H0TNS$0hw})E{HVF8BvA}P1-vX2pfjCpHg-$YX#8p%K zWXk9R|Gm{}&W1*%lkU+E(E0P+gAW4^u;#PKDARTb$$8X1^7753lXYB{y)$Cq0}(K0 z13rBLr#DoWwks4a-5%9|R@Or!y|^QoO;0sd4MFtwuEk9^oOnwS?Tpg4a7!;0VFFA; zTGhvHPw${(lm@;+@9m`qaGNqdy0R!9yMdcccV14p`^f6(qg$7|@as@x_-gs{FgLO; z(A;liAwfWPj>VAntOFD5ll8d^f~czw9{cjkIo0oQ;Di$M;`pgVZpvgh>A+0z~Q@O^7ev2Z33n!^$v2*RHDBLtXz$0v8rYXgCS zbZE9=qY$lWlP6{PWkr;kp5FUx{a~ z2^rC-g$pwgW1wR&Cr1!Ktz`h)Oct6_NL{2dK?P7iwga8$KpJ-Wxm*U6T>=>Y9u>?C|LFr4hCh13u(JJ%rH_~Vse$;*_|Z0i$7h3!n_(*S5Onl} z9HmL?RJn~F>RepDt&U9EWDv_2n3V+tIZl+rHB}Ia012GT;Ay08?x0;L6Nkc2je?Ov9m{aJR%nvutsXMz4IbW z1{_?rA3X>>>4O%}{5y}bmO#cg*<>CL<2cCmNPN6J69-Bj1WuaxmpyPC<8(u+n>eLv zEi%ir)zfm6gd1c9{aMuVmE(`|++*R)E<18f0YWt48C6EiP9L;IMDa^G#uL^pl$p*v ziAr2fDtN0H3Y?z9zjr3TcH=hmKpHN6TmWt8X}|Md&Q>HuSb^WajXhkl_R=HBj}})V zD%?EvW94Jbjk7AvMf+TkB&+_U9KUgvm+z1*=$2bk97i-->TB~!jzW?HC{RvJ8GPsJ zCI7qQV_Y>o8thf!sLx5^kzI>^)s>GFfZd(ynz$2`3apz^AN|=LOp&aeR;qsm9)UXO z?1a$|^62Ba9Tw9%Lk8w!R?f$Kr&cfG8;V1pWNJ{Fl`}IXhj3%I=Z(C( zDrc)%8ruFA<;B;>-E>~%Oez22qT8xgvhXGucxHAv*^;c(Mte*MACtqKd2%RX6X{YS z^oql{OAejJFJ`Tx;%tF*$FQv?D@>ke;&%++!NZdoG6ONv!eL6~-OB9(kOK9pfaBi5 z@Yq;fTz$5=-l@WDRi0BGYLww*0ZazXjL5-%ooJ&z%riJpqe-T@RhOiSLUhJU$h%*_ z`J#U>jl_Bf_nnrO)l$j~JQ4jyx{~TDdw`OaCd-=i?(%-4iIeGjo7; z(>i)s8Z8qXS??Tp;wt_yHzfS`vvY#HeeU1*&@lfQV4m?`DLwzA;{4gQt^G;q0egF` z6aGWRne|^PPAL$Mf2ueY-2rcKKiz+U`)%2R9Z9VyEso{+M?s&?acs>_Zv-j;UBvvA z)7XyOJ&-SA|59=C7XGE;L?F+DiZ8OPWY0@obG(~Gd872S*S%4CR>^lW?sP!AtM+^9 z!q3t7vf!!PD2212N3@z zr_ukX=d?d_fnfRdhgMelZ|FK7kYyD~M0?a1 zAm#T+d^}3ijf-0~RzM>NV3W{t%WgbspvnW0$N}*U3JN_LF#-^A5=#U%5`S#%+7?4y z`oTGU{n`Agw6O~})+>Mm*()Cgkzxrk^+WxJ-JD)~MnLhECfq(53y|H|N8L3?XdLPa z!L(e^w>(BTyRztXlPVtr_Nw4aTWo6f9oJJUIqH0-)wi_#ZMcp|4bNLn4!I}Ys$zM zYd7YUJ@$T^=Lwy*fsLczfKqr1B6o(qJAZ#EX2jf!gW#d z;Y^_J>4N|Fs@ zL%u`Y?ul~*+qD(xzXt^q#~;nw{@&{Q?}z6b7}T&X8Ud5m45tvzA=3e{pc^<|7+8)kIWxq>N(7BV@Jp}4`TTyUZgs2jz|Es7UELS zr$QXUElv+Lfk&TZl+(;%F)71cL>-$P+gX_u3=%r8v`k>?*wywOw8hz*I7@l#Ue 
z!K-M=m1-=IbBE83$CR>J(VfEc04wGXGa(w>NI!WpH4v|gcMQ7ue{i*y{O>B)1pZgd zST2G8qLTg3W$&0BadjWpBSZ!sb#Om(dR9+{F#w3oSR%5 zefgQMocT{4wmb}sx|eSN<7^&xTl>{Spgr}ttN>}PMh|EsV@_=7jjH4$#fwX2DJm@v zZ^aQ}sQ?XQt5xc^8&$S?XhloEz|JstDI?S=k_0L%3*-`!(KmR>aOaAk-z03mi5E&S z>qmq-5T!>%jS<=roxEw9JH<9MF1Q249qh0yQ{fMtOS;dO_kRU^zI(g>7wW}{o}AMv zFJ{+S&7WVNtdslbj0tN=ofJii4f6K^3edO;?^cX|^41HaR4Hj>i&+E|u!!kN(&f;j zuI~4?nKzA}^5VD7P&gmTuu`>cs*ctIggY7n7lOg+F zWk!JqqMKI1?5s~|QQhwxBW37ds9?PukJGy{Ey(%MK1NxE)}Mg;zdNEJfscRoG{ zWbCQLO&ds`f{cfsAQ`nS{OnD>Aj-8;5NNrd!JcgtafR6Ng*Jjf8GV0FicnG@cAuy; zgSs?zmcEWc8Y$*(OH3#UEm}&;l#-dLWRVOGH*^SMFBUi0&tL6diwSskR4v5(5d|^4 z?F*Fws_>>DC)F4crZc1J0f|s_;SZzW2FQt80d!% z1pjepfbEa4%(a1)Nxj0_!zwzfFF)Kspp+3%^`Vup)| zE+4amZ_vE(OmiUm)pnx--8W3fWs?``9VIRUTiunBTepnCEyj1RC4 zzH!fuNJmH^DulMwqJ+b72QDl=!r`k4O?(!0!K5gn&K$r=P~Mzd!H1D0`S6deXEVaP zi9i0J;^NBInk7DRy(d80^k=#P{u{XlLSb-g9q}{L zKOnXXQR7G~RmGqP#OSX|I*ESNfQOHNjf*r)LdZ~w%c!z#d4@XS@f!7UZr1itC|&m* z_?0;7t;u%gAn|M=^G)*4Sj`s!4X#esOO@yr1QZ!*`z3E;NSXrY+%GPnT=3z_)#tKtQMIu9xLi4uXP^vn!ROlC7U<;S6sPI&Cwzbx1JVsLO zXKef!oP)1LiB`65>>yYXd2RTdy5w3IAGp#y4y3VVw{W%ikLChTL!Dfsdu0V78yNlJ z{^X_aqJvJ2T2{SsYHnK3?m|D0Bt=?|WwLYUb#+ zzwg8`s7ijj{Q7cGF6B9{w~SAb)O!q_fqBdWT<1}0HxmHg^QLrZSGH0ct$ySCUT0VK zXW{!t4}5uxhc6x1s$KFs{mZB%&Dp&=yXO=TUqU@9)7yAFheDc_e8!c50D8`h9{U2! 
zO81mG!wT8BMwe=LjS9%mC-?lXyOZFr9(cO+$;A!XG5CHU6J1PbC zut=Gej-Ql+HK|iMsC+?xU!9%D(6!z3nrCUP$>t|uDg7rhssmIHYcxC46tKPx>7?)E zU*gNNYe?ba_RA|UX$lCKguvZ5*L!7gIQC9d4p`%@k3*j^K4=_Z8t@c^L`wgS+suz*SA@U6~v(U?0JC8Cu*}8&sEaHmi z(^Z|9cNLG%gFp@JJtf5^cGQ}+4(kmZ--Cumtg1p8+l7%g3RLyS3#9bxr$9)}k0zh} zD^;Ao${_sZ}eio;NJcnw`hm3CC-w z8r}ztnOZ5gx;3?@DB=pte^ge#gpgL2qtdPep3k7m=xX(G+;Oz4Q;k;6vMdog=ZW0m zMZTmzs6qbDcDu929Q^B1g`4w#{#zw&r&hER+1E6`0VDQP&1~#9@a%h;8USIfhI51%?R~_QqjZ z{A7w@5BL4y3Fi6E^ruWvCPi6h-{hcN26Z`Y-IYk0lw1=*BorSiSZ)(C{dS zKAjbOAI6R8dL@Ofb48?jlDtbvj88qvRi85 zv7?!xsq58-DW-`dF{yP30H}7R$hD|)tiIYIyO>Dv?%pE3zXc;=U>E$F|4^WC9QrIt zP5zc%+R5%YZ20@~quDCd&AETN4~Jyv(>b&ijgL3Re)g3!{-jWVAp(I^4TtkI0*2Ej z8cOdWX*441?*_#PD{sZ#UK;PQ=^i-mtmKG<^IQQ6LhrNi$tkO`zZEBYwtgsO-*0D+ zhbE%NVh3(&oX}A9>#asgb`+j4rhn8Zc4T#SwJz&B+{8cr&EsuZ%yU|Ea-5p7`fJo8 z*0F48&$qwdKn`0-(BUsoQC-vg-iwJVtb8q}ocU>=RayRr7dX`HpbPxle28lSF4dxT z$`W_UcaYBc)4U9jnl&$?pXU9NROwJ(r~}4*kFvq51D9fJIrc7i=jBP*59bw1M3LOYLM) z7ssL2I90EAFWawA8rFc8r?QM%8np6Qk0uIVw42Fm);j$K{2DqRo&&+AV3zUY{q?}* zaYLiItPHmI%*2vgy|s2dxL>&3zCzDs>vrkB_`md3%qI|Zj~fOo={vBsbWfze{c4>S z#S^oGQDlkp*wBDeqzG&3@(p^~yab7g>a)A-BxAJPcFGh6Lc1>D>m#?HOE6*t$Kv?psO>?~(Zy{$0vBd3f9cD?3RjNL7g^KQ&-X~R_9ay;$tMx6vX zt{+V}KLUIG7q`v^*n+_N3vbJm(c_>5f*Th5E19>}`^L;poBQK+*6KBVeF*E@#a5ZZ z%ft4-#PR&?*7okxgT{*s43}R6#Q)JKpzci+&^F{>Kjpj8h=B$hNz zRV5e_@O7n0isjI;Q1Nt)TVcx!nEKonkBHrkeMpU{`$0vIV$#GYse@eMgFb4iJs~&0 z;pzuhU$V|Koc_-T25diAd}+lm64Z9Yp&zqoLZnBrY|>W7AGR%d*G8TPEn6^jFDy1P z-Pv=GtSSOB#1jRE@i5Z?T8{={`lpMb_2!0p@V`()9k}~fn*n*^wdgxsh#<{J)KD~l z(J792SBJTJ!USMRVAu-t7w?`F*^^VGDG`YK>X|$K7853=_jYq06*;!o)0Cx^zdnKKexxo#BY@fT26PCHtD<@6i~vZshc zZ^wN@lFIYLcfNL~^cTFvx6_jRugq-Cstc~NA0iIE4L}c$u{6!VkhVhlymG&v*1{)Ne+K`M34p@ z4W5%owuO`_tR?K&Ioi|h)`n`ZoUcePTofUtK4xp=a?3e>Ia+qhIqs%>s8vM2#8R)1 zP5$P_53z{QBFQaF2e`HSJ>QoULX&{-@ITLk926fT73yrj#_KnWq6$7iwO{^pyEek6 z9L8&IT+W#9!LptySYokzII&Nk*?Vy#5Dfk%foVO*-zydohIY{Uycc_0Pnw`2xOM#v zB^Bo7!Q3|LaS6tw{4jd6%S&dgOZKyE8v5CwS$gXDX$h?0wW@^IcBxB(qv!KAM%fv7 
z?A`*?*~nTn;JcFwr?(1A4{&0`qS07%S_3WOW=Yr>)DGHGG+M*wa%!NuY_9bKMm#7p zG15&K#cM>?F*5wcb8J%Q&{s?pV6YZ(S^t552ZZ)(^jM(Qx(K7z`mqCQI!Vn0d;prM9_(jv5%)NFa=Fs;9H(DPBh|`Lw{!lr8>QZSa9p_%T%?4M|sTqO<@j21wp31@Hb#l%+ z68nUV&N{&T*14jgJM2JN4Z4FR_Txwv^`Xekjf9S42N?B6J)*OlVcz3u#{{~K99e;- zBGFI|QhN7{2~8v>6oxaDuZ@w7XR&_WIM)k9v%? zV&OX3^phCYFXCO__rLx6A^#-vULex@GhHRU z4I$BpquaG(u4oe3rYRB^hk3~ErCOU3X^O(JKfo`-LO5=3UXX9$Gu>Yo8!d0qKZb$h zE#w!uCDdN)2B&^ADz6V;>thQ3Ha(<$tUQ=zIGvNf(bh;Rq|*YVw8#uL;!-=^RkD*? zW*iG-m~@o1f^-zaN7NiZWx9CcQLMP9ONfEN-%v^+w|^2=wVWAYS^{GTWDyk;T7@Xj zsVfw22sK>l26{t~x68AbEE9CDsfk3BXO_J%Op*pJ9Lch3nQ+vyj}FxbMsw{sE#Q89 zz9utiu)=uM$+Q+J;hf`d|88B4BUwx^G-vnJbs7*~#PVghK1FUZ(=d1 z`+0G1&(zNc5{~zsAwR1EgFs8~<@T1POxoLfQN)w4n(g%#4|A{LJ8RIA;R<;7a{g$- zsRcYyrls}P^}fBm+)YGHOzdjBWX#MOS&2Cuje$=loPsigT3;9UY8v?Rf*y9a_h$EO zQLZj`A+ueNR=H_xgIzZ&FF#@YA8bps=(C@96FYgF2HLssnK-6;h4@{Z1~R!)LeGBN z^n&j-vqc)dGNd(}5Kei}zvnc^1=H@T#i-xsoFlu0@RxgXR8TB|ynFiakC|z$O@QvY zIj^hqvHU|6f$2_MrqIEs!Avg2Yzcvs;q0U<$dvANC99YtVRmcZIPF$HIqgxk49-H% z0B*ZU8`F_g&?xidTc9IBBD>C&!i}NEMLFGK{3{gvmJ7x@dJ5d?#n>lE1uPC01q|z^ zl$7}g)(=3bY?zEhO3LYXN-tJYD|ALJB_+yiKgaNAxfcl6%)UEr0!El{Wp}mWTOIik zYkA?iOJJNfnvk*f>J$1{*C_*TM5WbWtM2r&u zQ%H8hJa-L6GVe;sYNkgfnlor^0-sg&^%y_Nsg^D>p$?b!L!A2Ty-){$?@KG>aGa{8yR{==X;YAT@Z zQ=uJgZI=G$Qwx|UK5*vXKMQ3gPlgibeP+kZRhBa^t%MR}$9<2|(2S2dfnXmM>mDd@C zD?g9q6qe9w0Ve5g(iz;1q$d}3V5E(YC<{R%?>q6q^g%7aJuN*zfv=K${VI8aY-)@I z^5(vDs@|nx`Z4jAB2Ax1u5VPA?sZ9nG`VXA>sxLz^YL0Roty2G^UG)tTATI^%L88d zF%GyED$T)^I6c3tRnGr${_%B?hnM-suOm5q-U>3R$OGCnNkh7pi2!TLmxV>i6ob@V zVqHJyktDiol2-E(oLmIwkrX=N1Y?2h>Bdk``{W<*%M&)K&3%X}*?^kG$_*h?yuMxH9%U`%jGhm>E zGrF=a3=C7SmJv^`_(e>0V?U(2Oy#Wxq%2Tuw#1S82=-)#8 zKc;)4C8@Qn26)Gy|(DA>%^rV zKr~e!dTXA--B+m?RbA)hIAABjSO+dfHXcdtZW#>S_4&Cj2*7a5W4!K|xt(kQ3)IBf zB-9SkChT7G9QEe=yZy9>(!tB(y)~z(QqXg24E~<1J}(!J+cg+CiAe>SCQ1HclyL1u z_4rI)a~DWd-@Vs`{!GPH>DTKbk$q1w zR^6ErVq3`&nLQ0FlST1@uw;Oj{t zKFTW6P6)X5897wyI*P++7rk!DP)LBT1~6%-6g-iF83h7Z7ovUx-%b`*rP4 zvO>l-v_?~Mwce7(p%)QhO|*9lDHGJ1@F2?Y`WYjNR8+X|$=CTpu0W52)*b8u{&%2p 
zhDuv{ztSRhYm3qph;AHGx=~RD2&-i52-fB!a-d!F+x_#tk3Uwl?c zFRFI?Dxn1J!E<(Ax9;*&tv6_i=#+JWr(Uk{yydC>haZi>G%jFAOYIt}YDoy0=V-gZ z;rUVPQE+;LqB{Uwx;Xte3(MeRR9udU0GY9t;OTAEmnYtmOh_~=!4DR@b!ImrZ=B7; zzf8qQeao*k13c=xGC`XicIi|4k}?i!P%w&h%Ted{nwndSP44t>BNiEztiok?co(^QB;T*E8p(3LFZa zMH#o+z1nW3r3>|l?24R8tRcCC_JUPLq>m~OMW&-YJ7gyPt#&(iQls1fDSqxk;v(zd zCneN0Xz3jv7{QmYg11f~LhSuYM55THsKgh_$^>OBp7cPtf}tN?U&tNez^oJ14mUq< z$t1gv+I?veoMmF#x5H6L`3$B-+I;GD_dgl#$KVny%E=Iq`N#*BzFha8CN4&vL^Jq! zD}jJ-n4&Z=Wqn?bNLP6M>x(SnPHwUt<_Yt@=)5BFKDDVONJ?oD;boQ~{L=FUNA1lr zPb_2P&|Zb>VM1jVP9XEjt~M_vKI^i%jpWi~sAhBt=w8i#^=eoo z;LECD;Z%5O{MGsDe-&8NU;FCu)R%QK;OQ19`XsK5-|9a14qV_d;GXDX6@0r8s)$Ys z%h04A-f{ur>+n_&Q=kmu`6%Z)AP+-J0I_i0xI$Jk6bC>%`?Mi$PU9P^#Cp!Bp``p? zy=2RoEnC6IBYG}Up4(OY6i?@x62n~~X^5_$N@bbr`##a2FYD!FCmb^>fB7_-XsENp z#L{QkS{cD&B~lMj-9@cW`?Slf-_;|qA^tVCvuA2J7g+ho5lZhLg;=q>+T#Vreb(+*$^P)*9o!|a|(n^8V0nL^TpnpEeCe^ z-d*knups`(>?HTDWCy>zEg?L-652)@+crKrvKcBe7`J!!Kf+uULu3l#Vkm0=>7b`S}w7E z@{sJd{LPv~!A7a0&ML?4L39@Y!<~3jf~_FFz$9}qg+DJ#lM`NEkO+CBUAS%z9%nma z-)gG|Z^L8^`g>t?j41z_H+L!;Wiz(X-Of4Us3_d*`3^2Byn9;&lT6|hqClzNU2G3cOcnu7>t};OahE79$7d{(#RPw>#HSh zUJ}C5__%KK_4fS3mlU3bma$&=ZPxC6cj4}RHvh~ZFXPvl>3grUEvI|6!R{znz;FH(BK zd>jJoCYBt$`~nv2ru=+n0_G;BmfR-h=9Zl1?40bDqJaN*hymLdM{^5rJ$3*$C;vYk zODDiTNl^U{6q|#SmzPb|$;lnS_wVo3Yv#`zgusxa)5+bVUr6FG+_^`g>( z(eTofMn)9}6*Idp`!AXMd5ix0qIbX6{ia0r{d3%^E60yDbKMLloBg<#r5Dl^EWR_d zRgA^dXI^Kh)F)R`FpceT>cfxVbAIAYzcCM3mV%cvI;86?$%sp2vB-~0=Xp0gqO2H( z=Tz5Bt1`FK(Hk5Vtp5?0Cd#QO{lo1wxKWm}4Y)Css-JlaI z%;d!`bS6Bdv!~CpKmNEnXt-EsM10&%+754W2K5WAy~I=#+Eczy@pb7}JoAF>*B?6h z^JK=i-p8T{e&!}qjJ`#Do|z$;L&EJKV^*;>0k;PX7Yz?)gRdPfCNq*7nbA z+g-FuCPkFl7ov(aQEF$#f9nW8#_9}l)S{Y|n(U*0pGJGhHGcCUd?R_P{=KBIbi(<1 zM(uePD=v3kDMDw{Qjk3cj5LWQsge&j*t16^j@sl{aw3KQyBvhzsT2NF%#?USoZfQlvG(8D(2as& zctj;M*j-WLDWQ3-S>rX=&|h(%GU_J*z~i@={S69V!vx94ZJ8WvYg%-nFqpfI^KbmJ z7kND7R#qF?A@@fZ{0&0!8qc5YCcm$9|6W_|TdBu9E7=GD9*S%EOXX@CyFD=RwnvMP zv1@z1XfNNGLyJP<-h>CO`X@%+47H>+_24wTN@T4;nrxi?jGs-?x-%dA1r0MJGCbPk 
zYBn`&XAO#Y4U$ezAII(ku&YT!$k7IQMllGHVU#J1Pp_oKG1rK$VIv1$y|%K@{?FQ-qcSm z{}OvKE}Ry8AI{n&duzujsX^kMqLwzbmoFV6`a(#D5m?WQy16|I=CQ!j{#ty(0b-U% z;lYmBI#4UN1(2b05<)3@Tj;n$)0X{hx09C@macqs?~;`-LG*Z!>93(ykhU|qrY_>Y_$4PS3!?Ph>gRm)ps=l*fN`oP>>}MxLzfdAUCbvKP^sl+D*|a3F*mFxTSN z)&JFLlj-_{+#f!gO|s^a6m3<4xNVk&_~$xv*FIHGGti|~-H~8I>#q_}>fNwN#*+Z6 zZ~r)nV6#Fowu36{HaYXa?s97(t0GcH_qKWcH0@n4gFk^ii1VsPWp#+9wbR`!OB$o@ z@ES8q0DD%+EK*SVBV=#^q`J@<@>if~*LFf*Fqoxulv(NBVGGV6#*x8(g?$kaP zJ1IZV;Nq=l)Ho}+_-f_7K}a?ebaz` user message: + +- `claudeMd` +- `currentDate` + +这非常关键: + +**CLAUDE.md 不是并入 system prompt,而是作为一条额外 user meta message 插入到 messages 最前面。** + +### 2.4 messages 历史 + +来源: + +- [src/query.ts](/abs/path/E:/claude-code/src/query.ts:687) +- [src/query.ts](/abs/path/E:/claude-code/src/query.ts:954) + +进入 API 前,messages 还会经历: + +- compact boundary 截断 +- tool result budget 替换 +- history snip +- microcompact +- autocompact +- context collapse 投影(当前仓库里实际是 stub) + +即便做了这些,保留下来的历史、工具调用、工具结果、附件引用,仍然会进入 request。 + +### 2.5 tools schema + +来源: + +- [src/utils/api.ts](/abs/path/E:/claude-code/src/utils/api.ts:119) +- [src/services/api/claude.ts](/abs/path/E:/claude-code/src/services/api/claude.ts:1250) +- [src/utils/toolSchemaCache.ts](/abs/path/E:/claude-code/src/utils/toolSchemaCache.ts:1) + +这是 input token 偏高的另一大来源。 + +注意: + +- 模型看到的不只是工具名 +- 还包括每个 tool 的: + - name + - description + - input schema + - 某些 beta/cache 字段 +- MCP tools 也会一起算进去 + +所以“工具很多”时,即使用户问题很短,input token 也会很高。 + +## 3. 
为什么现在 input token 会这么高 + +不是单点问题,而是多层叠加: + +### 3.1 主 system prompt 本身就很长 + +`prompts.ts` 的静态主提示词已经很重,尤其包含: + +- 行为规范 +- 安全规范 +- 工具使用规范 +- 输出风格规范 +- 交互规范 + +这些段落本身就是长期常驻成本。 + +### 3.2 CLAUDE.md 会被额外再塞进 messages + +当前逻辑不是“让模型去引用一份外部规则文件”,而是把内容直接注入到 request。 + +而且它走的是: + +- `userContext.claudeMd` +- `prependUserContext(...)` +- 生成 `<system-reminder>` user message + +这意味着只要 `CLAUDE.md` 大,**每轮首部都会额外多一大段 message 内容**。 + +### 3.3 历史消息远比表面看到的多 + +用户肉眼看到的是“对话”,模型收到的是: + +- 经过若干压缩后仍保留的 assistant/user 历史 +- tool_use blocks +- tool_result blocks +- attachment messages +- 可能的 memory / invoked_skills / nested_memory 等附件 + +所以“我只问了一句话,为什么 input token 这么高”通常是错觉。 + +真实情况是: + +**本轮用户输入只占很小一部分,历史与系统层常常才是大头。** + +### 3.4 tool schemas 非常贵 + +从 [src/utils/analyzeContext.ts](/abs/path/E:/claude-code/src/utils/analyzeContext.ts:363) 可以看出,仓库作者本身就把 tools 单独当成一大类 context 成本去算。 + +这说明工具 schema 在设计上就被视为主要 token 消耗项,而不是边角料。 + +### 3.5 “重叠内容”不会自动去重 + +这点是最容易误解的。 + +即使 cc 源码提示词 PDF 中有很多内容和当前系统提示词“语义上重叠”,只要最终发送到 API 的字节串里: + +- 出现在不同 section +- 出现在不同 role(system vs user) +- 换了措辞 +- 换了顺序 +- 包在不同 wrapper 里 + +它们都仍然会计入 input token。 + +模型不会因为“这两段意思差不多”就免费去重。 + +## 4. 
为什么不能直接“复用 cc 源码提示词里的重叠部分” + +这里要把“逻辑复用”和“token 计费复用”分开。 + +### 4.1 逻辑上可以参考,但 token 上不会自动复用 + +如果你的意思是: + +- “能不能发现 cc PDF 里已有同类规则,就不要重复发了” + +那答案是: + +**只有在本地组装 request 时主动删掉一份,才会减少 token。** + +否则只要两份内容都进入 request,哪怕高度重叠,token 还是照算。 + +### 4.2 当前实现里 system prompt 和 userContext 走的是两条不同通道 + +源码上已经分开: + +- system prompt 主体:`getSystemPrompt(...)` +- systemContext:`appendSystemContext(...)` +- userContext:`prependUserContext(...)` + +对应代码: + +- [src/utils/queryContext.ts](/abs/path/E:/claude-code/src/utils/queryContext.ts:44) +- [src/query.ts](/abs/path/E:/claude-code/src/query.ts:831) +- [src/query.ts](/abs/path/E:/claude-code/src/query.ts:1084) + +这意味着即使内容重叠,只要一个在 system,一个在 prepended user meta message,当前实现也不会自动做 cross-channel dedupe。 + +### 4.3 prompt cache 也不是“语义缓存” + +从: + +- [src/constants/prompts.ts](/abs/path/E:/claude-code/src/constants/prompts.ts:109) +- [src/utils/toolSchemaCache.ts](/abs/path/E:/claude-code/src/utils/toolSchemaCache.ts:1) + +可以看出,这套优化更多是: + +- 稳定 prefix 字节 +- 减少 cache bust +- 让相同前缀可复用 + +它依赖的是 **稳定字节序列**,不是“意思差不多”。 + +因此: + +- 如果两段内容只是语义重叠,但文本不同,不会合并 +- 如果本来相同,但位置/顺序/包裹结构变了,也可能失去 cache 价值 + +## 5. “cc 源码提示词 PDF” 和当前实现的关系 + +我已经确认该 PDF 在当前环境下没有现成文本提取器,低层扫描也没有稳定抽出正文,所以这里不能负责任地给出“逐段一一对应”的精确对照表。 + +但按当前源码可以确定: + +- 当前请求确实不是“只发一份系统提示词” +- 而是“系统提示词主体 + systemContext + prepended userContext + 历史消息 + tools schema + 附件/工具结果” + +所以即使 PDF 与 `prompts.ts` 大量重叠,也仍然无法直接推出“那应该天然省 token”。 + +因为真正计费对象是 **最终序列化 request**,不是“源码里有哪些文字看起来像重复”。 + +## 6. 
应该怎么优化 + +### 6.1 第一优先级:先看账单,不要凭感觉删 + +现在建议先观察新的 harness 字段: + +- `system_prompt_chars_by_section` +- `system_context_value_chars_by_key` +- `user_context_value_chars_by_key` +- `claude_md_chars` +- `prepended_context_message_chars` +- `base_messages_chars_total` + +这样可以先确认: + +- 是 `CLAUDE.md` 太大 +- 还是 system prompt 主体太长 +- 还是 message history 太长 +- 还是 tools 太多 + +### 6.2 第二优先级:避免同一规则在两条通道重复注入 + +最值得先查的是: + +- `prompts.ts` 已经表达过的规则 +- `CLAUDE.md` 又重复表达了一遍 + +典型重复包括: + +- 输出风格 +- 工具使用方式 +- 代码修改原则 +- 风险操作确认原则 + +如果一条规则已经是全局系统规则,就不要再让项目 `CLAUDE.md` 重复写成长段版本。 + +### 6.3 第三优先级:压缩 CLAUDE.md + +当前 `CLAUDE.md` 直接进入 userContext,是非常昂贵的。 + +适合优化为: + +- 保留真正项目特有的内容 +- 删除已经被全局 system prompt 覆盖的通用行为规范 +- 删除冗长解释,改成短规则 +- 把 rarely-needed 的长篇说明拆出,只在必要时作为附件或技能加载 + +### 6.4 第四优先级:缩小常驻 tools 集 + +若工具很多,tools schema 会很重。 + +可考虑: + +- 更积极地 defer 不常用工具 +- 减少默认常驻 MCP tools +- 缩短工具 description +- 收紧 schema 中冗长字段说明 + +### 6.5 第五优先级:把“长期不变的大块”稳定下来 + +想吃到 prompt cache 红利,需要让 prefix 尽量稳定: + +- 不要每轮改变 section 顺序 +- 不要让动态字段混进静态大段 +- 不要让会频繁变化的说明插在高价值 prefix 前面 + +这个方向当前仓库其实已经在做,只是还可以更激进。 + +## 7. 我建议的具体动作 + +### 立刻可做 + +1. 先用新增的 prompt 分段埋点跑几轮真实请求 +2. 看 `.observability/events-*.jsonl` 里 `prompt.build.completed` +3. 确认前 3 大成本来源 +4. 优先删掉 `CLAUDE.md` 中与全局 prompt 明显重复的规则 + +### 下一步值得实现 + +1. 在 harness 中新增 `tool_schema.*` 专项事件 +2. 把 `analyzeContext.ts` 的分类能力接到 harness 日志里 +3. 在 `prompt.snapshot.stored` 旁边追加 `prompt.composition.snapshot` +4. 加一个“重复规则检测”脚本,对 `prompts.ts` 和 `CLAUDE.md` 做近似重复扫描 + +## 8. 
最短答案 + +如果只要一句话: + +**当前 input token 高,不是因为“用户这句话太长”,而是因为请求里长期常驻了很重的 system prompt、CLAUDE.md 注入、历史消息和工具 schema;语义上重叠的内容不会自动去重,只有在本地组装 request 时主动删除其中一份,token 才会真的下降。** diff --git "a/ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" "b/ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" new file mode 100644 index 0000000000..a580375a53 --- /dev/null +++ "b/ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" @@ -0,0 +1,340 @@ +# 统一埋点日志阅读教学 + +本文面向这次任务新增的本地可观测日志,目标是让你能从 `.observability/` 目录里快速回答三个问题: + +1. 这次用户提交到底发生了什么 +2. 主线程在第几轮进入了什么状态 +3. 子 agent、工具调用、stop hooks、恢复链分别在哪一步介入 + +--- + +## 1. 日志放在哪里 + +主事件流: + +```text +.observability/events-YYYYMMDD.jsonl +``` + +大对象快照: + +```text +.observability/snapshots/*.json +``` + +阅读顺序建议永远是: + +1. 先看 `events-YYYYMMDD.jsonl` +2. 发现 `snapshot_ref` +3. 再打开对应快照 + +不要一开始就直接翻快照。主事件是索引,快照是证据。 + +--- + +## 2. 一条事件怎么看 + +每条 JSONL 事件都至少有这些字段: + +```json +{ + "schema_version": "2026-04-19", + "ts_wall": "2026-04-19T10:23:45.123Z", + "ts_mono_ms": 123456, + "level": "info", + "event": "messages.microcompact.applied", + "component": "query_loop", + "session_id": "...", + "conversation_id": "...", + "query_id": "...", + "turn_id": "turn-2", + "loop_iter": 2, + "subagent_id": null, + "subagent_type": null, + "query_source": "sdk", + "request_id": null, + "tool_call_id": null, + "payload": { "...": "..." } +} +``` + +阅读重点: + +- `event`:发生了什么 +- `component`:谁发的 +- `query_id`:属于哪条 query 链 +- `turn_id` / `loop_iter`:属于第几轮 +- `subagent_id` / `subagent_type`:是不是子 agent +- `tool_call_id`:是不是某次工具调用 +- `payload`:这条事件的业务细节 + +--- + +## 3. 最常见的阅读路径 + +### 3.1 看一次完整用户提交 + +先搜: + +```powershell +Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"submit.attempted"|"event":"input.process.completed"|"event":"query.started"|"event":"query.terminated"' +``` + +理想链路应是: + +1. `submit.attempted` +2. 
`input.process.started` +3. `input.process.completed` +4. `query.started` +5. `turn.started` +6. `messages.*` +7. `prompt.build.*` +8. `api.request.started` +9. `api.stream.*` +10. `tool.*` 或直接 `query.terminated` + +如果在 `submit.attempted` 后直接出现 `submit.blocked`,说明没有进入模型查询。 + +### 3.2 看某一轮为什么继续下一轮 + +先锁定同一个 `query_id`,再按 `loop_iter` 看: + +```powershell +Select-String -Path .\.observability\events-*.jsonl -Pattern '"query_id":"<你的query_id>"' +``` + +重点看: + +- `turn.started` +- `messages.preprocess.completed` +- `assistant.tool_use.detected` +- `tool.execution.mode.selected` +- `token_budget.decision` +- `query.terminated` + +如果本轮没结束而是继续,通常会看到: + +- 有 `assistant.tool_use.detected` +- 随后出现工具执行事件 +- 然后进入下一轮 `turn.started` + +### 3.3 看 Prompt 是否被压缩过 + +看这一组事件: + +- `messages.compact_boundary.applied` +- `messages.tool_result_budget.applied` +- `messages.history_snip.applied` +- `messages.microcompact.applied` +- `messages.context_collapse.applied` +- `messages.autoconpact.checked` +- `messages.autoconpact.completed` +- `messages.preprocess.completed` + +阅读要点: + +- `estimated_tokens_before` +- `estimated_tokens_after` +- `tokens_saved` +- `snapshot_before_ref` +- `snapshot_after_ref` + +如果你想知道“到底删了什么”,不要猜,直接打开 before/after snapshot 对比。 + +### 3.4 看工具调用 + +搜: + +```powershell +Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"assistant.tool_use.detected"|"event":"tool.execution.started"|"event":"tool.execution.completed"|"event":"tool.execution.failed"' +``` + +建议按 `tool_call_id` 串起来看。 + +阅读顺序: + +1. `assistant.tool_use.detected` +2. `tool.enqueued` +3. `tool.execution.started` +4. 
`tool.execution.completed` 或 `tool.execution.failed` + +如果同时存在多个工具,先用 `tool.execution.mode.selected` 判断是: + +- `streaming` +- `runTools` + +以及是: + +- `parallel` +- `serial` + +### 3.5 看 stop hooks + +搜: + +```powershell +Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"stop_hooks.started"|"event":"stop_hooks.completed"' +``` + +重点看: + +- `hook_count` +- `blocking_error_count` +- `prevent_continuation` +- `duration_ms` + +如果 `prevent_continuation=true`,这轮虽然模型没再调工具,但不是“自然完成”,而是被 hook 拦下了。 + +### 3.6 看子 agent + +搜: + +```powershell +Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"subagent.spawn.requested"|"event":"subagent.spawned"|"event":"subagent.message.received"|"event":"subagent.completed"' +``` + +阅读方法: + +1. 先按 `subagent_id` 聚合 +2. 再看 `subagent_type` +3. 最后对照它自己的 `query_id` + +一个子 agent 至少应有: + +1. `subagent.spawn.requested` +2. `subagent.spawned` +3. 若干 `subagent.message.received` +4. `subagent.completed` + +如果没有 `subagent.completed`,通常表示中断、异常,或者埋点还没覆盖到该分支。 + +--- + +## 4. 快照怎么读 + +主事件中的 `snapshot_ref` 指向 `.observability/snapshots/` 下的文件。 + +常见快照: + +- `request`:发给模型的完整请求 +- `response`:本轮模型响应摘要 +- `input-raw`:用户原始输入 +- `input-messages`:输入归一化后的消息数组 +- `messages.*-before/after`:某一步预处理前后的消息 + +如果你要回答“为什么模型这样回复”,最常用的是: + +1. 找 `prompt.build.completed` +2. 打开它的 `request_snapshot_ref` + +如果你要回答“为什么这一步压缩了上下文”,最常用的是: + +1. 找对应 `messages.*.applied` +2. 打开 `snapshot_before_ref` +3. 再打开 `snapshot_after_ref` + +--- + +## 5. 推荐命令 + +### 5.1 只看事件名和时间 + +```powershell +Get-Content .\.observability\events-*.jsonl | Select-String '"event"' +``` + +### 5.2 查看某个 query + +```powershell +Get-Content .\.observability\events-*.jsonl | Select-String '"query_id":"<你的query_id>"' +``` + +### 5.3 查看某个工具调用 + +```powershell +Get-Content .\.observability\events-*.jsonl | Select-String '"tool_call_id":"<你的tool_call_id>"' +``` + +### 5.4 查看某个子 agent + +```powershell +Get-Content .\.observability\events-*.jsonl | Select-String '"subagent_id":"<你的subagent_id>"' +``` + +--- + +## 6. 
典型分析模板 + +### 模板 A:为什么没有进入模型调用 + +看: + +1. `submit.attempted` +2. `input.process.completed` +3. 是否出现 `submit.blocked` +4. 是否出现 `query.started` + +结论示例: + +“输入被本地 slash command 消化,`should_query=false`,因此没有进入 `api.request.started`。” + +### 模板 B:为什么上下文突然缩短 + +看: + +1. `messages.*.applied` +2. `tokens_saved` +3. `snapshot_before_ref` +4. `snapshot_after_ref` + +结论示例: + +“不是 autocompact 触发,而是 `microcompact` 先清掉了大量 tool_result,节省了约 N tokens。” + +### 模板 C:为什么 query 终止 + +看最后一条 `query.terminated` 的 `payload.reason`。 + +常见值: + +- `completed` +- `blocking_limit` +- `prompt_too_long` +- `image_error` +- `model_error` +- `aborted_streaming` +- `aborted_tools` +- `stop_hook_prevented` +- `hook_stopped` +- `max_turns` + +--- + +## 7. 现阶段已接入的重点事件 + +当前已可用于阅读的主线程事件包括: + +- 提交与输入:`submit.attempted` `submit.blocked` `input.process.started` `input.process.completed` +- query 初始化:`query.started` `state.initialized` `prefetch.memory.started` `turn.started` `query_tracking.assigned` +- messages 预处理:`messages.compact_boundary.applied` `messages.tool_result_budget.applied` `messages.history_snip.applied` `messages.microcompact.applied` `messages.context_collapse.applied` `messages.autoconpact.checked` `messages.autoconpact.completed` `messages.preprocess.completed` +- prompt 与 API:`prompt.build.started` `prompt.build.completed` `prompt.snapshot.stored` `api.request.started` `api.stream.first_chunk` `assistant.block.received` `assistant.tool_use.detected` `api.stream.completed` +- 工具:`tool.execution.mode.selected` `tool.enqueued` `tool.execution.started` `tool.execution.completed` `tool.execution.failed` `tool.batch.started` `tool.context.updated` +- stop hooks:`stop_hooks.started` `stop_hooks.completed` +- 子 agent:`subagent.spawn.requested` `subagent.spawned` `subagent.message.received` `subagent.completed` +- 终止:`token_budget.decision` `query.terminated` + +--- + +## 8. 
阅读时最容易犯的错 + +- 只看控制台输出,不看 JSONL +- 只看单条事件,不按 `query_id` 串链 +- 只看主线程,不看 `subagent_id` +- 看到压缩事件就下结论,不打开 before/after 快照 +- 看到 `completed` 就以为正常结束,没有检查是否前面出现过 withheld error 或 stop hook + +--- + +## 9. 最实用的一句话方法 + +先用 `query_id` 串主线,再用 `tool_call_id` 看工具,再用 `subagent_id` 看分叉,最后回到 `snapshot_ref` 看证据。 diff --git a/src/query.ts b/src/query.ts index 67e18d8c4d..065a3c5537 100644 --- a/src/query.ts +++ b/src/query.ts @@ -213,6 +213,75 @@ function asOptionalString(value: unknown): string | null { return typeof value === 'string' ? value : null } +function extractPromptSectionLabel(section: string): string { + const firstLine = section + .split('\n') + .map(line => line.trim()) + .find(line => line.length > 0) + if (!firstLine) { + return '(empty)' + } + return firstLine.length > 80 ? `${firstLine.slice(0, 80)}...` : firstLine +} + +function summarizeStringMap(context: { [k: string]: string }): { + keys: string[] + chars_total: number + serialized_chars: number + value_chars_by_key: Record +} { + const entries = Object.entries(context) + return { + keys: entries.map(([key]) => key), + chars_total: entries.reduce((sum, [, value]) => sum + value.length, 0), + serialized_chars: jsonStringify(context).length, + value_chars_by_key: Object.fromEntries( + entries.map(([key, value]) => [key, value.length]), + ) as Record, + } +} + +function summarizePromptComposition({ + systemPrompt, + systemContext, + userContext, + messagesBeforePrepend, + requestMessages, +}: { + systemPrompt: SystemPrompt + systemContext: { [k: string]: string } + userContext: { [k: string]: string } + messagesBeforePrepend: Message[] + requestMessages: Message[] +}): { + system_prompt_section_labels: string[] + system_prompt_chars_by_section: number[] + system_context: ReturnType + user_context: ReturnType + claude_md_chars: number + current_date_chars: number + base_messages_chars_total: number + request_messages_chars_total: number + prepended_context_message_chars: number +} { + const 
prependedContextMessage = + requestMessages.length > messagesBeforePrepend.length ? requestMessages[0] : null + + return { + system_prompt_section_labels: systemPrompt.map(extractPromptSectionLabel), + system_prompt_chars_by_section: systemPrompt.map(section => section.length), + system_context: summarizeStringMap(systemContext), + user_context: summarizeStringMap(userContext), + claude_md_chars: userContext.claudeMd?.length ?? 0, + current_date_chars: userContext.currentDate?.length ?? 0, + base_messages_chars_total: jsonStringify(messagesBeforePrepend).length, + request_messages_chars_total: jsonStringify(requestMessages).length, + prepended_context_message_chars: prependedContextMessage + ? jsonStringify(prependedContextMessage).length + : 0, + } +} + async function emitMessageStageEvent({ event, component, @@ -266,6 +335,105 @@ async function emitMessageStageEvent({ }) } +async function emitStateSnapshotEvent({ + event, + state, + queryId, + turnId, + loopIter, + querySource, +}: { + event: 'state.snapshot.before_turn' | 'state.snapshot.after_turn' + state: State + queryId: string + turnId: string + loopIter: number + querySource: string +}): Promise { + const snapshot = await storeHarnessSnapshot(event, { + messages_count: state.messages.length, + turn_count: state.turnCount, + transition: state.transition ?? null, + max_output_tokens_recovery_count: state.maxOutputTokensRecoveryCount, + has_attempted_reactive_compact: state.hasAttemptedReactiveCompact, + max_output_tokens_override: state.maxOutputTokensOverride ?? null, + stop_hook_active: state.stopHookActive ?? false, + auto_compact_tracking: state.autoCompactTracking ?? null, + tool_use_context: { + agent_id: state.toolUseContext.agentId ?? null, + agent_type: state.toolUseContext.agentType ?? null, + query_tracking: state.toolUseContext.queryTracking ?? 
null, + tool_count: state.toolUseContext.options.tools.length, + main_loop_model: state.toolUseContext.options.mainLoopModel, + }, + }) + await emitHarnessEvent({ + event, + component: 'query_loop', + query_id: queryId, + turn_id: turnId, + loop_iter: loopIter, + query_source: querySource, + subagent_id: state.toolUseContext.agentId ?? null, + subagent_type: state.toolUseContext.agentType ?? null, + payload: { + messages_count: state.messages.length, + snapshot_ref: snapshot.snapshot_ref, + transition: state.transition?.reason ?? null, + }, + }) +} + +async function emitStateTransitionEvent({ + fromState, + toState, + queryId, + turnId, + loopIter, + querySource, +}: { + fromState: State + toState: State + queryId: string + turnId: string + loopIter: number + querySource: string +}): Promise { + const [beforeSnapshot, afterSnapshot] = await Promise.all([ + storeHarnessSnapshot('state-before', { + messages_count: fromState.messages.length, + turn_count: fromState.turnCount, + transition: fromState.transition ?? null, + }), + storeHarnessSnapshot('state-after', { + messages_count: toState.messages.length, + turn_count: toState.turnCount, + transition: toState.transition ?? null, + }), + ]) + await emitHarnessEvent({ + event: 'state.transitioned', + component: 'query_loop', + query_id: queryId, + turn_id: turnId, + loop_iter: loopIter, + query_source: querySource, + subagent_id: toState.toolUseContext.agentId ?? null, + subagent_type: toState.toolUseContext.agentType ?? null, + payload: { + from_transition: fromState.transition?.reason ?? null, + to_transition: toState.transition?.reason ?? 
null, + from_messages_count: fromState.messages.length, + to_messages_count: toState.messages.length, + message_delta: toState.messages.length - fromState.messages.length, + token_estimate_before: tokenCountWithEstimation(fromState.messages), + token_estimate_after: tokenCountWithEstimation(toState.messages), + before_snapshot_ref: beforeSnapshot.snapshot_ref, + after_snapshot_ref: afterSnapshot.snapshot_ref, + }, + }) +} + export type QueryParams = { messages: Message[] systemPrompt: SystemPrompt @@ -441,6 +609,15 @@ async function* queryLoop( state.messages, state.toolUseContext, ) + await emitHarnessEvent({ + event: 'prefetch.memory.started', + component: 'query_loop', + query_source: querySource, + payload: { + message_count: state.messages.length, + is_subagent: Boolean(state.toolUseContext.agentId), + }, + }) async function emitQueryTerminated( reason: string, @@ -566,6 +743,14 @@ async function* queryLoop( message_count: messages.length, }, }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.before_turn', + state, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) let messagesForQuery = [...getMessagesAfterCompactBoundary(messages)] await emitMessageStageEvent({ @@ -966,6 +1151,13 @@ async function* queryLoop( let firstStreamChunkSeen = false queryCheckpoint('query_api_streaming_start') const requestMessages = prependUserContext(messagesForQuery, userContext) + const promptComposition = summarizePromptComposition({ + systemPrompt: fullSystemPrompt, + systemContext, + userContext, + messagesBeforePrepend: messagesForQuery, + requestMessages, + }) await emitHarnessEvent({ event: 'prompt.build.started', component: 'query_loop', @@ -1017,10 +1209,33 @@ async function* queryLoop( tool_names_chars: toolUseContext.options.tools .map(tool => tool.name) .join(',').length, - messages_chars_total: jsonStringify(requestMessages).length, + messages_chars_total: promptComposition.request_messages_chars_total, 
attachments_chars_total: jsonStringify( requestMessages.filter(message => message.type === 'attachment'), ).length, + base_messages_chars_total: + promptComposition.base_messages_chars_total, + prepended_context_message_chars: + promptComposition.prepended_context_message_chars, + system_prompt_section_labels: + promptComposition.system_prompt_section_labels, + system_prompt_chars_by_section: + promptComposition.system_prompt_chars_by_section, + system_context_keys: promptComposition.system_context.keys, + system_context_chars_total: + promptComposition.system_context.chars_total, + system_context_serialized_chars: + promptComposition.system_context.serialized_chars, + system_context_value_chars_by_key: + promptComposition.system_context.value_chars_by_key, + user_context_keys: promptComposition.user_context.keys, + user_context_chars_total: promptComposition.user_context.chars_total, + user_context_serialized_chars: + promptComposition.user_context.serialized_chars, + user_context_value_chars_by_key: + promptComposition.user_context.value_chars_by_key, + claude_md_chars: promptComposition.claude_md_chars, + current_date_chars: promptComposition.current_date_chars, serialized_request_bytes: requestSnapshot.bytes, request_snapshot_ref: requestSnapshot.snapshot_ref, }, @@ -1585,6 +1800,22 @@ async function* queryLoop( committed: drained.committed, }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next continue } @@ -1635,6 +1866,22 @@ async function* queryLoop( turnCount, transition: { reason: 'reactive_compact_retry' }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: 
turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next continue } @@ -1694,6 +1941,22 @@ async function* queryLoop( turnCount, transition: { reason: 'max_output_tokens_escalate' }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next continue } @@ -1725,6 +1988,22 @@ async function* queryLoop( attempt: maxOutputTokensRecoveryCount + 1, }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next continue } @@ -1783,6 +2062,22 @@ async function* queryLoop( turnCount, transition: { reason: 'stop_hook_blocking' }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next continue } @@ -1815,7 +2110,7 @@ async function* queryLoop( logForDebugging( `Token budget continuation #${decision.continuationCount}: ${decision.pct}% (${decision.turnTokens.toLocaleString()} / ${decision.budget.toLocaleString()})`, ) - state = { + const next: State = { messages: [ 
...messagesForQuery, ...assistantMessages, @@ -1834,6 +2129,23 @@ async function* queryLoop( turnCount, transition: { reason: 'token_budget_continuation' }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + state = next continue } @@ -2239,6 +2551,22 @@ async function* queryLoop( stopHookActive, transition: { reason: 'next_turn' }, } + await emitStateTransitionEvent({ + fromState: state, + toState: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: next, + queryId: queryTracking.chainId, + turnId, + loopIter: turnCount, + querySource: querySource, + }) state = next } // while (true) } diff --git a/src/query/stopHooks.ts b/src/query/stopHooks.ts index 73aa62df68..d3ea92f41f 100644 --- a/src/query/stopHooks.ts +++ b/src/query/stopHooks.ts @@ -5,6 +5,7 @@ import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../services/analytics/index.js' +import { emitHarnessEvent } from '../observability/harness.js' import type { ToolUseContext } from '../Tool.js' import type { HookProgress } from '../types/hooks.js' import type { @@ -80,6 +81,19 @@ export async function* handleStopHooks( StopHookResult > { const hookStartTime = Date.now() + await emitHarnessEvent({ + event: 'stop_hooks.started', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? 
null, + payload: { + messages_for_query: messagesForQuery.length, + assistant_messages: assistantMessages.length, + stop_hook_active: stopHookActive ?? false, + }, + }) const stopHookContext: REPLHookContext = { messages: [...messagesForQuery, ...assistantMessages], @@ -331,11 +345,39 @@ export async function* handleStopHooks( } if (preventedContinuation) { + await emitHarnessEvent({ + event: 'stop_hooks.completed', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + prevent_continuation: true, + blocking_error_count: 0, + hook_count: hookCount, + duration_ms: Date.now() - hookStartTime, + }, + }) return { blockingErrors: [], preventContinuation: true } } // Collect blocking errors from stop hooks if (blockingErrors.length > 0) { + await emitHarnessEvent({ + event: 'stop_hooks.completed', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + prevent_continuation: false, + blocking_error_count: blockingErrors.length, + hook_count: hookCount, + duration_ms: Date.now() - hookStartTime, + }, + }) return { blockingErrors, preventContinuation: false } } @@ -449,10 +491,38 @@ export async function* handleStopHooks( } if (teammatePreventedContinuation) { + await emitHarnessEvent({ + event: 'stop_hooks.completed', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? 
null, + payload: { + prevent_continuation: true, + blocking_error_count: 0, + hook_count: hookCount, + duration_ms: Date.now() - hookStartTime, + }, + }) return { blockingErrors: [], preventContinuation: true } } if (teammateBlockingErrors.length > 0) { + await emitHarnessEvent({ + event: 'stop_hooks.completed', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + prevent_continuation: false, + blocking_error_count: teammateBlockingErrors.length, + hook_count: hookCount, + duration_ms: Date.now() - hookStartTime, + }, + }) return { blockingErrors: teammateBlockingErrors, preventContinuation: false, @@ -460,6 +530,20 @@ export async function* handleStopHooks( } } + await emitHarnessEvent({ + event: 'stop_hooks.completed', + component: 'stop_hooks', + query_id: toolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? 
null, + payload: { + prevent_continuation: false, + blocking_error_count: 0, + hook_count: hookCount, + duration_ms: Date.now() - hookStartTime, + }, + }) return { blockingErrors: [], preventContinuation: false } } catch (error) { const durationMs = Date.now() - hookStartTime diff --git a/src/services/tools/toolExecution.ts b/src/services/tools/toolExecution.ts index 97852b2adc..536a896576 100644 --- a/src/services/tools/toolExecution.ts +++ b/src/services/tools/toolExecution.ts @@ -8,6 +8,7 @@ import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from 'src/services/analytics/index.js' +import { emitHarnessEvent } from 'src/observability/harness.js' import { extractMcpToolDetails, extractSkillName, @@ -341,6 +342,7 @@ export async function* runToolUse( canUseTool: CanUseToolFn, toolUseContext: ToolUseContext, ): AsyncGenerator { + const startedAt = Date.now() const toolName = toolUse.name // First try to find in the available tools (what the model sees) let tool = findToolByName(toolUseContext.options.tools, toolName) @@ -368,6 +370,21 @@ export async function* runToolUse( // Check if the tool exists if (!tool) { + await emitHarnessEvent({ + event: 'tool.execution.failed', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? 
null, + payload: { + tool_name: toolName, + success: false, + error: 'tool_not_found', + duration_ms: Date.now() - startedAt, + }, + }) const sanitizedToolName = sanitizeToolNameForAnalytics(toolName) logForDebugging(`Unknown tool ${toolName}: ${toolUse.id}`) logEvent('tengu_tool_use_error', { @@ -413,6 +430,32 @@ export async function* runToolUse( const toolInput = toolUse.input as { [key: string]: string } try { + await emitHarnessEvent({ + event: 'tool.enqueued', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_name: tool.name, + input_keys: Object.keys(toolInput), + }, + }) + await emitHarnessEvent({ + event: 'tool.execution.started', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_name: tool.name, + input_keys: Object.keys(toolInput), + }, + }) if (toolUseContext.abortController.signal.aborted) { logEvent('tengu_tool_use_cancelled', { toolName: sanitizeToolNameForAnalytics(tool.name), @@ -450,6 +493,21 @@ export async function* runToolUse( sourceToolAssistantUUID: assistantMessage.uuid, }), } + await emitHarnessEvent({ + event: 'tool.execution.failed', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? 
null, + payload: { + tool_name: tool.name, + success: false, + error: 'cancelled_before_execution', + duration_ms: Date.now() - startedAt, + }, + }) return } @@ -467,6 +525,20 @@ export async function* runToolUse( )) { yield update } + await emitHarnessEvent({ + event: 'tool.execution.completed', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_name: tool.name, + success: true, + duration_ms: Date.now() - startedAt, + }, + }) } catch (error) { logError(error) const errorMessage = error instanceof Error ? error.message : String(error) @@ -487,6 +559,21 @@ export async function* runToolUse( sourceToolAssistantUUID: assistantMessage.uuid, }), } + await emitHarnessEvent({ + event: 'tool.execution.failed', + component: 'tool_execution', + query_id: toolUseContext.queryTracking?.chainId ?? null, + request_id: requestId ?? null, + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_name: tool?.name ?? 
toolName, + success: false, + error: errorMessage, + duration_ms: Date.now() - startedAt, + }, + }) } } diff --git a/src/services/tools/toolOrchestration.ts b/src/services/tools/toolOrchestration.ts index 9e5d524490..3315a32efb 100644 --- a/src/services/tools/toolOrchestration.ts +++ b/src/services/tools/toolOrchestration.ts @@ -1,5 +1,6 @@ import type { ToolUseBlock } from '@anthropic-ai/sdk/resources/index.mjs' import type { CanUseToolFn } from '../../hooks/useCanUseTool.js' +import { emitHarnessEvent } from '../../observability/harness.js' import { findToolByName, type ToolUseContext } from '../../Tool.js' import type { AssistantMessage, Message } from '../../types/message.js' import { all } from '../../utils/generators.js' @@ -23,6 +24,18 @@ export async function* runTools( canUseTool: CanUseToolFn, toolUseContext: ToolUseContext, ): AsyncGenerator { + await emitHarnessEvent({ + event: 'tool.batch.started', + component: 'tool_orchestration', + query_id: toolUseContext.queryTracking?.chainId ?? null, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_count: toolUseMessages.length, + tool_names: toolUseMessages.map(block => block.name), + execution_mode: 'runTools', + }, + }) // Wrap all tool calls in this turn under a single Langfuse turn span const turnSpan = toolUseMessages.length > 0 ? createToolBatchSpan(toolUseContext.langfuseTrace ?? null, { @@ -39,6 +52,19 @@ export async function* runTools( toolUseMessages, currentContext, )) { + await emitHarnessEvent({ + event: 'tool.execution.mode.selected', + component: 'tool_orchestration', + query_id: currentContext.queryTracking?.chainId ?? null, + subagent_id: currentContext.agentId ?? null, + subagent_type: currentContext.agentType ?? null, + payload: { + execution_mode: 'runTools', + batch_size: blocks.length, + concurrency: isConcurrencySafe ? 
'parallel' : 'serial', + tool_names: blocks.map(block => block.name), + }, + }) if (isConcurrencySafe) { const queuedContextModifiers: Record< string, @@ -72,6 +98,20 @@ export async function* runTools( currentContext = modifier(currentContext) } } + if (blocks.some(block => queuedContextModifiers[block.id]?.length)) { + await emitHarnessEvent({ + event: 'tool.context.updated', + component: 'tool_orchestration', + query_id: currentContext.queryTracking?.chainId ?? null, + subagent_id: currentContext.agentId ?? null, + subagent_type: currentContext.agentType ?? null, + payload: { + execution_mode: 'runTools', + batch_size: blocks.length, + concurrency: 'parallel', + }, + }) + } yield { newContext: currentContext } } else { // Run non-read-only batch serially @@ -89,6 +129,18 @@ export async function* runTools( newContext: currentContext, } } + await emitHarnessEvent({ + event: 'tool.context.updated', + component: 'tool_orchestration', + query_id: currentContext.queryTracking?.chainId ?? null, + subagent_id: currentContext.agentId ?? null, + subagent_type: currentContext.agentType ?? 
null, + payload: { + execution_mode: 'runTools', + batch_size: blocks.length, + concurrency: 'serial', + }, + }) } } diff --git a/src/utils/forkedAgent.ts b/src/utils/forkedAgent.ts index 8b35fb41dd..af648ca097 100644 --- a/src/utils/forkedAgent.ts +++ b/src/utils/forkedAgent.ts @@ -18,6 +18,7 @@ import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../services/analytics/index.js' +import { emitHarnessEvent } from '../observability/harness.js' import { accumulateUsage, updateUsage } from '../services/api/claude.js' import { EMPTY_USAGE, type NonNullableUsage } from '@ant/model-provider' import type { ToolUseContext } from '../Tool.js' @@ -502,6 +503,18 @@ export async function runForkedAgent({ const startTime = Date.now() const outputMessages: Message[] = [] let totalUsage: NonNullableUsage = { ...EMPTY_USAGE } + await emitHarnessEvent({ + event: 'subagent.spawn.requested', + component: 'forked_agent', + query_source: querySource, + subagent_type: forkLabel, + payload: { + fork_label: forkLabel, + prompt_message_count: promptMessages.length, + skip_transcript: skipTranscript ?? false, + max_turns: maxTurns ?? null, + }, + }) const { systemPrompt, @@ -526,6 +539,20 @@ export async function runForkedAgent({ // Generate agent ID and record initial messages for transcript // When skipTranscript is set, skip agent ID creation and all transcript I/O const agentId = skipTranscript ? undefined : createAgentId(forkLabel) + await emitHarnessEvent({ + event: 'subagent.spawned', + component: 'forked_agent', + query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: isolatedToolUseContext.agentId ?? agentId ?? 
null, + subagent_type: forkLabel, + payload: { + fork_label: forkLabel, + inherited_message_count: forkContextMessages.length, + prompt_message_count: promptMessages.length, + transcript_enabled: Boolean(agentId), + }, + }) let lastRecordedUuid: UUID | null = null if (agentId) { await recordSidechainTranscript(initialMessages, agentId).catch(err => @@ -573,6 +600,17 @@ export async function runForkedAgent({ logForDebugging( `Forked agent [${forkLabel}] received message: type=${message.type}`, ) + await emitHarnessEvent({ + event: 'subagent.message.received', + component: 'forked_agent', + query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: isolatedToolUseContext.agentId ?? agentId ?? null, + subagent_type: forkLabel, + payload: { + message_type: (message as Message).type, + }, + }) outputMessages.push(message as Message) onMessage?.(message as Message) @@ -618,6 +656,23 @@ export async function runForkedAgent({ totalUsage, queryTracking: toolUseContext.queryTracking, }) + await emitHarnessEvent({ + event: 'subagent.completed', + component: 'forked_agent', + query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, + query_source: querySource, + subagent_id: isolatedToolUseContext.agentId ?? agentId ?? 
null, + subagent_type: forkLabel, + payload: { + fork_label: forkLabel, + duration_ms: durationMs, + message_count: outputMessages.length, + input_tokens: totalUsage.input_tokens, + output_tokens: totalUsage.output_tokens, + cache_read_input_tokens: totalUsage.cache_read_input_tokens, + cache_creation_input_tokens: totalUsage.cache_creation_input_tokens, + }, + }) return { messages: outputMessages, From dbf2dad4ef70023962ee7ab6588a98a3f029aae1 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Thu, 23 Apr 2026 19:36:11 +0800 Subject: [PATCH 04/26] Add observability v1 tooling and docs --- .../Observersity.md" | 2 +- ...1\346\217\220\347\244\272\350\257\215.pdf" | Bin ...1\347\250\213\344\273\213\347\273\215.pdf" | Bin ...V1\344\273\273\345\212\241\344\271\246.md" | 166 ++ ...52\346\237\245\346\270\205\345\215\225.md" | 204 ++ ...45\345\277\227\346\270\205\346\264\227.md" | 114 + .../observability_dashboard.html" | 1125 +++++++ ...24\347\251\266\346\212\245\345\221\212.md" | 1269 ++++++++ .../DuckDB Schema\346\226\207\346\241\243.md" | 203 ++ ...\273\266Schema\346\226\207\346\241\243.md" | 7 + ...32\344\271\211\346\226\207\346\241\243.md" | 325 +++ ...05\350\257\273\346\225\231\345\255\246.md" | 30 +- .../user_action_9ddd1bff_auto_report.md" | 145 + ...01\347\250\213\350\247\243\346\236\220.md" | 296 ++ ...70\345\257\271\346\212\245\345\221\212.md" | 0 ...20\347\240\201\347\211\210\357\274\211.md" | 1243 ++++++++ ...13\344\273\273\345\212\241\344\271\246.md" | 650 +++++ ...47\350\241\214\346\270\205\345\215\225.md" | 77 + ...5\205\245Token\345\210\206\346\236\220.md" | 0 .../v1/README.md" | 21 + ObservrityTask/README.md | 21 + README.md | 138 + scripts/observability/build_dashboard.ps1 | 743 +++++ scripts/observability/build_duckdb_etl.ts | 2592 +++++++++++++++++ scripts/observability/clean_observability.py | 420 +++ scripts/observability/daily_summary.ps1 | 331 +++ scripts/observability/explain_action.ps1 | 462 +++ scripts/observability/open_duckdb.ps1 | 
9 + scripts/observability/read_timeline.ps1 | 103 + .../rebuild_observability_db.ps1 | 40 + scripts/observability/refresh_debug_view.ps1 | 54 + .../reset_observability_debug.ps1 | 46 + scripts/observability/watch_latest_events.ps1 | 84 + src/QueryEngine.ts | 2 + src/Tool.ts | 1 + src/observability/harness.ts | 6 + src/query.ts | 182 +- src/query/stopHooks.ts | 6 + src/screens/REPL.tsx | 6 + src/services/AgentSummary/agentSummary.ts | 7 + .../PromptSuggestion/promptSuggestion.ts | 20 + src/services/PromptSuggestion/speculation.ts | 16 + src/services/SessionMemory/sessionMemory.ts | 76 +- src/services/autoDream/autoDream.ts | 6 + src/services/compact/compact.ts | 8 + .../extractMemories/extractMemories.ts | 11 + src/services/tools/StreamingToolExecutor.ts | 28 + src/services/tools/toolExecution.ts | 6 + src/services/tools/toolOrchestration.ts | 4 + src/utils/forkedAgent.ts | 41 + src/utils/handlePromptSubmit.ts | 2 + src/utils/sideQuestion.ts | 8 + 52 files changed, 11334 insertions(+), 22 deletions(-) rename ObservrityTask/Observersity.md => "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/Observersity.md" (95%) rename "ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" => "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" (100%) rename "ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" => "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" (100%) create mode 100644 "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\344\273\273\345\212\241\344\271\246.md" create mode 100644 
"ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\350\207\252\346\237\245\346\270\205\345\215\225.md" create mode 100644 "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\346\227\245\345\277\227\346\270\205\346\264\227.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/\345\275\223\345\211\215\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\346\267\261\345\272\246\347\240\224\347\251\266\346\212\245\345\221\212.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/DuckDB Schema\346\226\207\346\241\243.md" rename "ObservrityTask/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" (91%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\214\207\346\240\207\345\256\232\344\271\211\346\226\207\346\241\243.md" rename "ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" (88%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_9ddd1bff_auto_report.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_9ddd1bff_\346\265\201\347\250\213\350\247\243\346\236\220.md" rename "ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" (100%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/QueryLoop\345\205\250\346\265\201\347\250\213\350\257\246\350\247\243\357\274\210\346\272\220\347\240\201\347\211\210\357\274\211.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\345\217\257\350\247\202\346\265\213\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\346\211\247\350\241\214\346\270\205\345\215\225.md" rename "ObservrityTask/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" (100%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/README.md" create mode 100644 ObservrityTask/README.md create mode 100644 scripts/observability/build_dashboard.ps1 create mode 100644 scripts/observability/build_duckdb_etl.ts create mode 100644 scripts/observability/clean_observability.py create mode 100644 
scripts/observability/daily_summary.ps1 create mode 100644 scripts/observability/explain_action.ps1 create mode 100644 scripts/observability/open_duckdb.ps1 create mode 100644 scripts/observability/read_timeline.ps1 create mode 100644 scripts/observability/rebuild_observability_db.ps1 create mode 100644 scripts/observability/refresh_debug_view.ps1 create mode 100644 scripts/observability/reset_observability_debug.ps1 create mode 100644 scripts/observability/watch_latest_events.ps1 diff --git a/ObservrityTask/Observersity.md "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/Observersity.md" similarity index 95% rename from ObservrityTask/Observersity.md rename to "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/Observersity.md" index 768287c5d2..ccd84e8132 100644 --- a/ObservrityTask/Observersity.md +++ "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/Observersity.md" @@ -634,4 +634,4 @@ --- -如果你愿意,我下一条可以继续帮你把这份任务书再压缩成一个“更像 prompt、可以直接粘贴给 Codex 的简洁版”。 + diff --git "a/ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" similarity index 100% rename from "ObservrityTask/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" rename to "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/cc\346\272\220\347\240\201\346\217\220\347\244\272\350\257\215.pdf" diff --git "a/ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" similarity index 100% rename from "ObservrityTask/query loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" rename to "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/query 
loop\345\205\250\346\265\201\347\250\213\344\273\213\347\273\215.pdf" diff --git "a/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\344\273\273\345\212\241\344\271\246.md" "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\344\273\273\345\212\241\344\271\246.md" new file mode 100644 index 0000000000..4b82a57b3f --- /dev/null +++ "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\344\273\273\345\212\241\344\271\246.md" @@ -0,0 +1,166 @@ +任务书:从埋点走向第一版可观测系统 +1. 背景 + +当前项目已完成主线程、工具编排、stop hooks、forked subagent、state snapshot/transition 等基础埋点,并已有《PDF 主链核对报告》。核对结果表明,当前真实主链存在,contextCollapse 为 disabled/stub,HISTORY_SNIP 受 feature gate 控制,subagent 链路真实存在且必须纳入统一观测。 + +2. 本轮目标 + +基于现有 .observability/events-*.jsonl 和 snapshots,建立第一版本地可观测系统,使其可以: + +聚合主线程与子 agent 的完整链路 +计算核心成本/延迟/压缩/工具/恢复指标 +输出基础看板和单次链路阅读报告 +以当前项目源码为真相,不对 disabled/stub 节点做错误假设 +3. 本轮不做 +不接入远端 APM / Prometheus / Loki / Tempo +不恢复开启当前 disabled/stub 功能 +不扩展新一轮大规模埋点,除非为指标闭环所必需 +不改动 query loop 主语义 +4. 数据源要求 + +以本地 .observability/events-*.jsonl 和 snapshots 为唯一事实源。 +远端 telemetry/exporter 当前存在失败和 dropped events,不应作为主数据源。 + +5. 需要实现的内容 +A. 本地分析层 + +新增本地分析数据库,优先使用 DuckDB。 +实现 JSONL → 结构化表的 ETL。 + +建议表: + +events_raw +queries +turns +tools +subagents +recoveries +snapshots_index +daily_rollups +B. 
指标计算 + +至少实现以下指标: + +完整性 +query_completion_rate +turn_state_closure_rate +tool_lifecycle_closure_rate +subagent_lifecycle_closure_rate +snapshot_missing_rate +orphan_event_rate +成本 +user_action_total_input_tokens +user_action_total_output_tokens +user_action_total_cache_read_tokens +user_action_total_cache_create_tokens +query_source_cost_share +subagent_amplification_ratio +延迟 +submit_to_first_chunk_ms +preprocess_duration_ms +prompt_build_duration_ms +api_first_chunk_latency_ms +api_total_duration_ms +tool_execution_duration_ms +subagent_duration_ms +user_action_e2e_duration_ms +压缩/上下文治理 +compression_gain_ratio +tool_result_budget_saved_tokens +history_snip_saved_tokens +microcompact_saved_tokens +autocompact_saved_tokens +autocompact_trigger_rate +history_snip_gate_on_rate +contextCollapse_enabled_gauge +工具 +tool_calls_by_name +tool_calls_by_mode +tool_success_rate +tool_failure_rate +tool_avg_duration_ms +tool_p95_duration_ms +context_update_rate +恢复/异常 +prompt_too_long_recovery_attempts +prompt_too_long_recovery_success_rate +max_output_tokens_recovery_attempts +max_output_tokens_recovery_success_rate +token_budget_continue_rate +stop_hook_block_rate +terminal_reason_distribution +exporter_failure_rate +dropped_event_rate +C. 视图与工具 + +至少实现: + +链路阅读器 +输入 user_action_id / query_id / subagent_id +输出完整时序链路 +每日 summary CLI +输出当天的 query/source/cost/error 概览 +本地 dashboard +可以是 HTML 报表或 Streamlit +覆盖成本、延迟、压缩、工具、恢复五个面板 +D. disabled/gated 节点的显式状态化 + +必须把以下状态纳入可观测: + +contextCollapse_enabled = false +HISTORY_SNIP_gate_state +feature/gate 命中情况 + +不能把这些节点默认为“已工作”。 + +6. 冲突处理 + +如果在实现分析层时发现: + +现有事件字段不足以闭合某条链路 +某个指标需要的字段未落日志 +当前 JSONL/snapshot 设计无法稳定关联 user_action_id / query_id / turn_id / subagent_id + +请立即列出: + +缺少的字段 +受影响的指标 +最小补埋点建议 +是否会改动当前事件 schema + +并找我确认后再修改埋点。 + +7. 实施顺序 +Phase 1 +建立 DuckDB ETL +导入 JSONL/snapshot index +产出基础表 +Phase 2 +做完整性 + 成本 + 延迟指标 +产出每日 summary CLI +Phase 3 +做压缩/工具/恢复指标 +产出链路阅读器 +Phase 4 +做本地 dashboard +给出一份“完整用户动作样例链路”报告 +8. 
验收标准 + +任务完成时,必须能做到: + +用一个 user_action_id 还原一次完整链路 +分别统计主线程和所有 subagent 的成本 +显示每轮 turn 的 state/transition/termination 关键摘要 +统计每类工具的使用量、耗时、成功率 +统计压缩动作的触发率与节省效果 +显示恢复链是否被触发、是否成功 +显式展示 contextCollapse disabled、HISTORY_SNIP gated 的状态 +不依赖远端 exporter 也能完成本地分析 +9. 交付物 +ETL 脚本 / 模块 +DuckDB schema 文档 +指标定义文档 +CLI summary 工具 +链路阅读器 +本地 dashboard +一份样例链路分析报告 \ No newline at end of file diff --git "a/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\350\207\252\346\237\245\346\270\205\345\215\225.md" "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\350\207\252\346\237\245\346\270\205\345\215\225.md" new file mode 100644 index 0000000000..e0931f911d --- /dev/null +++ "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\350\207\252\346\237\245\346\270\205\345\215\225.md" @@ -0,0 +1,204 @@ +1. 链路完整性指标 + +这是第一优先级。 +因为你先要确认“事件真的够用”。 + +你应该关注: + +每个 user_action_id 是否都能串到至少一个主线程 query +每个 query_id 是否都有 query.started 和 query.terminated +每个 turn_id 是否都有 turn.started、state.snapshot.before_turn、state.snapshot.after_turn +每个 tool_call_id 是否都能从 assistant.tool_use.detected 串到 tool.execution.completed/failed +每个 subagent_id 是否都有 subagent.spawned 和 subagent.completed +snapshot 是否存在缺失、哈希不一致、引用断链 + +为什么这组最重要: +如果链路本身不闭合,后面的成本/延迟/压缩效果都不可信。 + +建议的核心指标 +query_completion_rate +turn_state_closure_rate +tool_lifecycle_closure_rate +subagent_lifecycle_closure_rate +snapshot_missing_rate +orphan_event_rate +2. 成本指标 + +这是你最关心的一组,而且已经有明确信号了。 + +你当前日志里非常典型: + +主线程有很高 input_tokens +子 agent 也会单独触发高 token prompt +extract_memories、session_memory、side_query 会额外放大成本。 +你应该关注 +A. 按 user action 汇总 + +不要只看单次 API 请求。 +你真正要看的是: + +主线程 input/output tokens +子 agent input/output tokens +cache read / cache create +总 prompt bytes +总 response bytes +B. 
按 query source 分解 + +至少区分: + +repl_main_thread +extract_memories +session_memory +away_summary +side_query。 +建议指标 +user_action_total_input_tokens +user_action_total_output_tokens +user_action_total_cache_read_tokens +user_action_total_cache_create_tokens +query_source_cost_share +subagent_amplification_ratio +cost_per_successful_completed_query + +其中最关键的一个: + +subagent_amplification_ratio = +(所有 subagent input_tokens 总和) / 主线程 input_tokens + +这会直接告诉你:memory 链到底有多贵。 + +3. 延迟指标 + +因为 PDF 明确说明这个 harness 是“流式模型 + 工具执行 + 下一轮 state”的状态机,而且把“流式模型调用”和“工具执行”并行化当作重要设计意图。 + +所以你不该只看“总耗时”,而应该拆成阶段。 + +你应该关注 +submit → input.process 完成 +preprocess 耗时 +prompt.build 耗时 +request 发起 → first chunk +request 总时长 +tool 调度耗时 +tool 执行耗时 +stop hooks 耗时 +subagent 生命周期总时长 +user action 端到端总耗时 +建议指标 +submit_to_first_chunk_ms +preprocess_duration_ms +prompt_build_duration_ms +api_first_chunk_latency_ms +api_total_duration_ms +tool_execution_duration_ms +subagent_duration_ms +user_action_e2e_duration_ms +4. 压缩与上下文治理指标 + +这一组必须做,因为你的 PDF 里把这条链写得非常清楚: + +getMessagesAfterCompactBoundary → applyToolResultBudget → HISTORY_SNIP → microcompact → contextCollapse → autocompact。 + +但当前核对报告也明确: + +contextCollapse 是 disabled/stub +HISTORY_SNIP 是 gate 控制 +autocompact / microcompact / toolResultBudget 真实存在。 + +所以这组指标要分成两类: + +A. 能真实发生的压缩动作 +applyToolResultBudget +HISTORY_SNIP(如果 gate 开) +microcompact +autocompact +B. 目前应视为状态指标的节点 +contextCollapse_enabled = false +contextCollapse_attempted = 0 +contextCollapse_committed = 0 +建议指标 +preprocess_tokens_before_total +preprocess_tokens_after_total +tokens_saved_total +tool_result_budget_saved_tokens +history_snip_saved_tokens +microcompact_saved_tokens +autocompact_saved_tokens +autocompact_trigger_rate +history_snip_gate_on_rate +contextCollapse_enabled_gauge + +这组里最关键的是: + +compression_gain_ratio = +(tokens_before_preprocess - tokens_after_preprocess) / tokens_before_preprocess + +和: + +autocompact_trigger_rate = +触发 autocompact 的 turn 数 / 总 turn 数 +5. 
工具行为指标 + +核对报告已经确认: + +StreamingToolExecutor 存在 +runTools 存在 +handleStopHooks 存在 +subagent 的工具调用也是真实能力。 + +所以你现在最需要知道的不是“工具能不能用”,而是: + +哪些工具最常被调用 +哪些工具最慢 +哪些工具最容易失败 +哪些 turn 走 streaming executor,哪些走 runTools +工具调用是否真的减少了后续轮次 +建议指标 +tool_calls_total +tool_calls_by_name +tool_calls_by_mode (streaming_executor / run_tools) +tool_success_rate +tool_failure_rate +tool_avg_duration_ms +tool_p95_duration_ms +context_update_rate +tools_per_query +tools_per_subagent + +以及一条很有价值的: + +tool_followup_turn_ratio = +包含 tool_use 的 turn 中,最终进入 next_turn 的比例 + +它能告诉你:工具是否真的在驱动 loop,而不是只做装饰。 + +6. 恢复链与异常指标 + +核对报告已经确认这几条恢复链存在: + +prompt-too-long recover +max_output_tokens recover +token budget continuation +stop hooks。 + +所以这一组要看两件事: + +A. 恢复链是否常被触发 + +如果常被触发,说明你的 prompt 治理或 output 策略还有问题。 + +B. 恢复链是否有效 + +如果触发很多但成功率低,说明恢复策略形同虚设。 + +建议指标 +prompt_too_long_recovery_attempts +prompt_too_long_recovery_success_rate +max_output_tokens_recovery_attempts +max_output_tokens_recovery_success_rate +token_budget_continue_rate +stop_hook_block_rate +terminal_reason_distribution +api_error_rate +tool_failure_terminal_rate + diff --git "a/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\346\227\245\345\277\227\346\270\205\346\264\227.md" "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\346\227\245\345\277\227\346\270\205\346\264\227.md" new file mode 100644 index 0000000000..3c26f5802f --- /dev/null +++ "b/ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/\346\227\245\345\277\227\346\270\205\346\264\227.md" @@ -0,0 +1,114 @@ +任务书补丁:Step 0 先做观测数据清洗 +背景 + +当前 .observability/events-*.jsonl 和 snapshots/ 混合了多个实现阶段的运行结果,分别对应前后几次 Codex 执行后的不同埋点版本。 +如果直接基于这些混合日志做 ETL、指标、链路阅读和 dashboard,会导致结果失真,因为不同版本的事件覆盖范围和字段完整性不同。 + +本轮新增前置目标 + +在执行“第一版可观测系统建设”之前,先完成一轮观测数据清洗与基线重建: + +将 昨天(2026-04-19)及更早 的观测数据移出主观测目录 +仅保留 今天(2026-04-20) 的观测数据作为新的分析基线 +确保清洗后的 event 与 snapshot 引用关系闭合 +然后再继续执行后续 ETL / 指标 / trace reader / dashboard 任务 +强制原则 +优先归档,不要先硬删除 
+默认方案是把昨天及更早的观测数据移动到归档目录,而不是直接永久删除。 +只有在我明确要求硬删除时,才执行不可逆删除。 +以事件引用关系为准,不只按文件名日期粗删 +需要检查: +哪些 event 是今天生成的 +今天的 event 引用了哪些 snapshots +哪些 snapshots 只被昨天及更早的 event 使用 +清洗后必须做完整性校验 +至少校验: +保留下来的 event 文件是否可解析 +所有 snapshot_ref 是否存在 +不出现明显 orphan 引用 +今天的事件链路仍可正常串联 +建议实现步骤 +Phase 0.1:扫描与清单生成 + +扫描以下目录: + +.observability/events-*.jsonl +.observability/snapshots/ + +生成一份清洗前清单,包括: + +event 文件列表 +每个 event 文件中的事件日期范围 +今天事件总数 +昨天及更早事件总数 +snapshots 总数 +今天事件引用的 snapshot 数 +昨天及更早事件独占的 snapshot 数 +无引用 snapshot 数 + +输出一份报告,例如: + +ObservrityTask/观测数据清洗前清单.md +Phase 0.2:建立“保留集” + +建立两份集合: + +保留事件集:时间戳属于 2026-04-20 的事件 +保留快照集:被保留事件引用到的所有 snapshots + +如果事件文件是按天拆分且内容纯净,可直接按文件保留; +如果单个文件中混有多天事件,则需要重写出新的“仅今日事件文件”。 + +Phase 0.3:归档旧数据(默认方案) + +默认执行: + +将昨天(2026-04-19)及更早的 event 文件移到: + +.observability_archive/2026-04-19/events/ + +将不在保留快照集中的旧 snapshots 移到: + +.observability_archive/2026-04-19/snapshots/ + +保留: + +今日 event 文件 +今日 event 引用到的 snapshots +Phase 0.4:完整性校验 + +清洗后必须输出一份校验报告,至少包含: + +保留事件数 +保留 snapshot 数 +缺失 snapshot 引用数 +orphan event 数 +orphan snapshot 数 +是否可作为新基线继续做 ETL + +输出到: + +ObservrityTask/观测数据清洗后校验报告.md +如果我坚持要“硬删除” + +只有在我明确确认的情况下,才可以在归档完成并校验通过后,进一步删除归档目录。 +默认不要直接不可逆删除。 + +清洗完成后的后续动作 + +只有在“清洗后校验报告”显示通过之后,才继续执行原任务书中的: + +DuckDB ETL +指标计算 +链路阅读器 +本地 dashboard +样例链路分析报告 +交付物 + +本前置任务完成后,至少提交: + +ObservrityTask/观测数据清洗前清单.md +ObservrityTask/观测数据清洗后校验报告.md +清洗/归档脚本或实现代码 +说明“今天的基线数据”具体保留了哪些文件 +说明“昨天及更早的数据”被归档到了哪里 \ No newline at end of file diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" new file mode 100644 index 0000000000..75b6e59f30 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" @@ -0,0 +1,1125 @@ + + + + + + 本地可观测系统 V1 Dashboard + + + +

+
+

本地可观测系统 V1

+

这版 dashboard 按方向 A 执行清单把指标分成更稳定的分析层级。成本侧按 每日总量、成本结构、主/子链路、日均/效率 展示;完整性侧同时提供 原生口径、推断口径、补链差值;loop 指标单独拆开,用来区分“prompt 大”还是“多轮循环导致贵”。

+
+
日期
2026-04-22
+
源文件
events-20260422.jsonl
+
文件大小(bytes)
868649
+
建库时间
2026-04-22T18:30:43.396Z
+
+
+ +
+

概览

+
+ +
+
+
用户动作数
+ 说明 +
+
2
+
+
+
+
Query 数
+ 说明 +
+
7
+
+
+
+
Turn 数
+ 说明 +
+
24
+
+
+
+
工具调用数
+ 说明 +
+
42
+
+
+
+
Subagent 数
+ 说明 +
+
5
+
+
+
+ +
+

完整性

+
+
+
+
严格 Query 完成率
+ 说明 +
+
0.285714
+
+
+
+
推断 Query 完成率
+ 说明 +
+
0.285714
+
+
+
+
Query 补链差值
+ 说明 +
+
0
+
+
+
+
严格 Turn 闭合率
+ 说明 +
+
0.708333
+
+
+
+
推断 Turn 闭合率
+ 说明 +
+
0.708333
+
+
+
+
Turn 补链差值
+ 说明 +
+
0
+
+
+
+
工具闭合率
+ 说明 +
+
0.904762
+
+
+
+
Subagent 闭合率
+ 说明 +
+
1
+
+
+
+
Snapshot 缺失率
+ 说明 +
+
0
+
+
+
+
Orphan Event 率
+ 说明 +
+
0.007417
+
+
+
+ +
+

成本 - 每日总量

+
+
+
+
总 Prompt 输入 Tokens
+ 说明 +
+
3361798
+
+
+
+
总 Billed Tokens
+ 说明 +
+
3375883
+
+
+
+
Output Tokens
+ 说明 +
+
14085
+
+
+
+ +
+

成本 - 结构拆分

+
+
+
+
裸 Input Tokens
+ 说明 +
+
24
+
+
+
+
Cache Read Tokens
+ 说明 +
+
2209219
+
+
+
+
Cache Create Tokens
+ 说明 +
+
1152555
+
+
+
+ +
+

成本 - 主/子链路

+
+
+
+
主线程 Prompt 输入
+ 说明 +
+
2003202
+
+
+
+
Subagent Prompt 输入
+ 说明 +
+
1358596
+
+
+
+
Subagent 放大倍率
+ 说明 +
+
0.678212
+
+
+
+ +
+

成本 - 日均/效率

+
+
+
+
每个用户动作平均 Prompt 输入
+ 说明 +
+
1680899
+
+
+
+
每个用户动作平均 Billed
+ 说明 +
+
1687941.5
+
+
+
+
每个 Query 平均 Prompt 输入
+ 说明 +
+
480256.857
+
+
+
+
每个 Query 平均 Billed
+ 说明 +
+
482269
+
+
+
+ +
+

Loop / Turn

+
+
+
+
每日平均 Turn/Query
+ 说明 +
+
3.429
+
+
+
+
每日平均 Loop 终点
+ 说明 +
+
3.429
+
+
+
+
每日 Loop 终点 P95
+ 说明 +
+
5.4
+
+
+
+
多轮 Query 占比
+ 说明 +
+
1
+
+
+
+ +
+

延迟

+
+
+
+
Submit -> First Chunk
+ 说明 +
+
23922
+
+
+
+
Preprocess
+ 说明 +
+
49.917
+
+
+
+
Prompt.Build
+ 说明 +
+
5.042
+
+
+
+
Request -> First Chunk
+ 说明 +
+
16010.583
+
+
+
+
API 总时长
+ 说明 +
+
24755.125
+
+
+
+
工具执行平均时长
+ 说明 +
+
2988.526
+
+
+
+
Stop Hooks 平均时长
+ 说明 +
+
2.5
+
+
+
+
Subagent 生命周期均值
+ 说明 +
+
87090.75
+
+
+
+
User Action E2E
+ 说明 +
+
275601
+
+
+
+ +
+
+

压缩与上下文治理

+
+
+
+
Preprocess 前 Tokens
+ 说明 +
+
3481101
+
+
+
+
Preprocess 后 Tokens
+ 说明 +
+
3481101
+
+
+
+
总节省 Tokens
+ 说明 +
+
0
+
+
+
+
压缩收益率
+ 说明 +
+
0
+
+
+
+
Autocompact 触发率
+ 说明 +
+
0
+
+
+
+
HISTORY_SNIP Gate
+ 说明 +
+
样本中观察到命中
+
+
+
+
contextCollapse 启用状态
+ 说明 +
+
0.0
+
+
+
+
+

工具与恢复

+
+
+
+
工具成功率
+ 说明 +
+
0.904762
+
+
+
+
工具失败率
+ 说明 +
+
0
+
+
+
+
工具平均时长
+ 说明 +
+
2988.526
+
+
+
+
工具 P95 时长
+ 说明 +
+
17520.65
+
+
+
+
每个 Query 的工具数
+ 说明 +
+
6
+
+
+
+
每个 Subagent 的工具数
+ 说明 +
+
5.6
+
+
+
+
工具后续驱动率
+ 说明 +
+
0.944444
+
+
+
+
Prompt Too Long 恢复次数
+ 说明 +
+
0
+
+
+
+
Max Output Tokens 恢复次数
+ 说明 +
+
0
+
+
+
+
Token Budget Continue Rate
+ 说明 +
+
0
+
+
+
+
Stop Hook Block Rate
+ 说明 +
+
0
+
+
+
+
API Error Rate
+ 说明 +
+
0.142857
+
+
+
+
Tool Failure Terminal Rate
+ 说明 +
+
null
+
+
+
+
+ +
+

按 Source 成本拆分

+
+ + + + + + + + +
query_source | total_prompt_input_tokens | total_billed_tokens | daily_cost_share
repl_main_thread | 2003202 | 2007661 | 0.594707
session_memory | 1002017 | 1009847 | 0.299136
agent:builtin:Explore | 222858 | 223229 | 0.066125
extract_memories | 133721 | 135146 | 0.040033
+
+
+
+

按 Agent/Source 成本拆分

+
+ + + + + + + + +
agent_name | source_group | agent_total_prompt_input_tokens | agent_total_billed_tokens | agent_cost_share | agent_query_count | agent_avg_turns_per_query | agent_avg_loop_iter_end
main_thread | main_thread | 2003202 | 2007661 | 0.594707 | 2 | 5 | 5
session_memory | memory | 1002017 | 1009847 | 0.299136 | 3 | 2.667 | 2.667
Explore | agent | 222858 | 223229 | 0.066125 | 1 | 3 | 3
extract_memories | memory | 133721 | 135146 | 0.040033 | 1 | 3 | 3
+
+
+
+

最近用户动作

+
+ + + + + + +
user_action_id | duration_ms | query_count | main_thread_query_count | subagent_count | total_prompt_input_tokens | total_billed_tokens
c27626a1-a2d0-46f5-8faf-c39370255036 | 234875 | 3 | 1 | 2 | 1591885 | 1599671
036e4c84-da48-4da3-a5e3-1362d9eb6255 | 316327 | 4 | 1 | 3 | 1769913 | 1776212
+
+
+
+

按 Source Query 概览

+
+ + + + + + + + +
query_source | query_count | total_duration_ms | total_tool_calls
session_memory | 3 | 279359 | 19
repl_main_thread | 2 | 482197 | 14
agent:builtin:Explore | 1 | 25708 | 6
extract_memories | 1 | 69005 | 3
+
+
+
+

Subagent Reason 明细

+
+ + + + + + +
subagent_reason | agent_name | subagent_count | avg_duration_ms
session_memory | session_memory | 3 | 93119.333
extract_memories | extract_memories | 1 | 69005
+
+
+
+

工具按名称统计

+
+ + + + + + + + + + + +
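tool_name | tool_calls | tool_success_rate | tool_failure_rate | tool_avg_duration_ms | tool_p95_duration_ms
Edit | 16 | 1 | 0 | 30.875 | 50.75
Bash | 11 | 0.818182 | 0 | 9721 | 28616.8
Read | 7 | 0.857143 | 0 | 23.333 | 48.25
Glob | 5 | 1 | 0 | 5070.4 | 6671.2
Agent | 1 | 1 | 0 | 62 | 62
Grep | 1 | 0 | 0 | null | null
Write | 1 | 1 | 0 | 27 | 27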
+
+
+
+

工具按模式统计

+
+ + + + + +
tool_mode | tool_calls
streaming | 18
+
+
+
+

终止原因分布

+
+ + + + + + +
terminal_reason | query_count
completed | 6
aborted_tools | 1
+
+
+ +
+

指标说明

+

每张卡片右上角的“说明”都会跳到这里。这里优先解释最容易误解、最容易影响判断的指标,尤其是 token 成本口径。

+
+
+

事件数

+

含义:当天成功入库的结构化事件总数。

+

举例:例:375 代表这批样本里被 ETL 吃进去的事件一共有 375 条。

+
+
+

用户动作数

+

含义:能被同一个 user_action_id 串起来的用户动作数量。

+

举例:例:2 代表今天样本中有 2 次独立用户动作。

+
+
+

Query 数

+

含义:当天成功识别出来的 query 生命周期实体数量。

+

举例:例:6 代表这批样本里一共识别出 6 个 query。

+
+
+

Turn 数

+

含义:当天成功识别出来的 turn 数量。

+

举例:例:12 说明 query 们一共走了 12 轮 turn。

+
+
+

工具调用数

+

含义:当天工具调用总数。

+

举例:例:9 说明主线程和 subagent 合计触发了 9 次工具调用。

+
+
+

Subagent 数

+

含义:当天成功识别到的 subagent 生命周期数量。

+

举例:例:4 说明共有 4 次子代理任务被创建。

+
+
+

严格 Query 完成率

+

含义:只按原始 query_id 检查,同一个 query_id 是否同时出现 query.started 和 query.terminated。

+

举例:例:如果 terminated 丢了原始 query_id,这个值会偏低。

+
+
+

推断 Query 完成率

+

含义:允许使用 effective_query_id 补链后的 query 闭合率。

+

举例:例:它告诉你‘分析层是否还能把链串起来’,通常会高于严格口径。

+
+
+

Query 补链差值

+

含义:推断 Query 完成率减去严格(原生口径)Query 完成率。

+

举例:例:0.3 代表 ETL 补链帮你多恢复了 30% 的 query 闭合。

+
+
+

严格 Turn 闭合率

+

含义:只按原始 query_id + turn_id 检查 turn.started / before_turn / after_turn 三件套是否齐全。

+

举例:例:最后一轮缺 after_turn 时,这个值就会下降。

+
+
+

推断 Turn 闭合率

+

含义:允许用 effective_query_id 做补链后的 turn 闭合率。

+

举例:例:它反映 ETL 是否还能拼出 turn 生命周期。

+
+
+

Turn 补链差值

+

含义:推断 Turn 闭合率减去严格(原生口径)Turn 闭合率。

+

举例:例:值越大,说明缺 query_id/turn_id 的事件越多。

+
+
+

工具闭合率

+

含义:工具调用中,从 started 走到 completed 或 failed 的比例。

+

举例:例:1.0 代表工具调用生命周期全部闭合。

+
+
+

Subagent 闭合率

+

含义:subagent 同时出现 spawned 和 completed 的比例。

+

举例:例:1.0 代表子代理生命周期全部闭合。

+
+
+

Snapshot 缺失率

+

含义:事件引用了 snapshot_ref,但本地找不到对应快照文件的比例。

+

举例:例:0 代表这批样本没有缺快照。

+
+
+

Orphan Event 率

+

含义:无法挂靠到 user_action / query / turn / tool / subagent 的孤儿事件比例。

+

举例:例:值高时说明基础埋点键缺失严重。

+
+
+

裸 Input Tokens

+

含义:模型 usage 里的 input_tokens 原值,不包含 cache read 和 cache create。

+

举例:例:你看到它只有 153,并不代表这次输入很小,只代表不含 cache read 和 cache create 的裸 input 部分很小。

+
+
+

Cache Read Tokens

+

含义:本轮请求从 prompt cache 直接复用的输入 tokens。

+

举例:例:如果一个很长的 system prompt 被缓存复用,这里会很大,而裸 input 仍可能很小。

+
+
+

Cache Create Tokens

+

含义:本轮请求为了创建或刷新 prompt cache 而计入的输入 tokens。

+

举例:例:第一次跑一段长 prompt 时,这里可能会突然升高。

+
+
+

总 Prompt 输入 Tokens

+

含义:真正建议优先看的输入成本。= 裸 input + cache read + cache create。

+

举例:例:裸 input 153、cache read 245210、cache create 219661,则总 prompt 输入是 465024。

+
+
+

Output Tokens

+

含义:模型输出的 tokens 总量。

+

举例:例:如果 output 只有 3027,而总 prompt 输入是 46.5 万,说明成本瓶颈主要在输入侧。

+
+
+

总 Billed Tokens

+

含义:总 prompt 输入 tokens 再加 output tokens 后形成的总账单口径。

+

举例:例:465024 + 3027 = 468051。

+
+
+

主线程 Prompt 输入

+

含义:只统计 repl_main_thread 的总 prompt 输入 tokens。

+

举例:例:它能让你看清主线程本身有多贵。

+
+
+

Subagent Prompt 输入

+

含义:只统计非 repl_main_thread 的总 prompt 输入 tokens。

+

举例:例:如果它远高于主线程,说明 memory / side query 链路在放大成本。

+
+
+

Subagent 放大倍率

+

含义:subagent 总 prompt 输入 tokens / 主线程总 prompt 输入 tokens。

+

举例:例:5.3 代表 memory / side query 等子链路把输入成本放大到了主线程的 5.3 倍。

+
+
+

每个用户动作平均 Prompt 输入

+

含义:每天总 prompt 输入成本除以当天 user_action 数。

+

举例:例:它能快速回答‘平均一次用户动作要吃多少输入成本’。

+
+
+

每个用户动作平均 Billed

+

含义:每天总 billed tokens 除以当天 user_action 数。

+

举例:例:适合看整天的平均账单压力。

+
+
+

每个 Query 平均 Prompt 输入

+

含义:每天所有 query 的平均总 prompt 输入成本。

+

举例:例:它能区分‘今天 query 变多’和‘单个 query 变贵’。

+
+
+

每个 Query 平均 Billed

+

含义:每天所有 query 的平均 billed tokens。

+

举例:例:如果这个值升高,说明单个 query 的综合成本变重了。

+
+
+

Submit 到 First Chunk

+

含义:一次用户动作从当前可闭合起点到主线程 first chunk 的平均时长。

+

举例:例:这个值高说明用户等到首字节的时间长。

+
+
+

Preprocess 时长

+

含义:从预处理开始到 prompt.build.started 的平均时长。

+

举例:例:值高说明消息裁剪、压缩或上下文整理耗时较多。

+
+
+

Prompt.Build 时长

+

含义:从 prompt.build.started 到 prompt.build.completed 的平均时长。

+

举例:例:值高说明提示词拼装和序列化成本较高。

+
+
+

Request 到 First Chunk

+

含义:从 API 请求发起到首个流式 chunk 返回的平均时长。

+

举例:例:它主要反映模型首字延迟。

+
+
+

API 总时长

+

含义:单轮 request 从发起到流式完成的平均时长。

+

举例:例:如果它很高,再看工具/恢复链才能知道慢在哪里。

+
+
+

工具执行平均时长

+

含义:所有工具调用的平均执行时长。

+

举例:例:值高时通常要看慢工具明细。

+
+
+

Stop Hooks 平均时长

+

含义:stop hook 生命周期的平均时长。

+

举例:例:值高说明停止逻辑本身在拖慢响应。

+
+
+

Subagent 生命周期均值

+

含义:subagent 从 spawned 到 completed 的平均时长。

+

举例:例:值高通常意味着 memory 相关子链路比较慢。

+
+
+

User Action E2E

+

含义:一次用户动作从最早事件到最晚事件的端到端平均时长。

+

举例:例:这是用户真正感受到的总耗时。

+
+
+

每日平均 Turn/Query

+

含义:按 query 统计的平均 turn 数。

+

举例:例:值高可能意味着更常见的多轮循环。

+
+
+

每日平均 Loop 终点

+

含义:每个 query 的最大 loop_iter 再求平均。

+

举例:例:它能区分‘prompt 大’和‘因为多轮 loop 导致成本高’。

+
+
+

每日 Loop 终点 P95

+

含义:query_max_loop_iter 的 P95。

+

举例:例:它比平均值更容易看出少数长链 loop。

+
+
+

多轮 Query 占比

+

含义:query_max_loop_iter > 1 的 query 占比。

+

举例:例:0.6 代表 60% 的 query 至少循环了 2 轮。

+
+
+

Preprocess 前 Tokens

+

含义:进入上下文治理前的估算 token 总量。

+

举例:例:它是判断压缩压力的起点。

+
+
+

Preprocess 后 Tokens

+

含义:经过上下文治理后的估算 token 总量。

+

举例:例:和前值对比可以看出压缩是否生效。

+
+
+

总节省 Tokens

+

含义:预处理阶段累计节省的 tokens 总量。

+

举例:例:如果是 0,代表这批样本里压缩动作没有明显节省。

+
+
+

压缩收益率

+

含义:preprocess 前后 token 总量的节省比例。

+

举例:例:0.2 代表 preprocess 后上下文整体缩短了 20%。

+
+
+

Autocompact 触发率

+

含义:messages.autocompact.completed 中 compacted = true 的比例。

+

举例:例:值高说明上下文压力大,经常需要自动压缩。

+
+
+

HISTORY_SNIP Gate 状态

+

含义:当前样本里是否观察到 HISTORY_SNIP 命中。

+

举例:例:‘样本中观察到命中’说明这批日志里 gate 至少生效过一次。

+
+
+

contextCollapse 启用状态

+

含义:当前按源码真相给出。0 代表 disabled / stub,不应被解释成真实已启用。

+

举例:例:即使日志里有相关痕迹,这里仍必须显示 0。

+
+
+

工具成功率

+

含义:工具调用中 success = true 的比例。

+

举例:例:如果它下降,就该优先排查失败最多的工具。

+
+
+

工具失败率

+

含义:工具调用中 failed 的比例。

+

举例:例:它和工具成功率一起决定工具层健康度。

+
+
+

工具平均时长

+

含义:按所有工具调用计算的平均执行时长。

+

举例:例:适合快速判断工具层是否整体变慢。

+
+
+

工具 P95 时长

+

含义:工具执行时长的 P95。

+

举例:例:它比平均值更容易暴露长尾慢调用。

+
+
+

每个 Query 的工具数

+

含义:平均每个 query 触发多少次工具调用。

+

举例:例:值高说明 query 更依赖工具链。

+
+
+

每个 Subagent 的工具数

+

含义:平均每个 subagent 触发多少次工具调用。

+

举例:例:它能看出子代理是否重度依赖工具。

+
+
+

工具后续驱动率

+

含义:包含 tool_use 的 turn 中,最终 transition_out = next_turn 的比例。

+

举例:例:值高说明工具确实在驱动下一轮 loop。

+
+
+

Prompt Too Long 恢复次数

+

含义:恢复链里与 prompt_too_long 相关的尝试次数。

+

举例:例:如果这个值持续升高,说明 prompt 治理本身有问题。

+
+
+

Max Output Tokens 恢复次数

+

含义:恢复链里与 max_output_tokens 相关的尝试次数。

+

举例:例:值高说明输出上限策略经常撞线。

+
+
+

Token Budget Continue Rate

+

含义:token_budget.decision 中 action = continue 的比例。

+

举例:例:值高说明系统经常需要续跑才能完成响应。

+
+
+

Stop Hook Block Rate

+

含义:stop hook 最终阻止继续执行的比例。

+

举例:例:值高时说明停止逻辑频繁打断主链。

+
+
+

API Error Rate

+

含义:API 调用阶段错误的比例。

+

举例:例:这个值非零时要优先检查模型请求和网络错误。

+
+
+

Tool Failure Terminal Rate

+

含义:工具失败后直接导致 query 终止的比例。

+

举例:例:值高说明工具失败很难恢复。

+
+
+
+
+ + diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/\345\275\223\345\211\215\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\346\267\261\345\272\246\347\240\224\347\251\266\346\212\245\345\221\212.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/\345\275\223\345\211\215\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\346\267\261\345\272\246\347\240\224\347\251\266\346\212\245\345\221\212.md" new file mode 100644 index 0000000000..714962fcad --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/\345\275\223\345\211\215\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V1\346\267\261\345\272\246\347\240\224\347\251\266\346\212\245\345\221\212.md" @@ -0,0 +1,1269 @@ +# 当前可观测系统 V1 深度研究报告 + +## 0. 结论先行 + +当前这套可观测系统 V1,已经不是“概念验证”,而是一个**本地可用、链路基本闭合、指标口径基本可信**的调试分析系统。 + +如果只看当前代码和当前 `.observability` 数据,它已经具备下面这些能力: + +1. 能把一次 `user_action` 展开成主线程 query、subagent query、turn、tool call、snapshot 的完整本地事实链。 +2. 能把成本拆成 `Raw Input / Cache Read / Cache Create / Total Prompt Input / Output / Total Billed`,避免再把裸 `input_tokens` 误当总成本。 +3. 能同时给出 `strict` 和 `inferred` 两套完整性指标,区分“原生日志质量”和“ETL 补链能力”。 +4. 能按 `query_source / agent_name / subagent_reason` 看成本、loop、时长和生命周期。 +5. 能通过 `daily_summary.ps1`、`read_timeline.ps1`、`explain_action.ps1` 和 DuckDB 直接做 action 级调试。 + +按当前最新样本,系统状态是: + +1. `query` 闭合率:`1.0` +2. `turn` 闭合率:`1.0` +3. `tool` 生命周期闭合率:`1.0` +4. `subagent` 生命周期闭合率:`1.0` +5. `snapshot_missing_rate = 0.0` +6. 当前唯一残留的完整性风险信号不是断链,而是 `orphan_event_rate = 0.011952` + +换句话说,**V1 的主链完整性问题已经基本修平了**。 +它现在更像是一个“本地 agent 调试工作台”,而不是仅仅一堆日志文件。 + +--- + +## 1. 本报告使用的真相来源 + +本报告优先级如下: + +1. 当前源码 +2. 当前 DuckDB ETL 定义 +3. 当前 `.observability` 实际数据 +4. 
旧文档任务书和旧自查文档 + +这意味着: + +- 老文档里如果和当前代码不一致,以当前代码为准 +- 老文档里的旧样本数字,如果和当前库不一致,以当前库为准 + +本轮我实际对照的核心文件是: + +- 事件写入层:[harness.ts](/abs/path/E:/claude-code/src/observability/harness.ts:1) +- query 主循环与 turn/state 埋点:[query.ts](/abs/path/E:/claude-code/src/query.ts:1) +- ETL 主定义:[build_duckdb_etl.ts](/abs/path/E:/claude-code/scripts/observability/build_duckdb_etl.ts:1) +- CLI 摘要入口:[daily_summary.ps1](/abs/path/E:/claude-code/scripts/observability/daily_summary.ps1:1) +- 既有文档: + - [事件Schema文档.md](/abs/path/E:/claude-code/ObservrityTask/事件Schema文档.md:1) + - [DuckDB Schema文档.md](/abs/path/E:/claude-code/ObservrityTask/DuckDB%20Schema文档.md:1) + - [指标定义文档.md](/abs/path/E:/claude-code/ObservrityTask/指标定义文档.md:1) + - [可观测系统V1自查结果.md](/abs/path/E:/claude-code/ObservrityTask/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV1%E8%87%AA%E6%9F%A5%E7%BB%93%E6%9E%9C.md:1) + - [可观测系统V1 Bug解决方案.md](/abs/path/E:/claude-code/ObservrityTask/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV1%20Bug%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88.md:1) + - [可观测系统V1方向A实现任务书.md](/abs/path/E:/claude-code/ObservrityTask/2026-04-23-%E6%96%B9%E5%90%91A/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV1%E6%96%B9%E5%90%91A%E5%AE%9E%E7%8E%B0%E4%BB%BB%E5%8A%A1%E4%B9%A6.md:1) + - [方向A执行清单.md](/abs/path/E:/claude-code/ObservrityTask/2026-04-23-%E6%96%B9%E5%90%91A/%E6%96%B9%E5%90%91A%E6%89%A7%E8%A1%8C%E6%B8%85%E5%8D%95.md:1) + +--- + +## 2. V1 的系统定位 + +这套系统不是线上 APM,也不是公司级分布式 observability 平台。 +它的真实定位是: + +**一个以本地 `.observability/*.jsonl + snapshots/*.json + DuckDB` 为事实源的 agent 调试系统。** + +它主要解决 3 类问题: + +1. 一次用户动作到底触发了哪些 query、哪些 subagent、哪些工具? +2. 这次动作的成本到底花在主线程、记忆链路还是其他 agent/source? +3. 这次运行是不是完整闭合了,哪里断了,哪里只是补链出来的? + +所以它的核心特征不是“集中式收集”,而是: + +- 本地落盘 +- 可重建 +- 可审计 +- 可做 action 级回放 + +--- + +## 3. 系统结构 + +### 3.1 第一层:事件层 + +事件层由 [harness.ts](/abs/path/E:/claude-code/src/observability/harness.ts:1) 负责,输出到: + +- `.observability/events-YYYYMMDD.jsonl` +- `.observability/snapshots/*.json` + +每条事件至少有: + +1. 
时间:`ts_wall`、`ts_mono_ms` +2. 结构键:`user_action_id`、`query_id`、`turn_id`、`tool_call_id`、`subagent_id` +3. 维度键:`query_source`、`subagent_type`、`subagent_reason` +4. 业务负载:`payload` + +大对象不直接塞进事件,而是写 sidecar snapshot,再在事件里通过 `snapshot_ref` 引用。 + +这层解决的是: + +- “发生了什么” +- “什么时候发生” +- “这条事件属于谁” + +### 3.2 第二层:ETL 层 + +ETL 由 [build_duckdb_etl.ts](/abs/path/E:/claude-code/scripts/observability/build_duckdb_etl.ts:1) 构建,写入: + +- [\.observability/observability_v1.duckdb](/abs/path/E:/claude-code/.observability/observability_v1.duckdb:1) + +它做了几件关键事: + +1. 自动发现最新 `events-*.jsonl` +2. 把 JSONL 展开成结构化表 +3. 为缺失 `query_id` 的事件计算 `effective_query_id` +4. 解析 snapshot 中的 usage,构建统一成本事实层 `usage_facts` +5. 在 DuckDB 中生成基础表和指标视图 + +### 3.3 第三层:消费层 + +消费层主要有 4 个入口: + +- [daily_summary.ps1](/abs/path/E:/claude-code/scripts/observability/daily_summary.ps1:1) +- [build_dashboard.ps1](/abs/path/E:/claude-code/scripts/observability/build_dashboard.ps1:1) +- [read_timeline.ps1](/abs/path/E:/claude-code/scripts/observability/read_timeline.ps1:1) +- [explain_action.ps1](/abs/path/E:/claude-code/scripts/observability/explain_action.ps1:1) + +这几层对应不同问题: + +1. `daily_summary`: 今天整体运行质量怎么样 +2. `dashboard`: 各指标面板化查看 +3. `read_timeline`: 一次 action 的事件时间线 +4. `explain_action`: 一次 action 的 Markdown + Mermaid 报告 + +--- + +## 4. 这套系统里最重要的几个 ID + +如果不理解这些 ID,后面的指标就很容易读乱。 + +### 4.1 `user_action_id` + +这是整棵执行树的根。 + +它代表: + +**一次用户动作。** + +你表面上“发了一次 query”,系统内部其实通常不是只跑一条 query,而是: + +1. 主线程一条 query +2. 若干 `session_memory` +3. 若干 `extract_memories` +4. 
未来可能还有 `side_query`、`away_summary` 等 + +因此: + +- `user_action_id` 最适合做“整次动作级”的成本与链路分析 +- 以后要自己看一次完整运行,应该优先从它开始 + +### 4.2 `query_id` + +这是单条 query 生命周期的 ID。 + +它代表: + +**一条 query 链是谁。** + +它不是循环次数,也不是一个 UI 输入的唯一键。 + +### 4.3 `effective_query_id` + +这是 ETL 补链后的 query ID。 + +存在它的原因是: + +- 某些原始事件没有落 `query_id` +- 但它们在时间上、`user_action_id` 上、`query_source` 上明显属于某条 query +- ETL 就根据时序和维度把它补挂到正确 query 上 + +所以: + +- `query_id` 是原始真相 +- `effective_query_id` 是可分析真相 + +### 4.4 `turn_id` + +这是 query 内的一轮。 + +当前系统里,它通常是 `turn-N`。 + +更准确的理解是: + +- `query_id` = 这条 query 是谁 +- `turn_id` = 这条 query 当前在第几轮结构节点 +- `loop_iter` = 这轮是第几次循环 + +### 4.5 `tool_call_id` + +这是一次工具调用生命周期的键。 + +有了它,可以把: + +- `assistant.tool_use.detected` +- `tool.enqueued` +- `tool.execution.started` +- `tool.execution.completed/failed` + +串成一条完整工具链。 + +### 4.6 `subagent_id` + +这是一个具体 subagent 实例的键。 + +它适合回答: + +- 这次开了几个 subagent +- 每个 subagent 活了多久 +- 这个 subagent 挂在哪条 query 上 + +### 4.7 `subagent_reason` + +这是后来专门补上的字段。 + +它的意义不是“来源”,而是: + +**为什么要开这个 subagent。** + +这比 `query_source` 更贴近分析语义。 + +--- + +## 5. 核心表与视图 + +### 5.1 基础事实表 + +当前最重要的基础表是: + +1. `events_raw` +2. `queries` +3. `turns` +4. `tools` +5. `subagents` +6. `recoveries` +7. `snapshots_index` +8. `usage_facts` +9. 
`daily_rollups` + +它们的职责可以概括为: + +#### `events_raw` + +最底层原始事件事实表。 + +它解决: + +- 每条事件的原始内容是什么 +- 哪些事件缺失了原始 `query_id` +- ETL 补出来的 `effective_query_id` 是什么 + +#### `queries` + +按 query 聚合后的生命周期表。 + +它适合回答: + +- 一次 action 里有几条 query +- 每条 query 跑了多久 +- 最后是 `completed` 还是其他终态 +- query 的原生/推断完整性是否闭合 + +#### `turns` + +按 `query + turn` 聚合后的 turn 表。 + +它适合回答: + +- 一条 query 一共循环了几轮 +- 每轮有没有工具 +- 每轮是 `next_turn` 还是 `end_turn` +- turn 是否闭合 + +#### `tools` + +按 `tool_call_id` 聚合后的工具生命周期表。 + +它适合回答: + +- 哪些工具被调用了 +- 哪个工具执行失败了 +- 工具平均时长是多少 +- 是否出现“detected 但没执行完”的 dangling tool + +#### `subagents` + +按 `subagent_id` 聚合后的子 agent 生命周期表。 + +它适合回答: + +- 启动了哪些 subagent +- 为什么启动 +- 生命周期是否闭合 +- 平均时长和消息事件数 + +#### `usage_facts` + +这是成本模块最关键的事实层。 + +它统一了两类 usage 来源: + +1. 主线程:从 `api.stream.completed -> response_snapshot_ref -> response.json` 取 usage +2. subagent:从 `subagent.completed.payload` 取 usage + +这个统一抽象是 V1 能把成本算对的关键。 + +### 5.2 聚合视图 + +当前最重要的聚合视图是: + +1. `user_actions` +2. `metrics_integrity_daily` +3. `metrics_cost_daily` +4. `metrics_loop_daily` +5. `metrics_latency_daily` +6. `metrics_compression_daily` +7. `metrics_tools_daily` +8. `metrics_recovery_daily` +9. `query_source_cost_share_daily` +10. `agent_cost_daily` +11. `subagent_reason_daily` +12. `system_flags` + +--- + +## 6. 指标分类总览 + +如果用一句话概括 V1 的指标体系,它可以分成 6 大类: + +1. 完整性指标 +2. 成本指标 +3. Loop / Turn 行为指标 +4. 延迟指标 +5. 压缩 / 上下文治理指标 +6. 工具与恢复指标 + +下面逐类讲。 + +--- + +## 7. 
完整性指标 + +完整性指标回答的不是“贵不贵”,而是: + +**这次运行是不是能被完整、可信地还原。** + +### 7.1 `user_action_main_query_coverage_rate` + +定义: + +- 有 `user_action_id` 的动作里,能否至少串到一条主线程 query + +用途: + +- 判断最上层根键是否能稳定挂到主线程 + +当前值: + +- `1.0` + +解释: + +- 当前样本里,每次动作都能找到主线程 query + +### 7.2 `strict_query_completion_rate` + +定义: + +- 只按原始 `query_id` 统计,既有 `query.started` 又有 `query.terminated` 的 query 占比 + +它回答: + +- 原生日志本身的 query 闭合质量如何 + +### 7.3 `inferred_query_completion_rate` + +定义: + +- 允许用 `effective_query_id` 补链后的 query 完成率 + +它回答: + +- 即使原生日志不完美,ETL 能不能把 query 补还原出来 + +### 7.4 `query_completeness_gap` + +定义: + +- `inferred - strict` + +它回答: + +- 当前数据质量有多少是靠 ETL 补链补出来的 + +解读规则: + +1. `strict = inferred = 高` + - 最理想,说明原生日志和分析层都好 +2. `strict 低,inferred 高` + - 分析还能做,但埋点原生质量一般 +3. `strict = inferred = 低` + - 真正断链了 + +当前值: + +- `strict_query_completion_rate = 1.0` +- `inferred_query_completion_rate = 1.0` +- `query_completeness_gap = 0.0` + +说明: + +- 当前样本里 query 层已经是“原生闭合”,不是靠补链勉强维持 + +### 7.5 `strict_turn_state_closure_rate` + +定义: + +- 一个 turn 是否具备: + - `turn.started` + - `state.snapshot.before_turn` + - `state.snapshot.after_turn` + - 或者被 ETL 认定为正常终态 turn + +这里要特别注意: + +当前 V1 已经做过一次重要修复: + +1. 源码在 query 终止前补发终态 `state.snapshot.after_turn` +2. ETL 也允许“`end_turn + query.terminated` 但没有 after_turn”的旧日志被视为闭合终态 turn + +所以它现在比旧文档里写的“必须机械要求三件事同时存在”更贴近真实。 + +### 7.6 `tool_lifecycle_closure_rate` + +定义: + +- 工具调用里,是否从 `assistant.tool_use.detected` 最终闭合到 `completed/failed` + +它回答: + +- 有没有 dangling tool call + +### 7.7 `subagent_lifecycle_closure_rate` + +定义: + +- `subagent.spawned -> subagent.completed` 的闭合率 + +### 7.8 `snapshot_missing_rate` + +定义: + +- 事件引用了 snapshot,但快照文件实际不存在的比例 + +### 7.9 `orphan_event_rate` + +定义: + +- 没法挂到任何 action/query/turn/tool/subagent 主体上的孤儿事件比例 + +当前值: + +- `0.011952` + +解释: + +- 当前系统链路已经基本闭合,但仍然有极少量“无法归属”的事件 +- 这不是主链断裂,但它说明观测层还没做到 100% 无孤儿 + +### 7.10 如何用完整性指标判断系统健康 + +建议顺序: + +1. 先看 `strict_query_completion_rate` +2. 再看 `strict_turn_state_closure_rate` +3. 
再看 `tool_lifecycle_closure_rate` +4. 再看 `subagent_lifecycle_closure_rate` +5. 最后看 `orphan_event_rate` + +如果这 5 个都健康,说明: + +- 主链能串起来 +- turn 能闭合 +- 工具没有悬空 +- subagent 没断 +- 孤儿事件足够少,事件基本都能挂到主体上 + +snapshot 证据是否完整不在这 5 个指标里,需要另看 `snapshot_missing_rate`(当前为 `0.0`)。 + +当前样本在这组指标上的结论是: + +- 主链闭合:健康 +- 工具闭合:健康 +- subagent 闭合:健康 +- turn 闭合:健康 +- 仅剩少量孤儿事件:轻微残留风险 + +--- + +## 8. 成本指标 + +这是你前面最关注、也是最容易被误读的一块。 + +### 8.1 成本模块的核心原则 + +当前 V1 已经明确: + +**不能再把 `input_tokens` 当总输入成本。** + +真实的 prompt 输入成本应拆成: + +1. `Raw Input Tokens` +2. `Cache Read Tokens` +3. `Cache Create Tokens` + +再合成: + +4. `Total Prompt Input Tokens` + +然后加上: + +5. `Output Tokens` + +得到: + +6. `Total Billed Tokens` + +### 8.2 成本事实是怎么来的 + +主线程和 subagent 的 usage 来源不同: + +#### 主线程 + +从: + +- `api.stream.completed.payload.response_snapshot_ref` +- 对应 `response.json` + +取 request-level usage + +#### subagent + +从: + +- `subagent.completed.payload` + +取汇总 usage + +这两路统一进入 `usage_facts`。 + +### 8.3 成本指标分层 + +当前 V1 已按 4 层组织成本指标。 + +#### A. 用户动作级 + +主要看: + +1. `user_action_total_raw_input_tokens` +2. `user_action_total_cache_read_tokens` +3. `user_action_total_cache_create_tokens` +4. `user_action_total_prompt_input_tokens` +5. `user_action_total_output_tokens` +6. `user_action_total_billed_tokens` + +这组回答: + +- 一次动作到底花了多少 + +#### B. 主/子链路级 + +主要看: + +1. `main_thread_total_prompt_input_tokens` +2. `subagent_total_prompt_input_tokens` +3. `subagent_amplification_ratio` + +这组回答: + +- 真正贵的是主线程还是子链路 +- subagent 链到底把主线程放大了多少 + +#### C. 每日总量级 + +主要看: + +1. `daily_total_prompt_input_tokens` +2. `daily_total_billed_tokens` +3. 按 source 和 agent 的日成本分摊 + +#### D. 平均/效率级 + +主要看: + +1. `avg_total_prompt_input_tokens_per_user_action` +2. `avg_total_billed_tokens_per_user_action` +3. `avg_total_prompt_input_tokens_per_query` +4. `avg_total_billed_tokens_per_query` +5. `cost_per_successful_completed_query` + +### 8.4 当前样本的真实成本状态 + +当前最新样本是: + +- `1` 个 user action +- `4` 条 query +- `3` 个 subagent + +它的成本结果是: + +1. `total_prompt_input_tokens = 1221782` +2. `total_billed_tokens = 1233637` +3. 
`output_tokens = 11855` +4. `raw_input_tokens = 14` +5. `cache_read_input_tokens = 604666` +6. `cache_create_input_tokens = 617102` + +这个结果说明了两件很关键的事: + +1. 真正高的是输入侧,不是输出侧 +2. 输入侧的大头不是裸 input,而是 cache read / cache create + +### 8.5 主/子链路成本 + +当前值: + +1. `main_thread_total_prompt_input_tokens = 376698` +2. `subagent_total_prompt_input_tokens = 845084` +3. `subagent_amplification_ratio = 2.243399` + +解释: + +- 这次动作里,子链路输入成本约为主线程的 `2.24x` +- 当前样本不是“主线程最贵”,而是 memory 子链路更贵 + +### 8.6 按 source 成本拆分 + +当前样本按 `query_source` 看: + +1. `session_memory = 506781` +2. `repl_main_thread = 376698` +3. `extract_memories = 338303` + +解读: + +- 最贵的是 `session_memory` +- 第二贵是主线程 +- 第三是 `extract_memories` + +这对调试非常有价值,因为它直接说明: + +**当前成本大头不是用户眼前的那条主线程,而是后台记忆链路。** + +### 8.7 如何用成本指标分析 agent 运行状态 + +建议顺序: + +1. 先看 `total_prompt_input_tokens` +2. 再拆 `raw / cache_read / cache_create` +3. 再看 `main_thread vs subagent` +4. 再看 `query_source_cost_share_daily` +5. 最后看 `agent_cost_daily` + +典型分析方式: + +#### 情况 A:`raw_input` 小,但 `total_prompt_input` 巨大 + +解释: + +- 不是这次用户输入太长 +- 是稳定前缀、记忆链、缓存重建很贵 + +#### 情况 B:`subagent_amplification_ratio > 1` + +解释: + +- 子链路比主线程更贵 +- 要去看 `session_memory`、`extract_memories` 等 source + +#### 情况 C:主线程贵,但 subagent 不贵 + +解释: + +- 可能是 prompt 主体和工具结果本身很大 +- 不一定是 memory 链的问题 + +--- + +## 9. Loop / Turn 指标 + +这组指标解决的是: + +**成本高,到底是因为 prompt 大,还是因为 loop 多。** + +### 9.1 核心指标 + +1. `daily_avg_turns_per_query` +2. `daily_avg_loop_iter_end` +3. `daily_p95_loop_iter_end` +4. `daily_queries_with_loop_iter_gt_1_rate` + +在 agent 维度上,还会看: + +1. `agent_query_count` +2. `agent_avg_turns_per_query` +3. `agent_avg_loop_iter_end` +4. `agent_p95_loop_iter_end` +5. `agent_queries_with_loop_iter_gt_1_rate` + +### 9.2 当前样本 + +当前值: + +1. `avg_turns_per_query = 3.5` +2. `avg_loop_iter_end = 3.5` +3. `p95_loop_iter_end = 4.85` +4. `loop_iter > 1 的 query 占比 = 1.0` + +解释: + +- 这批 query 没有“只跑一轮”的 +- 当前样本是明显的多轮 agentic loop 场景 + +### 9.3 按 agent 看 loop + +当前值: + +1. 
`main_thread`: `avg_turns_per_query = 5.0` +2. `session_memory`: `avg_turns_per_query = 3.0` +3. `extract_memories`: `avg_turns_per_query = 3.0` + +解释: + +- 主线程比子链路更“多轮” +- 但成本上子链路更贵 + +这正说明为什么 loop 指标和成本指标要一起看: + +- 主线程更“绕” +- 但子链路更“贵” + +### 9.4 如何用 loop 指标判断状态 + +1. 如果 `avg_loop_iter_end` 很高,但成本不高 + - 可能是多轮轻量探索 +2. 如果 `avg_loop_iter_end` 不高,但成本很高 + - 可能是单轮 prompt 超大 +3. 如果两者都高 + - 这是最重的运行形态 + +--- + +## 10. 延迟指标 + +延迟指标回答的是: + +**慢在哪里。** + +### 10.1 当前延迟指标 + +1. `submit_to_first_chunk_ms` +2. `preprocess_duration_ms` +3. `prompt_build_duration_ms` +4. `api_first_chunk_latency_ms` +5. `api_total_duration_ms` +6. `tool_execution_duration_ms` +7. `stop_hook_duration_ms` +8. `subagent_duration_ms` +9. `user_action_e2e_duration_ms` + +### 10.2 当前样本 + +当前值: + +1. `submit_to_first_chunk = 9821 ms` +2. `preprocess = 66.357 ms` +3. `prompt_build = 6.071 ms` +4. `request -> first_chunk = 10367.643 ms` +5. `api_total_duration = 27723 ms` +6. `tool_execution_avg = 3842.12 ms` +7. `stop_hooks_avg = 4.75 ms` +8. `subagent_duration_avg = 101019.667 ms` +9. `user_action_e2e = 264735 ms` + +### 10.3 如何用延迟指标判断问题 + +#### 如果 `preprocess` 高 + +说明: + +- message 压缩、附件、上下文治理前处理太重 + +#### 如果 `prompt_build` 高 + +说明: + +- prompt 构建本身偏重 + +#### 如果 `api_first_chunk` 高 + +说明: + +- provider 侧首包慢 + +#### 如果 `tool_execution_avg` 高 + +说明: + +- 卡在工具,不是卡在模型 + +#### 如果 `subagent_duration` 高 + +说明: + +- 后台链路长,尤其要看 memory 子链 + +#### 如果 `e2e` 很高,但前几项都不高 + +说明: + +- 多数时间是多轮 loop 累积出来的,不是单个阶段特别慢 + +--- + +## 11. 压缩与上下文治理指标 + +这组指标回答的是: + +**上下文治理到底有没有省 token。** + +### 11.1 当前指标 + +1. `preprocess_tokens_before_total` +2. `preprocess_tokens_after_total` +3. `tokens_saved_total` +4. `compression_gain_ratio` +5. `tool_result_budget_saved_tokens` +6. `history_snip_saved_tokens` +7. `microcompact_saved_tokens` +8. `autocompact_saved_tokens` +9. `autocompact_trigger_rate` +10. `history_snip_gate_on_rate` + +### 11.2 当前样本 + +当前值: + +1. `preprocess_tokens_before_total = 1279853` +2. 
`preprocess_tokens_after_total = 1279853` +3. `tokens_saved_total = 0` +4. `compression_gain_ratio = 0.0` +5. 各分项 saved tokens 全是 `0` + +### 11.3 如何解释“都是 0” + +这不等于系统坏了。 + +更准确的解释是: + +- 当前这批样本里,这些治理动作没有产生实际 token 节省 +- 或者当前样本没触发对应压缩路径 + +所以: + +- `0` 本身不是 bug +- 但它说明当前样本没有从这组治理动作里拿到收益 + +### 11.4 显式状态指标 + +当前还有一组“状态型指标”: + +1. `contextCollapse_enabled_gauge` +2. `contextCollapse_attempted` +3. `contextCollapse_committed` +4. `history_snip_gate_state` +5. `history_snip_gate_on_rate` + +当前值: + +1. `contextCollapse_enabled_gauge = 0.0` +2. `contextCollapse_attempted = 0` +3. `contextCollapse_committed = 0` +4. `history_snip_gate_state = 样本中观察到命中` +5. `history_snip_gate_on_rate = 1.0` + +解释: + +- `contextCollapse` 当前仍然是 disabled / stub 状态表达 +- 不应把它误读成“真实启用但没命中” + +--- + +## 12. 工具指标 + +这组指标回答的是: + +**工具有没有跑通,哪些工具最重,工具是不是有效驱动了 loop。** + +### 12.1 当前指标 + +1. `tool_calls_total` +2. `tool_success_rate` +3. `tool_failure_rate` +4. `tool_avg_duration_ms` +5. `tool_p95_duration_ms` +6. `context_update_rate` +7. `tools_per_query` +8. `tools_per_subagent` +9. `tool_followup_turn_ratio` + +还有两个明细视图: + +1. `tool_calls_by_name` +2. `tool_calls_by_mode` + +### 12.2 当前样本 + +当前值: + +1. `tool_calls_total = 25` +2. `tool_success_rate = 1.0` +3. `tool_failure_rate = 0.0` +4. `tool_avg_duration_ms = 3842.12` +5. `tool_p95_duration_ms = 10428.2` +6. `tools_per_query = 6.25` +7. `tools_per_subagent = 6.0` +8. `tool_followup_turn_ratio = 1.0` + +### 12.3 工具明细 + +当前样本工具分布: + +1. `Edit`: `12` +2. `Bash`: `5` +3. `Read`: `4` +4. `Write`: `2` +5. `Glob`: `1` +6. `Grep`: `1` + +解释: + +- 当前样本是典型“编辑 + Bash + 文件读写”型 agent 运行 + +### 12.4 如何用工具指标分析状态 + +#### 如果 `tool_success_rate` 低 + +优先看: + +- 哪个工具失败多 +- 是否导致 query 终止 + +#### 如果 `tool_followup_turn_ratio` 低 + +说明: + +- 模型虽然发了 tool_use,但没真正转成有效下一轮 +- 可能存在工具悬空或异常分支 + +#### 如果 `tools_per_query` 高 + +说明: + +- 不是单轮回答型,而是强工具型 agent + +--- + +## 13. 恢复与异常指标 + +这组指标回答的是: + +**系统有没有在异常、恢复和预算控制路径上频繁抖动。** + +### 13.1 当前指标 + +1. 
`prompt_too_long_recovery_attempts` +2. `prompt_too_long_recovery_success_rate` +3. `max_output_tokens_recovery_attempts` +4. `max_output_tokens_recovery_success_rate` +5. `token_budget_continue_rate` +6. `stop_hook_block_rate` +7. `api_error_rate` +8. `tool_failure_terminal_rate` +9. `exporter_failure_rate` +10. `dropped_event_rate` + +### 13.2 当前样本 + +几乎全是 `0` 或 `NULL`: + +1. 没有 `prompt_too_long` 恢复 +2. 没有 `max_output_tokens` 恢复 +3. 没有 token budget continue +4. 没有 stop hook block +5. 没有 API error +6. 没有工具失败导致终止 + +解释: + +- 当前样本是一次“正常完成型”运行 +- 不适合用来验证恢复链指标,但能说明恢复链没有异常触发 + +--- + +## 14. 目前系统的“高可用”如何理解 + +这里必须先说清楚: + +这套系统不是分布式服务,所以“高可用”不应理解成: + +- 多副本 +- 容灾切换 +- 99.99% SLA + +对于 V1,更合理的定义是: + +**本地观测链是否稳定可写、可重建、可闭合、可用于即时 debug。** + +按这个定义,当前 V1 的高可用由 5 件事决定。 + +### 14.1 事件是否实时落盘 + +答案: + +- 是 + +事件由 [harness.ts](/abs/path/E:/claude-code/src/observability/harness.ts:1) 直接顺序写入 JSONL 和 snapshots。 + +### 14.2 数据库是否会读旧库 + +答案: + +- 当前已基本解决 + +原因: + +- ETL 自动发现最新 `events-*.jsonl` +- `build_meta` 记录源文件、大小、mtime、built_at +- `daily_summary.ps1` 和 `build_dashboard.ps1` 会先做 freshness 校验 + +这意味着: + +- 当前不会再默认悄悄读旧库 + +### 14.3 完整性是否闭合 + +当前样本答案: + +- `query`: 闭合 +- `turn`: 闭合 +- `tool`: 闭合 +- `subagent`: 闭合 +- `snapshot`: 无缺失 + +这是 V1 当前最大的进步。 + +### 14.4 解释链是否可用 + +答案: + +- 已可用 + +现在可以通过: + +1. `daily_summary.ps1` +2. `read_timeline.ps1` +3. `explain_action.ps1` +4. DuckDB 直接查询 + +把一次 action 的结构和路径读出来。 + +### 14.5 当前仍有哪些“可用性约束” + +当前最现实的运行约束有 3 个: + +1. DuckDB 文件锁严格 + - summary、dashboard、手工 DuckDB 查询不要并行跑 +2. `contextCollapse` 仍是状态型占位,不是真实启用链 +3. action 级解释工具已经有了,但中文化和摘要层仍不够强 + +所以我会把当前 V1 的高可用判断为: + +**对于本地单用户调试场景,已经达到“高可用”;对于长期团队化分析场景,还不是终局。** + +--- + +## 15. 如何用这些指标分析“当前 agent 的运行状态” + +如果你以后想快速判断“今天这个 agent 跑得怎么样”,建议固定用下面顺序。 + +### 步骤 1:先看完整性 + +看: + +1. `strict_query_completion_rate` +2. `strict_turn_state_closure_rate` +3. `tool_lifecycle_closure_rate` +4. `subagent_lifecycle_closure_rate` +5. `orphan_event_rate` + +目的: + +- 先确认这批数据值不值得信 + +### 步骤 2:再看成本 + +看: + +1. 
`total_prompt_input_tokens` +2. `raw / cache_read / cache_create` +3. `main_thread vs subagent` +4. `query_source_cost_share_daily` +5. `agent_cost_daily` + +目的: + +- 先判断贵不贵 +- 再判断贵在哪 + +### 步骤 3:再看 loop + +看: + +1. `avg_turns_per_query` +2. `avg_loop_iter_end` +3. `agent_avg_turns_per_query` +4. `agent_avg_loop_iter_end` + +目的: + +- 判断“贵”是因为大 prompt,还是因为多轮循环 + +### 步骤 4:再看延迟 + +看: + +1. `submit_to_first_chunk_ms` +2. `api_first_chunk_latency_ms` +3. `tool_execution_duration_ms` +4. `subagent_duration_ms` +5. `user_action_e2e_duration_ms` + +目的: + +- 判断慢在哪一段 + +### 步骤 5:如果需要 drill-down,再看 action 级链路 + +用: + +- [read_timeline.ps1](/abs/path/E:/claude-code/scripts/observability/read_timeline.ps1:1) +- [explain_action.ps1](/abs/path/E:/claude-code/scripts/observability/explain_action.ps1:1) + +目的: + +- 把这一次动作展开成 query/subagent/tool/DAG + +--- + +## 16. 当前 V1 的优势 + +我认为当前 V1 最强的地方有 5 个: + +1. 已经从“只有原始日志”升级到了“结构化事实层 + action 级回放” +2. 成本口径已经从误导性的裸 `input_tokens` 修到了可信状态 +3. query / turn / tool / subagent 闭合问题已经基本修平 +4. `subagent_reason`、`agent_name`、`source_group` 让 agent 维度分析变得真正可做 +5. 现在已经能把“一个 UI 动作”还原成一棵可解释的 DAG + +--- + +## 17. 当前 V1 仍然缺什么 + +虽然 V1 已经能用,但如果从“深度调试工作台”的标准看,它还缺下面这些层。 + +### 17.1 因果解释层仍偏弱 + +现在能看到: + +- 分支在哪里发生 +- 哪个 subagent 被启动 + +但还不够稳定地回答: + +- 为什么此刻决定启动它 + +### 17.2 内容摘要层仍不足 + +现在更擅长看结构,不够擅长看“主要内容摘要”。 + +### 17.3 中文化阅读体验还不完整 + +当前 `explain_action.ps1` 已能生成 Mermaid + 报告,但默认报告还是英文结构说明。 + +### 17.4 少量孤儿事件仍然存在 + +`orphan_event_rate` 还不是 `0` + +### 17.5 `contextCollapse` 仍是状态型占位 + +它现在还不是完整行为观测链。 + +--- + +## 18. 我对当前 V1 的最终判断 + +如果只问一句: + +**当前可观测系统 V1 到底处于什么阶段?** + +我的判断是: + +**它已经完成了从“模板”到“可实战调试系统”的跃迁。** + +目前它最适合的用途是: + +1. 分析一次用户动作到底触发了什么 +2. 判断主线程和子链路谁更贵 +3. 判断链路是否完整闭合 +4. 追查某次 debug run 的结构性问题 + +目前它还不适合的用途是: + +1. 作为团队级线上分布式 observability 平台 +2. 作为最终形态的内容理解系统 +3. 作为完全实时、无锁、多人并发分析平台 + +如果把 V1 打一个阶段判断,我会给: + +- 结构化观测能力:高 +- 成本可信度:高 +- 完整性可信度:高 +- 本地 debug 可用性:高 +- 内容摘要能力:中 +- 因果解释能力:中 +- 平台化/工程化成熟度:中 + +--- + +## 19. 
附:当前样本的一句话画像 + +当前库中最新样本是: + +- `1` 个 user action +- `4` 条 query +- `14` 个 turn +- `25` 个 tool call +- `3` 个 subagent + +它的运行画像是: + +1. 链路完整闭合 +2. 成本主要花在输入侧 +3. 输入成本主要来自 cache read/create +4. 子链路成本大于主线程 +5. 所有 query 都是多轮 loop +6. 没有明显恢复或异常链 + +所以它更像一次: + +**“链路健康、结构复杂、成本偏高但并非异常”的典型 agent 运行样本。** diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/DuckDB Schema\346\226\207\346\241\243.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/DuckDB Schema\346\226\207\346\241\243.md" new file mode 100644 index 0000000000..35accc0cff --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/DuckDB Schema\346\226\207\346\241\243.md" @@ -0,0 +1,203 @@ +# DuckDB Schema 文档 + +数据库位置: +- `E:\claude-code\.observability\observability_v1.duckdb` + +重建入口: +- `powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\rebuild_observability_db.ps1` + +当前基础表与核心视图如下。 + +## `events_raw` + +用途: +- 保存原始事件的一行一条结构化记录 +- 补充 `effective_query_id`,用于修正少数 `query_id = null` 但可按时序和 `query_source` 推断归属的事件 + +关键字段: +- `event_idx` +- `ts_wall` +- `ts_wall_ms` +- `event_name` +- `user_action_id` +- `query_id` +- `effective_query_id` +- `turn_id` +- `subagent_id` +- `tool_call_id` +- `payload_json` +- `snapshot_refs_json` +- `raw_event_json` + +## `queries` + +用途: +- 按 `query_id` 聚合主线程 query 与 subagent query + +关键字段: +- `query_id` +- `user_action_id` +- `query_source` +- `agent_name` +- `source_group` +- `subagent_id` +- `subagent_type` +- `subagent_reason` +- `started_at` +- `ended_at` +- `duration_ms` +- `terminal_reason` +- `stop_reason` +- `turn_count` +- `tool_call_count` +- `event_count` + +## `turns` + +用途: +- 按 `effective_query_id + turn_id` 聚合 turn +- 当前数据里 `turn_id` 不是全局唯一,所以使用 `turn_key` + +关键字段: +- `turn_key` +- `query_id` +- `turn_id` +- 
`user_action_id` +- `subagent_id` +- `query_source` +- `loop_iter_start` +- `loop_iter_end` +- `duration_ms` +- `transition_out` +- `termination_reason` +- `stop_reason` +- `tool_call_count` + +## `tools` + +用途: +- 按 `tool_call_id` 聚合工具调用生命周期 + +关键字段: +- `tool_call_id` +- `user_action_id` +- `query_id` +- `subagent_id` +- `tool_name` +- `enqueued_at` +- `started_at` +- `completed_at` +- `duration_ms` +- `success` +- `failure_reason` + +## `subagents` + +用途: +- 按 `subagent_id` 聚合 forked agent 生命周期 + +关键字段: +- `subagent_id` +- `query_id` +- `user_action_id` +- `subagent_type` +- `subagent_reason` +- `query_source` +- `agent_name` +- `source_group` +- `spawned_at` +- `completed_at` +- `duration_ms` +- `transcript_enabled` +- `message_event_count` +- `completed` + +## `recoveries` + +用途: +- 收集恢复链、stop hooks、非 `next_turn` 的状态跳转 + +当前纳入: +- `stop_hooks.started` +- `stop_hooks.completed` +- `state.transitioned` 且 `to_transition != 'next_turn'` +- 名称中包含 `recovery` 的事件 + +关键字段: +- `recovery_key` +- `event_name` +- `user_action_id` +- `query_id` +- `turn_id` +- `subagent_id` +- `transition_to` +- `reason` +- `payload_json` + +## `snapshots_index` + +用途: +- 索引当前保留快照文件,并记录引用次数、hash、大小、类别 + +关键字段: +- `snapshot_ref` +- `file_name` +- `relative_path` +- `absolute_path` +- `exists` +- `size_bytes` +- `sha256` +- `referenced_count` +- `first_event_ts` +- `last_event_ts` +- `category` + +## `daily_rollups` + +用途: +- 提供按天的快速概览,供 summary CLI 和 dashboard 使用 + +关键字段: +- `event_date` +- `event_count` +- `user_action_count` +- `query_count` +- `turn_count` +- `tool_call_count` +- `subagent_count` +- `snapshot_ref_count` +- `latest_event_ts` + +说明: +- `daily_rollups` 是按当前目标事件文件生成的日级摘要,不应写死某一天 +- 当前到底是哪一天、多少条 query,应以 `daily_summary.ps1` 或库内实时查询结果为准 + +## 指标视图 + +当前还新增了以下 DuckDB 视图,供 CLI、dashboard、链路阅读器复用: + +- `user_actions` +- `usage_facts` +- `agent_cost_daily` +- `query_source_cost_share` +- `query_source_cost_share_daily` +- `subagent_reason_daily` +- `metrics_integrity_daily` +- 
`metrics_cost_daily` +- `metrics_latency_daily` +- `metrics_loop_daily` +- `metrics_compression_daily` +- `metrics_tools_daily` +- `metrics_recovery_daily` +- `tool_calls_by_name` +- `tool_calls_by_mode` +- `terminal_reason_distribution` +- `system_flags` + +## 脚本入口 + +- 重建库:`powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\rebuild_observability_db.ps1` +- 每日 summary:`powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\daily_summary.ps1` +- 链路阅读器:`powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\read_timeline.ps1 -UserActionId <user_action_id>` +- 单次动作解释器:`powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\explain_action.ps1 -UserActionId <user_action_id>` +- 生成 dashboard:`powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\build_dashboard.ps1` diff --git "a/ObservrityTask/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" similarity index 91% rename from "ObservrityTask/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" index b28ea86724..901ab9e7d8 100644 --- "a/ObservrityTask/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\344\272\213\344\273\266Schema\346\226\207\346\241\243.md" @@ -49,6 +49,7 @@ src/observability/harness.ts | `parent_turn_id` | 父 turn,当前预留 | | `subagent_id` | 子 agent ID | | `subagent_type` | 子 agent 类型或 fork label | +| `subagent_reason` | 子 agent 启动原因,优先由调用点显式传入 | | `query_source` | query source | | `request_id` | API request id | | 
`tool_call_id` | 工具调用 id | @@ -248,6 +249,11 @@ domain.action.stage - `final_message_count` - `transition` +终态约定: + +- 对正常 `end_turn -> query.terminated` 的收尾分支,当前实现会在终止前补发一次 `state.snapshot.after_turn` +- ETL 同时兼容旧日志;即使旧样本缺少这条终态 `after_turn`,也会把“`end_turn + query.terminated`”识别为闭合终态 turn + --- ## 7. 当前未完全覆盖项 @@ -271,6 +277,7 @@ domain.action.stage - 事件写本地文件,旁路现有 analytics - 允许未来补更多字段,但尽量不破坏现有命名 - 快照只做证据存储,主事件保留摘要 +- `user_action_id` 是整次用户动作的根键;阅读完整执行树时,应优先用它串主线程与所有 subagent --- diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\214\207\346\240\207\345\256\232\344\271\211\346\226\207\346\241\243.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\214\207\346\240\207\345\256\232\344\271\211\346\226\207\346\241\243.md" new file mode 100644 index 0000000000..1ad0f5bea2 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\214\207\346\240\207\345\256\232\344\271\211\346\226\207\346\241\243.md" @@ -0,0 +1,325 @@ +# 指标定义文档 + +本轮口径基于: +- 当前目标事件文件:`.observability/events-YYYYMMDD.jsonl` +- 本地快照目录:`E:\claude-code\.observability\snapshots` +- 本地分析库:`E:\claude-code\.observability\observability_v1.duckdb` + +说明: +- 事件文件不再写死某一天;默认由 ETL 自动发现最新文件,也支持显式指定日期或文件 +- 具体当天数值应以 `daily_summary.ps1` 输出为准 + +## 这版重设计解决什么问题 + +上一版最容易让人误判的地方是:把 `input_tokens` 当成“总输入成本”。 + +实际上现在这套 usage 口径里,输入相关成本要拆成 3 块: +- `裸 input tokens` +- `cache read input tokens` +- `cache create input tokens` + +真正建议优先看的输入成本指标是: +- `total_prompt_input_tokens = 裸 input + cache read + cache create` + +所以如果你看到: +- `裸 input = 153` +- `output = 3027` +- `cache read = 245210` +- `cache create = 219661` + +并不代表“output 比 input 大”,而是代表: +- 你之前看的只是“裸 input” +- 真正的总输入成本其实是 `465024` +- 所以这批样本的瓶颈明显在输入侧,不在输出侧 + +## 总原则 + +- 只使用本地 `.observability` 数据,不依赖远端 exporter。 +- 完整性指标同时提供 `严格口径` 和 
`推断口径`,避免把补链成功误判为原始日志质量良好。 +- 成本指标优先按 `user_action_id` 汇总,再按 `query_source` 分解。 +- disabled / gated 节点必须显式显示为状态,不得默认为“已工作”。 + +## 完整性指标 + +### `strict_query_completion_rate` +- 来源:`metrics_integrity_daily` +- 定义:只按原始 `query_id` 计算,同时出现 `query.started` 和 `query.terminated` 的 query 占比 +- 用途:衡量原始事件链本身是否闭合 + +### `inferred_query_completion_rate` +- 来源:`metrics_integrity_daily` +- 定义:允许使用 `effective_query_id` 补链后的 query 完成率 +- 用途:衡量分析层是否还能把 query 链补起来 + +### `strict_turn_state_closure_rate` +- 来源:`metrics_integrity_daily` +- 定义:只按原始 `query_id + turn_id` 计算的 turn 闭合率 +- 当前闭合判定: + - 标准路径:同时具有 `turn.started`、`state.snapshot.before_turn`、`state.snapshot.after_turn` + - 终态兼容路径:若本轮以 `stop_reason = end_turn` 正常结束,且随后出现 `query.terminated`,即使旧日志缺少终态 `after_turn`,也视为闭合 +- 用途:衡量 turn 生命周期是否原始闭合 + +### `inferred_turn_state_closure_rate` +- 来源:`metrics_integrity_daily` +- 定义:允许使用 `effective_query_id` 补链后的 turn 闭合率 +- 用途:衡量 ETL 是否还能还原 turn 级链路 + +### `tool_lifecycle_closure_rate` +- 来源:`metrics_integrity_daily` +- 定义:工具调用中,出现 `tool.execution.started` 且最终出现 `tool.execution.completed/failed` 的占比 + +### `subagent_lifecycle_closure_rate` +- 来源:`metrics_integrity_daily` +- 定义:subagent 中,同时具有 `subagent.spawned` 和 `subagent.completed` 的占比 + +### `snapshot_missing_rate` +- 来源:`metrics_integrity_daily` +- 定义:事件引用了 `snapshot_ref`,但本地快照文件缺失的比例 + +### `orphan_event_rate` +- 来源:`metrics_integrity_daily` +- 定义:同时缺失 `user_action_id / effective_query_id / turn_id / tool_call_id / subagent_id` 的事件占比 +- 用途:衡量无法挂靠到任何主链实体的“孤儿事件”比例 + +## 成本指标 + +### `user_action_total_raw_input_tokens` +- 来源:`metrics_cost_daily` +- 定义:按 `user_action_id` 汇总的 `input_tokens` +- 解释:这是“裸输入”,不是总输入成本 + +### `user_action_total_cache_read_tokens` +- 来源:`metrics_cost_daily` +- 定义:按 `user_action_id` 汇总的 `cache_read_input_tokens` +- 解释:代表本轮从 prompt cache 直接读取复用的输入成本 + +### `user_action_total_cache_create_tokens` +- 来源:`metrics_cost_daily` +- 定义:按 `user_action_id` 汇总的 `cache_creation_input_tokens` +- 解释:代表本轮为了创建或刷新 prompt cache 
而计入的输入成本 + +### `user_action_total_prompt_input_tokens` +- 来源:`metrics_cost_daily` +- 定义:`raw_input + cache_read + cache_create` +- 解释:这是当前 dashboard 默认建议优先看的“总输入成本” +- 举例: + - `raw = 153` + - `cache_read = 245210` + - `cache_create = 219661` + - `total_prompt_input_tokens = 465024` + +### `user_action_total_output_tokens` +- 来源:`metrics_cost_daily` +- 定义:按 `user_action_id` 汇总的 `output_tokens` + +### `user_action_total_billed_tokens` +- 来源:`metrics_cost_daily` +- 定义:`total_prompt_input_tokens + output_tokens` +- 解释:这是最接近总账单的统一口径 + +### `query_source_cost_share` +- 来源:`query_source_cost_share` / `query_source_cost_share_daily` +- 定义:按 `query_source` 聚合成本后,占当日总 billed 成本的比例 +- 最低要求区分: + - `repl_main_thread` + - `extract_memories` + - `session_memory` + - `away_summary` + - `side_query` + +### `main_thread_total_prompt_input_tokens` +- 来源:`metrics_cost_daily` +- 定义:`query_source = repl_main_thread` 的总 prompt 输入 tokens + +### `subagent_total_prompt_input_tokens` +- 来源:`metrics_cost_daily` +- 定义:非 `repl_main_thread` 的总 prompt 输入 tokens + +### `subagent_amplification_ratio` +- 来源:`metrics_cost_daily` +- 定义:`subagent_total_prompt_input_tokens / main_thread_total_prompt_input_tokens` +- 用途:衡量 memory 链、side query 等子链路把输入成本放大了多少倍 + +### `cost_per_successful_completed_query` +- 来源:`metrics_cost_daily` +- 定义:`total_billed_tokens / 完成态 completed query 数` +- 用途:衡量“完成一个有效 query 平均要花多少 tokens” + +## 延迟指标 + +### `submit_to_first_chunk_ms` +- 来源:`metrics_latency_daily` +- 定义:同一 `user_action_id` 下,从当前可闭合起点到主线程 `api.stream.first_chunk` 的平均时长 + +### `preprocess_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:`state.snapshot.before_turn -> prompt.build.started` + +### `prompt_build_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:`prompt.build.started -> prompt.build.completed` + +### `api_first_chunk_latency_ms` +- 来源:`metrics_latency_daily` +- 定义:`api.request.started -> api.stream.first_chunk` + +### `api_total_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:`api.request.started -> 
api.stream.completed` + +### `tool_execution_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:工具执行平均时长 + +### `stop_hook_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:`stop_hooks.started -> stop_hooks.completed` 平均时长 + +### `subagent_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:subagent 生命周期平均时长 + +### `user_action_e2e_duration_ms` +- 来源:`metrics_latency_daily` +- 定义:一次用户动作从最早事件到最晚事件的端到端平均时长 + +## 压缩与上下文治理指标 + +### `preprocess_tokens_before_total` +- 来源:`metrics_compression_daily` +- 定义:压缩前估算 tokens 总量 + +### `preprocess_tokens_after_total` +- 来源:`metrics_compression_daily` +- 定义:压缩后估算 tokens 总量 + +### `tokens_saved_total` +- 来源:`metrics_compression_daily` +- 定义:总节省 tokens 数量 + +### `compression_gain_ratio` +- 来源:`metrics_compression_daily` +- 定义:`(before - after) / before` +- 用途:衡量 preprocess 整体压缩收益 + +### `tool_result_budget_saved_tokens` +### `history_snip_saved_tokens` +### `microcompact_saved_tokens` +### `autocompact_saved_tokens` +- 来源:`metrics_compression_daily` +- 定义:按压缩环节分项统计节省的 tokens + +### `autocompact_trigger_rate` +- 来源:`metrics_compression_daily` +- 定义:`messages.autoconpact.completed.payload.compacted = true` 的比例 + +### `history_snip_gate_on_rate` +- 来源:`metrics_compression_daily` / `system_flags` +- 定义:样本内出现 HISTORY_SNIP 命中的比例或状态化结果 + +### `contextCollapse_enabled_gauge` +- 来源:`metrics_compression_daily` / `system_flags` +- 当前定义:固定按源码现实显示 + - `1` 表示启用 + - `0` 表示 disabled / stub +- 当前样本解释:必须视为 `0` + +### `contextCollapse_attempted` +### `contextCollapse_committed` +- 来源:`system_flags` +- 当前定义:在源码事实源未打开前,显式展示为 `0` +- 用途:避免把 disabled / stub 状态误读成“暂时没有命中” + +## 工具行为指标 + +### `tool_calls_total` +- 来源:`metrics_tools_daily` +- 定义:工具调用总数 + +### `tool_calls_by_name` +- 来源:`tool_calls_by_name` +- 定义:按 `tool_name` 聚合调用次数、成功率、失败率、平均耗时、P95 耗时 + +### `tool_calls_by_mode` +- 来源:`tool_calls_by_mode` +- 定义:按 `tool_mode` 聚合 +- 主要模式: + - `streaming` + - `run_tools` + +### `tool_success_rate` +### `tool_failure_rate` +### `tool_avg_duration_ms` +### 
`tool_p95_duration_ms` +- 来源:`metrics_tools_daily` + +### `context_update_rate` +- 来源:`metrics_tools_daily` +- 定义:工具调用后产生 `tool.context.updated` 的比例 + +### `tools_per_query` +- 来源:`metrics_tools_daily` +- 定义:平均每个 query 的工具调用数 + +### `tools_per_subagent` +- 来源:`metrics_tools_daily` +- 定义:平均每个 subagent 的工具调用数 + +### `tool_followup_turn_ratio` +- 来源:`metrics_tools_daily` +- 定义:包含 `assistant.tool_use.detected` 的 turn 中,最终进入 `next_turn` 的比例 +- 用途:衡量工具是否真的驱动了 loop + +## 恢复与异常指标 + +### `prompt_too_long_recovery_attempts` +### `prompt_too_long_recovery_success_rate` +- 来源:`metrics_recovery_daily` +- 定义:按恢复链事件名匹配 `prompt_too_long` + +### `max_output_tokens_recovery_attempts` +### `max_output_tokens_recovery_success_rate` +- 来源:`metrics_recovery_daily` +- 定义:按恢复链事件名匹配 `max_output_tokens` + +### `token_budget_continue_rate` +- 来源:`metrics_recovery_daily` +- 定义:`token_budget.decision.payload.action = 'continue'` 的比例 + +### `stop_hook_block_rate` +- 来源:`metrics_recovery_daily` +- 定义:`stop_hooks.completed.payload.prevent_continuation = true` 的比例 + +### `terminal_reason_distribution` +- 来源:`terminal_reason_distribution` +- 定义:按 query 终止原因的分布 + +### `api_error_rate` +- 来源:`metrics_recovery_daily` +- 定义:API 调用阶段错误事件占比 + +### `tool_failure_terminal_rate` +- 来源:`metrics_recovery_daily` +- 定义:工具失败后直接导致终止的比例 + +### `exporter_failure_rate` +### `dropped_event_rate` +- 来源:`metrics_recovery_daily` +- 定义:按显式失败事件统计 + +## 当前样本的已知限制 + +### 1. 完整性不能只看推断口径 +- 原因:`effective_query_id` 会补链 +- 处理方式:dashboard 同时展示严格口径和推断口径 + +### 2. 成本必须优先看 `total_prompt_input_tokens` +- 原因:cache read / cache create 在当前样本里明显大于裸 input +- 处理方式:dashboard 把它放在成本区核心位置,并配中文说明 + +### 3. `contextCollapse` 不能误报为已启用 +- 原因:源码核对结论是 disabled / stub +- 处理方式:统一显示 `contextCollapse_enabled_gauge = 0` + +### 4. 
dashboard 的每个关键指标都要能解释 +- 处理方式:每张卡片右上角都有“说明”链接,跳到页面底部的中文含义与举例说明 diff --git "a/ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" similarity index 88% rename from "ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" index a580375a53..86cb09df72 100644 --- "a/ObservrityTask/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/02-Schema\344\270\216\346\214\207\346\240\207/\346\227\245\345\277\227\351\230\205\350\257\273\346\225\231\345\255\246.md" @@ -6,6 +6,12 @@ 2. 主线程在第几轮进入了什么状态 3. 子 agent、工具调用、stop hooks、恢复链分别在哪一步介入 +当前最推荐的阅读根键不是单条 `query_id`,而是: + +- `user_action_id`:整次用户动作的根 +- `query_id`:其中某条 query 分支 +- `subagent_id`:某个具体子 agent 实例 + --- ## 1. 日志放在哪里 @@ -74,7 +80,19 @@ ### 3.1 看一次完整用户提交 -先搜: +先找 `user_action_id`,再看整条动作时间线。最方便的入口是: + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\explain_action.ps1 -Latest +``` + +或者: + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\read_timeline.ps1 -UserActionId <你的user_action_id> +``` + +如果只想直接 grep 原始事件,再搜: ```powershell Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"submit.attempted"|"event":"input.process.completed"|"event":"query.started"|"event":"query.terminated"' @@ -237,6 +255,12 @@ Select-String -Path .\.observability\events-*.jsonl -Pattern '"event":"subagent. ## 5. 
推荐命令 +### 5.0 找最近一次用户动作 + +```powershell +E:\claude-code\tools\duckdb\duckdb.exe -json E:\claude-code\.observability\observability_v1.duckdb "select user_action_id, started_at, duration_ms, query_count, subagent_count, total_prompt_input_tokens from user_actions order by started_at desc limit 10;" +``` + ### 5.1 只看事件名和时间 ```powershell @@ -338,3 +362,7 @@ Get-Content .\.observability\events-*.jsonl | Select-String '"subagent_id":" 2026-04-22T19:02:10.156Z +- Local: 2026-04-23 02:57:45 -> 2026-04-23 03:02:10 +- duration_ms: 264735 +- query_count: 4 +- subagent_count: 3 +- tool_call_count: 25 +- total_prompt_input_tokens: 1221782 +- total_billed_tokens: 1233637 + +## Summary + +This action expanded into 4 queries and 3 subagents. + +## Mermaid DAG + +```mermaid +flowchart TD + UA["user_action
9ddd1bff
02:57:45 -> 03:02:10"] + Q_7493179f["main_thread
7493179f
5 turns
completed"] + Q_cf5ef87f["session_memory
cf5ef87f
4 turns
completed"] + Q_8477fa68["session_memory
8477fa68
2 turns
completed"] + Q_a18e7d35["extract_memories
a18e7d35
3 turns
completed"] + T_7493179f_turn_1["turn-1
Glob + Grep + Read
loop=1"] + T_cf5ef87f_turn_1["turn-1
Edit + Edit + Edit + Edit + Edit + Edit + Edit
loop=1"] + T_7493179f_turn_2["turn-2
Bash
loop=2"] + T_cf5ef87f_turn_2["turn-2
Bash
loop=2"] + T_cf5ef87f_turn_3["turn-3
Bash
loop=3"] + T_7493179f_turn_3["turn-3
Read + Bash
loop=3"] + T_cf5ef87f_turn_4["turn-4
end_turn
loop=4"] + T_7493179f_turn_4["turn-4
Bash
loop=4"] + T_8477fa68_turn_1["turn-1
Edit + Edit + Edit + Edit + Edit
loop=1"] + T_7493179f_turn_5["turn-5
end_turn
loop=5"] + T_a18e7d35_turn_1["turn-1
Read + Read
loop=1"] + T_a18e7d35_turn_2["turn-2
Write + Write
loop=2"] + T_8477fa68_turn_2["turn-2
end_turn
loop=2"] + T_a18e7d35_turn_3["turn-3
end_turn
loop=3"] + Q_7493179f --> T_7493179f_turn_1 + T_7493179f_turn_1 --> T_7493179f_turn_2 + T_7493179f_turn_2 --> T_7493179f_turn_3 + T_7493179f_turn_3 --> T_7493179f_turn_4 + T_7493179f_turn_4 --> T_7493179f_turn_5 + Q_cf5ef87f --> T_cf5ef87f_turn_1 + T_cf5ef87f_turn_1 --> T_cf5ef87f_turn_2 + T_cf5ef87f_turn_2 --> T_cf5ef87f_turn_3 + T_cf5ef87f_turn_3 --> T_cf5ef87f_turn_4 + Q_8477fa68 --> T_8477fa68_turn_1 + T_8477fa68_turn_1 --> T_8477fa68_turn_2 + Q_a18e7d35 --> T_a18e7d35_turn_1 + T_a18e7d35_turn_1 --> T_a18e7d35_turn_2 + T_a18e7d35_turn_2 --> T_a18e7d35_turn_3 + S_1["spawn session_memory
02:58:01"] + T_7493179f_turn_1 --> S_1 --> Q_cf5ef87f + S_2["spawn session_memory
03:00:19"] + T_7493179f_turn_4 --> S_2 --> Q_8477fa68 + S_3["spawn extract_memories
03:00:46"] + T_7493179f_turn_5 --> S_3 --> Q_a18e7d35 + UA --> Q_7493179f +``` + +## Query List + +### main_thread 7493179f-d7ba-4302-bdf5-281cbc86aa9c + +- query_source: repl_main_thread +- subagent_reason: repl_main_thread +- time: 2026-04-23 02:57:45 -> 2026-04-23 03:00:46 +- turn_count: 5 +- max_loop_iter: 5.0 +- tool_call_count: 7 +- terminal_reason: completed +- completeness: strict=true, inferred=true + +- turn-1: tools=Glob + Grep + Read, stop_reason=tool_use, transition_out=next_turn, duration_ms=18769, strict_closed=true +- turn-2: tools=Bash, stop_reason=tool_use, transition_out=next_turn, duration_ms=92324, strict_closed=true +- turn-3: tools=Read + Bash, stop_reason=tool_use, transition_out=next_turn, duration_ms=21222, strict_closed=true +- turn-4: tools=Bash, stop_reason=tool_use, transition_out=next_turn, duration_ms=34112, strict_closed=true +- turn-5: tools=none, stop_reason=end_turn, transition_out=, duration_ms=14503, strict_closed=true + +### session_memory cf5ef87f-e227-4f65-8c28-035da80e85e8 + +- query_source: session_memory +- subagent_reason: session_memory +- time: 2026-04-23 02:58:01 -> 2026-04-23 02:59:59 +- turn_count: 4 +- max_loop_iter: 4.0 +- tool_call_count: 9 +- terminal_reason: completed +- completeness: strict=true, inferred=true + +- turn-1: tools=Edit + Edit + Edit + Edit + Edit + Edit + Edit, stop_reason=tool_use, transition_out=next_turn, duration_ms=68370, strict_closed=true +- turn-2: tools=Bash, stop_reason=tool_use, transition_out=next_turn, duration_ms=16677, strict_closed=true +- turn-3: tools=Bash, stop_reason=tool_use, transition_out=next_turn, duration_ms=24937, strict_closed=true +- turn-4: tools=none, stop_reason=end_turn, transition_out=, duration_ms=8046, strict_closed=true + +### session_memory 8477fa68-0c8d-49de-a6db-22274577b1b2 + +- query_source: session_memory +- subagent_reason: session_memory +- time: 2026-04-23 03:00:19 -> 2026-04-23 03:02:00 +- turn_count: 2 +- max_loop_iter: 2.0 +- tool_call_count: 5 +- 
terminal_reason: completed +- completeness: strict=true, inferred=true + +- turn-1: tools=Edit + Edit + Edit + Edit + Edit, stop_reason=tool_use, transition_out=next_turn, duration_ms=59493, strict_closed=true +- turn-2: tools=none, stop_reason=end_turn, transition_out=, duration_ms=41634, strict_closed=true + +### extract_memories a18e7d35-8d66-4c2c-af96-3b9bf36d1f51 + +- query_source: extract_memories +- subagent_reason: extract_memories +- time: 2026-04-23 03:00:46 -> 2026-04-23 03:02:10 +- turn_count: 3 +- max_loop_iter: 3.0 +- tool_call_count: 4 +- terminal_reason: completed +- completeness: strict=true, inferred=true + +- turn-1: tools=Read + Read, stop_reason=tool_use, transition_out=next_turn, duration_ms=22639, strict_closed=true +- turn-2: tools=Write + Write, stop_reason=tool_use, transition_out=next_turn, duration_ms=55224, strict_closed=true +- turn-3: tools=none, stop_reason=end_turn, transition_out=, duration_ms=5927, strict_closed=true + +## Branch Points + +- 2026-04-23 02:58:01: spawn session_memory, child_query=cf5ef87f-e227-4f65-8c28-035da80e85e8, attached after main-thread turn-1 by time inference +- 2026-04-23 03:00:19: spawn session_memory, child_query=8477fa68-0c8d-49de-a6db-22274577b1b2, attached after main-thread turn-4 by time inference +- 2026-04-23 03:00:46: spawn extract_memories, child_query=a18e7d35-8d66-4c2c-af96-3b9bf36d1f51, attached after main-thread turn-5 by time inference + +## Reading SOP + +1. Find the target action in user_actions. +2. Use queries to list all agents and branches under that action. +3. Use turns to inspect loop count and turn termination. +4. Use tools to inspect concrete tool calls per turn. +5. Use events_raw for key events only: query.started, api.stream.completed, subagent.spawned, query.terminated. +6. If you need content, follow snapshot refs into .observability/snapshots. 
+ diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_9ddd1bff_\346\265\201\347\250\213\350\247\243\346\236\220.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_9ddd1bff_\346\265\201\347\250\213\350\247\243\346\236\220.md" new file mode 100644 index 0000000000..9b70487e5e --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_9ddd1bff_\346\265\201\347\250\213\350\247\243\346\236\220.md" @@ -0,0 +1,296 @@ +# User Action 流程解析 + +本报告严格依据当前 `.observability/events-20260422.jsonl` 与 DuckDB 中对应记录生成。 +注意:事件文件名按 `UTC` 日期命名,因此北京时间 `2026-04-23 02:57:45` 到 `03:02:10` 的这次动作,落在 `events-20260422.jsonl` 中是正常现象。 + +## 基本信息 + +- `user_action_id`: `9ddd1bff-65b6-414f-bf04-418809eb6ff7` +- 时间范围: + - `UTC`: `2026-04-22T18:57:45.421Z` -> `2026-04-22T19:02:10.156Z` + - `Asia/Shanghai`: `2026-04-23 02:57:45` -> `2026-04-23 03:02:10` +- 总时长: `264735 ms` +- 该次动作展开结果: + - `1` 条主线程 query + - `2` 条 `session_memory` 子链路 query + - `1` 条 `extract_memories` 子链路 query + - `25` 次工具调用 + +## 一句话总结 + +你表面上只发起了一次用户动作,但系统内部把它展开成了 `4` 条 query。 +主线程一共跑了 `5` 个 turn,在推进过程中分叉出了 `2` 条 `session_memory`,主线程完成后又分叉出 `1` 条 `extract_memories`。 +因此这次不是单链条,而是一棵带并发子链路的 DAG。 + +## Mermaid DAG + +下面这段可以直接复制到 Mermaid Live Editor 或支持 Mermaid 的网站查看。 + +```mermaid +flowchart TD + UA["user_action
9ddd1bff-65b6-414f-bf04-418809eb6ff7
02:57:45 -> 03:02:10 (Asia/Shanghai)"] + + Q0["main_thread query
7493179f-d7ba-4302-bdf5-281cbc86aa9c
5 turns
completed"] + Q1["session_memory #1
cf5ef87f-e227-4f65-8c28-035da80e85e8
4 turns
completed"] + Q2["session_memory #2
8477fa68-0c8d-49de-a6db-22274577b1b2
2 turns
completed"] + Q3["extract_memories
a18e7d35-8d66-4c2c-af96-3b9bf36d1f51
3 turns
completed"] + + T1["main turn-1
Glob + Grep + Read
stop_reason=tool_use"] + T2["main turn-2
Bash
stop_reason=tool_use"] + T3["main turn-3
Read + Bash
stop_reason=tool_use"] + T4["main turn-4
Bash
stop_reason=tool_use"] + T5["main turn-5
end_turn
query.terminated=completed"] + + S1["spawn session_memory #1
18:58:01.847Z"] + S2["spawn session_memory #2
19:00:19.775Z"] + S3["spawn extract_memories
19:00:46.360Z"] + + M11["sm#1 turn-1
Edit x7"] + M12["sm#1 turn-2
Bash x1"] + M13["sm#1 turn-3
Bash x1"] + M14["sm#1 turn-4
end_turn"] + + M21["sm#2 turn-1
Edit x5"] + M22["sm#2 turn-2
end_turn"] + + E1["extract turn-1
Read x2"] + E2["extract turn-2
Write x2"] + E3["extract turn-3
end_turn"] + + UA --> Q0 + Q0 --> T1 --> T2 --> T3 --> T4 --> T5 + + T1 --> S1 --> Q1 + T4 --> S2 --> Q2 + T5 --> S3 --> Q3 + + Q1 --> M11 --> M12 --> M13 --> M14 + Q2 --> M21 --> M22 + Q3 --> E1 --> E2 --> E3 +``` + +## 自然语言流程解释 + +### 1. 主线程启动 + +- `18:57:45.443Z` + - 主线程 `query.started` + - `query_id = 7493179f-d7ba-4302-bdf5-281cbc86aa9c` +- `18:57:45.453Z` + - 主线程 `turn-1` 开始 + +这说明这次用户动作先进入主线程 query。 + +### 2. 主线程 turn-1 先做探索 + +在 `turn-1` 中,assistant 决定调用了三种工具: + +- `18:58:00.990Z` `Glob` +- `18:58:01.474Z` `Grep` +- `18:58:01.521Z` `Read` + +随后: + +- `18:58:01.825Z` + - `api.stream.completed` + - `stop_reason = tool_use` + +这表示第一轮不是直接回答完成,而是先产生了一批工具调用。 + +### 3. 第一处分支:启动 session_memory #1 + +紧接着主线程第一轮工具之后: + +- `18:58:01.847Z` + - `subagent.spawned` + - `subagent_reason = session_memory` + - `subagent_id = a00ed066c632706a7` +- `18:58:01.862Z` + - 该 subagent 自己的 `query.started` + - `query_id = cf5ef87f-e227-4f65-8c28-035da80e85e8` + +这就是第一个明显分支点。 +主线程没有停下来,而是继续跑;同时后台起了一条 `session_memory` 子链路。 + +### 4. 主线程继续推进 turn-2 / turn-3 / turn-4 + +主线程接着继续: + +- `turn-2` + - 检测到 `Bash` + - `18:58:17.271Z` `api.stream.completed` + - `stop_reason = tool_use` + +- `turn-3` + - 检测到 `Read + Bash` + - `18:59:57.288Z` `api.stream.completed` + - `stop_reason = tool_use` + +- `turn-4` + - 检测到 `Bash` + - `19:00:19.646Z` `api.stream.completed` + - `stop_reason = tool_use` + +也就是说,主线程本质上是一个多轮 agentic loop: + +- 前四轮都先决定继续用工具 +- 没有在前四轮直接结束 + +### 5. 第一条 session_memory 在后台跑了 4 轮 + +`session_memory #1` 的 query 是: + +- `query_id = cf5ef87f-e227-4f65-8c28-035da80e85e8` +- 时间:`18:58:01.862Z -> 18:59:59.894Z` +- 共 `4` 个 turn + +它的主要动作是: + +- `turn-1`: `Edit x7` +- `turn-2`: `Bash x1` +- `turn-3`: `Bash x1` +- `turn-4`: `end_turn` +- 最终:`query.terminated = completed` + +这说明第一条 `session_memory` 是一个比较重的后台修改链路。 + +### 6. 
第二处分支:再次启动 session_memory #2 + +在主线程 `turn-4` 结束后: + +- `19:00:19.775Z` + - 第二次 `subagent.spawned(session_memory)` +- `19:00:19.794Z` + - 第二条 `session_memory` 自己的 `query.started` + - `query_id = 8477fa68-0c8d-49de-a6db-22274577b1b2` + +所以这次用户动作里,`session_memory` 并不是只跑一次,而是跑了两次。 + +### 7. 第二条 session_memory 更短 + +第二条 `session_memory`: + +- `query_id = 8477fa68-0c8d-49de-a6db-22274577b1b2` +- 时间:`19:00:19.794Z -> 19:02:00.961Z` +- 共 `2` 个 turn + +主要动作: + +- `turn-1`: `Edit x5` +- `turn-2`: `end_turn` +- 最终:`query.terminated = completed` + +它比第一条更短,更像一次快速的记忆更新。 + +### 8. 主线程最终在 turn-5 完成 + +主线程最后一轮: + +- `19:00:31.884Z` + - 进入 `turn-5` +- `19:00:46.343Z` + - `api.stream.completed` + - `stop_reason = end_turn` +- `19:00:46.365Z` + - `query.terminated` + - `reason = completed` + +因此主线程自己的轨迹可以概括为: + +- `turn-1`: 工具 +- `turn-2`: 工具 +- `turn-3`: 工具 +- `turn-4`: 工具 +- `turn-5`: 最终结束 + +### 9. 第三处分支:主线程结束后启动 extract_memories + +主线程刚结束: + +- `19:00:46.360Z` + - `subagent.spawned(extract_memories)` +- `19:00:46.366Z` + - `extract_memories query.started` + - `query_id = a18e7d35-8d66-4c2c-af96-3b9bf36d1f51` + +这说明 `extract_memories` 是一个尾处理分支,不是在主线程早期并发拉起的。 + +### 10. extract_memories 走了 3 轮:先读后写 + +`extract_memories`: + +- 时间:`19:00:46.366Z -> 19:02:10.156Z` +- 共 `3` 个 turn + +主要动作: + +- `turn-1`: `Read x2` +- `turn-2`: `Write x2` +- `turn-3`: `end_turn` +- 最终:`query.terminated = completed` + +所以这条链路很清楚: + +1. 先读 +2. 再写 +3. 然后结束 + +## 这次动作的关键分支节点 + +这次日志里一共能明确看到 `3` 个分支节点: + +1. `18:58:01.847Z` + - 主线程 `turn-1` 工具轮结束后 + - 分出 `session_memory #1` + +2. `19:00:19.775Z` + - 主线程 `turn-4` 工具轮结束后 + - 分出 `session_memory #2` + +3. 
`19:00:46.360Z` + - 主线程 `turn-5` 以 `end_turn` 收尾时(早于 `19:00:46.365Z` 的 `query.terminated`) + - 分出 `extract_memories` + +## 严格按现有日志可以得出的结论 + +### 可以确认的 + +- 这是 `1` 次用户动作,不是多次 +- 这 `1` 次用户动作内部展开成了 `4` 条 query +- 主线程跑了 `5` 个 turn +- 两条 `session_memory` 一共跑了 `6` 个 turn +- 一条 `extract_memories` 跑了 `3` 个 turn +- 所有 query 最终都 `completed` +- 所有工具调用最终都闭合 + +### 不能从现有日志直接确认的 + +- 为什么系统“此刻决定”要拉起某条 `session_memory` +- assistant 文本里到底说了什么完整内容 +- 每一次 `Edit/Write` 具体改了什么正文 + +这些内容需要继续看对应的 snapshot,如: + +- `request.json` +- `response.json` +- `state.snapshot.before_turn.json` +- `state.snapshot.after_turn.json` + +## 适合你以后复用的读法 + +如果以后你还想按这个格式读某次动作,顺序就是: + +1. 先拿 `user_action_id` +2. 列出该 action 下所有 `query` +3. 列出所有 `subagent` +4. 拉时间线,只保留关键节点: + - `query.started` + - `turn.started` + - `assistant.tool_use.detected` + - `api.stream.completed` + - `subagent.spawned` + - `state.transitioned` + - `query.terminated` + - `subagent.completed` +5. 再根据需要去看 snapshot 正文 + diff --git "a/ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" similarity index 100% rename from "ObservrityTask/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/PDF\344\270\273\351\223\276\346\240\270\345\257\271\346\212\245\345\221\212.md" diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/QueryLoop\345\205\250\346\265\201\347\250\213\350\257\246\350\247\243\357\274\210\346\272\220\347\240\201\347\211\210\357\274\211.md"
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/QueryLoop\345\205\250\346\265\201\347\250\213\350\257\246\350\247\243\357\274\210\346\272\220\347\240\201\347\211\210\357\274\211.md" new file mode 100644 index 0000000000..1f974f07c8 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/QueryLoop\345\205\250\346\265\201\347\250\213\350\257\246\350\247\243\357\274\210\346\272\220\347\240\201\347\211\210\357\274\211.md" @@ -0,0 +1,1243 @@ +# Query Loop 全流程详解(源码版) + +本文基于**当前源码真实实现**整理,目标是替代早期较粗略的流程介绍材料,帮助你从源码角度完整理解: + +1. 一次主线程 `query` 到底从哪里开始 +2. 一个 `query loop` 每一轮在做什么 +3. 什么时机会判断要不要继续、要不要执行工具、要不要启动旁路/子 agent +4. 为什么一次用户动作会展开成多条 query、多轮 turn,甚至多条子链路 + +本文关注的是**主线程 `query()` / `queryLoop()` 的真实时间顺序**,并补充它和: + +- `session_memory` +- `extract_memories` +- `side_question` +- `prompt_suggestion` +- `compact` + +之间的关系。 + +--- + +## 1. 先建立几个最重要的概念 + +如果不先区分这几个层级,很容易把整个系统看乱。 + +### 1.1 `user_action` + +这是“用户这次动作”的根,比如用户发送了一次消息。 + +它是整个执行树的根键。 +一次 `user_action` 可以展开成: + +- 1 条主线程 query +- 0 到多条子 query +- 多轮 turn +- 多次工具调用 + +### 1.2 `query` + +这是一次完整的 query 生命周期。 + +主线程的主执行链是一条 query。 +每个通过 `runForkedAgent(...)` 启动的 forked subagent,也会有自己独立的一条 query。 + +所以: + +- 一个 `user_action` 往往不止一条 query +- 一个 `query` 可以包含多轮 turn + +### 1.3 `turn` + +可以把它理解成 query loop 的“一轮”。 + +一轮通常包含: + +1. 取当前 messages +2. 做预处理 +3. 组 prompt +4. 调模型 +5. 读响应 +6. 
处理 tool_use / stop hook / continuation 决策 + +如果 assistant 决定继续使用工具,或者系统决定继续下一轮,那么 query 不结束,而是进入下一轮 turn。 + +### 1.4 `tool call` + +这是 assistant 输出的某个 `tool_use` block 最终对应的一次工具执行生命周期。 + +### 1.5 `forked subagent` + +最典型的技术特征是: + +- 由 `runForkedAgent(...)` 启动 +- 拥有自己的隔离上下文 +- 内部再次调用 `query(...)` +- 因此拥有自己的 `query_id / turn / tool` 轨迹 + +这一点非常关键: + +**每次 `runForkedAgent(...)` 都不是“插入主线程的一小段逻辑”,而是重新启动一条新的 query loop。** + +证据在: + +- [forkedAgent.ts](/abs/path/E:/claude-code/src/utils/forkedAgent.ts:588) + +这里 `runForkedAgent(...)` 内部直接: + +```ts +for await (const message of query({ ... })) { +``` + +--- + +## 2. `query()` 和 `queryLoop()` 的关系 + +真正的入口在: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:527) + +外层函数是: + +- `query(params)` + +它主要负责: + +1. 包一层 Langfuse trace 生命周期 +2. 调内部真正的主循环 `queryLoop(...)` +3. 在结束时补 trace 关闭和 command lifecycle 完成 + +所以: + +- `query()` 是外层壳 +- `queryLoop()` 是真正的执行状态机 + +内部真实主循环从这里开始: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:586) + +--- + +## 3. 主线程一次 query 的完整时间顺序 + +下面按源码真实顺序讲。 + +--- + +## 4. 第 0 阶段:进入 `query()`,创建外层 trace + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:527) + +主要逻辑: + +1. 初始化 `consumedCommandUuids` +2. 如有需要,创建 Langfuse trace +3. 把 trace 塞回 `toolUseContext` +4. `yield* queryLoop(...)` +5. 结束时关闭 trace + +### 这一层的作用 + +这层不是 query loop 本身,而是给整个 query 生命周期包一个外壳: + +- tracing +- lifecycle 收尾 +- command queue 生命周期通知 + +### 实现思路 + +把“真正做事的逻辑”和“外围观测/trace 生命周期”分开。 +这让 `queryLoop()` 可以只关心状态机本身。 + +--- + +## 5. 
第 1 阶段:`queryLoop()` 初始化全局状态 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:586) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:613) + +初始化的核心状态包括: + +- `state.messages` +- `state.toolUseContext` +- `state.turnCount = 1` +- `state.maxOutputTokensRecoveryCount = 0` +- `state.hasAttemptedReactiveCompact = false` +- `state.pendingToolUseSummary = undefined` +- `state.transition = undefined` + +还会初始化: + +- `budgetTracker` +- `taskBudgetRemaining` +- `config = buildQueryConfig()` + +并立刻发出: + +- `state.initialized` +- `prefetch.memory.started` + +对应位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:641) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:665) + +### 这一层的作用 + +这是整个 query loop 的“状态机底座”。 + +它把: + +- 当前 messages +- 当前 turn 计数 +- 当前 recovery 状态 +- 当前预算状态 + +都放进一个可持续推进的 `State` 中。 + +### 实现思路 + +不是在循环里散落一堆变量,而是维护一个统一 `state`,每次进入下一轮时整体替换 `state = next`。 +这样每个“继续点”都能清楚表达: + +- 这轮结束后留下了什么状态 +- 下一轮要从什么状态继续 + +--- + +## 6. 第 2 阶段:进入 while(true),开始第 N 轮 turn + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:722) + +每次进入一轮时,会: + +1. 从 `state` 解构出本轮要用的变量 +2. 启动技能发现预取 `pendingSkillPrefetch` +3. `yield { type: 'stream_request_start' }` +4. 初始化 / 递增 `queryTracking` +5. 计算 `turnId = turn-${turnCount}` + +其中 `queryTracking` 很关键: + +- 第一次进来时创建新的 `chainId` +- 之后继续沿用同一个 `chainId` +- 但 `depth` 会递增 + +对应位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:762) + +然后会发事件: + +- `query.started`(只在第一轮) +- `query_tracking.assigned` +- `turn.started` +- `state.snapshot.before_turn` + +对应位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:781) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:800) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:813) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:827) + +### 这一层的作用 + +这一层定义了: + +- 这是不是某条 query 的第一轮 +- 当前是哪一轮 +- 当前这轮进入前的状态是什么 + +### 实现思路 + +每轮都先把“身份”和“快照”记录清楚,然后才开始做真实处理。 +这就是为什么后面的完整性指标能闭合到 `turn` 级别。 + +--- + +## 7. 
第 3 阶段:消息预处理流水线 + +这是 query loop 非常关键的一段。 +它做的事情不是“调用模型”,而是先把要发给模型的上下文整理成当前最合适的版本。 + +本轮会按顺序执行这些步骤。 + +### 7.1 `getMessagesAfterCompactBoundary` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:836) + +作用: + +- 从完整消息历史中取出 compact boundary 之后的那部分消息 +- 也就是当前应该参与本轮请求的可见对话区间 + +事件: + +- `messages.compact_boundary.applied` + +### 7.2 `applyToolResultBudget` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:861) + +作用: + +- 给工具结果做大小预算控制 +- 防止某些 tool result 太大直接膨胀上下文 + +事件: + +- `messages.tool_result_budget.applied` + +### 7.3 `history_snip` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:895) + +作用: + +- 在特定条件下剪掉历史部分内容 +- 并返回 `tokensFreed` + +事件: + +- `messages.history_snip.applied` + +### 7.4 `microcompact` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:925) + +作用: + +- 对消息进行更细粒度压缩 +- 例如对 tool result 或缓存可编辑区做更轻量的处理 + +事件: + +- `messages.microcompact.applied` + +### 7.5 `contextCollapse` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:965) + +作用: + +- 把已经可以折叠的上下文投影成 collapsed view +- 尽量在不做完整 compact summary 的情况下减小上下文压力 + +事件: + +- `messages.context_collapse.applied` + +### 7.6 `autocompact` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1006) + +作用: + +- 检查是否需要正式 autocompact +- 如果触发,会生成 compact summary,并用 post-compact messages 替换当前可见上下文 + +事件: + +- `messages.autoconpact.checked` +- `messages.autoconpact.completed` + +### 7.7 整体预处理完成 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1111) + +事件: + +- `messages.preprocess.completed` + +### 这一整段的作用 + +这段代码的核心目标是: + +**在真正调用模型之前,把 messages 调整到“尽量小、尽量合理、仍保持上下文有效”的状态。** + +### 实现思路 + +它不是只有一种压缩手段,而是一个分层流水线: + +1. 先做轻量预算控制 +2. 再做历史裁剪 +3. 再做微压缩 +4. 再做 collapse +5. 最后再决定是否真的 autocompact + +这样做的好处是: + +- 尽量避免一上来就做重型 compact +- 先尝试保留更细粒度的上下文结构 + +--- + +## 8. 
第 4 阶段:准备本轮模型调用环境 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1132) + +这时会初始化本轮模型调用要用的临时变量: + +- `assistantMessages` +- `toolResults` +- `toolUseBlocks` +- `needsFollowUp` +- `streamingToolExecutor` +- `currentModel` +- `dumpPromptsFetch` + +### 这一层的作用 + +这是“正式调模型前的本轮 runtime setup”。 + +它和前面的预处理不同,前面处理的是 messages; +这里准备的是: + +- 本轮 assistant 响应收集容器 +- 本轮 tool 执行器 +- 本轮选用的模型 +- 本轮调试/抓 prompt 的 fetch wrapper + +### 关键点 + +这一层之后,代码就真正准备开始调用模型了。 + +--- + +## 9. 第 5 阶段:阻塞阈值检查 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1173) + +作用: + +- 在某些条件下,如果上下文已经到硬阻塞极限,直接报 `prompt_too_long` +- 保留空间给用户手动 `/compact` + +终止路径: + +- `emitQueryTerminated('blocking_limit')` + +### 实现思路 + +在真正 API 调用前做一次硬保护,避免明显会失败的请求白白发出去。 + +--- + +## 10. 第 6 阶段:真正开始本轮模型调用 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1233) + +这里进入 `attemptWithFallback` 内层循环。 +这个循环的含义是: + +- 本轮 turn 原则上要调一次模型 +- 但如果遇到 fallback 条件,可以切换 fallback model 再重试一次 + +--- + +## 11. 第 7 阶段:构建 prompt + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1241) + +主要步骤: + +1. `prependUserContext(messagesForQuery, userContext)` +2. `summarizePromptComposition(...)` +3. 存 `request` snapshot +4. 发: + - `prompt.build.started` + - `prompt.snapshot.stored` + - `prompt.build.completed` + - `api.request.started` + +对应位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1241) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1263) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1285) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1334) + +### 这一层的作用 + +把“最终发给模型的内容”完全定稿,并把它存证到 snapshot。 + +### 实现思路 + +这里把 prompt 视为一个可审计对象,而不是只在内存里临时拼一下就发出去。 +因此: + +- 你后面能做 prompt token 分析 +- 能做 request snapshot 还原 +- 能检查 system prompt / userContext / messages 到底各占多少 + +--- + +## 12. 
第 8 阶段:流式接收模型响应 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1360) + +真正模型调用通过: + +- `deps.callModel(...)` + +进行。 + +它会持续产出流式消息,query loop 一边收,一边处理。 + +### 12.1 第一块流到达 + +第一次收到 chunk 时: + +- 发 `api.stream.first_chunk` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1416) + +### 12.2 处理 streaming fallback + +如果 streaming fallback 发生: + +- tombstone 已经收到的 orphan assistant messages +- 清空当前暂存的 `assistantMessages / toolResults / toolUseBlocks` +- 重建 `StreamingToolExecutor` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1432) + +### 12.3 处理 assistant block + +每收到 assistant message,会: + +1. 发 `assistant.block.received` +2. 如果 block 是 `tool_use`,发 `assistant.tool_use.detected` +3. 把 assistant message 存入 `assistantMessages` +4. 把 `tool_use` block 存入 `toolUseBlocks` +5. 设置 `needsFollowUp = true` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1473) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1487) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1584) + +### 12.4 流式工具执行 + +如果开启 `StreamingToolExecutor`: + +- assistant 一边流出 `tool_use` +- executor 一边接收 tool block +- 已完成的工具结果会被尽快收割进 `toolResults` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1596) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1606) + +### 12.5 响应结束 + +流结束后会: + +1. 存 `response` snapshot +2. 发 `api.stream.completed` + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1624) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1631) + +### 这一整段的作用 + +这是 query loop 最核心的一段: + +- 与模型交互 +- 收集 assistant 输出 +- 识别 tool_use +- 决定本轮是否需要继续 + +### 实现思路 + +这段不是“等模型整段输出完了再统一处理”,而是: + +- **边流边观察** +- 尽可能早发现工具调用 +- 尽可能早启动流式工具执行 + +这就是为什么这套系统不是简单的一问一答,而是一个 agentic loop。 + +--- + +## 13. 
第 9 阶段:模型调用错误与 fallback + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1675) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1746) + +这里处理几类问题: + +### 13.1 模型 fallback + +如果抛出 `FallbackTriggeredError`: + +- 切换到 fallback model +- 清空本次失败尝试的 assistant/tool 状态 +- 必要时 strip signature blocks +- 重新进入本轮模型调用 + +### 13.2 图片类错误 + +- `ImageSizeError` +- `ImageResizeError` + +会终止为: + +- `image_error` + +### 13.3 普通模型错误 + +会: + +- 补 missing tool_result blocks +- 发 abandoned tool_use 事件 +- 产出 API error message +- `emitQueryTerminated('model_error')` + +### 实现思路 + +把: + +- 可恢复错误 +- 可 fallback 错误 +- 直接终止错误 + +分开处理,而不是所有错误都一刀切。 + +--- + +## 14. 第 10 阶段:post-sampling hooks + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1807) + +只要本轮有 assistant 响应,就会: + +- `executePostSamplingHooks(...)` + +### 这是整个系统的第一个重要“分叉检查点” + +它不是主线程直接决定“我要不要开 session memory”,而是: + +1. 主线程一轮模型响应结束 +2. 调 post-sampling hooks +3. 某个 hook 自己判断是否要 fork + +最典型的是 `session_memory`: + +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:382) + - 注册 `extractSessionMemory` +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:303) + - `if (!shouldExtractMemory(messages)) return` +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:325) + - `runForkedAgent(...)` + +这说明: + +**主线程并不是“运行到某一行突然强制开一个 session_memory”。** +而是本轮结束后统一执行 hook,由 hook 判断此刻是否满足后台记忆更新条件。 + +--- + +## 15. 第 11 阶段:处理流式中断 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1819) + +如果用户在 streaming 阶段中断: + +- 收尾剩余流式工具结果 +- 或补 synthetic tool_result +- 做 computer use cleanup +- 产出 interruption message +- `emitQueryTerminated('aborted_streaming')` + +--- + +## 16. 第 12 阶段:如果本轮没有 tool_use,进入“收尾 / 终止 / 恢复”路径 + +判断条件: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1881) + +即: + +- `if (!needsFollowUp)` + +含义是: + +assistant 这轮没有提出新的工具调用。 +这时系统会判断: + +1. 是不是该恢复重试 +2. 是不是该 stop hooks +3. 
是不是该 token budget continuation +4. 还是直接完成 query + +这是一条非常重要的分支。 + +--- + +## 17. 第 13 阶段:恢复链 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:1884) 之后 + +包括: + +### 17.1 prompt-too-long / media recovery + +- `contextCollapse.recoverFromOverflow(...)` +- `reactiveCompact.tryReactiveCompact(...)` + +如果成功,会构造 `next` state,并 `continue` 进入下一轮。 + +对应 transition: + +- `collapse_drain_retry` +- `reactive_compact_retry` + +### 17.2 max_output_tokens recovery + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2046) + +包括两种: + +1. 提升 `maxOutputTokensOverride` + - `max_output_tokens_escalate` +2. 注入 recovery user message,继续下一轮 + - `max_output_tokens_recovery` + +### 实现思路 + +系统把 recoverable 错误尽量当成: + +**“状态转移后继续下一轮”** + +而不是立刻把 query 打死。 + +这也是 `state.transitioned` 存在的意义之一。 + +--- + +## 18. 第 14 阶段:stop hooks + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2164) +- [stopHooks.ts](/abs/path/E:/claude-code/src/query/stopHooks.ts:66) + +这是第二个重要“分叉检查点”。 + +主线程会在一轮收尾时进入 `handleStopHooks(...)`。 + +这里会做几类事: + +1. 执行 stop hooks 本身 +2. 保存 cache-safe params +3. 触发若干后台逻辑: + - `executePromptSuggestion(...)` + - `executeExtractMemories(...)` + - `executeAutoDream(...)` + +对应源码: + +- [stopHooks.ts](/abs/path/E:/claude-code/src/query/stopHooks.ts:155) + +### 这意味着什么 + +如果你问: + +**“主线程什么时候会考虑开 `extract_memories`?”** + +答案就是: + +**在 stop hook 阶段。** + +调用链大致是: + +```text +queryLoop() +-> handleStopHooks() +-> executeExtractMemories() +-> executeExtractMemoriesImpl() +-> guard 条件通过 +-> runForkedAgent() +``` + +而 `executeExtractMemoriesImpl()` 的 guard 在: + +- [extractMemories.ts](/abs/path/E:/claude-code/src/services/extractMemories/extractMemories.ts:528) + +真正 fork 在: + +- [extractMemories.ts](/abs/path/E:/claude-code/src/services/extractMemories/extractMemories.ts:415) + +--- + +## 19. 第 15 阶段:token budget 决策 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2223) + +如果 `TOKEN_BUDGET` feature 开启: + +- `checkTokenBudget(...)` + +可能返回: + +1. 
`continue` + - 注入一条 meta user message + - `transition = token_budget_continuation` + - 进入下一轮 +2. `complete` + - 不再继续 + +这也是“虽然没有 tool_use,但系统仍可能继续一轮”的原因之一。 + +--- + +## 20. 第 16 阶段:直接完成 query + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2305) + +如果: + +- 没有 tool_use +- 没有恢复路径 +- stop hook 没拦截 +- token budget 没要求继续 + +那么本轮就: + +- `emitQueryTerminated('completed')` +- `return { reason: 'completed' }` + +这才意味着这条 query 生命周期真正结束。 + +注意: + +**这不是“一轮结束”,而是“整条 query 结束”。** + +--- + +## 21. 第 17 阶段:如果 assistant 产生了 tool_use,进入工具执行路径 + +如果 `needsFollowUp = true`,代码就不会走上面的直接完成路径,而会继续执行工具。 + +入口位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2311) + +### 21.1 决定工具执行模式 + +发: + +- `tool.execution.mode.selected` + +然后选择: + +- `streamingToolExecutor.getRemainingResults()` + 或 +- `runTools(...)` + +对应位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2330) +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2344) + +### 21.2 普通 `runTools(...)` + +实现位置: + +- [toolOrchestration.ts](/abs/path/E:/claude-code/src/services/tools/toolOrchestration.ts:21) + +它会: + +1. 按工具是否并发安全分 batch +2. 并发安全的工具并行跑 +3. 非并发安全的工具串行跑 +4. 产出 message update 和 context update + +### 21.3 `StreamingToolExecutor` + +实现位置: + +- [StreamingToolExecutor.ts](/abs/path/E:/claude-code/src/services/tools/StreamingToolExecutor.ts:1) + +它的作用是: + +- assistant 还在流时,工具就可以边到边执行 +- 但结果仍按工具收到的顺序被缓冲和产出 + +### 这一层的作用 + +把 assistant 的 tool_use blocks 变成真正的 tool_result,并更新上下文。 + +--- + +## 22. 
第 18 阶段:工具结果后的附加处理 + +工具执行完之后,query loop 还会做几件事: + +### 22.1 生成 tool summary + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2375) + +这是为了把上一轮工具行为总结成更适合 UI 的摘要。 + +### 22.2 注入 attachment + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2550) + +包括: + +- queued commands +- memory prefetch 结果 +- skill discovery prefetch 结果 + +### 22.3 刷新 tools + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2629) + +例如新连上的 MCP tool 可以在下一轮可用。 + +### 22.4 任务摘要 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2651) + +为后台 session/task 生成 summary。 + +### 22.5 maxTurns 检查 + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2674) + +--- + +## 23. 第 19 阶段:构造下一轮 `State`,继续 loop + +位置: + +- [query.ts](/abs/path/E:/claude-code/src/query.ts:2689) + +本轮如果已经完成了工具执行,而且没有终止,就会构造: + +- `next: State` + +其中包括: + +- `messages = [...messagesForQuery, ...assistantMessages, ...toolResults]` +- `toolUseContext = updated context` +- `turnCount = nextTurnCount` +- `pendingToolUseSummary = nextPendingToolUseSummary` +- `transition = { reason: 'next_turn' }` + +然后: + +1. 发 `state.transitioned` +2. 发 `state.snapshot.after_turn` +3. `state = next` +4. `continue` + +这就进入下一轮 turn。 + +### 实现思路 + +query loop 的核心思想不是递归,而是: + +**在 while(true) 中不断构造下一轮完整状态,然后继续。** + +这样所有 continuation path 都能统一落在 `State` 迁移模型里。 + +--- + +## 24. 那么,系统到底在哪些固定时机检查“要不要开子 agent / 旁路”? 
+ +总结一下,主要有三类时机。 + +### 24.1 post-sampling hooks + +触发时机: + +- 一轮模型响应结束后 + +典型: + +- `session_memory` + +调用链: + +```text +queryLoop +-> executePostSamplingHooks +-> extractSessionMemory +-> shouldExtractMemory +-> runForkedAgent +-> 新的 session_memory query +``` + +### 24.2 stop hooks + +触发时机: + +- 一轮准备收尾时 + +典型: + +- `prompt_suggestion` +- `extract_memories` +- `auto_dream` + +调用链: + +```text +queryLoop +-> handleStopHooks +-> executeExtractMemories / executePromptSuggestion / executeAutoDream +-> 各自 guard +-> runForkedAgent +``` + +### 24.3 显式命令 / 专用入口 + +典型: + +- `/btw` 的 `side_question` +- compact 流程里的 `compact` + +这类不是“主线程每轮都会检查一次”,而是只有进入对应功能路径时才会触发。 + +--- + +## 25. `session_memory` 为什么会在某些轮次后出现? + +因为它的判断函数: + +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:135) + +并不是每时每刻都跑,而是只在: + +- 主线程某轮模型响应结束后 +- 由 post-sampling hook 触发 + +它的核心判断条件是: + +1. 初始化阈值是否达到 +2. 自上次更新以来 token 增量是否达到 +3. 是否满足: + - tool call 数达到阈值,或 + - 最近一轮 assistant 已经没有 tool call,说明到了自然停顿点 + +所以 `session_memory` 的真实契机不是: + +- “出现了某个神秘系统事件” + +而是: + +**“这一轮结束后,系统发现上下文已经积累到值得做一次后台会话记忆更新。”** + +--- + +## 26. `session_memory` 到底维护什么 + +它维护的是当前会话的一份 markdown memory 文件。 + +关键位置: + +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:184) +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:193) + +它会: + +1. 找 session memory 目录 +2. 找 session memory 文件路径 +3. 文件不存在就创建并写模板 +4. 读当前内容 +5. 构造更新 prompt +6. 起一个 forked agent 去更新这个文件 + +而且它的权限被收得很死: + +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:469) + +只允许对那一个 memory 文件执行 `Edit`。 + +--- + +## 27. 
旁路和子 agent 的区别 + +不要把这两个词混为一谈。 + +### 子 agent + +更具体: + +- 通过 `runForkedAgent(...)` 启动 +- 内部有自己的 `query()` loop +- 有自己独立的 `query_id / turn / tool` + +### 旁路 + +更宽泛: + +- 不走主线程正面继续路径 +- 是一条“额外处理路径” + +所以: + +- 子 agent 是一种比较重的旁路 +- 但旁路不一定都是子 agent + +最典型的对比: + +### `/btw` 的 `side_question` + +位置: + +- [sideQuestion.ts](/abs/path/E:/claude-code/src/utils/sideQuestion.ts:53) + +它是 forked subagent: + +- [sideQuestion.ts](/abs/path/E:/claude-code/src/utils/sideQuestion.ts:80) + +### `sideQuery(...)` + +位置: + +- [sideQuery.ts](/abs/path/E:/claude-code/src/utils/sideQuery.ts:81) + +它只是主线程外的一次轻量 API wrapper,**不等于 forked subagent**。 + +--- + +## 28. 一次用户动作为什么会变成多条 query + +现在你可以把整个过程理解成一棵树: + +```text +user_action +-> 主线程 query + -> turn 1 + -> turn 2 + -> turn 3 + -> ... + -> 某些时机触发 fork + -> session_memory query + -> extract_memories query + -> side_question query +``` + +所以: + +- 用户只发了一次请求 +- 系统内部却可能开出多条 query +- 每条 query 自己又会有多轮 turn + +这就是为什么必须用: + +- `user_action_id` 看整棵树 +- `query_id` 看某条分支 + +--- + +## 29. 最后给一个“最短但最正确”的总结 + +这套系统的主线程不是“一问一答函数”,而是一个**可持续推进的状态机**。 + +它每轮都做这几件事: + +1. 读取当前状态 +2. 预处理消息 +3. 组 prompt +4. 调模型 +5. 观察 assistant 是否提出 tool_use +6. 如有工具,执行工具并进入下一轮 +7. 如无工具,检查恢复链 / stop hooks / token budget +8. 最终决定: + - 继续下一轮 + - fork 出旁路 / 子 agent + - 或终止整条 query + +而所谓“子 agent”最本质的技术事实就是: + +**某个模块在固定时机点判断条件成立后,调用 `runForkedAgent(...)`,于是系统又启动了一条新的 `query()` loop。** + +--- + +## 30. 你接下来最应该怎么读源码 + +如果你以后要继续深入,建议按这个顺序: + +1. [query.ts](/abs/path/E:/claude-code/src/query.ts:527) + - 先读主线程状态机 +2. [stopHooks.ts](/abs/path/E:/claude-code/src/query/stopHooks.ts:66) + - 看一轮收尾时系统还会做什么 +3. [forkedAgent.ts](/abs/path/E:/claude-code/src/utils/forkedAgent.ts:493) + - 看 forked subagent 怎么启动自己的 query loop +4. [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:135) + - 看一个典型的“后台子 agent 触发条件” +5. [extractMemories.ts](/abs/path/E:/claude-code/src/services/extractMemories/extractMemories.ts:528) + - 看一个典型的“stop hook 后台分支” +6. 
[toolOrchestration.ts](/abs/path/E:/claude-code/src/services/tools/toolOrchestration.ts:21) + - 看工具执行是怎么接回下一轮的 + +这样读,你会从“一个 query loop 里面到底发生了什么”一路读到“为什么会长出多条分支 query”。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\345\217\257\350\247\202\346\265\213\344\273\273\345\212\241\344\271\246.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\345\217\257\350\247\202\346\265\213\344\273\273\345\212\241\344\271\246.md" new file mode 100644 index 0000000000..b4fe6efc6e --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\345\217\257\350\247\202\346\265\213\344\273\273\345\212\241\344\271\246.md" @@ -0,0 +1,650 @@ +# Subagent 触发因果可观测任务书 + +本文定义可观测系统下一阶段建设任务:为 forked subagent 增加“触发因果”层观测,补齐当前系统只能回答“开了什么”,但不能稳定回答“为什么此刻开”的缺口。 + +--- + +## 0. 理解清单 + +- 当前系统已经能看到: + - 开了哪些 subagent + - 每条 subagent 跑了多久、花了多少 token、是否闭合 +- 当前系统还看不到: + - 为什么是这一刻启动这条 subagent + - 是 hook、阈值、命令、定时器,还是 compact 流程触发 +- 本任务不是替换现有字段,而是补一层新的“触发因果字段” +- 新增字段的核心目标是把三层语义拆开: + - `subagent_reason`:它为什么存在 + - `subagent_trigger_kind`:它通过什么机制被触发 + - `subagent_trigger_detail`:它具体走了哪条判定分支 +- 第一批最关键的对象是: + - `session_memory` + - `extract_memories` + - `side_question` + - 其次再覆盖 `prompt_suggestion / compact / auto_dream / agent_summary / speculation` + +--- + +## 1. 
背景 + +当前系统已经能够稳定观测: + +- `user_action_id` +- `query_id` +- `subagent_id` +- `query_source` +- `subagent_type` +- `subagent_reason` + +因此已经可以回答: + +- 开了哪些 subagent +- 每条 subagent 跑了多少 turn +- 花了多少 token +- 最终是否闭合 + +但当前系统仍不能稳定回答: + +- 为什么是这一类 subagent +- 为什么在这一时刻启动 +- 是 hook、阈值、显式命令、定时器,还是 compact 流程触发 +- 同一类 subagent 的不同启动分支分别占多少 + +这导致: + +- action 报告只能描述“这里发生了分叉”,但很难说明“这里为什么分叉” +- dashboard 只能按 `source / reason` 看成本,不能按触发机制看成本 +- 后续 V2/V3 若引入更多 forked agent,现有字段会越来越不够用 + +--- + +## 1.1 预期效果 + +本任务完成后,系统不再只能说: + +- “这里启动了一条 `session_memory`” + +而应能说: + +- “这里启动了一条 `session_memory`” +- “它是由 `post_sampling_hook` 机制触发的” +- “具体触发分支是 `token_threshold_and_natural_break`” +- “触发时的关键判定值是:token 增量已满足阈值,最近一轮已无 tool call” + +也就是说,action 报告和日志阅读结果将从“结构可见”升级为“结构 + 因果可解释”。 + +### 具体回测示例 + +以历史真实样本: + +- `user_action_id = 9ddd1bff-65b6-414f-bf04-418809eb6ff7` + +为例,当前系统只能看到: + +- 主线程 `turn-1` 后起了 `session_memory #1` +- 主线程 `turn-4` 后起了 `session_memory #2` +- 主线程完成后起了 `extract_memories` + +补完本任务后,预期能读成: + +#### `session_memory #1` + +- `subagent_reason = session_memory` +- `subagent_trigger_kind = post_sampling_hook` +- `subagent_trigger_detail = token_threshold_and_tool_threshold` +- `subagent_trigger_payload` + - `has_met_update_threshold = true` + - `tool_calls_since_last_update = N` + - `tool_call_threshold = M` + +#### `session_memory #2` + +- `subagent_reason = session_memory` +- `subagent_trigger_kind = post_sampling_hook` +- `subagent_trigger_detail = token_threshold_and_natural_break` +- `subagent_trigger_payload` + - `has_met_update_threshold = true` + - `has_tool_calls_in_last_turn = false` + +#### `extract_memories` + +- `subagent_reason = extract_memories` +- `subagent_trigger_kind = stop_hook_background` +- `subagent_trigger_detail = post_turn_background_extraction` +- `subagent_trigger_payload` + - `feature_gate_enabled = true` + - `auto_memory_enabled = true` + - `in_progress = false` + +最终效果是: + +1. 日志阅读时不再需要大量猜测 +2. `explain_action` 能直接解释“为什么这里分叉” +3. 
后续可以按触发机制分析频率、成本和异常触发 + +--- + +## 1.2 设计思路 + +### 为什么不能只用现有字段 + +- `query_source` 只说明来源,不说明“为什么现在开” +- `subagent_type` 更偏实现标签,不够稳定 +- `subagent_reason` 只能说明业务目的,仍不能说明本次触发契机 + +所以当前缺的不是“再起一个别名”,而是缺一层新的因果表达。 + +### 为什么要拆成 `kind + detail + payload` + +因为这三层承担不同职责: + +- `subagent_trigger_kind` + - 适合做聚合统计 + - 例如:`post_sampling_hook / stop_hook_background / explicit_user_command` +- `subagent_trigger_detail` + - 适合做人类可读解释 + - 例如:`token_threshold_and_tool_threshold` +- `subagent_trigger_payload` + - 适合保留判定现场证据 + - 例如具体阈值、计数、布尔条件 + +如果把这三层揉成一个字段,后续要么不可统计,要么不可解释。 + +### 为什么必须在调用点写入 + +调用点最知道“为什么此刻开”: + +- `sessionMemory.ts` 知道是哪条阈值分支命中 +- `extractMemories.ts` 知道是不是 trailing run +- `sideQuestion.ts` 知道这是 `/btw` + +所以: + +- 事件层应优先由调用点显式传入 trigger 字段 +- `runForkedAgent(...)` 只做统一承载,不做复杂推断 +- ETL 只负责兼容旧日志,不能替代源码事实源 + +### 为什么不替换旧字段 + +因为旧字段仍然有价值,只是语义层级不同: + +- `query_source`:来源 +- `subagent_type`:实现标签 +- `subagent_reason`:业务原因 +- `subagent_trigger_*`:本次触发契机 + +正确做法是分层补充,而不是互相覆盖。 + +--- + +## 2. 本轮目标 + +本轮目标是新增一层稳定的“触发因果观测”,使系统能够同时表达: + +1. 这条 subagent **属于什么业务目的** +2. 这条 subagent **是通过什么机制被触发的** +3. 这条 subagent **在该机制下具体走了哪条判定分支** +4. 必要时,保留当时判定所用的关键上下文事实 + +--- + +## 3. 非目标 + +本轮不做: + +- 不重写 query loop 主结构 +- 不新增新的 subagent 功能 +- 不重构已有 `query_source` / `subagent_type` 的底层语义 +- 不一次性做大量新 dashboard 面板 +- 不修改远端平台或外部 exporter + +--- + +## 4. 
核心设计原则 + +### 4.1 不替代旧字段,只新增因果层 + +保留现有字段: + +- `query_source` +- `subagent_type` +- `subagent_reason` + +新增字段: + +- `subagent_trigger_kind` +- `subagent_trigger_detail` +- `subagent_trigger_payload` + +原因: + +- `query_source` 表示来源 +- `subagent_type` 表示实现标签 +- `subagent_reason` 表示业务原因 +- `subagent_trigger_*` 表示本次启动契机 + +这四层语义不同,不能强行合并成一个字段。 + +### 4.2 优先由调用点显式传值 + +原则: + +- 触发因果字段应优先由**调用 `runForkedAgent(...)` 的模块**显式传入 +- 不应主要依赖 `runForkedAgent(...)` 内部推断 +- ETL 只能对历史日志做回退兼容,不能成为主事实源 + +原因: + +- 调用点最知道“为什么在这时开” +- 框架层只知道“有人让我开了” + +### 4.3 兼容旧日志 + +新字段对历史日志允许为空: + +- `subagent_trigger_kind = null` +- `subagent_trigger_detail = null` +- `subagent_trigger_payload = null` + +这样不会破坏已有 V1 库和阅读器。 + +--- + +## 5. 字段定义 + +### 5.1 `subagent_reason` + +定义: + +- 稳定业务原因 +- 回答“这条 subagent 是为哪类业务目的存在的” + +建议枚举: + +- `session_memory` +- `extract_memories` +- `side_query` +- `prompt_suggestion` +- `compact` +- `auto_dream` +- `agent_summary` +- `speculation` + +### 5.2 `subagent_trigger_kind` + +定义: + +- 触发机制大类 +- 回答“这次启动是在哪种机制下被触发的” + +建议枚举: + +- `post_sampling_hook` +- `stop_hook_background` +- `explicit_user_command` +- `manual_command` +- `periodic_timer` +- `internal_pipeline` +- `compaction_flow` +- `direct_feature_entry` + +### 5.3 `subagent_trigger_detail` + +定义: + +- 触发分支细节 +- 回答“在该机制下,具体是哪条判定分支触发的” + +示例值: + +- `token_threshold_and_tool_threshold` +- `token_threshold_and_natural_break` +- `post_turn_background_extraction` +- `coalesced_trailing_run` +- `btw_command` +- `suggestion_generation_allowed` +- `prompt_cache_sharing_compact` +- `summary_interval_elapsed` +- `accepted_prompt_suggestion` + +### 5.4 `subagent_trigger_payload` + +定义: + +- 触发时的关键判定上下文 +- 用于记录具体阈值、开关、模式、计数等 + +类型: + +- JSON 对象 + +示例: + +```json +{ + "has_met_update_threshold": true, + "tool_calls_since_last_update": 7, + "has_tool_calls_in_last_turn": false +} +``` + +--- + +## 6. 
首批覆盖范围 + +本轮先覆盖当前最核心、最常见的 forked agent 入口。 + +### 6.1 `session_memory` + +调用点: + +- [sessionMemory.ts](/abs/path/E:/claude-code/src/services/SessionMemory/sessionMemory.ts:325) + +建议写入: + +- `subagent_reason = session_memory` +- `subagent_trigger_kind = post_sampling_hook` +- `subagent_trigger_detail` + - `token_threshold_and_tool_threshold` + - 或 `token_threshold_and_natural_break` +- `subagent_trigger_payload` + - `current_token_count` + - `has_met_initialization_threshold` + - `has_met_update_threshold` + - `tool_calls_since_last_update` + - `tool_call_threshold` + - `has_tool_calls_in_last_turn` + +### 6.2 `extract_memories` + +调用点: + +- [extractMemories.ts](/abs/path/E:/claude-code/src/services/extractMemories/extractMemories.ts:415) + +建议写入: + +- `subagent_reason = extract_memories` +- `subagent_trigger_kind = stop_hook_background` +- `subagent_trigger_detail` + - `post_turn_background_extraction` + - 或 `coalesced_trailing_run` +- `subagent_trigger_payload` + - `feature_gate_enabled` + - `auto_memory_enabled` + - `remote_mode` + - `in_progress` + +### 6.3 `side_question` + +调用点: + +- [sideQuestion.ts](/abs/path/E:/claude-code/src/utils/sideQuestion.ts:80) + +建议写入: + +- `subagent_reason = side_query` +- `subagent_trigger_kind = explicit_user_command` +- `subagent_trigger_detail = btw_command` +- `subagent_trigger_payload` + - `command = /btw` + - `max_turns = 1` + - `tools_allowed = false` + +### 6.4 `prompt_suggestion` + +调用点: + +- [promptSuggestion.ts](/abs/path/E:/claude-code/src/services/PromptSuggestion/promptSuggestion.ts:319) + +建议写入: + +- `subagent_reason = prompt_suggestion` +- `subagent_trigger_kind = stop_hook_background` +- `subagent_trigger_detail = suggestion_generation_allowed` +- `subagent_trigger_payload` + - `assistant_turn_count` + - `suppress_reason = null` + - `is_main_thread = true` + +### 6.5 `compact` + +调用点: + +- [compact.ts](/abs/path/E:/claude-code/src/services/compact/compact.ts:1191) + +建议写入: + +- `subagent_reason = compact` +- 
`subagent_trigger_kind = compaction_flow` +- `subagent_trigger_detail = prompt_cache_sharing_compact` +- `subagent_trigger_payload` + - `prompt_cache_sharing_enabled` + - `skip_cache_write` + - `max_turns = 1` + +### 6.6 `auto_dream` + +调用点: + +- [autoDream.ts](/abs/path/E:/claude-code/src/services/autoDream/autoDream.ts:225) + +建议写入: + +- `subagent_reason = auto_dream` +- `subagent_trigger_kind = stop_hook_background` +- `subagent_trigger_detail = dream_consolidation_run` + +### 6.7 `agent_summary` + +调用点: + +- [agentSummary.ts](/abs/path/E:/claude-code/src/services/AgentSummary/agentSummary.ts:115) + +建议写入: + +- `subagent_reason = agent_summary` +- `subagent_trigger_kind = periodic_timer` +- `subagent_trigger_detail = summary_interval_elapsed` + +### 6.8 `speculation` + +调用点: + +- [speculation.ts](/abs/path/E:/claude-code/src/services/PromptSuggestion/speculation.ts:457) + +建议写入: + +- `subagent_reason = speculation` +- `subagent_trigger_kind = internal_pipeline` +- `subagent_trigger_detail = accepted_prompt_suggestion` + +--- + +## 7. 事件层改动 + +### 7.1 修改 `ForkedAgentParams` + +文件: + +- [forkedAgent.ts](/abs/path/E:/claude-code/src/utils/forkedAgent.ts:83) + +新增字段: + +```ts +subagentTriggerKind?: string +subagentTriggerDetail?: string +subagentTriggerPayload?: Record<string, unknown> +``` + +### 7.2 修改 `runForkedAgent(...)` + +文件: + +- [forkedAgent.ts](/abs/path/E:/claude-code/src/utils/forkedAgent.ts:493) + +要求: + +- 在 `subagent.spawn.requested` +- `subagent.spawned` +- `subagent.completed` + +中统一带出: + +- `subagent_reason` +- `subagent_trigger_kind` +- `subagent_trigger_detail` + +并把复杂对象放入: + +- `payload.subagent_trigger_payload` + +### 7.3 回退逻辑 + +要求: + +- `subagent_reason` 继续保留当前回退: + - `subagentReason ?? forkLabel ?? querySource ?? 'unknown'` +- `subagent_trigger_*` 不做复杂框架级推断 +- 未显式传值时保持 `null` + +--- + +## 8. 
ETL 改动 + +文件: + +- [build_duckdb_etl.ts](/abs/path/E:/claude-code/scripts/observability/build_duckdb_etl.ts:1) + +要求: + +### 8.1 `events_raw` + +新增列: + +- `subagent_trigger_kind` +- `subagent_trigger_detail` +- `subagent_trigger_payload_json` + +### 8.2 `queries` + +新增列: + +- `subagent_trigger_kind` +- `subagent_trigger_detail` + +规则: + +- 对于同一 query,优先取 `subagent.spawned` +- 否则回退到同链路内最早带值事件 + +### 8.3 `subagents` + +新增列: + +- `subagent_trigger_kind` +- `subagent_trigger_detail` +- `subagent_trigger_payload_json` + +### 8.4 兼容旧日志 + +要求: + +- 历史样本默认 `null` +- 不允许因旧日志缺字段而导致建库失败 + +--- + +## 9. 阅读器与展示层改动 + +本轮只做最小可读性接入,不扩张大面板。 + +### 9.1 `explain_action.ps1` + +要求: + +- 在 subagent 节点下展示: + - `subagent_reason` + - `subagent_trigger_kind` + - `subagent_trigger_detail` + +### 9.2 action 报告 + +要求: + +- 在自然语言解释中,优先用 trigger 字段解释“为什么这里分叉” + +### 9.3 dashboard / daily summary + +本轮非必须,仅做以下最小增强之一即可: + +- `Subagent Reason 明细` 表增加 `trigger_kind / trigger_detail` + 或 +- 新增一张极小的 `Subagent Trigger 明细` 表 + +不要求新增复杂图表。 + +--- + +## 10. 验证要求 + +### 10.1 代码验证 + +- `typecheck` 通过 +- ETL 可正常重建 +- `daily_summary.ps1` 可正常运行 +- `explain_action.ps1` 可正常生成报告 + +### 10.2 日志验证 + +使用新的 debug 样本验证至少这几类: + +- `session_memory` +- `extract_memories` +- 如可复现,再加 `side_question` + +### 10.3 功能验证目标 + +验证时应能明确回答: + +- 这条 subagent 是什么业务原因 +- 这条 subagent 是通过什么机制触发的 +- 这次具体是哪条触发分支 + +--- + +## 11. 验收标准 + +完成后,系统至少应满足: + +1. `subagent.spawn.requested / spawned / completed` 三类事件能稳定带出触发因果字段 +2. DuckDB 中可以按 `subagent_trigger_kind` / `subagent_trigger_detail` 查询 +3. `explain_action` 生成的 action 报告能解释“为什么这里启动了这条 subagent” +4. 历史旧日志不因新字段而失效 +5. 原有 `query_source / subagent_type / subagent_reason` 语义不被破坏 + +--- + +## 12. 推荐实施顺序 + +1. 先改 `forkedAgent.ts` 参数和事件 schema +2. 再改 `session_memory / extract_memories / side_question` 三个最关键调用点 +3. 再改 ETL +4. 最后改 `explain_action.ps1` + +理由: + +- 先把事实源打稳 +- 再把阅读器接上 +- 避免先改展示层却没有真实字段支撑 + +--- + +## 13. 
一句话总结 + +本任务不是再给 subagent 起一个新名字,而是要把: + +- **它是什么** +- **为什么有它** +- **为什么在这一刻启动它** + +这三层语义正式拆开,形成稳定的 V1 因果观测能力,为后续 V2/V3 扩展打基础。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\346\211\247\350\241\214\346\270\205\345\215\225.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\346\211\247\350\241\214\346\270\205\345\215\225.md" new file mode 100644 index 0000000000..f81b14c121 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/Subagent\350\247\246\345\217\221\345\233\240\346\236\234\346\211\247\350\241\214\346\270\205\345\215\225.md" @@ -0,0 +1,77 @@ +# Subagent 触发因果执行清单 + +## 理解清单 + +- 这份清单只覆盖首批可落地实现,不继续扩张更多面板 +- 实现顺序是: + 1. 事件 schema + 2. 首批调用点 + 3. ETL + 4. `explain_action` + 5. 验证 +- 第一批重点覆盖: + - `session_memory` + - `extract_memories` + - `side_question` + - 同时补上 `prompt_suggestion / compact / auto_dream / agent_summary / speculation` + +## 预期效果 + +- 新日志里,`subagent.spawn.requested / spawned / completed` 都会带: + - `subagent_trigger_kind` + - `subagent_trigger_detail` + - `payload.subagent_trigger_payload` +- DuckDB 中可以查询: + - 某条 subagent 是什么 reason + - 它是通过什么机制触发的 + - 具体触发分支是什么 +- `explain_action` 报告里可以直接写: + - “这里启动了一条 `session_memory`,由 `post_sampling_hook` 机制触发,具体分支是 `token_threshold_and_natural_break`” + +## 设计思路 + +- 不替换旧字段,只补因果层 +- 触发字段优先由调用点显式传入,不让 ETL 事后猜主事实 +- ETL 只做兼容旧日志 +- 展示层先接入 action 报告,不扩张大 dashboard + +## 执行步骤 + +1. 扩 `HarnessEventInput` + - 增加 `subagent_trigger_kind` + - 增加 `subagent_trigger_detail` + +2. 扩 `ForkedAgentParams` + - 增加 `subagentTriggerKind` + - 增加 `subagentTriggerDetail` + - 增加 `subagentTriggerPayload` + +3. 
修改 `runForkedAgent(...)` + - 三类事件统一落 trigger 字段: + - `subagent.spawn.requested` + - `subagent.spawned` + - `subagent.completed` + +4. 修改首批调用点 + - `sessionMemory.ts` + - `extractMemories.ts` + - `sideQuestion.ts` + - `promptSuggestion.ts` + - `compact.ts` + - `autoDream.ts` + - `agentSummary.ts` + - `speculation.ts` + +5. 修改 ETL + - `events_raw` 新增 trigger 列 + - `queries` 新增 trigger 列 + - `subagents` 新增 trigger 列 + +6. 修改 `explain_action.ps1` + - 查询并展示 trigger 字段 + - 在 Markdown 报告中输出 trigger 说明 + +7. 验证 + - `typecheck` + - 重建 DuckDB + - 生成最新 action 报告 diff --git "a/ObservrityTask/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" similarity index 100% rename from "ObservrityTask/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/\346\217\220\347\244\272\350\257\215\350\276\223\345\205\245Token\345\210\206\346\236\220.md" diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/README.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/README.md" new file mode 100644 index 0000000000..1bba4533d5 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/README.md" @@ -0,0 +1,21 @@ +# V1 目录索引 + +当前目录保存可观测系统 V1 的稳定文档。 + +## 子目录 + +- `01-总览` + - V1 主研究报告与 dashboard +- `02-Schema与指标` + - 事件 Schema、DuckDB Schema、指标定义、日志阅读教学 +- `03-样例` + - 基于真实 `user_action_id` 生成的样例解析 +- `04-专题研究` + - 与当前 V1 一致,但更偏专题分析的研究文档 + +## 建议阅读顺序 + +1. `01-总览/当前可观测系统V1深度研究报告.md` +2. `02-Schema与指标/` +3. `03-样例/` +4. 
`04-专题研究/` diff --git a/ObservrityTask/README.md b/ObservrityTask/README.md new file mode 100644 index 0000000000..ec5a0f31db --- /dev/null +++ b/ObservrityTask/README.md @@ -0,0 +1,21 @@ +# ObservrityTask 目录索引 + +当前目录按“输入材料 / 系统版本”分层,方便后续继续并列扩展 `v2`、`v3`。 + +## 目录结构 + +- `00-资料输入` + - 原始任务书、输入说明、参考 PDF +- `10-系统版本` + - `v1/` + - `01-总览` + - `02-Schema与指标` + - `03-样例` + - `04-专题研究` + +## 阅读顺序 + +1. 先看 `10-系统版本/v1/01-总览/当前可观测系统V1深度研究报告.md` +2. 再看 `10-系统版本/v1/02-Schema与指标/` +3. 需要实操时看 `10-系统版本/v1/03-样例/` +4. 需要回溯输入背景时看 `00-资料输入/` diff --git a/README.md b/README.md index 589bae680a..7046d9b44d 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,144 @@ TUI (REPL) 模式需要真实终端,无法直接通过 VS Code launch 启动 - **在线文档(Mintlify)**: [ccb.agent-aura.top](https://ccb.agent-aura.top/) — 文档源码位于 [`docs/`](docs/) 目录,欢迎投稿 PR - **DeepWiki**: +## 本地可观测系统 V1(推荐运行方案) + +当前仓库已经内置了一套本地优先的可观测系统 V1,目标不是“只看昨天的日报”,而是支持你在本机 `debug` 一次真实 query 后,立刻回看: + +- 一次 `user_action` 展开成了哪些 `query / turn / tool / subagent` +- 主线程和子链路分别花了多少 token +- 当前链路完整性是否闭合 +- 某个 subagent 为什么会在这一刻被拉起 +- 如何把一次动作自动生成为 Mermaid flowchart + +完整研究文档和版本化说明见: + +- [ObservrityTask 总入口](./ObservrityTask/README.md) +- [V1 总览](./ObservrityTask/10-%E7%B3%BB%E7%BB%9F%E7%89%88%E6%9C%AC/v1/01-%E6%80%BB%E8%A7%88/%E5%BD%93%E5%89%8D%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV1%E6%B7%B1%E5%BA%A6%E7%A0%94%E7%A9%B6%E6%8A%A5%E5%91%8A.md) +- [QueryLoop 全流程详解](./ObservrityTask/10-%E7%B3%BB%E7%BB%9F%E7%89%88%E6%9C%AC/v1/04-%E4%B8%93%E9%A2%98%E7%A0%94%E7%A9%B6/QueryLoop%E5%85%A8%E6%B5%81%E7%A8%8B%E8%AF%A6%E8%A7%A3%EF%BC%88%E6%BA%90%E7%A0%81%E7%89%88%EF%BC%89.md) +- [Subagent 触发因果任务书](./ObservrityTask/10-%E7%B3%BB%E7%BB%9F%E7%89%88%E6%9C%AC/v1/04-%E4%B8%93%E9%A2%98%E7%A0%94%E7%A9%B6/Subagent%E8%A7%A6%E5%8F%91%E5%9B%A0%E6%9E%9C%E5%8F%AF%E8%A7%82%E6%B5%8B%E4%BB%BB%E5%8A%A1%E4%B9%A6.md) + +### V1 当前能力 + +| 能力层 | 当前能力 | +|------|------| +| 事件层 | 主线程、turn、tool、subagent、recovery、snapshot 全链路落盘到 `.observability/events-YYYYMMDD.jsonl` | +| ID 层 | 
`user_action_id / query_id / turn_id / tool_call_id / subagent_id` 已可稳定串联 | +| 成本层 | 区分 `Raw Input / Cache Read / Cache Create / Total Prompt Input / Output / Total Billed` | +| 完整性层 | `query / turn / tool / subagent` 闭合率可统计,当前最新样本主链已闭合 | +| Agent 层 | 可按 `main_thread / session_memory / extract_memories / ...` 拆分成本与流程 | +| 因果层 | `subagent_reason + subagent_trigger_kind + subagent_trigger_detail` 已接入 | +| 阅读层 | 支持 `daily_summary`、`dashboard`、`read_timeline`、`explain_action` | +| 可视化层 | 支持自动生成 Mermaid DAG,直接复制到 Mermaid Live Editor 查看 | + +### 推荐运行方案 + +以前更像“先跑程序,再回头看零散日志”。 +现在推荐直接按下面这套观测驱动流程运行: + +1. 启动 debug 版本 + +```bash +bun run dev +``` + +2. 在 REPL 里真实发送一条 query + +3. 重建本地观测库 + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\rebuild_observability_db.ps1 +``` + +4. 直接生成最近一次动作的自动报告 + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\explain_action.ps1 -Latest +``` + +5. 如果要看日级总览或 dashboard + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\daily_summary.ps1 +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\build_dashboard.ps1 +``` + +这套流程的目标是:**每做一次改动,就能用一条真实 `user_action` 做回放和验收。** + +### 如何从一个 `user_action_id` 得到完整 flowchart + +先查最近几个动作: + +```powershell +E:\claude-code\tools\duckdb\duckdb.exe -json E:\claude-code\.observability\observability_v1.duckdb "select user_action_id, started_at, duration_ms, query_count, subagent_count, total_prompt_input_tokens, total_billed_tokens from user_actions order by started_at_ms desc limit 10;" +``` + +拿到目标 `user_action_id` 后,直接生成 Markdown + Mermaid: + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\explain_action.ps1 -UserActionId 12330098-180b-4063-9f96-af47b7e7c39f +``` + +输出结果会在 `ObservrityTask/` 下生成一份报告,里面自带: + +- Basics +- Query List +- Branch Points +- Mermaid DAG +- Reading SOP + +Mermaid 结构大致会长成这样: + +```mermaid 
+flowchart TD + UA[user_action] + Q0[main_thread query] + T1[turn-1] + S1[spawn session_memory] + Q1[session_memory query] + S2[spawn extract_memories] + Q2[extract_memories query] + UA --> Q0 --> T1 + T1 --> S1 --> Q1 + Q0 --> S2 --> Q2 +``` + +在最新 V1 里,分叉点不再只是“这里开了个 subagent”,而是会直接写出触发原因,例如: + +- `post_sampling_hook / token_threshold_and_tool_threshold` +- `post_sampling_hook / token_threshold_and_natural_break` +- `stop_hook_background / post_turn_background_extraction` + +### 典型阅读路径 + +如果你的目标是“看懂刚刚这次用户动作到底发生了什么”,推荐顺序: + +1. `user_actions`:先找到目标 `user_action_id` +2. `queries`:看这次动作展开成几条主/子链路 +3. `subagents`:看每条子链路为什么被拉起 +4. `turns`:看每条 query 跑了几轮 +5. `tools`:看每轮具体调用了什么工具 +6. `events_raw + snapshots`:看细节和证据 +7. `explain_action.ps1`:把以上内容收敛成一份可读报告 + +### 一次真实样本会看到什么 + +以一次最新样本为例,报告里已经可以直接看到: + +- `session_memory` + - `trigger_kind = post_sampling_hook` + - `trigger_detail = token_threshold_and_tool_threshold` +- `extract_memories` + - `trigger_kind = stop_hook_background` + - `trigger_detail = post_turn_background_extraction` +- 第二次 `session_memory` + - `trigger_kind = post_sampling_hook` + - `trigger_detail = token_threshold_and_natural_break` + +这意味着 V1 现在已经不只是“记录发生了什么”,而是开始具备回答: + +**“为什么在这一刻分叉出这个子 agent”** + ## Contributors diff --git a/scripts/observability/build_dashboard.ps1 b/scripts/observability/build_dashboard.ps1 new file mode 100644 index 0000000000..a3054161d5 --- /dev/null +++ b/scripts/observability/build_dashboard.ps1 @@ -0,0 +1,743 @@ +param( + [string]$Date, + [string]$EventsFile, + [switch]$SkipRebuild +) + +[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$observabilityDir = Join-Path $repoRoot ".observability" +$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb" +$rebuildScript = Join-Path $repoRoot "scripts\observability\rebuild_observability_db.ps1" +$outputPath = Join-Path $repoRoot 
"ObservrityTask\observability_dashboard.html"

# Fail fast: every query below shells out to the bundled DuckDB CLI.
if (-not (Test-Path -LiteralPath $duckdbExe)) {
    throw "DuckDB executable not found at $duckdbExe"
}

# Converts a DateTime to Unix epoch milliseconds (UTC).
function Get-EpochMilliseconds {
    param(
        [datetime]$Value
    )

    return ([DateTimeOffset]$Value.ToUniversalTime()).ToUnixTimeMilliseconds()
}

# Picks the events file to report on.
# Priority: explicit -EventsFile path, then the events-YYYYMMDD.jsonl matching
# -Date (dashes optional), then the newest events-YYYYMMDD.jsonl by name.
# Throws when no candidate file exists or the requested date has no file.
function Resolve-TargetEventsFile {
    param(
        [string]$ObservabilityDir,
        [string]$RequestedDate,
        [string]$RequestedEventsFile
    )

    if (-not [string]::IsNullOrWhiteSpace($RequestedEventsFile)) {
        return (Resolve-Path -LiteralPath $RequestedEventsFile).Path
    }

    $files = Get-ChildItem -LiteralPath $ObservabilityDir -Filter "events-*.jsonl" |
        Where-Object { $_.Name -match '^events-\d{8}\.jsonl$' } |
        Sort-Object Name

    if (-not $files -or $files.Count -eq 0) {
        throw "No events-YYYYMMDD.jsonl files found in $ObservabilityDir"
    }

    if (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
        $normalizedDate = $RequestedDate -replace '-', ''
        $matched = $files | Where-Object { $_.BaseName -eq "events-$normalizedDate" } | Select-Object -First 1
        if (-not $matched) {
            throw "Requested events file not found for date $RequestedDate"
        }
        return $matched.FullName
    }

    # Names sort chronologically, so the last entry is the most recent day.
    return ($files | Select-Object -Last 1).FullName
}

# Returns the report date as yyyy-MM-dd, or $null when it cannot be determined.
# An explicit -Date wins; otherwise the date is parsed out of the events file name.
function Get-TargetDate {
    param(
        [string]$RequestedDate,
        [string]$TargetEventsFile
    )

    if (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
        # Resolve-TargetEventsFile accepts the compact yyyyMMdd spelling as well,
        # but the value returned here is used in event_date filters, which use the
        # dashed form (same shape as the file-name fallback below). Normalize so a
        # compact -Date does not silently match zero rows.
        $compact = [regex]::Match($RequestedDate, '^(\d{4})(\d{2})(\d{2})$')
        if ($compact.Success) {
            return "$($compact.Groups[1].Value)-$($compact.Groups[2].Value)-$($compact.Groups[3].Value)"
        }
        return $RequestedDate
    }

    $match = [regex]::Match([System.IO.Path]::GetFileName($TargetEventsFile), '^events-(\d{4})(\d{2})(\d{2})\.jsonl$')
    if ($match.Success) {
        return "$($match.Groups[1].Value)-$($match.Groups[2].Value)-$($match.Groups[3].Value)"
    }

    return $null
}

# Reads the single build_meta row from the DuckDB file.
# Returns $null when the database is missing, the query fails, or it yields no output.
function Get-BuildMeta {
    param(
        [string]$DuckDbExe,
        [string]$DatabasePath
    )

    if (-not (Test-Path -LiteralPath $DatabasePath)) {
        return $null
    }

    # stderr is discarded on purpose: a missing table simply means "no build yet".
    $raw = & $DuckDbExe -json $DatabasePath "select * from build_meta limit 1;" 2>$null
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($raw)) {
        return $null
    }

    return @($raw | ConvertFrom-Json)[0]
}

# Rebuilds the DuckDB database when it was not built from the current state of
# the target events file (path, size and mtime are compared with build_meta).
# Throws if the database is stale and -SkipRebuild was given; exits the script
# with the rebuild script's exit code when the rebuild fails.
function Ensure-FreshDatabase {
    param(
        [string]$TargetEventsFile,
        [string]$RequestedDate,
        [string]$DuckDbExe,
        [string]$DatabasePath,
        [string]$RebuildScript,
        [switch]$SkipRebuild
    )

    $targetStat = Get-Item -LiteralPath $TargetEventsFile
    $targetMtimeMs = Get-EpochMilliseconds -Value $targetStat.LastWriteTimeUtc
    $buildMeta = Get-BuildMeta -DuckDbExe $DuckDbExe -DatabasePath $DatabasePath
    $isStale =
        ($null -eq $buildMeta) -or
        ($buildMeta.source_events_file -ne $TargetEventsFile) -or
        ([int64]$buildMeta.source_events_size_bytes -ne [int64]$targetStat.Length) -or
        ([int64]$buildMeta.source_events_mtime_ms -ne $targetMtimeMs)

    if (-not $isStale) {
        return
    }

    if ($SkipRebuild) {
        throw "Observability DB is stale for $TargetEventsFile and -SkipRebuild was provided."
    }

    $rebuildArgs = @("-ExecutionPolicy", "Bypass", "-File", $RebuildScript, "-Quiet")
    # NOTE: $EventsFile is not a parameter of this function; it resolves to the
    # script-level -EventsFile parameter through PowerShell's dynamic scoping.
    if (-not [string]::IsNullOrWhiteSpace($EventsFile)) {
        $rebuildArgs += @("-EventsFile", $TargetEventsFile)
    } elseif (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
        $rebuildArgs += @("-Date", $RequestedDate)
    }

    & powershell @rebuildArgs
    if ($LASTEXITCODE -ne 0) {
        exit $LASTEXITCODE
    }
}

# Runs one SQL statement against the dashboard database (script-level $duckdbExe
# and $dbPath) and returns the rows as an array of objects; empty output yields @().
# Throws when the DuckDB CLI reports a non-zero exit code.
function Invoke-DuckDbJson {
    param(
        [string]$Sql
    )

    $raw = & $duckdbExe -json $dbPath $Sql
    if ($LASTEXITCODE -ne 0) {
        throw "DuckDB query failed: $Sql"
    }
    if ([string]::IsNullOrWhiteSpace($raw)) {
        return @()
    }
    return @($raw | ConvertFrom-Json)
}

# Formats a cell value for display: $null becomes the text "null", floating-point
# values are rounded to 6 decimals, everything else is cast to string.
function Get-CellText {
    param(
        [object]$Value
    )

    if ($null -eq $Value) {
        return "null"
    }

    if ($Value -is [double] -or $Value -is [float] -or $Value -is [decimal]) {
        return ([math]::Round([double]$Value, 6)).ToString()
    }

    return [string]$Value
}

# Builds one glossary record (label / meaning / example) for a metric.
function New-MetricMeta {
    param(
        [string]$Label,
        [string]$Meaning,
        [string]$Example
    )

    return [PSCustomObject]@{
        label =
$Label + meaning = $Meaning + example = $Example + } +} + +function ConvertTo-CardHtml { + param( + [string]$MetricKey, + [string]$Label, + [object]$Value + ) + + $safeLabel = [System.Net.WebUtility]::HtmlEncode($Label) + $safeValue = [System.Net.WebUtility]::HtmlEncode((Get-CellText $Value)) + $safeKey = [System.Net.WebUtility]::HtmlEncode($MetricKey) + + return @" + +"@ +} + +function ConvertTo-TableHtml { + param( + [string]$Title, + [object[]]$Rows + ) + + $safeTitle = [System.Net.WebUtility]::HtmlEncode($Title) + if (-not $Rows -or $Rows.Count -eq 0) { + return "

$safeTitle

没有数据。

" + } + + $columns = @($Rows[0].PSObject.Properties.Name) + $thead = ($columns | ForEach-Object { "$([System.Net.WebUtility]::HtmlEncode($_))" }) -join "" + $tbody = foreach ($row in $Rows) { + $cells = foreach ($column in $columns) { + $value = Get-CellText $row.$column + "$([System.Net.WebUtility]::HtmlEncode($value))" + } + "$($cells -join '')" + } + + return @" +
+

$safeTitle

+
+ + $thead + + $($tbody -join "`n") + +
+
+
+"@ +} + +$targetEventsFile = Resolve-TargetEventsFile -ObservabilityDir $observabilityDir -RequestedDate $Date -RequestedEventsFile $EventsFile +$targetDate = Get-TargetDate -RequestedDate $Date -TargetEventsFile $targetEventsFile + +Ensure-FreshDatabase -TargetEventsFile $targetEventsFile -RequestedDate $Date -DuckDbExe $duckdbExe -DatabasePath $dbPath -RebuildScript $rebuildScript -SkipRebuild:$SkipRebuild + +if (-not (Test-Path -LiteralPath $dbPath)) { + throw "DuckDB database not found at $dbPath" +} + +if ([string]::IsNullOrWhiteSpace($targetDate)) { + $targetDate = (Invoke-DuckDbJson "select max(event_date) as event_date from daily_rollups;")[0].event_date +} + +$buildMeta = (Invoke-DuckDbJson "select source_events_file_name, source_events_size_bytes, events_row_count, built_at from build_meta limit 1;")[0] +$rollup = (Invoke-DuckDbJson "select * from daily_rollups where event_date = '$targetDate' limit 1;")[0] +$integrity = (Invoke-DuckDbJson "select * from metrics_integrity_daily where event_date = '$targetDate' limit 1;")[0] +$cost = (Invoke-DuckDbJson "select * from metrics_cost_daily where event_date = '$targetDate' limit 1;")[0] +$loops = (Invoke-DuckDbJson "select * from metrics_loop_daily where event_date = '$targetDate' limit 1;")[0] +$latency = (Invoke-DuckDbJson "select * from metrics_latency_daily where event_date = '$targetDate' limit 1;")[0] +$compression = (Invoke-DuckDbJson "select * from metrics_compression_daily where event_date = '$targetDate' limit 1;")[0] +$toolMetrics = (Invoke-DuckDbJson "select * from metrics_tools_daily where event_date = '$targetDate' limit 1;")[0] +$recovery = (Invoke-DuckDbJson "select * from metrics_recovery_daily where event_date = '$targetDate' limit 1;")[0] +$flags = (Invoke-DuckDbJson "select * from system_flags where event_date = '$targetDate' limit 1;")[0] +$costShare = Invoke-DuckDbJson "select query_source, total_prompt_input_tokens, total_billed_tokens, daily_cost_share from query_source_cost_share_daily 
where event_date = '$targetDate' order by total_billed_tokens desc, query_source asc;" +$agentCosts = Invoke-DuckDbJson "select agent_name, source_group, agent_total_prompt_input_tokens, agent_total_billed_tokens, agent_cost_share, agent_query_count, agent_avg_turns_per_query, agent_avg_loop_iter_end from agent_cost_daily where event_date = '$targetDate' order by agent_total_billed_tokens desc, agent_name asc;" +$recentActions = Invoke-DuckDbJson "select user_action_id, duration_ms, query_count, main_thread_query_count, subagent_count, total_prompt_input_tokens, total_billed_tokens from user_actions where event_date = '$targetDate' order by started_at desc limit 10;" +$subagentReasons = Invoke-DuckDbJson "select subagent_reason, agent_name, subagent_count, avg_duration_ms from subagent_reason_daily where event_date = '$targetDate' order by subagent_count desc, subagent_reason asc;" +$queriesBySource = Invoke-DuckDbJson "select query_source, count(*) as query_count, sum(duration_ms) as total_duration_ms, sum(tool_call_count) as total_tool_calls from queries where started_at like '$targetDate%' group by 1 order by query_count desc, query_source asc;" +$toolByName = Invoke-DuckDbJson "select tool_name, tool_calls, tool_success_rate, tool_failure_rate, tool_avg_duration_ms, tool_p95_duration_ms from tool_calls_by_name order by tool_calls desc, tool_name asc;" +$toolByMode = Invoke-DuckDbJson "select tool_mode, tool_calls from tool_calls_by_mode order by tool_calls desc, tool_mode asc;" +$terminalReasons = Invoke-DuckDbJson "select terminal_reason, query_count from terminal_reason_distribution where event_date = '$targetDate' order by query_count desc, terminal_reason asc;" + +$metricDocs = [ordered]@{ + event_count = (New-MetricMeta "事件数" "当天成功入库的结构化事件总数。" "例:375 代表这批样本里被 ETL 吃进去的事件一共有 375 条。") + user_action_count = (New-MetricMeta "用户动作数" "能被同一个 user_action_id 串起来的用户动作数量。" "例:2 代表今天样本中有 2 次独立用户动作。") + query_count = (New-MetricMeta "Query 数" "当天成功识别出来的 query 生命周期实体数量。" 
"例:6 代表这批样本里一共识别出 6 个 query。")
    turn_count = (New-MetricMeta "Turn 数" "当天成功识别出来的 turn 数量。" "例:12 说明 query 们一共走了 12 轮 turn。")
    tool_calls_total = (New-MetricMeta "工具调用数" "当天工具调用总数。" "例:9 说明主线程和 subagent 合计触发了 9 次工具调用。")
    subagent_count = (New-MetricMeta "Subagent 数" "当天成功识别到的 subagent 生命周期数量。" "例:4 说明共有 4 次子代理任务被创建。")
    strict_query_completion_rate = (New-MetricMeta "严格 Query 完成率" "只按原始 query_id 检查,同一个 query_id 是否同时出现 query.started 和 query.terminated。" "例:如果 terminated 丢了原始 query_id,这个值会偏低。")
    inferred_query_completion_rate = (New-MetricMeta "推断 Query 完成率" "允许使用 effective_query_id 补链后的 query 闭合率。" "例:它告诉你‘分析层是否还能把链串起来’,通常会高于严格口径。")
    query_completeness_gap = (New-MetricMeta "Query 补链差值" "推断 Query 完成率减去原生 Query 完成率。" "例:0.3 代表 ETL 补链帮你多恢复了 30% 的 query 闭合。")
    strict_turn_state_closure_rate = (New-MetricMeta "严格 Turn 闭合率" "只按原始 query_id + turn_id 检查 turn.started / before_turn / after_turn 三件套是否齐全。" "例:最后一轮缺 after_turn 时,这个值就会下降。")
    inferred_turn_state_closure_rate = (New-MetricMeta "推断 Turn 闭合率" "允许用 effective_query_id 做补链后的 turn 闭合率。" "例:它反映 ETL 是否还能拼出 turn 生命周期。")
    turn_closure_gap = (New-MetricMeta "Turn 补链差值" "推断 Turn 闭合率减去原生 Turn 闭合率。" "例:值越大,说明缺 query_id/turn_id 的事件越多。")
    tool_lifecycle_closure_rate = (New-MetricMeta "工具闭合率" "工具调用中,从 started 走到 completed 或 failed 的比例。" "例:1.0 代表工具调用生命周期全部闭合。")
    subagent_lifecycle_closure_rate = (New-MetricMeta "Subagent 闭合率" "subagent 同时出现 spawned 和 completed 的比例。" "例:1.0 代表子代理生命周期全部闭合。")
    snapshot_missing_rate = (New-MetricMeta "Snapshot 缺失率" "事件引用了 snapshot_ref,但本地找不到对应快照文件的比例。" "例:0 代表这批样本没有缺快照。")
    orphan_event_rate = (New-MetricMeta "Orphan Event 率" "无法挂靠到 user_action / query / turn / tool / subagent 的孤儿事件比例。" "例:值高时说明基础埋点键缺失严重。")
    # The curly double quotes below are backtick-escaped: PowerShell treats U+201C/U+201D
    # as string delimiters, so unescaped they would end the literal early and truncate the example.
    raw_input_tokens = (New-MetricMeta "裸 Input Tokens" "模型 usage 里的 input_tokens 原值,不包含 cache read 和 cache create。" "例:你看到它只有 153,并不代表这次输入很小,只代表`“新送进模型、未命中缓存的那一部分`”只有 153。")
    cache_read_tokens = (New-MetricMeta "Cache Read Tokens" "本轮请求从 prompt cache 直接复用的输入 tokens。" "例:如果一个很长的 system 
prompt 被缓存复用,这里会很大,而裸 input 仍可能很小。") + cache_create_tokens = (New-MetricMeta "Cache Create Tokens" "本轮请求为了创建或刷新 prompt cache 而计入的输入 tokens。" "例:第一次跑一段长 prompt 时,这里可能会突然升高。") + total_prompt_input_tokens = (New-MetricMeta "总 Prompt 输入 Tokens" "真正建议优先看的输入成本。= 裸 input + cache read + cache create。" "例:裸 input 153、cache read 245210、cache create 219661,则总 prompt 输入是 465024。") + output_tokens = (New-MetricMeta "Output Tokens" "模型输出的 tokens 总量。" "例:如果 output 只有 3027,而总 prompt 输入是 46.5 万,说明成本瓶颈主要在输入侧。") + total_billed_tokens = (New-MetricMeta "总 Billed Tokens" "总 prompt 输入 tokens 再加 output tokens 后形成的总账单口径。" "例:465024 + 3027 = 468051。") + main_thread_prompt_tokens = (New-MetricMeta "主线程 Prompt 输入" "只统计 `repl_main_thread` 的总 prompt 输入 tokens。" "例:它能让你看清主线程本身有多贵。") + subagent_prompt_tokens = (New-MetricMeta "Subagent Prompt 输入" "只统计非 `repl_main_thread` 的总 prompt 输入 tokens。" "例:如果它远高于主线程,说明 memory / side query 链路在放大成本。") + subagent_amplification_ratio = (New-MetricMeta "Subagent 放大倍率" "subagent 总 prompt 输入 tokens / 主线程总 prompt 输入 tokens。" "例:5.3 代表 memory / side query 等子链路把输入成本放大到了主线程的 5.3 倍。") + avg_prompt_input_per_user_action = (New-MetricMeta "每个用户动作平均 Prompt 输入" "每天总 prompt 输入成本除以当天 user_action 数。" "例:它能快速回答‘平均一次用户动作要吃多少输入成本’。") + avg_billed_per_user_action = (New-MetricMeta "每个用户动作平均 Billed" "每天总 billed tokens 除以当天 user_action 数。" "例:适合看整天的平均账单压力。") + avg_prompt_input_per_query = (New-MetricMeta "每个 Query 平均 Prompt 输入" "每天所有 query 的平均总 prompt 输入成本。" "例:它能区分‘今天 query 变多’和‘单个 query 变贵’。") + avg_billed_per_query = (New-MetricMeta "每个 Query 平均 Billed" "每天所有 query 的平均 billed tokens。" "例:如果这个值升高,说明单个 query 的综合成本变重了。") + submit_to_first_chunk_ms = (New-MetricMeta "Submit 到 First Chunk" "一次用户动作从当前可闭合起点到主线程 first chunk 的平均时长。" "例:这个值高说明用户等到首字节的时间长。") + preprocess_duration_ms = (New-MetricMeta "Preprocess 时长" "从预处理开始到 prompt.build.started 的平均时长。" "例:值高说明消息裁剪、压缩或上下文整理耗时较多。") + prompt_build_duration_ms = (New-MetricMeta "Prompt.Build 时长" "从 prompt.build.started 到 
prompt.build.completed 的平均时长。" "例:值高说明提示词拼装和序列化成本较高。")
    api_first_chunk_latency_ms = (New-MetricMeta "Request 到 First Chunk" "从 API 请求发起到首个流式 chunk 返回的平均时长。" "例:它主要反映模型首字延迟。")
    api_total_duration_ms = (New-MetricMeta "API 总时长" "单轮 request 从发起到流式完成的平均时长。" "例:如果它很高,再看工具/恢复链才能知道慢在哪里。")
    tool_execution_duration_ms = (New-MetricMeta "工具执行平均时长" "所有工具调用的平均执行时长。" "例:值高时通常要看慢工具明细。")
    stop_hook_duration_ms = (New-MetricMeta "Stop Hooks 平均时长" "stop hook 生命周期的平均时长。" "例:值高说明停止逻辑本身在拖慢响应。")
    subagent_duration_ms = (New-MetricMeta "Subagent 生命周期均值" "subagent 从 spawned 到 completed 的平均时长。" "例:值高通常意味着 memory 相关子链路比较慢。")
    user_action_e2e_duration_ms = (New-MetricMeta "User Action E2E" "一次用户动作从最早事件到最晚事件的端到端平均时长。" "例:这是用户真正感受到的总耗时。")
    daily_avg_turns_per_query = (New-MetricMeta "每日平均 Turn/Query" "按 query 统计的平均 turn 数。" "例:值高可能意味着更常见的多轮循环。")
    daily_avg_loop_iter_end = (New-MetricMeta "每日平均 Loop 终点" "每个 query 的最大 loop_iter 再求平均。" "例:它能区分‘prompt 大’和‘因为多轮 loop 导致成本高’。")
    daily_p95_loop_iter_end = (New-MetricMeta "每日 Loop 终点 P95" "query_max_loop_iter 的 P95。" "例:它比平均值更容易看出少数长链 loop。")
    daily_queries_with_loop_iter_gt_1_rate = (New-MetricMeta "多轮 Query 占比" "query_max_loop_iter > 1 的 query 占比。" "例:0.6 代表 60% 的 query 至少循环了 2 轮。")
    preprocess_tokens_before_total = (New-MetricMeta "Preprocess 前 Tokens" "进入上下文治理前的估算 token 总量。" "例:它是判断压缩压力的起点。")
    preprocess_tokens_after_total = (New-MetricMeta "Preprocess 后 Tokens" "经过上下文治理后的估算 token 总量。" "例:和前值对比可以看出压缩是否生效。")
    tokens_saved_total = (New-MetricMeta "总节省 Tokens" "预处理阶段累计节省的 tokens 总量。" "例:如果是 0,代表这批样本里压缩动作没有明显节省。")
    compression_gain_ratio = (New-MetricMeta "压缩收益率" "preprocess 前后 token 总量的节省比例。" "例:0.2 代表 preprocess 后上下文整体缩短了 20%。")
    # Event name spelled "autocompact" to agree with the metric key and label.
    # NOTE(review): confirm against the emitter that the event is messages.autocompact.completed.
    autocompact_trigger_rate = (New-MetricMeta "Autocompact 触发率" "messages.autocompact.completed 中 compacted = true 的比例。" "例:值高说明上下文压力大,经常需要自动压缩。")
    history_snip_gate_state = (New-MetricMeta "HISTORY_SNIP Gate 状态" "当前样本里是否观察到 HISTORY_SNIP 命中。" "例:‘样本中观察到命中’说明这批日志里 gate 至少生效过一次。")
    contextCollapse_enabled_gauge
= (New-MetricMeta "contextCollapse 启用状态" "当前按源码真相给出。0 代表 disabled / stub,不应被解释成真实已启用。" "例:即使日志里有相关痕迹,这里仍必须显示 0。") + tool_success_rate = (New-MetricMeta "工具成功率" "工具调用中 success = true 的比例。" "例:如果它下降,就该优先排查失败最多的工具。") + tool_failure_rate = (New-MetricMeta "工具失败率" "工具调用中 failed 的比例。" "例:它和工具成功率一起决定工具层健康度。") + tool_avg_duration_ms = (New-MetricMeta "工具平均时长" "按所有工具调用计算的平均执行时长。" "例:适合快速判断工具层是否整体变慢。") + tool_p95_duration_ms = (New-MetricMeta "工具 P95 时长" "工具执行时长的 P95。" "例:它比平均值更容易暴露长尾慢调用。") + tools_per_query = (New-MetricMeta "每个 Query 的工具数" "平均每个 query 触发多少次工具调用。" "例:值高说明 query 更依赖工具链。") + tools_per_subagent = (New-MetricMeta "每个 Subagent 的工具数" "平均每个 subagent 触发多少次工具调用。" "例:它能看出子代理是否重度依赖工具。") + tool_followup_turn_ratio = (New-MetricMeta "工具后续驱动率" "包含 tool_use 的 turn 中,最终 transition_out = next_turn 的比例。" "例:值高说明工具确实在驱动下一轮 loop。") + prompt_too_long_recovery_attempts = (New-MetricMeta "Prompt Too Long 恢复次数" "恢复链里与 prompt_too_long 相关的尝试次数。" "例:如果这个值持续升高,说明 prompt 治理本身有问题。") + max_output_tokens_recovery_attempts = (New-MetricMeta "Max Output Tokens 恢复次数" "恢复链里与 max_output_tokens 相关的尝试次数。" "例:值高说明输出上限策略经常撞线。") + token_budget_continue_rate = (New-MetricMeta "Token Budget Continue Rate" "token_budget.decision 中 action = continue 的比例。" "例:值高说明系统经常需要续跑才能完成响应。") + stop_hook_block_rate = (New-MetricMeta "Stop Hook Block Rate" "stop hook 最终阻止继续执行的比例。" "例:值高时说明停止逻辑频繁打断主链。") + api_error_rate = (New-MetricMeta "API Error Rate" "API 调用阶段错误的比例。" "例:这个值非零时要优先检查模型请求和网络错误。") + tool_failure_terminal_rate = (New-MetricMeta "Tool Failure Terminal Rate" "工具失败后直接导致 query 终止的比例。" "例:值高说明工具失败很难恢复。") +} + +$overviewCards = @( + (ConvertTo-CardHtml "event_count" "事件数" $rollup.event_count), + (ConvertTo-CardHtml "user_action_count" "用户动作数" $rollup.user_action_count), + (ConvertTo-CardHtml "query_count" "Query 数" $rollup.query_count), + (ConvertTo-CardHtml "turn_count" "Turn 数" $rollup.turn_count), + (ConvertTo-CardHtml "tool_calls_total" "工具调用数" $toolMetrics.tool_calls_total), + (ConvertTo-CardHtml 
"subagent_count" "Subagent 数" $rollup.subagent_count) +) -join "`n" + +$integrityCards = @( + (ConvertTo-CardHtml "strict_query_completion_rate" "严格 Query 完成率" $integrity.strict_query_completion_rate), + (ConvertTo-CardHtml "inferred_query_completion_rate" "推断 Query 完成率" $integrity.inferred_query_completion_rate), + (ConvertTo-CardHtml "query_completeness_gap" "Query 补链差值" $integrity.query_completeness_gap), + (ConvertTo-CardHtml "strict_turn_state_closure_rate" "严格 Turn 闭合率" $integrity.strict_turn_state_closure_rate), + (ConvertTo-CardHtml "inferred_turn_state_closure_rate" "推断 Turn 闭合率" $integrity.inferred_turn_state_closure_rate), + (ConvertTo-CardHtml "turn_closure_gap" "Turn 补链差值" $integrity.turn_closure_gap), + (ConvertTo-CardHtml "tool_lifecycle_closure_rate" "工具闭合率" $integrity.tool_lifecycle_closure_rate), + (ConvertTo-CardHtml "subagent_lifecycle_closure_rate" "Subagent 闭合率" $integrity.subagent_lifecycle_closure_rate), + (ConvertTo-CardHtml "snapshot_missing_rate" "Snapshot 缺失率" $integrity.snapshot_missing_rate), + (ConvertTo-CardHtml "orphan_event_rate" "Orphan Event 率" $integrity.orphan_event_rate) +) -join "`n" + +$costDailyTotalCards = @( + (ConvertTo-CardHtml "total_prompt_input_tokens" "总 Prompt 输入 Tokens" $cost.user_action_total_prompt_input_tokens), + (ConvertTo-CardHtml "total_billed_tokens" "总 Billed Tokens" $cost.user_action_total_billed_tokens), + (ConvertTo-CardHtml "output_tokens" "Output Tokens" $cost.user_action_total_output_tokens) +) -join "`n" + +$costStructureCards = @( + (ConvertTo-CardHtml "raw_input_tokens" "裸 Input Tokens" $cost.user_action_total_raw_input_tokens), + (ConvertTo-CardHtml "cache_read_tokens" "Cache Read Tokens" $cost.user_action_total_cache_read_tokens), + (ConvertTo-CardHtml "cache_create_tokens" "Cache Create Tokens" $cost.user_action_total_cache_create_tokens) +) -join "`n" + +$costChainCards = @( + (ConvertTo-CardHtml "main_thread_prompt_tokens" "主线程 Prompt 输入" $cost.main_thread_total_prompt_input_tokens), + 
(ConvertTo-CardHtml "subagent_prompt_tokens" "Subagent Prompt 输入" $cost.subagent_total_prompt_input_tokens), + (ConvertTo-CardHtml "subagent_amplification_ratio" "Subagent 放大倍率" $cost.subagent_amplification_ratio) +) -join "`n" + +$costAverageCards = @( + (ConvertTo-CardHtml "avg_prompt_input_per_user_action" "每个用户动作平均 Prompt 输入" $cost.avg_total_prompt_input_tokens_per_user_action), + (ConvertTo-CardHtml "avg_billed_per_user_action" "每个用户动作平均 Billed" $cost.avg_total_billed_tokens_per_user_action), + (ConvertTo-CardHtml "avg_prompt_input_per_query" "每个 Query 平均 Prompt 输入" $cost.avg_total_prompt_input_tokens_per_query), + (ConvertTo-CardHtml "avg_billed_per_query" "每个 Query 平均 Billed" $cost.avg_total_billed_tokens_per_query) +) -join "`n" + +$loopCards = @( + (ConvertTo-CardHtml "daily_avg_turns_per_query" "每日平均 Turn/Query" $loops.daily_avg_turns_per_query), + (ConvertTo-CardHtml "daily_avg_loop_iter_end" "每日平均 Loop 终点" $loops.daily_avg_loop_iter_end), + (ConvertTo-CardHtml "daily_p95_loop_iter_end" "每日 Loop 终点 P95" $loops.daily_p95_loop_iter_end), + (ConvertTo-CardHtml "daily_queries_with_loop_iter_gt_1_rate" "多轮 Query 占比" $loops.daily_queries_with_loop_iter_gt_1_rate) +) -join "`n" + +$latencyCards = @( + (ConvertTo-CardHtml "submit_to_first_chunk_ms" "Submit -> First Chunk" $latency.submit_to_first_chunk_ms), + (ConvertTo-CardHtml "preprocess_duration_ms" "Preprocess" $latency.preprocess_duration_ms), + (ConvertTo-CardHtml "prompt_build_duration_ms" "Prompt.Build" $latency.prompt_build_duration_ms), + (ConvertTo-CardHtml "api_first_chunk_latency_ms" "Request -> First Chunk" $latency.api_first_chunk_latency_ms), + (ConvertTo-CardHtml "api_total_duration_ms" "API 总时长" $latency.api_total_duration_ms), + (ConvertTo-CardHtml "tool_execution_duration_ms" "工具执行平均时长" $latency.tool_execution_duration_ms), + (ConvertTo-CardHtml "stop_hook_duration_ms" "Stop Hooks 平均时长" $latency.stop_hook_duration_ms), + (ConvertTo-CardHtml "subagent_duration_ms" "Subagent 生命周期均值" 
$latency.subagent_duration_ms), + (ConvertTo-CardHtml "user_action_e2e_duration_ms" "User Action E2E" $latency.user_action_e2e_duration_ms) +) -join "`n" + +$compressionCards = @( + (ConvertTo-CardHtml "preprocess_tokens_before_total" "Preprocess 前 Tokens" $compression.preprocess_tokens_before_total), + (ConvertTo-CardHtml "preprocess_tokens_after_total" "Preprocess 后 Tokens" $compression.preprocess_tokens_after_total), + (ConvertTo-CardHtml "tokens_saved_total" "总节省 Tokens" $compression.tokens_saved_total), + (ConvertTo-CardHtml "compression_gain_ratio" "压缩收益率" $compression.compression_gain_ratio), + (ConvertTo-CardHtml "autocompact_trigger_rate" "Autocompact 触发率" $compression.autocompact_trigger_rate), + (ConvertTo-CardHtml "history_snip_gate_state" "HISTORY_SNIP Gate" $flags.history_snip_gate_state), + (ConvertTo-CardHtml "contextCollapse_enabled_gauge" "contextCollapse 启用状态" $flags.contextCollapse_enabled_gauge) +) -join "`n" + +$toolCards = @( + (ConvertTo-CardHtml "tool_success_rate" "工具成功率" $toolMetrics.tool_success_rate), + (ConvertTo-CardHtml "tool_failure_rate" "工具失败率" $toolMetrics.tool_failure_rate), + (ConvertTo-CardHtml "tool_avg_duration_ms" "工具平均时长" $toolMetrics.tool_avg_duration_ms), + (ConvertTo-CardHtml "tool_p95_duration_ms" "工具 P95 时长" $toolMetrics.tool_p95_duration_ms), + (ConvertTo-CardHtml "tools_per_query" "每个 Query 的工具数" $toolMetrics.tools_per_query), + (ConvertTo-CardHtml "tools_per_subagent" "每个 Subagent 的工具数" $toolMetrics.tools_per_subagent), + (ConvertTo-CardHtml "tool_followup_turn_ratio" "工具后续驱动率" $toolMetrics.tool_followup_turn_ratio) +) -join "`n" + +$recoveryCards = @( + (ConvertTo-CardHtml "prompt_too_long_recovery_attempts" "Prompt Too Long 恢复次数" $recovery.prompt_too_long_recovery_attempts), + (ConvertTo-CardHtml "max_output_tokens_recovery_attempts" "Max Output Tokens 恢复次数" $recovery.max_output_tokens_recovery_attempts), + (ConvertTo-CardHtml "token_budget_continue_rate" "Token Budget Continue Rate" 
$recovery.token_budget_continue_rate), + (ConvertTo-CardHtml "stop_hook_block_rate" "Stop Hook Block Rate" $recovery.stop_hook_block_rate), + (ConvertTo-CardHtml "api_error_rate" "API Error Rate" $recovery.api_error_rate), + (ConvertTo-CardHtml "tool_failure_terminal_rate" "Tool Failure Terminal Rate" $recovery.tool_failure_terminal_rate) +) -join "`n" + +$glossarySections = foreach ($entry in $metricDocs.GetEnumerator()) { + $key = [System.Net.WebUtility]::HtmlEncode($entry.Key) + $label = [System.Net.WebUtility]::HtmlEncode($entry.Value.label) + $meaning = [System.Net.WebUtility]::HtmlEncode($entry.Value.meaning) + $example = [System.Net.WebUtility]::HtmlEncode($entry.Value.example) + @" +
+

$label

+

含义:$meaning

+

举例:$example

+
+"@ +} + +$html = @" + + + + + + 本地可观测系统 V1 Dashboard + + + +
+
+

本地可观测系统 V1

+

这版 dashboard 按方向 A 执行清单把指标分成更稳定的分析层级。成本侧按 每日总量成本结构主/子链路日均/效率 展示;完整性侧同时提供 原生口径推断口径补链差值;loop 指标单独拆开,用来区分“prompt 大”还是“多轮循环导致贵”。

+
+
日期
$([System.Net.WebUtility]::HtmlEncode((Get-CellText $targetDate)))
+
源文件
$([System.Net.WebUtility]::HtmlEncode((Get-CellText $buildMeta.source_events_file_name)))
+
文件大小(bytes)
$([System.Net.WebUtility]::HtmlEncode((Get-CellText $buildMeta.source_events_size_bytes)))
+
建库时间
$([System.Net.WebUtility]::HtmlEncode((Get-CellText $buildMeta.built_at)))
+
+
+ +
+

概览

+
+ $overviewCards +
+
+ +
+

完整性

+
+ $integrityCards +
+
+ +
+

成本 - 每日总量

+
+ $costDailyTotalCards +
+
+ +
+

成本 - 结构拆分

+
+ $costStructureCards +
+
+ +
+

成本 - 主/子链路

+
+ $costChainCards +
+
+ +
+

成本 - 日均/效率

+
+ $costAverageCards +
+
+ +
+

Loop / Turn

+
+ $loopCards +
+
+ +
+

延迟

+
+ $latencyCards +
+
+ +
+
+

压缩与上下文治理

+
+ $compressionCards +
+
+
+

工具与恢复

+
+ $toolCards + $recoveryCards +
+
+
+ + $(ConvertTo-TableHtml "按 Source 成本拆分" $costShare) + $(ConvertTo-TableHtml "按 Agent/Source 成本拆分" $agentCosts) + $(ConvertTo-TableHtml "最近用户动作" $recentActions) + $(ConvertTo-TableHtml "按 Source Query 概览" $queriesBySource) + $(ConvertTo-TableHtml "Subagent Reason 明细" $subagentReasons) + $(ConvertTo-TableHtml "工具按名称统计" $toolByName) + $(ConvertTo-TableHtml "工具按模式统计" $toolByMode) + $(ConvertTo-TableHtml "终止原因分布" $terminalReasons) + +
+

指标说明

+

每张卡片右上角的“说明”都会跳到这里。这里优先解释最容易误解、最容易影响判断的指标,尤其是 token 成本口径。

+
+ $($glossarySections -join "`n") +
+
+
+ + +"@ + +Set-Content -LiteralPath $outputPath -Value $html -Encoding UTF8 +Write-Output $outputPath diff --git a/scripts/observability/build_duckdb_etl.ts b/scripts/observability/build_duckdb_etl.ts new file mode 100644 index 0000000000..7ae3e020cf --- /dev/null +++ b/scripts/observability/build_duckdb_etl.ts @@ -0,0 +1,2592 @@ +import { createHash } from "node:crypto" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + statSync, + writeFileSync, +} from "node:fs" +import { basename, join, relative, resolve } from "node:path" + +type JsonValue = + | null + | boolean + | number + | string + | JsonValue[] + | { [key: string]: JsonValue } + +type EventRecord = { + schema_version?: string + ts_wall: string + ts_mono_ms?: number | null + level?: string | null + event: string + component?: string | null + session_id?: string | null + conversation_id?: string | null + user_action_id?: string | null + query_id?: string | null + turn_id?: string | null + loop_iter?: number | null + parent_turn_id?: string | null + subagent_id?: string | null + subagent_type?: string | null + subagent_reason?: string | null + subagent_trigger_kind?: string | null + subagent_trigger_detail?: string | null + query_source?: string | null + request_id?: string | null + tool_call_id?: string | null + span_id?: string | null + parent_span_id?: string | null + cwd?: string | null + git_branch?: string | null + build_version?: string | null + payload?: Record | null +} + +type QuerySpan = { + queryId: string + userActionId: string | null + querySource: string | null + subagentId: string | null + startMs: number + endMs: number +} + +type SnapshotInfo = { + snapshotRef: string + fileName: string + relativePath: string + absolutePath: string + exists: boolean + sizeBytes: number | null + sha256: string | null + referencedCount: number + firstEventTs: string | null + lastEventTs: string | null + category: string | null +} + +type UsageFact = { + usage_fact_id: string + event_date: 
string
  ts_wall: string
  ts_wall_ms: number | null
  user_action_id: string | null
  query_id: string | null
  query_source: string | null
  subagent_id: string | null
  subagent_reason: string | null
  agent_name: string | null
  source_group: string | null
  source_kind: string
  source_ref: string | null
  request_id: string | null
  assistant_message_count: number | null
  is_authoritative: boolean
  input_tokens: number
  output_tokens: number
  cache_read_input_tokens: number
  cache_creation_input_tokens: number
  total_prompt_input_tokens: number
  total_billed_tokens: number
}

// Well-known locations, all derived from the repository root (two levels above
// this script's directory, taken from Bun's `import.meta.dir`).
const repoRoot = resolve(import.meta.dir, "..", "..")
const observabilityDir = join(repoRoot, ".observability")
const snapshotsDir = join(observabilityDir, "snapshots")
const duckdbExe = join(repoRoot, "tools", "duckdb", "duckdb.exe")
const databasePath = join(observabilityDir, "observability_v1.duckdb")
const sqlPath = join(observabilityDir, "load_observability_v1.sql")

/** Prints `message` to stderr and terminates the process with exit code 1. */
function fail(message: string): never {
  console.error(message)
  process.exit(1)
}

/**
 * Extracts the `--events-file <path>` and `--date <value>` options from `argv`.
 * Unrecognized arguments are ignored; if a flag is repeated the last value wins.
 */
function parseArgs(argv: string[]): { eventsFile?: string; date?: string } {
  const parsed: { eventsFile?: string; date?: string } = {}
  for (let index = 0; index < argv.length; index += 1) {
    const current = argv[index]
    if (current === "--events-file") {
      parsed.eventsFile = argv[index + 1]
      index += 1 // skip the consumed value
    } else if (current === "--date") {
      parsed.date = argv[index + 1]
      index += 1 // skip the consumed value
    }
  }
  return parsed
}

/**
 * Resolves the events file to load.
 *
 * Priority: explicit `eventsFile`, then `events-YYYYMMDD.jsonl` for `date`
 * (dashes optional), then the lexicographically last `events-YYYYMMDD.jsonl`
 * in the observability directory. Exits via `fail` when nothing matches.
 */
function resolveEventsPath(args: { eventsFile?: string; date?: string }): string {
  if (args.eventsFile) {
    return resolve(args.eventsFile)
  }

  const files = readdirSync(observabilityDir)
    .filter(fileName => /^events-\d{8}\.jsonl$/u.test(fileName))
    .sort()

  if (files.length === 0) {
    fail(`No events-YYYYMMDD.jsonl files found in ${observabilityDir}`)
  }

  if (args.date) {
    const normalizedDate = args.date.replace(/-/gu, "")
    const fileName = `events-${normalizedDate}.jsonl`
    if (!files.includes(fileName)) {
      fail(`Requested events file not found for date ${args.date}`)
    }
    return join(observabilityDir, fileName)
  }

  // Names embed the date, so the last sorted entry is the newest day.
  return join(observabilityDir, files.at(-1)!)
}

/**
 * Parses a text made of JSON objects written back to back (optionally separated
 * by whitespace/newlines) into event records, in file order.
 */
function parseConcatenatedEvents(text: string): EventRecord[] {
  const values: EventRecord[] = []
  let index = 0
  while (index < text.length) {
    // Skip whitespace between objects.
    while (index < text.length && /\s/u.test(text[index]!)) {
      index += 1
    }
    if (index >= text.length) {
      break
    }
    const { object, nextIndex } = readOneObject(text, index)
    values.push(object as EventRecord)
    index = nextIndex
  }
  return values
}

/**
 * Reads one `{...}` JSON object starting at `startIndex` by tracking brace depth
 * (braces inside string literals are ignored) and returns the parsed value plus
 * the index just past its closing brace.
 *
 * @throws Error when the text ends before the object is closed; `JSON.parse`
 *   errors propagate for malformed content.
 */
function readOneObject(text: string, startIndex: number): { object: JsonValue; nextIndex: number } {
  let depth = 0
  let inString = false
  let escaped = false
  let index = startIndex

  for (; index < text.length; index += 1) {
    const char = text[index]!

    if (inString) {
      if (escaped) {
        escaped = false
      } else if (char === "\\") {
        escaped = true
      } else if (char === '"') {
        inString = false
      }
      continue
    }

    if (char === '"') {
      inString = true
      continue
    }
    if (char === "{") {
      depth += 1
      continue
    }
    if (char === "}") {
      depth -= 1
      if (depth === 0) {
        return {
          object: JSON.parse(text.slice(startIndex, index + 1)) as JsonValue,
          nextIndex: index + 1,
        }
      }
    }
  }

  throw new Error(`Unterminated JSON object at index ${startIndex}`)
}

/** Parses a timestamp string with `Date.parse`; returns null for empty or unparseable input. */
function toEpochMs(value: string | null | undefined): number | null {
  if (!value) {
    return null
  }
  const parsed = Date.parse(value)
  return Number.isNaN(parsed) ? null : parsed
}

/**
 * Coerces a finite number or a non-blank numeric string to a number.
 * Anything else (including NaN/Infinity) yields 0.
 */
function toNumber(value: unknown): number {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value
  }
  if (typeof value === "string" && value.trim().length > 0) {
    const parsed = Number(value)
    return Number.isFinite(parsed) ? parsed : 0
  }
  return 0
}

/**
 * Renders a value as a SQL literal: NULL for null/undefined and non-finite
 * numbers, bare numbers, TRUE/FALSE for booleans, and a single-quoted string
 * (embedded quotes doubled) for everything else.
 */
function sqlLiteral(value: unknown): string {
  if (value === null || value === undefined) {
    return "NULL"
  }
  if (typeof value === "number") {
    return Number.isFinite(value) ? String(value) : "NULL"
  }
  if (typeof value === "boolean") {
    return value ? "TRUE" : "FALSE"
  }
  const normalized = String(value).replace(/'/g, "''")
  return `'${normalized}'`
}

/** Serializes a value with `JSON.stringify`, mapping null/undefined to null. */
function compactJson(value: unknown): string | null {
  if (value === undefined || value === null) {
    return null
  }
  return JSON.stringify(value)
}

/** Turns a repo-relative, "/"-separated snapshot reference into an absolute path. */
function jsonPathToAbsolute(snapshotRef: string): string {
  return join(repoRoot, ...snapshotRef.split("/"))
}

/**
 * Walks a JSON value recursively and adds every string that starts with
 * ".observability/snapshots/" to `refs`.
 */
function collectSnapshotRefs(value: JsonValue, refs: Set<string>): void {
  if (typeof value === "string" && value.startsWith(".observability/snapshots/")) {
    refs.add(value)
    return
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      collectSnapshotRefs(item, refs)
    }
    return
  }
  if (value && typeof value === "object") {
    for (const item of Object.values(value)) {
      collectSnapshotRefs(item, refs)
    }
  }
}

/**
 * Builds one time span per explicit `query_id`, covering the earliest and latest
 * parseable `ts_wall` of its events. The first non-empty user action, query
 * source and subagent id seen for the query are kept.
 */
function buildExplicitQuerySpans(events: EventRecord[]): QuerySpan[] {
  const spans = new Map<string, QuerySpan>()

  for (const event of events) {
    if (!event.query_id) {
      continue
    }
    const tsMs = toEpochMs(event.ts_wall)
    if (tsMs === null) {
      continue
    }
    const existing = spans.get(event.query_id)
    if (existing) {
      existing.startMs = Math.min(existing.startMs, tsMs)
      existing.endMs = Math.max(existing.endMs, tsMs)
      existing.userActionId ||= event.user_action_id ?? null
      existing.querySource ||= event.query_source ?? null
      existing.subagentId ||= event.subagent_id ?? null
      continue
    }
    spans.set(event.query_id, {
      queryId: event.query_id,
      userActionId: event.user_action_id ?? null,
      querySource: event.query_source ?? null,
      subagentId: event.subagent_id ?? null,
      startMs: tsMs,
      endMs: tsMs,
    })
  }

  return [...spans.values()]
}

/**
 * Returns the event's own `query_id`, or infers one for events that lack it.
 *
 * Inference considers spans of the same user action whose query source and
 * subagent id do not contradict the event's, and whose time range (widened by
 * 5 seconds on both sides) contains the event. With several candidates, the
 * span whose start or end is closest to the event wins; ties keep the earliest
 * candidate in `spans` order. Returns null when nothing can be inferred.
 */
function resolveEffectiveQueryId(event: EventRecord, spans: QuerySpan[]): string | null {
  if (event.query_id) {
    return event.query_id
  }
  const tsMs = toEpochMs(event.ts_wall)
  if (tsMs === null || !event.user_action_id) {
    return null
  }

  const matches = spans.filter(span => {
    if (span.userActionId !== event.user_action_id) {
      return false
    }
    if (event.query_source && span.querySource && span.querySource !== event.query_source) {
      return false
    }
    if (event.subagent_id && span.subagentId && span.subagentId !== event.subagent_id) {
      return false
    }
    return tsMs >= span.startMs - 5_000 && tsMs <= span.endMs + 5_000
  })

  if (matches.length === 0) {
    return null
  }

  const distanceTo = (span: QuerySpan): number =>
    Math.min(Math.abs(tsMs - span.startMs), Math.abs(tsMs - span.endMs))

  // Linear scan for the nearest span; strict "<" preserves first-wins tie-breaking.
  let nearest = matches[0]!
  let nearestDistance = distanceTo(nearest)
  for (const candidate of matches.slice(1)) {
    const candidateDistance = distanceTo(candidate)
    if (candidateDistance < nearestDistance) {
      nearest = candidate
      nearestDistance = candidateDistance
    }
  }

  return nearest.queryId
}

/** Returns the hex-encoded SHA-256 digest of the file at `path`. */
function sha256Hex(path: string): string {
  const hash = createHash("sha256")
  hash.update(readFileSync(path))
  return hash.digest("hex")
}

/**
 * Classifies a snapshot by substrings of its lower-cased file name.
 * Rules are checked in order and the first hit wins; null when none match.
 */
function snapshotCategory(fileName: string): string | null {
  const lowered = fileName.toLowerCase()
  const rules: Array<[string, string]> = [
    ["request", "request"],
    ["response", "response"],
    ["state.snapshot.before_turn", "state_before_turn"],
    ["state.snapshot.after_turn", "state_after_turn"],
    ["state-before", "state_before"],
    ["state-after", "state_after"],
    ["input-raw", "input_raw"],
    ["input-messages", "input_messages"],
    ["messages.", "messages_stage"],
  ]
  const hit = rules.find(([needle]) => lowered.includes(needle))
  return hit ? hit[1] : null
}

/** Returns `value[key]` when `value` is a plain JSON object and the property is a string; otherwise null. */
function inferString(value: JsonValue | undefined, key: string): string | null {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return null
  }
  const current = value[key]
  return typeof current === "string" ? current : null
}

/** Returns `value[key]` when `value` is a plain JSON object and the property is a number; otherwise null. */
function inferNumber(value: JsonValue | undefined, key: string): number | null {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return null
  }
  const current = value[key]
  return typeof current === "number" ? current : null
}

/** Returns `value[key]` when `value` is a plain JSON object and the property is a boolean; otherwise null. */
function inferBoolean(value: JsonValue | undefined, key: string): boolean | null {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return null
  }
  const current = value[key]
  return typeof current === "boolean" ? current : null
}

/** Returns `value[key]` when both `value` and the property are plain (non-array) JSON objects; otherwise null. */
function inferObject(
  value: JsonValue | undefined,
  key: string,
): Record<string, JsonValue> | null {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return null
  }
  const current = value[key]
  if (!current || typeof current !== "object" || Array.isArray(current)) {
    return null
  }
  return current as Record<string, JsonValue>
}

/**
 * Resolves the subagent reason from, in order: the top-level field, the payload,
 * the subagent type, the query source, and finally the literal "unknown".
 * The legacy value "side_question" is reported as "side_query".
 */
function resolveSubagentReason(event: EventRecord): string | null {
  const resolved =
    event.subagent_reason ??
    inferString(event.payload, "subagent_reason") ??
    event.subagent_type ??
    event.query_source ??
    "unknown"
  return resolved === "side_question" ? "side_query" : resolved
}

/** Returns the subagent trigger kind from the top-level field or, failing that, the payload. */
function resolveSubagentTriggerKind(event: EventRecord): string | null {
  return (
    event.subagent_trigger_kind ??
    inferString(event.payload, "subagent_trigger_kind") ??
    null
  )
}

/** Returns the subagent trigger detail from the top-level field or, failing that, the payload. */
function resolveSubagentTriggerDetail(event: EventRecord): string | null {
  return (
    event.subagent_trigger_detail ??
    inferString(event.payload, "subagent_trigger_detail") ??
    null
  )
}

/** Returns the payload's `subagent_trigger_payload` object, or null when absent or not an object. */
function resolveSubagentTriggerPayload(
  event: EventRecord,
): Record<string, JsonValue> | null {
  return inferObject(event.payload, "subagent_trigger_payload")
}

/**
 * Derives a display name for the agent behind a query.
 *
 * The first non-nullish of reason, type and query source is used, then
 * normalized: "side_question" -> "side_query", "repl_main_thread*" ->
 * "main_thread", "agent:builtin:<name>" -> "<name>", "agent:custom" ->
 * "custom_agent". Returns null when no candidate is available.
 */
function normalizeAgentName(
  querySource: string | null | undefined,
  subagentType: string | null | undefined,
  subagentReason: string | null | undefined,
): string | null {
  const candidate = subagentReason ?? subagentType ?? querySource
  if (!candidate) {
    return null
  }
  if (candidate === "side_question") {
    return "side_query"
  }
  if (candidate.startsWith("repl_main_thread")) {
    return "main_thread"
  }
  if (candidate.startsWith("agent:builtin:")) {
    return candidate.slice("agent:builtin:".length)
  }
  if (candidate === "agent:custom") {
    return "custom_agent"
  }
  return candidate
}

/**
 * Buckets a query into a coarse source group: "main_thread", "memory",
 * "side_query", "agent", "subagent" or "other". Returns null when neither an
 * agent name nor a query source is known. Checks run in that order, so the
 * first matching group wins.
 */
function normalizeSourceGroup(
  querySource: string | null | undefined,
  subagentId: string | null | undefined,
  agentName: string | null | undefined,
): string | null {
  if (!agentName && !querySource) {
    return null
  }
  if (agentName === "main_thread" || querySource?.startsWith("repl_main_thread")) {
    return "main_thread"
  }
  if (
    agentName &&
    [
      "extract_memories",
      "session_memory",
      "session_search",
      "away_summary",
      "agent_summary",
      "memdir_relevance",
    ].includes(agentName)
  ) {
    return "memory"
  }
  // "session_search" used to be listed here as well, but it is already claimed
  // by the memory group above, so that entry could never match.
  if (
    agentName &&
    ["side_query", "permission_explainer", "model_validation"].includes(agentName)
  ) {
    return "side_query"
  }
  if (querySource?.startsWith("agent:") || agentName === "custom_agent") {
    return "agent"
  }
  if (subagentId) {
    return "subagent"
  }
  return "other"
}

/**
 * Builds a single multi-row `INSERT INTO ... VALUES` statement.
 * Each row contributes its values for `columns`, rendered with `sqlLiteral`
 * (missing keys become NULL). Returns an empty string when there are no rows.
 */
function createInsertSql(
  tableName: string,
  columns: string[],
  rows: Array<Record<string, unknown>>,
): string {
  if (rows.length === 0) {
    return ""
  }
  const values = rows
    .map(row => `(${columns.map(column => sqlLiteral(row[column])).join(", ")})`)
    .join(",\n")
  return `INSERT INTO ${tableName} (${columns.join(", ")}) VALUES\n${values};\n`
}
+/**
+ * Summarises the token usage recorded in a response snapshot file.
+ *
+ * `snapshotRef` is turned into an absolute path with `jsonPathToAbsolute`.
+ * The file is parsed as JSON and its `assistantMessages` array is scanned:
+ * entries without a `message` are ignored, the first non-empty string
+ * `message.id` found is reported as `requestId`, and for each usage counter
+ * the largest value seen across messages is kept (values pass through
+ * `toNumber`).
+ *
+ * @param snapshotRef Reference to the snapshot file.
+ * @returns The aggregated counters, or `null` when the file does not exist,
+ *   reading or parsing it throws, or every counter is zero.
+ */
+function extractResponseUsage(snapshotRef: string): {
+  requestId: string | null
+  assistantMessageCount: number
+  inputTokens: number
+  outputTokens: number
+  cacheReadInputTokens: number
+  cacheCreationInputTokens: number
+} | null {
+  const absolutePath = jsonPathToAbsolute(snapshotRef)
+  if (!existsSync(absolutePath)) {
+    return null
+  }
+
+  try {
+    const parsed = JSON.parse(readFileSync(absolutePath, "utf8")) as {
+      assistantMessages?: Array<{
+        message?: {
+          id?: string
+          usage?: Record
+        }
+      }>
+    }
+    const assistantMessages = parsed.assistantMessages ?? []
+
+    // Running per-counter maxima over all assistant messages.
+    const peak = { input: 0, output: 0, cacheRead: 0, cacheCreation: 0 }
+    let requestId: string | null = null
+
+    for (const { message } of assistantMessages) {
+      if (!message) {
+        continue
+      }
+      if (!requestId) {
+        requestId = typeof message.id === "string" ? message.id : null
+      }
+      const usage = message.usage ?? {}
+      peak.input = Math.max(peak.input, toNumber(usage.input_tokens))
+      peak.output = Math.max(peak.output, toNumber(usage.output_tokens))
+      peak.cacheRead = Math.max(peak.cacheRead, toNumber(usage.cache_read_input_tokens))
+      peak.cacheCreation = Math.max(
+        peak.cacheCreation,
+        toNumber(usage.cache_creation_input_tokens),
+      )
+    }
+
+    // A snapshot that reports no tokens at all carries no usage fact.
+    const hasAnyTokens =
+      peak.input !== 0 ||
+      peak.output !== 0 ||
+      peak.cacheRead !== 0 ||
+      peak.cacheCreation !== 0
+    if (!hasAnyTokens) {
+      return null
+    }
+
+    return {
+      requestId,
+      assistantMessageCount: assistantMessages.length,
+      inputTokens: peak.input,
+      outputTokens: peak.output,
+      cacheReadInputTokens: peak.cacheRead,
+      cacheCreationInputTokens: peak.cacheCreation,
+    }
+  } catch {
+    // Best effort: an unreadable or malformed snapshot yields no usage.
+    return null
+  }
+}
+
+if (!existsSync(duckdbExe)) { + fail(`DuckDB executable not found: ${duckdbExe}`) +} + +mkdirSync(observabilityDir, { recursive: true }) + +const args = parseArgs(process.argv.slice(2)) +const eventsPath = resolveEventsPath(args) +if (!existsSync(eventsPath)) { + fail(`Events file not found: ${eventsPath}`) +} + +const 
eventsFileStat = statSync(eventsPath) +const events = parseConcatenatedEvents(readFileSync(eventsPath, "utf8")) +const querySpans = buildExplicitQuerySpans(events) +const effectiveQueryIds = events.map(event => resolveEffectiveQueryId(event, querySpans)) + +const referencedSnapshots = new Map() +const perEventSnapshotRefs: string[][] = [] + +for (const [index, event] of events.entries()) { + const refs = new Set() + collectSnapshotRefs(event as unknown as JsonValue, refs) + const orderedRefs = [...refs].sort() + perEventSnapshotRefs.push(orderedRefs) + + for (const snapshotRef of orderedRefs) { + const fileName = snapshotRef.split("/").at(-1) ?? snapshotRef + const absolutePath = jsonPathToAbsolute(snapshotRef) + const stat = existsSync(absolutePath) ? statSync(absolutePath) : null + const existing = referencedSnapshots.get(snapshotRef) + if (existing) { + existing.referencedCount += 1 + existing.firstEventTs ||= event.ts_wall + existing.lastEventTs = event.ts_wall + continue + } + referencedSnapshots.set(snapshotRef, { + snapshotRef, + fileName, + relativePath: snapshotRef, + absolutePath, + exists: stat !== null, + sizeBytes: stat?.size ?? null, + sha256: stat ? sha256Hex(absolutePath) : null, + referencedCount: 1, + firstEventTs: event.ts_wall, + lastEventTs: event.ts_wall, + category: snapshotCategory(fileName), + }) + } + + void index +} + +const snapshotFiles = existsSync(snapshotsDir) ? 
readdirSync(snapshotsDir) : [] +for (const fileName of snapshotFiles) { + const snapshotRef = `.observability/snapshots/${fileName}` + if (referencedSnapshots.has(snapshotRef)) { + continue + } + const absolutePath = join(snapshotsDir, fileName) + const stat = statSync(absolutePath) + referencedSnapshots.set(snapshotRef, { + snapshotRef, + fileName, + relativePath: relative(repoRoot, absolutePath).replace(/\\/g, "/"), + absolutePath, + exists: true, + sizeBytes: stat.size, + sha256: sha256Hex(absolutePath), + referencedCount: 0, + firstEventTs: null, + lastEventTs: null, + category: snapshotCategory(fileName), + }) +} + +const subagentCompletedQueryIds = new Set( + events + .filter(event => event.event === "subagent.completed" && event.query_id) + .map(event => event.query_id!) as string[], +) + +const usageFacts: UsageFact[] = [] + +for (const [index, event] of events.entries()) { + if (event.event !== "api.stream.completed") { + continue + } + const responseSnapshotRef = inferString(event.payload, "response_snapshot_ref") + if (!responseSnapshotRef) { + continue + } + const usage = extractResponseUsage(responseSnapshotRef) + if (!usage) { + continue + } + + const effectiveQueryId = effectiveQueryIds[index] ?? event.query_id ?? null + const subagentReason = resolveSubagentReason(event) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + const sourceGroup = normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ) + const isAuthoritative = + agentName === "main_thread" || + !subagentCompletedQueryIds.has(effectiveQueryId ?? "__missing__") + + usageFacts.push({ + usage_fact_id: `response::${responseSnapshotRef}`, + event_date: event.ts_wall.slice(0, 10), + ts_wall: event.ts_wall, + ts_wall_ms: toEpochMs(event.ts_wall), + user_action_id: event.user_action_id ?? null, + query_id: effectiveQueryId, + query_source: event.query_source ?? 
null, + subagent_id: event.subagent_id ?? null, + subagent_reason: subagentReason, + agent_name: agentName, + source_group: sourceGroup, + source_kind: "response_snapshot", + source_ref: responseSnapshotRef, + request_id: usage.requestId, + assistant_message_count: usage.assistantMessageCount, + is_authoritative: isAuthoritative, + input_tokens: usage.inputTokens, + output_tokens: usage.outputTokens, + cache_read_input_tokens: usage.cacheReadInputTokens, + cache_creation_input_tokens: usage.cacheCreationInputTokens, + total_prompt_input_tokens: + usage.inputTokens + + usage.cacheReadInputTokens + + usage.cacheCreationInputTokens, + total_billed_tokens: + usage.inputTokens + + usage.cacheReadInputTokens + + usage.cacheCreationInputTokens + + usage.outputTokens, + }) +} + +for (const [index, event] of events.entries()) { + if (event.event !== "subagent.completed") { + continue + } + const inputTokens = inferNumber(event.payload, "input_tokens") ?? 0 + const outputTokens = inferNumber(event.payload, "output_tokens") ?? 0 + const cacheReadInputTokens = inferNumber(event.payload, "cache_read_input_tokens") ?? 0 + const cacheCreationInputTokens = + inferNumber(event.payload, "cache_creation_input_tokens") ?? 0 + const subagentReason = resolveSubagentReason(event) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + const sourceGroup = normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ) + + if ( + inputTokens === 0 && + outputTokens === 0 && + cacheReadInputTokens === 0 && + cacheCreationInputTokens === 0 + ) { + continue + } + + usageFacts.push({ + usage_fact_id: `subagent_completed::${event.subagent_id ?? index}`, + event_date: event.ts_wall.slice(0, 10), + ts_wall: event.ts_wall, + ts_wall_ms: toEpochMs(event.ts_wall), + user_action_id: event.user_action_id ?? null, + query_id: event.query_id ?? 
effectiveQueryIds[index], + query_source: event.query_source ?? null, + subagent_id: event.subagent_id ?? null, + subagent_reason: subagentReason, + agent_name: agentName, + source_group: sourceGroup, + source_kind: "subagent_completed_payload", + source_ref: `${event.event}:${index + 1}`, + request_id: null, + assistant_message_count: inferNumber(event.payload, "message_count"), + is_authoritative: true, + input_tokens: inputTokens, + output_tokens: outputTokens, + cache_read_input_tokens: cacheReadInputTokens, + cache_creation_input_tokens: cacheCreationInputTokens, + total_prompt_input_tokens: + inputTokens + cacheReadInputTokens + cacheCreationInputTokens, + total_billed_tokens: + inputTokens + + cacheReadInputTokens + + cacheCreationInputTokens + + outputTokens, + }) +} + +const queryRows = new Map>() + +for (const [index, event] of events.entries()) { + const effectiveQueryId = effectiveQueryIds[index] + if (!effectiveQueryId) { + continue + } + const subagentReason = resolveSubagentReason(event) + const subagentTriggerKind = resolveSubagentTriggerKind(event) + const subagentTriggerDetail = resolveSubagentTriggerDetail(event) + const subagentTriggerPayloadJson = compactJson(resolveSubagentTriggerPayload(event)) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + const sourceGroup = normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ) + const tsMs = toEpochMs(event.ts_wall) + if (tsMs === null) { + continue + } + const existing = queryRows.get(effectiveQueryId) ?? { + query_id: effectiveQueryId, + user_action_id: event.user_action_id ?? null, + session_id: event.session_id ?? null, + conversation_id: event.conversation_id ?? null, + query_source: event.query_source ?? null, + subagent_id: event.subagent_id ?? null, + subagent_type: event.subagent_type ?? 
null, + subagent_reason: subagentReason, + subagent_trigger_kind: subagentTriggerKind, + subagent_trigger_detail: subagentTriggerDetail, + subagent_trigger_payload_json: subagentTriggerPayloadJson, + agent_name: agentName, + source_group: sourceGroup, + started_at: event.ts_wall, + started_at_ms: tsMs, + ended_at: event.ts_wall, + ended_at_ms: tsMs, + first_event: event.event, + last_event: event.event, + terminal_reason: null, + stop_reason: null, + turn_ids: new Set(), + tool_call_ids: new Set(), + event_count: 0, + raw_query_started_count: 0, + raw_query_terminated_count: 0, + inferred_query_started_count: 0, + inferred_query_terminated_count: 0, + } + + existing.user_action_id ||= event.user_action_id ?? null + existing.session_id ||= event.session_id ?? null + existing.conversation_id ||= event.conversation_id ?? null + existing.query_source ||= event.query_source ?? null + existing.subagent_id ||= event.subagent_id ?? null + existing.subagent_type ||= event.subagent_type ?? null + existing.subagent_reason ||= subagentReason + existing.subagent_trigger_kind ||= subagentTriggerKind + existing.subagent_trigger_detail ||= subagentTriggerDetail + existing.subagent_trigger_payload_json ||= subagentTriggerPayloadJson + existing.agent_name ||= agentName + existing.source_group ||= sourceGroup + existing.event_count = Number(existing.event_count) + 1 + + if (tsMs < Number(existing.started_at_ms)) { + existing.started_at = event.ts_wall + existing.started_at_ms = tsMs + existing.first_event = event.event + } + if (tsMs >= Number(existing.ended_at_ms)) { + existing.ended_at = event.ts_wall + existing.ended_at_ms = tsMs + existing.last_event = event.event + } + + if (event.turn_id) { + ;(existing.turn_ids as Set).add(event.turn_id) + } + if (event.tool_call_id) { + ;(existing.tool_call_ids as Set).add(event.tool_call_id) + } + + if (event.event === "query.started") { + existing.inferred_query_started_count = Number(existing.inferred_query_started_count) + 1 + if 
(event.query_id === effectiveQueryId) { + existing.raw_query_started_count = Number(existing.raw_query_started_count) + 1 + } + } + if (event.event === "query.terminated") { + existing.inferred_query_terminated_count = + Number(existing.inferred_query_terminated_count) + 1 + existing.terminal_reason = inferString(event.payload, "reason") + if (event.query_id === effectiveQueryId) { + existing.raw_query_terminated_count = Number(existing.raw_query_terminated_count) + 1 + } + } + if (event.event === "api.stream.completed") { + existing.stop_reason = inferString(event.payload, "stop_reason") + } + + queryRows.set(effectiveQueryId, existing) +} + +const turnRows = new Map>() + +for (const [index, event] of events.entries()) { + if (!event.turn_id) { + continue + } + const effectiveQueryId = effectiveQueryIds[index] + if (!effectiveQueryId) { + continue + } + const subagentReason = resolveSubagentReason(event) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + const sourceGroup = normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ) + const turnKey = `${effectiveQueryId}::${event.turn_id}` + const tsMs = toEpochMs(event.ts_wall) + if (tsMs === null) { + continue + } + const existing = turnRows.get(turnKey) ?? { + turn_key: turnKey, + query_id: effectiveQueryId, + turn_id: event.turn_id, + user_action_id: event.user_action_id ?? null, + subagent_id: event.subagent_id ?? null, + query_source: event.query_source ?? null, + subagent_reason: subagentReason, + agent_name: agentName, + source_group: sourceGroup, + loop_iter_start: event.loop_iter ?? null, + loop_iter_end: event.loop_iter ?? 
null, + started_at: event.ts_wall, + started_at_ms: tsMs, + ended_at: event.ts_wall, + ended_at_ms: tsMs, + first_event: event.event, + last_event: event.event, + transition_out: null, + termination_reason: null, + stop_reason: null, + assistant_tool_use_count: 0, + event_count: 0, + tool_call_ids: new Set(), + raw_turn_started_count: 0, + raw_state_before_count: 0, + raw_state_after_count: 0, + inferred_turn_started_count: 0, + inferred_state_before_count: 0, + inferred_state_after_count: 0, + } + + existing.user_action_id ||= event.user_action_id ?? null + existing.subagent_id ||= event.subagent_id ?? null + existing.query_source ||= event.query_source ?? null + existing.subagent_reason ||= subagentReason + existing.agent_name ||= agentName + existing.source_group ||= sourceGroup + + if (event.loop_iter !== null && event.loop_iter !== undefined) { + if ( + existing.loop_iter_start === null || + Number(event.loop_iter) < Number(existing.loop_iter_start) + ) { + existing.loop_iter_start = event.loop_iter + } + if ( + existing.loop_iter_end === null || + Number(event.loop_iter) > Number(existing.loop_iter_end) + ) { + existing.loop_iter_end = event.loop_iter + } + } + + existing.event_count = Number(existing.event_count) + 1 + + if (tsMs < Number(existing.started_at_ms)) { + existing.started_at = event.ts_wall + existing.started_at_ms = tsMs + existing.first_event = event.event + } + if (tsMs >= Number(existing.ended_at_ms)) { + existing.ended_at = event.ts_wall + existing.ended_at_ms = tsMs + existing.last_event = event.event + } + + if (event.tool_call_id) { + ;(existing.tool_call_ids as Set).add(event.tool_call_id) + } + + if (event.event === "turn.started") { + existing.inferred_turn_started_count = Number(existing.inferred_turn_started_count) + 1 + if (event.query_id === effectiveQueryId) { + existing.raw_turn_started_count = Number(existing.raw_turn_started_count) + 1 + } + } + if (event.event === "state.snapshot.before_turn") { + 
existing.inferred_state_before_count = Number(existing.inferred_state_before_count) + 1 + if (event.query_id === effectiveQueryId) { + existing.raw_state_before_count = Number(existing.raw_state_before_count) + 1 + } + } + if (event.event === "state.snapshot.after_turn") { + existing.inferred_state_after_count = Number(existing.inferred_state_after_count) + 1 + if (event.query_id === effectiveQueryId) { + existing.raw_state_after_count = Number(existing.raw_state_after_count) + 1 + } + } + if (event.event === "assistant.tool_use.detected") { + existing.assistant_tool_use_count = Number(existing.assistant_tool_use_count) + 1 + } + if (event.event === "state.transitioned") { + existing.transition_out = inferString(event.payload, "to_transition") + } + if (event.event === "query.terminated") { + existing.termination_reason = inferString(event.payload, "reason") + } + if (event.event === "api.stream.completed") { + existing.stop_reason = inferString(event.payload, "stop_reason") + } + + turnRows.set(turnKey, existing) +} + +const toolRows = new Map>() + +for (const [index, event] of events.entries()) { + if (!event.tool_call_id) { + continue + } + + const existing = toolRows.get(event.tool_call_id) ?? { + tool_call_id: event.tool_call_id, + user_action_id: event.user_action_id ?? null, + query_id: effectiveQueryIds[index] ?? event.query_id ?? null, + turn_id: event.turn_id ?? null, + subagent_id: event.subagent_id ?? null, + tool_name: inferString(event.payload, "tool_name"), + execution_mode: null, + detected_at: null, + detected_at_ms: null, + enqueued_at: null, + enqueued_at_ms: null, + started_at: null, + started_at_ms: null, + completed_at: null, + completed_at_ms: null, + duration_ms: null, + success: null, + failure_reason: null, + event_count: 0, + has_tool_use_detected: false, + has_started: false, + has_completed: false, + has_failed: false, + } + + existing.user_action_id ||= event.user_action_id ?? null + existing.query_id ||= effectiveQueryIds[index] ?? 
event.query_id ?? null + existing.turn_id ||= event.turn_id ?? null + existing.subagent_id ||= event.subagent_id ?? null + existing.tool_name ||= inferString(event.payload, "tool_name") + existing.event_count = Number(existing.event_count) + 1 + + const tsMs = toEpochMs(event.ts_wall) + + if (event.event === "assistant.tool_use.detected") { + existing.detected_at = event.ts_wall + existing.detected_at_ms = tsMs + existing.has_tool_use_detected = true + } + if (event.event === "tool.enqueued") { + existing.enqueued_at = event.ts_wall + existing.enqueued_at_ms = tsMs + } + if (event.event === "tool.execution.started") { + existing.started_at = event.ts_wall + existing.started_at_ms = tsMs + existing.has_started = true + } + if (event.event === "tool.execution.completed") { + existing.completed_at = event.ts_wall + existing.completed_at_ms = tsMs + existing.duration_ms = inferNumber(event.payload, "duration_ms") + existing.success = inferBoolean(event.payload, "success") + existing.has_completed = true + } + if (event.event === "tool.execution.failed") { + existing.completed_at = event.ts_wall + existing.completed_at_ms = tsMs + existing.duration_ms = inferNumber(event.payload, "duration_ms") + existing.success = false + existing.failure_reason = + inferString(event.payload, "error_name") ?? 
inferString(event.payload, "error") + existing.has_failed = true + } + + toolRows.set(event.tool_call_id, existing) +} + +const subagentRows = new Map>() + +for (const event of events) { + if ( + event.event !== "subagent.spawned" && + event.event !== "subagent.completed" && + event.event !== "subagent.message.received" + ) { + continue + } + const key = event.subagent_id + if (!key) { + continue + } + const subagentReason = resolveSubagentReason(event) + const subagentTriggerKind = resolveSubagentTriggerKind(event) + const subagentTriggerDetail = resolveSubagentTriggerDetail(event) + const subagentTriggerPayloadJson = compactJson(resolveSubagentTriggerPayload(event)) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + + const existing = subagentRows.get(key) ?? { + subagent_id: key, + query_id: event.query_id ?? null, + user_action_id: event.user_action_id ?? null, + subagent_type: event.subagent_type ?? null, + subagent_reason: subagentReason, + subagent_trigger_kind: subagentTriggerKind, + subagent_trigger_detail: subagentTriggerDetail, + subagent_trigger_payload_json: subagentTriggerPayloadJson, + query_source: event.query_source ?? null, + agent_name: agentName, + source_group: normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ), + spawned_at: null, + spawned_at_ms: null, + completed_at: null, + completed_at_ms: null, + duration_ms: null, + transcript_enabled: null, + inherited_message_count: null, + prompt_message_count: null, + message_event_count: 0, + has_spawned: false, + has_completed: false, + } + + existing.query_id ||= event.query_id ?? null + existing.user_action_id ||= event.user_action_id ?? null + existing.subagent_type ||= event.subagent_type ?? null + existing.query_source ||= event.query_source ?? 
null + existing.subagent_reason ||= subagentReason + existing.subagent_trigger_kind ||= subagentTriggerKind + existing.subagent_trigger_detail ||= subagentTriggerDetail + existing.subagent_trigger_payload_json ||= subagentTriggerPayloadJson + existing.agent_name ||= agentName + existing.source_group ||= normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + existing.agent_name as string | null, + ) + + if (event.event === "subagent.spawned") { + existing.spawned_at = event.ts_wall + existing.spawned_at_ms = toEpochMs(event.ts_wall) + existing.transcript_enabled = inferBoolean(event.payload, "transcript_enabled") + existing.inherited_message_count = inferNumber(event.payload, "inherited_message_count") + existing.prompt_message_count = inferNumber(event.payload, "prompt_message_count") + existing.has_spawned = true + } + + if (event.event === "subagent.completed") { + existing.completed_at = event.ts_wall + existing.completed_at_ms = toEpochMs(event.ts_wall) + existing.duration_ms = + inferNumber(event.payload, "duration_ms") ?? + (existing.spawned_at_ms !== null && existing.completed_at_ms !== null + ? 
Number(existing.completed_at_ms) - Number(existing.spawned_at_ms) + : null) + existing.has_completed = true + } + + if (event.event === "subagent.message.received") { + existing.message_event_count = Number(existing.message_event_count) + 1 + } + + subagentRows.set(key, existing) +} + +const recoveryRows: Record[] = [] +for (const [index, event] of events.entries()) { + const transition = inferString(event.payload, "to_transition") + const reason = inferString(event.payload, "reason") + const isRecoveryEvent = + event.event.includes("recovery") || + event.event.includes("stop_hooks") || + event.event.includes("error") || + event.event.includes("failed") || + (event.event === "state.transitioned" && transition !== null && transition !== "next_turn") + + if (!isRecoveryEvent) { + continue + } + + recoveryRows.push({ + recovery_key: `${event.event}::${index + 1}`, + event_name: event.event, + user_action_id: event.user_action_id ?? null, + query_id: effectiveQueryIds[index] ?? event.query_id ?? null, + turn_id: event.turn_id ?? null, + subagent_id: event.subagent_id ?? null, + ts_wall: event.ts_wall, + ts_wall_ms: toEpochMs(event.ts_wall), + transition_to: transition, + reason, + payload_json: compactJson(event.payload), + }) +} + +const dailyRollups = new Map>() + +for (const [index, event] of events.entries()) { + const eventDate = event.ts_wall.slice(0, 10) + const existing = dailyRollups.get(eventDate) ?? 
{ + event_date: eventDate, + event_count: 0, + user_action_ids: new Set(), + query_ids: new Set(), + turn_keys: new Set(), + tool_call_ids: new Set(), + subagent_ids: new Set(), + snapshot_refs: new Set(), + latest_event_ts: event.ts_wall, + } + + existing.event_count = Number(existing.event_count) + 1 + if (event.user_action_id) { + ;(existing.user_action_ids as Set).add(event.user_action_id) + } + const effectiveQueryId = effectiveQueryIds[index] + if (effectiveQueryId) { + ;(existing.query_ids as Set).add(effectiveQueryId) + } + if (effectiveQueryId && event.turn_id) { + ;(existing.turn_keys as Set).add(`${effectiveQueryId}::${event.turn_id}`) + } + if (event.tool_call_id) { + ;(existing.tool_call_ids as Set).add(event.tool_call_id) + } + if (event.subagent_id) { + ;(existing.subagent_ids as Set).add(event.subagent_id) + } + for (const snapshotRef of perEventSnapshotRefs[index] ?? []) { + ;(existing.snapshot_refs as Set).add(snapshotRef) + } + existing.latest_event_ts = event.ts_wall + dailyRollups.set(eventDate, existing) +} + +const eventsRawRows = events.map((event, index) => { + const subagentReason = resolveSubagentReason(event) + const subagentTriggerKind = resolveSubagentTriggerKind(event) + const subagentTriggerDetail = resolveSubagentTriggerDetail(event) + const subagentTriggerPayloadJson = compactJson(resolveSubagentTriggerPayload(event)) + const agentName = normalizeAgentName( + event.query_source ?? null, + event.subagent_type ?? null, + subagentReason, + ) + const sourceGroup = normalizeSourceGroup( + event.query_source ?? null, + event.subagent_id ?? null, + agentName, + ) + return { + event_idx: index + 1, + schema_version: event.schema_version ?? null, + event_date: event.ts_wall.slice(0, 10), + ts_wall: event.ts_wall, + ts_wall_ms: toEpochMs(event.ts_wall), + ts_mono_ms: event.ts_mono_ms ?? null, + level: event.level ?? null, + event_name: event.event, + component: event.component ?? null, + session_id: event.session_id ?? 
null, + conversation_id: event.conversation_id ?? null, + user_action_id: event.user_action_id ?? null, + query_id: event.query_id ?? null, + effective_query_id: effectiveQueryIds[index], + turn_id: event.turn_id ?? null, + loop_iter: event.loop_iter ?? null, + parent_turn_id: event.parent_turn_id ?? null, + subagent_id: event.subagent_id ?? null, + subagent_type: event.subagent_type ?? null, + subagent_reason: subagentReason, + subagent_trigger_kind: subagentTriggerKind, + subagent_trigger_detail: subagentTriggerDetail, + subagent_trigger_payload_json: subagentTriggerPayloadJson, + agent_name: agentName, + source_group: sourceGroup, + query_source: event.query_source ?? null, + request_id: event.request_id ?? null, + tool_call_id: event.tool_call_id ?? null, + span_id: event.span_id ?? null, + parent_span_id: event.parent_span_id ?? null, + cwd: event.cwd ?? null, + git_branch: event.git_branch ?? null, + build_version: event.build_version ?? null, + payload_json: compactJson(event.payload), + snapshot_refs_json: compactJson(perEventSnapshotRefs[index] ?? []), + raw_event_json: compactJson(event), + } +}) + +const queryLoopStats = new Map< + string, + { + maxLoopIter: number | null + totalLoopIter: number + loopIterCount: number + } +>() + +for (const row of turnRows.values()) { + const queryId = row.query_id as string + const existing = queryLoopStats.get(queryId) ?? { + maxLoopIter: null, + totalLoopIter: 0, + loopIterCount: 0, + } + const loopIterEnd = + row.loop_iter_end === null || row.loop_iter_end === undefined + ? null + : Number(row.loop_iter_end) + if (loopIterEnd !== null && Number.isFinite(loopIterEnd)) { + existing.maxLoopIter = + existing.maxLoopIter === null + ? 
loopIterEnd
+        : Math.max(existing.maxLoopIter, loopIterEnd)
+    existing.totalLoopIter += loopIterEnd
+    existing.loopIterCount += 1
+  }
+  queryLoopStats.set(queryId, existing)
+}
+
+// One row per aggregated query, shaped to match the columns of the `queries` table.
+const queryInsertRows = Array.from(queryRows.values(), query => {
+  // A lifecycle counter counts as "seen" once it is strictly positive.
+  const seen = (counter: unknown) => Number(counter) > 0
+  const strictComplete =
+    seen(query.raw_query_started_count) && seen(query.raw_query_terminated_count)
+  const inferredComplete =
+    seen(query.inferred_query_started_count) && seen(query.inferred_query_terminated_count)
+
+  // Loop statistics are optional: both derived values are null when no entry
+  // exists for this query, and the average is also null when nothing was counted.
+  const loopStats = queryLoopStats.get(String(query.query_id))
+  const maxLoopIter = loopStats?.maxLoopIter ?? null
+  let avgLoopIter: number | null = null
+  if (loopStats && loopStats.loopIterCount > 0) {
+    // Mean of the accumulated loop-iteration values, rounded to three decimals.
+    avgLoopIter = Math.round((loopStats.totalLoopIter / loopStats.loopIterCount) * 1000) / 1000
+  }
+
+  return {
+    query_id: query.query_id,
+    user_action_id: query.user_action_id,
+    session_id: query.session_id,
+    conversation_id: query.conversation_id,
+    query_source: query.query_source,
+    subagent_id: query.subagent_id,
+    subagent_type: query.subagent_type,
+    subagent_reason: query.subagent_reason,
+    subagent_trigger_kind: query.subagent_trigger_kind,
+    subagent_trigger_detail: query.subagent_trigger_detail,
+    subagent_trigger_payload_json: query.subagent_trigger_payload_json,
+    agent_name: query.agent_name,
+    source_group: query.source_group,
+    started_at: query.started_at,
+    started_at_ms: query.started_at_ms,
+    ended_at: query.ended_at,
+    ended_at_ms: query.ended_at_ms,
+    duration_ms: Number(query.ended_at_ms) - Number(query.started_at_ms),
+    first_event: query.first_event,
+    last_event: query.last_event,
+    terminal_reason: query.terminal_reason,
+    stop_reason: query.stop_reason,
+    turn_count: (query.turn_ids as Set).size,
+    query_max_loop_iter: maxLoopIter,
+    query_avg_loop_iter: avgLoopIter,
+    tool_call_count: (query.tool_call_ids as Set).size,
+    event_count: query.event_count,
+    raw_query_started_count: query.raw_query_started_count,
+    raw_query_terminated_count: query.raw_query_terminated_count,
+    inferred_query_started_count: query.inferred_query_started_count,
+    inferred_query_terminated_count: query.inferred_query_terminated_count,
+    strict_is_complete: strictComplete,
+    inferred_is_complete: inferredComplete,
+  }
+})
+
+// One row per aggregated turn, shaped to match the columns of the `turns` table.
+const turnInsertRows = Array.from(turnRows.values(), turn => {
+  // A turn with no "after" snapshot still counts as closed when it stopped
+  // with "end_turn" and carries a non-null termination reason.
+  const endedAsTerminalTurn =
+    turn.stop_reason === "end_turn" && turn.termination_reason !== null
+
+  // Shared closure rule for the raw and inferred counter families: the turn
+  // must have started and have a "before" snapshot, and then either have an
+  // "after" snapshot or qualify as a terminal turn with exactly zero of them.
+  const isClosed = (started: unknown, before: unknown, after: unknown) => {
+    if (!(Number(started) > 0 && Number(before) > 0)) {
+      return false
+    }
+    const afterCount = Number(after)
+    return afterCount > 0 || (afterCount === 0 && endedAsTerminalTurn)
+  }
+
+  return {
+    turn_key: turn.turn_key,
+    query_id: turn.query_id,
+    turn_id: turn.turn_id,
+    user_action_id: turn.user_action_id,
+    subagent_id: turn.subagent_id,
+    query_source: turn.query_source,
+    subagent_reason: turn.subagent_reason,
+    agent_name: turn.agent_name,
+    source_group: turn.source_group,
+    loop_iter_start: turn.loop_iter_start,
+    loop_iter_end: turn.loop_iter_end,
+    started_at: turn.started_at,
+    started_at_ms: turn.started_at_ms,
+    ended_at: turn.ended_at,
+    ended_at_ms: turn.ended_at_ms,
+    duration_ms: Number(turn.ended_at_ms) - Number(turn.started_at_ms),
+    first_event: turn.first_event,
+    last_event: turn.last_event,
+    transition_out: turn.transition_out,
+    termination_reason: turn.termination_reason,
+    stop_reason: turn.stop_reason,
+    tool_call_count: (turn.tool_call_ids as Set).size,
+    assistant_tool_use_count: turn.assistant_tool_use_count,
+    event_count: turn.event_count,
+    raw_turn_started_count: turn.raw_turn_started_count,
+    raw_state_before_count: turn.raw_state_before_count,
+    raw_state_after_count: turn.raw_state_after_count,
+    inferred_turn_started_count: turn.inferred_turn_started_count,
+    inferred_state_before_count: turn.inferred_state_before_count,
+    inferred_state_after_count: turn.inferred_state_after_count,
+    strict_is_closed: isClosed(
+      turn.raw_turn_started_count,
+      turn.raw_state_before_count,
+      turn.raw_state_after_count,
+    ),
+    inferred_is_closed: isClosed(
+      turn.inferred_turn_started_count,
+      turn.inferred_state_before_count,
+      turn.inferred_state_after_count,
+    ),
+  }
+})
+
+// One row per tool call, shaped to match the columns of the `tools` table.
+const toolInsertRows = Array.from(toolRows.values(), tool => {
+  // A call reached an end state once it either completed or failed.
+  const reachedEndState = Boolean(tool.has_completed) || Boolean(tool.has_failed)
+  return {
+    tool_call_id: tool.tool_call_id,
+    user_action_id: tool.user_action_id,
+    query_id: tool.query_id,
+    turn_id: tool.turn_id,
+    subagent_id: tool.subagent_id,
+    tool_name: tool.tool_name,
+    execution_mode: tool.execution_mode,
+    detected_at: tool.detected_at,
+    detected_at_ms: tool.detected_at_ms,
+    enqueued_at: tool.enqueued_at,
+    enqueued_at_ms: tool.enqueued_at_ms,
+    started_at: tool.started_at,
+    started_at_ms: tool.started_at_ms,
+    completed_at: tool.completed_at,
+    completed_at_ms: tool.completed_at_ms,
+    duration_ms: tool.duration_ms,
+    success: tool.success,
+    failure_reason: tool.failure_reason,
+    event_count: tool.event_count,
+    has_tool_use_detected: tool.has_tool_use_detected,
+    has_started: tool.has_started,
+    has_completed: tool.has_completed,
+    has_failed: tool.has_failed,
+    // Closed = the tool use was detected and the call reached an end state.
+    is_closed: Boolean(tool.has_tool_use_detected) && reachedEndState,
+  }
+})
+
+// One row per subagent, shaped to match the columns of the `subagents` table.
+const subagentInsertRows = Array.from(subagentRows.values(), agent => {
+  return {
+    subagent_id: agent.subagent_id,
+    query_id: agent.query_id,
+    user_action_id: agent.user_action_id,
+    subagent_type: agent.subagent_type,
+    subagent_reason: agent.subagent_reason,
+    subagent_trigger_kind: agent.subagent_trigger_kind,
+    subagent_trigger_detail: agent.subagent_trigger_detail,
+    subagent_trigger_payload_json: agent.subagent_trigger_payload_json,
+    query_source: agent.query_source,
+    agent_name: agent.agent_name,
+    source_group: agent.source_group,
+    spawned_at: agent.spawned_at,
+    spawned_at_ms: agent.spawned_at_ms,
+    completed_at: agent.completed_at,
+    completed_at_ms: agent.completed_at_ms,
+    duration_ms: agent.duration_ms,
+    transcript_enabled: agent.transcript_enabled,
+    inherited_message_count: agent.inherited_message_count,
+    prompt_message_count: agent.prompt_message_count,
+    message_event_count: agent.message_event_count,
+    has_spawned: agent.has_spawned,
+    has_completed: agent.has_completed,
+  }
+})
+
+// Snapshot and usage rows are inserted as collected, without reshaping.
+const snapshotInsertRows = Array.from(referencedSnapshots.values())
+const usageFactRows = usageFacts
+
+// One row per event date; each *_count column is the size of that day's id set.
+const dailyRollupRows = Array.from(dailyRollups.values(), day => {
+  return {
+    event_date: day.event_date,
+    event_count: day.event_count,
+    user_action_count: (day.user_action_ids as Set).size,
+    query_count: (day.query_ids as Set).size,
+    turn_count: (day.turn_keys as Set).size,
+    tool_call_count: (day.tool_call_ids as Set).size,
+    subagent_count: (day.subagent_ids as Set).size,
+    snapshot_ref_count: (day.snapshot_refs as Set).size,
+    latest_event_ts: day.latest_event_ts,
+  }
+})
+
+// Single-row record of the events file this build read and when the build ran.
+const buildMetaRows = [
+  {
+    source_events_file: eventsPath,
+    source_events_file_name: basename(eventsPath),
+    source_events_size_bytes: eventsFileStat.size,
+    // mtimeMs is truncated to a whole number before being stored.
+    source_events_mtime_ms: Math.trunc(eventsFileStat.mtimeMs),
+    built_at: new Date().toISOString(),
+    built_at_ms: Date.now(),
+    events_row_count: eventsRawRows.length,
+  },
+]
+
+const sql = `
+BEGIN TRANSACTION;
+DROP TABLE IF EXISTS build_meta;
+DROP TABLE IF EXISTS events_raw;
+DROP TABLE IF EXISTS queries;
+DROP TABLE IF EXISTS turns;
+DROP TABLE IF EXISTS tools;
+DROP TABLE IF EXISTS subagents;
+DROP TABLE IF EXISTS recoveries;
+DROP TABLE IF EXISTS snapshots_index;
+DROP TABLE IF EXISTS usage_facts;
+DROP TABLE
IF EXISTS daily_rollups; + +CREATE TABLE build_meta ( + source_events_file VARCHAR, + source_events_file_name VARCHAR, + source_events_size_bytes BIGINT, + source_events_mtime_ms BIGINT, + built_at VARCHAR, + built_at_ms BIGINT, + events_row_count BIGINT +); + +CREATE TABLE events_raw ( + event_idx BIGINT, + schema_version VARCHAR, + event_date VARCHAR, + ts_wall VARCHAR, + ts_wall_ms BIGINT, + ts_mono_ms BIGINT, + level VARCHAR, + event_name VARCHAR, + component VARCHAR, + session_id VARCHAR, + conversation_id VARCHAR, + user_action_id VARCHAR, + query_id VARCHAR, + effective_query_id VARCHAR, + turn_id VARCHAR, + loop_iter BIGINT, + parent_turn_id VARCHAR, + subagent_id VARCHAR, + subagent_type VARCHAR, + subagent_reason VARCHAR, + subagent_trigger_kind VARCHAR, + subagent_trigger_detail VARCHAR, + subagent_trigger_payload_json VARCHAR, + agent_name VARCHAR, + source_group VARCHAR, + query_source VARCHAR, + request_id VARCHAR, + tool_call_id VARCHAR, + span_id VARCHAR, + parent_span_id VARCHAR, + cwd VARCHAR, + git_branch VARCHAR, + build_version VARCHAR, + payload_json VARCHAR, + snapshot_refs_json VARCHAR, + raw_event_json VARCHAR +); + +CREATE TABLE queries ( + query_id VARCHAR, + user_action_id VARCHAR, + session_id VARCHAR, + conversation_id VARCHAR, + query_source VARCHAR, + subagent_id VARCHAR, + subagent_type VARCHAR, + subagent_reason VARCHAR, + subagent_trigger_kind VARCHAR, + subagent_trigger_detail VARCHAR, + subagent_trigger_payload_json VARCHAR, + agent_name VARCHAR, + source_group VARCHAR, + started_at VARCHAR, + started_at_ms BIGINT, + ended_at VARCHAR, + ended_at_ms BIGINT, + duration_ms BIGINT, + first_event VARCHAR, + last_event VARCHAR, + terminal_reason VARCHAR, + stop_reason VARCHAR, + turn_count BIGINT, + query_max_loop_iter DOUBLE, + query_avg_loop_iter DOUBLE, + tool_call_count BIGINT, + event_count BIGINT, + raw_query_started_count BIGINT, + raw_query_terminated_count BIGINT, + inferred_query_started_count BIGINT, + 
inferred_query_terminated_count BIGINT, + strict_is_complete BOOLEAN, + inferred_is_complete BOOLEAN +); + +CREATE TABLE turns ( + turn_key VARCHAR, + query_id VARCHAR, + turn_id VARCHAR, + user_action_id VARCHAR, + subagent_id VARCHAR, + query_source VARCHAR, + subagent_reason VARCHAR, + agent_name VARCHAR, + source_group VARCHAR, + loop_iter_start BIGINT, + loop_iter_end BIGINT, + started_at VARCHAR, + started_at_ms BIGINT, + ended_at VARCHAR, + ended_at_ms BIGINT, + duration_ms BIGINT, + first_event VARCHAR, + last_event VARCHAR, + transition_out VARCHAR, + termination_reason VARCHAR, + stop_reason VARCHAR, + tool_call_count BIGINT, + assistant_tool_use_count BIGINT, + event_count BIGINT, + raw_turn_started_count BIGINT, + raw_state_before_count BIGINT, + raw_state_after_count BIGINT, + inferred_turn_started_count BIGINT, + inferred_state_before_count BIGINT, + inferred_state_after_count BIGINT, + strict_is_closed BOOLEAN, + inferred_is_closed BOOLEAN +); + +CREATE TABLE tools ( + tool_call_id VARCHAR, + user_action_id VARCHAR, + query_id VARCHAR, + turn_id VARCHAR, + subagent_id VARCHAR, + tool_name VARCHAR, + execution_mode VARCHAR, + detected_at VARCHAR, + detected_at_ms BIGINT, + enqueued_at VARCHAR, + enqueued_at_ms BIGINT, + started_at VARCHAR, + started_at_ms BIGINT, + completed_at VARCHAR, + completed_at_ms BIGINT, + duration_ms BIGINT, + success BOOLEAN, + failure_reason VARCHAR, + event_count BIGINT, + has_tool_use_detected BOOLEAN, + has_started BOOLEAN, + has_completed BOOLEAN, + has_failed BOOLEAN, + is_closed BOOLEAN +); + +CREATE TABLE subagents ( + subagent_id VARCHAR, + query_id VARCHAR, + user_action_id VARCHAR, + subagent_type VARCHAR, + subagent_reason VARCHAR, + subagent_trigger_kind VARCHAR, + subagent_trigger_detail VARCHAR, + subagent_trigger_payload_json VARCHAR, + query_source VARCHAR, + agent_name VARCHAR, + source_group VARCHAR, + spawned_at VARCHAR, + spawned_at_ms BIGINT, + completed_at VARCHAR, + completed_at_ms BIGINT, + 
duration_ms BIGINT, + transcript_enabled BOOLEAN, + inherited_message_count BIGINT, + prompt_message_count BIGINT, + message_event_count BIGINT, + has_spawned BOOLEAN, + has_completed BOOLEAN +); + +CREATE TABLE recoveries ( + recovery_key VARCHAR, + event_name VARCHAR, + user_action_id VARCHAR, + query_id VARCHAR, + turn_id VARCHAR, + subagent_id VARCHAR, + ts_wall VARCHAR, + ts_wall_ms BIGINT, + transition_to VARCHAR, + reason VARCHAR, + payload_json VARCHAR +); + +CREATE TABLE snapshots_index ( + snapshot_ref VARCHAR, + file_name VARCHAR, + relative_path VARCHAR, + absolute_path VARCHAR, + exists BOOLEAN, + size_bytes BIGINT, + sha256 VARCHAR, + referenced_count BIGINT, + first_event_ts VARCHAR, + last_event_ts VARCHAR, + category VARCHAR +); + +CREATE TABLE usage_facts ( + usage_fact_id VARCHAR, + event_date VARCHAR, + ts_wall VARCHAR, + ts_wall_ms BIGINT, + user_action_id VARCHAR, + query_id VARCHAR, + query_source VARCHAR, + subagent_id VARCHAR, + subagent_reason VARCHAR, + agent_name VARCHAR, + source_group VARCHAR, + source_kind VARCHAR, + source_ref VARCHAR, + request_id VARCHAR, + assistant_message_count BIGINT, + is_authoritative BOOLEAN, + input_tokens BIGINT, + output_tokens BIGINT, + cache_read_input_tokens BIGINT, + cache_creation_input_tokens BIGINT, + total_prompt_input_tokens BIGINT, + total_billed_tokens BIGINT +); + +CREATE TABLE daily_rollups ( + event_date VARCHAR, + event_count BIGINT, + user_action_count BIGINT, + query_count BIGINT, + turn_count BIGINT, + tool_call_count BIGINT, + subagent_count BIGINT, + snapshot_ref_count BIGINT, + latest_event_ts VARCHAR +); + +${createInsertSql("build_meta", [ + "source_events_file", + "source_events_file_name", + "source_events_size_bytes", + "source_events_mtime_ms", + "built_at", + "built_at_ms", + "events_row_count", +], buildMetaRows)} + +${createInsertSql("events_raw", [ + "event_idx", + "schema_version", + "event_date", + "ts_wall", + "ts_wall_ms", + "ts_mono_ms", + "level", + "event_name", + 
"component", + "session_id", + "conversation_id", + "user_action_id", + "query_id", + "effective_query_id", + "turn_id", + "loop_iter", + "parent_turn_id", + "subagent_id", + "subagent_type", + "subagent_reason", + "subagent_trigger_kind", + "subagent_trigger_detail", + "subagent_trigger_payload_json", + "agent_name", + "source_group", + "query_source", + "request_id", + "tool_call_id", + "span_id", + "parent_span_id", + "cwd", + "git_branch", + "build_version", + "payload_json", + "snapshot_refs_json", + "raw_event_json", +], eventsRawRows)} + +${createInsertSql("queries", [ + "query_id", + "user_action_id", + "session_id", + "conversation_id", + "query_source", + "subagent_id", + "subagent_type", + "subagent_reason", + "subagent_trigger_kind", + "subagent_trigger_detail", + "subagent_trigger_payload_json", + "agent_name", + "source_group", + "started_at", + "started_at_ms", + "ended_at", + "ended_at_ms", + "duration_ms", + "first_event", + "last_event", + "terminal_reason", + "stop_reason", + "turn_count", + "query_max_loop_iter", + "query_avg_loop_iter", + "tool_call_count", + "event_count", + "raw_query_started_count", + "raw_query_terminated_count", + "inferred_query_started_count", + "inferred_query_terminated_count", + "strict_is_complete", + "inferred_is_complete", +], queryInsertRows)} + +${createInsertSql("turns", [ + "turn_key", + "query_id", + "turn_id", + "user_action_id", + "subagent_id", + "query_source", + "subagent_reason", + "agent_name", + "source_group", + "loop_iter_start", + "loop_iter_end", + "started_at", + "started_at_ms", + "ended_at", + "ended_at_ms", + "duration_ms", + "first_event", + "last_event", + "transition_out", + "termination_reason", + "stop_reason", + "tool_call_count", + "assistant_tool_use_count", + "event_count", + "raw_turn_started_count", + "raw_state_before_count", + "raw_state_after_count", + "inferred_turn_started_count", + "inferred_state_before_count", + "inferred_state_after_count", + "strict_is_closed", + 
"inferred_is_closed", +], turnInsertRows)} + +${createInsertSql("tools", [ + "tool_call_id", + "user_action_id", + "query_id", + "turn_id", + "subagent_id", + "tool_name", + "execution_mode", + "detected_at", + "detected_at_ms", + "enqueued_at", + "enqueued_at_ms", + "started_at", + "started_at_ms", + "completed_at", + "completed_at_ms", + "duration_ms", + "success", + "failure_reason", + "event_count", + "has_tool_use_detected", + "has_started", + "has_completed", + "has_failed", + "is_closed", +], toolInsertRows)} + +${createInsertSql("subagents", [ + "subagent_id", + "query_id", + "user_action_id", + "subagent_type", + "subagent_reason", + "subagent_trigger_kind", + "subagent_trigger_detail", + "subagent_trigger_payload_json", + "query_source", + "agent_name", + "source_group", + "spawned_at", + "spawned_at_ms", + "completed_at", + "completed_at_ms", + "duration_ms", + "transcript_enabled", + "inherited_message_count", + "prompt_message_count", + "message_event_count", + "has_spawned", + "has_completed", +], subagentInsertRows)} + +${createInsertSql("recoveries", [ + "recovery_key", + "event_name", + "user_action_id", + "query_id", + "turn_id", + "subagent_id", + "ts_wall", + "ts_wall_ms", + "transition_to", + "reason", + "payload_json", +], recoveryRows)} + +${createInsertSql("snapshots_index", [ + "snapshot_ref", + "file_name", + "relative_path", + "absolute_path", + "exists", + "size_bytes", + "sha256", + "referenced_count", + "first_event_ts", + "last_event_ts", + "category", +], snapshotInsertRows)} + +${createInsertSql("usage_facts", [ + "usage_fact_id", + "event_date", + "ts_wall", + "ts_wall_ms", + "user_action_id", + "query_id", + "query_source", + "subagent_id", + "subagent_reason", + "agent_name", + "source_group", + "source_kind", + "source_ref", + "request_id", + "assistant_message_count", + "is_authoritative", + "input_tokens", + "output_tokens", + "cache_read_input_tokens", + "cache_creation_input_tokens", + "total_prompt_input_tokens", + 
"total_billed_tokens", +], usageFactRows)} + +${createInsertSql("daily_rollups", [ + "event_date", + "event_count", + "user_action_count", + "query_count", + "turn_count", + "tool_call_count", + "subagent_count", + "snapshot_ref_count", + "latest_event_ts", +], dailyRollupRows)} + +CREATE OR REPLACE VIEW user_actions AS +WITH usage_authoritative AS ( + SELECT + event_date, + user_action_id, + SUM(input_tokens) AS raw_input_tokens, + SUM(output_tokens) AS output_tokens, + SUM(cache_read_input_tokens) AS cache_read_tokens, + SUM(cache_creation_input_tokens) AS cache_create_tokens, + SUM(total_prompt_input_tokens) AS total_prompt_input_tokens, + SUM(total_billed_tokens) AS total_billed_tokens, + SUM(CASE WHEN agent_name = 'main_thread' THEN total_prompt_input_tokens ELSE 0 END) AS main_thread_total_prompt_input_tokens, + SUM(CASE WHEN agent_name <> 'main_thread' THEN total_prompt_input_tokens ELSE 0 END) AS subagent_total_prompt_input_tokens + FROM usage_facts + WHERE is_authoritative AND user_action_id IS NOT NULL + GROUP BY 1, 2 +), +event_agg AS ( + SELECT + event_date, + user_action_id, + MIN(ts_wall) AS started_at, + MIN(ts_wall_ms) AS started_at_ms, + MAX(ts_wall) AS ended_at, + MAX(ts_wall_ms) AS ended_at_ms, + MAX(ts_wall_ms) - MIN(ts_wall_ms) AS duration_ms, + COUNT(*) AS event_count, + COUNT(DISTINCT effective_query_id) FILTER (WHERE effective_query_id IS NOT NULL) AS query_count, + COUNT(DISTINCT effective_query_id) FILTER (WHERE effective_query_id IS NOT NULL AND agent_name = 'main_thread') AS main_thread_query_count, + COUNT(DISTINCT effective_query_id) FILTER (WHERE effective_query_id IS NOT NULL AND agent_name <> 'main_thread') AS subagent_query_count, + COUNT(DISTINCT subagent_id) FILTER (WHERE subagent_id IS NOT NULL) AS subagent_count, + COUNT(DISTINCT tool_call_id) FILTER (WHERE tool_call_id IS NOT NULL) AS tool_call_count + FROM events_raw + WHERE user_action_id IS NOT NULL + GROUP BY 1, 2 +) +SELECT + e.event_date, + e.user_action_id, + 
e.started_at, + e.started_at_ms, + e.ended_at, + e.ended_at_ms, + e.duration_ms, + e.event_count, + e.query_count, + e.main_thread_query_count, + e.subagent_query_count, + e.subagent_count, + e.tool_call_count, + COALESCE(u.raw_input_tokens, 0) AS raw_input_tokens, + COALESCE(u.output_tokens, 0) AS output_tokens, + COALESCE(u.cache_read_tokens, 0) AS cache_read_tokens, + COALESCE(u.cache_create_tokens, 0) AS cache_create_tokens, + COALESCE(u.total_prompt_input_tokens, 0) AS total_prompt_input_tokens, + COALESCE(u.total_billed_tokens, 0) AS total_billed_tokens, + COALESCE(u.main_thread_total_prompt_input_tokens, 0) AS main_thread_total_prompt_input_tokens, + COALESCE(u.subagent_total_prompt_input_tokens, 0) AS subagent_total_prompt_input_tokens +FROM event_agg e +LEFT JOIN usage_authoritative u + ON u.event_date = e.event_date + AND u.user_action_id = e.user_action_id; + +CREATE OR REPLACE VIEW query_source_cost_share AS +WITH per_source AS ( + SELECT + event_date, + user_action_id, + query_source, + SUM(input_tokens) AS raw_input_tokens, + SUM(output_tokens) AS output_tokens, + SUM(cache_read_input_tokens) AS cache_read_tokens, + SUM(cache_creation_input_tokens) AS cache_create_tokens, + SUM(total_prompt_input_tokens) AS total_prompt_input_tokens, + SUM(total_billed_tokens) AS total_billed_tokens + FROM usage_facts + WHERE is_authoritative AND user_action_id IS NOT NULL + GROUP BY 1, 2, 3 +), +per_action AS ( + SELECT + event_date, + user_action_id, + SUM(total_billed_tokens) AS action_total_billed_tokens + FROM per_source + GROUP BY 1, 2 +) +SELECT + s.event_date, + s.user_action_id, + s.query_source, + s.raw_input_tokens, + s.output_tokens, + s.cache_read_tokens, + s.cache_create_tokens, + s.total_prompt_input_tokens, + s.total_billed_tokens, + CASE + WHEN a.action_total_billed_tokens = 0 THEN NULL + ELSE ROUND(s.total_billed_tokens * 1.0 / a.action_total_billed_tokens, 6) + END AS cost_share +FROM per_source s +LEFT JOIN per_action a + ON a.event_date = 
s.event_date + AND a.user_action_id = s.user_action_id; + +CREATE OR REPLACE VIEW query_source_cost_share_daily AS +WITH per_day AS ( + SELECT + event_date, + query_source, + SUM(raw_input_tokens) AS raw_input_tokens, + SUM(output_tokens) AS output_tokens, + SUM(cache_read_tokens) AS cache_read_tokens, + SUM(cache_create_tokens) AS cache_create_tokens, + SUM(total_prompt_input_tokens) AS total_prompt_input_tokens, + SUM(total_billed_tokens) AS total_billed_tokens + FROM query_source_cost_share + GROUP BY 1, 2 +), +day_total AS ( + SELECT + event_date, + SUM(total_billed_tokens) AS day_total_billed_tokens + FROM per_day + GROUP BY 1 +) +SELECT + p.event_date, + p.query_source, + p.raw_input_tokens, + p.output_tokens, + p.cache_read_tokens, + p.cache_create_tokens, + p.total_prompt_input_tokens, + p.total_billed_tokens, + CASE + WHEN d.day_total_billed_tokens = 0 THEN NULL + ELSE ROUND(p.total_billed_tokens * 1.0 / d.day_total_billed_tokens, 6) + END AS daily_cost_share +FROM per_day p +LEFT JOIN day_total d + ON d.event_date = p.event_date; + +CREATE OR REPLACE VIEW agent_cost_daily AS +WITH per_agent AS ( + SELECT + event_date, + COALESCE(agent_name, 'unknown') AS agent_name, + COALESCE(source_group, 'unknown') AS source_group, + SUM(input_tokens) AS agent_total_raw_input_tokens, + SUM(output_tokens) AS agent_total_output_tokens, + SUM(cache_read_input_tokens) AS agent_total_cache_read_tokens, + SUM(cache_creation_input_tokens) AS agent_total_cache_create_tokens, + SUM(total_prompt_input_tokens) AS agent_total_prompt_input_tokens, + SUM(total_billed_tokens) AS agent_total_billed_tokens + FROM usage_facts + WHERE is_authoritative + GROUP BY 1, 2, 3 +), +per_day AS ( + SELECT + event_date, + SUM(agent_total_billed_tokens) AS day_total_billed_tokens + FROM per_agent + GROUP BY 1 +), +query_stats AS ( + SELECT + SUBSTR(started_at, 1, 10) AS event_date, + COALESCE(agent_name, 'unknown') AS agent_name, + COUNT(*) AS agent_query_count, + SUM(turn_count) AS 
agent_turn_count, + ROUND(AVG(turn_count), 3) AS agent_avg_turns_per_query, + ROUND(AVG(query_max_loop_iter), 3) AS agent_avg_loop_iter_end, + ROUND(percentile_cont(0.95) WITHIN GROUP (ORDER BY query_max_loop_iter), 3) AS agent_p95_loop_iter_end, + ROUND(AVG(CASE WHEN COALESCE(query_max_loop_iter, 0) > 1 THEN 1.0 ELSE 0.0 END), 6) AS agent_queries_with_loop_iter_gt_1_rate + FROM queries + GROUP BY 1, 2 +) +SELECT + p.event_date, + p.agent_name, + p.source_group, + p.agent_total_raw_input_tokens, + p.agent_total_output_tokens, + p.agent_total_cache_read_tokens, + p.agent_total_cache_create_tokens, + p.agent_total_prompt_input_tokens, + p.agent_total_billed_tokens, + CASE + WHEN d.day_total_billed_tokens = 0 THEN NULL + ELSE ROUND(p.agent_total_billed_tokens * 1.0 / d.day_total_billed_tokens, 6) + END AS agent_cost_share, + COALESCE(qs.agent_query_count, 0) AS agent_query_count, + COALESCE(qs.agent_turn_count, 0) AS agent_turn_count, + qs.agent_avg_turns_per_query, + qs.agent_avg_loop_iter_end, + qs.agent_p95_loop_iter_end, + qs.agent_queries_with_loop_iter_gt_1_rate +FROM per_agent p +LEFT JOIN per_day d ON d.event_date = p.event_date +LEFT JOIN query_stats qs + ON qs.event_date = p.event_date + AND qs.agent_name = p.agent_name; + +CREATE OR REPLACE VIEW subagent_reason_daily AS +SELECT + SUBSTR(COALESCE(spawned_at, completed_at), 1, 10) AS event_date, + COALESCE(subagent_reason, 'unknown') AS subagent_reason, + COALESCE(agent_name, 'unknown') AS agent_name, + COUNT(*) AS subagent_count, + ROUND(AVG(duration_ms), 3) AS avg_duration_ms, + ROUND(AVG(prompt_message_count), 3) AS avg_prompt_message_count, + ROUND(AVG(message_event_count), 3) AS avg_message_event_count +FROM subagents +GROUP BY 1, 2, 3; + +CREATE OR REPLACE VIEW metrics_integrity_daily AS +WITH user_action_coverage AS ( + SELECT + event_date, + ROUND(AVG(CASE WHEN main_thread_query_count > 0 THEN 1.0 ELSE 0.0 END), 6) AS user_action_main_query_coverage_rate + FROM user_actions + GROUP BY 1 +) +SELECT + 
r.event_date, + COALESCE(u.user_action_main_query_coverage_rate, 0) AS user_action_main_query_coverage_rate, + ROUND((SELECT AVG(CASE WHEN strict_is_complete THEN 1.0 ELSE 0.0 END) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date), 6) AS strict_query_completion_rate, + ROUND((SELECT AVG(CASE WHEN inferred_is_complete THEN 1.0 ELSE 0.0 END) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date), 6) AS inferred_query_completion_rate, + ROUND( + COALESCE((SELECT AVG(CASE WHEN inferred_is_complete THEN 1.0 ELSE 0.0 END) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date), 0) + - + COALESCE((SELECT AVG(CASE WHEN strict_is_complete THEN 1.0 ELSE 0.0 END) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date), 0), + 6 + ) AS query_completeness_gap, + ROUND((SELECT AVG(CASE WHEN strict_is_closed THEN 1.0 ELSE 0.0 END) FROM turns t WHERE SUBSTR(t.started_at, 1, 10) = r.event_date), 6) AS strict_turn_state_closure_rate, + ROUND((SELECT AVG(CASE WHEN inferred_is_closed THEN 1.0 ELSE 0.0 END) FROM turns t WHERE SUBSTR(t.started_at, 1, 10) = r.event_date), 6) AS inferred_turn_state_closure_rate, + ROUND( + COALESCE((SELECT AVG(CASE WHEN inferred_is_closed THEN 1.0 ELSE 0.0 END) FROM turns t WHERE SUBSTR(t.started_at, 1, 10) = r.event_date), 0) + - + COALESCE((SELECT AVG(CASE WHEN strict_is_closed THEN 1.0 ELSE 0.0 END) FROM turns t WHERE SUBSTR(t.started_at, 1, 10) = r.event_date), 0), + 6 + ) AS turn_closure_gap, + ROUND((SELECT AVG(CASE WHEN is_closed THEN 1.0 ELSE 0.0 END) FROM tools t WHERE COALESCE(t.detected_at, t.started_at, t.completed_at, '') LIKE r.event_date || '%'), 6) AS tool_lifecycle_closure_rate, + ROUND((SELECT AVG(CASE WHEN has_spawned AND has_completed THEN 1.0 ELSE 0.0 END) FROM subagents s WHERE COALESCE(s.spawned_at, s.completed_at, '') LIKE r.event_date || '%'), 6) AS subagent_lifecycle_closure_rate, + CASE + WHEN (SELECT COUNT(*) FROM snapshots_index si WHERE COALESCE(si.first_event_ts, '') LIKE 
r.event_date || '%' AND si.referenced_count > 0) = 0 THEN 0 + ELSE ROUND( + (SELECT COUNT(*) FROM snapshots_index si WHERE COALESCE(si.first_event_ts, '') LIKE r.event_date || '%' AND si.referenced_count > 0 AND NOT si.exists) * 1.0 + / + (SELECT COUNT(*) FROM snapshots_index si WHERE COALESCE(si.first_event_ts, '') LIKE r.event_date || '%' AND si.referenced_count > 0), + 6 + ) + END AS snapshot_missing_rate, + ROUND(AVG(CASE WHEN er.user_action_id IS NULL AND er.effective_query_id IS NULL AND er.turn_id IS NULL AND er.tool_call_id IS NULL AND er.subagent_id IS NULL THEN 1.0 ELSE 0.0 END), 6) AS orphan_event_rate +FROM daily_rollups r +LEFT JOIN events_raw er ON er.event_date = r.event_date +LEFT JOIN user_action_coverage u ON u.event_date = r.event_date +GROUP BY 1, u.user_action_main_query_coverage_rate; + +CREATE OR REPLACE VIEW metrics_cost_daily AS +WITH completed_queries AS ( + SELECT + SUBSTR(started_at, 1, 10) AS event_date, + COUNT(*) FILTER (WHERE inferred_is_complete AND terminal_reason = 'completed') AS successful_completed_query_count + FROM queries + GROUP BY 1 +), +query_costs AS ( + SELECT + event_date, + query_id, + SUM(total_prompt_input_tokens) AS query_total_prompt_input_tokens, + SUM(total_billed_tokens) AS query_total_billed_tokens + FROM usage_facts + WHERE is_authoritative AND query_id IS NOT NULL + GROUP BY 1, 2 +) +SELECT + ua.event_date, + SUM(ua.raw_input_tokens) AS user_action_total_raw_input_tokens, + SUM(ua.output_tokens) AS user_action_total_output_tokens, + SUM(ua.cache_read_tokens) AS user_action_total_cache_read_tokens, + SUM(ua.cache_create_tokens) AS user_action_total_cache_create_tokens, + SUM(ua.total_prompt_input_tokens) AS user_action_total_prompt_input_tokens, + SUM(ua.total_billed_tokens) AS user_action_total_billed_tokens, + SUM(ua.main_thread_total_prompt_input_tokens) AS main_thread_total_prompt_input_tokens, + SUM(ua.subagent_total_prompt_input_tokens) AS subagent_total_prompt_input_tokens, + 
ROUND(AVG(ua.total_prompt_input_tokens), 3) AS avg_total_prompt_input_tokens_per_user_action, + ROUND(AVG(ua.total_billed_tokens), 3) AS avg_total_billed_tokens_per_user_action, + ROUND((SELECT AVG(query_total_prompt_input_tokens) FROM query_costs qc WHERE qc.event_date = ua.event_date), 3) AS avg_total_prompt_input_tokens_per_query, + ROUND((SELECT AVG(query_total_billed_tokens) FROM query_costs qc WHERE qc.event_date = ua.event_date), 3) AS avg_total_billed_tokens_per_query, + CASE + WHEN SUM(ua.main_thread_total_prompt_input_tokens) = 0 THEN NULL + ELSE ROUND(SUM(ua.subagent_total_prompt_input_tokens) * 1.0 / SUM(ua.main_thread_total_prompt_input_tokens), 6) + END AS subagent_amplification_ratio, + CASE + WHEN COALESCE(MAX(c.successful_completed_query_count), 0) = 0 THEN NULL + ELSE ROUND(SUM(ua.total_billed_tokens) * 1.0 / MAX(c.successful_completed_query_count), 6) + END AS cost_per_successful_completed_query +FROM user_actions ua +LEFT JOIN completed_queries c ON c.event_date = ua.event_date +GROUP BY 1; + +CREATE OR REPLACE VIEW metrics_loop_daily AS +SELECT + SUBSTR(started_at, 1, 10) AS event_date, + ROUND(AVG(turn_count), 3) AS daily_avg_turns_per_query, + ROUND(AVG(query_max_loop_iter), 3) AS daily_avg_loop_iter_end, + ROUND(percentile_cont(0.95) WITHIN GROUP (ORDER BY query_max_loop_iter), 3) AS daily_p95_loop_iter_end, + ROUND(AVG(CASE WHEN COALESCE(query_max_loop_iter, 0) > 1 THEN 1.0 ELSE 0.0 END), 6) AS daily_queries_with_loop_iter_gt_1_rate +FROM queries +GROUP BY 1; + +CREATE OR REPLACE VIEW metrics_latency_daily AS +WITH turn_latencies AS ( + SELECT + event_date, + query_id, + turn_id, + MAX(CASE WHEN event_name = 'turn.started' THEN ts_wall_ms END) AS turn_started_ms, + MAX(CASE WHEN event_name = 'state.snapshot.before_turn' THEN ts_wall_ms END) AS before_turn_ms, + MAX(CASE WHEN event_name = 'prompt.build.started' THEN ts_wall_ms END) AS prompt_build_started_ms, + MAX(CASE WHEN event_name = 'prompt.build.completed' THEN ts_wall_ms END) AS 
prompt_build_completed_ms, + MAX(CASE WHEN event_name = 'api.request.started' THEN ts_wall_ms END) AS api_request_started_ms, + MIN(CASE WHEN event_name = 'api.stream.first_chunk' THEN ts_wall_ms END) AS api_first_chunk_ms, + MAX(CASE WHEN event_name = 'api.stream.completed' THEN ts_wall_ms END) AS api_completed_ms + FROM events_raw + WHERE effective_query_id IS NOT NULL AND turn_id IS NOT NULL + GROUP BY 1, 2, 3 +), +action_first_chunk AS ( + SELECT + event_date, + user_action_id, + MIN(ts_wall_ms) AS action_started_ms, + MIN(CASE WHEN event_name = 'api.stream.first_chunk' AND agent_name = 'main_thread' THEN ts_wall_ms END) AS main_first_chunk_ms + FROM events_raw + WHERE user_action_id IS NOT NULL + GROUP BY 1, 2 +), +stop_hook_durations AS ( + SELECT + event_date, + AVG(COALESCE(TRY_CAST(json_extract(payload_json, '$.duration_ms') AS DOUBLE), 0)) AS stop_hook_duration_ms + FROM events_raw + WHERE event_name = 'stop_hooks.completed' + GROUP BY 1 +) +SELECT + tl.event_date, + ROUND((SELECT AVG(main_first_chunk_ms - action_started_ms) FROM action_first_chunk afc WHERE afc.event_date = tl.event_date AND afc.main_first_chunk_ms IS NOT NULL), 3) AS submit_to_first_chunk_ms, + ROUND(AVG(CASE WHEN tl.before_turn_ms IS NOT NULL AND tl.prompt_build_started_ms IS NOT NULL THEN tl.prompt_build_started_ms - tl.before_turn_ms END), 3) AS preprocess_duration_ms, + ROUND(AVG(CASE WHEN tl.prompt_build_started_ms IS NOT NULL AND tl.prompt_build_completed_ms IS NOT NULL THEN tl.prompt_build_completed_ms - tl.prompt_build_started_ms END), 3) AS prompt_build_duration_ms, + ROUND(AVG(CASE WHEN tl.api_request_started_ms IS NOT NULL AND tl.api_first_chunk_ms IS NOT NULL THEN tl.api_first_chunk_ms - tl.api_request_started_ms END), 3) AS api_first_chunk_latency_ms, + ROUND(AVG(CASE WHEN tl.api_request_started_ms IS NOT NULL AND tl.api_completed_ms IS NOT NULL THEN tl.api_completed_ms - tl.api_request_started_ms END), 3) AS api_total_duration_ms, + ROUND((SELECT AVG(duration_ms) FROM 
tools t WHERE COALESCE(t.completed_at, t.started_at, t.enqueued_at, '') LIKE tl.event_date || '%'), 3) AS tool_execution_duration_ms, + ROUND((SELECT AVG(duration_ms) FROM subagents s WHERE COALESCE(s.completed_at, s.spawned_at, '') LIKE tl.event_date || '%'), 3) AS subagent_duration_ms, + ROUND((SELECT AVG(duration_ms) FROM user_actions ua WHERE ua.event_date = tl.event_date), 3) AS user_action_e2e_duration_ms, + ROUND(COALESCE(MAX(sd.stop_hook_duration_ms), 0), 3) AS stop_hook_duration_ms +FROM turn_latencies tl +LEFT JOIN stop_hook_durations sd ON sd.event_date = tl.event_date +GROUP BY 1; + +CREATE OR REPLACE VIEW metrics_compression_daily AS +WITH per_event AS ( + SELECT + event_date, + event_name, + COALESCE(TRY_CAST(json_extract(payload_json, '$.tokens_saved') AS BIGINT), 0) AS tokens_saved, + COALESCE(TRY_CAST(json_extract(payload_json, '$.estimated_tokens_before') AS BIGINT), 0) AS estimated_tokens_before, + COALESCE(TRY_CAST(json_extract(payload_json, '$.estimated_tokens_after') AS BIGINT), 0) AS estimated_tokens_after, + COALESCE(TRY_CAST(json_extract(payload_json, '$.compacted') AS BOOLEAN), FALSE) AS compacted + FROM events_raw + WHERE event_name LIKE 'messages.%' +), +preprocess AS ( + SELECT + event_date, + SUM(CASE WHEN event_name = 'messages.preprocess.completed' THEN estimated_tokens_before ELSE 0 END) AS preprocess_tokens_before_total, + SUM(CASE WHEN event_name = 'messages.preprocess.completed' THEN estimated_tokens_after ELSE 0 END) AS preprocess_tokens_after_total + FROM per_event + GROUP BY 1 +) +SELECT + p.event_date, + p.preprocess_tokens_before_total, + p.preprocess_tokens_after_total, + p.preprocess_tokens_before_total - p.preprocess_tokens_after_total AS tokens_saved_total, + CASE + WHEN p.preprocess_tokens_before_total = 0 THEN 0 + ELSE ROUND((p.preprocess_tokens_before_total - p.preprocess_tokens_after_total) * 1.0 / p.preprocess_tokens_before_total, 6) + END AS compression_gain_ratio, + SUM(CASE WHEN e.event_name = 
'messages.tool_result_budget.applied' THEN e.tokens_saved ELSE 0 END) AS tool_result_budget_saved_tokens, + SUM(CASE WHEN e.event_name = 'messages.history_snip.applied' THEN e.tokens_saved ELSE 0 END) AS history_snip_saved_tokens, + SUM(CASE WHEN e.event_name = 'messages.microcompact.applied' THEN e.tokens_saved ELSE 0 END) AS microcompact_saved_tokens, + SUM(CASE WHEN e.event_name = 'messages.autoconpact.completed' THEN e.estimated_tokens_before - e.estimated_tokens_after ELSE 0 END) AS autocompact_saved_tokens, + ROUND(AVG(CASE WHEN e.event_name = 'messages.autoconpact.completed' AND e.compacted THEN 1.0 ELSE 0.0 END), 6) AS autocompact_trigger_rate, + CASE WHEN SUM(CASE WHEN e.event_name = 'messages.history_snip.applied' THEN 1 ELSE 0 END) > 0 THEN 1.0 ELSE 0.0 END AS history_snip_gate_on_rate, + 0.0 AS contextCollapse_enabled_gauge, + 0 AS contextCollapse_attempted, + 0 AS contextCollapse_committed +FROM preprocess p +LEFT JOIN per_event e ON e.event_date = p.event_date +GROUP BY 1, 2, 3; + +CREATE OR REPLACE VIEW tool_calls_by_name AS +SELECT + COALESCE(tool_name, 'unknown') AS tool_name, + COUNT(*) AS tool_calls, + ROUND(AVG(CASE WHEN success = TRUE THEN 1.0 ELSE 0.0 END), 6) AS tool_success_rate, + ROUND(AVG(CASE WHEN success = FALSE THEN 1.0 ELSE 0.0 END), 6) AS tool_failure_rate, + ROUND(AVG(duration_ms), 3) AS tool_avg_duration_ms, + ROUND(percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms), 3) AS tool_p95_duration_ms +FROM tools +GROUP BY 1; + +CREATE OR REPLACE VIEW tool_calls_by_mode AS +SELECT + COALESCE(json_extract_string(payload_json, '$.mode'), 'unknown') AS tool_mode, + COUNT(*) AS tool_calls +FROM events_raw +WHERE event_name = 'tool.execution.mode.selected' +GROUP BY 1; + +CREATE OR REPLACE VIEW metrics_tools_daily AS +WITH daily_tools AS ( + SELECT + SUBSTR(COALESCE(completed_at, started_at, enqueued_at, detected_at), 1, 10) AS event_date, + COUNT(*) AS tool_calls_total, + ROUND(AVG(CASE WHEN success = TRUE THEN 1.0 ELSE 0.0 END), 6) AS 
tool_success_rate, + ROUND(AVG(CASE WHEN success = FALSE THEN 1.0 ELSE 0.0 END), 6) AS tool_failure_rate, + ROUND(AVG(duration_ms), 3) AS tool_avg_duration_ms, + ROUND(percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms), 3) AS tool_p95_duration_ms + FROM tools + GROUP BY 1 +) +SELECT + r.event_date, + COALESCE(dt.tool_calls_total, 0) AS tool_calls_total, + COALESCE(dt.tool_success_rate, 0) AS tool_success_rate, + COALESCE(dt.tool_failure_rate, 0) AS tool_failure_rate, + COALESCE(dt.tool_avg_duration_ms, 0) AS tool_avg_duration_ms, + COALESCE(dt.tool_p95_duration_ms, 0) AS tool_p95_duration_ms, + ROUND(( + SELECT AVG(CASE WHEN event_name = 'tool.context.updated' THEN 1.0 ELSE 0.0 END) + FROM events_raw er + WHERE er.event_date = r.event_date AND er.event_name IN ('tool.context.updated', 'turn.started') + ), 6) AS context_update_rate, + ROUND((SELECT AVG(tool_call_count) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date), 6) AS tools_per_query, + ROUND((SELECT AVG(tool_call_count) FROM queries q WHERE SUBSTR(q.started_at, 1, 10) = r.event_date AND q.subagent_id IS NOT NULL), 6) AS tools_per_subagent, + ROUND((SELECT AVG(CASE WHEN assistant_tool_use_count > 0 THEN CASE WHEN transition_out = 'next_turn' THEN 1.0 ELSE 0.0 END END) FROM turns t WHERE SUBSTR(t.started_at, 1, 10) = r.event_date), 6) AS tool_followup_turn_ratio +FROM daily_rollups r +LEFT JOIN daily_tools dt ON dt.event_date = r.event_date; + +CREATE OR REPLACE VIEW terminal_reason_distribution AS +SELECT + SUBSTR(started_at, 1, 10) AS event_date, + COALESCE(terminal_reason, 'unknown') AS terminal_reason, + COUNT(*) AS query_count +FROM queries +GROUP BY 1, 2; + +CREATE OR REPLACE VIEW metrics_recovery_daily AS +WITH query_failures AS ( + SELECT + SUBSTR(started_at, 1, 10) AS event_date, + COUNT(*) FILTER (WHERE terminal_reason = 'completed') AS completed_queries, + COUNT(*) FILTER (WHERE terminal_reason IS NOT NULL AND terminal_reason <> 'completed') AS failed_queries + FROM queries + 
GROUP BY 1 +), +tool_failure_queries AS ( + SELECT + SUBSTR(q.started_at, 1, 10) AS event_date, + COUNT(DISTINCT t.query_id) AS queries_with_failed_tools, + COUNT(DISTINCT CASE WHEN q.terminal_reason IS NOT NULL AND q.terminal_reason <> 'completed' THEN t.query_id END) AS failed_tool_terminal_queries + FROM tools t + LEFT JOIN queries q ON q.query_id = t.query_id + WHERE t.has_failed + GROUP BY 1 +) +SELECT + r.event_date, + SUM(CASE WHEN rec.event_name LIKE '%prompt_too_long%' THEN 1 ELSE 0 END) AS prompt_too_long_recovery_attempts, + CASE + WHEN SUM(CASE WHEN rec.event_name LIKE '%prompt_too_long%' THEN 1 ELSE 0 END) = 0 THEN NULL + ELSE ROUND(AVG(CASE WHEN rec.event_name LIKE '%prompt_too_long%' AND rec.reason = 'completed' THEN 1.0 ELSE 0.0 END), 6) + END AS prompt_too_long_recovery_success_rate, + SUM(CASE WHEN rec.event_name LIKE '%max_output_tokens%' THEN 1 ELSE 0 END) AS max_output_tokens_recovery_attempts, + CASE + WHEN SUM(CASE WHEN rec.event_name LIKE '%max_output_tokens%' THEN 1 ELSE 0 END) = 0 THEN NULL + ELSE ROUND(AVG(CASE WHEN rec.event_name LIKE '%max_output_tokens%' AND rec.reason = 'completed' THEN 1.0 ELSE 0.0 END), 6) + END AS max_output_tokens_recovery_success_rate, + ROUND(AVG(CASE WHEN er.event_name = 'token_budget.decision' AND json_extract_string(er.payload_json, '$.action') = 'continue' THEN 1.0 ELSE 0.0 END), 6) AS token_budget_continue_rate, + ROUND(AVG(CASE WHEN er.event_name = 'stop_hooks.completed' AND COALESCE(TRY_CAST(json_extract(er.payload_json, '$.prevent_continuation') AS BOOLEAN), FALSE) THEN 1.0 ELSE 0.0 END), 6) AS stop_hook_block_rate, + CASE + WHEN COALESCE(MAX(qf.completed_queries), 0) + COALESCE(MAX(qf.failed_queries), 0) = 0 THEN 0 + ELSE ROUND(COALESCE(MAX(qf.failed_queries), 0) * 1.0 / (COALESCE(MAX(qf.completed_queries), 0) + COALESCE(MAX(qf.failed_queries), 0)), 6) + END AS api_error_rate, + CASE + WHEN COALESCE(MAX(tfq.queries_with_failed_tools), 0) = 0 THEN NULL + ELSE 
ROUND(COALESCE(MAX(tfq.failed_tool_terminal_queries), 0) * 1.0 / MAX(tfq.queries_with_failed_tools), 6) + END AS tool_failure_terminal_rate, + ROUND(AVG(CASE WHEN er.event_name = 'exporter.failure' THEN 1.0 ELSE 0.0 END), 6) AS exporter_failure_rate, + ROUND(AVG(CASE WHEN er.event_name = 'dropped_event' THEN 1.0 ELSE 0.0 END), 6) AS dropped_event_rate +FROM daily_rollups r +LEFT JOIN recoveries rec ON rec.ts_wall LIKE r.event_date || '%' +LEFT JOIN events_raw er ON er.event_date = r.event_date AND er.event_name IN ('token_budget.decision', 'stop_hooks.completed', 'exporter.failure', 'dropped_event') +LEFT JOIN query_failures qf ON qf.event_date = r.event_date +LEFT JOIN tool_failure_queries tfq ON tfq.event_date = r.event_date +GROUP BY 1; + +CREATE OR REPLACE VIEW system_flags AS +SELECT + event_date, + 0.0 AS contextCollapse_enabled_gauge, + 0 AS contextCollapse_attempted, + 0 AS contextCollapse_committed, + CASE + WHEN SUM(CASE WHEN event_name = 'messages.history_snip.applied' THEN 1 ELSE 0 END) > 0 + THEN '样本中观察到命中' + ELSE '样本中未观察到命中' + END AS history_snip_gate_state, + CASE WHEN SUM(CASE WHEN event_name = 'messages.history_snip.applied' THEN 1 ELSE 0 END) > 0 THEN 1.0 ELSE 0.0 END AS history_snip_gate_on_rate +FROM events_raw +GROUP BY 1; + +COMMIT; +` + +writeFileSync(sqlPath, sql, "utf8") + +console.log( + JSON.stringify( + { + duckdbExe, + databasePath, + sqlPath, + eventsPath, + events: eventsRawRows.length, + queries: queryInsertRows.length, + turns: turnInsertRows.length, + tools: toolInsertRows.length, + subagents: subagentInsertRows.length, + recoveries: recoveryRows.length, + snapshots: snapshotInsertRows.length, + usageFacts: usageFactRows.length, + dailyRollups: dailyRollupRows.length, + }, + null, + 2, + ), +) diff --git a/scripts/observability/clean_observability.py b/scripts/observability/clean_observability.py new file mode 100644 index 0000000000..444be0b16b --- /dev/null +++ b/scripts/observability/clean_observability.py @@ -0,0 +1,420 @@ 
+from __future__ import annotations
+
+import json
+import re
+import shutil
+from dataclasses import dataclass
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+
+# Repository root: the third ancestor of this file's resolved path.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+# Directory scanned for event files; snapshots live in its "snapshots" child.
+OBSERVABILITY_DIR = REPO_ROOT / ".observability"
+EVENT_GLOB = "events-*.jsonl"
+SNAPSHOTS_DIR = OBSERVABILITY_DIR / "snapshots"
+# Archive destination. The dated folder name is hard-coded here and is not
+# derived from ARCHIVE_CUTOFF_DAY below.
+ARCHIVE_ROOT = REPO_ROOT / ".observability_archive" / "2026-04-19"
+ARCHIVE_EVENTS_DIR = ARCHIVE_ROOT / "events"
+ARCHIVE_SNAPSHOTS_DIR = ARCHIVE_ROOT / "snapshots"
+# Markdown reports written before and after the cleanup.
+PRE_REPORT_PATH = REPO_ROOT / "ObservrityTask" / "观测数据清洗前清单.md"
+POST_REPORT_PATH = REPO_ROOT / "ObservrityTask" / "观测数据清洗后校验报告.md"
+
+# Events whose day equals KEEP_DAY form the retained baseline.
+KEEP_DAY = date(2026, 4, 20)
+# Printed in the pre-report as the archive cutoff day.
+ARCHIVE_CUTOFF_DAY = date(2026, 4, 19)
+SNAPSHOT_REF_PREFIX = ".observability/snapshots/"
+# Matches a snapshot reference embedded anywhere inside a string value.
+SNAPSHOT_REF_RE = re.compile(r"\.observability/snapshots/[^\s\"']+\.json")
+
+
+@dataclass
+class ParsedEvent:
+    """One decoded event object plus the metadata derived from it."""
+
+    # The decoded JSON object.
+    obj: dict[str, Any]
+    # Event file the object was read from.
+    source_file: Path
+    # Day taken from ``ts_wall``; None when it is absent or unparsable.
+    day: date | None
+    # Snapshot references found anywhere inside ``obj``.
+    snapshot_refs: set[str]
+
+
+@dataclass
+class FilePartition:
+    """The events of one source file, split into kept and archived groups."""
+
+    source_file: Path
+    keep_events: list[ParsedEvent]
+    archive_events: list[ParsedEvent]
+
+
+def skip_whitespace(text: str, index: int) -> int:
+    """Return the first position at or after ``index`` that is not whitespace.
+
+    Returns ``len(text)`` when only whitespace remains.
+    """
+    length = len(text)
+    while index < length and text[index].isspace():
+        index += 1
+    return index
+
+
+def parse_concatenated_json(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
+    """Decode a file holding JSON values written back to back.
+
+    Args:
+        path: UTF-8 text file to read in full.
+
+    Returns:
+        ``(objects, errors)``: the decoded JSON objects in file order, and a
+        list of human-readable error messages.
+    """
+    text = path.read_text(encoding="utf-8")
+    decoder = json.JSONDecoder()
+    index = 0
+    objects: list[dict[str, Any]] = []
+    errors: list[str] = []
+
+    while True:
+        index = skip_whitespace(text, index)
+        if index >= len(text):
+            break
+        try:
+            # raw_decode parses one value and reports where it ended.
+            obj, next_index = decoder.raw_decode(text, index)
+        except json.JSONDecodeError as exc:
+            errors.append(f"{path.name}: JSON decode failed at char {index}: {exc}")
+            # Parsing stops at the first malformed value; anything after it in
+            # the file is not returned.
+            break
+        if not isinstance(obj, dict):
+            # Non-object values are reported and skipped; parsing continues.
+            errors.append(f"{path.name}: top-level object at char {index} is not a JSON object")
+        else:
+            objects.append(obj)
+        index = next_index
+
+    return objects, errors
+
+
+def
extract_day(obj: dict[str, Any]) -> date | None:
+    """Return the day encoded in the event's ``ts_wall`` field.
+
+    Only the first 10 characters are parsed. Returns ``None`` when the field
+    is missing, is not a string, is shorter than 10 characters, or does not
+    start with a valid ISO date.
+    """
+    raw = obj.get("ts_wall")
+    if not isinstance(raw, str) or len(raw) < 10:
+        return None
+    try:
+        return date.fromisoformat(raw[:10])
+    except ValueError:
+        return None
+
+
+def find_snapshot_refs(value: Any) -> set[str]:
+    """Collect every snapshot reference found in the strings nested in ``value``.
+
+    Dict values and list items are walked recursively; dict keys and other
+    value types are ignored.
+    """
+    refs: set[str] = set()
+
+    def walk(node: Any) -> None:
+        if isinstance(node, str):
+            refs.update(SNAPSHOT_REF_RE.findall(node))
+            return
+        if isinstance(node, dict):
+            for child in node.values():
+                walk(child)
+            return
+        if isinstance(node, list):
+            for child in node:
+                walk(child)
+
+    walk(value)
+    return refs
+
+
+def snapshot_ref_to_path(ref: str) -> Path:
+    """Map a snapshot reference to a path under ``REPO_ROOT``.
+
+    Raises:
+        ValueError: if ``ref`` does not start with ``SNAPSHOT_REF_PREFIX``.
+    """
+    if not ref.startswith(SNAPSHOT_REF_PREFIX):
+        raise ValueError(f"Unexpected snapshot ref: {ref}")
+    # Refs use "/" separators. Join the segments through pathlib rather than
+    # rewriting them to backslashes: the backslash form is only a valid path
+    # on Windows, elsewhere it becomes one file name containing backslashes.
+    return REPO_ROOT.joinpath(*ref.split("/"))
+
+
+def format_event_objects(events: list[ParsedEvent]) -> str:
+    """Serialize events as indented JSON objects separated by newlines.
+
+    Returns an empty string for an empty list; otherwise the text ends with a
+    single trailing newline.
+    """
+    chunks = [json.dumps(event.obj, ensure_ascii=False, indent=2) for event in events]
+    return "\n".join(chunks) + ("\n" if chunks else "")
+
+
+def collect_inventory() -> tuple[list[ParsedEvent], dict[Path, list[ParsedEvent]], list[str]]:
+    """Parse every event file in ``OBSERVABILITY_DIR``.
+
+    Returns:
+        ``(all_events, events_by_file, parse_errors)``. Files are visited in
+        sorted path order, and ``all_events`` follows that order.
+    """
+    all_events: list[ParsedEvent] = []
+    events_by_file: dict[Path, list[ParsedEvent]] = {}
+    parse_errors: list[str] = []
+
+    for path in sorted(OBSERVABILITY_DIR.glob(EVENT_GLOB)):
+        objects, errors = parse_concatenated_json(path)
+        parse_errors.extend(errors)
+        parsed = [
+            ParsedEvent(
+                obj=obj,
+                source_file=path,
+                day=extract_day(obj),
+                snapshot_refs=find_snapshot_refs(obj),
+            )
+            for obj in objects
+        ]
+        events_by_file[path] = parsed
+        all_events.extend(parsed)
+
+    return all_events, events_by_file, parse_errors
+
+
+def event_day_label(day: date | None) -> str:
+    """Return the ISO form of ``day``, or an empty string when it is None."""
+    return day.isoformat() if day else ""
+
+
+def build_pre_report(
+    all_events: list[ParsedEvent],
+    events_by_file: dict[Path, list[ParsedEvent]],
+    parse_errors: list[str],
+) -> str:
+    """Render the Markdown inventory report written before any file is moved."""
+    today_events = [event for event in all_events if event.day == KEEP_DAY]
+    # Undated events are counted with the older ones.
+    older_events = [event for event in all_events if event.day is None or
event.day < KEEP_DAY]
+    today_snapshot_refs = sorted({ref for event in today_events for ref in event.snapshot_refs})
+    older_snapshot_refs = sorted({ref for event in older_events for ref in event.snapshot_refs})
+    all_snapshot_paths = sorted(path for path in SNAPSHOTS_DIR.iterdir() if path.is_file())
+    all_snapshot_refs = {
+        f"{SNAPSHOT_REF_PREFIX}{path.name}".replace("\\", "/") for path in all_snapshot_paths
+    }
+    older_exclusive_snapshot_refs = sorted(set(older_snapshot_refs) - set(today_snapshot_refs))
+    unreferenced_snapshot_refs = sorted(all_snapshot_refs - set(today_snapshot_refs) - set(older_snapshot_refs))
+    # Only snapshot files that exist on disk can be archived. Counting
+    # older_exclusive_snapshot_refs directly would also count refs whose file
+    # is missing, so derive the planned set from the files actually present.
+    planned_archive_snapshot_refs = all_snapshot_refs - set(today_snapshot_refs)
+
+    lines = [
+        "# 观测数据清洗前清单",
+        "",
+        f"- 扫描日期:{KEEP_DAY.isoformat()}",
+        f"- 目标保留日:{KEEP_DAY.isoformat()}",
+        f"- 归档截止日:{ARCHIVE_CUTOFF_DAY.isoformat()} 及更早",
+        "",
+        "## Event 文件",
+        "",
+        "| 文件 | 事件数 | 日期范围 |",
+        "|---|---:|---|",
+    ]
+
+    # One table row per event file: event count and first/last day label.
+    for path, events in sorted(events_by_file.items()):
+        days = sorted({event_day_label(event.day) for event in events})
+        day_range = f"{days[0]} -> {days[-1]}" if days else ""
+        lines.append(f"| `{path.relative_to(REPO_ROOT).as_posix()}` | {len(events)} | {day_range} |")
+
+    lines.extend(
+        [
+            "",
+            "## 汇总",
+            "",
+            f"- 今日事件总数:{len(today_events)}",
+            f"- 昨天及更早事件总数:{len(older_events)}",
+            f"- snapshots 总数:{len(all_snapshot_paths)}",
+            f"- 今日事件引用的 snapshot 数:{len(today_snapshot_refs)}",
+            f"- 昨天及更早事件独占的 snapshot 数:{len(older_exclusive_snapshot_refs)}",
+            f"- 无引用 snapshot 数:{len(unreferenced_snapshot_refs)}",
+            "",
+            "## 解析状态",
+            "",
+            f"- event 文件解析错误数:{len(parse_errors)}",
+        ]
+    )
+
+    if parse_errors:
+        lines.extend(["", "### 解析错误", ""])
+        lines.extend(f"- {error}" for error in parse_errors)
+
+    lines.extend(
+        [
+            "",
+            "## 结论",
+            "",
+            f"- 今日保留基线将以 `{KEEP_DAY.isoformat()}` 事件为准。",
+            f"- 计划归档的旧快照数量:{len(planned_archive_snapshot_refs)}",
+            "- 快照清洗以事件引用关系为准,不按文件名日期粗删。",
+        ]
+    )
+    return "\n".join(lines) + "\n"
+
+
+def partition_events(events_by_file: dict[Path, list[ParsedEvent]]) ->
list[FilePartition]:
+    """Split each file's events into a kept group and an archived group.
+
+    Events dated before ``KEEP_DAY``, and events with no parsable day, are
+    archived. Every other event is kept, including events dated after
+    ``KEEP_DAY``: they used to fall into neither group and were deleted when
+    ``archive_events`` rewrote a mixed file.
+    """
+    partitions: list[FilePartition] = []
+    for source_file, events in sorted(events_by_file.items()):
+        keep_events = [event for event in events if event.day is not None and event.day >= KEEP_DAY]
+        archive_events = [event for event in events if event.day is None or event.day < KEEP_DAY]
+        partitions.append(
+            FilePartition(
+                source_file=source_file,
+                keep_events=keep_events,
+                archive_events=archive_events,
+            )
+        )
+    return partitions
+
+
+def ensure_archive_dirs() -> None:
+    """Create the event and snapshot archive directories if they are missing."""
+    ARCHIVE_EVENTS_DIR.mkdir(parents=True, exist_ok=True)
+    ARCHIVE_SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def archive_events(partitions: list[FilePartition]) -> tuple[list[str], list[str]]:
+    """Move or split event files according to their partition.
+
+    Returns:
+        ``(actions, retained_files)``: a log line per action taken, and the
+        repo-relative paths of the files that still hold kept events.
+    """
+    actions: list[str] = []
+    retained_files: list[str] = []
+    ensure_archive_dirs()
+
+    for partition in partitions:
+        src = partition.source_file
+        archive_target = ARCHIVE_EVENTS_DIR / src.name
+
+        # Only kept events: leave the file untouched.
+        if partition.keep_events and not partition.archive_events:
+            retained_files.append(src.relative_to(REPO_ROOT).as_posix())
+            actions.append(f"保留 `{src.relative_to(REPO_ROOT).as_posix()}` 原文件")
+            continue
+
+        # Only archived events: move the whole file, replacing any earlier copy.
+        if partition.archive_events and not partition.keep_events:
+            if archive_target.exists():
+                archive_target.unlink()
+            shutil.move(str(src), str(archive_target))
+            actions.append(
+                f"整文件归档 `{src.relative_to(REPO_ROOT).as_posix()}` -> `{archive_target.relative_to(REPO_ROOT).as_posix()}`"
+            )
+            continue
+
+        # Mixed file: write the archived events out, then rewrite the source
+        # with only the kept events. A file with no events gets no action.
+        if partition.keep_events and partition.archive_events:
+            archive_target.write_text(format_event_objects(partition.archive_events), encoding="utf-8")
+            src.write_text(format_event_objects(partition.keep_events), encoding="utf-8")
+            retained_files.append(src.relative_to(REPO_ROOT).as_posix())
+            actions.append(
+                f"拆分混合文件 `{src.relative_to(REPO_ROOT).as_posix()}`:保留 {len(partition.keep_events)} 条,归档 {len(partition.archive_events)} 条"
+            )
+
+    return actions, retained_files
+
+
+def archive_snapshots(keep_snapshot_refs: set[str]) -> tuple[list[str], list[str]]:
+    """Move every snapshot file whose ref is not in ``keep_snapshot_refs``."""
+    actions: list[str] = []
+
retained_snapshots: list[str] = []
+    ensure_archive_dirs()
+
+    for path in sorted(SNAPSHOTS_DIR.iterdir()):
+        if not path.is_file():
+            continue
+        # Refs are matched by file name directly under the snapshots directory.
+        ref = f"{SNAPSHOT_REF_PREFIX}{path.name}"
+        if ref in keep_snapshot_refs:
+            retained_snapshots.append(path.relative_to(REPO_ROOT).as_posix())
+            continue
+        target = ARCHIVE_SNAPSHOTS_DIR / path.name
+        # An existing archive copy with the same name is replaced.
+        if target.exists():
+            target.unlink()
+        shutil.move(str(path), str(target))
+        actions.append(
+            f"归档 snapshot `{path.relative_to(REPO_ROOT).as_posix()}` -> `{target.relative_to(REPO_ROOT).as_posix()}`"
+        )
+
+    return actions, retained_snapshots
+
+
+def validate_retained_state() -> dict[str, Any]:
+    """Re-scan the observability directory and summarize its consistency.
+
+    Returns a dict with the retained KEEP_DAY events, the per-file event map,
+    parse errors, the snapshot files present, refs with no file, files with no
+    ref, the number of events pointing at a missing snapshot, and whether all
+    four core event names were seen.
+    """
+    retained_events, retained_by_file, parse_errors = collect_inventory()
+    retained_today_events = [event for event in retained_events if event.day == KEEP_DAY]
+    retained_snapshot_refs = {ref for event in retained_today_events for ref in event.snapshot_refs}
+    retained_snapshot_paths = sorted(path for path in SNAPSHOTS_DIR.iterdir() if path.is_file())
+    retained_snapshot_ref_set = {
+        f"{SNAPSHOT_REF_PREFIX}{path.name}".replace("\\", "/") for path in retained_snapshot_paths
+    }
+
+    # Referenced but not on disk / on disk but not referenced.
+    missing_snapshot_refs = sorted(retained_snapshot_refs - retained_snapshot_ref_set)
+    orphan_snapshot_refs = sorted(retained_snapshot_ref_set - retained_snapshot_refs)
+    # Events that reference at least one snapshot that is not on disk.
+    orphan_event_count = sum(
+        1 for event in retained_today_events if any(ref not in retained_snapshot_ref_set for ref in event.snapshot_refs)
+    )
+    core_events = {
+        "input.process.started",
+        "prompt.build.completed",
+        "api.request.started",
+        "api.stream.completed",
+    }
+    # Every distinct "event" value seen today, not only the core names.
+    present_core_events = {event.obj.get("event") for event in retained_today_events}
+
+    return {
+        "retained_events": retained_today_events,
+        "retained_by_file": retained_by_file,
+        "parse_errors": parse_errors,
+        "retained_snapshot_paths": retained_snapshot_paths,
+        "missing_snapshot_refs": missing_snapshot_refs,
+        "orphan_snapshot_refs": orphan_snapshot_refs,
+        "orphan_event_count": orphan_event_count,
+        "core_chain_complete": core_events.issubset(present_core_events),
+        "present_core_events": sorted(event for event in present_core_events if isinstance(event, str)),
+    }
+
+
+def build_post_report(
+    validation: dict[str, Any],
+    event_actions: list[str],
+    snapshot_actions: list[str],
+    retained_event_files: list[str],
+    retained_snapshot_files: list[str],
+) -> str:
+    """Render the Markdown validation report written after archiving.
+
+    Args:
+        validation: the dict returned by ``validate_retained_state``.
+        event_actions: log lines from ``archive_events``.
+        snapshot_actions: log lines from ``archive_snapshots``.
+        retained_event_files: repo-relative paths of retained event files.
+        retained_snapshot_files: repo-relative paths of retained snapshots.
+    """
+    # Ready for ETL only with no parse errors, no missing snapshot and no
+    # event pointing at a missing snapshot. Orphan snapshots and the core
+    # chain check are reported but do not affect this flag.
+    etl_ready = (
+        not validation["parse_errors"]
+        and not validation["missing_snapshot_refs"]
+        and validation["orphan_event_count"] == 0
+    )
+
+    lines = [
+        "# 观测数据清洗后校验报告",
+        "",
+        f"- 基线日期:{KEEP_DAY.isoformat()}",
+        f"- 是否可作为新基线继续做 ETL:{'是' if etl_ready else '否'}",
+        "",
+        "## 校验结果",
+        "",
+        f"- 保留事件数:{len(validation['retained_events'])}",
+        f"- 保留 snapshot 数:{len(validation['retained_snapshot_paths'])}",
+        f"- 缺失 snapshot 引用数:{len(validation['missing_snapshot_refs'])}",
+        f"- orphan event 数:{validation['orphan_event_count']}",
+        f"- orphan snapshot 数:{len(validation['orphan_snapshot_refs'])}",
+        f"- 核心链路事件是否齐备:{'是' if validation['core_chain_complete'] else '否'}",
+        "",
+        "## 保留文件",
+        "",
+        "### 今日基线 event 文件",
+        "",
+    ]
+    lines.extend(f"- `{path}`" for path in retained_event_files)
+    lines.extend(["", "### 今日基线 snapshot 文件", ""])
+    lines.extend(f"- `{path}`" for path in retained_snapshot_files)
+
+    lines.extend(["", "## 归档位置", ""])
+    lines.append(f"- 旧 event 归档目录:`{ARCHIVE_EVENTS_DIR.relative_to(REPO_ROOT).as_posix()}`")
+    lines.append(f"- 旧 snapshot 归档目录:`{ARCHIVE_SNAPSHOTS_DIR.relative_to(REPO_ROOT).as_posix()}`")
+
+    lines.extend(["", "## 执行动作", ""])
+    lines.extend(f"- {action}" for action in event_actions)
+    lines.extend(f"- {action}" for action in snapshot_actions)
+
+    lines.extend(["", "## 解析与引用检查", ""])
+    lines.append(f"- event 文件解析错误数:{len(validation['parse_errors'])}")
+    if validation["parse_errors"]:
+        lines.extend(f"- {error}" for error in validation["parse_errors"])
+    lines.append(f"- 缺失 snapshot_ref:{len(validation['missing_snapshot_refs'])}")
+    for ref in
validation["missing_snapshot_refs"]:
+        lines.append(f"- 缺失:`{ref}`")
+    lines.append(f"- orphan snapshot:{len(validation['orphan_snapshot_refs'])}")
+    for ref in validation["orphan_snapshot_refs"]:
+        lines.append(f"- orphan:`{ref}`")
+
+    lines.extend(["", "## 结论", ""])
+    if etl_ready:
+        lines.append("- 清洗后的今日事件与快照引用关系闭合,可以作为新的 ETL / 指标 / trace reader / dashboard 基线。")
+    else:
+        lines.append("- 当前仍存在解析或引用问题,不能直接进入 ETL。")
+    return "\n".join(lines) + "\n"
+
+
+def main() -> None:
+    """Write the pre-report, archive old events and snapshots, then validate.
+
+    Raises:
+        SystemExit: if any event file failed to parse. Nothing is moved or
+            rewritten in that case; the pre-report is still written.
+    """
+    all_events, events_by_file, parse_errors = collect_inventory()
+    PRE_REPORT_PATH.write_text(
+        build_pre_report(all_events, events_by_file, parse_errors),
+        encoding="utf-8",
+    )
+
+    # Archiving moves or rewrites event files. A file that failed to parse is
+    # only partly present in events_by_file, so rewriting it would drop the
+    # part that was never decoded. Stop before touching anything.
+    if parse_errors:
+        raise SystemExit(
+            f"Aborting before archiving: {len(parse_errors)} parse error(s); "
+            f"see {PRE_REPORT_PATH.relative_to(REPO_ROOT).as_posix()}"
+        )
+
+    # Snapshots referenced by KEEP_DAY events stay in place; all others move.
+    keep_snapshot_refs = {
+        ref for event in all_events if event.day == KEEP_DAY for ref in event.snapshot_refs
+    }
+    partitions = partition_events(events_by_file)
+    event_actions, retained_event_files = archive_events(partitions)
+    snapshot_actions, retained_snapshot_files = archive_snapshots(keep_snapshot_refs)
+
+    validation = validate_retained_state()
+    POST_REPORT_PATH.write_text(
+        build_post_report(
+            validation,
+            event_actions,
+            snapshot_actions,
+            retained_event_files,
+            retained_snapshot_files,
+        ),
+        encoding="utf-8",
+    )
+
+    print("Pre-report:", PRE_REPORT_PATH.relative_to(REPO_ROOT).as_posix())
+    print("Post-report:", POST_REPORT_PATH.relative_to(REPO_ROOT).as_posix())
+    print("Archived events dir:", ARCHIVE_EVENTS_DIR.relative_to(REPO_ROOT).as_posix())
+    print("Archived snapshots dir:", ARCHIVE_SNAPSHOTS_DIR.relative_to(REPO_ROOT).as_posix())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/observability/daily_summary.ps1 b/scripts/observability/daily_summary.ps1
new file mode 100644
index 0000000000..a2cc7a82d8
--- /dev/null
+++ b/scripts/observability/daily_summary.ps1
@@ -0,0 +1,331 @@
+param(
+    [string]$Date,
+    [string]$EventsFile,
+    [switch]$SkipRebuild
+)
+
+[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
+
+$repoRoot = Split-Path -Parent (Split-Path -Parent
$PSScriptRoot)
+$observabilityDir = Join-Path $repoRoot ".observability"
+$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe"
+$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb"
+$rebuildScript = Join-Path $repoRoot "scripts\observability\rebuild_observability_db.ps1"
+
+# Fail early: every later step shells out to this executable.
+if (-not (Test-Path -LiteralPath $duckdbExe)) {
+    throw "DuckDB executable not found at $duckdbExe"
+}
+
+# Converts a DateTime to milliseconds since the Unix epoch (UTC).
+function Get-EpochMilliseconds {
+    param(
+        [datetime]$Value
+    )
+
+    return ([DateTimeOffset]$Value.ToUniversalTime()).ToUnixTimeMilliseconds()
+}
+
+# Picks the events file to report on. Precedence: an explicit file path, then
+# the file matching the requested date, then the last events-YYYYMMDD.jsonl by
+# name. Throws when nothing matches.
+function Resolve-TargetEventsFile {
+    param(
+        [string]$ObservabilityDir,
+        [string]$RequestedDate,
+        [string]$RequestedEventsFile
+    )
+
+    if (-not [string]::IsNullOrWhiteSpace($RequestedEventsFile)) {
+        return (Resolve-Path -LiteralPath $RequestedEventsFile).Path
+    }
+
+    $files = Get-ChildItem -LiteralPath $ObservabilityDir -Filter "events-*.jsonl" |
+        Where-Object { $_.Name -match '^events-\d{8}\.jsonl$' } |
+        Sort-Object Name
+
+    if (-not $files -or $files.Count -eq 0) {
+        throw "No events-YYYYMMDD.jsonl files found in $ObservabilityDir"
+    }
+
+    if (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
+        # Accepts both YYYY-MM-DD and YYYYMMDD.
+        $normalizedDate = $RequestedDate -replace '-', ''
+        $matched = $files | Where-Object { $_.BaseName -eq "events-$normalizedDate" } | Select-Object -First 1
+        if (-not $matched) {
+            throw "Requested events file not found for date $RequestedDate"
+        }
+        return $matched.FullName
+    }
+
+    return ($files | Select-Object -Last 1).FullName
+}
+
+# Returns the report date as YYYY-MM-DD, or $null when it cannot be derived.
+# A requested date is normalized and validated: Resolve-TargetEventsFile also
+# accepts YYYYMMDD, and the value is later interpolated into SQL text, so only
+# eight digits (with optional dashes) are allowed through.
+function Get-TargetDate {
+    param(
+        [string]$RequestedDate,
+        [string]$TargetEventsFile
+    )
+
+    if (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
+        $digits = $RequestedDate.Trim() -replace '-', ''
+        if ($digits -notmatch '^\d{8}$') {
+            throw "Invalid -Date value '$RequestedDate'; expected YYYY-MM-DD or YYYYMMDD"
+        }
+        return "{0}-{1}-{2}" -f $digits.Substring(0, 4), $digits.Substring(4, 2), $digits.Substring(6, 2)
+    }
+
+    # No explicit date: derive it from the events file name.
+    $match = [regex]::Match([System.IO.Path]::GetFileName($TargetEventsFile), '^events-(\d{4})(\d{2})(\d{2})\.jsonl$')
+    if ($match.Success) {
+        return "$($match.Groups[1].Value)-$($match.Groups[2].Value)-$($match.Groups[3].Value)"
+    }
+
+    return $null
+}
+
+function Get-BuildMeta {
+    param(
+
[string]$DuckDbExe,
+        [string]$DatabasePath
+    )
+
+    # Returns the first build_meta row, or $null when the database file is
+    # missing, the query fails, or it prints nothing.
+    if (-not (Test-Path -LiteralPath $DatabasePath)) {
+        return $null
+    }
+
+    # stderr is discarded; failure is detected through the exit code below.
+    $raw = & $DuckDbExe -json $DatabasePath "select * from build_meta limit 1;" 2>$null
+    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($raw)) {
+        return $null
+    }
+
+    return @($raw | ConvertFrom-Json)[0]
+}
+
+# Rebuilds the database when its recorded source file, size or mtime differs
+# from the target events file. Throws instead of rebuilding when -SkipRebuild
+# is set.
+function Ensure-FreshDatabase {
+    param(
+        [string]$TargetEventsFile,
+        [string]$RequestedDate,
+        [string]$DuckDbExe,
+        [string]$DatabasePath,
+        [string]$RebuildScript,
+        [switch]$SkipRebuild
+    )
+
+    $targetStat = Get-Item -LiteralPath $TargetEventsFile
+    $targetMtimeMs = Get-EpochMilliseconds -Value $targetStat.LastWriteTimeUtc
+    $buildMeta = Get-BuildMeta -DuckDbExe $DuckDbExe -DatabasePath $DatabasePath
+    # Stale when there is no metadata, or when path, size or mtime differ.
+    $isStale =
+        ($null -eq $buildMeta) -or
+        ($buildMeta.source_events_file -ne $TargetEventsFile) -or
+        ([int64]$buildMeta.source_events_size_bytes -ne [int64]$targetStat.Length) -or
+        ([int64]$buildMeta.source_events_mtime_ms -ne $targetMtimeMs)
+
+    if (-not $isStale) {
+        return
+    }
+
+    if ($SkipRebuild) {
+        throw "Observability DB is stale for $TargetEventsFile and -SkipRebuild was provided."
+    }
+
+    # Forward the same selector the caller used. Note that $EventsFile is the
+    # script-level parameter, not a parameter of this function.
+    $rebuildArgs = @("-ExecutionPolicy", "Bypass", "-File", $RebuildScript, "-Quiet")
+    if (-not [string]::IsNullOrWhiteSpace($EventsFile)) {
+        $rebuildArgs += @("-EventsFile", $TargetEventsFile)
+    } elseif (-not [string]::IsNullOrWhiteSpace($RequestedDate)) {
+        $rebuildArgs += @("-Date", $RequestedDate)
+    }
+
+    & powershell @rebuildArgs
+    # A failed rebuild ends the whole script with the child's exit code.
+    if ($LASTEXITCODE -ne 0) {
+        exit $LASTEXITCODE
+    }
+}
+
+# Runs one SQL statement against the script-level database through the DuckDB
+# CLI in JSON mode. Returns the parsed rows wrapped in an array, or an empty
+# array when the CLI prints nothing. Throws on a non-zero exit code.
+function Invoke-DuckDbJson {
+    param(
+        [string]$Sql
+    )
+
+    $raw = & $duckdbExe -json $dbPath $Sql
+    if ($LASTEXITCODE -ne 0) {
+        throw "DuckDB query failed: $Sql"
+    }
+    if ([string]::IsNullOrWhiteSpace($raw)) {
+        return @()
+    }
+    return @($raw | ConvertFrom-Json)
+}
+
+$targetEventsFile = Resolve-TargetEventsFile -ObservabilityDir $observabilityDir -RequestedDate $Date -RequestedEventsFile $EventsFile
+$targetDate = Get-TargetDate -RequestedDate $Date -TargetEventsFile $targetEventsFile
+
+Ensure-FreshDatabase -TargetEventsFile $targetEventsFile -RequestedDate $Date -DuckDbExe $duckdbExe -DatabasePath $dbPath -RebuildScript $rebuildScript -SkipRebuild:$SkipRebuild
+
+if (-not (Test-Path -LiteralPath $dbPath)) {
+    throw "DuckDB database not found at $dbPath"
+}
+
+# No date from the arguments or the file name: use the newest rollup date.
+if ([string]::IsNullOrWhiteSpace($targetDate)) {
+    $targetDate = (Invoke-DuckDbJson "select max(event_date) as event_date from daily_rollups;")[0].event_date
+}
+
+# One row per metrics view for the target date; $targetDate is interpolated
+# directly into the SQL text.
+$buildMeta = (Invoke-DuckDbJson "select source_events_file_name, source_events_size_bytes, events_row_count, built_at from build_meta limit 1;")[0]
+$rollup = (Invoke-DuckDbJson "select * from daily_rollups where event_date = '$targetDate' limit 1;")[0]
+$integrity = (Invoke-DuckDbJson "select * from metrics_integrity_daily where event_date = '$targetDate' limit 1;")[0]
+$cost = (Invoke-DuckDbJson "select * from metrics_cost_daily where event_date = '$targetDate' limit 1;")[0]
+$loops = (Invoke-DuckDbJson "select * from metrics_loop_daily where event_date = '$targetDate' limit 1;")[0]
+$latency = (Invoke-DuckDbJson "select * from
metrics_latency_daily where event_date = '$targetDate' limit 1;")[0] +$compression = (Invoke-DuckDbJson "select * from metrics_compression_daily where event_date = '$targetDate' limit 1;")[0] +$toolMetrics = (Invoke-DuckDbJson "select * from metrics_tools_daily where event_date = '$targetDate' limit 1;")[0] +$recovery = (Invoke-DuckDbJson "select * from metrics_recovery_daily where event_date = '$targetDate' limit 1;")[0] +$flags = (Invoke-DuckDbJson "select * from system_flags where event_date = '$targetDate' limit 1;")[0] +$costShare = Invoke-DuckDbJson "select query_source, total_prompt_input_tokens, total_billed_tokens, daily_cost_share from query_source_cost_share_daily where event_date = '$targetDate' order by total_billed_tokens desc, query_source asc;" +$agentCosts = Invoke-DuckDbJson "select agent_name, source_group, agent_total_prompt_input_tokens, agent_total_billed_tokens, agent_cost_share, agent_query_count, agent_avg_turns_per_query, agent_avg_loop_iter_end from agent_cost_daily where event_date = '$targetDate' order by agent_total_billed_tokens desc, agent_name asc;" +$recentActions = Invoke-DuckDbJson "select user_action_id, duration_ms, query_count, main_thread_query_count, subagent_count, total_prompt_input_tokens, total_billed_tokens from user_actions where event_date = '$targetDate' order by started_at desc limit 10;" +$subagentReasons = Invoke-DuckDbJson "select subagent_reason, agent_name, subagent_count, avg_duration_ms from subagent_reason_daily where event_date = '$targetDate' order by subagent_count desc, subagent_reason asc;" +$queries = Invoke-DuckDbJson "select query_source, count(*) as query_count, sum(duration_ms) as total_duration_ms, sum(tool_call_count) as total_tool_calls from queries where started_at like '$targetDate%' group by 1 order by query_count desc, query_source asc;" +$tools = Invoke-DuckDbJson "select tool_name, tool_calls, tool_success_rate, tool_avg_duration_ms, tool_p95_duration_ms from tool_calls_by_name order by 
tool_calls desc, tool_name asc;" +$toolModes = Invoke-DuckDbJson "select tool_mode, tool_calls from tool_calls_by_mode order by tool_calls desc, tool_mode asc;" +$subagents = Invoke-DuckDbJson "select coalesce(subagent_type, 'unknown') as subagent_type, count(*) as subagent_count, avg(duration_ms) as avg_duration_ms from subagents where coalesce(spawned_at, completed_at, '') like '$targetDate%' group by 1 order by subagent_count desc, subagent_type asc;" + +if (-not $rollup) { + throw "No daily rollup found for $targetDate" +} + +Write-Output "日期: $($rollup.event_date)" +Write-Output "源文件: $($buildMeta.source_events_file_name)" +Write-Output "源文件大小(bytes): $($buildMeta.source_events_size_bytes)" +Write-Output "建库时间: $($buildMeta.built_at)" +Write-Output "入库事件数: $($buildMeta.events_row_count)" +Write-Output "" +Write-Output "概览:" +Write-Output " 事件数: $($rollup.event_count)" +Write-Output " 用户动作数: $($rollup.user_action_count)" +Write-Output " Query 数: $($rollup.query_count)" +Write-Output " Turn 数: $($rollup.turn_count)" +Write-Output " 工具调用数: $($rollup.tool_call_count)" +Write-Output " Subagent 数: $($rollup.subagent_count)" +Write-Output " Snapshot 引用数: $($rollup.snapshot_ref_count)" +Write-Output " 最新事件时间: $($rollup.latest_event_ts)" +Write-Output "" +Write-Output "完整性:" +Write-Output " user_action -> 主线程 query 覆盖率: $($integrity.user_action_main_query_coverage_rate)" +Write-Output " 原生 query 完成率: $($integrity.strict_query_completion_rate)" +Write-Output " 推断 query 完成率: $($integrity.inferred_query_completion_rate)" +Write-Output " query 补链差值: $($integrity.query_completeness_gap)" +Write-Output " 原生 turn 闭合率: $($integrity.strict_turn_state_closure_rate)" +Write-Output " 推断 turn 闭合率: $($integrity.inferred_turn_state_closure_rate)" +Write-Output " turn 补链差值: $($integrity.turn_closure_gap)" +Write-Output " 工具生命周期闭合率: $($integrity.tool_lifecycle_closure_rate)" +Write-Output " subagent 生命周期闭合率: $($integrity.subagent_lifecycle_closure_rate)" +Write-Output " snapshot 缺失率: 
$($integrity.snapshot_missing_rate)" +Write-Output " orphan event 率: $($integrity.orphan_event_rate)" +Write-Output "" +Write-Output "成本 - 每日总量:" +Write-Output " 总 prompt 输入 tokens: $($cost.user_action_total_prompt_input_tokens)" +Write-Output " 总 billed tokens: $($cost.user_action_total_billed_tokens)" +Write-Output " output tokens: $($cost.user_action_total_output_tokens)" +Write-Output "成本 - 结构拆分:" +Write-Output " 裸 input tokens: $($cost.user_action_total_raw_input_tokens)" +Write-Output " cache read input tokens: $($cost.user_action_total_cache_read_tokens)" +Write-Output " cache create input tokens: $($cost.user_action_total_cache_create_tokens)" +Write-Output "成本 - 主/子链路:" +Write-Output " 主线程总 prompt 输入 tokens: $($cost.main_thread_total_prompt_input_tokens)" +Write-Output " subagent 总 prompt 输入 tokens: $($cost.subagent_total_prompt_input_tokens)" +Write-Output " subagent 放大倍率: $($cost.subagent_amplification_ratio)" +Write-Output "成本 - 平均/效率:" +Write-Output " 平均每个 user_action 的 prompt 输入: $($cost.avg_total_prompt_input_tokens_per_user_action)" +Write-Output " 平均每个 user_action 的 billed: $($cost.avg_total_billed_tokens_per_user_action)" +Write-Output " 平均每个 query 的 prompt 输入: $($cost.avg_total_prompt_input_tokens_per_query)" +Write-Output " 平均每个 query 的 billed: $($cost.avg_total_billed_tokens_per_query)" +Write-Output " 每个成功 completed query 的平均成本: $($cost.cost_per_successful_completed_query)" +Write-Output "" +Write-Output "Loop / Turn:" +Write-Output " 每个 query 的平均 turn 数: $($loops.daily_avg_turns_per_query)" +Write-Output " 每个 query 的平均 loop 终点: $($loops.daily_avg_loop_iter_end)" +Write-Output " query loop 终点 P95: $($loops.daily_p95_loop_iter_end)" +Write-Output " loop_iter > 1 的 query 占比: $($loops.daily_queries_with_loop_iter_gt_1_rate)" +Write-Output "" +Write-Output "延迟(ms):" +Write-Output " submit -> first chunk: $($latency.submit_to_first_chunk_ms)" +Write-Output " preprocess: $($latency.preprocess_duration_ms)" +Write-Output " prompt.build: 
$($latency.prompt_build_duration_ms)" +Write-Output " request -> first chunk: $($latency.api_first_chunk_latency_ms)" +Write-Output " request 总时长: $($latency.api_total_duration_ms)" +Write-Output " 工具执行平均时长: $($latency.tool_execution_duration_ms)" +Write-Output " stop hooks 平均时长: $($latency.stop_hook_duration_ms)" +Write-Output " subagent 生命周期平均时长: $($latency.subagent_duration_ms)" +Write-Output " user action 端到端平均时长: $($latency.user_action_e2e_duration_ms)" +Write-Output "" +Write-Output "压缩与上下文治理:" +Write-Output " preprocess 前 tokens 总量: $($compression.preprocess_tokens_before_total)" +Write-Output " preprocess 后 tokens 总量: $($compression.preprocess_tokens_after_total)" +Write-Output " 总节省 tokens: $($compression.tokens_saved_total)" +Write-Output " compression_gain_ratio: $($compression.compression_gain_ratio)" +Write-Output " tool_result_budget_saved_tokens: $($compression.tool_result_budget_saved_tokens)" +Write-Output " history_snip_saved_tokens: $($compression.history_snip_saved_tokens)" +Write-Output " microcompact_saved_tokens: $($compression.microcompact_saved_tokens)" +Write-Output " autocompact_saved_tokens: $($compression.autocompact_saved_tokens)" +Write-Output " autocompact_trigger_rate: $($compression.autocompact_trigger_rate)" +Write-Output "" +Write-Output "工具:" +Write-Output " 工具调用总数: $($toolMetrics.tool_calls_total)" +Write-Output " 工具成功率: $($toolMetrics.tool_success_rate)" +Write-Output " 工具失败率: $($toolMetrics.tool_failure_rate)" +Write-Output " 工具平均时长: $($toolMetrics.tool_avg_duration_ms)" +Write-Output " 工具 P95 时长: $($toolMetrics.tool_p95_duration_ms)" +Write-Output " context_update_rate: $($toolMetrics.context_update_rate)" +Write-Output " tools_per_query: $($toolMetrics.tools_per_query)" +Write-Output " tools_per_subagent: $($toolMetrics.tools_per_subagent)" +Write-Output " tool_followup_turn_ratio: $($toolMetrics.tool_followup_turn_ratio)" +Write-Output "" +Write-Output "恢复与异常:" +Write-Output " prompt_too_long_recovery_attempts: 
$($recovery.prompt_too_long_recovery_attempts)" +Write-Output " prompt_too_long_recovery_success_rate: $($recovery.prompt_too_long_recovery_success_rate)" +Write-Output " max_output_tokens_recovery_attempts: $($recovery.max_output_tokens_recovery_attempts)" +Write-Output " max_output_tokens_recovery_success_rate: $($recovery.max_output_tokens_recovery_success_rate)" +Write-Output " token_budget_continue_rate: $($recovery.token_budget_continue_rate)" +Write-Output " stop_hook_block_rate: $($recovery.stop_hook_block_rate)" +Write-Output " api_error_rate: $($recovery.api_error_rate)" +Write-Output " tool_failure_terminal_rate: $($recovery.tool_failure_terminal_rate)" +Write-Output " exporter_failure_rate: $($recovery.exporter_failure_rate)" +Write-Output " dropped_event_rate: $($recovery.dropped_event_rate)" +Write-Output "" +Write-Output "显式状态:" +Write-Output " contextCollapse_enabled_gauge: $($flags.contextCollapse_enabled_gauge)" +Write-Output " contextCollapse_attempted: $($flags.contextCollapse_attempted)" +Write-Output " contextCollapse_committed: $($flags.contextCollapse_committed)" +Write-Output " history_snip_gate_state: $($flags.history_snip_gate_state)" +Write-Output " history_snip_gate_on_rate: $($flags.history_snip_gate_on_rate)" +Write-Output "" +Write-Output "按 source 成本拆分:" +foreach ($row in @($costShare)) { + Write-Output (" {0}: total_prompt_input_tokens={1}, total_billed_tokens={2}, daily_cost_share={3}" -f $row.query_source, $row.total_prompt_input_tokens, $row.total_billed_tokens, $row.daily_cost_share) +} +Write-Output "" +Write-Output "按 agent/source 成本拆分:" +foreach ($row in @($agentCosts)) { + Write-Output (" {0} [{1}]: total_prompt_input_tokens={2}, total_billed_tokens={3}, cost_share={4}, queries={5}, avg_turns_per_query={6}, avg_loop_iter_end={7}" -f $row.agent_name, $row.source_group, $row.agent_total_prompt_input_tokens, $row.agent_total_billed_tokens, $row.agent_cost_share, $row.agent_query_count, $row.agent_avg_turns_per_query, 
$row.agent_avg_loop_iter_end) +} +Write-Output "" +Write-Output "按 source query 概览:" +foreach ($row in @($queries)) { + Write-Output (" {0}: queries={1}, total_duration_ms={2}, tool_calls={3}" -f $row.query_source, $row.query_count, $row.total_duration_ms, $row.total_tool_calls) +} +Write-Output "" +Write-Output "最近用户动作:" +foreach ($row in @($recentActions)) { + Write-Output (" {0}: duration_ms={1}, queries={2}, main_thread_queries={3}, subagents={4}, total_prompt_input_tokens={5}, total_billed_tokens={6}" -f $row.user_action_id, $row.duration_ms, $row.query_count, $row.main_thread_query_count, $row.subagent_count, $row.total_prompt_input_tokens, $row.total_billed_tokens) +} +Write-Output "" +Write-Output "工具明细:" +foreach ($row in @($tools)) { + Write-Output (" {0}: calls={1}, success_rate={2}, avg_duration_ms={3}, p95_duration_ms={4}" -f $row.tool_name, $row.tool_calls, $row.tool_success_rate, $row.tool_avg_duration_ms, $row.tool_p95_duration_ms) +} +Write-Output "" +Write-Output "工具模式:" +foreach ($row in @($toolModes)) { + Write-Output (" {0}: calls={1}" -f $row.tool_mode, $row.tool_calls) +} +Write-Output "" +Write-Output "Subagent 明细:" +foreach ($row in @($subagents)) { + $avgDuration = if ($null -eq $row.avg_duration_ms) { 0 } else { [double]$row.avg_duration_ms } + Write-Output (" {0}: count={1}, avg_duration_ms={2}" -f $row.subagent_type, $row.subagent_count, [math]::Round($avgDuration, 2)) +} +Write-Output "" +Write-Output "Subagent Reason 明细:" +foreach ($row in @($subagentReasons)) { + $avgDuration = if ($null -eq $row.avg_duration_ms) { 0 } else { [double]$row.avg_duration_ms } + Write-Output (" {0} -> {1}: count={2}, avg_duration_ms={3}" -f $row.subagent_reason, $row.agent_name, $row.subagent_count, [math]::Round($avgDuration, 2)) +} diff --git a/scripts/observability/explain_action.ps1 b/scripts/observability/explain_action.ps1 new file mode 100644 index 0000000000..fe68778f2c --- /dev/null +++ b/scripts/observability/explain_action.ps1 @@ -0,0 +1,462 
@@ +param( + [string]$UserActionId, + [switch]$Latest, + [string]$OutputPath +) + +$ErrorActionPreference = "Stop" + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb" +$defaultOutputDir = Join-Path $repoRoot "ObservrityTask" + +if (-not (Test-Path -LiteralPath $duckdbExe)) { + throw "DuckDB executable not found at $duckdbExe" +} + +if (-not (Test-Path -LiteralPath $dbPath)) { + throw "DuckDB database not found at $dbPath" +} + +function As-Array { + param([object]$Value) + + if ($null -eq $Value) { + return @() + } + + if ($Value -is [System.Array]) { + return $Value + } + + return @($Value) +} + +function Escape-SqlLiteral { + param([string]$Value) + return $Value.Replace("'", "''") +} + +function Invoke-DuckDbJson { + param([string]$Sql) + + $raw = & $duckdbExe -json $dbPath $Sql + if ([string]::IsNullOrWhiteSpace($raw)) { + return @() + } + + return As-Array ($raw | ConvertFrom-Json) +} + +function To-LocalDisplay { + param([string]$UtcText) + + if ([string]::IsNullOrWhiteSpace($UtcText)) { + return "" + } + + return ([DateTimeOffset]::Parse($UtcText).ToLocalTime().ToString("yyyy-MM-dd HH:mm:ss")) +} + +function To-LocalShort { + param([string]$UtcText) + + if ([string]::IsNullOrWhiteSpace($UtcText)) { + return "" + } + + return ([DateTimeOffset]::Parse($UtcText).ToLocalTime().ToString("HH:mm:ss")) +} + +function To-MermaidLabel { + param([string[]]$Lines) + + $text = ($Lines | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }) -join "
" + return $text.Replace('"', "'") +} + +function Short-Id { + param([string]$Value) + + if ([string]::IsNullOrWhiteSpace($Value)) { + return "null" + } + + if ($Value.Length -le 8) { + return $Value + } + + return $Value.Substring(0, 8) +} + +function Get-QueryNodeId { + param([string]$QueryId) + return "Q_" + (Short-Id $QueryId) +} + +function Get-TurnNodeId { + param([string]$QueryId, [string]$TurnId) + return "T_" + (Short-Id $QueryId) + "_" + ($TurnId.Replace("-", "_")) +} + +function Get-SpawnNodeId { + param([int]$Index) + return "S_$Index" +} + +function Get-ToolLabel { + param([object[]]$ToolRows) + + if ($ToolRows.Count -eq 0) { + return $null + } + + $names = @($ToolRows | Select-Object -ExpandProperty tool_name) + return ($names -join " + ") +} + +function Find-MainTurnForSpawn { + param( + [long]$SpawnAtMs, + [object[]]$TurnRows + ) + + if ($TurnRows.Count -eq 0) { + return $null + } + + for ($i = 0; $i -lt $TurnRows.Count; $i++) { + $current = $TurnRows[$i] + $next = if ($i + 1 -lt $TurnRows.Count) { $TurnRows[$i + 1] } else { $null } + $startsBefore = [long]$current.started_at_ms -le $SpawnAtMs + $nextStartsAfter = ($null -eq $next) -or ([long]$next.started_at_ms -gt $SpawnAtMs) + if ($startsBefore -and $nextStartsAfter) { + return $current + } + } + + return $null +} + +if ([string]::IsNullOrWhiteSpace($UserActionId)) { + $Latest = $true +} + +if ($Latest) { + $latestRows = Invoke-DuckDbJson @" +select user_action_id +from user_actions +order by started_at_ms desc +limit 1; +"@ + + if ($latestRows.Count -eq 0) { + throw "No user actions found in user_actions." 
+ } + + $UserActionId = $latestRows[0].user_action_id +} + +$escapedActionId = Escape-SqlLiteral $UserActionId + +$actionRows = Invoke-DuckDbJson @" +select * +from user_actions +where user_action_id = '$escapedActionId'; +"@ + +if ($actionRows.Count -eq 0) { + throw "User action not found: $UserActionId" +} + +$action = $actionRows[0] + +$integrityRows = Invoke-DuckDbJson @" +select * +from metrics_integrity_daily +where event_date = '$($action.event_date)'; +"@ +$integrity = if ($integrityRows.Count -gt 0) { $integrityRows[0] } else { $null } + +$queries = Invoke-DuckDbJson @" +select query_id, user_action_id, query_source, subagent_id, subagent_reason, + subagent_trigger_kind, subagent_trigger_detail, + agent_name, source_group, + started_at, started_at_ms, ended_at, ended_at_ms, duration_ms, + turn_count, query_max_loop_iter, tool_call_count, terminal_reason, + strict_is_complete, inferred_is_complete +from queries +where user_action_id = '$escapedActionId' +order by started_at_ms asc; +"@ + +$turns = Invoke-DuckDbJson @" +select query_id, turn_id, agent_name, query_source, started_at, started_at_ms, ended_at, ended_at_ms, + duration_ms, loop_iter_start, loop_iter_end, tool_call_count, stop_reason, + transition_out, termination_reason, strict_is_closed, inferred_is_closed +from turns +where user_action_id = '$escapedActionId' +order by started_at_ms asc; +"@ + +$subagents = Invoke-DuckDbJson @" +select subagent_id, query_id, subagent_type, subagent_reason, + subagent_trigger_kind, subagent_trigger_detail, + query_source, agent_name, source_group, + spawned_at, spawned_at_ms, completed_at, completed_at_ms, duration_ms +from subagents +where user_action_id = '$escapedActionId' +order by spawned_at_ms asc; +"@ + +$tools = Invoke-DuckDbJson @" +select query_id, turn_id, tool_name, detected_at, detected_at_ms, duration_ms, success +from tools +where user_action_id = '$escapedActionId' +order by detected_at_ms asc; +"@ + +$spawns = Invoke-DuckDbJson @" +select 
ts_wall, ts_wall_ms, query_id, subagent_id, subagent_reason, + subagent_trigger_kind, subagent_trigger_detail, query_source +from events_raw +where user_action_id = '$escapedActionId' + and event_name = 'subagent.spawned' +order by ts_wall_ms asc; +"@ + +$mainQuery = $queries | Where-Object { $_.agent_name -eq "main_thread" } | Select-Object -First 1 +$mainTurns = @($turns | Where-Object { $_.agent_name -eq "main_thread" } | Sort-Object started_at_ms) + +$toolsByTurnKey = @{} +foreach ($tool in $tools) { + $key = "$($tool.query_id)|$($tool.turn_id)" + if (-not $toolsByTurnKey.ContainsKey($key)) { + $toolsByTurnKey[$key] = @() + } + $toolsByTurnKey[$key] += $tool +} + +$turnsByQuery = @{} +foreach ($turn in $turns) { + if (-not $turnsByQuery.ContainsKey($turn.query_id)) { + $turnsByQuery[$turn.query_id] = @() + } + $turnsByQuery[$turn.query_id] += $turn +} + +if ([string]::IsNullOrWhiteSpace($OutputPath)) { + $OutputPath = Join-Path $defaultOutputDir ("user_action_{0}_auto_report.md" -f (Short-Id $UserActionId)) +} elseif (-not [System.IO.Path]::IsPathRooted($OutputPath)) { + $OutputPath = Join-Path $repoRoot $OutputPath +} + +$mermaidLines = New-Object System.Collections.Generic.List[string] +$mermaidLines.Add("flowchart TD") +$mermaidLines.Add((" UA[""{0}""]" -f (To-MermaidLabel @( + "user_action" + (Short-Id $UserActionId) + ("{0} -> {1}" -f (To-LocalShort $action.started_at), (To-LocalShort $action.ended_at)) + )))) + +$queryNodeIds = @{} +foreach ($query in $queries) { + $queryNodeId = Get-QueryNodeId $query.query_id + $queryNodeIds[$query.query_id] = $queryNodeId + $queryLabel = To-MermaidLabel @( + $query.agent_name + (Short-Id $query.query_id) + ("{0} turns" -f $query.turn_count) + $query.terminal_reason + ) + $mermaidLines.Add((" {0}[""{1}""]" -f $queryNodeId, $queryLabel)) +} + +$turnNodeIds = @{} +foreach ($turn in $turns) { + $turnNodeId = Get-TurnNodeId $turn.query_id $turn.turn_id + $turnNodeIds["$($turn.query_id)|$($turn.turn_id)"] = $turnNodeId + 
$toolKey = "$($turn.query_id)|$($turn.turn_id)" + $toolLabel = if ($toolsByTurnKey.ContainsKey($toolKey)) { Get-ToolLabel $toolsByTurnKey[$toolKey] } else { $null } + $detail = if (-not [string]::IsNullOrWhiteSpace($toolLabel)) { $toolLabel } elseif (-not [string]::IsNullOrWhiteSpace($turn.stop_reason)) { $turn.stop_reason } else { "no_tool" } + $turnLabel = To-MermaidLabel @( + $turn.turn_id + $detail + ("loop={0}" -f $turn.loop_iter_end) + ) + $mermaidLines.Add((" {0}[""{1}""]" -f $turnNodeId, $turnLabel)) +} + +foreach ($query in $queries) { + $queryTurns = @($turnsByQuery[$query.query_id] | Sort-Object started_at_ms) + if ($queryTurns.Count -eq 0) { + continue + } + + $queryNodeId = $queryNodeIds[$query.query_id] + $firstTurnNodeId = $turnNodeIds["$($query.query_id)|$($queryTurns[0].turn_id)"] + $mermaidLines.Add((" {0} --> {1}" -f $queryNodeId, $firstTurnNodeId)) + + for ($i = 0; $i -lt $queryTurns.Count - 1; $i++) { + $fromNodeId = $turnNodeIds["$($query.query_id)|$($queryTurns[$i].turn_id)"] + $toNodeId = $turnNodeIds["$($query.query_id)|$($queryTurns[$i + 1].turn_id)"] + $mermaidLines.Add((" {0} --> {1}" -f $fromNodeId, $toNodeId)) + } +} + +$spawnIndex = 0 +$spawnSummary = @() +foreach ($spawn in $spawns) { + $spawnIndex += 1 + $spawnNodeId = Get-SpawnNodeId $spawnIndex + $spawnSummary += [PSCustomObject]@{ + NodeId = $spawnNodeId + QueryId = $spawn.query_id + SubagentId = $spawn.subagent_id + SubagentReason = $spawn.subagent_reason + SubagentTriggerKind = $spawn.subagent_trigger_kind + SubagentTriggerDetail = $spawn.subagent_trigger_detail + SpawnedAt = $spawn.ts_wall + SpawnedAtMs = [long]$spawn.ts_wall_ms + } + + $spawnLabel = To-MermaidLabel @( + ("spawn {0}" -f $spawn.subagent_reason) + $spawn.subagent_trigger_detail + (To-LocalShort $spawn.ts_wall) + ) + $mermaidLines.Add((" {0}[""{1}""]" -f $spawnNodeId, $spawnLabel)) + + $queryNodeId = $queryNodeIds[$spawn.query_id] + $parentTurn = Find-MainTurnForSpawn -SpawnAtMs ([long]$spawn.ts_wall_ms) 
-TurnRows $mainTurns + if ($null -ne $parentTurn) { + $parentTurnNodeId = $turnNodeIds["$($parentTurn.query_id)|$($parentTurn.turn_id)"] + $mermaidLines.Add((" {0} --> {1} --> {2}" -f $parentTurnNodeId, $spawnNodeId, $queryNodeId)) + } else { + $mermaidLines.Add((" UA --> {0} --> {1}" -f $spawnNodeId, $queryNodeId)) + } +} + +foreach ($query in $queries) { + if (($null -ne $mainQuery) -and ($query.query_id -eq $mainQuery.query_id)) { + $mermaidLines.Add((" UA --> {0}" -f $queryNodeIds[$query.query_id])) + continue + } + + $hasSpawn = $spawnSummary | Where-Object { $_.QueryId -eq $query.query_id } | Select-Object -First 1 + if ($null -eq $hasSpawn) { + $mermaidLines.Add((" UA --> {0}" -f $queryNodeIds[$query.query_id])) + } +} + +$content = New-Object System.Collections.Generic.List[string] +$content.Add("# Action Report") +$content.Add("") +$content.Add("This report is generated directly from the current .observability files and DuckDB facts. Copy the Mermaid block into Mermaid Live Editor to visualize the graph.") +$content.Add("") +$content.Add("## Basics") +$content.Add("") +$content.Add("- user_action_id: $UserActionId") +$content.Add("- UTC: $($action.started_at) -> $($action.ended_at)") +$content.Add("- Local: $(To-LocalDisplay $action.started_at) -> $(To-LocalDisplay $action.ended_at)") +$content.Add("- duration_ms: $($action.duration_ms)") +$content.Add("- query_count: $($action.query_count)") +$content.Add("- subagent_count: $($action.subagent_count)") +$content.Add("- tool_call_count: $($action.tool_call_count)") +$content.Add("- total_prompt_input_tokens: $($action.total_prompt_input_tokens)") +$content.Add("- total_billed_tokens: $($action.total_billed_tokens)") +$content.Add("") + +if ($null -ne $integrity) { + $content.Add("## Integrity Snapshot") + $content.Add("") + $content.Add("- strict_query_completion_rate: $($integrity.strict_query_completion_rate)") + $content.Add("- inferred_query_completion_rate: 
$($integrity.inferred_query_completion_rate)") + $content.Add("- strict_turn_state_closure_rate: $($integrity.strict_turn_state_closure_rate)") + $content.Add("- tool_lifecycle_closure_rate: $($integrity.tool_lifecycle_closure_rate)") + $content.Add("- subagent_lifecycle_closure_rate: $($integrity.subagent_lifecycle_closure_rate)") + $content.Add("- orphan_event_rate: $($integrity.orphan_event_rate)") + $content.Add("") +} + +if ($queries.Count -eq 1) { + $content.Add("## Summary") + $content.Add("") + $content.Add("This action expanded into a single query without extra branches.") + $content.Add("") +} else { + $content.Add("## Summary") + $content.Add("") + $content.Add("This action expanded into $($queries.Count) queries and $($subagents.Count) subagents.") + $content.Add("") +} + +$content.Add("## Mermaid DAG") +$content.Add("") +$content.Add('```mermaid') +foreach ($line in $mermaidLines) { + $content.Add($line) +} +$content.Add('```') +$content.Add("") + +$content.Add("## Query List") +$content.Add("") +foreach ($query in $queries) { + $content.Add("### $($query.agent_name) $($query.query_id)") + $content.Add("") + $content.Add("- query_source: $($query.query_source)") + $content.Add("- subagent_reason: $($query.subagent_reason)") + $content.Add("- subagent_trigger_kind: $($query.subagent_trigger_kind)") + $content.Add("- subagent_trigger_detail: $($query.subagent_trigger_detail)") + $content.Add("- time: $(To-LocalDisplay $query.started_at) -> $(To-LocalDisplay $query.ended_at)") + $content.Add("- turn_count: $($query.turn_count)") + $content.Add("- max_loop_iter: $($query.query_max_loop_iter)") + $content.Add("- tool_call_count: $($query.tool_call_count)") + $content.Add("- terminal_reason: $($query.terminal_reason)") + $content.Add("- completeness: strict=$($query.strict_is_complete), inferred=$($query.inferred_is_complete)") + $content.Add("") + + $queryTurns = @($turnsByQuery[$query.query_id] | Sort-Object started_at_ms) + foreach ($turn in $queryTurns) 
{ + $toolKey = "$($turn.query_id)|$($turn.turn_id)" + $toolLabel = if ($toolsByTurnKey.ContainsKey($toolKey)) { Get-ToolLabel $toolsByTurnKey[$toolKey] } else { "none" } + $content.Add("- $($turn.turn_id): tools=$toolLabel, stop_reason=$($turn.stop_reason), transition_out=$($turn.transition_out), duration_ms=$($turn.duration_ms), strict_closed=$($turn.strict_is_closed)") + } + $content.Add("") +} + +$content.Add("## Branch Points") +$content.Add("") +if ($spawnSummary.Count -eq 0) { + $content.Add("- No subagent.spawned events were observed for this action.") + $content.Add("") +} else { + foreach ($spawn in $spawnSummary) { + $childQuery = $queries | Where-Object { $_.query_id -eq $spawn.QueryId } | Select-Object -First 1 + $parentTurn = Find-MainTurnForSpawn -SpawnAtMs $spawn.SpawnedAtMs -TurnRows $mainTurns + $parentText = if ($null -ne $parentTurn) { + "attached after main-thread $($parentTurn.turn_id) by time inference" + } else { + "no parent turn inferred" + } + $content.Add("- $(To-LocalDisplay $spawn.SpawnedAt): spawn $($spawn.SubagentReason), trigger_kind=$($spawn.SubagentTriggerKind), trigger_detail=$($spawn.SubagentTriggerDetail), child_query=$($childQuery.query_id), $parentText") + } + $content.Add("") +} + +$content.Add("## Reading SOP") +$content.Add("") +$content.Add("1. Find the target action in user_actions.") +$content.Add("2. Use queries to list all agents and branches under that action.") +$content.Add("3. Use turns to inspect loop count and turn termination.") +$content.Add("4. Use tools to inspect concrete tool calls per turn.") +$content.Add("5. Use events_raw for key events only: query.started, api.stream.completed, subagent.spawned, query.terminated.") +$content.Add("6. 
If you need content, follow snapshot refs into .observability/snapshots.") +$content.Add("") + +[System.IO.Directory]::CreateDirectory((Split-Path -Parent $OutputPath)) | Out-Null +$content | Set-Content -LiteralPath $OutputPath -Encoding utf8 + +Write-Output ("Generated report: {0}" -f $OutputPath) diff --git a/scripts/observability/open_duckdb.ps1 b/scripts/observability/open_duckdb.ps1 new file mode 100644 index 0000000000..ffbcff9ac2 --- /dev/null +++ b/scripts/observability/open_duckdb.ps1 @@ -0,0 +1,9 @@ +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$duckdbExe = Join-Path $repoRoot "tools\\duckdb\\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\\observability_v1.duckdb" + +if (-not (Test-Path -LiteralPath $duckdbExe)) { + throw "DuckDB executable not found at $duckdbExe" +} + +& $duckdbExe $dbPath @Args diff --git a/scripts/observability/read_timeline.ps1 b/scripts/observability/read_timeline.ps1 new file mode 100644 index 0000000000..02683c0aab --- /dev/null +++ b/scripts/observability/read_timeline.ps1 @@ -0,0 +1,103 @@ +param( + [string]$UserActionId, + [string]$QueryId, + [string]$SubagentId +) + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb" + +if (-not (Test-Path -LiteralPath $duckdbExe)) { + throw "DuckDB executable not found at $duckdbExe" +} + +if (-not (Test-Path -LiteralPath $dbPath)) { + throw "DuckDB database not found at $dbPath" +} + +$provided = @($UserActionId, $QueryId, $SubagentId | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }).Count +if ($provided -ne 1) { + throw "Pass exactly one of -UserActionId, -QueryId, or -SubagentId" +} + +$whereClause = if (-not [string]::IsNullOrWhiteSpace($UserActionId)) { + "user_action_id = '$UserActionId'" +} elseif (-not [string]::IsNullOrWhiteSpace($QueryId)) { + "coalesce(effective_query_id, query_id) = '$QueryId'" 
+} else { + "subagent_id = '$SubagentId'" +} + +$sql = @" +select + ts_wall, + event_name, + query_source, + coalesce(effective_query_id, query_id) as effective_query_id, + turn_id, + subagent_id, + tool_call_id, + payload_json +from events_raw +where $whereClause +order by ts_wall_ms asc, event_idx asc; +"@ + +$rows = (& $duckdbExe -json $dbPath $sql) | ConvertFrom-Json + +function Summarize-Payload { + param( + [string]$EventName, + [object]$PayloadText + ) + + if ([string]::IsNullOrWhiteSpace($PayloadText)) { + return "" + } + + $payload = $PayloadText | ConvertFrom-Json + switch ($EventName) { + "prompt.build.completed" { + return "model=$($payload.model), system_prompt_chars=$($payload.system_prompt_chars), messages_chars_total=$($payload.messages_chars_total), claude_md_chars=$($payload.claude_md_chars)" + } + "api.stream.completed" { + return "stop_reason=$($payload.stop_reason), assistant_message_count=$($payload.assistant_message_count), tool_use_count=$($payload.tool_use_count)" + } + "tool.execution.completed" { + return "tool_name=$($payload.tool_name), success=$($payload.success), duration_ms=$($payload.duration_ms)" + } + "tool.execution.failed" { + return "tool_name=$($payload.tool_name), duration_ms=$($payload.duration_ms), error=$($payload.error_name)" + } + "state.transitioned" { + return "to_transition=$($payload.to_transition), message_delta=$($payload.message_delta), token_before=$($payload.token_estimate_before), token_after=$($payload.token_estimate_after)" + } + "query.terminated" { + return "reason=$($payload.reason), final_message_count=$($payload.final_message_count)" + } + "subagent.spawned" { + return "fork_label=$($payload.fork_label), inherited_message_count=$($payload.inherited_message_count), transcript_enabled=$($payload.transcript_enabled)" + } + "subagent.completed" { + return "message_count=$($payload.message_count), transcript_enabled=$($payload.transcript_enabled)" + } + default { + $json = $PayloadText + if ($json.Length -gt 
140) { + return $json.Substring(0, 140) + "..." + } + return $json + } + } +} + +foreach ($row in @($rows)) { + $summary = Summarize-Payload -EventName $row.event_name -PayloadText $row.payload_json + $base = "{0} | {1} | query={2} | turn={3} | subagent={4} | tool={5}" -f $row.ts_wall, $row.event_name, $row.effective_query_id, $row.turn_id, $row.subagent_id, $row.tool_call_id + if ([string]::IsNullOrWhiteSpace($summary)) { + Write-Output $base + } else { + Write-Output "$base | $summary" + } +} diff --git a/scripts/observability/rebuild_observability_db.ps1 b/scripts/observability/rebuild_observability_db.ps1 new file mode 100644 index 0000000000..e94515beaa --- /dev/null +++ b/scripts/observability/rebuild_observability_db.ps1 @@ -0,0 +1,40 @@ +param( + [string]$Date, + [string]$EventsFile, + [switch]$Quiet +) + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$etlScript = Join-Path $repoRoot "scripts\observability\build_duckdb_etl.ts" +$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb" +$sqlPath = Join-Path $repoRoot ".observability\load_observability_v1.sql" + +if (-not (Test-Path -LiteralPath $duckdbExe)) { + throw "DuckDB executable not found at $duckdbExe" +} + +$etlArgs = @("run", $etlScript) +if (-not [string]::IsNullOrWhiteSpace($EventsFile)) { + $etlArgs += @("--events-file", $EventsFile) +} elseif (-not [string]::IsNullOrWhiteSpace($Date)) { + $etlArgs += @("--date", $Date) +} + +$etlOutput = & bun @etlArgs +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} + +if (-not $Quiet) { + Write-Output $etlOutput +} + +& $duckdbExe -batch $dbPath -f $sqlPath +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} + +if (-not $Quiet) { + & $duckdbExe -json $dbPath "select source_events_file_name, source_events_size_bytes, events_row_count, built_at from build_meta limit 1; select event_date, event_count, user_action_count, query_count, turn_count, tool_call_count, 
subagent_count, snapshot_ref_count from daily_rollups order by event_date desc limit 1;" +} diff --git a/scripts/observability/refresh_debug_view.ps1 b/scripts/observability/refresh_debug_view.ps1 new file mode 100644 index 0000000000..a18550b8c3 --- /dev/null +++ b/scripts/observability/refresh_debug_view.ps1 @@ -0,0 +1,54 @@ +param( + [string]$Date, + [string]$EventsFile, + [switch]$SummaryOnly +) + +[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$rebuildScript = Join-Path $repoRoot "scripts\observability\rebuild_observability_db.ps1" +$summaryScript = Join-Path $repoRoot "scripts\observability\daily_summary.ps1" +$dashboardScript = Join-Path $repoRoot "scripts\observability\build_dashboard.ps1" + +$commonArgs = @("-ExecutionPolicy", "Bypass") + +$rebuildArgs = @($commonArgs + @("-File", $rebuildScript)) +if (-not [string]::IsNullOrWhiteSpace($EventsFile)) { + $rebuildArgs += @("-EventsFile", $EventsFile) +} elseif (-not [string]::IsNullOrWhiteSpace($Date)) { + $rebuildArgs += @("-Date", $Date) +} + +& powershell @rebuildArgs +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} + +$summaryArgs = @($commonArgs + @("-File", $summaryScript, "-SkipRebuild")) +if (-not [string]::IsNullOrWhiteSpace($EventsFile)) { + $summaryArgs += @("-EventsFile", $EventsFile) +} elseif (-not [string]::IsNullOrWhiteSpace($Date)) { + $summaryArgs += @("-Date", $Date) +} + +& powershell @summaryArgs +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} + +if ($SummaryOnly) { + exit 0 +} + +$dashboardArgs = @($commonArgs + @("-File", $dashboardScript, "-SkipRebuild")) +if (-not [string]::IsNullOrWhiteSpace($EventsFile)) { + $dashboardArgs += @("-EventsFile", $EventsFile) +} elseif (-not [string]::IsNullOrWhiteSpace($Date)) { + $dashboardArgs += @("-Date", $Date) +} + +& powershell @dashboardArgs +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} diff --git a/scripts/observability/reset_observability_debug.ps1 
b/scripts/observability/reset_observability_debug.ps1 new file mode 100644 index 0000000000..c2849eee7f --- /dev/null +++ b/scripts/observability/reset_observability_debug.ps1 @@ -0,0 +1,46 @@ +param( + [switch]$KeepSnapshots +) + +[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$observabilityDir = Join-Path $repoRoot ".observability" +$snapshotsDir = Join-Path $observabilityDir "snapshots" + +if (-not (Test-Path -LiteralPath $observabilityDir)) { + throw "Observability directory not found at $observabilityDir" +} + +$eventFiles = @(Get-ChildItem -LiteralPath $observabilityDir -Filter "events-*.jsonl" -File -ErrorAction SilentlyContinue) +$dbFiles = @( + Join-Path $observabilityDir "observability_v1.duckdb" + Join-Path $observabilityDir "load_observability_v1.sql" +) | Where-Object { Test-Path -LiteralPath $_ } + +$snapshotFiles = @() +if ((-not $KeepSnapshots) -and (Test-Path -LiteralPath $snapshotsDir)) { + $snapshotFiles = @(Get-ChildItem -LiteralPath $snapshotsDir -File -Force -ErrorAction SilentlyContinue) +} + +foreach ($file in $eventFiles) { + Remove-Item -LiteralPath $file.FullName -Force +} + +foreach ($file in $dbFiles) { + Remove-Item -LiteralPath $file -Force +} + +foreach ($file in $snapshotFiles) { + Remove-Item -LiteralPath $file.FullName -Force +} + +if (-not (Test-Path -LiteralPath $snapshotsDir)) { + New-Item -ItemType Directory -Path $snapshotsDir | Out-Null +} + +Write-Output "已清空可观测调试数据:" +Write-Output " 删除事件文件: $($eventFiles.Count)" +Write-Output " 删除数据库/SQL 文件: $($dbFiles.Count)" +Write-Output " 删除 snapshots: $($snapshotFiles.Count)" +Write-Output " snapshots 目录保留: $snapshotsDir" diff --git a/scripts/observability/watch_latest_events.ps1 b/scripts/observability/watch_latest_events.ps1 new file mode 100644 index 0000000000..c322000e7c --- /dev/null +++ b/scripts/observability/watch_latest_events.ps1 @@ -0,0 +1,84 @@ +param( + [string]$Date, + [int]$Tail = 0 +) + 
+[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$observabilityDir = Join-Path $repoRoot ".observability" + +function Resolve-TargetEventsFile { + param( + [string]$ObservabilityDir, + [string]$RequestedDate + ) + + if (-not [string]::IsNullOrWhiteSpace($RequestedDate)) { + $normalizedDate = $RequestedDate -replace '-', '' + $candidate = Join-Path $ObservabilityDir "events-$normalizedDate.jsonl" + if (-not (Test-Path -LiteralPath $candidate)) { + throw "Requested events file not found for date $RequestedDate" + } + return $candidate + } + + while ($true) { + $files = Get-ChildItem -LiteralPath $ObservabilityDir -Filter "events-*.jsonl" -File -ErrorAction SilentlyContinue | + Where-Object { $_.Name -match '^events-\d{8}\.jsonl$' } | + Sort-Object Name + + if ($files.Count -gt 0) { + return ($files | Select-Object -Last 1).FullName + } + + Start-Sleep -Milliseconds 500 + } +} + +function Format-EventLine { + param( + [string]$Line + ) + + if ([string]::IsNullOrWhiteSpace($Line)) { + return $null + } + + try { + $event = $Line | ConvertFrom-Json + $parts = @( + $event.ts_wall + $event.event + "source=$($event.query_source)" + "action=$($event.user_action_id)" + "query=$($event.query_id)" + "turn=$($event.turn_id)" + "subagent=$($event.subagent_id)" + "reason=$($event.subagent_reason)" + "tool=$($event.tool_call_id)" + ) + return ($parts -join " | ") + } catch { + return $Line + } +} + +$targetFile = Resolve-TargetEventsFile -ObservabilityDir $observabilityDir -RequestedDate $Date +Write-Output "正在监听: $targetFile" + +if ($Tail -gt 0) { + Get-Content -LiteralPath $targetFile -Tail $Tail | ForEach-Object { + $formatted = Format-EventLine -Line $_ + if ($null -ne $formatted) { + Write-Output $formatted + } + } +} + +Get-Content -LiteralPath $targetFile -Wait | ForEach-Object { + $formatted = Format-EventLine -Line $_ + if ($null -ne $formatted) { + Write-Output $formatted + } +} diff --git 
a/src/QueryEngine.ts b/src/QueryEngine.ts index 78617af710..44054d3c7a 100644 --- a/src/QueryEngine.ts +++ b/src/QueryEngine.ts @@ -378,6 +378,7 @@ export class QueryEngine { theme: resolveThemeSetting(getGlobalConfig().theme), maxBudgetUsd, }, + userActionId: options?.uuid, getAppState, setAppState, abortController: this.abortController, @@ -526,6 +527,7 @@ export class QueryEngine { agentDefinitions: { activeAgents: agents, allAgents: [] }, maxBudgetUsd, }, + userActionId: options?.uuid, getAppState, setAppState, abortController: this.abortController, diff --git a/src/Tool.ts b/src/Tool.ts index dd99669831..fcd28723e3 100644 --- a/src/Tool.ts +++ b/src/Tool.ts @@ -244,6 +244,7 @@ export type ToolUseContext = { updater: (prev: AttributionState) => AttributionState, ) => void setConversationId?: (id: UUID) => void + userActionId?: string agentId?: AgentId // Only set for subagents; use getSessionId() for session ID. Hooks use this to distinguish subagent calls. agentType?: string // Subagent type name. For the main thread's --agent type, hooks fall back to getMainThreadAgentType(). /** When true, canUseTool must always be called even when hooks auto-approve. diff --git a/src/observability/harness.ts b/src/observability/harness.ts index d60e0dfe4f..62d27815e8 100644 --- a/src/observability/harness.ts +++ b/src/observability/harness.ts @@ -32,6 +32,9 @@ export type HarnessEventInput = { parent_turn_id?: string | null subagent_id?: string | null subagent_type?: string | null + subagent_reason?: string | null + subagent_trigger_kind?: string | null + subagent_trigger_detail?: string | null query_source?: string | null request_id?: string | null tool_call_id?: string | null @@ -142,6 +145,9 @@ export async function emitHarnessEvent( parent_turn_id: input.parent_turn_id ?? null, subagent_id: input.subagent_id ?? null, subagent_type: input.subagent_type ?? null, + subagent_reason: input.subagent_reason ?? null, + subagent_trigger_kind: input.subagent_trigger_kind ?? 
null, + subagent_trigger_detail: input.subagent_trigger_detail ?? null, query_source: input.query_source ?? null, request_id: input.request_id ?? null, tool_call_id: input.tool_call_id ?? null, diff --git a/src/query.ts b/src/query.ts index 065a3c5537..42806f957a 100644 --- a/src/query.ts +++ b/src/query.ts @@ -156,6 +156,53 @@ function* yieldMissingToolResultBlocks( } } +async function emitAbandonedToolUseEvents({ + assistantMessages, + toolUseContext, + queryId, + querySource, + turnId, + loopIter, + reason, +}: { + assistantMessages: AssistantMessage[] + toolUseContext: ToolUseContext + queryId: string + querySource: QuerySource + turnId: string + loopIter: number + reason: string +}): Promise { + for (const assistantMessage of assistantMessages) { + const toolUseBlocks = (Array.isArray(assistantMessage.message?.content) + ? assistantMessage.message.content + : [] + ).filter((content: { type: string }) => content.type === 'tool_use') as ToolUseBlock[] + + for (const toolUse of toolUseBlocks) { + await emitHarnessEvent({ + event: 'tool.execution.failed', + component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, + query_id: queryId, + turn_id: turnId, + loop_iter: loopIter, + query_source: querySource, + request_id: asOptionalString(assistantMessage.requestId), + tool_call_id: toolUse.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, + payload: { + tool_name: toolUse.name, + success: false, + error: reason, + duration_ms: 0, + }, + }) + } + } +} + /** * The rules of thinking are lengthy and fortuitous. They require plenty of thinking * of most long duration and deep meditation for a wizard to wrap one's noggin around. 
@@ -287,6 +334,7 @@ async function emitMessageStageEvent({ component, before, after, + userActionId, queryId, turnId, loopIter, @@ -297,6 +345,7 @@ async function emitMessageStageEvent({ component: string before: Message[] after: Message[] + userActionId?: string | null queryId: string turnId: string loopIter: number @@ -312,6 +361,7 @@ async function emitMessageStageEvent({ await emitHarnessEvent({ event, component, + user_action_id: userActionId ?? null, query_id: queryId, turn_id: turnId, loop_iter: loopIter, @@ -370,6 +420,7 @@ async function emitStateSnapshotEvent({ await emitHarnessEvent({ event, component: 'query_loop', + user_action_id: state.toolUseContext.userActionId ?? null, query_id: queryId, turn_id: turnId, loop_iter: loopIter, @@ -414,6 +465,7 @@ async function emitStateTransitionEvent({ await emitHarnessEvent({ event: 'state.transitioned', component: 'query_loop', + user_action_id: toState.toolUseContext.userActionId ?? null, query_id: queryId, turn_id: turnId, loop_iter: loopIter, @@ -589,6 +641,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'state.initialized', component: 'query_loop', + user_action_id: state.toolUseContext.userActionId ?? null, query_source: querySource, turn_id: 'turn-1', loop_iter: 1, @@ -612,6 +665,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'prefetch.memory.started', component: 'query_loop', + user_action_id: state.toolUseContext.userActionId ?? null, query_source: querySource, payload: { message_count: state.messages.length, @@ -622,20 +676,44 @@ async function* queryLoop( async function emitQueryTerminated( reason: string, extraPayload?: Record, + options?: { + finalMessages?: Message[] + }, ): Promise { + const terminalState: State = options?.finalMessages + ? { + ...state, + messages: options.finalMessages, + } + : state + const terminalQueryId = + terminalState.toolUseContext.queryTracking?.chainId ?? + state.toolUseContext.queryTracking?.chainId ?? 
+ 'unknown' + + await emitStateSnapshotEvent({ + event: 'state.snapshot.after_turn', + state: terminalState, + queryId: terminalQueryId, + turnId: `turn-${terminalState.turnCount}`, + loopIter: terminalState.turnCount, + querySource, + }) + await emitHarnessEvent({ event: 'query.terminated', component: 'query_loop', + user_action_id: terminalState.toolUseContext.userActionId ?? null, query_source: querySource, - query_id: state.toolUseContext.queryTracking?.chainId ?? null, - turn_id: `turn-${state.turnCount}`, - loop_iter: state.turnCount, - subagent_id: state.toolUseContext.agentId ?? null, - subagent_type: state.toolUseContext.agentType ?? null, + query_id: terminalState.toolUseContext.queryTracking?.chainId ?? null, + turn_id: `turn-${terminalState.turnCount}`, + loop_iter: terminalState.turnCount, + subagent_id: terminalState.toolUseContext.agentId ?? null, + subagent_type: terminalState.toolUseContext.agentType ?? null, payload: { reason, - final_message_count: state.messages.length, - transition: state.transition?.reason ?? null, + final_message_count: terminalState.messages.length, + transition: terminalState.transition?.reason ?? null, ...extraPayload, }, }) @@ -700,10 +778,11 @@ async function* queryLoop( ...toolUseContext, queryTracking, } - if (queryTracking.depth === 0) { + if (turnCount === 1) { await emitHarnessEvent({ event: 'query.started', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -721,6 +800,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'query_tracking.assigned', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -733,6 +813,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'turn.started', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? 
null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -758,6 +839,7 @@ async function* queryLoop( component: 'query_loop', before: messages, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -798,6 +880,7 @@ async function* queryLoop( component: 'query_loop', before: beforeToolResultBudget, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -823,6 +906,7 @@ async function* queryLoop( component: 'query_loop', before: beforeSnip, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -855,6 +939,7 @@ async function* queryLoop( component: 'query_loop', before: beforeMicrocompact, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -890,6 +975,7 @@ async function* queryLoop( component: 'query_loop', before: beforeCollapse, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -906,6 +992,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'messages.autoconpact.checked', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -933,6 +1020,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'messages.autoconpact.completed', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1025,6 +1113,7 @@ async function* queryLoop( component: 'query_loop', before: messages, after: messagesForQuery, + userActionId: toolUseContext.userActionId ?? 
null, queryId: queryTracking.chainId, turnId, loopIter: turnCount, @@ -1161,6 +1250,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'prompt.build.started', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1183,6 +1273,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'prompt.snapshot.stored', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1195,6 +1286,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'prompt.build.completed', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1243,6 +1335,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'api.request.started', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1324,6 +1417,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'api.stream.first_chunk', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1381,6 +1475,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'assistant.block.received', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1394,12 +1489,15 @@ async function* queryLoop( await emitHarnessEvent({ event: 'assistant.tool_use.detected', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? 
null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, query_source: querySource, request_id: asOptionalString(assistantMsg.requestId), tool_call_id: block.id, + subagent_id: toolUseContext.agentId ?? null, + subagent_type: toolUseContext.agentType ?? null, payload: { tool_name: block.name, }, @@ -1534,6 +1632,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'api.stream.completed', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -1585,6 +1684,15 @@ async function* queryLoop( assistantMessages, 'Model fallback triggered', ) + await emitAbandonedToolUseEvents({ + assistantMessages, + toolUseContext, + queryId: queryTracking.chainId, + querySource, + turnId, + loopIter: turnCount, + reason: 'model_fallback_triggered', + }) assistantMessages.length = 0 toolResults.length = 0 toolUseBlocks.length = 0 @@ -1660,6 +1768,8 @@ async function* queryLoop( }) await emitQueryTerminated('image_error', { error_message: error.message, + }, { + finalMessages: [...messagesForQuery, ...assistantMessages], }) return { reason: 'image_error' } } @@ -1669,6 +1779,15 @@ async function* queryLoop( // due to a bug, we may end up in a state where we have already emitted // a tool_use block but will stop before emitting the tool_result. yield* yieldMissingToolResultBlocks(assistantMessages, errorMessage) + await emitAbandonedToolUseEvents({ + assistantMessages, + toolUseContext, + queryId: queryTracking.chainId, + querySource, + turnId, + loopIter: turnCount, + reason: 'query_error_before_tool_execution', + }) // Surface the real error instead of a misleading "[Request interrupted // by user]" — this path is a model/runtime failure, not a user action. 
@@ -1680,7 +1799,9 @@ async function* queryLoop( // To help track down bugs, log loudly for ants logAntError('Query error', error) - await emitQueryTerminated('model_error', { error_message: errorMessage }) + await emitQueryTerminated('model_error', { error_message: errorMessage }, { + finalMessages: [...messagesForQuery, ...assistantMessages], + }) return { reason: 'model_error', error } } @@ -1714,6 +1835,15 @@ async function* queryLoop( assistantMessages, 'Interrupted by user', ) + await emitAbandonedToolUseEvents({ + assistantMessages, + toolUseContext, + queryId: queryTracking.chainId, + querySource, + turnId, + loopIter: turnCount, + reason: 'interrupted_before_tool_execution', + }) } // chicago MCP: auto-unhide + lock release on interrupt. Same cleanup // as the natural turn-end path in stopHooks.ts. Main thread only — @@ -1736,7 +1866,9 @@ async function* queryLoop( toolUse: false, }) } - await emitQueryTerminated('aborted_streaming') + await emitQueryTerminated('aborted_streaming', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages], + }) return { reason: 'aborted_streaming' } } @@ -1895,6 +2027,10 @@ async function* queryLoop( void executeStopFailureHooks(lastMessage!, toolUseContext) await emitQueryTerminated( isWithheldMedia ? 'image_error' : 'prompt_too_long', + undefined, + { + finalMessages: [...messagesForQuery, ...assistantMessages], + }, ) return { reason: isWithheldMedia ? 'image_error' : 'prompt_too_long' } } else if (feature('CONTEXT_COLLAPSE') && isWithheld413) { @@ -1903,7 +2039,9 @@ async function* queryLoop( // early-return rationale — don't fall through to stop hooks. 
yield lastMessage void executeStopFailureHooks(lastMessage, toolUseContext) - await emitQueryTerminated('prompt_too_long') + await emitQueryTerminated('prompt_too_long', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages], + }) return { reason: 'prompt_too_long' } } @@ -2020,6 +2158,8 @@ async function* queryLoop( void executeStopFailureHooks(lastMessage, toolUseContext) await emitQueryTerminated('completed', { last_message_api_error: true, + }, { + finalMessages: [...messagesForQuery, ...assistantMessages], }) return { reason: 'completed' } } @@ -2036,7 +2176,9 @@ async function* queryLoop( ) if (stopHookResult.preventContinuation) { - await emitQueryTerminated('stop_hook_prevented') + await emitQueryTerminated('stop_hook_prevented', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages], + }) return { reason: 'stop_hook_prevented' } } @@ -2092,6 +2234,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'token_budget.decision', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -2163,7 +2306,9 @@ async function* queryLoop( } } - await emitQueryTerminated('completed') + await emitQueryTerminated('completed', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages], + }) return { reason: 'completed' } } @@ -2189,6 +2334,7 @@ async function* queryLoop( await emitHarnessEvent({ event: 'tool.execution.mode.selected', component: 'query_loop', + user_action_id: toolUseContext.userActionId ?? 
null, query_id: queryTracking.chainId, turn_id: turnId, loop_iter: turnCount, @@ -2334,13 +2480,17 @@ async function* queryLoop( turnCount: nextTurnCountOnAbort, }) } - await emitQueryTerminated('aborted_tools') + await emitQueryTerminated('aborted_tools', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages, ...toolResults], + }) return { reason: 'aborted_tools' } } // If a hook indicated to prevent continuation, stop here if (shouldPreventContinuation) { - await emitQueryTerminated('hook_stopped') + await emitQueryTerminated('hook_stopped', undefined, { + finalMessages: [...messagesForQuery, ...assistantMessages, ...toolResults], + }) return { reason: 'hook_stopped' } } @@ -2534,6 +2684,8 @@ async function* queryLoop( }) await emitQueryTerminated('max_turns', { turn_count: nextTurnCount, + }, { + finalMessages: [...messagesForQuery, ...assistantMessages, ...toolResults], }) return { reason: 'max_turns', turnCount: nextTurnCount } } diff --git a/src/query/stopHooks.ts b/src/query/stopHooks.ts index d3ea92f41f..8fa0542e54 100644 --- a/src/query/stopHooks.ts +++ b/src/query/stopHooks.ts @@ -84,6 +84,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.started', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, @@ -348,6 +349,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.completed', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, @@ -367,6 +369,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.completed', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? 
null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, @@ -494,6 +497,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.completed', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, @@ -512,6 +516,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.completed', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, @@ -533,6 +538,7 @@ export async function* handleStopHooks( await emitHarnessEvent({ event: 'stop_hooks.completed', component: 'stop_hooks', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: toolUseContext.agentId ?? null, diff --git a/src/screens/REPL.tsx b/src/screens/REPL.tsx index d47a4507e0..2c78440fe2 100644 --- a/src/screens/REPL.tsx +++ b/src/screens/REPL.tsx @@ -3185,6 +3185,7 @@ export function REPL({ additionalAllowedTools: string[], mainLoopModelParam: string, effort?: EffortValue, + userActionId?: UUID, ) => { // Prepare IDE integration for new prompt. Read mcpClients fresh from // store — useManageMCPConnections may have populated it since the @@ -3293,6 +3294,9 @@ export function REPL({ abortController, mainLoopModelParam, ); + toolUseContext.userActionId = + userActionId ?? + newMessages.find((message): message is UserMessage => message.type === 'user')?.uuid; // getToolUseContext reads tools/mcpClients fresh from store.getState() // (via computeTools/mergeClients). 
Use those rather than the closure- // captured `tools`/`mcpClients` — useManageMCPConnections may have @@ -3469,6 +3473,7 @@ export function REPL({ onBeforeQueryCallback?: (input: string, newMessages: MessageType[]) => Promise, input?: string, effort?: EffortValue, + userActionId?: UUID, ): Promise => { // If this is a teammate, mark them as active when starting a turn if (isAgentSwarmsEnabled()) { @@ -3546,6 +3551,7 @@ export function REPL({ additionalAllowedTools, mainLoopModelParam, effort, + userActionId, ); } catch (error) { if (feature('UDS_INBOX')) { diff --git a/src/services/AgentSummary/agentSummary.ts b/src/services/AgentSummary/agentSummary.ts index 50146b3c79..6f5176be22 100644 --- a/src/services/AgentSummary/agentSummary.ts +++ b/src/services/AgentSummary/agentSummary.ts @@ -120,6 +120,13 @@ export function startAgentSummarization( canUseTool, querySource: 'agent_summary', forkLabel: 'agent_summary', + subagentReason: 'agent_summary', + subagentTriggerKind: 'periodic_timer', + subagentTriggerDetail: 'summary_interval_elapsed', + subagentTriggerPayload: { + summary_interval_ms: SUMMARY_INTERVAL_MS, + transcript_message_count: cleanMessages.length, + }, overrides: { abortController: summaryAbortController }, skipTranscript: true, }) diff --git a/src/services/PromptSuggestion/promptSuggestion.ts b/src/services/PromptSuggestion/promptSuggestion.ts index c54df2a40d..894685e511 100644 --- a/src/services/PromptSuggestion/promptSuggestion.ts +++ b/src/services/PromptSuggestion/promptSuggestion.ts @@ -167,6 +167,17 @@ export async function tryGenerateSuggestion( abortController, promptId, cacheSafeParams, + { + kind: source === 'cli' ? 'stop_hook_background' : 'direct_feature_entry', + detail: + source === 'cli' + ? 'suggestion_generation_allowed' + : 'suggestion_generation_direct', + payload: { + source: source ?? 
'unknown', + assistant_turn_count: assistantTurnCount, + }, + }, ) if (abortController.signal.aborted) { logSuggestionSuppressed('aborted', undefined, undefined, source) @@ -295,6 +306,11 @@ export async function generateSuggestion( abortController: AbortController, promptId: PromptVariant, cacheSafeParams: CacheSafeParams, + triggerInfo?: { + kind?: string + detail?: string + payload?: Record + }, ): Promise<{ suggestion: string | null; generationRequestId: string | null }> { const prompt = SUGGESTION_PROMPTS[promptId] @@ -322,6 +338,10 @@ export async function generateSuggestion( canUseTool, querySource: 'prompt_suggestion', forkLabel: 'prompt_suggestion', + subagentReason: 'prompt_suggestion', + subagentTriggerKind: triggerInfo?.kind ?? undefined, + subagentTriggerDetail: triggerInfo?.detail ?? undefined, + subagentTriggerPayload: triggerInfo?.payload, overrides: { abortController, }, diff --git a/src/services/PromptSuggestion/speculation.ts b/src/services/PromptSuggestion/speculation.ts index 9835d4d860..577d1835b2 100644 --- a/src/services/PromptSuggestion/speculation.ts +++ b/src/services/PromptSuggestion/speculation.ts @@ -376,6 +376,13 @@ async function generatePipelinedSuggestion( pipelineAbortController, promptId, createCacheSafeParams(augmentedContext), + { + kind: 'internal_pipeline', + detail: 'pipelined_suggestion_generation', + payload: { + speculative_message_count: speculatedMessages.length, + }, + }, ) if (pipelineAbortController.signal.aborted) return @@ -632,6 +639,15 @@ export async function startSpeculation( }, querySource: 'speculation', forkLabel: 'speculation', + subagentReason: 'speculation', + subagentTriggerKind: 'internal_pipeline', + subagentTriggerDetail: isPipelined + ? 
'accepted_pipelined_prompt_suggestion' + : 'accepted_prompt_suggestion', + subagentTriggerPayload: { + suggestion_length: suggestionText.length, + is_pipelined: isPipelined, + }, maxTurns: MAX_SPECULATION_TURNS, overrides: { abortController, requireCanUseTool: true }, onMessage: msg => { diff --git a/src/services/SessionMemory/sessionMemory.ts b/src/services/SessionMemory/sessionMemory.ts index 2df75aa0d9..dd7c7ce850 100644 --- a/src/services/SessionMemory/sessionMemory.ts +++ b/src/services/SessionMemory/sessionMemory.ts @@ -133,12 +133,35 @@ function countToolCallsSince( } export function shouldExtractMemory(messages: Message[]): boolean { + return evaluateSessionMemoryTrigger(messages).shouldExtract +} + +function evaluateSessionMemoryTrigger(messages: Message[]): { + shouldExtract: boolean + detail: + | 'token_threshold_and_tool_threshold' + | 'token_threshold_and_natural_break' + | null + payload: Record +} { // Check if we've met the initialization threshold // Uses total context window tokens (same as autocompact) for consistent behavior const currentTokenCount = tokenCountWithEstimation(messages) + const initializationThresholdMet = hasMetInitializationThreshold(currentTokenCount) if (!isSessionMemoryInitialized()) { - if (!hasMetInitializationThreshold(currentTokenCount)) { - return false + if (!initializationThresholdMet) { + return { + shouldExtract: false, + detail: null, + payload: { + current_token_count: currentTokenCount, + has_met_initialization_threshold: false, + has_met_update_threshold: false, + tool_calls_since_last_update: 0, + tool_call_threshold: getToolCallsBetweenUpdates(), + has_tool_calls_in_last_turn: hasToolCallsInLastAssistantTurn(messages), + }, + } } markSessionMemoryInitialized() } @@ -170,15 +193,47 @@ export function shouldExtractMemory(messages: Message[]): boolean { (hasMetTokenThreshold && hasMetToolCallThreshold) || (hasMetTokenThreshold && !hasToolCallsInLastTurn) + let detail: + | 'token_threshold_and_tool_threshold' + | 
'token_threshold_and_natural_break' + | null = null + if (hasMetTokenThreshold && hasMetToolCallThreshold) { + detail = 'token_threshold_and_tool_threshold' + } else if (hasMetTokenThreshold && !hasToolCallsInLastTurn) { + detail = 'token_threshold_and_natural_break' + } + if (shouldExtract) { const lastMessage = messages[messages.length - 1] if (lastMessage?.uuid) { lastMemoryMessageUuid = lastMessage.uuid } - return true + return { + shouldExtract: true, + detail, + payload: { + current_token_count: currentTokenCount, + has_met_initialization_threshold: true, + has_met_update_threshold: hasMetTokenThreshold, + tool_calls_since_last_update: toolCallsSinceLastUpdate, + tool_call_threshold: getToolCallsBetweenUpdates(), + has_tool_calls_in_last_turn: hasToolCallsInLastTurn, + }, + } } - return false + return { + shouldExtract: false, + detail, + payload: { + current_token_count: currentTokenCount, + has_met_initialization_threshold: true, + has_met_update_threshold: hasMetTokenThreshold, + tool_calls_since_last_update: toolCallsSinceLastUpdate, + tool_call_threshold: getToolCallsBetweenUpdates(), + has_tool_calls_in_last_turn: hasToolCallsInLastTurn, + }, + } } async function setupSessionMemoryFile( @@ -300,7 +355,8 @@ const extractSessionMemory = sequential(async function ( // Initialize config from remote (lazy, only once) initSessionMemoryConfigIfNeeded() - if (!shouldExtractMemory(messages)) { + const triggerInfo = evaluateSessionMemoryTrigger(messages) + if (!triggerInfo.shouldExtract) { return } @@ -328,6 +384,10 @@ const extractSessionMemory = sequential(async function ( canUseTool: createMemoryFileCanUseTool(memoryPath), querySource: 'session_memory', forkLabel: 'session_memory', + subagentReason: 'session_memory', + subagentTriggerKind: 'post_sampling_hook', + subagentTriggerDetail: triggerInfo.detail ?? 
undefined, + subagentTriggerPayload: triggerInfo.payload, overrides: { readFileState: setupContext.readFileState }, }) @@ -436,6 +496,12 @@ export async function manuallyExtractSessionMemory( canUseTool: createMemoryFileCanUseTool(memoryPath), querySource: 'session_memory', forkLabel: 'session_memory_manual', + subagentReason: 'session_memory', + subagentTriggerKind: 'manual_command', + subagentTriggerDetail: 'manual_session_memory_extraction', + subagentTriggerPayload: { + message_count: messages.length, + }, overrides: { readFileState: setupContext.readFileState }, }) diff --git a/src/services/autoDream/autoDream.ts b/src/services/autoDream/autoDream.ts index d87b34f31c..60a208bf0e 100644 --- a/src/services/autoDream/autoDream.ts +++ b/src/services/autoDream/autoDream.ts @@ -228,6 +228,12 @@ ${sessionIds.map(id => `- ${id}`).join('\n')}` canUseTool: createAutoMemCanUseTool(memoryRoot), querySource: 'auto_dream', forkLabel: 'auto_dream', + subagentReason: 'auto_dream', + subagentTriggerKind: 'stop_hook_background', + subagentTriggerDetail: 'dream_consolidation_run', + subagentTriggerPayload: { + sessions_reviewing: sessionIds.length, + }, skipTranscript: true, overrides: { abortController }, onMessage: makeDreamProgressWatcher(taskId, setAppState), diff --git a/src/services/compact/compact.ts b/src/services/compact/compact.ts index f46194ffbd..f5e56c4f12 100644 --- a/src/services/compact/compact.ts +++ b/src/services/compact/compact.ts @@ -1195,6 +1195,14 @@ async function streamCompactSummary({ canUseTool: createCompactCanUseTool(), querySource: 'compact', forkLabel: 'compact', + subagentReason: 'compact', + subagentTriggerKind: 'compaction_flow', + subagentTriggerDetail: 'prompt_cache_sharing_compact', + subagentTriggerPayload: { + prompt_cache_sharing_enabled: promptCacheSharingEnabled, + max_turns: 1, + skip_cache_write: true, + }, maxTurns: 1, skipCacheWrite: true, // Pass the compact context's abortController so user Esc aborts the diff --git 
a/src/services/extractMemories/extractMemories.ts b/src/services/extractMemories/extractMemories.ts index bb2ae11034..d7d29e6306 100644 --- a/src/services/extractMemories/extractMemories.ts +++ b/src/services/extractMemories/extractMemories.ts @@ -418,6 +418,17 @@ export function initExtractMemories(): void { canUseTool, querySource: 'extract_memories', forkLabel: 'extract_memories', + subagentReason: 'extract_memories', + subagentTriggerKind: 'stop_hook_background', + subagentTriggerDetail: isTrailingRun + ? 'coalesced_trailing_run' + : 'post_turn_background_extraction', + subagentTriggerPayload: { + feature_gate_enabled: true, + auto_memory_enabled: true, + remote_mode: false, + trailing_run: Boolean(isTrailingRun), + }, // The extractMemories subagent does not need to record to transcript. // Doing so can create race conditions with the main thread. skipTranscript: true, diff --git a/src/services/tools/StreamingToolExecutor.ts b/src/services/tools/StreamingToolExecutor.ts index b924fdd917..08ebb7e0c5 100644 --- a/src/services/tools/StreamingToolExecutor.ts +++ b/src/services/tools/StreamingToolExecutor.ts @@ -5,6 +5,7 @@ import { withMemoryCorrectionHint, } from 'src/utils/messages.js' import type { CanUseToolFn } from '../../hooks/useCanUseTool.js' +import { emitHarnessEvent } from '../../observability/harness.js' import { findToolByName, type Tools, type ToolUseContext } from '../../Tool.js' import { BASH_TOOL_NAME } from '@claude-code-best/builtin-tools/tools/BashTool/toolName.js' import type { AssistantMessage, Message } from '../../types/message.js' @@ -213,6 +214,31 @@ export class StreamingToolExecutor { }) } + private async emitSyntheticFailureEvent( + tool: TrackedTool, + reason: 'sibling_error' | 'user_interrupted' | 'streaming_fallback', + ): Promise { + await emitHarnessEvent({ + event: 'tool.execution.failed', + component: 'streaming_tool_executor', + user_action_id: this.toolUseContext.userActionId ?? 
null, + query_id: this.toolUseContext.queryTracking?.chainId ?? null, + request_id: + typeof tool.assistantMessage.requestId === 'string' + ? tool.assistantMessage.requestId + : null, + tool_call_id: tool.id, + subagent_id: this.toolUseContext.agentId ?? null, + subagent_type: this.toolUseContext.agentType ?? null, + payload: { + tool_name: tool.block.name, + success: false, + error: reason, + duration_ms: 0, + }, + }) + } + /** * Determine why a tool should be cancelled. */ @@ -286,6 +312,7 @@ export class StreamingToolExecutor { // If already aborted (by error or user), generate synthetic error block instead of running the tool const initialAbortReason = this.getAbortReason(tool) if (initialAbortReason) { + await this.emitSyntheticFailureEvent(tool, initialAbortReason) messages.push( this.createSyntheticErrorMessage( tool.id, @@ -343,6 +370,7 @@ export class StreamingToolExecutor { // Only add the synthetic error if THIS tool didn't produce the error. const abortReason = this.getAbortReason(tool) if (abortReason && !thisToolErrored) { + await this.emitSyntheticFailureEvent(tool, abortReason) messages.push( this.createSyntheticErrorMessage( tool.id, diff --git a/src/services/tools/toolExecution.ts b/src/services/tools/toolExecution.ts index 536a896576..4c0f3987b1 100644 --- a/src/services/tools/toolExecution.ts +++ b/src/services/tools/toolExecution.ts @@ -373,6 +373,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.execution.failed', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? null, tool_call_id: toolUse.id, @@ -433,6 +434,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.enqueued', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? 
null, tool_call_id: toolUse.id, @@ -446,6 +448,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.execution.started', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? null, tool_call_id: toolUse.id, @@ -496,6 +499,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.execution.failed', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? null, tool_call_id: toolUse.id, @@ -528,6 +532,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.execution.completed', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? null, tool_call_id: toolUse.id, @@ -562,6 +567,7 @@ export async function* runToolUse( await emitHarnessEvent({ event: 'tool.execution.failed', component: 'tool_execution', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, request_id: requestId ?? null, tool_call_id: toolUse.id, diff --git a/src/services/tools/toolOrchestration.ts b/src/services/tools/toolOrchestration.ts index 3315a32efb..e49ff58a59 100644 --- a/src/services/tools/toolOrchestration.ts +++ b/src/services/tools/toolOrchestration.ts @@ -27,6 +27,7 @@ export async function* runTools( await emitHarnessEvent({ event: 'tool.batch.started', component: 'tool_orchestration', + user_action_id: toolUseContext.userActionId ?? null, query_id: toolUseContext.queryTracking?.chainId ?? null, subagent_id: toolUseContext.agentId ?? null, subagent_type: toolUseContext.agentType ?? 
null, @@ -55,6 +56,7 @@ export async function* runTools( await emitHarnessEvent({ event: 'tool.execution.mode.selected', component: 'tool_orchestration', + user_action_id: currentContext.userActionId ?? null, query_id: currentContext.queryTracking?.chainId ?? null, subagent_id: currentContext.agentId ?? null, subagent_type: currentContext.agentType ?? null, @@ -102,6 +104,7 @@ export async function* runTools( await emitHarnessEvent({ event: 'tool.context.updated', component: 'tool_orchestration', + user_action_id: currentContext.userActionId ?? null, query_id: currentContext.queryTracking?.chainId ?? null, subagent_id: currentContext.agentId ?? null, subagent_type: currentContext.agentType ?? null, @@ -132,6 +135,7 @@ export async function* runTools( await emitHarnessEvent({ event: 'tool.context.updated', component: 'tool_orchestration', + user_action_id: currentContext.userActionId ?? null, query_id: currentContext.queryTracking?.chainId ?? null, subagent_id: currentContext.agentId ?? null, subagent_type: currentContext.agentType ?? null, diff --git a/src/utils/forkedAgent.ts b/src/utils/forkedAgent.ts index af648ca097..e1c6e6a2cc 100644 --- a/src/utils/forkedAgent.ts +++ b/src/utils/forkedAgent.ts @@ -92,6 +92,14 @@ export type ForkedAgentParams = { querySource: QuerySource /** Label for analytics (e.g., 'session_memory', 'supervisor') */ forkLabel: string + /** Stable business reason for spawning this subagent. */ + subagentReason?: string + /** High-level mechanism that triggered this subagent spawn. */ + subagentTriggerKind?: string + /** Concrete branch detail under the trigger mechanism. */ + subagentTriggerDetail?: string + /** Structured trigger evidence captured at the callsite. 
*/ + subagentTriggerPayload?: Record /** Optional overrides for the subagent context (e.g., readFileState from setup phase) */ overrides?: SubagentContextOverrides /** @@ -445,6 +453,7 @@ export function createSubagentContext( // Fields that can be overridden or copied from parent options: overrides?.options ?? parentContext.options, messages: overrides?.messages ?? parentContext.messages, + userActionId: parentContext.userActionId, // Generate new agentId for subagents (each subagent should have its own ID) agentId: overrides?.agentId ?? createAgentId(), agentType: overrides?.agentType, @@ -493,6 +502,10 @@ export async function runForkedAgent({ canUseTool, querySource, forkLabel, + subagentReason, + subagentTriggerKind, + subagentTriggerDetail, + subagentTriggerPayload, overrides, maxOutputTokens, maxTurns, @@ -503,13 +516,25 @@ export async function runForkedAgent({ const startTime = Date.now() const outputMessages: Message[] = [] let totalUsage: NonNullableUsage = { ...EMPTY_USAGE } + const resolvedSubagentReason = + subagentReason ?? + forkLabel ?? + (typeof querySource === 'string' && querySource.length > 0 + ? querySource + : 'unknown') await emitHarnessEvent({ event: 'subagent.spawn.requested', component: 'forked_agent', + user_action_id: cacheSafeParams.toolUseContext.userActionId ?? null, query_source: querySource, subagent_type: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_kind: subagentTriggerKind ?? null, + subagent_trigger_detail: subagentTriggerDetail ?? null, payload: { fork_label: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_payload: subagentTriggerPayload ?? null, prompt_message_count: promptMessages.length, skip_transcript: skipTranscript ?? false, max_turns: maxTurns ?? null, @@ -542,12 +567,18 @@ export async function runForkedAgent({ await emitHarnessEvent({ event: 'subagent.spawned', component: 'forked_agent', + user_action_id: isolatedToolUseContext.userActionId ?? 
null, query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: isolatedToolUseContext.agentId ?? agentId ?? null, subagent_type: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_kind: subagentTriggerKind ?? null, + subagent_trigger_detail: subagentTriggerDetail ?? null, payload: { fork_label: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_payload: subagentTriggerPayload ?? null, inherited_message_count: forkContextMessages.length, prompt_message_count: promptMessages.length, transcript_enabled: Boolean(agentId), @@ -603,10 +634,14 @@ export async function runForkedAgent({ await emitHarnessEvent({ event: 'subagent.message.received', component: 'forked_agent', + user_action_id: isolatedToolUseContext.userActionId ?? null, query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: isolatedToolUseContext.agentId ?? agentId ?? null, subagent_type: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_kind: subagentTriggerKind ?? null, + subagent_trigger_detail: subagentTriggerDetail ?? null, payload: { message_type: (message as Message).type, }, @@ -659,12 +694,18 @@ export async function runForkedAgent({ await emitHarnessEvent({ event: 'subagent.completed', component: 'forked_agent', + user_action_id: isolatedToolUseContext.userActionId ?? null, query_id: isolatedToolUseContext.queryTracking?.chainId ?? null, query_source: querySource, subagent_id: isolatedToolUseContext.agentId ?? agentId ?? null, subagent_type: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_kind: subagentTriggerKind ?? null, + subagent_trigger_detail: subagentTriggerDetail ?? null, payload: { fork_label: forkLabel, + subagent_reason: resolvedSubagentReason, + subagent_trigger_payload: subagentTriggerPayload ?? 
null, duration_ms: durationMs, message_count: outputMessages.length, input_tokens: totalUsage.input_tokens, diff --git a/src/utils/handlePromptSubmit.ts b/src/utils/handlePromptSubmit.ts index 97b05758f1..356e500983 100644 --- a/src/utils/handlePromptSubmit.ts +++ b/src/utils/handlePromptSubmit.ts @@ -75,6 +75,7 @@ type BaseExecutionParams = { onBeforeQuery?: (input: string, newMessages: Message[]) => Promise, input?: string, effort?: EffortValue, + userActionId?: UUID, ) => Promise setAppState: (updater: (prev: AppState) => AppState) => void onBeforeQuery?: (input: string, newMessages: Message[]) => Promise @@ -585,6 +586,7 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { shouldCallBeforeQuery ? onBeforeQuery : undefined, primaryInput, effort, + primaryCmd?.uuid, ) } else { // Local slash commands that skip messages (e.g., /model, /theme). diff --git a/src/utils/sideQuestion.ts b/src/utils/sideQuestion.ts index 8058dc51fb..107fa907cd 100644 --- a/src/utils/sideQuestion.ts +++ b/src/utils/sideQuestion.ts @@ -90,6 +90,14 @@ ${question}` }), querySource: 'side_question', forkLabel: 'side_question', + subagentReason: 'side_query', + subagentTriggerKind: 'explicit_user_command', + subagentTriggerDetail: 'btw_command', + subagentTriggerPayload: { + command: '/btw', + max_turns: 1, + tools_allowed: false, + }, maxTurns: 1, // Single turn only - no tool use loops // No future request shares this suffix; skip writing cache entries. 
skipCacheWrite: true, From 84dfc6a6d5166ebab655786b945a5beb84681625 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Thu, 23 Apr 2026 19:46:45 +0800 Subject: [PATCH 05/26] Clarify observability v1 setup requirements --- README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/README.md b/README.md index 7046d9b44d..4640162a95 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,70 @@ TUI (REPL) 模式需要真实终端,无法直接通过 VS Code launch 启动 以前更像“先跑程序,再回头看零散日志”。 现在推荐直接按下面这套观测驱动流程运行: +#### 观测系统 V1 环境要求 + +- 操作系统:当前脚本默认按 **Windows + PowerShell** 编写 +- 运行时:需要 **Bun**,用于执行 `scripts/observability/build_duckdb_etl.ts` +- DuckDB:**不需要单独安装** + - 仓库已经自带 [tools/duckdb/duckdb.exe](./tools/duckdb/duckdb.exe) + - 数据库文件默认落在 `E:\claude-code\.observability\observability_v1.duckdb` +- 目录要求: + - 需要有 `.observability/events-YYYYMMDD.jsonl` + - 这些事件文件会在你实际运行 debug 版本并产生真实动作后自动生成 + +如果用户只是把仓库拉下来,想运行 V1 观测系统,最少需要先完成: + +```bash +bun install +``` + +然后至少真实运行过一次程序,产生日志后,再执行观测脚本。 + +#### 是否需要自己安装 DuckDB? + +不需要。 + +这套 V1 观测系统的设计就是“仓库内自带 DuckDB 可执行文件 + 本地 PowerShell 脚本 + 本地 `.observability` 数据目录”。 +所以用户不需要额外装 Python 版 DuckDB,也不需要自己配置 DuckDB PATH。 + +#### 一次最小可运行流程 + +如果用户是第一次拉代码,推荐按这个顺序: + +1. 安装依赖 + +```bash +bun install +``` + +2. 启动 debug 版本 + +```bash +bun run dev +``` + +3. 在 REPL 里真实发送至少一条 query + 这一步的目的,是生成 `.observability/events-YYYYMMDD.jsonl` + +4. 重建观测数据库 + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\rebuild_observability_db.ps1 +``` + +5. 生成最近一次动作的分析报告 + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\explain_action.ps1 -Latest +``` + +6. 如需总览,再运行: + +```powershell +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\daily_summary.ps1 +powershell -ExecutionPolicy Bypass -File E:\claude-code\scripts\observability\build_dashboard.ps1 +``` + 1. 
启动 debug 版本 ```bash From 25f8d060a50fa4ccc694bf855c2e75db5c578869 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Fri, 24 Apr 2026 00:04:42 +0800 Subject: [PATCH 06/26] Refine observability dashboard health view --- .../observability_dashboard.html" | 264 ++++++++---------- scripts/observability/build_dashboard.ps1 | 135 +++++++-- scripts/observability/watch_dashboard.ps1 | 76 +++++ 3 files changed, 311 insertions(+), 164 deletions(-) create mode 100644 scripts/observability/watch_dashboard.ps1 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" index 75b6e59f30..3259130f6b 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/01-\346\200\273\350\247\210/observability_dashboard.html" @@ -4,6 +4,7 @@ 本地可观测系统 V1 Dashboard + + + +
+

$escapedTitle

+
+ diagram: $Diagram
+ generated_at: $generatedAt
+ source_report: $escapedReportPath +
+
+
+
+
+$escapedMermaid
+      
+
+

如果页面没有渲染成图,通常是浏览器无法加载 Mermaid CDN;此时仍可复制源报告中的 Mermaid 代码到 Mermaid Live Editor。

+
+ + + +"@ + +[System.IO.Directory]::CreateDirectory((Split-Path -Parent $OutputPath)) | Out-Null +$html | Set-Content -LiteralPath $OutputPath -Encoding UTF8 + +Write-Output ("Generated flowchart: {0}" -f $OutputPath) +Write-Output ("Source report: {0}" -f $reportPath) + +if ($Open) { + Start-Process -FilePath $OutputPath +} diff --git a/src/observability/v2/evalExperimentTypes.ts b/src/observability/v2/evalExperimentTypes.ts new file mode 100644 index 0000000000..867fb5a9b5 --- /dev/null +++ b/src/observability/v2/evalExperimentTypes.ts @@ -0,0 +1,63 @@ +import type { EvalExperiment, EvalScoreDimension } from './evalTypes' + +export type EvalScoreDirection = + | 'higher_is_better' + | 'lower_is_better' + | 'boolean_pass' + | 'observed_only' + +export type EvalAutomationLevel = 'automatic' | 'manual_review' | 'mixed' + +export interface EvalScoreSpecThresholds { + hard_fail_regression_pct?: number + soft_warn_regression_pct?: number + max_allowed_value?: number + min_allowed_value?: number +} + +export interface EvalScoreSpec { + score_spec_id: string + dimension: EvalScoreDimension + subdimension: string + direction: EvalScoreDirection + formula: string + data_sources: string[] + evidence_requirements: string[] + automation_level: EvalAutomationLevel + thresholds?: EvalScoreSpecThresholds + version: string + notes?: string +} + +export interface EvalScoreSpecCollection { + score_specs: EvalScoreSpec[] +} + +export interface EvalGatePolicyRule { + score_spec_id: string + rule_type: 'hard_fail' | 'soft_warning' + condition: string + threshold?: number + notes?: string +} + +export interface EvalGatePolicy { + gate_policy_id: string + name: string + rules: EvalGatePolicyRule[] +} + +export interface EvalExperimentActionBinding { + scenario_id: string + baseline_user_action_id: string + candidate_user_action_ids: Record +} + +export interface EvalExperimentV21 extends EvalExperiment { + scenario_ids?: string[] + repeat_count?: number + score_spec_ids?: string[] + 
gate_policy_id?: string + mode?: 'bind_existing' | 'execute_harness' + action_bindings?: EvalExperimentActionBinding[] +} diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index 5073b32481..961d3bafc3 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -38,6 +38,9 @@ export interface EvalScenario { expected_tools: string[] expected_skills: string[] expected_constraints: string[] + max_turn_count?: number + max_total_billed_tokens?: number + max_subagent_count?: number owner: string status: 'draft' | 'ready' | 'archived' } diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index 9765601119..e045b4b1c8 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -10,28 +10,55 @@ Structure: - variant manifests - `experiments/` - experiment manifests +- `score-specs/` + - score definitions: dimension, formula, direction, evidence requirements +- `gates/` + - regression gate policies +- `experiment-runs/` + - generated experiment-level summaries - `scores/` - optional manual review or exported score artifacts - `runs/` - generated run records that bind V2 evaluation to V1 evidence -Recommended usage order: +Recommended V2.1 usage order: -1. Start from `scenarios/_scenario.template.json` -2. Define `variants/_variant.template.json` -3. Compose an experiment from `experiments/_experiment.template.json` -4. Bind every future run back to V1 evidence using: - - `entry_user_action_id` - - `root_query_id` - - `observability_db_ref` +1. Pick or create a `scenario` under `scenarios/`. +2. Define the baseline and candidate `variant` manifests under `variants/`. +3. Produce real V1 traces first. Current V2.1 is `bind_existing`, so you must already have one baseline `user_action_id` and one candidate `user_action_id`. +4. 
Create or edit an experiment manifest under `experiments/`, including: + - `scenario_ids` + - `baseline_variant_id` + - `candidate_variant_ids` + - `mode: "bind_existing"` + - `action_bindings` +5. Validate all manifests. +6. Run the experiment runner. +7. Read the generated run, score, comparison, gate, and experiment summary artifacts. -Phase-one run recording: +Validate manifests: + +```powershell +bun run scripts/evals/v2_validate_manifests.ts +``` + +Run the current sample V2.1 experiment: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default +``` + +Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. Instead, it binds existing V1 `user_action_id` traces into V2 runs, records scores, compares baseline vs candidate, applies the configured gate policy, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. + +Lower-level commands are still available when you want to debug one step at a time. 
+ +Record one run manually: ```powershell -bun run scripts/evals/v2_record_run.ts --scenario tool_choice_sensitive --variant baseline_default --latest --snapshot-db +bun run scripts/evals/v2_record_run.ts --scenario tool_choice_sensitive --variant baseline_default --user-action-id --snapshot-db ``` -Phase-one run comparison: +Compare two recorded runs manually: ```powershell bun run scripts/evals/v2_compare_runs.ts --baseline-run --candidate-run @@ -48,9 +75,3 @@ Compare the latest baseline/candidate runs for one scenario: ```powershell bun run scripts/evals/v2_compare_scenario.ts --scenario tool_choice_sensitive --candidate candidate_tool_router_v2 ``` - -Validate manifests: - -```powershell -bun run scripts/evals/v2_validate_manifests.ts -``` diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json new file mode 100644 index 0000000000..ff6e644d21 --- /dev/null +++ b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json @@ -0,0 +1,100 @@ +{ + "experiment": { + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session Memory Sparse vs Default", + "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_first_batch", + "scenario_ids": [ + "cost_sensitive_task" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + 
"candidate_user_action_ids": { + "candidate_session_memory_sparse": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" + } + } + ], + "status": "ready" + }, + "results": [ + { + "scenario_id": "cost_sensitive_task", + "repeat_index": 1, + "baseline_run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "candidate_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "gate_results": [ + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "passed": true, + "baseline_value": 4, + "candidate_value": 2, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ] + } + ] + } + ], + "created_at": "2026-04-27T10:55:24.753Z" +} diff --git a/tests/evals/v2/experiments/_experiment.v2_1.template.json b/tests/evals/v2/experiments/_experiment.v2_1.template.json new file mode 100644 index 0000000000..7466d37801 --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.v2_1.template.json @@ -0,0 +1,29 @@ +{ + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session Memory Sparse vs Default", + "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_first_batch", + "scenario_ids": ["cost_sensitive_task"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "baseline_user_action_id": "REPLACE_WITH_BASELINE_USER_ACTION_ID", + "candidate_user_action_ids": { + "candidate_session_memory_sparse": "REPLACE_WITH_CANDIDATE_USER_ACTION_ID" + } + } + ], + "status": "draft" 
+} diff --git a/tests/evals/v2/experiments/session_memory_sparse_vs_default.json b/tests/evals/v2/experiments/session_memory_sparse_vs_default.json new file mode 100644 index 0000000000..aacfa56208 --- /dev/null +++ b/tests/evals/v2/experiments/session_memory_sparse_vs_default.json @@ -0,0 +1,29 @@ +{ + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session Memory Sparse vs Default", + "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_first_batch", + "scenario_ids": ["cost_sensitive_task"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "candidate_user_action_ids": { + "candidate_session_memory_sparse": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" + } + } + ], + "status": "ready" +} diff --git a/tests/evals/v2/gates/default_v2_1_gate.json b/tests/evals/v2/gates/default_v2_1_gate.json new file mode 100644 index 0000000000..21d3215b9a --- /dev/null +++ b/tests/evals/v2/gates/default_v2_1_gate.json @@ -0,0 +1,31 @@ +{ + "gate_policy_id": "default_v2_1_gate", + "name": "Default V2.1 Regression Gate", + "rules": [ + { + "score_spec_id": "task_success.main_chain_observed", + "rule_type": "hard_fail", + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "score_spec_id": "efficiency.total_billed_tokens", + "rule_type": "hard_fail", + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "threshold": 30, + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "score_spec_id": "efficiency.total_billed_tokens", + "rule_type": "soft_warning", + "condition": "candidate_regression_pct > 10", + "threshold": 10 + }, + { + "score_spec_id": "decision_quality.subagent_count_observed", + "rule_type": "soft_warning", + "condition": "candidate_regression_pct > 50", + "threshold": 50 + } + ] +} diff --git a/tests/evals/v2/runs/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.json b/tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json similarity index 86% rename from tests/evals/v2/runs/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.json rename to tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json index 847853f3a8..fdf2429a94 100644 --- a/tests/evals/v2/runs/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.json +++ b/tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json @@ -1,6 +1,6 @@ { "run": { - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "scenario_id": "cost_sensitive_task", "variant_id": "baseline_default", "started_at": "2026-04-24T04:48:30.824Z", @@ -14,18 +14,28 @@ "scenario": { "scenario_id": "cost_sensitive_task", "name": "Cost Sensitive Task", - "description": "Catalog scenario: Cost Sensitive Task", - "input_prompt": "", + "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", + "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", 
"tags": [ "efficiency", - "tradeoff" + "tradeoff", + "observability-v2" ], "expected_artifacts": [], - "expected_tools": [], + "expected_tools": [ + "Read" + ], "expected_skills": [], - "expected_constraints": [], + "expected_constraints": [ + "Must not modify files", + "Should avoid unnecessary background subagent expansion", + "Should keep the main query within a small number of turns" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, "owner": "local", - "status": "draft" + "status": "ready" }, "variant": { "variant_id": "baseline_default", @@ -129,6 +139,13 @@ "subagent_count": 1, "avg_duration_ms": 33043 }, + { + "subagent_reason": "prompt_suggestion", + "subagent_trigger_kind": "stop_hook_background", + "subagent_trigger_detail": "suggestion_generation_allowed", + "subagent_count": 1, + "avg_duration_ms": 8029 + }, { "subagent_reason": "extract_memories", "subagent_trigger_kind": "stop_hook_background", @@ -142,13 +159,6 @@ "subagent_trigger_detail": "token_threshold_and_natural_break", "subagent_count": 1, "avg_duration_ms": 40480 - }, - { - "subagent_reason": "prompt_suggestion", - "subagent_trigger_kind": "stop_hook_background", - "subagent_trigger_detail": "suggestion_generation_allowed", - "subagent_count": 1, - "avg_duration_ms": 8029 } ], "recoveries": [] diff --git a/tests/evals/v2/runs/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json b/tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json similarity index 85% rename from tests/evals/v2/runs/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json rename to tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json index 56676f9a9a..693d3d8faa 100644 --- a/tests/evals/v2/runs/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json +++ 
b/tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json @@ -1,6 +1,6 @@ { "run": { - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "scenario_id": "cost_sensitive_task", "variant_id": "candidate_session_memory_sparse", "started_at": "2026-04-24T04:55:36.952Z", @@ -14,18 +14,28 @@ "scenario": { "scenario_id": "cost_sensitive_task", "name": "Cost Sensitive Task", - "description": "Catalog scenario: Cost Sensitive Task", - "input_prompt": "", + "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", + "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", "tags": [ "efficiency", - "tradeoff" + "tradeoff", + "observability-v2" ], "expected_artifacts": [], - "expected_tools": [], + "expected_tools": [ + "Read" + ], "expected_skills": [], - "expected_constraints": [], + "expected_constraints": [ + "Must not modify files", + "Should avoid unnecessary background subagent expansion", + "Should keep the main query within a small number of turns" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, "owner": "local", - "status": "draft" + "status": "ready" }, "variant": { "variant_id": "candidate_session_memory_sparse", diff --git a/tests/evals/v2/scenarios/_scenario.template.json b/tests/evals/v2/scenarios/_scenario.template.json index e455d40c9b..2ba46abbee 100644 --- a/tests/evals/v2/scenarios/_scenario.template.json +++ b/tests/evals/v2/scenarios/_scenario.template.json @@ -8,6 +8,9 @@ "expected_tools": ["Read"], "expected_skills": [], "expected_constraints": ["Must not modify unrelated files"], + "max_turn_count": 8, + "max_total_billed_tokens": 250000, + "max_subagent_count": 3, "owner": "owner_name", "status": "draft" } diff 
--git a/tests/evals/v2/scenarios/cost_sensitive_task.json b/tests/evals/v2/scenarios/cost_sensitive_task.json new file mode 100644 index 0000000000..802850a6ef --- /dev/null +++ b/tests/evals/v2/scenarios/cost_sensitive_task.json @@ -0,0 +1,20 @@ +{ + "scenario_id": "cost_sensitive_task", + "name": "Cost Sensitive Task", + "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", + "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", + "tags": ["efficiency", "tradeoff", "observability-v2"], + "expected_artifacts": [], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should avoid unnecessary background subagent expansion", + "Should keep the main query within a small number of turns" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/tool_choice_sensitive.json b/tests/evals/v2/scenarios/tool_choice_sensitive.json new file mode 100644 index 0000000000..3993ac754b --- /dev/null +++ b/tests/evals/v2/scenarios/tool_choice_sensitive.json @@ -0,0 +1,20 @@ +{ + "scenario_id": "tool_choice_sensitive", + "name": "Tool Choice Sensitive", + "description": "Evaluate whether the agent selects lightweight file-reading and search tools rather than unnecessary write or shell actions.", + "input_prompt": "请定位 V2 评测系统中定义 scenario、variant、run 的代码位置,并说明这些对象之间的关系。不要修改文件。", + "tags": ["decision_quality", "tool_selection", "observability-v2"], + "expected_artifacts": [], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should prefer read/search style inspection", + "Should avoid Edit or Write for this read-only task" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, + "owner": "local", + "status": "ready" 
+} diff --git a/tests/evals/v2/score-specs/_score_spec.template.json b/tests/evals/v2/score-specs/_score_spec.template.json new file mode 100644 index 0000000000..c9db2aa6ea --- /dev/null +++ b/tests/evals/v2/score-specs/_score_spec.template.json @@ -0,0 +1,16 @@ +{ + "score_spec_id": "dimension.subdimension", + "dimension": "efficiency", + "subdimension": "subdimension", + "direction": "lower_is_better", + "formula": "Describe how this score is computed from V1/V2 evidence.", + "data_sources": ["V1 user_actions"], + "evidence_requirements": ["entry_user_action_id"], + "automation_level": "automatic", + "thresholds": { + "hard_fail_regression_pct": 30, + "soft_warn_regression_pct": 10 + }, + "version": "v2.1", + "notes": "Template for one score spec. Production files usually wrap specs in { \"score_specs\": [...] }." +} diff --git a/tests/evals/v2/score-specs/default-v2-1.score-specs.json b/tests/evals/v2/score-specs/default-v2-1.score-specs.json new file mode 100644 index 0000000000..ca24dac920 --- /dev/null +++ b/tests/evals/v2/score-specs/default-v2-1.score-specs.json @@ -0,0 +1,66 @@ +{ + "score_specs": [ + { + "score_spec_id": "task_success.main_chain_observed", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "direction": "higher_is_better", + "formula": "1 if a main_thread root query exists for run.entry_user_action_id else 0", + "data_sources": ["V1 queries", "V2 run"], + "evidence_requirements": ["entry_user_action_id", "root_query_id"], + "automation_level": "automatic", + "version": "v2.1" + }, + { + "score_spec_id": "efficiency.total_billed_tokens", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "direction": "lower_is_better", + "formula": "user_actions.total_billed_tokens for run.entry_user_action_id", + "data_sources": ["V1 user_actions"], + "evidence_requirements": ["entry_user_action_id", "total_billed_tokens"], + "automation_level": "automatic", + "thresholds": { + "hard_fail_regression_pct": 30, + 
"soft_warn_regression_pct": 10 + }, + "version": "v2.1" + }, + { + "score_spec_id": "decision_quality.subagent_count_observed", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "direction": "lower_is_better", + "formula": "count(subagents) for run.entry_user_action_id", + "data_sources": ["V1 subagents"], + "evidence_requirements": ["entry_user_action_id", "subagents"], + "automation_level": "automatic", + "thresholds": { + "soft_warn_regression_pct": 50 + }, + "version": "v2.1" + }, + { + "score_spec_id": "stability.recovery_absence", + "dimension": "stability", + "subdimension": "recovery_absence", + "direction": "higher_is_better", + "formula": "1 if no recovery event exists for run.entry_user_action_id else 0", + "data_sources": ["V1 recoveries"], + "evidence_requirements": ["entry_user_action_id", "recoveries"], + "automation_level": "automatic", + "version": "v2.1" + }, + { + "score_spec_id": "controllability.turn_limit_basic", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "direction": "higher_is_better", + "formula": "1 if root_query.turn_count <= scenario.max_turn_count or default limit 8 else 0", + "data_sources": ["V1 queries", "V2 scenario"], + "evidence_requirements": ["root_query_id", "turn_count"], + "automation_level": "automatic", + "version": "v2.1" + } + ] +} diff --git a/tests/evals/v2/scores/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json b/tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json similarity index 52% rename from tests/evals/v2/scores/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json rename to tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json index 04cbfda3ec..a18e4b1728 100644 --- a/tests/evals/v2/scores/run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json +++ 
b/tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json @@ -1,7 +1,7 @@ [ { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "task_success", "subdimension": "main_chain_observed", "score_value": 1, @@ -10,18 +10,18 @@ "reason": "Main-thread root query is present in V1 evidence." }, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_expected_tool_hit_rate", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_expected_tool_hit_rate", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "decision_quality", "subdimension": "expected_tool_hit_rate", - "score_value": null, - "score_label": "not_applicable", + "score_value": 1, + "score_label": "pass", "evidence_ref": "tools", - "reason": "Scenario has no expected_tools yet." + "reason": "Observed 4 tool names against 1 expected tools." 
}, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "efficiency", "subdimension": "total_billed_tokens", "score_value": 400399, @@ -30,8 +30,18 @@ "reason": "Raw efficiency fact from V1 user_actions." }, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_v1_closure_health", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_token_budget", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "efficiency", + "subdimension": "total_billed_token_budget", + "score_value": 0, + "score_label": "fail", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "total_billed_tokens=400399; budget=260000." + }, + { + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_v1_closure_health", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "stability", "subdimension": "v1_closure_health", "score_value": 1, @@ -40,8 +50,8 @@ "reason": "Average of query, turn, tool, and subagent closure rates for the action date." 
}, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "stability", "subdimension": "recovery_absence", "score_value": 1, @@ -50,23 +60,33 @@ "reason": "No recovery events were observed for this action." }, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "controllability", "subdimension": "turn_limit_basic", "score_value": 1, "score_label": "pass", "evidence_ref": "queries.turn_count", - "reason": "Root query turn_count=4; phase-one soft limit is 8." + "reason": "Root query turn_count=4; scenario limit is 8." }, { - "score_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-24T050004856Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "decision_quality", "subdimension": "subagent_count_observed", "score_value": 4, "score_label": "observed", "evidence_ref": "subagents", "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_subagent_count_budget", + "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "controllability", + "subdimension": "subagent_count_budget", + "score_value": 0, + "score_label": "fail", + "evidence_ref": "subagents", + "reason": "subagent_count=4; budget=3." } ] diff --git a/tests/evals/v2/scores/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json b/tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json similarity index 53% rename from tests/evals/v2/scores/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json rename to tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json index 95d4b11b5b..da6285230f 100644 --- a/tests/evals/v2/scores/run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json +++ b/tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json @@ -1,7 +1,7 @@ [ { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "task_success", "subdimension": "main_chain_observed", "score_value": 1, @@ -10,18 +10,18 @@ "reason": "Main-thread root query is present in V1 evidence." 
}, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_expected_tool_hit_rate", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_expected_tool_hit_rate", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "decision_quality", "subdimension": "expected_tool_hit_rate", - "score_value": null, - "score_label": "not_applicable", + "score_value": 1, + "score_label": "pass", "evidence_ref": "tools", - "reason": "Scenario has no expected_tools yet." + "reason": "Observed 3 tool names against 1 expected tools." }, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "efficiency", "subdimension": "total_billed_tokens", "score_value": 352691, @@ -30,8 +30,18 @@ "reason": "Raw efficiency fact from V1 user_actions." 
}, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_v1_closure_health", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_token_budget", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "efficiency", + "subdimension": "total_billed_token_budget", + "score_value": 0, + "score_label": "fail", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "total_billed_tokens=352691; budget=260000." + }, + { + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_v1_closure_health", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "stability", "subdimension": "v1_closure_health", "score_value": 1, @@ -40,8 +50,8 @@ "reason": "Average of query, turn, tool, and subagent closure rates for the action date." }, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "stability", "subdimension": "recovery_absence", "score_value": 1, @@ -50,23 +60,33 @@ "reason": "No recovery events were observed for this action." 
}, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "controllability", "subdimension": "turn_limit_basic", "score_value": 1, "score_label": "pass", "evidence_ref": "queries.turn_count", - "reason": "Root query turn_count=4; phase-one soft limit is 8." + "reason": "Root query turn_count=4; scenario limit is 8." }, { - "score_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-24T050007063Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "decision_quality", "subdimension": "subagent_count_observed", "score_value": 2, "score_label": "observed", "evidence_ref": "subagents", "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_subagent_count_budget", + "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "controllability", + "subdimension": "subagent_count_budget", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "subagents", + "reason": "subagent_count=2; budget=3." 
} ] From 14329d2431be37b32d7d5f852d55bcb18b7d378f Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Wed, 29 Apr 2026 00:56:11 +0800 Subject: [PATCH 10/26] Add observability v2.1 bind runner --- ...\212\250\345\256\236\347\216\260runner.md" | 823 ++++++++++++++++++ .../README.md" | 19 +- ...ndidate_session_memory_sparse_dbf9fae1.md" | 33 + ...parse_vs_default_2026-04-28T162912802Z.md" | 41 + ...nsitive_task_baseline_default_1d5eb5e1.md" | 55 ++ ...ndidate_session_memory_sparse_dbf9fae1.md" | 52 ++ scripts/evals/v2_record_run.ts | 45 +- scripts/evals/v2_run_experiment.ts | 121 ++- scripts/evals/v2_validate_manifests.ts | 268 +++++- src/observability/v2/evalExperimentTypes.ts | 18 +- src/observability/v2/evalTypes.ts | 13 + tests/evals/v2/README.md | 29 +- ...arse_vs_default_2026-04-28T162912802Z.json | 114 +++ .../_experiment.v2_1.template.json | 11 +- .../session_memory_sparse_vs_default.json | 11 +- ...sitive_task_baseline_default_1d5eb5e1.json | 182 ++++ ...didate_session_memory_sparse_dbf9fae1.json | 163 ++++ ...task_baseline_default_1d5eb5e1.scores.json | 52 ++ ...session_memory_sparse_dbf9fae1.scores.json | 52 ++ 19 files changed, 2041 insertions(+), 61 deletions(-) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" create mode 100644 tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json create mode 100644 tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json create mode 100644 tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json create mode 100644 tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json create mode 100644 tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" new file mode 100644 index 0000000000..da65abf739 --- /dev/null +++ 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" @@ -0,0 +1,823 @@ +# V2.1 最小任务书 + +## 任务名称 + +# 可观测系统 V2.1:自动实验 Runner 最小闭环 + +--- + +## 1. 背景 + +当前 V2 已经具备: + +* V2 北极星和评测模型草案 +* V2 第一阶段实施任务书 +* V2 数据模型定稿 +* 第一批 scenario 候选集 +* Variant 组织规范 +* 一次手动生成的 baseline/candidate compare report + +当前系统已经能通过手动方式完成: + +```text +手动运行 scenario +→ 获得 V1 user_action_id +→ 记录 V2 run +→ 生成 score +→ 比较 baseline/candidate +``` + +但它还没有完全形成: + +```text +experiment manifest +→ 自动跑 baseline/candidate +→ 自动绑定 V1 证据 +→ 自动评分 +→ 自动生成 report +→ 自动给 gate verdict +``` + +因此,本轮 V2.1 的目标是把 V2 从**手动绑定式评测**推进到**自动实验 Runner 最小闭环**。 + +--- + +## 2. 本轮目标 + +实现一个本地优先的 V2 experiment runner,使系统能够: + +1. 读取一个 experiment manifest +2. 加载 scenario set +3. 加载 baseline variant 和 candidate variant +4. 针对每个 `scenario × variant` 生成 run +5. 将 run 与 V1 观测证据绑定 +6. 调用 scorer 生成 score +7. 调用 reporter 生成 baseline/candidate compare report +8. 调用 gate 输出 pass / warn / fail 结论 + +--- + +## 3. 
本轮不做 + +本轮明确不做: + +* 不做远端平台化 +* 不做复杂前端 dashboard +* 不做全自动模型裁判 +* 不做长上下文专项 benchmark +* 不做 tool / skill 专项价值评测 +* 不做鲁棒性 repeat=10 的完整实现 +* 不重写 V1 观测系统 +* 不新增大量 V1 埋点 +* 不引入推断补链作为评分事实来源 + +--- + +# 三、核心设计原则 + +## 3.1 Fact-only evidence + +正式评分必须基于 V1 可追溯事实。 + +每个 run 必须能绑定: + +* `entry_user_action_id` +* `root_query_id` +* `observability_db_ref` +* 可选:`events_file_ref` +* 可选:`snapshot_bundle_ref` +* 可选:`dag_ref` + +如果无法绑定 V1 事实证据,则该 run 不能进入正式 score / compare / gate。 + +--- + +## 3.2 两阶段 Runner + +Runner 分成两个模式。 + +### 模式 A:`bind_existing` + +含义: + +```text +不自动执行 harness,只把已有 user_action_id 绑定成 V2 run。 +``` + +用途: + +* 复用你已经手动跑出来的 baseline/candidate +* 快速形成 experiment-level 自动闭环 +* 避免在 headless execution 入口还不明确时硬猜 + +### 模式 B:`execute_harness` + +含义: + +```text +自动应用 variant,自动执行 scenario prompt,自动捕获 user_action_id。 +``` + +用途: + +* 真正进入一键自动化评测 +* 后续支持 repeat run、长上下文评测、tool/skill 专项评测 + +本轮优先实现: + +```text +bind_existing + execute_harness scaffold +``` + +不强行一次完成完整 `execute_harness`。 + +--- + +## 3.3 Variant-first + +V2.1 继续遵守 variant-first: + +* harness 改动 +* skill 改动 +* tool 改动 +* model / 配置改动 + +都通过 variant 表达,而不是分别做四套评测系统。V2 第一阶段任务书已经明确:`variant = 一套 agent system 配置快照`,这是统一承载不同改动层的核心抽象。 + +--- + +## 3.4 ScoreSpec-first + +V2.1 不能让 score 变成脚本里写死的临时逻辑。 + +每个 score 必须有: + +* `score_spec_id` +* `dimension` +* `subdimension` +* `direction` +* `formula` +* `data_sources` +* `evidence_requirements` +* `thresholds` +* `version` + +这样后续 score 规则变了,历史实验也能解释。 + +--- + +# 四、需要新增或完善的对象 + +## 4.1 Experiment Manifest + +当前 experiment 对象已有基础字段: + +* `experiment_id` +* `name` +* `goal` +* `baseline_variant_id` +* `candidate_variant_ids` +* `scenario_set_id` +* `status`。 + +V2.1 建议扩展成: + +```json +{ + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session memory sparse policy vs default", + "goal": "评估稀疏 session_memory 策略是否降低成本且不降低成功率", + "mode": "bind_existing", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": 
["candidate_session_memory_sparse"], + "scenario_ids": ["cost_sensitive_task"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "REPLACE_WITH_BASELINE_USER_ACTION_ID" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "REPLACE_WITH_CANDIDATE_USER_ACTION_ID" + } + ] +} +``` + +--- + +## 4.2 ScoreSpec + +新增目录建议: + +```text +tests/evals/v2/score-specs/ +``` + +第一批 score specs: + +1. `task_success.main_chain_observed` +2. `efficiency.total_billed_tokens` +3. `decision_quality.subagent_count_observed` +4. `stability.recovery_absence` +5. `controllability.turn_limit_basic` + +示例: + +```json +{ + "score_spec_id": "efficiency.total_billed_tokens", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "direction": "lower_is_better", + "formula": "V1 user_action.total_billed_tokens", + "data_sources": ["v1.user_actions"], + "evidence_requirements": ["entry_user_action_id", "observability_db_ref"], + "automation_level": "automatic", + "thresholds": { + "soft_warn_regression_pct": 10, + "hard_fail_regression_pct": 30 + }, + "version": "v2.1" +} +``` + +--- + +## 4.3 GatePolicy + +新增目录建议: + +```text +tests/evals/v2/gates/ +``` + +示例: + +```json +{ + "gate_policy_id": "default_v2_1_gate", + "name": "Default V2.1 regression gate", + "hard_fail_rules": [ + { + "score_spec_id": "task_success.main_chain_observed", + "condition": "candidate < baseline" + }, + { + "score_spec_id": "efficiency.total_billed_tokens", + "condition": "candidate > baseline * 1.30 AND task_success not improved" + } + ], + "soft_warning_rules": [ + { + "score_spec_id": 
"efficiency.total_billed_tokens", + "condition": "candidate > baseline * 1.10" + }, + { + "score_spec_id": "decision_quality.subagent_count_observed", + "condition": "candidate > baseline" + } + ] +} +``` + +--- + +## 4.4 Run Binding Metadata + +Run 现有字段已经包括: + +* `entry_user_action_id` +* `root_query_id` +* `observability_db_ref`。 + +V2.1 建议补充或在 notes/binding metadata 中记录: + +```json +{ + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "...", + "root_query_id": "...", + "observability_db_ref": "...", + "events_file_ref": "...", + "snapshot_bundle_ref": "...", + "bind_passed": true, + "binding_failure_reason": null + } +} +``` + +--- + +# 五、Runner 职责 + +## 5.1 Runner 不负责“判断好坏” + +Runner 不做主观判断。 + +Runner 只负责流程编排: + +1. 读取 experiment manifest +2. 校验 scenario / variant / score spec / gate 是否存在 +3. 根据 mode 决定执行方式 +4. 创建 run +5. 绑定 V1 evidence +6. 调用 scorer +7. 调用 reporter +8. 调用 gate + +--- + +## 5.2 `bind_existing` 模式流程 + +```text +读取 experiment +→ 读取 action_bindings +→ 对每条 binding: + scenario_id + variant_id + entry_user_action_id +→ 校验 V1 中是否存在该 user_action_id +→ 调用现有 record_run 能力生成 run +→ 调用 scorer 生成 score +→ 所有 run 完成后生成 compare report +→ 运行 gate +``` + +### 验收标准 + +* 不需要自动执行 harness +* 只要用户提供 baseline/candidate 的 user_action_id,就能自动生成 experiment-level report + +--- + +## 5.3 `execute_harness` 模式流程 + +```text +读取 experiment +→ 读取 scenario prompt +→ 应用 baseline variant +→ 执行 scenario +→ 捕获新产生的 user_action_id +→ 记录 baseline run +→ 应用 candidate variant +→ 执行 scenario +→ 捕获新产生的 user_action_id +→ 记录 candidate run +→ 打分 +→ 对比 +→ gate +``` + +### 本轮要求 + +本轮只需要: + +* 定义接口 +* 做 scaffold +* 明确阻塞点 + +如果当前仓库没有稳定 headless harness 入口,不要硬写假实现。 + +--- + +# 六、Scorer 职责 + +Scorer 负责: + +1. 读取 run +2. 读取 score spec +3. 读取 V1 evidence +4. 计算 score +5. 保存 score +6. 写入 `evidence_ref` +7. 写入 `reason` + +## 第一批 scorer + +### 1. 
`task_success.main_chain_observed` + +含义: + +```text +该 run 是否有可观测到的主链 root query +``` + +来源: + +* V1 action/query evidence + +方向: + +```text +higher_is_better +``` + +--- + +### 2. `efficiency.total_billed_tokens` + +含义: + +```text +该 run 对应 user_action 的总 token 成本 +``` + +来源: + +* V1 user action cost metrics + +方向: + +```text +lower_is_better +``` + +--- + +### 3. `decision_quality.subagent_count_observed` + +含义: + +```text +该 run 观察到的 subagent 数量 +``` + +来源: + +* V1 subagent evidence + +方向: + +```text +lower_is_better / contextual +``` + +注意: + +这个指标不能单独判断“好坏”。 +只有在任务成功不下降时,subagent 数下降才通常是好事。 + +--- + +### 4. `stability.recovery_absence` + +含义: + +```text +该 run 是否没有进入 recovery +``` + +来源: + +* V1 recovery events + +方向: + +```text +higher_is_better +``` + +--- + +### 5. `controllability.turn_limit_basic` + +含义: + +```text +该 run 的 turn 数是否不超过基础限制 +``` + +来源: + +* V1 query/turn evidence + +方向: + +```text +higher_is_better +``` + +--- + +# 七、Reporter 职责 + +Reporter 负责把 score 变成可读结论。 + +每条 score 输出: + +* baseline value +* candidate value +* delta +* direction +* verdict + +verdict 可取: + +* `improved` +* `regressed` +* `unchanged` +* `missing` +* `inconclusive` + +## Tradeoff 说明 + +报告必须明确: + +* 更便宜且成功率不降 +* 更贵但没有更好 +* 更快但质量下降 +* subagent 更少但结果不变 +* 成本下降但 stability 下降 + +不能只堆表格。 + +--- + +# 八、Gate 职责 + +Gate 负责给出是否可接受的判断。 + +## 第一版 Gate 输出 + +```json +{ + "gate_policy_id": "default_v2_1_gate", + "verdict": "pass", + "hard_fail_count": 0, + "soft_warning_count": 1, + "reasons": [] +} +``` + +## Gate 规则 + +第一版只做简单规则: + +### Hard Fail + +* task_success 从 1 变 0 +* recovery_absence 从 1 变 0 +* total_billed_tokens 上升超过 30%,且 task_success 没有提升 + +### Soft Warning + +* total_billed_tokens 上升超过 10% +* subagent_count_observed 上升 +* turn_limit_basic 从 1 变 0 + +--- + +# 九、实施 Phase + +## Phase 0:Reality Check + +先不要改代码。检查当前仓库: + +1. 现有 V2 目录结构 +2. 现有 evalTypes +3. 现有 scenario / variant / experiment manifest +4. 现有 record_run / compare_run / compare_scenario 脚本 +5. 
现有 V1 metrics 读取入口 +6. 当前是否存在自动 harness execution 入口 + +输出: + +```text +当前能力清单 +缺口清单 +本轮应实现 bind_existing 还是 execute_harness +``` + +--- + +## Phase 1:ScoreSpec / GatePolicy 落地 + +交付: + +```text +tests/evals/v2/score-specs/default-v2-1.score-specs.json +tests/evals/v2/gates/default_v2_1_gate.json +``` + +验收: + +* score spec 可被读取 +* gate policy 可被读取 +* manifest 校验能发现缺失 score spec / gate policy + +--- + +## Phase 2:Experiment Manifest v2.1 + +交付: + +```text +tests/evals/v2/experiments/_experiment.v2_1.template.json +``` + +验收: + +* 支持 `mode` +* 支持 `score_spec_ids` +* 支持 `gate_policy_id` +* 支持 `action_bindings` +* 支持 `repeat_count` + +--- + +## Phase 3:Runner bind_existing + +交付: + +```text +scripts/evals/v2_run_experiment.ts +``` + +功能: + +* 读取 experiment +* 校验 scenario / variant / action binding +* 调用或复用 `v2_record_run` +* 生成 run +* 调用或复用 compare +* 生成 experiment summary + +验收: + +* 用两个已存在 user_action_id 能生成 baseline/candidate runs +* 能生成 compare report +* 能生成 gate summary + +--- + +## Phase 4:execute_harness scaffold + +交付: + +* 在 runner 中预留 `execute_harness` mode +* 明确当前阻塞点 +* 如果没有稳定入口,输出 error: + +```text +execute_harness mode is not implemented yet: missing headless harness execution adapter +``` + +验收: + +* 不写伪实现 +* 不假装已经能自动跑 harness + +--- + +## Phase 5:Manifest Validator 增强 + +增强现有 validator,校验: + +* score-specs +* gate policies +* experiment.score_spec_ids +* experiment.gate_policy_id +* action_bindings 的 scenario/variant 是否存在 + +--- + +# 十、验收标准 + +V2.1 完成时,必须满足: + +1. 能读取 experiment manifest +2. 能识别 baseline 和 candidate variant +3. 能用 `bind_existing` 绑定已有 baseline/candidate user_action_id +4. 能自动生成 run +5. 能自动生成 score +6. 能自动生成 compare report +7. 能自动生成 gate verdict +8. score 规则来自 score-spec,而不是散落在脚本里 +9. gate 规则来自 gate policy,而不是临时硬编码 +10. 
如果 execute_harness 还不能做,必须明确报出缺失 adapter,而不是伪造实现 + +--- + +# 十一、输出文件建议 + +```text +tests/evals/v2/ + score-specs/ + default-v2-1.score-specs.json + + gates/ + default_v2_1_gate.json + + experiments/ + _experiment.v2_1.template.json + +scripts/evals/ + v2_run_experiment.ts +``` + +可选新增: + +```text +src/observability/v2/ + evalScoreSpecTypes.ts + evalGateTypes.ts +``` + +--- + +# 十二、Checkpoint 卡片模板 + +完成后 Codex 必须输出: + +```md +## V2.1 Checkpoint + +### 本轮目标 +从手动绑定式评测推进到 experiment-level bind_existing 自动闭环。 + +### 实际完成 +- ... + +### 修改文件 +- ... + +### 可运行命令 +- ... + +### 示例输出 +- ... + +### 验收结果 +- [ ] experiment manifest 可读取 +- [ ] score spec 可读取 +- [ ] gate policy 可读取 +- [ ] bind_existing 可生成 run +- [ ] compare report 可生成 +- [ ] gate verdict 可生成 +- [ ] execute_harness 未实现时有明确错误 + +### 未完成 +- ... + +### 风险 +- ... + +### 下一步候选 A +实现 execute_harness adapter。 + +### 下一步候选 B +扩展 repeat_count / robustness run group。 + +### 是否等待用户拍板 +是。 +``` + +--- + +# 十三、给 Codex 的最短指令版 + +如果你要直接发 Codex,可以这样说: + +```md +本轮目标:实现 V2.1 自动实验 Runner 的最小闭环。 + +当前事实: +V2 已有 scenario / variant / run / score / experiment 数据模型,已有第一批 scenario 和 variant 规范,也已有手动 baseline vs candidate compare report。但当前还处于“手动运行后绑定 user_action_id”的阶段,不是完整自动实验平台。 + +本轮只做: +1. score-specs +2. gate policy +3. experiment v2.1 manifest +4. v2_run_experiment.ts 的 bind_existing 模式 +5. manifest validator 增强 +6. 
execute_harness scaffold,但不伪造实现 + +本轮不做: +- 不做远端平台 +- 不做模型裁判 +- 不做长上下文专项 +- 不做 tool/skill 价值专项 +- 不重写 V1 + +执行要求: +先做 Reality Check,确认当前仓库已有 V2 脚本和类型。 +如果发现文档与代码不一致,先停下找我确认。 +实现时优先复用现有 v2_record_run 和 v2_compare_runs 能力。 +所有正式 score 必须来自 score-spec,所有 gate 必须来自 gate policy。 +如果无法自动执行 harness,不要硬猜,只实现 bind_existing,并为 execute_harness 留 scaffold 和明确错误。 + +完成后输出 checkpoint,不要自动进入下一阶段。 +``` + +--- diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/README.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/README.md" index e38ba9be47..32a2fd8917 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/README.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/README.md" @@ -91,14 +91,25 @@ powershell -ExecutionPolicy Bypass -File E:\claude-code-transparent\scripts\obse "baseline_variant_id": "baseline_default", "candidate_variant_ids": ["candidate_tool_router_v2"], "scenario_ids": ["tool_choice_sensitive"], + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", "mode": "bind_existing", "action_bindings": [ { "scenario_id": "tool_choice_sensitive", - "baseline_user_action_id": "", - "candidate_user_action_ids": { - "candidate_tool_router_v2": "" - } + "variant_id": "baseline_default", + "entry_user_action_id": "" + }, + { + "scenario_id": "tool_choice_sensitive", + "variant_id": "candidate_tool_router_v2", + "entry_user_action_id": "" } ] } diff --git 
"a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" new file mode 100644 index 0000000000..dfb110adb7 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" @@ -0,0 +1,33 @@ +# V2 Run Comparison + +## 理解清单 + +- baseline_run: run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 +- candidate_run: run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 +- scenario: cost_sensitive_task +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## 预期效果 + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## 设计思路 + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 1d5eb5e1-2fe0-42fa-9450-7b05d6367976 +- candidate_user_action_id: dbf9fae1-0a5a-4f50-aba7-02047ced9390 + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 4 | 2 | -2 | improved | +| efficiency.total_billed_tokens | 400399 | 352691 | -47708 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" new file mode 100644 index 0000000000..c41eaea0b5 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" @@ -0,0 +1,41 @@ +# V2.1 Experiment Summary: session_memory_sparse_vs_default + +## 理解清单 + +- experiment: session_memory_sparse_vs_default +- mode: bind_existing +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\session_memory_sparse_vs_default_2026-04-28T162912802Z.json + +## 预期效果 + +This summary records a manifest-driven V2.1 experiment run. 
In bind-existing mode, every generated V2 run is backed by an existing V1 user_action_id. + +## 设计思路 + +V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs the existing scorer and comparison scripts. + +## Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- gate_status: passed + +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | +| cost_sensitive_task | 1 | run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 | candidate_session_memory_sparse | run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 | 0/4 failed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md | + +## Gate Results + +| scenario | candidate_variant | rule_type | score_spec | result | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" new 
file mode 100644 index 0000000000..b488e5d832 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" @@ -0,0 +1,55 @@ +# V2 Run Report: run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 + +## 理解清单 + +- scenario: cost_sensitive_task (Cost Sensitive Task) +- variant: baseline_default (Baseline Default) +- user_action_id: 1d5eb5e1-2fe0-42fa-9450-7b05d6367976 +- root_query_id: 15ecf197-b1c6-47e2-8d94-df1f88f0d822 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-04-24T04:48:30.824Z +- duration_ms: 88207 +- query_count: 5 +- subagent_count: 4 +- tool_call_count: 22 +- total_prompt_input_tokens: 397412 +- total_billed_tokens: 400399 +- root_turn_count: 4 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Edit: count=11, closed=11, failed=0 +- Read: count=5, closed=5, failed=0 +- Glob: count=3, closed=3, failed=0 +- Write: count=3, closed=3, failed=0 + +## Subagents + +- prompt_suggestion: count=1, trigger=suggestion_generation_allowed +- session_memory: count=1, trigger=token_threshold_and_tool_threshold +- extract_memories: count=1, trigger=post_turn_background_extraction +- session_memory: count=1, trigger=token_threshold_and_natural_break + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (400399) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) +- 
decision_quality.subagent_count_observed: observed (4) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" new file mode 100644 index 0000000000..e776a76bf0 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" @@ -0,0 +1,52 @@ +# V2 Run Report: run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 + +## 理解清单 + +- scenario: cost_sensitive_task (Cost Sensitive Task) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- user_action_id: dbf9fae1-0a5a-4f50-aba7-02047ced9390 +- root_query_id: f15ca52c-e702-448a-9cd8-8d5c942ff4e2 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-04-24T04:55:36.952Z +- duration_ms: 46081 +- query_count: 3 +- subagent_count: 2 +- tool_call_count: 15 +- total_prompt_input_tokens: 348534 +- total_billed_tokens: 352691 +- root_turn_count: 4 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Read: count=8, closed=8, failed=0 +- Edit: count=5, closed=5, failed=0 +- Glob: count=2, closed=2, failed=0 + +## Subagents + +- extract_memories: count=1, trigger=post_turn_background_extraction +- session_memory: count=1, trigger=token_threshold_and_tool_threshold + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (352691) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) +- decision_quality.subagent_count_observed: observed (2) diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index c540c7bab7..c5ac6cd30e 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -4,6 +4,7 @@ import path from 'node:path' import type { EvalRun, + EvalRunBinding, EvalScenario, EvalScore, EvalVariant, @@ -171,6 +172,10 @@ function scoreLabel(value: number): string { return 'fail' } +function scoreKey(score: EvalScore): string { + return `${score.dimension}.${score.subdimension}` +} + function buildScores(params: { runId: string scenario: EvalScenario @@ -409,6 +414,9 @@ The report does not judge final answer quality by itself. It records trace-backe ## V1 Evidence +- binding_mode: ${run.binding?.binding_mode ?? 'unknown'} +- bind_passed: ${run.binding?.bind_passed ?? false} +- binding_failure_reason: ${run.binding?.binding_failure_reason ?? 
'n/a'} - started_at: ${asString(action.started_at)} - duration_ms: ${asNumber(action.duration_ms)} - query_count: ${asNumber(action.query_count)} @@ -473,6 +481,12 @@ async function main(): Promise { `SELECT * FROM queries WHERE user_action_id = ${sqlString(userActionId)} AND agent_name = 'main_thread' ORDER BY started_at ASC LIMIT 1;`, )[0] + if (!rootQuery?.query_id) { + throw new Error( + `Fact-only binding failed: user_action_id=${userActionId} has no main_thread root query in V1 evidence. This run cannot enter formal score/compare/gate.`, + ) + } + const tools = queryDuckDb( dbPath, `SELECT tool_name, COUNT(*) AS tool_count, SUM(CASE WHEN is_closed THEN 1 ELSE 0 END) AS closed_count, SUM(CASE WHEN has_failed THEN 1 ELSE 0 END) AS failed_count FROM tools WHERE user_action_id = ${sqlString(userActionId)} GROUP BY 1 ORDER BY tool_count DESC;`, @@ -493,6 +507,14 @@ async function main(): Promise { const runId = sanitizeId( `run_${new Date().toISOString().replaceAll(':', '').replaceAll('.', '')}_${scenario.scenario_id}_${variant.variant_id}_${userActionId.slice(0, 8)}`, ) + const binding: EvalRunBinding = { + binding_mode: 'fact_only', + entry_user_action_id: userActionId, + root_query_id: asString(rootQuery.query_id), + observability_db_ref: path.relative(repoRoot, sourceDbPath), + bind_passed: true, + binding_failure_reason: null, + } const run: EvalRun = { run_id: runId, scenario_id: scenario.scenario_id, @@ -501,12 +523,17 @@ async function main(): Promise { ended_at: asString(action.ended_at), status: 'completed', entry_user_action_id: userActionId, - root_query_id: asString(rootQuery?.query_id), + root_query_id: binding.root_query_id, observability_db_ref: path.relative(repoRoot, sourceDbPath), + binding, notes: 'Generated by scripts/evals/v2_record_run.ts', } - const scores = buildScores({ + const requestedScoreSpecIds = String(args['score-spec-ids'] ?? 
'') + .split(',') + .map(item => item.trim()) + .filter(Boolean) + const allScores = buildScores({ runId, scenario, action, @@ -516,6 +543,18 @@ async function main(): Promise { subagents, recoveries, }) + const scores = + requestedScoreSpecIds.length === 0 + ? allScores + : allScores.filter(score => requestedScoreSpecIds.includes(scoreKey(score))) + const emittedScoreIds = new Set(scores.map(scoreKey)) + for (const scoreSpecId of requestedScoreSpecIds) { + if (!emittedScoreIds.has(scoreSpecId)) { + throw new Error( + `Score spec has no implemented scorer yet: ${scoreSpecId}`, + ) + } + } const runsDir = path.join(evalRoot, 'runs') const scoresDir = path.join(evalRoot, 'scores') @@ -525,7 +564,7 @@ async function main(): Promise { await writeFile( path.join(runsDir, `${runId}.json`), - `${JSON.stringify({ run, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries } }, null, 2)}\n`, + `${JSON.stringify({ run, binding, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries } }, null, 2)}\n`, ) await writeFile( path.join(scoresDir, `${runId}.scores.json`), diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index be60a162b7..7d337514b5 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -4,6 +4,9 @@ import path from 'node:path' import type { EvalScore } from '../../src/observability/v2/evalTypes' import type { + EvalExperimentActionBinding, + EvalExperimentFlatActionBinding, + EvalExperimentNestedActionBinding, EvalExperimentV21, EvalGatePolicy, EvalGatePolicyRule, @@ -112,6 +115,52 @@ async function loadGatePolicy(gatePolicyId?: string): Promise(filePath) } +function normalizeGateRules(gatePolicy: EvalGatePolicy | undefined): EvalGatePolicyRule[] { + if (!gatePolicy) return [] + return [ + ...(gatePolicy.rules ?? []), + ...(gatePolicy.hard_fail_rules ?? 
[]).map(rule => ({ + ...rule, + rule_type: 'hard_fail' as const, + })), + ...(gatePolicy.soft_warning_rules ?? []).map(rule => ({ + ...rule, + rule_type: 'soft_warning' as const, + })), + ] +} + +function isFlatActionBinding( + binding: EvalExperimentActionBinding, +): binding is EvalExperimentFlatActionBinding { + return 'variant_id' in binding && 'entry_user_action_id' in binding +} + +function isNestedActionBinding( + binding: EvalExperimentActionBinding, +): binding is EvalExperimentNestedActionBinding { + return 'baseline_user_action_id' in binding && 'candidate_user_action_ids' in binding +} + +function findBoundUserActionId(params: { + experiment: EvalExperimentV21 + scenarioId: string + variantId: string +}): string | undefined { + const { experiment, scenarioId, variantId } = params + for (const binding of experiment.action_bindings ?? []) { + if (binding.scenario_id !== scenarioId) continue + if (isFlatActionBinding(binding) && binding.variant_id === variantId) { + return binding.entry_user_action_id + } + if (isNestedActionBinding(binding)) { + if (variantId === experiment.baseline_variant_id) return binding.baseline_user_action_id + return binding.candidate_user_action_ids[variantId] + } + } + return undefined +} + function runBunScript(script: string, args: string[]): string { const result = spawnSync('bun', ['run', script, ...args], { cwd: repoRoot, @@ -225,14 +274,15 @@ function evaluateGate(params: { baselineScores, candidateScores, } = params - if (!gatePolicy) return [] + const rules = normalizeGateRules(gatePolicy) + if (rules.length === 0) return [] const taskBaseline = valueFor(baselineScores, 'task_success.main_chain_observed') const taskCandidate = valueFor(candidateScores, 'task_success.main_chain_observed') const taskSuccessNotImproved = taskBaseline !== null && taskCandidate !== null && taskCandidate <= taskBaseline - return gatePolicy.rules.map(rule => { + return rules.map(rule => { const spec = scoreSpecs.get(rule.score_spec_id) const 
baselineValue = valueFor(baselineScores, rule.score_spec_id) const candidateValue = valueFor(candidateScores, rule.score_spec_id) @@ -269,8 +319,9 @@ function buildRecordRunArgs(params: { scenarioId: string variantId: string userActionId: string + scoreSpecIds: string[] }): string[] { - return [ + const args = [ '--scenario', params.scenarioId, '--variant', @@ -279,6 +330,10 @@ function buildRecordRunArgs(params: { params.userActionId, '--snapshot-db', ] + if (params.scoreSpecIds.length > 0) { + args.push('--score-spec-ids', params.scoreSpecIds.join(',')) + } + return args } function buildMarkdownReport(params: { @@ -327,6 +382,8 @@ function buildMarkdownReport(params: { - baseline_variant: ${experiment.baseline_variant_id} - candidate_variants: ${experiment.candidate_variant_ids.join(', ')} - scenario_count: ${experiment.scenario_ids?.length ?? 0} +- score_specs: ${(experiment.score_spec_ids ?? []).join(', ') || 'not configured'} +- gate_policy: ${experiment.gate_policy_id ?? 'not configured'} - output_json: ${outputJson} ## 预期效果 @@ -364,10 +421,16 @@ async function main(): Promise { const experimentPath = await findExperimentPath(experimentArg) const experiment = await readJson(experimentPath) + const mode = experiment.mode ?? 'bind_existing' - if ((experiment.mode ?? 'bind_existing') !== 'bind_existing') { + if (mode === 'execute_harness') { throw new Error( - `Only bind_existing mode is implemented in V2.1. mode=${experiment.mode}`, + 'execute_harness mode is not implemented yet: missing headless harness execution adapter', + ) + } + if (mode !== 'bind_existing') { + throw new Error( + `Unsupported V2.1 experiment mode: ${mode}`, ) } @@ -378,16 +441,37 @@ async function main(): Promise { const scoreSpecs = await loadScoreSpecs() const gatePolicy = await loadGatePolicy(experiment.gate_policy_id) + for (const scoreSpecId of experiment.score_spec_ids ?? 
[]) { + if (!scoreSpecs.has(scoreSpecId)) { + throw new Error( + `Experiment references missing score_spec_id: ${scoreSpecId}`, + ) + } + } + if (experiment.gate_policy_id && !gatePolicy) { + throw new Error( + `Experiment references missing gate_policy_id: ${experiment.gate_policy_id}`, + ) + } + for (const rule of normalizeGateRules(gatePolicy)) { + if (!scoreSpecs.has(rule.score_spec_id)) { + throw new Error( + `Gate policy ${experiment.gate_policy_id} references missing score_spec_id: ${rule.score_spec_id}`, + ) + } + } const repeatCount = Math.max(experiment.repeat_count ?? 1, 1) const results: ScenarioExperimentResult[] = [] for (const scenarioId of scenarioIds) { - const binding = experiment.action_bindings?.find( - item => item.scenario_id === scenarioId, - ) - if (!binding) { + const baselineUserActionId = findBoundUserActionId({ + experiment, + scenarioId, + variantId: experiment.baseline_variant_id, + }) + if (!baselineUserActionId) { throw new Error( - `Missing action_bindings for scenario=${scenarioId}. V2.1 bind_existing mode requires user_action_id bindings.`, + `Missing action binding for scenario=${scenarioId}, variant=${experiment.baseline_variant_id}. V2.1 bind_existing mode requires user_action_id bindings.`, ) } @@ -397,7 +481,8 @@ async function main(): Promise { buildRecordRunArgs({ scenarioId, variantId: experiment.baseline_variant_id, - userActionId: binding.baseline_user_action_id, + userActionId: baselineUserActionId, + scoreSpecIds: experiment.score_spec_ids ?? 
[], }), ) const baselineRunId = extractCreatedRunId(baselineOutput) @@ -407,7 +492,11 @@ async function main(): Promise { const candidates: CandidateExperimentResult[] = [] for (const candidateVariantId of experiment.candidate_variant_ids) { - const candidateActionId = binding.candidate_user_action_ids[candidateVariantId] + const candidateActionId = findBoundUserActionId({ + experiment, + scenarioId, + variantId: candidateVariantId, + }) if (!candidateActionId) { throw new Error( `Missing candidate user_action_id for scenario=${scenarioId}, variant=${candidateVariantId}`, @@ -420,6 +509,7 @@ async function main(): Promise { scenarioId, variantId: candidateVariantId, userActionId: candidateActionId, + scoreSpecIds: experiment.score_spec_ids ?? [], }), ) const candidateRunId = extractCreatedRunId(candidateOutput) @@ -454,7 +544,7 @@ async function main(): Promise { scenario_id: scenarioId, repeat_index: repeatIndex, baseline_run_id: baselineRunId, - baseline_user_action_id: binding.baseline_user_action_id, + baseline_user_action_id: baselineUserActionId, candidates, }) } @@ -472,6 +562,11 @@ async function main(): Promise { `${JSON.stringify( { experiment, + runner: { + mode, + score_spec_ids: experiment.score_spec_ids ?? [], + gate_policy_id: experiment.gate_policy_id ?? 
null, + }, results, created_at: new Date().toISOString(), }, diff --git a/scripts/evals/v2_validate_manifests.ts b/scripts/evals/v2_validate_manifests.ts index 8a3d42dfda..c84a80c11c 100644 --- a/scripts/evals/v2_validate_manifests.ts +++ b/scripts/evals/v2_validate_manifests.ts @@ -7,8 +7,12 @@ import type { EvalVariant, } from '../../src/observability/v2/evalTypes' import type { + EvalExperimentActionBinding, + EvalExperimentFlatActionBinding, + EvalExperimentNestedActionBinding, EvalExperimentV21, EvalGatePolicy, + EvalGatePolicyRule, EvalScoreSpecCollection, } from '../../src/observability/v2/evalExperimentTypes' @@ -37,6 +41,13 @@ const scoreDirections = new Set([ const automationLevels = new Set(['automatic', 'manual_review', 'mixed']) const experimentModes = new Set(['bind_existing', 'execute_harness']) +interface ValidationContext { + scenarioIds: Set + variantIds: Set + scoreSpecIds: Set + gatePolicyIds: Set +} + async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } @@ -81,6 +92,36 @@ function requireOptionalNumber( } } +function isFlatActionBinding( + binding: EvalExperimentActionBinding, +): binding is EvalExperimentFlatActionBinding { + return 'variant_id' in binding && 'entry_user_action_id' in binding +} + +function isNestedActionBinding( + binding: EvalExperimentActionBinding, +): binding is EvalExperimentNestedActionBinding { + return 'baseline_user_action_id' in binding && 'candidate_user_action_ids' in binding +} + +function isPlaceholderActionId(value: string): boolean { + return value.startsWith('REPLACE_WITH_') || value.trim() === '' +} + +function normalizeGateRules(gate: EvalGatePolicy): EvalGatePolicyRule[] { + return [ + ...(gate.rules ?? []), + ...(gate.hard_fail_rules ?? []).map(rule => ({ + ...rule, + rule_type: 'hard_fail' as const, + })), + ...(gate.soft_warning_rules ?? 
[]).map(rule => ({ + ...rule, + rule_type: 'soft_warning' as const, + })), + ] +} + function validateScenario(filePath: string, scenario: EvalScenario): string[] { const errors: string[] = [] requireString(errors, filePath, 'scenario_id', scenario.scenario_id) @@ -114,7 +155,11 @@ function validateVariant(filePath: string, variant: EvalVariant): string[] { return errors } -function validateExperiment(filePath: string, experiment: EvalExperimentV21): string[] { +function validateExperiment( + filePath: string, + experiment: EvalExperimentV21, + context?: ValidationContext, +): string[] { const errors: string[] = [] requireString(errors, filePath, 'experiment_id', experiment.experiment_id) requireString(errors, filePath, 'name', experiment.name) @@ -124,6 +169,25 @@ function validateExperiment(filePath: string, experiment: EvalExperimentV21): st requireArray(errors, filePath, 'candidate_variant_ids', experiment.candidate_variant_ids) if (experiment.scenario_ids !== undefined) { requireArray(errors, filePath, 'scenario_ids', experiment.scenario_ids) + for (const scenarioId of experiment.scenario_ids) { + if (typeof scenarioId === 'string' && context && !context.scenarioIds.has(scenarioId)) { + errors.push(`${filePath}.scenario_ids references unknown scenario_id: ${scenarioId}`) + } + } + } + if (context && !context.variantIds.has(experiment.baseline_variant_id)) { + errors.push( + `${filePath}.baseline_variant_id references unknown variant_id: ${experiment.baseline_variant_id}`, + ) + } + if (Array.isArray(experiment.candidate_variant_ids)) { + for (const variantId of experiment.candidate_variant_ids) { + if (typeof variantId === 'string' && context && !context.variantIds.has(variantId)) { + errors.push( + `${filePath}.candidate_variant_ids references unknown variant_id: ${variantId}`, + ) + } + } } if ( experiment.repeat_count !== undefined && @@ -133,6 +197,26 @@ function validateExperiment(filePath: string, experiment: EvalExperimentV21): st } if 
(experiment.score_spec_ids !== undefined) { requireArray(errors, filePath, 'score_spec_ids', experiment.score_spec_ids) + for (const scoreSpecId of experiment.score_spec_ids) { + if ( + typeof scoreSpecId === 'string' && + context && + !context.scoreSpecIds.has(scoreSpecId) + ) { + errors.push( + `${filePath}.score_spec_ids references unknown score_spec_id: ${scoreSpecId}`, + ) + } + } + } + if ( + experiment.gate_policy_id !== undefined && + context && + !context.gatePolicyIds.has(experiment.gate_policy_id) + ) { + errors.push( + `${filePath}.gate_policy_id references unknown gate_policy_id: ${experiment.gate_policy_id}`, + ) } if ( experiment.mode !== undefined && @@ -143,26 +227,99 @@ function validateExperiment(filePath: string, experiment: EvalExperimentV21): st if (experiment.action_bindings !== undefined) { requireArray(errors, filePath, 'action_bindings', experiment.action_bindings) for (const [index, binding] of experiment.action_bindings.entries()) { + const objectName = `${filePath}.action_bindings[${index}]` requireString( errors, - `${filePath}.action_bindings[${index}]`, + objectName, 'scenario_id', binding.scenario_id, ) - requireString( - errors, - `${filePath}.action_bindings[${index}]`, - 'baseline_user_action_id', - binding.baseline_user_action_id, - ) if ( - typeof binding.candidate_user_action_ids !== 'object' || - binding.candidate_user_action_ids === null || - Array.isArray(binding.candidate_user_action_ids) + typeof binding.scenario_id === 'string' && + context && + !context.scenarioIds.has(binding.scenario_id) ) { - errors.push( - `${filePath}.action_bindings[${index}].candidate_user_action_ids must be an object`, + errors.push(`${objectName}.scenario_id references unknown scenario_id: ${binding.scenario_id}`) + } + + if (isFlatActionBinding(binding)) { + requireString(errors, objectName, 'variant_id', binding.variant_id) + requireString( + errors, + objectName, + 'entry_user_action_id', + binding.entry_user_action_id, ) + if (context && 
!context.variantIds.has(binding.variant_id)) { + errors.push(`${objectName}.variant_id references unknown variant_id: ${binding.variant_id}`) + } + if (isPlaceholderActionId(binding.entry_user_action_id)) { + errors.push(`${objectName}.entry_user_action_id still contains a placeholder`) + } + continue + } + + if (isNestedActionBinding(binding)) { + requireString( + errors, + objectName, + 'baseline_user_action_id', + binding.baseline_user_action_id, + ) + if (isPlaceholderActionId(binding.baseline_user_action_id)) { + errors.push(`${objectName}.baseline_user_action_id still contains a placeholder`) + } + if ( + typeof binding.candidate_user_action_ids !== 'object' || + binding.candidate_user_action_ids === null || + Array.isArray(binding.candidate_user_action_ids) + ) { + errors.push(`${objectName}.candidate_user_action_ids must be an object`) + } else { + for (const [variantId, actionId] of Object.entries(binding.candidate_user_action_ids)) { + if (context && !context.variantIds.has(variantId)) { + errors.push( + `${objectName}.candidate_user_action_ids references unknown variant_id: ${variantId}`, + ) + } + if (isPlaceholderActionId(actionId)) { + errors.push( + `${objectName}.candidate_user_action_ids.${variantId} still contains a placeholder`, + ) + } + } + } + continue + } + + errors.push( + `${objectName} must use either flat {scenario_id, variant_id, entry_user_action_id} or nested {scenario_id, baseline_user_action_id, candidate_user_action_ids} format`, + ) + } + } + if ((experiment.mode ?? 'bind_existing') === 'bind_existing') { + for (const scenarioId of experiment.scenario_ids ?? []) { + const variantIds = [experiment.baseline_variant_id, ...experiment.candidate_variant_ids] + for (const variantId of variantIds) { + const hasBinding = (experiment.action_bindings ?? 
[]).some(binding => { + if (binding.scenario_id !== scenarioId) return false + if (isFlatActionBinding(binding)) { + return binding.variant_id === variantId && !isPlaceholderActionId(binding.entry_user_action_id) + } + if (isNestedActionBinding(binding)) { + if (variantId === experiment.baseline_variant_id) { + return !isPlaceholderActionId(binding.baseline_user_action_id) + } + const actionId = binding.candidate_user_action_ids[variantId] + return typeof actionId === 'string' && !isPlaceholderActionId(actionId) + } + return false + }) + if (!hasBinding) { + errors.push( + `${filePath}.action_bindings missing bind_existing user_action_id for scenario=${scenarioId}, variant=${variantId}`, + ) + } } } } @@ -183,7 +340,12 @@ function validateScoreSpecCollection( requireString(errors, objectName, 'score_spec_id', spec.score_spec_id) requireString(errors, objectName, 'subdimension', spec.subdimension) requireString(errors, objectName, 'formula', spec.formula) - requireString(errors, objectName, 'version', spec.version) + if ( + (typeof spec.version !== 'string' || spec.version.trim() === '') && + typeof spec.version !== 'number' + ) { + errors.push(`${objectName}.version must be a non-empty string or number`) + } requireArray(errors, objectName, 'data_sources', spec.data_sources) requireArray(errors, objectName, 'evidence_requirements', spec.evidence_requirements) if (!scoreDimensions.has(spec.dimension)) { @@ -205,14 +367,21 @@ function validateScoreSpecCollection( return errors } -function validateGatePolicy(filePath: string, gate: EvalGatePolicy): string[] { +function validateGatePolicy( + filePath: string, + gate: EvalGatePolicy, + context?: ValidationContext, +): string[] { const errors: string[] = [] requireString(errors, filePath, 'gate_policy_id', gate.gate_policy_id) requireString(errors, filePath, 'name', gate.name) - requireArray(errors, filePath, 'rules', gate.rules) - if (!Array.isArray(gate.rules)) return errors + const rules = normalizeGateRules(gate) + 
if (rules.length === 0) { + errors.push(`${filePath} must define at least one gate rule`) + return errors + } - for (const [index, rule] of gate.rules.entries()) { + for (const [index, rule] of rules.entries()) { const objectName = `${filePath}.rules[${index}]` requireString(errors, objectName, 'score_spec_id', rule.score_spec_id) requireString(errors, objectName, 'condition', rule.condition) @@ -220,42 +389,83 @@ function validateGatePolicy(filePath: string, gate: EvalGatePolicy): string[] { errors.push(`${objectName}.rule_type has invalid value: ${rule.rule_type}`) } requireOptionalNumber(errors, objectName, 'threshold', rule.threshold) + if (context && !context.scoreSpecIds.has(rule.score_spec_id)) { + errors.push(`${objectName}.score_spec_id references unknown score_spec_id: ${rule.score_spec_id}`) + } } return errors } async function validateAll(): Promise { const errors: string[] = [] + const context: ValidationContext = { + scenarioIds: new Set(), + variantIds: new Set(), + scoreSpecIds: new Set(), + gatePolicyIds: new Set(), + } + + const scenarioFiles = await listJsonFiles(path.join(evalRoot, 'scenarios')) + const variantFiles = await listJsonFiles(path.join(evalRoot, 'variants')) + const experimentFiles = await listJsonFiles(path.join(evalRoot, 'experiments')) + const scoreSpecFiles = await listJsonFiles(path.join(evalRoot, 'score-specs')) + const gateFiles = await listJsonFiles(path.join(evalRoot, 'gates')) - for (const filePath of await listJsonFiles(path.join(evalRoot, 'scenarios'))) { + for (const filePath of scenarioFiles) { if (path.basename(filePath).startsWith('_')) continue if (path.basename(filePath) === 'first-batch-catalog.json') continue - errors.push(...validateScenario(filePath, await readJson(filePath))) + const scenario = await readJson(filePath) + if (typeof scenario.scenario_id === 'string') context.scenarioIds.add(scenario.scenario_id) + errors.push(...validateScenario(filePath, scenario)) } - for (const filePath of await 
listJsonFiles(path.join(evalRoot, 'variants'))) { + for (const filePath of variantFiles) { if (path.basename(filePath).startsWith('_')) continue - errors.push(...validateVariant(filePath, await readJson(filePath))) + const variant = await readJson(filePath) + if (typeof variant.variant_id === 'string') context.variantIds.add(variant.variant_id) + errors.push(...validateVariant(filePath, variant)) } - for (const filePath of await listJsonFiles(path.join(evalRoot, 'experiments'))) { + for (const filePath of scoreSpecFiles) { if (path.basename(filePath).startsWith('_')) continue - errors.push(...validateExperiment(filePath, await readJson(filePath))) + const collection = await readJson(filePath) + for (const spec of collection.score_specs ?? []) { + if (typeof spec.score_spec_id === 'string') { + context.scoreSpecIds.add(spec.score_spec_id) + } + } + errors.push( + ...validateScoreSpecCollection( + filePath, + collection, + ), + ) + } + + for (const filePath of gateFiles) { + if (path.basename(filePath).startsWith('_')) continue + const gate = await readJson(filePath) + if (typeof gate.gate_policy_id === 'string') { + context.gatePolicyIds.add(gate.gate_policy_id) + } } - for (const filePath of await listJsonFiles(path.join(evalRoot, 'score-specs'))) { + for (const filePath of experimentFiles) { if (path.basename(filePath).startsWith('_')) continue errors.push( - ...validateScoreSpecCollection( + ...validateExperiment( filePath, - await readJson(filePath), + await readJson(filePath), + context, ), ) } - for (const filePath of await listJsonFiles(path.join(evalRoot, 'gates'))) { + for (const filePath of gateFiles) { if (path.basename(filePath).startsWith('_')) continue - errors.push(...validateGatePolicy(filePath, await readJson(filePath))) + errors.push( + ...validateGatePolicy(filePath, await readJson(filePath), context), + ) } return errors diff --git a/src/observability/v2/evalExperimentTypes.ts b/src/observability/v2/evalExperimentTypes.ts index 
867fb5a9b5..e2717ea058 100644 --- a/src/observability/v2/evalExperimentTypes.ts +++ b/src/observability/v2/evalExperimentTypes.ts @@ -25,7 +25,7 @@ export interface EvalScoreSpec { evidence_requirements: string[] automation_level: EvalAutomationLevel thresholds?: EvalScoreSpecThresholds - version: string + version: string | number notes?: string } @@ -44,15 +44,27 @@ export interface EvalGatePolicyRule { export interface EvalGatePolicy { gate_policy_id: string name: string - rules: EvalGatePolicyRule[] + rules?: EvalGatePolicyRule[] + hard_fail_rules?: Array> + soft_warning_rules?: Array> } -export interface EvalExperimentActionBinding { +export interface EvalExperimentFlatActionBinding { + scenario_id: string + variant_id: string + entry_user_action_id: string +} + +export interface EvalExperimentNestedActionBinding { scenario_id: string baseline_user_action_id: string candidate_user_action_ids: Record } +export type EvalExperimentActionBinding = + | EvalExperimentFlatActionBinding + | EvalExperimentNestedActionBinding + export interface EvalExperimentV21 extends EvalExperiment { scenario_ids?: string[] repeat_count?: number diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index 961d3bafc3..7a512238fb 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -66,9 +66,22 @@ export interface EvalRun { entry_user_action_id?: string root_query_id?: string observability_db_ref?: string + binding?: EvalRunBinding notes?: string } +export interface EvalRunBinding { + binding_mode: 'fact_only' + entry_user_action_id: string + root_query_id: string + observability_db_ref: string + events_file_ref?: string + snapshot_bundle_ref?: string + dag_ref?: string + bind_passed: boolean + binding_failure_reason: string | null +} + export interface EvalExpectation { expectation_id: string scenario_id: string diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index e045b4b1c8..f89f322c0e 100644 --- 
a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -32,10 +32,31 @@ Recommended V2.1 usage order: - `candidate_variant_ids` - `mode: "bind_existing"` - `action_bindings` + - `score_spec_ids` + - `gate_policy_id` 5. Validate all manifests. 6. Run the experiment runner. 7. Read the generated run, score, comparison, gate, and experiment summary artifacts. +Recommended V2.1 `action_bindings` shape: + +```json +[ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "" + } +] +``` + +The runner still accepts the older nested binding shape for compatibility. New experiment manifests should use the flat `scenario_id + variant_id + entry_user_action_id` shape. + Validate manifests: ```powershell @@ -48,7 +69,13 @@ Run the current sample V2.1 experiment: bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default ``` -Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. Instead, it binds existing V1 `user_action_id` traces into V2 runs, records scores, compares baseline vs candidate, applies the configured gate policy, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. +Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. Instead, it binds existing V1 `user_action_id` traces into V2 runs, records score-spec-backed scores, compares baseline vs candidate, applies the configured gate policy, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. + +`execute_harness` is reserved but intentionally blocked until a stable headless harness execution adapter exists. 
If a manifest uses that mode now, the runner exits with: + +```text +execute_harness mode is not implemented yet: missing headless harness execution adapter +``` Lower-level commands are still available when you want to debug one step at a time. diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json new file mode 100644 index 0000000000..0083e353e6 --- /dev/null +++ b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json @@ -0,0 +1,114 @@ +{ + "experiment": { + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session Memory Sparse vs Default", + "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_first_batch", + "scenario_ids": [ + "cost_sensitive_task" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" + } + ], + "status": "ready" + }, + "runner": { + "mode": "bind_existing", + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": 
"default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "cost_sensitive_task", + "repeat_index": 1, + "baseline_run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "candidate_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "gate_results": [ + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "passed": true, + "baseline_value": 4, + "candidate_value": 2, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ] + } + ] + } + ], + "created_at": "2026-04-28T16:29:12.803Z" +} diff --git a/tests/evals/v2/experiments/_experiment.v2_1.template.json b/tests/evals/v2/experiments/_experiment.v2_1.template.json index 7466d37801..82a4d96204 100644 --- a/tests/evals/v2/experiments/_experiment.v2_1.template.json +++ b/tests/evals/v2/experiments/_experiment.v2_1.template.json @@ -19,10 +19,13 @@ "action_bindings": [ { "scenario_id": "cost_sensitive_task", - "baseline_user_action_id": "REPLACE_WITH_BASELINE_USER_ACTION_ID", - "candidate_user_action_ids": { - "candidate_session_memory_sparse": "REPLACE_WITH_CANDIDATE_USER_ACTION_ID" - } + "variant_id": "baseline_default", + "entry_user_action_id": "REPLACE_WITH_BASELINE_USER_ACTION_ID" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "REPLACE_WITH_CANDIDATE_USER_ACTION_ID" } ], "status": "draft" diff --git a/tests/evals/v2/experiments/session_memory_sparse_vs_default.json b/tests/evals/v2/experiments/session_memory_sparse_vs_default.json index aacfa56208..ae5d2d3448 100644 --- a/tests/evals/v2/experiments/session_memory_sparse_vs_default.json +++ b/tests/evals/v2/experiments/session_memory_sparse_vs_default.json @@ -19,10 +19,13 @@ "action_bindings": [ { "scenario_id": 
"cost_sensitive_task", - "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "candidate_user_action_ids": { - "candidate_session_memory_sparse": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" - } + "variant_id": "baseline_default", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" } ], "status": "ready" diff --git a/tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json b/tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json new file mode 100644 index 0000000000..5b1f57c076 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json @@ -0,0 +1,182 @@ +{ + "run": { + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "started_at": "2026-04-24T04:48:30.824Z", + "ended_at": "2026-04-24T04:49:59.031Z", + "status": "completed", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "root_query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "root_query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "root_query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + 
"binding_failure_reason": null + }, + "scenario": { + "scenario_id": "cost_sensitive_task", + "name": "Cost Sensitive Task", + "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", + "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", + "tags": [ + "efficiency", + "tradeoff", + "observability-v2" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should avoid unnecessary background subagent expansion", + "Should keep the main query within a small number of turns" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "path/to/baseline-config.json", + "notes": "Use this as the default baseline unless a scenario explicitly requires another baseline." 
+ }, + "evidence": { + "action": { + "event_date": "2026-04-24", + "user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "started_at": "2026-04-24T04:48:30.824Z", + "started_at_ms": 1777006110824, + "ended_at": "2026-04-24T04:49:59.031Z", + "ended_at_ms": 1777006199031, + "duration_ms": 88207, + "event_count": 438, + "query_count": 5, + "main_thread_query_count": 1, + "subagent_query_count": 5, + "subagent_count": 4, + "tool_call_count": 22, + "raw_input_tokens": "9", + "output_tokens": "2987", + "cache_read_tokens": "187198", + "cache_create_tokens": "210205", + "total_prompt_input_tokens": "397412", + "total_billed_tokens": "400399", + "main_thread_total_prompt_input_tokens": "158157", + "subagent_total_prompt_input_tokens": "239255" + }, + "rootQuery": { + "query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", + "user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "session_id": "eca68c72-ad03-4e56-a18f-f50000e8c0c7", + "conversation_id": "eca68c72-ad03-4e56-a18f-f50000e8c0c7", + "query_source": "repl_main_thread", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "repl_main_thread", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-04-24T04:48:30.824Z", + "started_at_ms": 1777006110824, + "ended_at": "2026-04-24T04:49:06.168Z", + "ended_at_ms": 1777006146168, + "duration_ms": 35344, + "first_event": "state.initialized", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 4, + "query_max_loop_iter": 4, + "query_avg_loop_iter": 2.5, + "tool_call_count": 7, + "event_count": 122, + "raw_query_started_count": 1, + "raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Edit", + 
"tool_count": 11, + "closed_count": "11", + "failed_count": "0" + }, + { + "tool_name": "Read", + "tool_count": 5, + "closed_count": "5", + "failed_count": "0" + }, + { + "tool_name": "Glob", + "tool_count": 3, + "closed_count": "3", + "failed_count": "0" + }, + { + "tool_name": "Write", + "tool_count": 3, + "closed_count": "3", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "prompt_suggestion", + "subagent_trigger_kind": "stop_hook_background", + "subagent_trigger_detail": "suggestion_generation_allowed", + "subagent_count": 1, + "avg_duration_ms": 8029 + }, + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 1, + "avg_duration_ms": 33043 + }, + { + "subagent_reason": "extract_memories", + "subagent_trigger_kind": "stop_hook_background", + "subagent_trigger_detail": "post_turn_background_extraction", + "subagent_count": 1, + "avg_duration_ms": 29954 + }, + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": 40480 + } + ], + "recoveries": [] + } +} diff --git a/tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json b/tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json new file mode 100644 index 0000000000..6826640f1c --- /dev/null +++ b/tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json @@ -0,0 +1,163 @@ +{ + "run": { + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "started_at": "2026-04-24T04:55:36.952Z", + "ended_at": "2026-04-24T04:56:23.033Z", + 
"status": "completed", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "root_query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "root_query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "root_query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "cost_sensitive_task", + "name": "Cost Sensitive Task", + "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", + "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", + "tags": [ + "efficiency", + "tradeoff", + "observability-v2" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should avoid unnecessary background subagent expansion", + "Should keep the main query within a small number of turns" + ], + "max_turn_count": 8, + "max_total_billed_tokens": 260000, + "max_subagent_count": 3, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Increase the default session memory tool-call threshold from 3 to 6 to reduce background memory subagent cost.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + 
"config_snapshot_ref": "src/services/SessionMemory/sessionMemoryUtils.ts", + "notes": "Token-saving harness candidate. Keeps natural-break trigger intact while reducing tool-threshold-triggered updates." + }, + "evidence": { + "action": { + "event_date": "2026-04-24", + "user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "started_at": "2026-04-24T04:55:36.952Z", + "started_at_ms": 1777006536952, + "ended_at": "2026-04-24T04:56:23.033Z", + "ended_at_ms": 1777006583033, + "duration_ms": 46081, + "event_count": 286, + "query_count": 3, + "main_thread_query_count": 1, + "subagent_query_count": 3, + "subagent_count": 2, + "tool_call_count": 15, + "raw_input_tokens": "8", + "output_tokens": "4157", + "cache_read_tokens": "160020", + "cache_create_tokens": "188506", + "total_prompt_input_tokens": "348534", + "total_billed_tokens": "352691", + "main_thread_total_prompt_input_tokens": "158909", + "subagent_total_prompt_input_tokens": "189625" + }, + "rootQuery": { + "query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", + "user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "session_id": "e34e7a32-552b-4608-af59-8b48025e0ea0", + "conversation_id": "e34e7a32-552b-4608-af59-8b48025e0ea0", + "query_source": "repl_main_thread", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "repl_main_thread", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-04-24T04:55:36.952Z", + "started_at_ms": 1777006536952, + "ended_at": "2026-04-24T04:56:02.640Z", + "ended_at_ms": 1777006562640, + "duration_ms": 25688, + "first_event": "state.initialized", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 4, + "query_max_loop_iter": 4, + "query_avg_loop_iter": 2.5, + "tool_call_count": 7, + "event_count": 122, + "raw_query_started_count": 1, + 
"raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Read", + "tool_count": 8, + "closed_count": "8", + "failed_count": "0" + }, + { + "tool_name": "Edit", + "tool_count": 5, + "closed_count": "5", + "failed_count": "0" + }, + { + "tool_name": "Glob", + "tool_count": 2, + "closed_count": "2", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "extract_memories", + "subagent_trigger_kind": "stop_hook_background", + "subagent_trigger_detail": "post_turn_background_extraction", + "subagent_count": 1, + "avg_duration_ms": 18519 + }, + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 1, + "avg_duration_ms": 29679 + } + ], + "recoveries": [] + } +} diff --git a/tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json b/tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json new file mode 100644 index 0000000000..99ea06233c --- /dev/null +++ b/tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 400399, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=4; scenario limit is 8." + }, + { + "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json b/tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json new file mode 100644 index 0000000000..87ca28e79b --- /dev/null +++ b/tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 352691, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=4; scenario limit is 8." + }, + { + "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + } +] From eb8a4ea3621cb5c4f73153b438dcfcaaf5bd7378 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Wed, 29 Apr 2026 17:41:05 +0800 Subject: [PATCH 11/26] Stabilize observability v2.1 bind runner --- scripts/evals/v2_record_run.ts | 9 +- scripts/evals/v2_run_experiment.ts | 184 +++++++- .../evals/v2_validate_experiment_artifacts.ts | 76 +++ scripts/evals/v2_verify_bind_runner.ts | 431 ++++++++++++++++++ tests/evals/v2/README.md | 20 + tests/evals/v2/V2.1-bind_existing-usage.md | 178 ++++++++ tests/evals/v2/experiment-runs/README.md | 67 +++ ...arse_vs_default_2026-04-27T105524752Z.json | 26 ++ ...arse_vs_default_2026-04-28T162912802Z.json | 26 ++ tests/evals/v2/gates/README.md | 62 +++ tests/evals/v2/score-specs/README.md | 57 +++ ...2_1_bind_runner_2026-04-29T072125437Z.json | 91 ++++ 12 files changed, 1203 insertions(+), 24 deletions(-) create mode 100644 scripts/evals/v2_validate_experiment_artifacts.ts create mode 100644 scripts/evals/v2_verify_bind_runner.ts create mode 100644 
tests/evals/v2/V2.1-bind_existing-usage.md create mode 100644 tests/evals/v2/experiment-runs/README.md create mode 100644 tests/evals/v2/gates/README.md create mode 100644 tests/evals/v2/score-specs/README.md create mode 100644 tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index c5ac6cd30e..3987e288dc 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -156,7 +156,14 @@ async function loadVariant(variantId: string): Promise { try { return await readJson(directPath) } catch { - // Fall through to shipped templates. + // Fall through to shipped templates and fixture variants. + } + + const templatePath = path.join(evalRoot, 'variants', `${variantId}.template.json`) + try { + return await readJson(templatePath) + } catch { + // Fall through to the baseline template compatibility path. } const baseline = await readJson( diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index 7d337514b5..77201991ba 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -35,6 +35,7 @@ interface GateResult { candidate_variant_id: string rule_type: 'hard_fail' | 'soft_warning' score_spec_id: string + verdict: 'pass' | 'hard_fail' | 'soft_warning' | 'missing' | 'inconclusive' passed: boolean baseline_value: number | null candidate_value: number | null @@ -43,6 +44,15 @@ interface GateResult { notes?: string } +interface GateVerdict { + status: 'pass' | 'warning' | 'fail' | 'inconclusive' + hard_fail_count: number + soft_warning_count: number + missing_score_count: number + inconclusive_count: number + candidate_count: number +} + const repoRoot = path.resolve(import.meta.dirname, '..', '..') const evalRoot = path.join(repoRoot, 'tests', 'evals', 'v2') const scoresRoot = path.join(evalRoot, 'scores') @@ -112,7 +122,11 @@ async function loadScoreSpecs(): Promise> { async function 
loadGatePolicy(gatePolicyId?: string): Promise { if (!gatePolicyId) return undefined const filePath = path.join(evalRoot, 'gates', `${gatePolicyId}.json`) - return await readJson(filePath) + try { + return await readJson(filePath) + } catch { + return undefined + } } function normalizeGateRules(gatePolicy: EvalGatePolicy | undefined): EvalGatePolicyRule[] { @@ -258,6 +272,10 @@ function rulePassed(params: { return true } +function isSupportedGateCondition(condition: string): boolean { + return condition === 'candidate < baseline' || condition.includes('candidate_regression_pct >') +} + function evaluateGate(params: { scenarioId: string candidateVariantId: string @@ -286,6 +304,8 @@ function evaluateGate(params: { const spec = scoreSpecs.get(rule.score_spec_id) const baselineValue = valueFor(baselineScores, rule.score_spec_id) const candidateValue = valueFor(candidateScores, rule.score_spec_id) + const hasMissingScore = baselineValue === null || candidateValue === null + const hasUnsupportedCondition = !isSupportedGateCondition(rule.condition) const regressionPctValue = spec ? regressionPct({ baselineValue, @@ -293,18 +313,30 @@ function evaluateGate(params: { direction: spec.direction, }) : null - return { - scenario_id: scenarioId, - candidate_variant_id: candidateVariantId, - rule_type: rule.rule_type, - score_spec_id: rule.score_spec_id, - passed: rulePassed({ + const passed = + !hasMissingScore && + !hasUnsupportedCondition && + rulePassed({ rule, baselineValue, candidateValue, regressionPctValue, taskSuccessNotImproved, - }), + }) + const verdict: GateResult['verdict'] = hasMissingScore + ? 'missing' + : !spec || hasUnsupportedCondition + ? 'inconclusive' + : passed + ? 
'pass' + : rule.rule_type + return { + scenario_id: scenarioId, + candidate_variant_id: candidateVariantId, + rule_type: rule.rule_type, + score_spec_id: rule.score_spec_id, + verdict, + passed, baseline_value: baselineValue, candidate_value: candidateValue, regression_pct: @@ -320,6 +352,8 @@ function buildRecordRunArgs(params: { variantId: string userActionId: string scoreSpecIds: string[] + dbPath?: string + snapshotDb: boolean }): string[] { const args = [ '--scenario', @@ -328,14 +362,66 @@ function buildRecordRunArgs(params: { params.variantId, '--user-action-id', params.userActionId, - '--snapshot-db', ] + if (params.snapshotDb) args.push('--snapshot-db') + if (params.dbPath) args.push('--db', params.dbPath) if (params.scoreSpecIds.length > 0) { args.push('--score-spec-ids', params.scoreSpecIds.join(',')) } return args } +function summarizeGate(results: ScenarioExperimentResult[]): GateVerdict { + const candidates = results.flatMap(result => result.candidates) + const allGateResults = candidates.flatMap(candidate => candidate.gate_results) + const hardFailCount = allGateResults.filter(result => result.verdict === 'hard_fail').length + const softWarningCount = allGateResults.filter(result => result.verdict === 'soft_warning').length + const missingScoreCount = allGateResults.filter(result => result.verdict === 'missing').length + const inconclusiveCount = allGateResults.filter(result => result.verdict === 'inconclusive').length + return { + status: + hardFailCount > 0 + ? 'fail' + : missingScoreCount > 0 || inconclusiveCount > 0 + ? 'inconclusive' + : softWarningCount > 0 + ? 
'warning' + : 'pass', + hard_fail_count: hardFailCount, + soft_warning_count: softWarningCount, + missing_score_count: missingScoreCount, + inconclusive_count: inconclusiveCount, + candidate_count: candidates.length, + } +} + +function runRefs(results: ScenarioExperimentResult[]): string[] { + return results.flatMap(result => [ + path.join('tests', 'evals', 'v2', 'runs', `${result.baseline_run_id}.json`), + ...result.candidates.map(candidate => + path.join('tests', 'evals', 'v2', 'runs', `${candidate.candidate_run_id}.json`), + ), + ]) +} + +function scoreRefs(results: ScenarioExperimentResult[]): string[] { + return results.flatMap(result => [ + path.join('tests', 'evals', 'v2', 'scores', `${result.baseline_run_id}.scores.json`), + ...result.candidates.map(candidate => + path.join('tests', 'evals', 'v2', 'scores', `${candidate.candidate_run_id}.scores.json`), + ), + ]) +} + +function reportRefs(results: ScenarioExperimentResult[], experimentReport: string): string[] { + return [ + ...results.flatMap(result => + result.candidates.map(candidate => candidate.compare_report), + ), + experimentReport, + ].filter(Boolean) +} + function buildMarkdownReport(params: { experiment: EvalExperimentV21 results: ScenarioExperimentResult[] @@ -346,17 +432,20 @@ function buildMarkdownReport(params: { result.candidates.flatMap(candidate => candidate.gate_results), ) const hardFailures = allGateResults.filter( - result => result.rule_type === 'hard_fail' && !result.passed, + result => result.verdict === 'hard_fail', ) const softWarnings = allGateResults.filter( - result => result.rule_type === 'soft_warning' && !result.passed, + result => result.verdict === 'soft_warning', + ) + const missingOrInconclusive = allGateResults.filter( + result => result.verdict === 'missing' || result.verdict === 'inconclusive', ) const rows = results .flatMap(result => result.candidates.map(candidate => { const gateSummary = candidate.gate_results.length - ? 
`${candidate.gate_results.filter(gate => !gate.passed).length}/${candidate.gate_results.length} failed` + ? `${candidate.gate_results.filter(gate => gate.verdict !== 'pass').length}/${candidate.gate_results.length} not passed` : 'not configured' return `| ${result.scenario_id} | ${result.repeat_index} | ${result.baseline_run_id} | ${candidate.candidate_variant_id} | ${candidate.candidate_run_id} | ${gateSummary} | ${candidate.compare_report} |` }), @@ -369,7 +458,7 @@ function buildMarkdownReport(params: { : allGateResults .map( result => - `| ${result.scenario_id} | ${result.candidate_variant_id} | ${result.rule_type} | ${result.score_spec_id} | ${result.passed ? 'pass' : 'fail'} | ${result.regression_pct ?? 'n/a'} |`, + `| ${result.scenario_id} | ${result.candidate_variant_id} | ${result.rule_type} | ${result.score_spec_id} | ${result.verdict} | ${result.regression_pct ?? 'n/a'} |`, ) .join('\n') @@ -398,7 +487,8 @@ V2.1 intentionally does not execute the harness automatically. It turns existing - hard_failures: ${hardFailures.length} - soft_warnings: ${softWarnings.length} -- gate_status: ${hardFailures.length > 0 ? 'failed' : softWarnings.length > 0 ? 'warning' : 'passed'} +- missing_or_inconclusive: ${missingOrInconclusive.length} +- gate_status: ${hardFailures.length > 0 ? 'failed' : missingOrInconclusive.length > 0 ? 'inconclusive' : softWarnings.length > 0 ? 'warning' : 'passed'} ## Runs @@ -408,7 +498,7 @@ ${rows} ## Gate Results -| scenario | candidate_variant | rule_type | score_spec | result | regression_pct | +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | | --- | --- | --- | --- | --- | ---: | ${gateRows} ` @@ -441,6 +531,8 @@ async function main(): Promise { const scoreSpecs = await loadScoreSpecs() const gatePolicy = await loadGatePolicy(experiment.gate_policy_id) + const dbPath = typeof args.db === 'string' ? 
args.db : undefined + const snapshotDb = !Boolean(args['no-snapshot-db']) for (const scoreSpecId of experiment.score_spec_ids ?? []) { if (!scoreSpecs.has(scoreSpecId)) { throw new Error( @@ -463,6 +555,21 @@ async function main(): Promise { const repeatCount = Math.max(experiment.repeat_count ?? 1, 1) const results: ScenarioExperimentResult[] = [] + for (const scenarioId of scenarioIds) { + for (const variantId of [experiment.baseline_variant_id, ...experiment.candidate_variant_ids]) { + const userActionId = findBoundUserActionId({ + experiment, + scenarioId, + variantId, + }) + if (!userActionId) { + throw new Error( + `Missing action binding for scenario=${scenarioId}, variant=${variantId}. V2.1 bind_existing mode requires user_action_id bindings.`, + ) + } + } + } + for (const scenarioId of scenarioIds) { const baselineUserActionId = findBoundUserActionId({ experiment, @@ -483,6 +590,8 @@ async function main(): Promise { variantId: experiment.baseline_variant_id, userActionId: baselineUserActionId, scoreSpecIds: experiment.score_spec_ids ?? [], + dbPath, + snapshotDb, }), ) const baselineRunId = extractCreatedRunId(baselineOutput) @@ -510,6 +619,8 @@ async function main(): Promise { variantId: candidateVariantId, userActionId: candidateActionId, scoreSpecIds: experiment.score_spec_ids ?? 
[], + dbPath, + snapshotDb, }), ) const candidateRunId = extractCreatedRunId(candidateOutput) @@ -557,10 +668,43 @@ async function main(): Promise { `${experiment.experiment_id}_${runStamp}.json`, ) const outputJsonRel = path.relative(repoRoot, outputJsonPath) + const reportRoot = await resolveReportRoot() + await mkdir(reportRoot, { recursive: true }) + const outputMarkdownPath = path.join( + reportRoot, + `experiment_${experiment.experiment_id}_${runStamp}.md`, + ) + const outputMarkdownRel = path.relative(repoRoot, outputMarkdownPath) + const generatedAt = new Date().toISOString() + const gateVerdict = summarizeGate(results) + const warningMessages = results + .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) + .filter(result => result.verdict === 'soft_warning' || result.verdict === 'missing' || result.verdict === 'inconclusive') + .map( + result => + `${result.verdict}: scenario=${result.scenario_id}, candidate=${result.candidate_variant_id}, score=${result.score_spec_id}`, + ) + const errorMessages = results + .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) + .filter(result => result.verdict === 'hard_fail') + .map( + result => + `hard_fail: scenario=${result.scenario_id}, candidate=${result.candidate_variant_id}, score=${result.score_spec_id}`, + ) await writeFile( outputJsonPath, `${JSON.stringify( { + experiment_id: experiment.experiment_id, + manifest_ref: path.relative(repoRoot, experimentPath), + generated_at: generatedAt, + mode, + run_refs: runRefs(results), + score_refs: scoreRefs(results), + report_refs: reportRefs(results, outputMarkdownRel), + gate_verdict: gateVerdict, + errors: errorMessages, + warnings: warningMessages, experiment, runner: { mode, @@ -568,19 +712,13 @@ async function main(): Promise { gate_policy_id: experiment.gate_policy_id ?? 
null, }, results, - created_at: new Date().toISOString(), + created_at: generatedAt, }, null, 2, )}\n`, ) - const reportRoot = await resolveReportRoot() - await mkdir(reportRoot, { recursive: true }) - const outputMarkdownPath = path.join( - reportRoot, - `experiment_${experiment.experiment_id}_${runStamp}.md`, - ) await writeFile( outputMarkdownPath, buildMarkdownReport({ @@ -591,7 +729,7 @@ async function main(): Promise { ) console.log(`Created V2.1 experiment summary: ${outputJsonRel}`) - console.log(`Created V2.1 experiment report: ${path.relative(repoRoot, outputMarkdownPath)}`) + console.log(`Created V2.1 experiment report: ${outputMarkdownRel}`) } main().catch(error => { diff --git a/scripts/evals/v2_validate_experiment_artifacts.ts b/scripts/evals/v2_validate_experiment_artifacts.ts new file mode 100644 index 0000000000..f8e4e4666c --- /dev/null +++ b/scripts/evals/v2_validate_experiment_artifacts.ts @@ -0,0 +1,76 @@ +import { readFile, readdir } from 'node:fs/promises' +import path from 'node:path' + +type JsonRecord = Record + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const experimentRunsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'experiment-runs') +const gateStatuses = new Set(['pass', 'warning', 'fail', 'inconclusive']) + +async function readJson(filePath: string): Promise { + return JSON.parse(await readFile(filePath, 'utf8')) as JsonRecord +} + +function requireString(errors: string[], filePath: string, fieldName: string, value: unknown) { + if (typeof value !== 'string' || value.trim() === '') { + errors.push(`${filePath}.${fieldName} must be a non-empty string`) + } +} + +function requireArray(errors: string[], filePath: string, fieldName: string, value: unknown) { + if (!Array.isArray(value)) { + errors.push(`${filePath}.${fieldName} must be an array`) + } +} + +function requireNumber(errors: string[], objectName: string, fieldName: string, value: unknown) { + if (typeof value !== 'number') { + 
errors.push(`${objectName}.${fieldName} must be a number`) + } +} + +function validateArtifact(filePath: string, artifact: JsonRecord): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'experiment_id', artifact.experiment_id) + requireString(errors, filePath, 'manifest_ref', artifact.manifest_ref) + requireString(errors, filePath, 'generated_at', artifact.generated_at) + requireString(errors, filePath, 'mode', artifact.mode) + requireArray(errors, filePath, 'run_refs', artifact.run_refs) + requireArray(errors, filePath, 'score_refs', artifact.score_refs) + requireArray(errors, filePath, 'report_refs', artifact.report_refs) + requireArray(errors, filePath, 'errors', artifact.errors) + requireArray(errors, filePath, 'warnings', artifact.warnings) + + const gateVerdict = artifact.gate_verdict as JsonRecord | undefined + if (!gateVerdict || typeof gateVerdict !== 'object' || Array.isArray(gateVerdict)) { + errors.push(`${filePath}.gate_verdict must be an object`) + return errors + } + if (!gateStatuses.has(String(gateVerdict.status))) { + errors.push(`${filePath}.gate_verdict.status has invalid value: ${gateVerdict.status}`) + } + requireNumber(errors, `${filePath}.gate_verdict`, 'hard_fail_count', gateVerdict.hard_fail_count) + requireNumber(errors, `${filePath}.gate_verdict`, 'soft_warning_count', gateVerdict.soft_warning_count) + requireNumber(errors, `${filePath}.gate_verdict`, 'missing_score_count', gateVerdict.missing_score_count) + requireNumber(errors, `${filePath}.gate_verdict`, 'inconclusive_count', gateVerdict.inconclusive_count) + requireNumber(errors, `${filePath}.gate_verdict`, 'candidate_count', gateVerdict.candidate_count) + return errors +} + +const entries = await readdir(experimentRunsRoot, { withFileTypes: true }).catch(() => []) +const files = entries + .filter(entry => entry.isFile() && entry.name.endsWith('.json')) + .map(entry => path.join(experimentRunsRoot, entry.name)) + +const errors: string[] = [] +for (const 
filePath of files) { + errors.push(...validateArtifact(filePath, await readJson(filePath))) +} + +if (errors.length > 0) { + console.error('V2 experiment artifact schema validation failed:') + for (const error of errors) console.error(`- ${error}`) + process.exit(1) +} + +console.log(`V2 experiment artifact schema validation passed: ${files.length} file(s).`) diff --git a/scripts/evals/v2_verify_bind_runner.ts b/scripts/evals/v2_verify_bind_runner.ts new file mode 100644 index 0000000000..84a13a50c9 --- /dev/null +++ b/scripts/evals/v2_verify_bind_runner.ts @@ -0,0 +1,431 @@ +import { spawnSync } from 'node:child_process' +import { mkdir, readFile, rm, unlink, writeFile } from 'node:fs/promises' +import path from 'node:path' + +type JsonRecord = Record + +interface VerifyCase { + case_id: string + description: string + manifest: JsonRecord + expect: 'success' | 'failure' + expected_error?: string + db_path?: string + no_snapshot_db?: boolean +} + +interface VerifyResult { + case_id: string + description: string + passed: boolean + expected: 'success' | 'failure' + status: number | null + summary_ref?: string + report_ref?: string + artifacts_cleaned?: boolean + error_excerpt?: string +} + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') +const stamp = new Date().toISOString().replace(/[:.]/g, '') +const tempRoot = path.join(repoRoot, '.observability', 'v2-runner-verification', stamp) +const manifestsRoot = path.join(tempRoot, 'manifests') +const reportsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'verification-reports') + +const baselineActionId = '1d5eb5e1-2fe0-42fa-9450-7b05d6367976' +const candidateActionId = 'dbf9fae1-0a5a-4f50-aba7-02047ced9390' +const missingRootActionId = 'v2-verify-missing-root-action' +const nonexistentActionId = '00000000-0000-0000-0000-000000000000' + +const scoreSpecIds = [ + 'task_success.main_chain_observed', + 
'efficiency.total_billed_tokens', + 'decision_quality.subagent_count_observed', + 'stability.recovery_absence', + 'controllability.turn_limit_basic', +] + +function experiment(params: { + id: string + scenarioIds: string[] + candidateVariantIds: string[] + bindings: Array + scoreSpecIds?: string[] + gatePolicyId?: string + mode?: 'bind_existing' | 'execute_harness' +}): JsonRecord { + return { + experiment_id: params.id, + name: params.id, + goal: 'V2.1 bind_existing runner verification case.', + baseline_variant_id: 'baseline_default', + candidate_variant_ids: params.candidateVariantIds, + scenario_set_id: 'v2_1_verify', + scenario_ids: params.scenarioIds, + repeat_count: 1, + score_spec_ids: params.scoreSpecIds ?? scoreSpecIds, + gate_policy_id: params.gatePolicyId ?? 'default_v2_1_gate', + mode: params.mode ?? 'bind_existing', + action_bindings: params.bindings, + status: 'ready', + } +} + +function bindingsFor(params: { + scenarioIds: string[] + candidateVariantIds: string[] + baselineActionId?: string + candidateActionId?: string +}): JsonRecord[] { + return params.scenarioIds.flatMap(scenarioId => [ + { + scenario_id: scenarioId, + variant_id: 'baseline_default', + entry_user_action_id: params.baselineActionId ?? baselineActionId, + }, + ...params.candidateVariantIds.map(variantId => ({ + scenario_id: scenarioId, + variant_id: variantId, + entry_user_action_id: params.candidateActionId ?? 
candidateActionId, + })), + ]) +} + +async function writeJson(filePath: string, value: unknown): Promise { + await mkdir(path.dirname(filePath), { recursive: true }) + await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') +} + +function runBun(args: string[]) { + return spawnSync('bun', ['run', ...args], { + cwd: repoRoot, + encoding: 'utf8', + }) +} + +function extractOutputRef(output: string, label: string): string | undefined { + const match = output.match(new RegExp(`${label}:\\s*(.+)`)) + return match?.[1]?.trim() +} + +function relToAbs(ref: string): string { + return path.isAbsolute(ref) ? ref : path.resolve(repoRoot, ref) +} + +async function removeIfExists(filePath: string): Promise { + await unlink(filePath).catch(() => undefined) +} + +async function cleanupGeneratedArtifacts(summaryRef?: string): Promise { + if (!summaryRef) return + const summaryPath = relToAbs(summaryRef) + const summary = JSON.parse(await readFile(summaryPath, 'utf8')) as { + run_refs?: string[] + score_refs?: string[] + report_refs?: string[] + } + const runReportRefs = (summary.run_refs ?? []).map(runRef => { + const runId = path.basename(runRef, '.json') + return path.join( + 'ObservrityTask', + '10-系统版本', + 'v2', + '06-运行报告', + `${runId}.md`, + ) + }) + const refs = [ + ...(summary.run_refs ?? []), + ...(summary.score_refs ?? []), + ...(summary.report_refs ?? 
[]), + ...runReportRefs, + summaryRef, + ] + for (const ref of refs) { + await removeIfExists(relToAbs(ref)) + } +} + +function assertExperimentArtifactSchema(summary: JsonRecord): string[] { + const errors: string[] = [] + const requiredStrings = ['experiment_id', 'manifest_ref', 'generated_at', 'mode'] + for (const field of requiredStrings) { + if (typeof summary[field] !== 'string' || String(summary[field]).trim() === '') { + errors.push(`${field} must be a non-empty string`) + } + } + for (const field of ['run_refs', 'score_refs', 'report_refs', 'errors', 'warnings']) { + if (!Array.isArray(summary[field])) errors.push(`${field} must be an array`) + } + const gateVerdict = summary.gate_verdict as JsonRecord | undefined + if (!gateVerdict || typeof gateVerdict !== 'object') { + errors.push('gate_verdict must be an object') + } else if (!['pass', 'warning', 'fail', 'inconclusive'].includes(String(gateVerdict.status))) { + errors.push('gate_verdict.status has invalid value') + } + return errors +} + +async function createMissingRootDb(): Promise { + const dbPath = path.join(tempRoot, 'missing-root.duckdb') + const sql = [ + 'CREATE TABLE user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, ended_at VARCHAR, total_billed_tokens BIGINT);', + `INSERT INTO user_actions VALUES ('2026-04-29', '${missingRootActionId}', '2026-04-29T00:00:00.000Z', '2026-04-29T00:00:01.000Z', 1);`, + 'CREATE TABLE queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR);', + ].join(' ') + const result = spawnSync(duckdbExe, [dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + throw new Error(String(result.stderr || result.stdout || result.error?.message)) + } + return dbPath +} + +async function runCase(testCase: VerifyCase): Promise { + const manifestPath = path.join(manifestsRoot, `${testCase.case_id}.json`) + await writeJson(manifestPath, testCase.manifest) + const args = 
['scripts/evals/v2_run_experiment.ts', '--experiment', manifestPath] + if (testCase.db_path) args.push('--db', testCase.db_path) + if (testCase.no_snapshot_db) args.push('--no-snapshot-db') + + const result = runBun(args) + const output = [String(result.stdout ?? '').trim(), String(result.stderr ?? '').trim()] + .filter(Boolean) + .join('\n') + const summaryRef = extractOutputRef(output, 'Created V2.1 experiment summary') + const reportRef = extractOutputRef(output, 'Created V2.1 experiment report') + + if (testCase.expect === 'failure') { + const hasExpectedError = + result.status !== 0 && + (!testCase.expected_error || output.includes(testCase.expected_error)) + return { + case_id: testCase.case_id, + description: testCase.description, + passed: hasExpectedError, + expected: testCase.expect, + status: result.status, + error_excerpt: output.slice(0, 500), + } + } + + let passed = result.status === 0 && Boolean(summaryRef) + let errorExcerpt = '' + if (summaryRef) { + const summary = JSON.parse(await readFile(relToAbs(summaryRef), 'utf8')) as JsonRecord + const schemaErrors = assertExperimentArtifactSchema(summary) + if (schemaErrors.length > 0) { + passed = false + errorExcerpt = schemaErrors.join('; ') + } + await cleanupGeneratedArtifacts(summaryRef) + } + + return { + case_id: testCase.case_id, + description: testCase.description, + passed, + expected: testCase.expect, + status: result.status, + summary_ref: summaryRef, + report_ref: reportRef, + artifacts_cleaned: Boolean(summaryRef), + error_excerpt: errorExcerpt || output.slice(0, 500), + } +} + +async function main(): Promise { + await mkdir(manifestsRoot, { recursive: true }) + await mkdir(reportsRoot, { recursive: true }) + const missingRootDb = await createMissingRootDb() + + const cases: VerifyCase[] = [ + { + case_id: 'single_scenario_single_candidate', + description: 'Single scenario plus one candidate should complete.', + expect: 'success', + manifest: experiment({ + id: 
`v2_1_verify_single_candidate_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + }), + }), + }, + { + case_id: 'single_scenario_multi_candidate', + description: 'Single scenario plus multiple candidates should complete.', + expect: 'success', + manifest: experiment({ + id: `v2_1_verify_multi_candidate_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: [ + 'candidate_session_memory_sparse', + 'candidate_tool_router_v2', + ], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: [ + 'candidate_session_memory_sparse', + 'candidate_tool_router_v2', + ], + }), + }), + }, + { + case_id: 'multi_scenario_single_candidate', + description: 'Multiple scenarios plus one candidate should complete.', + expect: 'success', + manifest: experiment({ + id: `v2_1_verify_multi_scenario_${stamp}`, + scenarioIds: ['cost_sensitive_task', 'tool_choice_sensitive'], + candidateVariantIds: ['candidate_session_memory_sparse'], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task', 'tool_choice_sensitive'], + candidateVariantIds: ['candidate_session_memory_sparse'], + }), + }), + }, + { + case_id: 'missing_action_binding', + description: 'Missing candidate action binding should fail clearly.', + expect: 'failure', + expected_error: 'Missing action binding', + manifest: experiment({ + id: `v2_1_verify_missing_binding_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + bindings: [ + { + scenario_id: 'cost_sensitive_task', + variant_id: 'baseline_default', + entry_user_action_id: baselineActionId, + }, + ], + }), + }, + { + case_id: 'nonexistent_user_action_id', + description: 'Nonexistent V1 user_action_id should fail.', + expect: 'failure', + expected_error: 'user_action_id not 
found', + manifest: experiment({ + id: `v2_1_verify_missing_action_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + baselineActionId: nonexistentActionId, + }), + }), + }, + { + case_id: 'root_query_missing', + description: 'V1 action without main_thread root query should fail.', + expect: 'failure', + expected_error: 'Fact-only binding failed', + db_path: missingRootDb, + no_snapshot_db: true, + manifest: experiment({ + id: `v2_1_verify_missing_root_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + baselineActionId: missingRootActionId, + candidateActionId: missingRootActionId, + }), + }), + }, + { + case_id: 'missing_score_spec_id', + description: 'Missing score_spec_id should fail before run creation.', + expect: 'failure', + expected_error: 'Experiment references missing score_spec_id', + manifest: experiment({ + id: `v2_1_verify_missing_score_spec_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + scoreSpecIds: ['not.real.score'], + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + }), + }), + }, + { + case_id: 'missing_gate_policy_id', + description: 'Missing gate_policy_id should fail before run creation.', + expect: 'failure', + expected_error: 'Experiment references missing gate_policy_id', + manifest: experiment({ + id: `v2_1_verify_missing_gate_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + gatePolicyId: 'not_real_gate', + bindings: bindingsFor({ + scenarioIds: 
['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + }), + }), + }, + { + case_id: 'execute_harness_blocked', + description: 'execute_harness mode should fail with the explicit adapter error.', + expect: 'failure', + expected_error: + 'execute_harness mode is not implemented yet: missing headless harness execution adapter', + manifest: experiment({ + id: `v2_1_verify_execute_harness_${stamp}`, + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + mode: 'execute_harness', + bindings: bindingsFor({ + scenarioIds: ['cost_sensitive_task'], + candidateVariantIds: ['candidate_session_memory_sparse'], + }), + }), + }, + ] + + const results: VerifyResult[] = [] + for (const testCase of cases) { + results.push(await runCase(testCase)) + } + + const failed = results.filter(result => !result.passed) + const report = { + verification_id: `v2_1_bind_runner_${stamp}`, + generated_at: new Date().toISOString(), + temp_root: path.relative(repoRoot, tempRoot), + passed: failed.length === 0, + case_count: results.length, + failed_count: failed.length, + results, + } + const reportPath = path.join(reportsRoot, `v2_1_bind_runner_${stamp}.json`) + await writeJson(reportPath, report) + await rm(tempRoot, { recursive: true, force: true }).catch(() => undefined) + + console.log(`Created V2.1 verification report: ${path.relative(repoRoot, reportPath)}`) + if (failed.length > 0) { + for (const result of failed) { + console.error(`FAILED ${result.case_id}: ${result.error_excerpt ?? ''}`) + } + process.exit(1) + } + console.log(`V2.1 bind runner verification passed: ${results.length}/${results.length}`) +} + +main().catch(async error => { + await rm(tempRoot, { recursive: true, force: true }).catch(() => undefined) + console.error(error instanceof Error ? 
error.message : error) + process.exit(1) +}) diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index f89f322c0e..e57d4a3399 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -16,6 +16,8 @@ Structure: - regression gate policies - `experiment-runs/` - generated experiment-level summaries +- `verification-reports/` + - generated V2.1 runner verification summaries - `scores/` - optional manual review or exported score artifacts - `runs/` @@ -63,6 +65,18 @@ Validate manifests: bun run scripts/evals/v2_validate_manifests.ts ``` +Run the V2.1 bind runner verification suite: + +```powershell +bun run scripts/evals/v2_verify_bind_runner.ts +``` + +Validate generated experiment artifact schema: + +```powershell +bun run scripts/evals/v2_validate_experiment_artifacts.ts +``` + Run the current sample V2.1 experiment: ```powershell @@ -71,6 +85,12 @@ bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. Instead, it binds existing V1 `user_action_id` traces into V2 runs, records score-spec-backed scores, compares baseline vs candidate, applies the configured gate policy, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. +Detailed V2.1 usage: + +```text +tests/evals/v2/V2.1-bind_existing-usage.md +``` + `execute_harness` is reserved but intentionally blocked until a stable headless harness execution adapter exists. 
If a manifest uses that mode now, the runner exits with: ```text diff --git a/tests/evals/v2/V2.1-bind_existing-usage.md b/tests/evals/v2/V2.1-bind_existing-usage.md new file mode 100644 index 0000000000..d4b334aae8 --- /dev/null +++ b/tests/evals/v2/V2.1-bind_existing-usage.md @@ -0,0 +1,178 @@ +# V2.1 bind_existing 使用说明 + +## 理解清单 + +- V2.1 当前稳定入口是 `bind_existing`。 +- `bind_existing` 不会自动启动 harness,也不会自动发送 prompt。 +- 它只把你已经真实跑出来的 V1 `user_action_id` 绑定成 V2 run,再自动生成 score、compare、gate、experiment summary。 + +## 预期效果 + +你可以用一组固定 scenario,对比 baseline 和 candidate 的真实运行证据: + +```text +真实运行 baseline -> 得到 baseline user_action_id +真实运行 candidate -> 得到 candidate user_action_id +填写 experiment manifest +运行 validator +运行 runner +阅读 report 和 gate verdict +``` + +## 设计思路 + +V2.1 先保证实验证据可追溯。只要没有稳定 headless harness adapter,就不自动执行 harness,避免把“无法确认的执行过程”伪装成正式评测结果。 + +## 1. 创建 Experiment Manifest + +在 `tests/evals/v2/experiments/` 下创建一个 JSON,例如: + +```json +{ + "experiment_id": "my_candidate_vs_default", + "name": "My Candidate vs Default", + "goal": "Check whether my candidate reduces cost without hurting trace-backed success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_first_batch", + "scenario_ids": ["cost_sensitive_task"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "" + } + ], + "status": "ready" +} +``` + +## 2. 
填写 action_bindings + +推荐格式是扁平绑定: + +```text +scenario_id + variant_id + entry_user_action_id +``` + +含义: + +| field | meaning | +| --- | --- | +| `scenario_id` | 这条 V1 trace 对应哪个评测场景。 | +| `variant_id` | 这条 V1 trace 对应 baseline 还是某个 candidate。 | +| `entry_user_action_id` | V1 可观测系统里的真实用户动作 ID。 | + +一个 scenario 有 1 个 baseline 和 N 个 candidate,就需要 N+1 条 binding。 + +## 3. 运行 Validator + +```powershell +bun run scripts/evals/v2_validate_manifests.ts +``` + +validator 会检查: + +- scenario 是否存在。 +- variant 是否存在。 +- score_spec 是否存在。 +- gate_policy 是否存在。 +- `bind_existing` 是否覆盖了每个 `scenario × variant`。 + +## 4. 运行 Runner + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment my_candidate_vs_default +``` + +当前 runner 默认会通过 DB snapshot 读取 V1 DuckDB,减少 dashboard watcher 占用数据库导致的失败。 + +## 5. 查看 Report + +主要输出位置: + +| path | content | +| --- | --- | +| `tests/evals/v2/runs/` | 每个 scenario/variant 的 V2 run 记录。 | +| `tests/evals/v2/scores/` | 每个 run 的正式 score artifact。 | +| `tests/evals/v2/experiment-runs/` | experiment-level JSON summary。 | +| `ObservrityTask/10-系统版本/v2/06-运行报告/` | 面向人工阅读的 run / compare / experiment Markdown report。 | + +优先看 `experiment-runs/*.json` 的顶层字段: + +- `run_refs` +- `score_refs` +- `report_refs` +- `gate_verdict` +- `errors` +- `warnings` + +## 6. 解释 Gate Verdict + +| status | meaning | +| --- | --- | +| `pass` | 没有 hard fail、soft warning、missing score、inconclusive。 | +| `warning` | 没有 hard fail,但存在 soft warning。 | +| `fail` | 至少一个 candidate 触发 hard fail。 | +| `inconclusive` | 没有 hard fail,但存在 missing score 或无法判断。 | + +不要只看成本下降。至少同时看: + +- `task_success.main_chain_observed` +- `efficiency.total_billed_tokens` +- `decision_quality.subagent_count_observed` +- `stability.recovery_absence` +- `controllability.turn_limit_basic` + +## 7. 
运行回归验证 + +```powershell +bun run scripts/evals/v2_verify_bind_runner.ts +``` + +该脚本覆盖: + +- 单 scenario + 单 candidate +- 单 scenario + 多 candidate +- 多 scenario + 单 candidate +- 缺失 action_binding +- 不存在的 user_action_id +- root query 缺失 +- 不存在的 score_spec_id +- 不存在的 gate_policy_id +- `execute_harness` 明确报错路径 + +脚本会清理自己生成的 run/score/report 临时 artifacts,只保留 verification report。 + +## 8. 为什么 execute_harness 当前不可用 + +`execute_harness` 需要稳定的 headless harness execution adapter。当前仓库还没有一个可以可靠完成以下动作的入口: + +- 自动应用 variant。 +- 自动发送 scenario prompt。 +- 自动等待执行完成。 +- 自动捕获本次新增的 `user_action_id`。 +- 自动保证这条 trace 和当前 run 一一对应。 + +因此 V2.1 明确阻塞该模式: + +```text +execute_harness mode is not implemented yet: missing headless harness execution adapter +``` + +这不是缺陷,而是当前阶段的安全边界。 diff --git a/tests/evals/v2/experiment-runs/README.md b/tests/evals/v2/experiment-runs/README.md new file mode 100644 index 0000000000..d4560c8229 --- /dev/null +++ b/tests/evals/v2/experiment-runs/README.md @@ -0,0 +1,67 @@ +# V2.1 Experiment Artifact Schema + +## 理解清单 + +- 本目录保存 experiment-level JSON summary。 +- 这些 JSON 是 V2.1 runner 的稳定回归证据。 +- V2.1-stable 要求每个新 summary 都包含固定顶层 schema,不能只依赖历史的 `experiment/results` 内部结构。 + +## 预期效果 + +读取任意 `tests/evals/v2/experiment-runs/*.json` 时,应能快速回答: + +- 这次实验来自哪个 manifest。 +- 使用的是哪个 mode。 +- 生成了哪些 run / score / report artifact。 +- gate 最终是 pass、warning、fail 还是 inconclusive。 +- 是否存在错误或警告。 + +## 设计思路 + +顶层字段用于机器读取和回归判断;保留 `experiment`、`runner`、`results` 用于人工追溯和向后兼容。 + +## Required Top-Level Fields + +| field | type | meaning | +| --- | --- | --- | +| `experiment_id` | string | 实验 ID,来自 manifest。 | +| `manifest_ref` | string | 本次 runner 读取的 manifest 路径。 | +| `generated_at` | ISO timestamp string | summary 生成时间。 | +| `mode` | string | 当前只允许 `bind_existing`;`execute_harness` 会被明确阻塞。 | +| `run_refs` | string[] | 本次生成的 V2 run JSON 路径。 | +| `score_refs` | string[] | 本次生成的 score JSON 路径。 | +| `report_refs` | string[] | 本次生成的 compare / experiment Markdown report 路径。 | +| `gate_verdict` 
| object | 聚合后的 gate 结论。 | +| `errors` | string[] | hard fail 或 runner 级错误摘要。成功但 gate hard fail 时也可非空。 | +| `warnings` | string[] | soft warning、missing score、inconclusive 等非阻塞问题。 | + +## Gate Verdict Shape + +```json +{ + "status": "pass", + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1 +} +``` + +`status` 的优先级: + +1. 任意 hard fail => `fail` +2. 任意 missing score 或 inconclusive => `inconclusive` +3. 任意 soft warning => `warning` +4. 其他情况 => `pass` + +## Backward Compatibility + +V2.1 仍保留以下字段: + +- `experiment` +- `runner` +- `results` +- `created_at` + +这些字段可以用于人工阅读,但新脚本应优先依赖顶层稳定 schema。 diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json index ff6e644d21..66f9171093 100644 --- a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json +++ b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json @@ -1,4 +1,30 @@ { + "experiment_id": "session_memory_sparse_vs_default", + "manifest_ref": "tests/evals/v2/experiments/session_memory_sparse_vs_default.json", + "generated_at": "2026-04-27T10:55:24.753Z", + "mode": "bind_existing", + "run_refs": [ + "tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json", + "tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json" + ], + "score_refs": [ + "tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json", + "tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json" + ], + "report_refs": [ + 
"ObservrityTask/10-系统版本/v2/06-运行报告/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" + ], + "gate_verdict": { + "status": "pass", + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1 + }, + "errors": [], + "warnings": [], "experiment": { "experiment_id": "session_memory_sparse_vs_default", "name": "Session Memory Sparse vs Default", diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json index 0083e353e6..aaeb095f38 100644 --- a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json +++ b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json @@ -1,4 +1,30 @@ { + "experiment_id": "session_memory_sparse_vs_default", + "manifest_ref": "tests/evals/v2/experiments/session_memory_sparse_vs_default.json", + "generated_at": "2026-04-28T16:29:12.803Z", + "mode": "bind_existing", + "run_refs": [ + "tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json", + "tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json" + ], + "score_refs": [ + "tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json", + "tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json" + ], + "report_refs": [ + 
"ObservrityTask/10-系统版本/v2/06-运行报告/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" + ], + "gate_verdict": { + "status": "pass", + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1 + }, + "errors": [], + "warnings": [], "experiment": { "experiment_id": "session_memory_sparse_vs_default", "name": "Session Memory Sparse vs Default", diff --git a/tests/evals/v2/gates/README.md b/tests/evals/v2/gates/README.md new file mode 100644 index 0000000000..18b4245968 --- /dev/null +++ b/tests/evals/v2/gates/README.md @@ -0,0 +1,62 @@ +# V2.1 Gate Semantics + +## 理解清单 + +- gate 不是 scorer;gate 只解释 baseline 和 candidate 的 score 差异。 +- gate policy 定义 hard fail 和 soft warning。 +- runner 负责把每个 candidate 的 gate result 汇总成 experiment-level verdict。 + +## 预期效果 + +读 `gate_verdict.status` 时,应能得到稳定含义: + +- `pass`:没有 hard fail、soft warning、missing score、inconclusive。 +- `warning`:没有 hard fail,但至少有 soft warning。 +- `fail`:至少有一个 hard fail。 +- `inconclusive`:没有 hard fail,但存在 missing score 或无法判断的规则。 + +## 设计思路 + +V2.1 的 gate 语义要保守。缺失 score 不应被当作 pass;无法判断时应暴露为 `inconclusive`。 + +## Rule Types + +| rule type | meaning | effect | +| --- | --- | --- | +| `hard_fail` | 不可接受的退化 | 任意触发时,experiment verdict 为 `fail`。 | +| `soft_warning` | 需要人工注意的退化 | 没有 hard fail 时,experiment verdict 为 `warning`。 | + +## Missing Score + +如果某条 gate rule 需要的 baseline 或 candidate score 缺失: + +- 该 rule 的 verdict 是 `missing`。 +- experiment `missing_score_count` 加 1。 +- 如果没有 hard fail,则 experiment status 为 `inconclusive`。 + +## Inconclusive + +如果 gate rule 无法被当前 runner 解释,或 score spec 不足以计算方向: + +- 该 rule 的 verdict 是 `inconclusive`。 +- experiment `inconclusive_count` 加 1。 +- 如果没有 hard fail,则 experiment status 为 
`inconclusive`。 + +## Multi-Candidate Summary + +多 candidate 时,runner 按所有 candidate 的 gate results 汇总: + +- 任一 candidate hard fail => 总 verdict `fail`。 +- 无 hard fail,但任一 candidate missing/inconclusive => 总 verdict `inconclusive`。 +- 无 hard fail/missing/inconclusive,但任一 candidate soft warning => 总 verdict `warning`。 +- 所有 candidate 都 pass => 总 verdict `pass`。 + +## Current Supported Conditions + +V2.1 runner 当前支持以下 condition 模式: + +- `candidate < baseline` +- `candidate_regression_pct > ` +- `candidate_regression_pct > and task_success_not_improved` + +更复杂的 gate condition 应先写成文档和测试,再扩展 runner,不应默默当作 pass。 diff --git a/tests/evals/v2/score-specs/README.md b/tests/evals/v2/score-specs/README.md new file mode 100644 index 0000000000..e880eebfcd --- /dev/null +++ b/tests/evals/v2/score-specs/README.md @@ -0,0 +1,57 @@ +# V2.1 ScoreSpec And Scorer Mapping + +## 理解清单 + +- `score-specs/*.json` 定义“哪些分数是正式分数”。 +- `scripts/evals/v2_record_run.ts` 目前负责实际计算这些分数。 +- V2.1 当前不是公式解释器;score formula 仍由脚本中的 scorer implementation 实现。 + +## 预期效果 + +当 experiment manifest 声明 `score_spec_ids` 时: + +- 每个声明的 `score_spec_id` 必须有对应 scorer。 +- runner 只输出 manifest 声明过的 score。 +- 如果声明了没有实现的 score,`v2_record_run.ts` 必须失败。 +- 未声明的临时 score 不得进入正式 score artifact。 + +## 设计思路 + +V2.1 先固化 contract,再逐步演进实现。当前 contract 是: + +```text +score_spec_id -> implemented scorer in scripts/evals/v2_record_run.ts +``` + +后续可以把公式解析、规则执行、外部 scorer registry 拆出去,但本轮不做。 + +## Current Mapping + +| score_spec_id | implementation | data source | current boundary | +| --- | --- | --- | --- | +| `task_success.main_chain_observed` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `queries` + run binding | 判断是否存在 `main_thread` root query。 | +| `efficiency.total_billed_tokens` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `user_actions.total_billed_tokens` | 只记录事实值,不单独判断好坏。 | +| `decision_quality.subagent_count_observed` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `subagents` | 
只记录数量事实;是否好坏交给 compare/gate 结合任务成功判断。 | +| `stability.recovery_absence` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `recoveries` | 无 recovery 为 1,有 recovery 为 0。 | +| `controllability.turn_limit_basic` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `queries.turn_count` + scenario limit | 当前使用 scenario `max_turn_count`,缺省为 8。 | + +## Not Formal In V2.1 + +`v2_record_run.ts` 内部还能计算一些辅助分数,例如: + +- `decision_quality.expected_tool_hit_rate` +- `efficiency.total_billed_token_budget` +- `stability.v1_closure_health` +- `controllability.subagent_count_budget` + +这些只有在 experiment manifest 的 `score_spec_ids` 中显式声明并有 score-spec 文件支持时,才应进入正式 experiment score artifact。 + +## Failure Rules + +- experiment 引用不存在的 `score_spec_id`:runner 失败。 +- score-spec 存在但 scorer 未实现:record_run 失败。 +- scorer 产生了未声明 score:runner 通过 `--score-spec-ids` 过滤,不写入正式 score artifact。 + +## V2.1 Boundary + +当前 `formula` 字段是解释说明,不是自动执行语言。V2.1-stable 的重点是让 score contract 可验证,而不是实现通用公式引擎。 diff --git a/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json b/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json new file mode 100644 index 0000000000..2a8f9ac60a --- /dev/null +++ b/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json @@ -0,0 +1,91 @@ +{ + "verification_id": "v2_1_bind_runner_2026-04-29T072125437Z", + "generated_at": "2026-04-29T07:22:28.161Z", + "temp_root": ".observability\\v2-runner-verification\\2026-04-29T072125437Z", + "passed": true, + "case_count": 9, + "failed_count": 0, + "results": [ + { + "case_id": "single_scenario_single_candidate", + "description": "Single scenario plus one candidate should complete.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.json", + "report_ref": 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.md" + }, + { + "case_id": "single_scenario_multi_candidate", + "description": "Single scenario plus multiple candidates should complete.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.md" + }, + { + "case_id": "multi_scenario_single_candidate", + "description": "Multiple scenarios plus one candidate should complete.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.json\nCreated V2.1 experiment report: 
ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.md" + }, + { + "case_id": "missing_action_binding", + "description": "Missing candidate action binding should fail clearly.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Missing action binding for scenario=cost_sensitive_task, variant=candidate_session_memory_sparse. V2.1 bind_existing mode requires user_action_id bindings." + }, + { + "case_id": "nonexistent_user_action_id", + "description": "Nonexistent V1 user_action_id should fail.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Command failed: bun run scripts/evals/v2_record_run.ts --scenario cost_sensitive_task --variant baseline_default --user-action-id 00000000-0000-0000-0000-000000000000 --snapshot-db --score-spec-ids task_success.main_chain_observed,efficiency.total_billed_tokens,decision_quality.subagent_count_observed,stability.recovery_absence,controllability.turn_limit_basic\nuser_action_id not found: 00000000-0000-0000-0000-000000000000" + }, + { + "case_id": "root_query_missing", + "description": "V1 action without main_thread root query should fail.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Command failed: bun run scripts/evals/v2_record_run.ts --scenario cost_sensitive_task --variant baseline_default --user-action-id v2-verify-missing-root-action --db E:\\claude-code-transparent\\.observability\\v2-runner-verification\\2026-04-29T072125437Z\\missing-root.duckdb --score-spec-ids task_success.main_chain_observed,efficiency.total_billed_tokens,decision_quality.subagent_count_observed,stability.recovery_absence,controllability.turn_limit_basic\nFact-only binding failed: user_action_id=v2-ve" + }, + { + "case_id": "missing_score_spec_id", + "description": "Missing score_spec_id should fail before run creation.", + "passed": true, + "expected": "failure", + "status": 1, + 
"error_excerpt": "Experiment references missing score_spec_id: not.real.score" + }, + { + "case_id": "missing_gate_policy_id", + "description": "Missing gate_policy_id should fail before run creation.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Experiment references missing gate_policy_id: not_real_gate" + }, + { + "case_id": "execute_harness_blocked", + "description": "execute_harness mode should fail with the explicit adapter error.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "execute_harness mode is not implemented yet: missing headless harness execution adapter" + } + ] +} From 4f3bfd6e176f55f271ca2a533050d5ffc8b81856 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Wed, 29 Apr 2026 22:36:14 +0800 Subject: [PATCH 12/26] Solidify observability v2 score registry --- scripts/evals/v2_record_run.ts | 198 +--------------------- scripts/evals/v2_score_registry.ts | 225 +++++++++++++++++++++++++ scripts/evals/v2_validate_manifests.ts | 7 + tests/evals/v2/score-specs/README.md | 24 +-- 4 files changed, 247 insertions(+), 207 deletions(-) create mode 100644 scripts/evals/v2_score_registry.ts diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index 3987e288dc..fe13180cce 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -9,6 +9,7 @@ import type { EvalScore, EvalVariant, } from '../../src/observability/v2/evalTypes' +import { buildScoresForSpecIds } from './v2_score_registry' type JsonRecord = Record @@ -173,187 +174,6 @@ async function loadVariant(variantId: string): Promise { throw new Error(`Variant not found: ${variantId}`) } -function scoreLabel(value: number): string { - if (value >= 1) return 'pass' - if (value > 0) return 'partial' - return 'fail' -} - -function scoreKey(score: EvalScore): string { - return `${score.dimension}.${score.subdimension}` -} - -function buildScores(params: { - runId: string - scenario: 
EvalScenario - action: JsonRecord - rootQuery: JsonRecord | undefined - integrity: JsonRecord | undefined - tools: JsonRecord[] - subagents: JsonRecord[] - recoveries: JsonRecord[] -}): EvalScore[] { - const { - runId, - scenario, - action, - rootQuery, - integrity, - tools, - subagents, - recoveries, - } = params - - const expectedTools = new Set(scenario.expected_tools) - const observedTools = new Set(tools.map(t => asString(t.tool_name))) - const expectedToolHitRate = - expectedTools.size === 0 - ? null - : [...expectedTools].filter(tool => observedTools.has(tool)).length / - expectedTools.size - - const closureValues = [ - integrity?.strict_query_completion_rate, - integrity?.strict_turn_state_closure_rate, - integrity?.tool_lifecycle_closure_rate, - integrity?.subagent_lifecycle_closure_rate, - ].map(asNumber) - const closureHealth = - closureValues.length === 0 - ? 0 - : closureValues.reduce((sum, value) => sum + value, 0) / - closureValues.length - - const maxTurnCount = asNumber(rootQuery?.turn_count) - const turnLimit = scenario.max_turn_count ?? 8 - const maxTurnScore = maxTurnCount > 0 && maxTurnCount <= turnLimit ? 1 : 0 - const billedLimit = scenario.max_total_billed_tokens - const billedTokens = asNumber(action.total_billed_tokens) - const billedBudgetScore = - billedLimit === undefined ? null : billedTokens <= billedLimit ? 1 : 0 - const subagentLimit = scenario.max_subagent_count - const subagentCount = subagents.reduce( - (sum, subagent) => sum + asNumber(subagent.subagent_count), - 0, - ) - const subagentBudgetScore = - subagentLimit === undefined ? null : subagentCount <= subagentLimit ? 1 : 0 - const recoveryScore = recoveries.length === 0 ? 1 : 0 - - return [ - { - score_id: `${runId}_task_success_main_chain_observed`, - run_id: runId, - dimension: 'task_success', - subdimension: 'main_chain_observed', - score_value: rootQuery ? 1 : 0, - score_label: rootQuery ? 'pass' : 'fail', - evidence_ref: 'queries', - reason: rootQuery - ? 
'Main-thread root query is present in V1 evidence.' - : 'No main-thread root query found for this user_action_id.', - }, - { - score_id: `${runId}_decision_quality_expected_tool_hit_rate`, - run_id: runId, - dimension: 'decision_quality', - subdimension: 'expected_tool_hit_rate', - score_value: expectedToolHitRate, - score_label: - expectedToolHitRate === null ? 'not_applicable' : scoreLabel(expectedToolHitRate), - evidence_ref: 'tools', - reason: - expectedToolHitRate === null - ? 'Scenario has no expected_tools yet.' - : `Observed ${observedTools.size} tool names against ${expectedTools.size} expected tools.`, - }, - { - score_id: `${runId}_efficiency_total_billed_tokens`, - run_id: runId, - dimension: 'efficiency', - subdimension: 'total_billed_tokens', - score_value: asNumber(action.total_billed_tokens), - score_label: 'observed', - evidence_ref: 'user_actions.total_billed_tokens', - reason: 'Raw efficiency fact from V1 user_actions.', - }, - { - score_id: `${runId}_efficiency_total_billed_token_budget`, - run_id: runId, - dimension: 'efficiency', - subdimension: 'total_billed_token_budget', - score_value: billedBudgetScore, - score_label: - billedBudgetScore === null ? 'not_applicable' : scoreLabel(billedBudgetScore), - evidence_ref: 'user_actions.total_billed_tokens', - reason: - billedLimit === undefined - ? 'Scenario has no max_total_billed_tokens budget.' 
- : `total_billed_tokens=${billedTokens}; budget=${billedLimit}.`, - }, - { - score_id: `${runId}_stability_v1_closure_health`, - run_id: runId, - dimension: 'stability', - subdimension: 'v1_closure_health', - score_value: Number(closureHealth.toFixed(6)), - score_label: scoreLabel(closureHealth), - evidence_ref: 'metrics_integrity_daily', - reason: - 'Average of query, turn, tool, and subagent closure rates for the action date.', - }, - { - score_id: `${runId}_stability_recovery_absence`, - run_id: runId, - dimension: 'stability', - subdimension: 'recovery_absence', - score_value: recoveryScore, - score_label: scoreLabel(recoveryScore), - evidence_ref: 'recoveries', - reason: - recoveries.length === 0 - ? 'No recovery events were observed for this action.' - : `${recoveries.length} recovery events were observed for this action.`, - }, - { - score_id: `${runId}_controllability_turn_limit_basic`, - run_id: runId, - dimension: 'controllability', - subdimension: 'turn_limit_basic', - score_value: maxTurnScore, - score_label: scoreLabel(maxTurnScore), - evidence_ref: 'queries.turn_count', - reason: `Root query turn_count=${maxTurnCount}; scenario limit is ${turnLimit}.`, - }, - { - score_id: `${runId}_decision_quality_subagent_count_observed`, - run_id: runId, - dimension: 'decision_quality', - subdimension: 'subagent_count_observed', - score_value: subagentCount, - score_label: 'observed', - evidence_ref: 'subagents', - reason: 'Observed subagent count is a fact for later baseline vs candidate comparison.', - }, - { - score_id: `${runId}_controllability_subagent_count_budget`, - run_id: runId, - dimension: 'controllability', - subdimension: 'subagent_count_budget', - score_value: subagentBudgetScore, - score_label: - subagentBudgetScore === null - ? 'not_applicable' - : scoreLabel(subagentBudgetScore), - evidence_ref: 'subagents', - reason: - subagentLimit === undefined - ? 'Scenario has no max_subagent_count budget.' 
- : `subagent_count=${subagentCount}; budget=${subagentLimit}.`, - }, - ] -} - function buildReport(params: { run: EvalRun scenario: EvalScenario @@ -540,7 +360,7 @@ async function main(): Promise { .split(',') .map(item => item.trim()) .filter(Boolean) - const allScores = buildScores({ + const scores = buildScoresForSpecIds({ runId, scenario, action, @@ -549,19 +369,7 @@ async function main(): Promise { tools, subagents, recoveries, - }) - const scores = - requestedScoreSpecIds.length === 0 - ? allScores - : allScores.filter(score => requestedScoreSpecIds.includes(scoreKey(score))) - const emittedScoreIds = new Set(scores.map(scoreKey)) - for (const scoreSpecId of requestedScoreSpecIds) { - if (!emittedScoreIds.has(scoreSpecId)) { - throw new Error( - `Score spec has no implemented scorer yet: ${scoreSpecId}`, - ) - } - } + }, requestedScoreSpecIds) const runsDir = path.join(evalRoot, 'runs') const scoresDir = path.join(evalRoot, 'scores') diff --git a/scripts/evals/v2_score_registry.ts b/scripts/evals/v2_score_registry.ts new file mode 100644 index 0000000000..e23cfde03c --- /dev/null +++ b/scripts/evals/v2_score_registry.ts @@ -0,0 +1,225 @@ +import type { EvalScenario, EvalScore } from '../../src/observability/v2/evalTypes' + +type JsonRecord = Record + +export interface V2ScoreInput { + runId: string + scenario: EvalScenario + action: JsonRecord + rootQuery: JsonRecord + integrity: JsonRecord | undefined + tools: JsonRecord[] + subagents: JsonRecord[] + recoveries: JsonRecord[] +} + +type V2ScoreScorer = (input: V2ScoreInput) => EvalScore + +function asNumber(value: unknown): number { + if (typeof value === 'number') return value + if (typeof value === 'string' && value.trim() !== '') return Number(value) + return 0 +} + +function asString(value: unknown): string { + return typeof value === 'string' ? 
value : '' +} + +function scoreLabel(value: number): string { + if (value >= 1) return 'pass' + if (value > 0) return 'partial' + return 'fail' +} + +export function scoreKey(score: EvalScore): string { + return `${score.dimension}.${score.subdimension}` +} + +function subagentCount(subagents: JsonRecord[]): number { + return subagents.reduce( + (sum, subagent) => sum + asNumber(subagent.subagent_count), + 0, + ) +} + +export const V2_SCORE_SCORERS: Record = { + 'task_success.main_chain_observed': ({ runId, rootQuery }) => ({ + score_id: `${runId}_task_success_main_chain_observed`, + run_id: runId, + dimension: 'task_success', + subdimension: 'main_chain_observed', + score_value: rootQuery ? 1 : 0, + score_label: rootQuery ? 'pass' : 'fail', + evidence_ref: 'queries', + reason: rootQuery + ? 'Main-thread root query is present in V1 evidence.' + : 'No main-thread root query found for this user_action_id.', + }), + + 'decision_quality.expected_tool_hit_rate': ({ runId, scenario, tools }) => { + const expectedTools = new Set(scenario.expected_tools) + const observedTools = new Set(tools.map(tool => asString(tool.tool_name))) + const expectedToolHitRate = + expectedTools.size === 0 + ? null + : [...expectedTools].filter(tool => observedTools.has(tool)).length / + expectedTools.size + return { + score_id: `${runId}_decision_quality_expected_tool_hit_rate`, + run_id: runId, + dimension: 'decision_quality', + subdimension: 'expected_tool_hit_rate', + score_value: expectedToolHitRate, + score_label: + expectedToolHitRate === null + ? 'not_applicable' + : scoreLabel(expectedToolHitRate), + evidence_ref: 'tools', + reason: + expectedToolHitRate === null + ? 'Scenario has no expected_tools yet.' 
+ : `Observed ${observedTools.size} tool names against ${expectedTools.size} expected tools.`, + } + }, + + 'efficiency.total_billed_tokens': ({ runId, action }) => ({ + score_id: `${runId}_efficiency_total_billed_tokens`, + run_id: runId, + dimension: 'efficiency', + subdimension: 'total_billed_tokens', + score_value: asNumber(action.total_billed_tokens), + score_label: 'observed', + evidence_ref: 'user_actions.total_billed_tokens', + reason: 'Raw efficiency fact from V1 user_actions.', + }), + + 'efficiency.total_billed_token_budget': ({ runId, scenario, action }) => { + const billedLimit = scenario.max_total_billed_tokens + const billedTokens = asNumber(action.total_billed_tokens) + const billedBudgetScore = + billedLimit === undefined ? null : billedTokens <= billedLimit ? 1 : 0 + return { + score_id: `${runId}_efficiency_total_billed_token_budget`, + run_id: runId, + dimension: 'efficiency', + subdimension: 'total_billed_token_budget', + score_value: billedBudgetScore, + score_label: + billedBudgetScore === null ? 'not_applicable' : scoreLabel(billedBudgetScore), + evidence_ref: 'user_actions.total_billed_tokens', + reason: + billedLimit === undefined + ? 'Scenario has no max_total_billed_tokens budget.' + : `total_billed_tokens=${billedTokens}; budget=${billedLimit}.`, + } + }, + + 'stability.v1_closure_health': ({ runId, integrity }) => { + const closureValues = [ + integrity?.strict_query_completion_rate, + integrity?.strict_turn_state_closure_rate, + integrity?.tool_lifecycle_closure_rate, + integrity?.subagent_lifecycle_closure_rate, + ].map(asNumber) + const closureHealth = + closureValues.length === 0 + ? 
0 + : closureValues.reduce((sum, value) => sum + value, 0) / + closureValues.length + return { + score_id: `${runId}_stability_v1_closure_health`, + run_id: runId, + dimension: 'stability', + subdimension: 'v1_closure_health', + score_value: Number(closureHealth.toFixed(6)), + score_label: scoreLabel(closureHealth), + evidence_ref: 'metrics_integrity_daily', + reason: + 'Average of query, turn, tool, and subagent closure rates for the action date.', + } + }, + + 'stability.recovery_absence': ({ runId, recoveries }) => { + const recoveryScore = recoveries.length === 0 ? 1 : 0 + return { + score_id: `${runId}_stability_recovery_absence`, + run_id: runId, + dimension: 'stability', + subdimension: 'recovery_absence', + score_value: recoveryScore, + score_label: scoreLabel(recoveryScore), + evidence_ref: 'recoveries', + reason: + recoveries.length === 0 + ? 'No recovery events were observed for this action.' + : `${recoveries.length} recovery events were observed for this action.`, + } + }, + + 'controllability.turn_limit_basic': ({ runId, scenario, rootQuery }) => { + const maxTurnCount = asNumber(rootQuery.turn_count) + const turnLimit = scenario.max_turn_count ?? 8 + const maxTurnScore = maxTurnCount > 0 && maxTurnCount <= turnLimit ? 
1 : 0 + return { + score_id: `${runId}_controllability_turn_limit_basic`, + run_id: runId, + dimension: 'controllability', + subdimension: 'turn_limit_basic', + score_value: maxTurnScore, + score_label: scoreLabel(maxTurnScore), + evidence_ref: 'queries.turn_count', + reason: `Root query turn_count=${maxTurnCount}; scenario limit is ${turnLimit}.`, + } + }, + + 'decision_quality.subagent_count_observed': ({ runId, subagents }) => ({ + score_id: `${runId}_decision_quality_subagent_count_observed`, + run_id: runId, + dimension: 'decision_quality', + subdimension: 'subagent_count_observed', + score_value: subagentCount(subagents), + score_label: 'observed', + evidence_ref: 'subagents', + reason: 'Observed subagent count is a fact for later baseline vs candidate comparison.', + }), + + 'controllability.subagent_count_budget': ({ runId, scenario, subagents }) => { + const limit = scenario.max_subagent_count + const count = subagentCount(subagents) + const budgetScore = limit === undefined ? null : count <= limit ? 1 : 0 + return { + score_id: `${runId}_controllability_subagent_count_budget`, + run_id: runId, + dimension: 'controllability', + subdimension: 'subagent_count_budget', + score_value: budgetScore, + score_label: budgetScore === null ? 'not_applicable' : scoreLabel(budgetScore), + evidence_ref: 'subagents', + reason: + limit === undefined + ? 'Scenario has no max_subagent_count budget.' + : `subagent_count=${count}; budget=${limit}.`, + } + }, +} + +export function listImplementedScoreSpecIds(): string[] { + return Object.keys(V2_SCORE_SCORERS) +} + +export function buildScoresForSpecIds( + input: V2ScoreInput, + requestedScoreSpecIds: string[], +): EvalScore[] { + const scoreSpecIds = + requestedScoreSpecIds.length > 0 + ? 
requestedScoreSpecIds + : listImplementedScoreSpecIds() + return scoreSpecIds.map(scoreSpecId => { + const scorer = V2_SCORE_SCORERS[scoreSpecId] + if (!scorer) { + throw new Error(`Score spec has no implemented scorer yet: ${scoreSpecId}`) + } + return scorer(input) + }) +} diff --git a/scripts/evals/v2_validate_manifests.ts b/scripts/evals/v2_validate_manifests.ts index c84a80c11c..dace60be88 100644 --- a/scripts/evals/v2_validate_manifests.ts +++ b/scripts/evals/v2_validate_manifests.ts @@ -15,6 +15,7 @@ import type { EvalGatePolicyRule, EvalScoreSpecCollection, } from '../../src/observability/v2/evalExperimentTypes' +import { listImplementedScoreSpecIds } from './v2_score_registry' const repoRoot = path.resolve(import.meta.dirname, '..', '..') const evalRoot = path.join(repoRoot, 'tests', 'evals', 'v2') @@ -329,6 +330,7 @@ function validateExperiment( function validateScoreSpecCollection( filePath: string, collection: EvalScoreSpecCollection, + implementedScoreSpecIds: Set, ): string[] { const errors: string[] = [] requireArray(errors, filePath, 'score_specs', collection.score_specs) @@ -362,6 +364,9 @@ function validateScoreSpecCollection( if (seen.has(spec.score_spec_id)) { errors.push(`${objectName}.score_spec_id is duplicated: ${spec.score_spec_id}`) } + if (!implementedScoreSpecIds.has(spec.score_spec_id)) { + errors.push(`${objectName}.score_spec_id has no implemented scorer: ${spec.score_spec_id}`) + } seen.add(spec.score_spec_id) } return errors @@ -404,6 +409,7 @@ async function validateAll(): Promise { scoreSpecIds: new Set(), gatePolicyIds: new Set(), } + const implementedScoreSpecIds = new Set(listImplementedScoreSpecIds()) const scenarioFiles = await listJsonFiles(path.join(evalRoot, 'scenarios')) const variantFiles = await listJsonFiles(path.join(evalRoot, 'variants')) @@ -438,6 +444,7 @@ async function validateAll(): Promise { ...validateScoreSpecCollection( filePath, collection, + implementedScoreSpecIds, ), ) } diff --git 
a/tests/evals/v2/score-specs/README.md b/tests/evals/v2/score-specs/README.md index e880eebfcd..25168b60b5 100644 --- a/tests/evals/v2/score-specs/README.md +++ b/tests/evals/v2/score-specs/README.md @@ -3,8 +3,8 @@ ## 理解清单 - `score-specs/*.json` 定义“哪些分数是正式分数”。 -- `scripts/evals/v2_record_run.ts` 目前负责实际计算这些分数。 -- V2.1 当前不是公式解释器;score formula 仍由脚本中的 scorer implementation 实现。 +- `scripts/evals/v2_score_registry.ts` 负责登记 `score_spec_id -> scorer implementation`。 +- V2.1 当前不是公式解释器;score formula 仍由 registry 中的 scorer implementation 实现。 ## 预期效果 @@ -20,24 +20,24 @@ V2.1 先固化 contract,再逐步演进实现。当前 contract 是: ```text -score_spec_id -> implemented scorer in scripts/evals/v2_record_run.ts +score_spec_id -> implemented scorer in scripts/evals/v2_score_registry.ts ``` -后续可以把公式解析、规则执行、外部 scorer registry 拆出去,但本轮不做。 +后续可以把公式解析、规则执行、外部 scorer backend 拆出去,但本轮不做。 ## Current Mapping | score_spec_id | implementation | data source | current boundary | | --- | --- | --- | --- | -| `task_success.main_chain_observed` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `queries` + run binding | 判断是否存在 `main_thread` root query。 | -| `efficiency.total_billed_tokens` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `user_actions.total_billed_tokens` | 只记录事实值,不单独判断好坏。 | -| `decision_quality.subagent_count_observed` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `subagents` | 只记录数量事实;是否好坏交给 compare/gate 结合任务成功判断。 | -| `stability.recovery_absence` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `recoveries` | 无 recovery 为 1,有 recovery 为 0。 | -| `controllability.turn_limit_basic` | `buildScores()` in `scripts/evals/v2_record_run.ts` | V1 `queries.turn_count` + scenario limit | 当前使用 scenario `max_turn_count`,缺省为 8。 | +| `task_success.main_chain_observed` | `V2_SCORE_SCORERS['task_success.main_chain_observed']` | V1 `queries` + run binding | 判断是否存在 `main_thread` root query。 | +| `efficiency.total_billed_tokens` | 
`V2_SCORE_SCORERS['efficiency.total_billed_tokens']` | V1 `user_actions.total_billed_tokens` | 只记录事实值,不单独判断好坏。 | +| `decision_quality.subagent_count_observed` | `V2_SCORE_SCORERS['decision_quality.subagent_count_observed']` | V1 `subagents` | 只记录数量事实;是否好坏交给 compare/gate 结合任务成功判断。 | +| `stability.recovery_absence` | `V2_SCORE_SCORERS['stability.recovery_absence']` | V1 `recoveries` | 无 recovery 为 1,有 recovery 为 0。 | +| `controllability.turn_limit_basic` | `V2_SCORE_SCORERS['controllability.turn_limit_basic']` | V1 `queries.turn_count` + scenario limit | 当前使用 scenario `max_turn_count`,缺省为 8。 | ## Not Formal In V2.1 -`v2_record_run.ts` 内部还能计算一些辅助分数,例如: +`v2_score_registry.ts` 内部还登记了一些辅助分数,例如: - `decision_quality.expected_tool_hit_rate` - `efficiency.total_billed_token_budget` @@ -49,8 +49,8 @@ score_spec_id -> implemented scorer in scripts/evals/v2_record_run.ts ## Failure Rules - experiment 引用不存在的 `score_spec_id`:runner 失败。 -- score-spec 存在但 scorer 未实现:record_run 失败。 -- scorer 产生了未声明 score:runner 通过 `--score-spec-ids` 过滤,不写入正式 score artifact。 +- score-spec 存在但 scorer 未实现:manifest validator 和 record_run 都会失败。 +- 未声明 score 不会进入正式 score artifact,因为 record_run 只按 `--score-spec-ids` 从 registry 取分。 ## V2.1 Boundary From 34cca6aa452419fcce7f393d3123d46d1f694c7f Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Thu, 30 Apr 2026 19:04:00 +0800 Subject: [PATCH 13/26] Refine observability v2 verdict semantics --- .claude/skills/codex-controlled/SKILL.md | 253 +++++++++ .../skills/01_requirement_framing.md | 62 +++ .../skills/02_discussion_mode.md | 123 +++++ .../skills/03_layered_explanation.md | 165 ++++++ .../skills/04_preflight_hygiene.md | 111 ++++ .../skills/05_controlled_execution.md | 101 ++++ .../skills/06_acceptance_review.md | 105 ++++ .../codex-controlled/skills/07_coach_mode.md | 119 +++++ ...05\345\256\271\346\200\273\350\247\210.md" | 483 ++++++++++++++++++ ...ndidate_session_memory_sparse_dbf9fae1.md" | 37 -- 
...ndidate_session_memory_sparse_dbf9fae1.md" | 4 +- ...parse_vs_default_2026-04-27T105524752Z.md" | 39 -- ...parse_vs_default_2026-04-28T162912802Z.md" | 41 -- ...parse_vs_default_2026-04-30T021206270Z.md" | 61 +++ ...nsitive_task_baseline_default_1d5eb5e1.md" | 56 -- ...ndidate_session_memory_sparse_dbf9fae1.md" | 53 -- ...nsitive_task_baseline_default_1d5eb5e1.md" | 8 +- ...ndidate_session_memory_sparse_dbf9fae1.md" | 6 +- scripts/evals/v2_run_experiment.ts | 273 +++++++++- .../evals/v2_validate_experiment_artifacts.ts | 51 +- scripts/evals/v2_verify_bind_runner.ts | 27 +- tests/evals/v2/README.md | 6 +- tests/evals/v2/V2.1-bind_existing-usage.md | 19 +- tests/evals/v2/experiment-runs/README.md | 46 +- ...arse_vs_default_2026-04-27T105524752Z.json | 126 ----- ...arse_vs_default_2026-04-28T162912802Z.json | 140 ----- ...arse_vs_default_2026-04-30T021206270Z.json | 272 ++++++++++ tests/evals/v2/gates/README.md | 54 +- ...sitive_task_baseline_default_1d5eb5e1.json | 166 ------ ...didate_session_memory_sparse_dbf9fae1.json | 147 ------ ...itive_task_baseline_default_1d5eb5e1.json} | 20 +- ...idate_session_memory_sparse_dbf9fae1.json} | 16 +- ...task_baseline_default_1d5eb5e1.scores.json | 92 ---- ...session_memory_sparse_dbf9fae1.scores.json | 92 ---- ...ask_baseline_default_1d5eb5e1.scores.json} | 36 +- ...ession_memory_sparse_dbf9fae1.scores.json} | 36 +- ..._1_bind_runner_2026-04-30T015859120Z.json} | 26 +- 37 files changed, 2358 insertions(+), 1114 deletions(-) create mode 100644 .claude/skills/codex-controlled/SKILL.md create mode 100644 .claude/skills/codex-controlled/skills/01_requirement_framing.md create mode 100644 .claude/skills/codex-controlled/skills/02_discussion_mode.md create mode 100644 .claude/skills/codex-controlled/skills/03_layered_explanation.md create mode 100644 .claude/skills/codex-controlled/skills/04_preflight_hygiene.md create mode 100644 .claude/skills/codex-controlled/skills/05_controlled_execution.md create mode 100644 
.claude/skills/codex-controlled/skills/06_acceptance_review.md create mode 100644 .claude/skills/codex-controlled/skills/07_coach_mode.md create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.1\347\263\273\347\273\237\345\205\250\345\206\205\345\256\271\346\200\273\350\247\210.md" delete mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" (89%) delete mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" delete mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-30T021206270Z.md" delete mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" delete mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" (96%) rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" (95%) delete mode 100644 tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json delete mode 100644 tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json create mode 100644 tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-30T021206270Z.json delete mode 100644 tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json delete mode 100644 tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json rename tests/evals/v2/runs/{run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json => 
run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.json} (98%) rename tests/evals/v2/runs/{run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json => run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json} (98%) delete mode 100644 tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json delete mode 100644 tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json rename tests/evals/v2/scores/{run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json => run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json} (72%) rename tests/evals/v2/scores/{run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json => run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json} (73%) rename tests/evals/v2/verification-reports/{v2_1_bind_runner_2026-04-29T072125437Z.json => v2_1_bind_runner_2026-04-30T015859120Z.json} (78%) diff --git a/.claude/skills/codex-controlled/SKILL.md b/.claude/skills/codex-controlled/SKILL.md new file mode 100644 index 0000000000..b34994f2f7 --- /dev/null +++ b/.claude/skills/codex-controlled/SKILL.md @@ -0,0 +1,253 @@ +--- +name: codex-controlled +description: "Use for controlled Codex collaboration workflows: requirement framing, discussion, layered explanation, preflight hygiene, controlled execution, acceptance review, and coaching checkpoints." +--- + +# Skill: Codex 协作主控调度器(Master Orchestrator) + +## 目标 + +本文件是项目内所有 Codex/Agent 协作 skill 的**总调度器**。 +它不负责承载所有细节,而是负责判断当前任务应进入哪种模式,并调用对应的子 skill。 + +核心目标: + +- 防止任务漂移 +- 防止未经用户拍板自动推进 +- 防止文档蓝图和当前代码真相混淆 +- 防止用用户不懂的术语解释用户不懂的术语 +- 让 Codex 既能执行,也能帮助用户逐步掌握验证、命令、排查、架构理解能力 + +--- + +## 最高原则 + +### 1. 
当前代码与当前运行结果是真相 + +当同时存在: + +- 当前源码 +- 当前运行日志 +- 当前数据库 +- 当前观测结果 +- PDF / 任务书 / 设计稿 / 历史总结 / 心得文档 + +默认优先级为: + +1. 当前源码与当前运行结果 +2. 当前日志 / 数据库 / 观测事实 +3. 当前任务书 +4. PDF / 上游分析 / 历史总结 / 心得 + +不得默认文档与当前项目完全一致。 + +--- + +### 2. 用户理解优先于执行速度 + +如果用户还不能理解: + +- 本轮目标 +- 思路来源 +- 设计选择 +- 约束条件 +- 架构设计点 +- 风险 +- 验收口径 + +则不得进入写代码 / 写文件 / 自动推进下一 phase。 + +--- + +### 3. Checkpoint 是唯一推进闸门 + +每个 phase 结束后,必须等待用户拍板。 +未经用户明确批准,不得: + +- 自动进入下一 phase +- 顺手补 unrelated 功能 +- 从“能跑”扩展成“全系统完成” +- 用“建议继续”替代“等待确认” + +--- + +### 4. 事实 / 推断 / 不确定点必须分离 + +输出必须区分: + +- 【事实】来自源码、日志、文档原文、当前运行结果 +- 【推断】基于调用链、命名、行为的合理判断 +- 【不确定点】需要用户确认或进一步查看源码 + +不得把推断包装成事实。 + +--- + +## 模式选择 + +收到任务后,先判断当前进入哪种模式。 + +| 模式 | 何时使用 | 调用子 skill | +|---|---|---| +| Framing / 定格 | 需求混乱、不确定本轮边界 | `01_requirement_framing.md` | +| Discussion / 讨论 | 用户和 agent 对方案理解不一致,需要反复澄清 | `02_discussion_mode.md` | +| Explanation / 分层解释 | 用户看不懂术语、代码结构、文档必要性 | `03_layered_explanation.md` | +| Preflight / 卫生检查 | 涉及日志、ETL、指标、数据、实验、runner | `04_preflight_hygiene.md` | +| Execution / 受控执行 | 已拍板,可以写代码或文件 | `05_controlled_execution.md` | +| Review / 验收复盘 | Codex 已执行,需自查、验收、checkpoint | `06_acceptance_review.md` | +| Coaching / 教练式学习 | 用户希望掌握命令、验证、排查能力 | `07_coach_mode.md` | + +--- + +## 默认 Phase + +### Phase 0:问题定格 + +输出: + +- 本轮目标 +- 真实约束 +- 输入材料 +- 输出形式 +- 冲突处理要求 +- 本轮不做 +- 应进入哪种模式 + +禁止写代码。 + +--- + +### Phase 1:理解与讨论 + +如果用户不能完全理解方案,进入 Discussion 或 Explanation 模式。 + +目标不是“说服用户”,而是双方把: + +- 概念 +- 约束 +- 分歧 +- 方案 +- 风险 +- 验收口径 + +说清楚。 + +未通过不得进入执行。 + +--- + +### Phase 2:Spec Bundle + +如果已经明确要执行,输出统一 Spec Bundle: + +- 背景解读 +- 强制要求 +- 验收标准 +- 冲突处理规则 +- Checkpoint 规则 +- 本轮不做 + +--- + +### Phase 3:Preflight / Hygiene Gate + +任何涉及以下内容的任务必须先做卫生检查: + +- 日志 +- 指标 +- ETL +- dashboard +- runner +- scorer +- gate +- 数据清洗 +- schema +- 实验平台 + +Preflight 未通过,禁止实现。 + +--- + +### Phase 4:Execution Plan + +输出: + +- 修改哪些文件 +- 不修改哪些文件 +- 本轮最小闭环 +- 修改顺序 +- 验证顺序 +- 风险点 +- 失败时停下条件 + +--- + +### Phase 5:Controlled Execution + +一次只做一个最小闭环任务。 +禁止顺手扩展。 + +--- + +### Phase 
6:Self-check + +完成后输出: + +- 修改摘要 +- 自查结果 +- 未通过项 +- 风险项 +- 严格口径 / 推断口径 +- 最小验证清单 +- 下一步候选 A/B + +--- + +### Phase 7:Human Review + +等待用户拍板。 +没有用户批准,不得自动继续。 + +--- + +## 冲突处理模板 + +如果发现文档与当前项目冲突,必须暂停: + +```md +冲突点: +文档中的描述: +当前项目中的实际情况: +我的判断: +候选处理方案 A: +候选处理方案 B: +我暂停在这里等待确认: +``` + +--- + +## 最短提问模板 + +用户可用以下格式发起任务: + +```md +本轮目标: +真实约束: +输入材料: +输出形式: +冲突处理要求: +本轮不做: +是否先做理解清单: +是否需要 Preflight / Hygiene Gate: +我希望你用 Level 几的教练式辅助: +``` + +--- + +## 子 skill 调用规则 + +- 如果用户说“我没理解”,优先调用 `02_discussion_mode.md` 或 `03_layered_explanation.md` +- 如果用户说“请执行”,但尚未经过理解清单,必须先回到理解阶段 +- 如果任务涉及数据/日志/指标/实验,必须调用 `04_preflight_hygiene.md` +- 如果已经写代码,必须调用 `06_acceptance_review.md` +- 如果涉及命令或验证,必须调用 `07_coach_mode.md` diff --git a/.claude/skills/codex-controlled/skills/01_requirement_framing.md b/.claude/skills/codex-controlled/skills/01_requirement_framing.md new file mode 100644 index 0000000000..7ff5d9d9ef --- /dev/null +++ b/.claude/skills/codex-controlled/skills/01_requirement_framing.md @@ -0,0 +1,62 @@ +--- +title: 需求定格与任务收敛 +type: reference +description: Use when a task boundary is unclear and the request must be compressed into goals, constraints, inputs, outputs, non-goals, and a recommended execution mode. +--- + +# Skill: 需求定格与任务收敛(Requirement Framing) + +## 目标 + +当用户需求较散、约束不完整、阶段不明确时,先把任务收敛到可执行范围。 + +--- + +## 输出模板 + +```md +## 需求压缩 + +### 本轮目标 +... + +### 真实约束 +... + +### 输入材料 +... + +### 输出形式 +... + +### 冲突处理要求 +... + +### 本轮不做 +... 
+ +### 推荐进入的模式 +- Discussion / Explanation / Preflight / Execution / Review / Coach +``` + +--- + +## 高价值信息识别 + +主动指出: + +- 哪些信息决定任务方向 +- 哪些内容重复 +- 哪些内容展开过早 +- 哪些内容与当前阶段无关 + +--- + +## 本轮边界 + +必须明确: + +- 做什么 +- 不做什么 +- 谁拍板 +- 何时停下 diff --git a/.claude/skills/codex-controlled/skills/02_discussion_mode.md b/.claude/skills/codex-controlled/skills/02_discussion_mode.md new file mode 100644 index 0000000000..644a61d04e --- /dev/null +++ b/.claude/skills/codex-controlled/skills/02_discussion_mode.md @@ -0,0 +1,123 @@ +--- +title: 技术方案讨论模式 +type: reference +description: Use when the user and Codex need to align on project understanding, tradeoffs, terminology, risks, and decision points before implementation. +--- + +# Skill: 技术方案讨论模式(Discussion Mode) + +## 目标 + +当用户和 Codex 对项目理解、技术方案、约束条件、实现路径不完全一致时,进入讨论模式。 + +讨论模式不是执行模式。 +它的目标是让双方把意思说清楚,再由用户拍板。 + +--- + +## 适用场景 + +- 用户说“我没想清楚” +- 用户质疑 agent 的方案 +- 用户觉得 agent 用不懂的术语解释不懂的术语 +- 用户和 agent 对项目状态、技术栈、解决思路不一致 +- 需要比较多个方案 +- 需要解释为什么写代码、为什么写文档、为什么改架构 + +--- + +## 讨论模式禁止事项 + +在讨论模式中,Codex 不得: + +- 写代码 +- 改文件 +- 生成最终任务书 +- 自动进入执行 +- 试图用更长总结替代解释 +- 用用户不懂的新术语解释旧术语 + +--- + +## 输出结构 + +### 1. 分歧定位 + +```md +我认为当前分歧可能在: +1. ... +2. ... +3. ... +``` + +### 2. 双方理解对齐 + +```md +我理解你的意思是: +... + +我目前的判断是: +... + +我们不一致的地方是: +... +``` + +### 3. 术语降维 + +列出本轮关键术语: + +| 术语 | 大白话解释 | 在本项目中的具体含义 | 对应文件/数据结构 | 不理解会影响什么 | +|---|---|---|---|---| + +### 4. 方案对比 + +| 方案 | 做法 | 优点 | 风险 | 适合场景 | 是否推荐 | +|---|---|---|---|---|---| + +### 5. 拍板点 + +```md +需要你拍板: +A. ... +B. ... +C. ... 
+``` + +--- + +## 讨论结束条件 + +只有当以下内容明确后,才能退出讨论模式: + +- 用户理解关键术语 +- 用户理解方案差异 +- 用户理解风险 +- 用户知道自己要拍板什么 +- 用户明确选择下一步 + +--- + +## 讨论模式的回答风格 + +- 用短段落 +- 少用新术语 +- 允许反复追问 +- 不急着收束 +- 不把“不确定”伪装成确定 +- 不把“我建议”伪装成“必须如此” + +--- + +## 小型讨论模板 + +```md +当前讨论主题: +我理解你的疑问: +我认为有几种可能解释: +方案 A: +方案 B: +我更推荐: +原因: +你需要拍板: +``` diff --git a/.claude/skills/codex-controlled/skills/03_layered_explanation.md b/.claude/skills/codex-controlled/skills/03_layered_explanation.md new file mode 100644 index 0000000000..b809490993 --- /dev/null +++ b/.claude/skills/codex-controlled/skills/03_layered_explanation.md @@ -0,0 +1,165 @@ +--- +title: 分层解释与实现讲解 +type: reference +description: Use when Codex must explain complex code, architecture, documents, schemas, runners, scorers, gates, or design choices in layered language. +--- + +# Skill: 分层解释与实现讲解(Layered Explanation) + +## 目标 + +解决“用用户不懂的内容解释用户不懂的内容”的问题。 + +当 Codex 完成代码、文档、方案、任务书或复杂分析后,不能只输出摘要,必须提供分层解释。 + +--- + +## 适用场景 + +- 用户看不懂理解清单 +- 本轮出现大量术语 +- Codex 写了代码 +- Codex 写了文档 +- Codex 引入新数据结构、新 schema、新 runner、新 scorer、新 gate +- 用户需要知道“为什么这么设计” + +--- + +## 分层解释结构 + +### Layer 1:一句话解释 + +用一句话说明这次做了什么。 + +```md +这次做了什么: +... +``` + +--- + +### Layer 2:大白话解释 + +不用新术语解释一遍。 + +```md +不用术语说: +... +``` + +--- + +### Layer 3:术语解释 + +| 术语 | 大白话含义 | 本项目中的具体含义 | 对应位置 | 不理解会影响什么 | +|---|---|---|---|---| + +要求: + +- 不得用未解释的新术语解释旧术语 +- 每个术语必须落到本项目具体对象 +- 如果术语只是临时概念,要说明 + +--- + +### Layer 4:代码结构解释 + +如果写了代码,必须输出: + +```md +## 代码实现讲解卡 + +### 本轮改了哪些文件 +- 文件: + - 改动目的: + - 系统角色: + - 为什么改这里: + +### 代码如何串起来 +命令/入口 +→ ... +→ ... +→ 输出 + +### 数据如何流动 +输入: +中间处理: +输出: + +### 为什么这样组织 +- 为什么拆成这些文件: +- 为什么不是写在一个文件里: +- 为什么不是改旧模块: +- 哪些地方为了后续扩展: +``` + +--- + +### Layer 5:文档必要性解释 + +如果写了文档,必须输出: + +```md +## 文档必要性说明卡 + +### 这份文档解决什么问题 +... + +### 为什么不能只靠代码 +... + +### 读者是谁 +... + +### 是长期规范、临时报告,还是 checkpoint +... + +### 不写它会有什么后果 +... +``` + +--- + +### Layer 6:设计选择解释 + +```md +## 设计选择说明卡 + +### 考虑过哪些替代方案 +方案 A: +方案 B: + +### 为什么选择当前方案 +... + +### 当前方案牺牲了什么 +... 
+ +### 风险是什么 +... + +### 如何验证风险没有发生 +... +``` + +--- + +## 禁止事项 + +- 禁止只给“看起来很清晰”的摘要 +- 禁止用“最佳实践”代替具体理由 +- 禁止把实现结果包装成用户已经理解 +- 禁止把术语堆在一起不解释 + +--- + +## 退出条件 + +用户能回答: + +- 这次做了什么 +- 为什么这么做 +- 哪些文件参与了 +- 数据怎么流动 +- 有什么风险 +- 怎么验证 diff --git a/.claude/skills/codex-controlled/skills/04_preflight_hygiene.md b/.claude/skills/codex-controlled/skills/04_preflight_hygiene.md new file mode 100644 index 0000000000..cc2ab00b13 --- /dev/null +++ b/.claude/skills/codex-controlled/skills/04_preflight_hygiene.md @@ -0,0 +1,111 @@ +--- +title: Preflight / Hygiene Gate +type: reference +description: Use before work involving logs, metrics, ETL, dashboards, runners, scorers, gates, schemas, data cleaning, or evaluation experiments. +--- + +# Skill: Preflight / Hygiene Gate + +## 目标 + +在任何涉及日志、指标、ETL、dashboard、runner、scorer、gate、schema、数据清洗、评测实验的任务前,先确认系统状态是否干净、输入是否可信、历史数据是否会污染结果。 + +--- + +## 适用场景 + +- 可观测系统 +- 指标计算 +- 数据库重建 +- dashboard +- V2 experiment runner +- score / gate +- schema migration +- 旧日志清洗 +- baseline vs candidate 对比 + +--- + +## 必查项 + +### 1. 数据新鲜度 + +- 当前事件文件是否最新 +- 数据库是否过期 +- summary/dashboard 是否读旧库 +- 是否需要 rebuild + +### 2. 数据污染 + +- 是否混入旧版本日志 +- 是否混入旧 schema +- 是否存在旧 run / score / report 被误用 +- 是否需要归档/清洗 + +### 3. 引用闭合 + +- snapshot_ref 是否存在 +- user_action_id 是否存在 +- run 是否绑定 V1 事实证据 +- score 是否有 evidence_ref +- gate 是否有 score 输入 + +### 4. Schema 兼容 + +- manifest 字段是否和 validator 一致 +- score-spec 是否存在 +- gate policy 是否存在 +- experiment 引用是否有效 + +### 5. 
影响分析 + +- 影响哪些模块 +- 影响哪些指标 +- 影响哪些报表 +- 影响哪些已有结论 +- 是否造成局部正确、全局错误 + +--- + +## 输出模板 + +```md +## Preflight / Hygiene Gate + +### 数据新鲜度 +- 结果: +- 证据: +- 是否通过: + +### 数据污染 +- 结果: +- 证据: +- 是否通过: + +### 引用闭合 +- 结果: +- 证据: +- 是否通过: + +### Schema 兼容 +- 结果: +- 证据: +- 是否通过: + +### Impact Analysis +- 影响模块: +- 影响指标: +- 影响报表: +- 影响已有结论: +- 风险: + +### 结论 +- 通过 / 不通过 +- 如果不通过,必须先处理: +``` + +--- + +## 硬规则 + +Preflight 不通过,不得进入实现。 diff --git a/.claude/skills/codex-controlled/skills/05_controlled_execution.md b/.claude/skills/codex-controlled/skills/05_controlled_execution.md new file mode 100644 index 0000000000..eeb8b2a623 --- /dev/null +++ b/.claude/skills/codex-controlled/skills/05_controlled_execution.md @@ -0,0 +1,101 @@ +--- +title: 受控执行 +type: reference +description: Use after user approval to execute one minimal closed-loop task without scope creep, fake capabilities, or unplanned file changes. +--- + +# Skill: 受控执行(Controlled Execution) + +## 目标 + +在用户已经拍板后,Codex 只执行一个最小闭环任务,避免范围扩大和架构漂移。 + +--- + +## 执行前要求 + +必须已有: + +- 明确任务书 / Spec Bundle +- 用户拍板 +- 通过理解清单 +- 通过 Preflight / Hygiene Gate +- 明确本轮不做什么 +- 明确最小验证清单 + +--- + +## 执行原则 + +### 1. 一次只做一个最小闭环 + +例如: + +- 只实现 bind_existing runner +- 只固化 experiment-run schema +- 只新增 score-spec 校验 +- 只修 freshness +- 只补一个指标 + +不得顺手扩展。 + +--- + +### 2. 只改计划内文件 + +如果需要修改计划外文件,必须暂停说明: + +```md +计划外修改需求: +为什么需要: +不改会怎样: +是否等待确认: +``` + +--- + +### 3. 不伪造能力 + +如果某能力尚无真实入口,例如 headless harness execution adapter,不得假装实现。 +应明确报错或留 scaffold。 + +--- + +### 4. 事实优先 + +正式结果必须能回溯到事实证据: + +- run_id +- user_action_id +- observability_db_ref +- evidence_ref + +无证据不得进入正式 score / compare / gate。 + +--- + +## 完成后输出 + +```md +## 执行完成摘要 + +### 修改文件 +- ... + +### 实现内容 +- ... + +### 未完成项 +- ... + +### 风险 +- ... + +### 验证命令 +- ... + +### 最小验证清单 +- [ ] ... 
+``` + +然后进入 Acceptance Review。 diff --git a/.claude/skills/codex-controlled/skills/06_acceptance_review.md b/.claude/skills/codex-controlled/skills/06_acceptance_review.md new file mode 100644 index 0000000000..82edbc8b8c --- /dev/null +++ b/.claude/skills/codex-controlled/skills/06_acceptance_review.md @@ -0,0 +1,105 @@ +--- +title: 验收与 Checkpoint Review +type: reference +description: Use after implementation to review goal fit, evidence, risks, validation results, and checkpoint choices before any next phase. +--- + +# Skill: 验收与 Checkpoint Review + +## 目标 + +在 Codex 完成一轮实现后,不直接继续,而是审查: + +- 是否完成本轮目标 +- 是否产生漂移 +- 是否证据充分 +- 是否可以进入下一 phase + +--- + +## 验收输入 + +- 修改文件列表 +- 自查结果 +- 运行命令 +- 输出 artifacts +- errors/warnings +- run/report/score/gate 结果 +- 未完成项 +- 风险项 + +--- + +## 验收维度 + +### 1. 目标匹配 + +- 本轮目标是否完成 +- 是否做了本轮不做的事情 +- 是否出现 scope creep + +### 2. 证据充分 + +- 是否有运行命令 +- 是否有输出文件 +- 是否有 report +- 是否有 evidence_ref +- 是否有 errors/warnings 说明 + +### 3. 事实优先 + +- 是否基于真实数据 +- 是否使用了推断口径 +- 推断是否明确标注 + +### 4. 风险暴露 + +- 未完成项是否说清 +- 风险是否可接受 +- 是否需要用户拍板 + +--- + +## Checkpoint 卡片 + +```md +## Checkpoint + +### 本轮目标 +... + +### 实际完成 +... + +### 修改文件 +... + +### 验证结果 +... + +### 未完成项 +... + +### 风险项 +... + +### 是否满足验收 +- [ ] ... + +### 下一步候选 A +... + +### 下一步候选 B +... + +### 是否等待用户拍板 +是 +``` + +--- + +## 硬规则 + +- 没有 checkpoint,不算完成 +- 用户未拍板,不得继续 +- 如果 Codex 想自动进入下一 phase,判定为执行意图漂移 diff --git a/.claude/skills/codex-controlled/skills/07_coach_mode.md b/.claude/skills/codex-controlled/skills/07_coach_mode.md new file mode 100644 index 0000000000..19c9c50f75 --- /dev/null +++ b/.claude/skills/codex-controlled/skills/07_coach_mode.md @@ -0,0 +1,119 @@ +--- +title: 教练式能力迁移 +type: reference +description: Use when the user should learn commands, verification, report reading, failure diagnosis, and gradually take over engineering checks. 
+--- + +# Skill: 教练式能力迁移(Coach Mode) + +## 目标 + +让用户逐步掌握基础工程能力,而不是只复制 Codex 的命令。 + +--- + +## 适用场景 + +- 命令执行 +- 验证阶段结果 +- 阅读 JSON / report / manifest +- 判断指标或 gate +- 排查失败原因 +- 审查执行结果 + +--- + +## 回答必须包含 + +### 1. 本轮基础能力 + +```md +本轮对应的基础能力: +1. ... +2. ... +``` + +### 2. 命令三段式 + +```md +命令: +... + +它在做什么: +... + +成功应该看到什么: +... + +失败先查哪里: +... +``` + +### 3. 最小验证清单 + +```md +- [ ] ... +- [ ] ... +``` + +### 4. 观察点 + +```md +你重点观察: +1. ... +2. ... +``` + +### 5. 失败排查路径 + +```md +如果失败,按顺序查: +1. ... +2. ... +3. ... +``` + +### 6. 小练习 + +```md +小练习: +请你自己检查: +1. ... +2. ... +3. ... + +把结果贴给我,我帮你判断。 +``` + +--- + +## 渐隐式辅助 Level + +### Level 1:完整扶手 + +提供完整命令、解释、成功标准、失败排查、小练习。 + +### Level 2:半成品命令 + +提供脚本名、目标和参数提示,让用户补全参数。 + +### Level 3:用户先写命令 + +用户先写命令,Codex 负责检查。 + +### Level 4:用户先给验证结论 + +用户先说“我认为通过,因为……”,Codex 检查证据是否充分。 + +--- + +## 目标 + +逐步把用户从“复制命令”训练到: + +- 能读懂命令 +- 能读懂 manifest +- 能读懂 report +- 能判断 gate verdict +- 能排查常见错误 +- 能给出初步验收结论 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.1\347\263\273\347\273\237\345\205\250\345\206\205\345\256\271\346\200\273\350\247\210.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.1\347\263\273\347\273\237\345\205\250\345\206\205\345\256\271\346\200\273\350\247\210.md" new file mode 100644 index 0000000000..67a712d962 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.1\347\263\273\347\273\237\345\205\250\345\206\205\345\256\271\346\200\273\350\247\210.md" @@ -0,0 +1,483 @@ +──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +• 理解清单 + + V2.1-stable 的本质不是“再做一个看板”,而是把 V1 已经观测到的真实运行轨迹,转化成可以做实验、评分、对比、回归门禁的评测 + 系统。V1 回答“刚刚发生了什么”,V2.1 回答“某个 harness 改动相对 baseline 是变好了、变差了,还是证据不足”。 + + 当前 V2.1 的稳定模式是 bind_existing。它不会自动运行 agent,也不会自动发送 prompt,而是要求你先真实运行 baseline 和 + 
candidate,拿到各自的 V1 user_action_id,再把这些真实 trace 绑定成 V2 run。 + + V2.1 目前已经具备完整闭环:scenario -> variant -> action_binding -> run -> score -> compare -> gate -> experiment + summary -> verification。但它还不是自动 harness runner,也还没有模型裁判、人工评分、长上下文专项、tool/skill 专项评 + 测。 + + 预期效果 + + 你以后可以这样研究一个 harness 改动: + + 1. 先定义一个任务场景,比如“完成同样任务但尽量省 token”。 + 2. 用当前版本跑一次,得到 baseline 的 user_action_id。 + 3. 修改 harness,比如减少 session memory 注入,跑同一个任务,得到 candidate 的 user_action_id。 + 4. 把两个 user_action_id 写进 V2.1 experiment manifest。 + 5. 运行 V2.1 runner,系统自动生成 run、score、compare report、risk verdict 和探索复盘提示。 + 6. 最后你根据数据判断:这个改动是不是更便宜、有没有破坏主链路、有没有引入 recovery、有没有让 turn 数或 subagent 数异 + 常。 + + 一句话:V2.1 把“我感觉这次改动不错”变成“这次改动在固定 scenario 下,有 trace 证据、有指标、有对比、有门禁结论”。 + + 设计思路 + + V2.1 的核心设计是“先绑定事实,再做判断”。因为当前还没有稳定的 headless harness execution adapter,所以系统故意不自动执 + 行 harness,避免把不可靠的自动运行伪装成正式评测。现在的边界很清楚:V1 负责采集真实证据,V2.1 负责把这些证据组织成实 + 验。 + + ——— + + 一、V2.1 的总体架构 + + 当前系统可以理解成 6 层: + + | 层级 | 作用 | 当前实现 | + | --- | --- | --- | + | V1 证据层 | 保存真实 agent 运行轨迹 | .observability/observability_v1.duckdb | + | 绑定层 | 把 V1 user_action_id 变成 V2 run | v2_record_run.ts | + | 评测建模层 | 定义 scenario、variant、experiment、score-spec、gate | tests/evals/v2/** | + | 评分层 | 根据 V1 证据计算 score | v2_score_registry.ts | + | 对比层 | 比较 baseline 和 candidate | v2_compare_runs.ts | + | 门禁层 | 判断 candidate 是否可接受 | v2_run_experiment.ts | + + 关键代码入口: + + | 模块 | 位置 | 职责 | + | --- | --- | --- | + | 类型定义 | src/observability/v2/evalTypes.ts:31 | 定义 scenario、variant、run、score | + | 实验类型定义 | src/observability/v2/evalExperimentTypes.ts:18 | 定义 score-spec、gate、action binding、experiment | + | manifest 校验 | scripts/evals/v2_validate_manifests.ts:159 | 检查配置文件是否合法 | + | run 绑定 | scripts/evals/v2_record_run.ts:272 | 从 V1 DuckDB 读取证据,生成 V2 run 和 scores | + | scorer registry | scripts/evals/v2_score_registry.ts:45 | 明确 score_spec_id -> scorer implementation | + | 实验 runner | scripts/evals/v2_run_experiment.ts:507 | 批量跑 baseline/candidate、比较、gate、summary | + | 对比报告 | 
scripts/evals/v2_compare_runs.ts:1 | 生成 baseline vs candidate 的 score delta | + | artifact 校验 | scripts/evals/v2_validate_experiment_artifacts.ts:1 | 校验 experiment summary 顶层 schema | + | 回归验证 | scripts/evals/v2_verify_bind_runner.ts:1 | 覆盖 9 类 V2.1 runner 用例 | + + ——— + + 二、V2.1 的核心抽象 + + scenario 是“要评测的任务”。它不是一次真实运行,而是一类固定任务,比如 cost_sensitive_task。它包含任务说明、预期工具、 + 预期 skill、最大 turn 数、最大 token 预算、最大 subagent 数等字段,定义在 src/observability/v2/evalTypes.ts:31。 + + variant 是“某一套 harness 配置或代码状态”。比如 baseline_default 是默认版本,candidate_session_memory_sparse 是候选改 + 动版本。它描述改动属于 harness、skill、tool、model 还是 mixed,定义在 src/observability/v2/evalTypes.ts:48。 + + user_action_id 是 V1 的真实用户动作 ID。它代表一次真实运行入口,是 V1 证据的主索引。V2.1 不直接相信“你说这是一次实 + 验”,而是要求它能在 V1 DuckDB 里查到对应的 user_actions 和 main_thread root query。 + + action_binding 是 V2.1 的关键桥梁。它把 scenario_id + variant_id 绑定到某个真实 entry_user_action_id。也就是说,它声 + 明:“这条 V1 trace 是这个 scenario 在这个 variant 下跑出来的证据”。 + + run 是 V2 对一次绑定后的运行记录。它不是重新执行 agent,而是把一个 V1 user_action_id 包装成一个有 scenario、variant、 + root query、DB 证据引用的评测 run,定义在 src/observability/v2/evalTypes.ts:59。 + + score_spec 是“正式指标定义”。它声明某个指标的维度、方向、公式描述、数据来源、证据要求和自动化级别,定义在 src/ + observability/v2/evalExperimentTypes.ts:18。 + + scorer 是“指标计算实现”。最新版本已经把 scorer 从 v2_record_run.ts 中抽出到 scripts/evals/v2_score_registry.ts:45,形 + 成明确的 score_spec_id -> scorer implementation 映射。 + + gate_policy 是“是否接受 candidate 的规则”。它不直接计算分数,只解释 baseline 和 candidate 的分数差异,定义在 src/ + observability/v2/evalExperimentTypes.ts:44。 + + ——— + + 三、完整数据流 + + 一次 V2.1 实验的真实流程是: + + 真实运行 baseline + ↓ + V1 产生 baseline user_action_id + ↓ + 真实运行 candidate + ↓ + V1 产生 candidate user_action_id + ↓ + 在 experiment manifest 里填写 action_bindings + ↓ + validator 检查 scenario / variant / score-spec / gate / binding + ↓ + runner 调用 v2_record_run.ts 生成 baseline run + ↓ + runner 调用 v2_record_run.ts 生成 candidate run + ↓ + score registry 计算正式 scores + ↓ + compare_runs 生成 baseline vs candidate 对比报告 + ↓ + gate policy 判断 pass / warning / fail / 
inconclusive + ↓ + 生成 experiment-level JSON summary 和 Markdown 报告 + + 这里最重要的是:V2.1 不创造事实,只解释事实。事实来自 V1 DuckDB。 + + ——— + + 四、v2_record_run.ts 具体做什么 + + scripts/evals/v2_record_run.ts:272 是“把一条 V1 trace 变成一个 V2 run”的核心脚本。 + + 它会读取参数: + + bun run scripts/evals/v2_record_run.ts --scenario cost_sensitive_task --variant baseline_default --user-action-id <user_action_id> + --snapshot-db + + 它会查询 V1 DuckDB: + + | V1 表 | 用途 | + | --- | --- | + | user_actions | 找到这次用户动作的总成本、时延、query 数、tool 数、subagent 数 | + | queries | 找到 main_thread root query | + | tools | 汇总 tool 使用次数、关闭情况、失败情况 | + | subagents | 汇总 subagent reason、trigger、数量、平均时延 | + | recoveries | 判断是否发生 recovery | + | metrics_integrity_daily | 获取当日完整性/闭合度事实 | + + 它有一个硬性要求:必须能找到 agent_name = 'main_thread' 的 root query。如果找不到,它会报错,不允许进入正式 score/ + compare/gate,错误逻辑在 scripts/evals/v2_record_run.ts:313。 + + 这个约束很重要。它保证 V2 run 不是孤立的 token 数字,而是能绑定到一次主链路执行。 + + 输出包括: + + | 输出 | 位置 | + | --- | --- | + | run JSON | tests/evals/v2/runs/*.json | + | score JSON | tests/evals/v2/scores/*.scores.json | + | run Markdown 报告 | ObservrityTask/10-系统版本/v2/06-运行报告/*.md | + + ——— + + 五、当前正式指标有哪些 + + 当前默认正式 score-spec 在 tests/evals/v2/score-specs/default-v2-1.score-specs.json。 + + | 指标 | 维度 | 含义 | 方向 | + | --- | --- | --- | --- | + | task_success.main_chain_observed | 任务完成度代理 | 是否存在 main_thread root query | 越高越好 | + | efficiency.total_billed_tokens | 效率 | V1 user_actions.total_billed_tokens | 越低越好 | + | decision_quality.subagent_count_observed | 决策质量代理 | 观察到的 subagent 数 | 越低通常越好 | + | stability.recovery_absence | 稳定性 | 没有 recovery 为 1,有 recovery 为 0 | 越高越好 | + | controllability.turn_limit_basic | 可控性 | root query turn 数是否不超过 scenario 限制 | 越高越好 | + + 这里要注意:这些指标是 V2.1 的第一批“trace-backed 自动指标”,不是最终的智能程度评分。它们更像基础体征:主链路有没有、 + 成本多少、有没有 recovery、turn 是否失控、subagent 是否异常。 + + ——— + + 六、scorer registry 的意义 + + 最新版本已经新增 scripts/evals/v2_score_registry.ts:45。 + + 以前的问题是:score-spec 声明“我要这些分数”,但具体公式藏在 v2_record_run.ts 里。现在变成: + + score-spec 声明正式指标 + ↓ + validator 检查该 
score_spec_id 是否有 scorer + ↓ + record_run 按 score_spec_id 调用 registry + ↓ + 只生成被 experiment 声明的正式 score + + 这个设计让 V2.1 从“脚本能跑”升级为“指标 contract 可维护”。 + + 当前 registry 里还实现了一些辅助 scorer,例如: + + | 辅助 scorer | 当前状态 | + | --- | --- | + | decision_quality.expected_tool_hit_rate | registry 中已有,但默认 score-spec 未正式启用 | + | efficiency.total_billed_token_budget | registry 中已有,但默认 score-spec 未正式启用 | + | stability.v1_closure_health | registry 中已有,但默认 score-spec 未正式启用 | + | controllability.subagent_count_budget | registry 中已有,但默认 score-spec 未正式启用 | + + 如果以后要把这些变成正式指标,需要把它们加入 score-spec 文件,再放进 experiment 的 score_spec_ids。 + + ——— + + 七、manifest validator 做什么 + + scripts/evals/v2_validate_manifests.ts:404 是 V2.1 的配置安全网。 + + 运行: + + bun run scripts/evals/v2_validate_manifests.ts + + 它会检查: + + | 检查项 | 意义 | + | --- | --- | + | scenario 是否存在 | 防止 experiment 引用不存在的任务 | + | variant 是否存在 | 防止 candidate 配置写错 | + | score-spec 是否存在 | 防止请求不存在的指标 | + | score-spec 是否有 scorer | 防止“声明了指标但没人会算” | + | gate-policy 是否存在 | 防止门禁配置失效 | + | bind_existing 是否覆盖所有 scenario × variant | 防止某个 candidate 没有 V1 证据 | + | action id 是否还是 placeholder | 防止忘记替换模板值 | + + 最新的 scorer 校验在 scripts/evals/v2_validate_manifests.ts:368。这一步非常关键,因为它把 V2.1 的指标 contract 固化 + 了。 + + ——— + + 八、experiment runner 做什么 + + scripts/evals/v2_run_experiment.ts:507 是 V2.1 的总调度器。 + + 你运行: + + bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default + + 它会做这些事: + + 1. 读取 tests/evals/v2/experiments/session_memory_sparse_vs_default.json。 + 2. 确认 mode 是 bind_existing。 + 3. 如果 mode 是 execute_harness,立即报错并退出。 + 4. 检查每个 scenario 和每个 variant 是否都有 action_binding。 + 5. 对 baseline 调用 v2_record_run.ts。 + 6. 对每个 candidate 调用 v2_record_run.ts。 + 7. 读取 baseline 和 candidate 的 scores。 + 8. 调用 v2_compare_runs.ts 生成对比报告。 + 9. 用 gate policy 计算每个 candidate 的 regression-risk gate result。 + 10. 
汇总成 experiment-level JSON summary 和 Markdown report。 + + execute_harness 当前明确阻塞,逻辑在 scripts/evals/v2_run_experiment.ts:516。这不是 bug,而是设计边界。 + + ——— + + 九、risk verdict 如何判断回归风险 + + 当前默认 gate policy 在 tests/evals/v2/gates/default_v2_1_gate.json。 + + 规则大致是: + + | 规则 | 类型 | 含义 | + | --- | --- | --- | + | task_success.main_chain_observed candidate < baseline | hard fail | candidate 不能丢失主链路成功信号 | + | efficiency.total_billed_tokens regression > 30 and task_success_not_improved | hard fail | 成本大涨且成功信号没变 + 好,不可接受 | + | efficiency.total_billed_tokens regression > 10 | soft warning | 成本上涨超过 10%,需要注意 | + | decision_quality.subagent_count_observed regression > 50 | soft warning | subagent 数大幅增加,需要注意 | + + gate 的聚合逻辑在 scripts/evals/v2_run_experiment.ts:374。 + + 最终 risk_verdict.status 有 4 种: + + | status | 含义 | + | --- | --- | + | pass | 没有 hard fail、warning、missing、inconclusive | + | warning | 没有 hard fail,但有 soft warning | + | fail | 至少一个 hard fail | + | inconclusive | 没有 hard fail,但存在 missing score 或无法判断 | + + 这套设计是保守的。证据缺失不会被当作通过,而是 inconclusive。 + + 但 risk_verdict 不是最终实验结论。它只回答“这个 candidate 是否触发当前 gate policy 已知的回归风险”。它不能回答 + harness 是否更聪明、是否有探索价值、是否应该长期保留。旧字段 gate_verdict 暂时保留为兼容别名。 + + ——— + + 十、当前样例实验怎么理解 + + 当前样例 manifest 是: + + tests/evals/v2/experiments/session_memory_sparse_vs_default.json + + 它表达的是: + + 实验目标:评估 sparse session memory 是否能降低成本,同时不破坏任务成功 + baseline:baseline_default + candidate:candidate_session_memory_sparse + scenario:cost_sensitive_task + mode:bind_existing + baseline action:1d5eb5e1-2fe0-42fa-9450-7b05d6367976 + candidate action:dbf9fae1-0a5a-4f50-aba7-02047ced9390 + + 这个实验不是 mock。它绑定的是现有 V1 DuckDB 中真实存在的 user_action_id。runner 做的是把这些 trace 转换为 V2 run、 + score 和 comparison。 + + ——— + + 十一、你应该如何使用 V2.1 + + 最标准流程如下。 + + 第一步,选一个 scenario。可以先用现有的: + + tests/evals/v2/scenarios/cost_sensitive_task.json + + 第二步,确认 baseline variant 和 candidate variant。可以先用现有的: + + tests/evals/v2/variants/baseline.template.json + 
tests/evals/v2/variants/candidate_session_memory_sparse.json + + 第三步,真实运行 baseline。也就是不要改 harness,发送 scenario 里的任务 prompt,让 V1 记录这次运行。 + + 第四步,拿到 baseline 的 user_action_id。可以从 dashboard、V1 action report,或者 DuckDB 查询最新记录。 + + tools\duckdb\duckdb.exe -csv .observability\observability_v1.duckdb "SELECT user_action_id, started_at, + total_billed_tokens FROM user_actions ORDER BY started_at DESC LIMIT 10;" + + 第五步,修改 harness。比如减少某段 memory 注入、调整 tool 路由、改变 skill 触发策略。 + + 第六步,真实运行 candidate。用尽量相同的 prompt,再拿到 candidate 的 user_action_id。 + + 第七步,编辑 experiment manifest。核心是填: + + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "" + } + ] + + 第八步,运行 validator: + + bun run scripts/evals/v2_validate_manifests.ts + + 第九步,运行 experiment: + + bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default + + 第十步,看结果。优先看: + + | 输出 | 用途 | + | --- | --- | + | tests/evals/v2/experiment-runs/*.json | 机器可读的实验总结果 | + | ObservrityTask/10-系统版本/v2/06-运行报告/experiment_*.md | 人工阅读的实验摘要 | + | ObservrityTask/10-系统版本/v2/06-运行报告/compare_*.md | baseline vs candidate 指标对比 | + | tests/evals/v2/runs/*.json | 单次 run 的证据绑定详情 | + | tests/evals/v2/scores/*.scores.json | 单次 run 的正式分数 | + + ——— + + 十二、如何解读实验报告 + + 先看 experiment-runs/*.json 顶层字段。 + + | 字段 | 含义 | + | --- | --- | + | experiment_id | 这次实验是谁 | + | manifest_ref | 用的是哪个 manifest | + | mode | 当前应为 bind_existing | + | run_refs | 生成了哪些 V2 run | + | score_refs | 生成了哪些 score artifact | + | report_refs | 生成了哪些 Markdown report | + | risk_verdict | 回归风险门禁结果,不是最终实验判断 | + | gate_verdict | 兼容旧脚本的别名;新流程优先看 risk_verdict | + | verdict_boundary | 明确说明 verdict 只代表 regression risk | + | scorecard_summary | baseline vs candidate 的多指标变化摘要 | + | exploration_signals | 自动提取的探索复盘提示 | + | recommended_review_mode | 建议用回归、人工或探索模式复盘 | + | 
final_decision | 人类最终决策;runner 默认保持 null | + | errors | hard fail 摘要 | + | warnings | soft warning、missing、inconclusive 摘要 | + + 然后看 risk_verdict: + + { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result..." + } + + 如果 status = pass,说明 candidate 在当前规则下没有明显退化。如果 status = warning,说明不是直接失败,但有成本或结构异 + 常。如果 status = fail,说明 candidate 触发硬性退化。如果 status = inconclusive,说明证据或规则不足,不能轻易说它通 + 过。 + + 接下来还要看 scorecard_summary 和 exploration_signals。一个 candidate 可以在 risk_verdict 上是 warning,但仍然因为能力 + 路径、工具选择、subagent 分支或结果质量变化而值得进入 exploratory_review。 + + ——— + + 十三、V2.1 当前的抽象能力 + + V2.1 已经把你的系统从“日志查看器”推进到了“实验平台雏形”。 + + 第一种抽象能力是“任务抽象”。scenario 把一次自然语言任务变成可重复讨论的评测对象。 + + 第二种抽象能力是“改动抽象”。variant 把 harness、skill、tool、model、mixed 改动统一成可比较对象。 + + 第三种抽象能力是“证据绑定抽象”。action_binding 让每个实验 run 都能追溯到 V1 的真实 user_action_id,避免实验结果脱离运 + 行事实。 + + 第四种抽象能力是“指标抽象”。score-spec 负责声明指标,scorer registry 负责实现指标,二者分离后,指标体系可以逐步扩展。 + + 第五种抽象能力是“对比抽象”。baseline 和 candidate 不再只是两次日志,而是同一 scenario 下两个 variant 的 score delta。 + + 第六种抽象能力是“风险门禁抽象”。gate policy 把“是否触发已知回归风险”从主观判断变成规则判断,但不替代人的最终实验 + 判断。 + + 第七种抽象能力是“回归抽象”。v2_verify_bind_runner.ts 用 9 个 case 检查 runner 的稳定性,保证 V2.1 自己不会悄悄坏掉。 + + ——— + + 十四、V2.1 已经验证了什么 + + 当前回归验证覆盖 9 类情况: + + | case | 目的 | + | --- | --- | + | 单 scenario + 单 candidate | 最小实验闭环 | + | 单 scenario + 多 candidate | 多候选对比 | + | 多 scenario + 单 candidate | 多任务评测 | + | 缺失 action binding | 必须报错 | + | 不存在的 user_action_id | 必须报错 | + | root query 缺失 | 必须阻止进入正式评分 | + | 不存在的 score_spec_id | 必须报错 | + | 不存在的 gate_policy_id | 必须报错 | + | execute_harness mode | 必须明确阻塞 | + + 最近一次验证结果是 9/9 通过。 + + ——— + + 十五、当前边界和不足 + + V2.1 现在不是自动化 benchmark runner。你仍然需要自己真实运行 baseline 和 candidate,再把 user_action_id 绑定进 + manifest。 + + V2.1 现在的 repeat_count 在 bind_existing 模式下不是“重复执行 
harness”。它只是基于同一组绑定 trace 重复生成评测 + artifact,不能代表统计意义上的多次独立实验。 + + V2.1 现在没有判断“最终回答质量”。task_success.main_chain_observed 只是主链路存在的 trace-backed 成功代理,不等于真正完 + 成了任务。未来需要人工评分、规则评分或模型裁判补上。 + + V2.1 现在没有正式展开 tool/skill 使用质量指标。虽然已有 expected_tool_hit_rate 这样的辅助 scorer,但默认 score-spec 还 + 没有启用它。 + + V2.1 现在没有自动应用 variant。比如 candidate_session_memory_sparse 只是一个 variant manifest,系统不会自动帮你改代码 + 或切配置。 + + V2.1 现在不会自动捕获“刚刚这次运行就是 candidate”。这正是未来 execute_harness 要解决的问题。 + + ——— + + 十六、我对当前 V2.1 的一句话评价 + + 当前 V2.1-stable 已经完成了“可观测证据 -> 评测 run -> 指标分数 -> baseline/candidate 对比 -> risk verdict + 探索复盘提示”的基础闭环。它 + 还不是全自动实验室,但已经是一个可信的本地 harness 实验平台地基。 + + 最重要的是,它现在的可信点在于:每个 V2 run 都必须绑定 V1 的真实 user_action_id 和 main_thread root query。也就是说, + V2.1 的每个结论都能回到“真实发生过的一次 agent 运行”。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" deleted file mode 100644 index 897dc42844..0000000000 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" +++ /dev/null @@ -1,37 +0,0 @@ -# V2 Run Comparison - -## 理解清单 - -- baseline_run: run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1 -- candidate_run: run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 -- scenario: 
cost_sensitive_task -- baseline_variant: baseline_default -- candidate_variant: candidate_session_memory_sparse - -## 预期效果 - -This report compares two V2 runs using score artifacts generated from V1 observability evidence. - -## 设计思路 - -Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. - -## Summary - -- regression_count: 0 -- baseline_user_action_id: 1d5eb5e1-2fe0-42fa-9450-7b05d6367976 -- candidate_user_action_id: dbf9fae1-0a5a-4f50-aba7-02047ced9390 - -## Score Deltas - -| score | baseline | candidate | delta | verdict | -| --- | ---: | ---: | ---: | --- | -| controllability.subagent_count_budget | 0 | 1 | 1 | improved | -| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | -| decision_quality.expected_tool_hit_rate | 1 | 1 | 0 | unchanged | -| decision_quality.subagent_count_observed | 4 | 2 | -2 | improved | -| efficiency.total_billed_token_budget | 0 | 0 | 0 | unchanged | -| efficiency.total_billed_tokens | 400399 | 352691 | -47708 | improved | -| stability.recovery_absence | 1 | 1 | 0 | unchanged | -| stability.v1_closure_health | 1 | 1 | 0 | unchanged | -| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" similarity index 89% rename from 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" index dfb110adb7..0860659e47 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" @@ -2,8 +2,8 @@ ## 理解清单 -- baseline_run: run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 -- candidate_run: run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 +- baseline_run: run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1 +- candidate_run: run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 - scenario: cost_sensitive_task - baseline_variant: baseline_default - candidate_variant: candidate_session_memory_sparse diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" deleted file mode 100644 index 847626dfd0..0000000000 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" +++ /dev/null @@ -1,39 +0,0 @@ -# V2.1 Experiment Summary: session_memory_sparse_vs_default - -## 理解清单 - -- experiment: session_memory_sparse_vs_default -- mode: bind_existing -- baseline_variant: baseline_default -- candidate_variants: candidate_session_memory_sparse -- scenario_count: 1 -- output_json: tests\evals\v2\experiment-runs\session_memory_sparse_vs_default_2026-04-27T105524752Z.json - -## 预期效果 - -This summary records a manifest-driven V2.1 experiment run. In bind-existing mode, every generated V2 run is backed by an existing V1 user_action_id. - -## 设计思路 - -V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs the existing scorer and comparison scripts. 
- -## Verdict - -- hard_failures: 0 -- soft_warnings: 0 -- gate_status: passed - -## Runs - -| scenario | repeat | baseline_run | candidate_variant | candidate_run | gate | compare_report | -| --- | ---: | --- | --- | --- | --- | --- | -| cost_sensitive_task | 1 | run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1 | candidate_session_memory_sparse | run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 | 0/4 failed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md | - -## Gate Results - -| scenario | candidate_variant | rule_type | score_spec | result | regression_pct | -| --- | --- | --- | --- | --- | ---: | -| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | -| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | -| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | -| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" deleted file mode 100644 index c41eaea0b5..0000000000 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" +++ /dev/null @@ -1,41 +0,0 @@ -# 
V2.1 Experiment Summary: session_memory_sparse_vs_default - -## 理解清单 - -- experiment: session_memory_sparse_vs_default -- mode: bind_existing -- baseline_variant: baseline_default -- candidate_variants: candidate_session_memory_sparse -- scenario_count: 1 -- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic -- gate_policy: default_v2_1_gate -- output_json: tests\evals\v2\experiment-runs\session_memory_sparse_vs_default_2026-04-28T162912802Z.json - -## 预期效果 - -This summary records a manifest-driven V2.1 experiment run. In bind-existing mode, every generated V2 run is backed by an existing V1 user_action_id. - -## 设计思路 - -V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs the existing scorer and comparison scripts. - -## Verdict - -- hard_failures: 0 -- soft_warnings: 0 -- gate_status: passed - -## Runs - -| scenario | repeat | baseline_run | candidate_variant | candidate_run | gate | compare_report | -| --- | ---: | --- | --- | --- | --- | --- | -| cost_sensitive_task | 1 | run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 | candidate_session_memory_sparse | run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 | 0/4 failed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md | - -## Gate Results - -| scenario | candidate_variant | rule_type | score_spec | result | regression_pct | -| --- | --- | --- | --- | --- | ---: | -| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | -| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | -| 
cost_sensitive_task | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | -| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-30T021206270Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-30T021206270Z.md" new file mode 100644 index 0000000000..7ea8496a56 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_sparse_vs_default_2026-04-30T021206270Z.md" @@ -0,0 +1,61 @@ +# V2.1 Experiment Summary: session_memory_sparse_vs_default + +## 理解清单 + +- experiment: session_memory_sparse_vs_default +- mode: bind_existing +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\session_memory_sparse_vs_default_2026-04-30T021206270Z.json + +## 预期效果 + +This summary records a manifest-driven V2.1 experiment run. In bind-existing mode, every generated V2 run is backed by an existing V1 user_action_id. + +## 设计思路 + +V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs scorer, comparison, and regression-risk gate scripts. 
+ +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 0 +- risk_status: passed +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: regression_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| cost_sensitive_task | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| cost_sensitive_task | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 4 | 2 | -2 | improved | +| cost_sensitive_task | candidate_session_memory_sparse | efficiency.total_billed_tokens | 400399 | 352691 | -47708 | improved | +| cost_sensitive_task | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| cost_sensitive_task | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- 2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer. 
+ +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | +| cost_sensitive_task | 1 | run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1 | candidate_session_memory_sparse | run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| cost_sensitive_task | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" deleted file mode 100644 index 4cd9341bfa..0000000000 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" +++ /dev/null @@ -1,56 +0,0 @@ -# V2 Run Report: 
run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1 - -## 理解清单 - -- scenario: cost_sensitive_task (Cost Sensitive Task) -- variant: baseline_default (Baseline Default) -- user_action_id: 1d5eb5e1-2fe0-42fa-9450-7b05d6367976 -- root_query_id: 15ecf197-b1c6-47e2-8d94-df1f88f0d822 -- observability_db_ref: .observability\observability_v1.duckdb - -## 预期效果 - -This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. - -## 设计思路 - -The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. - -## V1 Evidence - -- started_at: 2026-04-24T04:48:30.824Z -- duration_ms: 88207 -- query_count: 5 -- subagent_count: 4 -- tool_call_count: 22 -- total_prompt_input_tokens: 397412 -- total_billed_tokens: 400399 -- root_turn_count: 4 -- root_terminal_reason: completed -- recovery_count: 0 - -## Tools - -- Edit: count=11, closed=11, failed=0 -- Read: count=5, closed=5, failed=0 -- Glob: count=3, closed=3, failed=0 -- Write: count=3, closed=3, failed=0 - -## Subagents - -- session_memory: count=1, trigger=token_threshold_and_tool_threshold -- prompt_suggestion: count=1, trigger=suggestion_generation_allowed -- extract_memories: count=1, trigger=post_turn_background_extraction -- session_memory: count=1, trigger=token_threshold_and_natural_break - -## Scores - -- task_success.main_chain_observed: pass (1) -- decision_quality.expected_tool_hit_rate: pass (1) -- efficiency.total_billed_tokens: observed (400399) -- efficiency.total_billed_token_budget: fail (0) -- stability.v1_closure_health: pass (1) -- stability.recovery_absence: pass (1) -- controllability.turn_limit_basic: pass (1) -- decision_quality.subagent_count_observed: observed (4) -- controllability.subagent_count_budget: fail (0) diff --git 
"a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" deleted file mode 100644 index d72655f20b..0000000000 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" +++ /dev/null @@ -1,53 +0,0 @@ -# V2 Run Report: run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 - -## 理解清单 - -- scenario: cost_sensitive_task (Cost Sensitive Task) -- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) -- user_action_id: dbf9fae1-0a5a-4f50-aba7-02047ced9390 -- root_query_id: f15ca52c-e702-448a-9cd8-8d5c942ff4e2 -- observability_db_ref: .observability\observability_v1.duckdb - -## 预期效果 - -This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. - -## 设计思路 - -The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
- -## V1 Evidence - -- started_at: 2026-04-24T04:55:36.952Z -- duration_ms: 46081 -- query_count: 3 -- subagent_count: 2 -- tool_call_count: 15 -- total_prompt_input_tokens: 348534 -- total_billed_tokens: 352691 -- root_turn_count: 4 -- root_terminal_reason: completed -- recovery_count: 0 - -## Tools - -- Read: count=8, closed=8, failed=0 -- Edit: count=5, closed=5, failed=0 -- Glob: count=2, closed=2, failed=0 - -## Subagents - -- extract_memories: count=1, trigger=post_turn_background_extraction -- session_memory: count=1, trigger=token_threshold_and_tool_threshold - -## Scores - -- task_success.main_chain_observed: pass (1) -- decision_quality.expected_tool_hit_rate: pass (1) -- efficiency.total_billed_tokens: observed (352691) -- efficiency.total_billed_token_budget: fail (0) -- stability.v1_closure_health: pass (1) -- stability.recovery_absence: pass (1) -- controllability.turn_limit_basic: pass (1) -- decision_quality.subagent_count_observed: observed (2) -- controllability.subagent_count_budget: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" similarity index 96% rename from "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" index b488e5d832..9e4b1c3b93 100644 --- 
"a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.md" @@ -1,4 +1,4 @@ -# V2 Run Report: run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1 +# V2 Run Report: run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1 ## 理解清单 @@ -36,20 +36,20 @@ The report does not judge final answer quality by itself. It records trace-backe - Edit: count=11, closed=11, failed=0 - Read: count=5, closed=5, failed=0 -- Glob: count=3, closed=3, failed=0 - Write: count=3, closed=3, failed=0 +- Glob: count=3, closed=3, failed=0 ## Subagents - prompt_suggestion: count=1, trigger=suggestion_generation_allowed -- session_memory: count=1, trigger=token_threshold_and_tool_threshold - extract_memories: count=1, trigger=post_turn_background_extraction - session_memory: count=1, trigger=token_threshold_and_natural_break +- session_memory: count=1, trigger=token_threshold_and_tool_threshold ## Scores - task_success.main_chain_observed: pass (1) - efficiency.total_billed_tokens: observed (400399) +- decision_quality.subagent_count_observed: observed (4) - stability.recovery_absence: pass (1) - controllability.turn_limit_basic: pass (1) -- decision_quality.subagent_count_observed: observed (4) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" similarity index 95% rename from "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" rename to "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" index e776a76bf0..29118c811c 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md" @@ -1,4 +1,4 @@ -# V2 Run Report: run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 +# V2 Run Report: run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1 ## 理解清单 @@ -40,13 +40,13 @@ The report does not judge final answer quality by itself. 
It records trace-backe ## Subagents -- extract_memories: count=1, trigger=post_turn_background_extraction - session_memory: count=1, trigger=token_threshold_and_tool_threshold +- extract_memories: count=1, trigger=post_turn_background_extraction ## Scores - task_success.main_chain_observed: pass (1) - efficiency.total_billed_tokens: observed (352691) +- decision_quality.subagent_count_observed: observed (2) - stability.recovery_absence: pass (1) - controllability.turn_limit_basic: pass (1) -- decision_quality.subagent_count_observed: observed (2) diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index 77201991ba..8163b3888b 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -20,6 +20,9 @@ interface CandidateExperimentResult { candidate_user_action_id: string compare_report: string gate_results: GateResult[] + scorecard_summary: ScorecardItem[] + exploration_signals: string[] + recommended_review_mode: ReviewMode } interface ScenarioExperimentResult { @@ -44,13 +47,39 @@ interface GateResult { notes?: string } -interface GateVerdict { +interface RiskVerdict { status: 'pass' | 'warning' | 'fail' | 'inconclusive' + scope: 'regression_risk_only' + is_final_experiment_judgment: false hard_fail_count: number soft_warning_count: number missing_score_count: number inconclusive_count: number candidate_count: number + notes: string +} + +type ReviewMode = + | 'regression_review' + | 'manual_review' + | 'exploratory_review' + +interface ScorecardItem { + scenario_id: string + candidate_variant_id: string + score_spec_id: string + direction: EvalScoreSpec['direction'] | 'unknown' + baseline_value: number | null + candidate_value: number | null + delta: number | null + interpretation: + | 'improved' + | 'regressed' + | 'unchanged' + | 'changed' + | 'missing' + | 'observed' + | 'not_applicable' } const repoRoot = path.resolve(import.meta.dirname, '..', '..') @@ -216,6 +245,136 @@ function 
valueFor(scores: EvalScore[], scoreSpecId: string): number | null { return score?.score_value ?? null } +function scorecardItem(params: { + scenarioId: string + candidateVariantId: string + scoreSpecId: string + spec: EvalScoreSpec | undefined + baselineValue: number | null + candidateValue: number | null +}): ScorecardItem { + const { + scenarioId, + candidateVariantId, + scoreSpecId, + spec, + baselineValue, + candidateValue, + } = params + const delta = + baselineValue === null || candidateValue === null + ? null + : Number((candidateValue - baselineValue).toFixed(6)) + let interpretation: ScorecardItem['interpretation'] = 'not_applicable' + if (baselineValue === null || candidateValue === null) { + interpretation = 'missing' + } else if (delta === 0) { + interpretation = 'unchanged' + } else if (!spec || spec.direction === 'observed_only') { + interpretation = 'observed' + } else if (spec.direction === 'lower_is_better') { + interpretation = candidateValue < baselineValue ? 'improved' : 'regressed' + } else if (spec.direction === 'higher_is_better' || spec.direction === 'boolean_pass') { + interpretation = candidateValue > baselineValue ? 'improved' : 'regressed' + } else { + interpretation = 'changed' + } + return { + scenario_id: scenarioId, + candidate_variant_id: candidateVariantId, + score_spec_id: scoreSpecId, + direction: spec?.direction ?? 
'unknown', + baseline_value: baselineValue, + candidate_value: candidateValue, + delta, + interpretation, + } +} + +function buildScorecardSummary(params: { + scenarioId: string + candidateVariantId: string + scoreSpecs: Map<string, EvalScoreSpec> + baselineScores: EvalScore[] + candidateScores: EvalScore[] +}): ScorecardItem[] { + const { + scenarioId, + candidateVariantId, + scoreSpecs, + baselineScores, + candidateScores, + } = params + const scoreSpecIds = [ + ...new Set([ + ...baselineScores.map(scoreKey), + ...candidateScores.map(scoreKey), + ]), + ].sort() + return scoreSpecIds.map(scoreSpecId => + scorecardItem({ + scenarioId, + candidateVariantId, + scoreSpecId, + spec: scoreSpecs.get(scoreSpecId), + baselineValue: valueFor(baselineScores, scoreSpecId), + candidateValue: valueFor(candidateScores, scoreSpecId), + }), + ) +} + +function buildExplorationSignals(params: { + scorecard: ScorecardItem[] + gateResults: GateResult[] +}): string[] { + const { scorecard, gateResults } = params + const signals: string[] = [] + const changedScores = scorecard.filter(item => + ['improved', 'regressed', 'changed', 'observed'].includes(item.interpretation), + ) + const improvedScores = scorecard.filter(item => item.interpretation === 'improved') + const regressedScores = scorecard.filter(item => item.interpretation === 'regressed') + const hardOrSoftGateResults = gateResults.filter(result => + result.verdict === 'hard_fail' || result.verdict === 'soft_warning', + ) + + if (changedScores.length > 0) { + signals.push( + `${changedScores.length} score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.`, + ) + } + if (improvedScores.length > 0 && regressedScores.length > 0) { + signals.push( + 'Candidate shows a tradeoff pattern: at least one score improved while another regressed.', + ) + } + if (hardOrSoftGateResults.length > 0 && improvedScores.length > 0) { + signals.push( + 'Risk gate raised a warning/failure, but at least one score improved; this
may be worth exploratory review instead of immediate rejection.', + ) + } + if (signals.length === 0) { + signals.push( + 'No exploratory signal was derived from the current automatic scorecard; manual review may still find qualitative differences.', + ) + } + return signals +} + +function recommendReviewMode(params: { + scorecard: ScorecardItem[] + gateResults: GateResult[] +}): ReviewMode { + const { scorecard, gateResults } = params + const hasRisk = gateResults.some(result => result.verdict !== 'pass') + const hasTradeoff = + scorecard.some(item => item.interpretation === 'improved') && + scorecard.some(item => item.interpretation === 'regressed') + if (hasTradeoff) return 'exploratory_review' + if (hasRisk) return 'manual_review' + return 'regression_review' +} + function regressionPct(params: { baselineValue: number | null candidateValue: number | null @@ -371,7 +530,7 @@ function buildRecordRunArgs(params: { return args } -function summarizeGate(results: ScenarioExperimentResult[]): GateVerdict { +function summarizeRisk(results: ScenarioExperimentResult[]): RiskVerdict { const candidates = results.flatMap(result => result.candidates) const allGateResults = candidates.flatMap(candidate => candidate.gate_results) const hardFailCount = allGateResults.filter(result => result.verdict === 'hard_fail').length @@ -387,14 +546,43 @@ function summarizeGate(results: ScenarioExperimentResult[]): GateVerdict { : softWarningCount > 0 ? 'warning' : 'pass', + scope: 'regression_risk_only', + is_final_experiment_judgment: false, hard_fail_count: hardFailCount, soft_warning_count: softWarningCount, missing_score_count: missingScoreCount, inconclusive_count: inconclusiveCount, candidate_count: candidates.length, + notes: + 'This verdict is only a regression-risk gate result. 
It is not a final judgment about model intelligence, harness value, or exploratory potential.', } } +function aggregateScorecard(results: ScenarioExperimentResult[]): ScorecardItem[] { + return results.flatMap(result => + result.candidates.flatMap(candidate => candidate.scorecard_summary), + ) +} + +function aggregateExplorationSignals(results: ScenarioExperimentResult[]): string[] { + return [ + ...new Set( + results.flatMap(result => + result.candidates.flatMap(candidate => candidate.exploration_signals), + ), + ), + ] +} + +function aggregateReviewMode(results: ScenarioExperimentResult[]): ReviewMode { + const modes = results.flatMap(result => + result.candidates.map(candidate => candidate.recommended_review_mode), + ) + if (modes.includes('exploratory_review')) return 'exploratory_review' + if (modes.includes('manual_review')) return 'manual_review' + return 'regression_review' +} + function runRefs(results: ScenarioExperimentResult[]): string[] { return results.flatMap(result => [ path.join('tests', 'evals', 'v2', 'runs', `${result.baseline_run_id}.json`), @@ -461,6 +649,16 @@ function buildMarkdownReport(params: { `| ${result.scenario_id} | ${result.candidate_variant_id} | ${result.rule_type} | ${result.score_spec_id} | ${result.verdict} | ${result.regression_pct ?? 'n/a'} |`, ) .join('\n') + const scorecardRows = aggregateScorecard(results) + .map( + item => + `| ${item.scenario_id} | ${item.candidate_variant_id} | ${item.score_spec_id} | ${item.baseline_value ?? 'n/a'} | ${item.candidate_value ?? 'n/a'} | ${item.delta ?? 'n/a'} | ${item.interpretation} |`, + ) + .join('\n') + const explorationRows = aggregateExplorationSignals(results) + .map(signal => `- ${signal}`) + .join('\n') + const reviewMode = aggregateReviewMode(results) return `# V2.1 Experiment Summary: ${experiment.experiment_id} @@ -481,22 +679,37 @@ This summary records a manifest-driven V2.1 experiment run. 
In bind-existing mod ## 设计思路 -V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs the existing scorer and comparison scripts. +V2.1 intentionally does not execute the harness automatically. It turns existing V1 traces into comparable V2 runs, then runs scorer, comparison, and regression-risk gate scripts. -## Verdict +## Risk Verdict - hard_failures: ${hardFailures.length} - soft_warnings: ${softWarnings.length} - missing_or_inconclusive: ${missingOrInconclusive.length} -- gate_status: ${hardFailures.length > 0 ? 'failed' : missingOrInconclusive.length > 0 ? 'inconclusive' : softWarnings.length > 0 ? 'warning' : 'passed'} +- risk_status: ${hardFailures.length > 0 ? 'failed' : missingOrInconclusive.length > 0 ? 'inconclusive' : softWarnings.length > 0 ? 'warning' : 'passed'} +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: ${reviewMode} + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. 
+ +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +${scorecardRows || '| n/a | n/a | n/a | n/a | n/a | n/a | n/a |'} + +## Exploration Signals + +${explorationRows || '- No exploration signal generated.'} ## Runs -| scenario | repeat | baseline_run | candidate_variant | candidate_run | gate | compare_report | +| scenario | repeat | baseline_run | candidate_variant | candidate_run | risk_gate | compare_report | | --- | ---: | --- | --- | --- | --- | --- | ${rows} -## Gate Results +## Risk Gate Details | scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | | --- | --- | --- | --- | --- | ---: | @@ -635,18 +848,36 @@ async function main(): Promise { candidateRunId, ]) + const gateResults = evaluateGate({ + scenarioId, + candidateVariantId, + gatePolicy, + scoreSpecs, + baselineScores, + candidateScores, + }) + const scorecard = buildScorecardSummary({ + scenarioId, + candidateVariantId, + scoreSpecs, + baselineScores, + candidateScores, + }) + candidates.push({ candidate_variant_id: candidateVariantId, candidate_run_id: candidateRunId, candidate_user_action_id: candidateActionId, compare_report: extractCreatedReport(compareOutput), - gate_results: evaluateGate({ - scenarioId, - candidateVariantId, - gatePolicy, - scoreSpecs, - baselineScores, - candidateScores, + gate_results: gateResults, + scorecard_summary: scorecard, + exploration_signals: buildExplorationSignals({ + scorecard, + gateResults, + }), + recommended_review_mode: recommendReviewMode({ + scorecard, + gateResults, }), }) } @@ -676,7 +907,10 @@ async function main(): Promise { ) const outputMarkdownRel = path.relative(repoRoot, outputMarkdownPath) const generatedAt = new Date().toISOString() - const gateVerdict = summarizeGate(results) + const riskVerdict = summarizeRisk(results) + const scorecardSummary = aggregateScorecard(results) + const explorationSignals = 
aggregateExplorationSignals(results) + const recommendedReviewMode = aggregateReviewMode(results) const warningMessages = results .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) .filter(result => result.verdict === 'soft_warning' || result.verdict === 'missing' || result.verdict === 'inconclusive') @@ -702,7 +936,14 @@ async function main(): Promise { run_refs: runRefs(results), score_refs: scoreRefs(results), report_refs: reportRefs(results, outputMarkdownRel), - gate_verdict: gateVerdict, + risk_verdict: riskVerdict, + gate_verdict: riskVerdict, + verdict_boundary: + 'risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.', + scorecard_summary: scorecardSummary, + exploration_signals: explorationSignals, + recommended_review_mode: recommendedReviewMode, + final_decision: null, errors: errorMessages, warnings: warningMessages, experiment, diff --git a/scripts/evals/v2_validate_experiment_artifacts.ts b/scripts/evals/v2_validate_experiment_artifacts.ts index f8e4e4666c..cbb737235f 100644 --- a/scripts/evals/v2_validate_experiment_artifacts.ts +++ b/scripts/evals/v2_validate_experiment_artifacts.ts @@ -29,6 +29,17 @@ function requireNumber(errors: string[], objectName: string, fieldName: string, } } +function requireOptionalString( + errors: string[], + filePath: string, + fieldName: string, + value: unknown, +) { + if (value !== undefined && typeof value !== 'string') { + errors.push(`${filePath}.${fieldName} must be a string when present`) + } +} + function validateArtifact(filePath: string, artifact: JsonRecord): string[] { const errors: string[] = [] requireString(errors, filePath, 'experiment_id', artifact.experiment_id) @@ -41,19 +52,39 @@ function validateArtifact(filePath: string, artifact: JsonRecord): string[] { requireArray(errors, filePath, 'errors', artifact.errors) requireArray(errors, filePath, 'warnings', artifact.warnings) - const gateVerdict = artifact.gate_verdict as JsonRecord 
| undefined - if (!gateVerdict || typeof gateVerdict !== 'object' || Array.isArray(gateVerdict)) { - errors.push(`${filePath}.gate_verdict must be an object`) + const riskVerdict = (artifact.risk_verdict ?? artifact.gate_verdict) as JsonRecord | undefined + if (!riskVerdict || typeof riskVerdict !== 'object' || Array.isArray(riskVerdict)) { + errors.push(`${filePath}.risk_verdict or ${filePath}.gate_verdict must be an object`) return errors } - if (!gateStatuses.has(String(gateVerdict.status))) { - errors.push(`${filePath}.gate_verdict.status has invalid value: ${gateVerdict.status}`) + const verdictObjectName = artifact.risk_verdict ? 'risk_verdict' : 'gate_verdict' + if (!gateStatuses.has(String(riskVerdict.status))) { + errors.push(`${filePath}.${verdictObjectName}.status has invalid value: ${riskVerdict.status}`) + } + requireNumber(errors, `${filePath}.${verdictObjectName}`, 'hard_fail_count', riskVerdict.hard_fail_count) + requireNumber(errors, `${filePath}.${verdictObjectName}`, 'soft_warning_count', riskVerdict.soft_warning_count) + requireNumber(errors, `${filePath}.${verdictObjectName}`, 'missing_score_count', riskVerdict.missing_score_count) + requireNumber(errors, `${filePath}.${verdictObjectName}`, 'inconclusive_count', riskVerdict.inconclusive_count) + requireNumber(errors, `${filePath}.${verdictObjectName}`, 'candidate_count', riskVerdict.candidate_count) + if (artifact.risk_verdict !== undefined) { + requireString(errors, `${filePath}.risk_verdict`, 'scope', riskVerdict.scope) + if (riskVerdict.is_final_experiment_judgment !== false) { + errors.push(`${filePath}.risk_verdict.is_final_experiment_judgment must be false`) + } + } + if (artifact.scorecard_summary !== undefined) { + requireArray(errors, filePath, 'scorecard_summary', artifact.scorecard_summary) + } + if (artifact.exploration_signals !== undefined) { + requireArray(errors, filePath, 'exploration_signals', artifact.exploration_signals) } - requireNumber(errors, `${filePath}.gate_verdict`, 
'hard_fail_count', gateVerdict.hard_fail_count) - requireNumber(errors, `${filePath}.gate_verdict`, 'soft_warning_count', gateVerdict.soft_warning_count) - requireNumber(errors, `${filePath}.gate_verdict`, 'missing_score_count', gateVerdict.missing_score_count) - requireNumber(errors, `${filePath}.gate_verdict`, 'inconclusive_count', gateVerdict.inconclusive_count) - requireNumber(errors, `${filePath}.gate_verdict`, 'candidate_count', gateVerdict.candidate_count) + requireOptionalString( + errors, + filePath, + 'recommended_review_mode', + artifact.recommended_review_mode, + ) + requireOptionalString(errors, filePath, 'verdict_boundary', artifact.verdict_boundary) return errors } diff --git a/scripts/evals/v2_verify_bind_runner.ts b/scripts/evals/v2_verify_bind_runner.ts index 84a13a50c9..b77927c692 100644 --- a/scripts/evals/v2_verify_bind_runner.ts +++ b/scripts/evals/v2_verify_bind_runner.ts @@ -158,11 +158,32 @@ function assertExperimentArtifactSchema(summary: JsonRecord): string[] { for (const field of ['run_refs', 'score_refs', 'report_refs', 'errors', 'warnings']) { if (!Array.isArray(summary[field])) errors.push(`${field} must be an array`) } + const riskVerdict = summary.risk_verdict as JsonRecord | undefined + if (!riskVerdict || typeof riskVerdict !== 'object') { + errors.push('risk_verdict must be an object') + } else { + if (!['pass', 'warning', 'fail', 'inconclusive'].includes(String(riskVerdict.status))) { + errors.push('risk_verdict.status has invalid value') + } + if (riskVerdict.scope !== 'regression_risk_only') { + errors.push('risk_verdict.scope must be regression_risk_only') + } + if (riskVerdict.is_final_experiment_judgment !== false) { + errors.push('risk_verdict.is_final_experiment_judgment must be false') + } + } const gateVerdict = summary.gate_verdict as JsonRecord | undefined if (!gateVerdict || typeof gateVerdict !== 'object') { - errors.push('gate_verdict must be an object') - } else if (!['pass', 'warning', 'fail', 
'inconclusive'].includes(String(gateVerdict.status))) { - errors.push('gate_verdict.status has invalid value') + errors.push('gate_verdict compatibility alias must be an object') + } + for (const field of ['scorecard_summary', 'exploration_signals']) { + if (!Array.isArray(summary[field])) errors.push(`${field} must be an array`) + } + if (typeof summary.recommended_review_mode !== 'string') { + errors.push('recommended_review_mode must be a string') + } + if (summary.final_decision !== null) { + errors.push('final_decision must be null until a human decision is recorded') } return errors } diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index e57d4a3399..af0b6b1bef 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -38,7 +38,7 @@ Recommended V2.1 usage order: - `gate_policy_id` 5. Validate all manifests. 6. Run the experiment runner. -7. Read the generated run, score, comparison, gate, and experiment summary artifacts. +7. Read the generated run, score, comparison, risk gate, scorecard, exploration, and experiment summary artifacts. Recommended V2.1 `action_bindings` shape: @@ -83,7 +83,9 @@ Run the current sample V2.1 experiment: bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default ``` -Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. Instead, it binds existing V1 `user_action_id` traces into V2 runs, records score-spec-backed scores, compares baseline vs candidate, applies the configured gate policy, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. +Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. 
Instead, it binds existing V1 `user_action_id` traces into V2 runs, records score-spec-backed scores, compares baseline vs candidate, applies the configured gate policy as a regression-risk check, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. + +The top-level `risk_verdict` is not a final experiment judgment. It is only a regression-risk signal. New summaries also include `scorecard_summary`, `exploration_signals`, `recommended_review_mode`, and `final_decision` so exploratory harness work is not reduced to pass/fail. Detailed V2.1 usage: diff --git a/tests/evals/v2/V2.1-bind_existing-usage.md b/tests/evals/v2/V2.1-bind_existing-usage.md index d4b334aae8..be71f61ccf 100644 --- a/tests/evals/v2/V2.1-bind_existing-usage.md +++ b/tests/evals/v2/V2.1-bind_existing-usage.md @@ -118,11 +118,17 @@ bun run scripts/evals/v2_run_experiment.ts --experiment my_candidate_vs_default - `run_refs` - `score_refs` - `report_refs` -- `gate_verdict` +- `risk_verdict` +- `scorecard_summary` +- `exploration_signals` +- `recommended_review_mode` +- `final_decision` - `errors` - `warnings` -## 6. 解释 Gate Verdict +旧字段 `gate_verdict` 暂时保留为兼容别名;新的使用流程优先看 `risk_verdict`。 + +## 6. 解释 Risk Verdict | status | meaning | | --- | --- | @@ -131,6 +137,8 @@ bun run scripts/evals/v2_run_experiment.ts --experiment my_candidate_vs_default | `fail` | 至少一个 candidate 触发 hard fail。 | | `inconclusive` | 没有 hard fail,但存在 missing score 或无法判断。 | +`risk_verdict` 不是最终实验判断。它只说明 candidate 是否触发当前 gate policy 已知的回归风险。 + 不要只看成本下降。至少同时看: - `task_success.main_chain_observed` @@ -139,6 +147,13 @@ bun run scripts/evals/v2_run_experiment.ts --experiment my_candidate_vs_default - `stability.recovery_absence` - `controllability.turn_limit_basic` +再结合: + +- `scorecard_summary`:多指标变化。 +- `exploration_signals`:是否出现值得人工复盘的探索信号。 +- `recommended_review_mode`:建议按回归、人工、探索哪种方式阅读。 +- `final_decision`:默认是 `null`,表示最终结论应由人类填写或另行记录。 + ## 7. 
运行回归验证 ```powershell diff --git a/tests/evals/v2/experiment-runs/README.md b/tests/evals/v2/experiment-runs/README.md index d4560c8229..5b060e12c0 100644 --- a/tests/evals/v2/experiment-runs/README.md +++ b/tests/evals/v2/experiment-runs/README.md @@ -13,8 +13,9 @@ - 这次实验来自哪个 manifest。 - 使用的是哪个 mode。 - 生成了哪些 run / score / report artifact。 -- gate 最终是 pass、warning、fail 还是 inconclusive。 +- risk gate 最终是 pass、warning、fail 还是 inconclusive。 - 是否存在错误或警告。 +- 是否存在 scorecard 变化、探索信号、推荐复盘模式。 ## 设计思路 @@ -31,20 +32,29 @@ | `run_refs` | string[] | 本次生成的 V2 run JSON 路径。 | | `score_refs` | string[] | 本次生成的 score JSON 路径。 | | `report_refs` | string[] | 本次生成的 compare / experiment Markdown report 路径。 | -| `gate_verdict` | object | 聚合后的 gate 结论。 | +| `risk_verdict` | object | 聚合后的回归风险结论;不是最终实验判断。 | +| `gate_verdict` | object | 兼容旧脚本的别名;新代码应优先读 `risk_verdict`。 | +| `verdict_boundary` | string | 明确说明 verdict 只代表 regression risk。 | +| `scorecard_summary` | array | baseline vs candidate 的多指标变化摘要。 | +| `exploration_signals` | string[] | 自动提取的探索复盘提示。 | +| `recommended_review_mode` | string | 建议复盘模式:`regression_review` / `manual_review` / `exploratory_review`。 | +| `final_decision` | null or object | 人类最终决策;runner 默认保持 `null`。 | | `errors` | string[] | hard fail 或 runner 级错误摘要。成功但 gate hard fail 时也可非空。 | | `warnings` | string[] | soft warning、missing score、inconclusive 等非阻塞问题。 | -## Gate Verdict Shape +## Risk Verdict Shape ```json { "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, "hard_fail_count": 0, "soft_warning_count": 0, "missing_score_count": 0, "inconclusive_count": 0, - "candidate_count": 1 + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result..." } ``` @@ -55,6 +65,31 @@ 3. 任意 soft warning => `warning` 4. 其他情况 => `pass` +## Verdict Boundary + +`risk_verdict` 只回答: + +```text +这个 candidate 有没有触发当前 gate policy 已知的回归风险? +``` + +它不回答: + +```text +这个 harness 是否更聪明? +这个 candidate 是否有探索价值? +这个改动是否应被长期保留? 
+``` + +因此新的 summary 会同时输出: + +- `scorecard_summary` +- `exploration_signals` +- `recommended_review_mode` +- `final_decision` + +最终判断应由人类结合这些材料完成。 + ## Backward Compatibility V2.1 仍保留以下字段: @@ -63,5 +98,6 @@ V2.1 仍保留以下字段: - `runner` - `results` - `created_at` +- `gate_verdict` -这些字段可以用于人工阅读,但新脚本应优先依赖顶层稳定 schema。 +这些字段可以用于人工阅读或兼容历史脚本,但新脚本应优先依赖 `risk_verdict` 和其他顶层稳定 schema。 diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json deleted file mode 100644 index 66f9171093..0000000000 --- a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-27T105524752Z.json +++ /dev/null @@ -1,126 +0,0 @@ -{ - "experiment_id": "session_memory_sparse_vs_default", - "manifest_ref": "tests/evals/v2/experiments/session_memory_sparse_vs_default.json", - "generated_at": "2026-04-27T10:55:24.753Z", - "mode": "bind_existing", - "run_refs": [ - "tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json", - "tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json" - ], - "score_refs": [ - "tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json", - "tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json" - ], - "report_refs": [ - "ObservrityTask/10-系统版本/v2/06-运行报告/compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", - "ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_sparse_vs_default_2026-04-27T105524752Z.md" - ], - "gate_verdict": { - "status": "pass", - "hard_fail_count": 0, - "soft_warning_count": 0, - "missing_score_count": 0, - "inconclusive_count": 0, - "candidate_count": 1 - }, - "errors": 
[], - "warnings": [], - "experiment": { - "experiment_id": "session_memory_sparse_vs_default", - "name": "Session Memory Sparse vs Default", - "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", - "baseline_variant_id": "baseline_default", - "candidate_variant_ids": [ - "candidate_session_memory_sparse" - ], - "scenario_set_id": "v2_first_batch", - "scenario_ids": [ - "cost_sensitive_task" - ], - "repeat_count": 1, - "score_spec_ids": [ - "task_success.main_chain_observed", - "efficiency.total_billed_tokens", - "decision_quality.subagent_count_observed", - "stability.recovery_absence", - "controllability.turn_limit_basic" - ], - "gate_policy_id": "default_v2_1_gate", - "mode": "bind_existing", - "action_bindings": [ - { - "scenario_id": "cost_sensitive_task", - "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "candidate_user_action_ids": { - "candidate_session_memory_sparse": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" - } - } - ], - "status": "ready" - }, - "results": [ - { - "scenario_id": "cost_sensitive_task", - "repeat_index": 1, - "baseline_run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "candidates": [ - { - "candidate_variant_id": "candidate_session_memory_sparse", - "candidate_run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "candidate_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", - "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", - "gate_results": [ - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "hard_fail", - "score_spec_id": "task_success.main_chain_observed", - "passed": true, - 
"baseline_value": 1, - "candidate_value": 1, - "regression_pct": 0, - "condition": "candidate < baseline", - "notes": "Candidate cannot lose the main-chain success signal." - }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "hard_fail", - "score_spec_id": "efficiency.total_billed_tokens", - "passed": true, - "baseline_value": 400399, - "candidate_value": 352691, - "regression_pct": 0, - "condition": "candidate_regression_pct > 30 and task_success_not_improved", - "notes": "Cost cannot rise sharply without a success improvement." - }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "soft_warning", - "score_spec_id": "efficiency.total_billed_tokens", - "passed": true, - "baseline_value": 400399, - "candidate_value": 352691, - "regression_pct": 0, - "condition": "candidate_regression_pct > 10" - }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "soft_warning", - "score_spec_id": "decision_quality.subagent_count_observed", - "passed": true, - "baseline_value": 4, - "candidate_value": 2, - "regression_pct": 0, - "condition": "candidate_regression_pct > 50" - } - ] - } - ] - } - ], - "created_at": "2026-04-27T10:55:24.753Z" -} diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json deleted file mode 100644 index aaeb095f38..0000000000 --- a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-28T162912802Z.json +++ /dev/null @@ -1,140 +0,0 @@ -{ - "experiment_id": "session_memory_sparse_vs_default", - "manifest_ref": "tests/evals/v2/experiments/session_memory_sparse_vs_default.json", - "generated_at": "2026-04-28T16:29:12.803Z", - "mode": "bind_existing", - "run_refs": [ - 
"tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json", - "tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json" - ], - "score_refs": [ - "tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json", - "tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json" - ], - "report_refs": [ - "ObservrityTask/10-系统版本/v2/06-运行报告/compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", - "ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_sparse_vs_default_2026-04-28T162912802Z.md" - ], - "gate_verdict": { - "status": "pass", - "hard_fail_count": 0, - "soft_warning_count": 0, - "missing_score_count": 0, - "inconclusive_count": 0, - "candidate_count": 1 - }, - "errors": [], - "warnings": [], - "experiment": { - "experiment_id": "session_memory_sparse_vs_default", - "name": "Session Memory Sparse vs Default", - "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", - "baseline_variant_id": "baseline_default", - "candidate_variant_ids": [ - "candidate_session_memory_sparse" - ], - "scenario_set_id": "v2_first_batch", - "scenario_ids": [ - "cost_sensitive_task" - ], - "repeat_count": 1, - "score_spec_ids": [ - "task_success.main_chain_observed", - "efficiency.total_billed_tokens", - "decision_quality.subagent_count_observed", - "stability.recovery_absence", - "controllability.turn_limit_basic" - ], - "gate_policy_id": "default_v2_1_gate", - "mode": "bind_existing", - "action_bindings": [ - { - "scenario_id": "cost_sensitive_task", - "variant_id": "baseline_default", - "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976" - }, - { - "scenario_id": "cost_sensitive_task", - "variant_id": 
"candidate_session_memory_sparse", - "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" - } - ], - "status": "ready" - }, - "runner": { - "mode": "bind_existing", - "score_spec_ids": [ - "task_success.main_chain_observed", - "efficiency.total_billed_tokens", - "decision_quality.subagent_count_observed", - "stability.recovery_absence", - "controllability.turn_limit_basic" - ], - "gate_policy_id": "default_v2_1_gate" - }, - "results": [ - { - "scenario_id": "cost_sensitive_task", - "repeat_index": 1, - "baseline_run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "candidates": [ - { - "candidate_variant_id": "candidate_session_memory_sparse", - "candidate_run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "candidate_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", - "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", - "gate_results": [ - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "hard_fail", - "score_spec_id": "task_success.main_chain_observed", - "passed": true, - "baseline_value": 1, - "candidate_value": 1, - "regression_pct": 0, - "condition": "candidate < baseline", - "notes": "Candidate cannot lose the main-chain success signal." 
- }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "hard_fail", - "score_spec_id": "efficiency.total_billed_tokens", - "passed": true, - "baseline_value": 400399, - "candidate_value": 352691, - "regression_pct": 0, - "condition": "candidate_regression_pct > 30 and task_success_not_improved", - "notes": "Cost cannot rise sharply without a success improvement." - }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "soft_warning", - "score_spec_id": "efficiency.total_billed_tokens", - "passed": true, - "baseline_value": 400399, - "candidate_value": 352691, - "regression_pct": 0, - "condition": "candidate_regression_pct > 10" - }, - { - "scenario_id": "cost_sensitive_task", - "candidate_variant_id": "candidate_session_memory_sparse", - "rule_type": "soft_warning", - "score_spec_id": "decision_quality.subagent_count_observed", - "passed": true, - "baseline_value": 4, - "candidate_value": 2, - "regression_pct": 0, - "condition": "candidate_regression_pct > 50" - } - ] - } - ] - } - ], - "created_at": "2026-04-28T16:29:12.803Z" -} diff --git a/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-30T021206270Z.json b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-30T021206270Z.json new file mode 100644 index 0000000000..d50f95e22c --- /dev/null +++ b/tests/evals/v2/experiment-runs/session_memory_sparse_vs_default_2026-04-30T021206270Z.json @@ -0,0 +1,272 @@ +{ + "experiment_id": "session_memory_sparse_vs_default", + "manifest_ref": "tests\\evals\\v2\\experiments\\session_memory_sparse_vs_default.json", + "generated_at": "2026-04-30T02:12:06.272Z", + "mode": "bind_existing", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.json", + 
"tests\\evals\\v2\\runs\\run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json", + "tests\\evals\\v2\\scores\\run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_session_memory_sparse_vs_default_2026-04-30T021206270Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 4, + "candidate_value": 2, + "delta": -2, + "interpretation": "improved" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 400399, + "candidate_value": 352691, + "delta": -47708, + "interpretation": "improved" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "session_memory_sparse_vs_default", + "name": "Session Memory Sparse vs Default", + "goal": "Evaluate whether sparse session memory reduces cost without hurting task success.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_first_batch", + "scenario_ids": [ + "cost_sensitive_task" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "action_bindings": [ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390" + } + ], + "status": "ready" + }, + "runner": { + "mode": "bind_existing", + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "cost_sensitive_task", + "repeat_index": 1, + "baseline_run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "baseline_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "candidate_user_action_id": 
"dbf9fae1-0a5a-4f50-aba7-02047ced9390", + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_vs_run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.md", + "gate_results": [ + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 400399, + "candidate_value": 352691, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 4, + "candidate_value": 2, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 4, + "candidate_value": 2, + "delta": -2, + "interpretation": "improved" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 400399, + "candidate_value": 352691, + "delta": -47708, + "interpretation": "improved" + }, + { + "scenario_id": "cost_sensitive_task", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "cost_sensitive_task", + 
"candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "created_at": "2026-04-30T02:12:06.272Z" +} diff --git a/tests/evals/v2/gates/README.md b/tests/evals/v2/gates/README.md index 18b4245968..aaa40291f1 100644 --- a/tests/evals/v2/gates/README.md +++ b/tests/evals/v2/gates/README.md @@ -1,30 +1,49 @@ -# V2.1 Gate Semantics +# V2.1 Risk Gate Semantics ## 理解清单 - gate 不是 scorer;gate 只解释 baseline 和 candidate 的 score 差异。 - gate policy 定义 hard fail 和 soft warning。 -- runner 负责把每个 candidate 的 gate result 汇总成 experiment-level verdict。 +- runner 负责把每个 candidate 的 gate result 汇总成 experiment-level `risk_verdict`。 +- `risk_verdict` 不是最终实验结论,只是回归风险门禁。 ## 预期效果 -读 `gate_verdict.status` 时,应能得到稳定含义: +读 `risk_verdict.status` 时,应能得到稳定含义: - `pass`:没有 hard fail、soft warning、missing score、inconclusive。 - `warning`:没有 hard fail,但至少有 soft warning。 - `fail`:至少有一个 hard fail。 - `inconclusive`:没有 hard fail,但存在 missing score 或无法判断的规则。 +旧字段 `gate_verdict` 暂时保留为兼容别名,新的脚本和文档应优先使用 `risk_verdict`。 + ## 设计思路 V2.1 的 gate 语义要保守。缺失 score 不应被当作 pass;无法判断时应暴露为 `inconclusive`。 +更重要的是,gate 只能回答: + +```text +这个 candidate 有没有触发已知回归风险? +``` + +它不能回答: + +```text +这个 harness 是否更聪明? +这个改动是否有探索价值? +这个 candidate 是否应该被长期保留? 
+``` + +最终判断必须结合 scorecard、exploration signals、人工复盘和后续实验。 + ## Rule Types | rule type | meaning | effect | | --- | --- | --- | -| `hard_fail` | 不可接受的退化 | 任意触发时,experiment verdict 为 `fail`。 | -| `soft_warning` | 需要人工注意的退化 | 没有 hard fail 时,experiment verdict 为 `warning`。 | +| `hard_fail` | 不可接受的退化 | 任意触发时,experiment `risk_verdict.status` 为 `fail`。 | +| `soft_warning` | 需要人工注意的退化 | 没有 hard fail、missing score、inconclusive 时,experiment `risk_verdict.status` 为 `warning`。 | ## Missing Score @@ -32,7 +51,7 @@ V2.1 的 gate 语义要保守。缺失 score 不应被当作 pass;无法判断 - 该 rule 的 verdict 是 `missing`。 - experiment `missing_score_count` 加 1。 -- 如果没有 hard fail,则 experiment status 为 `inconclusive`。 +- 如果没有 hard fail,则 experiment `risk_verdict.status` 为 `inconclusive`。 ## Inconclusive @@ -40,16 +59,29 @@ V2.1 的 gate 语义要保守。缺失 score 不应被当作 pass;无法判断 - 该 rule 的 verdict 是 `inconclusive`。 - experiment `inconclusive_count` 加 1。 -- 如果没有 hard fail,则 experiment status 为 `inconclusive`。 +- 如果没有 hard fail,则 experiment `risk_verdict.status` 为 `inconclusive`。 ## Multi-Candidate Summary 多 candidate 时,runner 按所有 candidate 的 gate results 汇总: -- 任一 candidate hard fail => 总 verdict `fail`。 -- 无 hard fail,但任一 candidate missing/inconclusive => 总 verdict `inconclusive`。 -- 无 hard fail/missing/inconclusive,但任一 candidate soft warning => 总 verdict `warning`。 -- 所有 candidate 都 pass => 总 verdict `pass`。 +- 任一 candidate hard fail => 总 `risk_verdict.status = fail`。 +- 无 hard fail,但任一 candidate missing/inconclusive => 总 `risk_verdict.status = inconclusive`。 +- 无 hard fail/missing/inconclusive,但任一 candidate soft warning => 总 `risk_verdict.status = warning`。 +- 所有 candidate 都 pass => 总 `risk_verdict.status = pass`。 + +## Final Decision Boundary + +`risk_verdict` 的输出对象固定包含: + +```json +{ + "scope": "regression_risk_only", + "is_final_experiment_judgment": false +} +``` + +这表示它只能作为风险提示,不应替代人的实验判断。一个 candidate 可以在 `risk_verdict` 上是 `warning`,但仍然因为探索价值而进入下一轮人工复盘。 ## Current Supported Conditions diff --git 
a/tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json b/tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json deleted file mode 100644 index fdf2429a94..0000000000 --- a/tests/evals/v2/runs/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.json +++ /dev/null @@ -1,166 +0,0 @@ -{ - "run": { - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "scenario_id": "cost_sensitive_task", - "variant_id": "baseline_default", - "started_at": "2026-04-24T04:48:30.824Z", - "ended_at": "2026-04-24T04:49:59.031Z", - "status": "completed", - "entry_user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "root_query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", - "observability_db_ref": ".observability\\observability_v1.duckdb", - "notes": "Generated by scripts/evals/v2_record_run.ts" - }, - "scenario": { - "scenario_id": "cost_sensitive_task", - "name": "Cost Sensitive Task", - "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", - "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", - "tags": [ - "efficiency", - "tradeoff", - "observability-v2" - ], - "expected_artifacts": [], - "expected_tools": [ - "Read" - ], - "expected_skills": [], - "expected_constraints": [ - "Must not modify files", - "Should avoid unnecessary background subagent expansion", - "Should keep the main query within a small number of turns" - ], - "max_turn_count": 8, - "max_total_billed_tokens": 260000, - "max_subagent_count": 3, - "owner": "local", - "status": "ready" - }, - "variant": { - "variant_id": "baseline_default", - "name": "Baseline Default", - "description": "Current default harness baseline used for comparison.", - "change_layer": "mixed", - "git_commit": "HEAD", - "config_snapshot_ref": "path/to/baseline-config.json", - "notes": "Use this as the 
default baseline unless a scenario explicitly requires another baseline." - }, - "evidence": { - "action": { - "event_date": "2026-04-24", - "user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "started_at": "2026-04-24T04:48:30.824Z", - "started_at_ms": 1777006110824, - "ended_at": "2026-04-24T04:49:59.031Z", - "ended_at_ms": 1777006199031, - "duration_ms": 88207, - "event_count": 438, - "query_count": 5, - "main_thread_query_count": 1, - "subagent_query_count": 5, - "subagent_count": 4, - "tool_call_count": 22, - "raw_input_tokens": "9", - "output_tokens": "2987", - "cache_read_tokens": "187198", - "cache_create_tokens": "210205", - "total_prompt_input_tokens": "397412", - "total_billed_tokens": "400399", - "main_thread_total_prompt_input_tokens": "158157", - "subagent_total_prompt_input_tokens": "239255" - }, - "rootQuery": { - "query_id": "15ecf197-b1c6-47e2-8d94-df1f88f0d822", - "user_action_id": "1d5eb5e1-2fe0-42fa-9450-7b05d6367976", - "session_id": "eca68c72-ad03-4e56-a18f-f50000e8c0c7", - "conversation_id": "eca68c72-ad03-4e56-a18f-f50000e8c0c7", - "query_source": "repl_main_thread", - "subagent_id": null, - "subagent_type": null, - "subagent_reason": "repl_main_thread", - "subagent_trigger_kind": null, - "subagent_trigger_detail": null, - "subagent_trigger_payload_json": null, - "agent_name": "main_thread", - "source_group": "main_thread", - "started_at": "2026-04-24T04:48:30.824Z", - "started_at_ms": 1777006110824, - "ended_at": "2026-04-24T04:49:06.168Z", - "ended_at_ms": 1777006146168, - "duration_ms": 35344, - "first_event": "state.initialized", - "last_event": "query.terminated", - "terminal_reason": "completed", - "stop_reason": "end_turn", - "turn_count": 4, - "query_max_loop_iter": 4, - "query_avg_loop_iter": 2.5, - "tool_call_count": 7, - "event_count": 122, - "raw_query_started_count": 1, - "raw_query_terminated_count": 1, - "inferred_query_started_count": 1, - "inferred_query_terminated_count": 1, - "strict_is_complete": "true", - 
"inferred_is_complete": "true" - }, - "tools": [ - { - "tool_name": "Edit", - "tool_count": 11, - "closed_count": "11", - "failed_count": "0" - }, - { - "tool_name": "Read", - "tool_count": 5, - "closed_count": "5", - "failed_count": "0" - }, - { - "tool_name": "Glob", - "tool_count": 3, - "closed_count": "3", - "failed_count": "0" - }, - { - "tool_name": "Write", - "tool_count": 3, - "closed_count": "3", - "failed_count": "0" - } - ], - "subagents": [ - { - "subagent_reason": "session_memory", - "subagent_trigger_kind": "post_sampling_hook", - "subagent_trigger_detail": "token_threshold_and_tool_threshold", - "subagent_count": 1, - "avg_duration_ms": 33043 - }, - { - "subagent_reason": "prompt_suggestion", - "subagent_trigger_kind": "stop_hook_background", - "subagent_trigger_detail": "suggestion_generation_allowed", - "subagent_count": 1, - "avg_duration_ms": 8029 - }, - { - "subagent_reason": "extract_memories", - "subagent_trigger_kind": "stop_hook_background", - "subagent_trigger_detail": "post_turn_background_extraction", - "subagent_count": 1, - "avg_duration_ms": 29954 - }, - { - "subagent_reason": "session_memory", - "subagent_trigger_kind": "post_sampling_hook", - "subagent_trigger_detail": "token_threshold_and_natural_break", - "subagent_count": 1, - "avg_duration_ms": 40480 - } - ], - "recoveries": [] - } -} diff --git a/tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json b/tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json deleted file mode 100644 index 693d3d8faa..0000000000 --- a/tests/evals/v2/runs/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json +++ /dev/null @@ -1,147 +0,0 @@ -{ - "run": { - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "scenario_id": "cost_sensitive_task", - "variant_id": "candidate_session_memory_sparse", - 
"started_at": "2026-04-24T04:55:36.952Z", - "ended_at": "2026-04-24T04:56:23.033Z", - "status": "completed", - "entry_user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", - "root_query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", - "observability_db_ref": ".observability\\observability_v1.duckdb", - "notes": "Generated by scripts/evals/v2_record_run.ts" - }, - "scenario": { - "scenario_id": "cost_sensitive_task", - "name": "Cost Sensitive Task", - "description": "Evaluate whether the agent can inspect V2 observability status with controlled token cost and limited background branching.", - "input_prompt": "请阅读当前项目中 V2 可观测系统相关文件,简单总结目前 V2 已实现了哪些能力,不要修改文件。", - "tags": [ - "efficiency", - "tradeoff", - "observability-v2" - ], - "expected_artifacts": [], - "expected_tools": [ - "Read" - ], - "expected_skills": [], - "expected_constraints": [ - "Must not modify files", - "Should avoid unnecessary background subagent expansion", - "Should keep the main query within a small number of turns" - ], - "max_turn_count": 8, - "max_total_billed_tokens": 260000, - "max_subagent_count": 3, - "owner": "local", - "status": "ready" - }, - "variant": { - "variant_id": "candidate_session_memory_sparse", - "name": "Candidate Session Memory Sparse", - "description": "Increase the default session memory tool-call threshold from 3 to 6 to reduce background memory subagent cost.", - "change_layer": "harness", - "base_variant_id": "baseline_default", - "git_commit": "HEAD", - "config_snapshot_ref": "src/services/SessionMemory/sessionMemoryUtils.ts", - "notes": "Token-saving harness candidate. Keeps natural-break trigger intact while reducing tool-threshold-triggered updates." 
- }, - "evidence": { - "action": { - "event_date": "2026-04-24", - "user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", - "started_at": "2026-04-24T04:55:36.952Z", - "started_at_ms": 1777006536952, - "ended_at": "2026-04-24T04:56:23.033Z", - "ended_at_ms": 1777006583033, - "duration_ms": 46081, - "event_count": 286, - "query_count": 3, - "main_thread_query_count": 1, - "subagent_query_count": 3, - "subagent_count": 2, - "tool_call_count": 15, - "raw_input_tokens": "8", - "output_tokens": "4157", - "cache_read_tokens": "160020", - "cache_create_tokens": "188506", - "total_prompt_input_tokens": "348534", - "total_billed_tokens": "352691", - "main_thread_total_prompt_input_tokens": "158909", - "subagent_total_prompt_input_tokens": "189625" - }, - "rootQuery": { - "query_id": "f15ca52c-e702-448a-9cd8-8d5c942ff4e2", - "user_action_id": "dbf9fae1-0a5a-4f50-aba7-02047ced9390", - "session_id": "e34e7a32-552b-4608-af59-8b48025e0ea0", - "conversation_id": "e34e7a32-552b-4608-af59-8b48025e0ea0", - "query_source": "repl_main_thread", - "subagent_id": null, - "subagent_type": null, - "subagent_reason": "repl_main_thread", - "subagent_trigger_kind": null, - "subagent_trigger_detail": null, - "subagent_trigger_payload_json": null, - "agent_name": "main_thread", - "source_group": "main_thread", - "started_at": "2026-04-24T04:55:36.952Z", - "started_at_ms": 1777006536952, - "ended_at": "2026-04-24T04:56:02.640Z", - "ended_at_ms": 1777006562640, - "duration_ms": 25688, - "first_event": "state.initialized", - "last_event": "query.terminated", - "terminal_reason": "completed", - "stop_reason": "end_turn", - "turn_count": 4, - "query_max_loop_iter": 4, - "query_avg_loop_iter": 2.5, - "tool_call_count": 7, - "event_count": 122, - "raw_query_started_count": 1, - "raw_query_terminated_count": 1, - "inferred_query_started_count": 1, - "inferred_query_terminated_count": 1, - "strict_is_complete": "true", - "inferred_is_complete": "true" - }, - "tools": [ - { - "tool_name": "Read", - 
"tool_count": 8, - "closed_count": "8", - "failed_count": "0" - }, - { - "tool_name": "Edit", - "tool_count": 5, - "closed_count": "5", - "failed_count": "0" - }, - { - "tool_name": "Glob", - "tool_count": 2, - "closed_count": "2", - "failed_count": "0" - } - ], - "subagents": [ - { - "subagent_reason": "extract_memories", - "subagent_trigger_kind": "stop_hook_background", - "subagent_trigger_detail": "post_turn_background_extraction", - "subagent_count": 1, - "avg_duration_ms": 18519 - }, - { - "subagent_reason": "session_memory", - "subagent_trigger_kind": "post_sampling_hook", - "subagent_trigger_detail": "token_threshold_and_tool_threshold", - "subagent_count": 1, - "avg_duration_ms": 29679 - } - ], - "recoveries": [] - } -} diff --git a/tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json b/tests/evals/v2/runs/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.json similarity index 98% rename from tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json rename to tests/evals/v2/runs/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.json index 5b1f57c076..8f48d09a70 100644 --- a/tests/evals/v2/runs/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.json +++ b/tests/evals/v2/runs/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.json @@ -1,6 +1,6 @@ { "run": { - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", "scenario_id": "cost_sensitive_task", "variant_id": "baseline_default", "started_at": "2026-04-24T04:48:30.824Z", @@ -135,13 +135,13 @@ "failed_count": "0" }, { - "tool_name": "Glob", + "tool_name": "Write", "tool_count": 3, "closed_count": "3", "failed_count": "0" }, { - "tool_name": "Write", + "tool_name": "Glob", "tool_count": 3, "closed_count": "3", "failed_count": 
"0" @@ -155,13 +155,6 @@ "subagent_count": 1, "avg_duration_ms": 8029 }, - { - "subagent_reason": "session_memory", - "subagent_trigger_kind": "post_sampling_hook", - "subagent_trigger_detail": "token_threshold_and_tool_threshold", - "subagent_count": 1, - "avg_duration_ms": 33043 - }, { "subagent_reason": "extract_memories", "subagent_trigger_kind": "stop_hook_background", @@ -175,6 +168,13 @@ "subagent_trigger_detail": "token_threshold_and_natural_break", "subagent_count": 1, "avg_duration_ms": 40480 + }, + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 1, + "avg_duration_ms": 33043 } ], "recoveries": [] diff --git a/tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json b/tests/evals/v2/runs/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json similarity index 98% rename from tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json rename to tests/evals/v2/runs/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json index 6826640f1c..a544f4879a 100644 --- a/tests/evals/v2/runs/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json +++ b/tests/evals/v2/runs/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.json @@ -1,6 +1,6 @@ { "run": { - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "scenario_id": "cost_sensitive_task", "variant_id": "candidate_session_memory_sparse", "started_at": "2026-04-24T04:55:36.952Z", @@ -143,19 +143,19 @@ } ], "subagents": [ - { - "subagent_reason": "extract_memories", - "subagent_trigger_kind": 
"stop_hook_background", - "subagent_trigger_detail": "post_turn_background_extraction", - "subagent_count": 1, - "avg_duration_ms": 18519 - }, { "subagent_reason": "session_memory", "subagent_trigger_kind": "post_sampling_hook", "subagent_trigger_detail": "token_threshold_and_tool_threshold", "subagent_count": 1, "avg_duration_ms": 29679 + }, + { + "subagent_reason": "extract_memories", + "subagent_trigger_kind": "stop_hook_background", + "subagent_trigger_detail": "post_turn_background_extraction", + "subagent_count": 1, + "avg_duration_ms": 18519 } ], "recoveries": [] diff --git a/tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json b/tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json deleted file mode 100644 index a18e4b1728..0000000000 --- a/tests/evals/v2/scores/run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json +++ /dev/null @@ -1,92 +0,0 @@ -[ - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "task_success", - "subdimension": "main_chain_observed", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "queries", - "reason": "Main-thread root query is present in V1 evidence." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_expected_tool_hit_rate", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "decision_quality", - "subdimension": "expected_tool_hit_rate", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "tools", - "reason": "Observed 4 tool names against 1 expected tools." 
- }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "efficiency", - "subdimension": "total_billed_tokens", - "score_value": 400399, - "score_label": "observed", - "evidence_ref": "user_actions.total_billed_tokens", - "reason": "Raw efficiency fact from V1 user_actions." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_token_budget", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "efficiency", - "subdimension": "total_billed_token_budget", - "score_value": 0, - "score_label": "fail", - "evidence_ref": "user_actions.total_billed_tokens", - "reason": "total_billed_tokens=400399; budget=260000." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_v1_closure_health", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "stability", - "subdimension": "v1_closure_health", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "metrics_integrity_daily", - "reason": "Average of query, turn, tool, and subagent closure rates for the action date." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "stability", - "subdimension": "recovery_absence", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "recoveries", - "reason": "No recovery events were observed for this action." 
- }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "controllability", - "subdimension": "turn_limit_basic", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "queries.turn_count", - "reason": "Root query turn_count=4; scenario limit is 8." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "decision_quality", - "subdimension": "subagent_count_observed", - "score_value": 4, - "score_label": "observed", - "evidence_ref": "subagents", - "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." - }, - { - "score_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_subagent_count_budget", - "run_id": "run_2026-04-27T105508448Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "controllability", - "subdimension": "subagent_count_budget", - "score_value": 0, - "score_label": "fail", - "evidence_ref": "subagents", - "reason": "subagent_count=4; budget=3." 
- } -] diff --git a/tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json b/tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json deleted file mode 100644 index da6285230f..0000000000 --- a/tests/evals/v2/scores/run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json +++ /dev/null @@ -1,92 +0,0 @@ -[ - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "task_success", - "subdimension": "main_chain_observed", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "queries", - "reason": "Main-thread root query is present in V1 evidence." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_expected_tool_hit_rate", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "decision_quality", - "subdimension": "expected_tool_hit_rate", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "tools", - "reason": "Observed 3 tool names against 1 expected tools." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "efficiency", - "subdimension": "total_billed_tokens", - "score_value": 352691, - "score_label": "observed", - "evidence_ref": "user_actions.total_billed_tokens", - "reason": "Raw efficiency fact from V1 user_actions." 
- }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_token_budget", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "efficiency", - "subdimension": "total_billed_token_budget", - "score_value": 0, - "score_label": "fail", - "evidence_ref": "user_actions.total_billed_tokens", - "reason": "total_billed_tokens=352691; budget=260000." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_v1_closure_health", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "stability", - "subdimension": "v1_closure_health", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "metrics_integrity_daily", - "reason": "Average of query, turn, tool, and subagent closure rates for the action date." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "stability", - "subdimension": "recovery_absence", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "recoveries", - "reason": "No recovery events were observed for this action." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "controllability", - "subdimension": "turn_limit_basic", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "queries.turn_count", - "reason": "Root query turn_count=4; scenario limit is 8." 
- }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "decision_quality", - "subdimension": "subagent_count_observed", - "score_value": 2, - "score_label": "observed", - "evidence_ref": "subagents", - "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." - }, - { - "score_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_subagent_count_budget", - "run_id": "run_2026-04-27T105524538Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "controllability", - "subdimension": "subagent_count_budget", - "score_value": 1, - "score_label": "pass", - "evidence_ref": "subagents", - "reason": "subagent_count=2; budget=3." - } -] diff --git a/tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json b/tests/evals/v2/scores/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json similarity index 72% rename from tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json rename to tests/evals/v2/scores/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json index 99ea06233c..68356d9d07 100644 --- a/tests/evals/v2/scores/run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json +++ b/tests/evals/v2/scores/run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1.scores.json @@ -1,7 +1,7 @@ [ { - "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": 
"run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_task_success_main_chain_observed", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "task_success", "subdimension": "main_chain_observed", "score_value": 1, @@ -10,8 +10,8 @@ "reason": "Main-thread root query is present in V1 evidence." }, { - "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "efficiency", "subdimension": "total_billed_tokens", "score_value": 400399, @@ -20,8 +20,18 @@ "reason": "Raw efficiency fact from V1 user_actions." }, { - "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_stability_recovery_absence", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "stability", "subdimension": "recovery_absence", "score_value": 1, @@ -30,23 +40,13 @@ "reason": "No recovery events were observed for this action." 
}, { - "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", + "score_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1_controllability_turn_limit_basic", + "run_id": "run_2026-04-30T021205319Z_cost_sensitive_task_baseline_default_1d5eb5e1", "dimension": "controllability", "subdimension": "turn_limit_basic", "score_value": 1, "score_label": "pass", "evidence_ref": "queries.turn_count", "reason": "Root query turn_count=4; scenario limit is 8." - }, - { - "score_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-28T162901612Z_cost_sensitive_task_baseline_default_1d5eb5e1", - "dimension": "decision_quality", - "subdimension": "subagent_count_observed", - "score_value": 4, - "score_label": "observed", - "evidence_ref": "subagents", - "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
} ] diff --git a/tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json b/tests/evals/v2/scores/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json similarity index 73% rename from tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json rename to tests/evals/v2/scores/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json index 87ca28e79b..b526c331a4 100644 --- a/tests/evals/v2/scores/run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json +++ b/tests/evals/v2/scores/run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1.scores.json @@ -1,7 +1,7 @@ [ { - "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_task_success_main_chain_observed", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "task_success", "subdimension": "main_chain_observed", "score_value": 1, @@ -10,8 +10,8 @@ "reason": "Main-thread root query is present in V1 evidence." 
}, { - "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_efficiency_total_billed_tokens", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "efficiency", "subdimension": "total_billed_tokens", "score_value": 352691, @@ -20,8 +20,18 @@ "reason": "Raw efficiency fact from V1 user_actions." }, { - "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_stability_recovery_absence", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "stability", "subdimension": "recovery_absence", "score_value": 1, @@ -30,23 +40,13 @@ "reason": "No recovery events were observed for this action." 
}, { - "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", + "score_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_controllability_turn_limit_basic", + "run_id": "run_2026-04-30T021206101Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", "dimension": "controllability", "subdimension": "turn_limit_basic", "score_value": 1, "score_label": "pass", "evidence_ref": "queries.turn_count", "reason": "Root query turn_count=4; scenario limit is 8." - }, - { - "score_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1_decision_quality_subagent_count_observed", - "run_id": "run_2026-04-28T162912577Z_cost_sensitive_task_candidate_session_memory_sparse_dbf9fae1", - "dimension": "decision_quality", - "subdimension": "subagent_count_observed", - "score_value": 2, - "score_label": "observed", - "evidence_ref": "subagents", - "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
} ] diff --git a/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json b/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-30T015859120Z.json similarity index 78% rename from tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json rename to tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-30T015859120Z.json index 2a8f9ac60a..18e1e08ac7 100644 --- a/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-29T072125437Z.json +++ b/tests/evals/v2/verification-reports/v2_1_bind_runner_2026-04-30T015859120Z.json @@ -1,7 +1,7 @@ { - "verification_id": "v2_1_bind_runner_2026-04-29T072125437Z", - "generated_at": "2026-04-29T07:22:28.161Z", - "temp_root": ".observability\\v2-runner-verification\\2026-04-29T072125437Z", + "verification_id": "v2_1_bind_runner_2026-04-30T015859120Z", + "generated_at": "2026-04-30T01:59:10.761Z", + "temp_root": ".observability\\v2-runner-verification\\2026-04-30T015859120Z", "passed": true, "case_count": 9, "failed_count": 0, @@ -12,10 +12,10 @@ "passed": true, "expected": "success", "status": 0, - "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.json", - "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.md", + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-30T015859120Z_2026-04-30T015902609Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-30T015859120Z_2026-04-30T015902609Z.md", "artifacts_cleaned": true, - "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.json\nCreated V2.1 experiment report: 
ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-29T072125437Z_2026-04-29T072218602Z.md" + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_single_candidate_2026-04-30T015859120Z_2026-04-30T015902609Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_single_candidate_2026-04-30T015859120Z_2026-04-30T015902609Z.md" }, { "case_id": "single_scenario_multi_candidate", @@ -23,10 +23,10 @@ "passed": true, "expected": "success", "status": 0, - "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.json", - "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.md", + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-30T015859120Z_2026-04-30T015905575Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-30T015859120Z_2026-04-30T015905575Z.md", "artifacts_cleaned": true, - "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-29T072125437Z_2026-04-29T072221989Z.md" + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_candidate_2026-04-30T015859120Z_2026-04-30T015905575Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_candidate_2026-04-30T015859120Z_2026-04-30T015905575Z.md" }, { "case_id": "multi_scenario_single_candidate", @@ -34,10 +34,10 @@ "passed": true, "expected": "success", "status": 0, - "summary_ref": 
"tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.json", - "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.md", + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-30T015859120Z_2026-04-30T015909308Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-30T015859120Z_2026-04-30T015909308Z.md", "artifacts_cleaned": true, - "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-29T072125437Z_2026-04-29T072226554Z.md" + "error_excerpt": "Created V2.1 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_1_verify_multi_scenario_2026-04-30T015859120Z_2026-04-30T015909308Z.json\nCreated V2.1 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_1_verify_multi_scenario_2026-04-30T015859120Z_2026-04-30T015909308Z.md" }, { "case_id": "missing_action_binding", @@ -61,7 +61,7 @@ "passed": true, "expected": "failure", "status": 1, - "error_excerpt": "Command failed: bun run scripts/evals/v2_record_run.ts --scenario cost_sensitive_task --variant baseline_default --user-action-id v2-verify-missing-root-action --db E:\\claude-code-transparent\\.observability\\v2-runner-verification\\2026-04-29T072125437Z\\missing-root.duckdb --score-spec-ids task_success.main_chain_observed,efficiency.total_billed_tokens,decision_quality.subagent_count_observed,stability.recovery_absence,controllability.turn_limit_basic\nFact-only binding failed: user_action_id=v2-ve" + "error_excerpt": "Command failed: bun run scripts/evals/v2_record_run.ts --scenario cost_sensitive_task --variant baseline_default --user-action-id 
v2-verify-missing-root-action --db E:\\claude-code-transparent\\.observability\\v2-runner-verification\\2026-04-30T015859120Z\\missing-root.duckdb --score-spec-ids task_success.main_chain_observed,efficiency.total_billed_tokens,decision_quality.subagent_count_observed,stability.recovery_absence,controllability.turn_limit_basic\nFact-only binding failed: user_action_id=v2-ve" }, { "case_id": "missing_score_spec_id", From 3abccf508ccead1f7ce8aa907d6de6fbcb8534d4 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sat, 2 May 2026 22:16:56 +0800 Subject: [PATCH 14/26] Add observability v2.2 execute_harness alpha --- ...ha\344\273\273\345\212\241\344\271\246.md" | 385 ++++++++++++++ ...ndidate_session_memory_sparse_0acb35d4.md" | 33 ++ ...te_harness_smoke_2026-05-02T132328195Z.md" | 61 +++ ...moke_minimal_baseline_default_1e3c516e.md" | 49 ++ ...ndidate_session_memory_sparse_0acb35d4.md" | 49 ++ scripts/evals/v2_emit_fixture_trace.ts | 131 +++++ scripts/evals/v2_harness_execution.ts | 465 +++++++++++++++++ scripts/evals/v2_run_experiment.ts | 235 +++++++-- scripts/evals/v2_verify_bind_runner.ts | 86 +++- .../evals/v2_verify_execute_harness_alpha.ts | 476 ++++++++++++++++++ scripts/observability/build_duckdb_etl.ts | 183 ++++++- src/cli/print.ts | 2 +- src/cli/structuredIO.ts | 2 +- src/main.tsx | 2 +- src/observability/harness.ts | 43 ++ src/observability/v2/evalExperimentTypes.ts | 12 + src/observability/v2/evalTypes.ts | 8 + tests/evals/v2/README.md | 157 +++--- tests/evals/v2/V2.1-bind_existing-usage.md | 9 + .../v2/V2.2-execute_harness-alpha-usage.md | 148 ++++++ tests/evals/v2/experiment-runs/README.md | 118 ++--- ...e_harness_smoke_2026-05-02T132328195Z.json | 372 ++++++++++++++ .../_experiment.execute_harness.smoke.json | 38 ++ ...oke_minimal_baseline_default_1e3c516e.json | 131 +++++ ...didate_session_memory_sparse_0acb35d4.json | 132 +++++ .../execute_harness_smoke_minimal.json | 20 + ...imal_baseline_default_1e3c516e.scores.json | 52 ++ 
...session_memory_sparse_0acb35d4.scores.json | 52 ++ ...e_harness_alpha_2026-05-02T141434752Z.json | 89 ++++ 29 files changed, 3336 insertions(+), 204 deletions(-) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T132328195Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" create mode 100644 scripts/evals/v2_emit_fixture_trace.ts create mode 100644 scripts/evals/v2_harness_execution.ts create mode 100644 scripts/evals/v2_verify_execute_harness_alpha.ts create mode 100644 tests/evals/v2/V2.2-execute_harness-alpha-usage.md create mode 100644 tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T132328195Z.json create mode 100644 tests/evals/v2/experiments/_experiment.execute_harness.smoke.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.json 
create mode 100644 tests/evals/v2/runs/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.json create mode 100644 tests/evals/v2/scenarios/execute_harness_smoke_minimal.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.scores.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T141434752Z.json diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" new file mode 100644 index 0000000000..e78e67fac5 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" @@ -0,0 +1,385 @@ +# 开发任务书:V2.2-alpha execute_harness 最小闭环 + +--- + +## 任务书:V2.2-alpha 一键自动化评测最小闭环 + +### 1. 背景 + +当前 V2.1 已完成 `bind_existing` 模式:通过已有 V1 `user_action_id` 自动生成 run、score、compare report、risk verdict、scorecard、exploration signals。README 明确说明当前 V2.1 仍需要先产生真实 V1 traces,并通过 `action_bindings` 绑定 baseline/candidate 的 `user_action_id`。 + +当前 `execute_harness` 已预留,但在没有稳定 headless harness execution adapter 前被明确阻塞。 + +本轮目标是实现 V2.2-alpha:让系统能自动执行最小 experiment,并自动捕获本次运行产生的 V1 `user_action_id`。 + +--- + +## 2. 
本轮目标 + +实现一个最小可用的 `execute_harness` 闭环: + +```text +experiment manifest +→ scenario +→ baseline/candidate variant +→ execute harness +→ capture user_action_id +→ fact-only bind to V1 +→ generate run/score/report/risk_verdict +``` + +--- + +## 3. 本轮不做 + +* 不做长上下文专项; +* 不做 tool/skill 价值专项; +* 不做 repeat=10 鲁棒性; +* 不做远端平台; +* 不做模型裁判; +* 不做自动 git checkout; +* 不做大规模 variant 切换; +* 不改写 V1 观测系统主结构; +* 不把 `risk_verdict` 当最终智能裁判。 + +--- + +## 4. 理解清单 + +先不要改代码。先输出: + +1. 当前 `bind_existing` 已解决什么; +2. `execute_harness` 真正缺什么; +3. 为什么不能靠“取最新 user_action_id”; +4. 为什么需要 `benchmark_run_id`; +5. 第一版为什么只支持 1 scenario / 1 baseline / 1 candidate / repeat=1; +6. 哪些 variant 类型第一版不支持; +7. 本轮如果找不到 headless harness 入口,应该如何停下而不是伪实现。 + +--- + +## 5. Preflight / Reality Check + +先检查仓库: + +1. 是否已有可 headless 执行 prompt 的 CLI / SDK / script 入口; +2. 当前 REPL 是否能非交互式接收 prompt; +3. 是否已有 querySource / user_action / benchmark context 注入机制; +4. V1 event schema 是否能容纳: + + * `benchmark_run_id` + * `experiment_id` + * `scenario_id` + * `variant_id` +5. 当前 V1 DB 是否能按这些字段查询回 user_action; +6. variant manifest 当前能否表达 env/config/model/feature overrides; +7. 当前 `v2_run_experiment.ts` 哪些逻辑可复用,哪些需要分支。 + +如果任一关键点不成立,先输出阻塞点,不要硬实现。 + +--- + +## 6. Phase A:Eval Execution Context + +新增或复用一种运行上下文: + +```ts +interface EvalExecutionContext { + experiment_id: string + scenario_id: string + variant_id: string + benchmark_run_id: string + eval_run_id: string +} +``` + +要求: + +* 自动执行 scenario 时注入; +* V1 事件能记录; +* 后续可通过 `benchmark_run_id` 查回 user_action。 + +验收: + +* 能在 V1 event / DB 中看到该 context; +* 不影响正常用户交互模式; +* 没有 context 时正常运行。 + +--- + +## 7. 
Phase B:HarnessExecutionAdapter + +新增 adapter 接口: + +```ts +interface HarnessExecutionAdapter { + execute(input: { + experimentId: string + scenarioId: string + variantId: string + runId: string + prompt: string + timeoutMs: number + }): Promise<{ + status: 'completed' | 'failed' | 'timeout' + entryUserActionId?: string + stdoutRef?: string + stderrRef?: string + error?: string + }> +} +``` + +第一版实现要求: + +* 使用最稳定的现有 CLI / SDK / script 入口; +* 如果没有稳定入口,保留接口并明确报错; +* 不做伪自动执行。 + +--- + +## 8. Phase C:Action Capture + +实现: + +```text +benchmark_run_id → user_action_id +``` + +查询逻辑。 + +禁止正式使用: + +```text +取最新 user_action_id +``` + +除非只作为 debug fallback,并且不能进入正式 score。 + +验收: + +* 给定 `benchmark_run_id` 能查到唯一 user_action; +* 查不到时 run 状态为 `capture_failed`; +* 查到多个时 run 状态为 `ambiguous_capture`; +* 只有唯一绑定成功时才能进入 score。 + +--- + +## 9. Phase D:Variant Applier v0 + +实现最小 variant 应用能力。 + +第一版只支持: + +* env overrides +* config snapshot ref +* model config +* feature gates + +暂不支持: + +* 自动 git checkout +* 自动源码 patch +* 复杂文件系统 mutation + +验收: + +* baseline 和 candidate 可以在同一 experiment 中按顺序应用; +* 每次运行后能恢复; +* 出错时能清理或提示人工恢复。 + +--- + +## 10. Phase E:Runner `execute_harness` mode + +修改 `v2_run_experiment.ts`: + +当前逻辑: + +```text +mode === execute_harness → throw error +``` + +改为: + +```text +mode === execute_harness +→ create planned run +→ apply variant +→ execute scenario prompt +→ capture user_action_id +→ call existing record/score/compare/gate logic +``` + +限制: + +* 第一版只支持 `repeat_count = 1`; +* 第一版可以只支持一个 scenario; +* 如果 manifest 超出支持范围,明确报错。 + +--- + +## 11. Phase F:最小样例 experiment + +新增一个样例: + +```text +tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +要求: + +* 1 个 scenario; +* baseline_default; +* 1 个 candidate; +* repeat_count = 1; +* mode = execute_harness。 + +--- + +## 12. Phase G:验证 + +必须覆盖: + +1. execute_harness 成功路径; +2. adapter 不存在时报明确错误; +3. capture 失败; +4. capture 多匹配; +5. variant 应用失败; +6. scenario 不存在; +7. baseline/candidate 任一失败; +8. 
生成 report 成功。 + +--- + +## 13. 验收标准 + +完成后必须满足: + +* `bind_existing` 仍然可用; +* `execute_harness` 不再只是固定报错; +* 至少一个最小 smoke experiment 可以自动执行; +* 自动执行后能通过 `benchmark_run_id` 捕获唯一 `user_action_id`; +* captured action 能进入现有 V2 score/report/risk verdict 流程; +* 不使用“最新 user_action_id”作为正式绑定; +* 失败路径有明确状态和错误; +* 不影响普通交互运行。 + +--- + +## 14. 完成后 Checkpoint + +输出: + +```md +## Checkpoint + +### 本轮目标 +实现 V2.2-alpha execute_harness 最小闭环。 + +### 实际完成 +... + +### 修改文件 +... + +### 新增命令 +... + +### 最小验证命令 +... + +### 成功样例 +... + +### 未完成项 +... + +### 风险项 +... + +### 当前一键自动化程度 +- bind_existing: +- execute_harness: + +### 下一步候选 A +扩展多 scenario / 多 candidate。 + +### 下一步候选 B +加入 repeat_count 鲁棒性评测。 + +### 是否等待用户拍板 +是。 +``` + +--- + +# 八、如果是我开发,我会按这个顺序做 + +## Step 1:不要先写 runner,先找入口 + +我会先做 Reality Check: + +```text +有没有现成非交互式 prompt 执行入口? +``` + +因为如果没有入口,runner 写得再漂亮也跑不起来。 + +--- + +## Step 2:先实现 EvalExecutionContext + +我会先解决: + +```text +这次自动运行如何被 V1 标记? +``` + +因为如果不能标记,就没法准确 capture action。 + +--- + +## Step 3:实现 capture + +我会优先做: + +```text +benchmark_run_id → user_action_id +``` + +而不是先做复杂 variant。 + +原因: + +```text +自动评测最重要的不是跑起来,而是跑完以后知道这次运行是哪一次。 +``` + +--- + +## Step 4:做最小 adapter + +我会实现最小可用 headless 执行,不追求通用。 + +第一版只要能跑一个 smoke scenario 就够。 + +--- + +## Step 5:把现有后半段复用起来 + +一旦拿到 `user_action_id`,后面全部复用现有: + +```text +record_run +score +compare +risk_verdict +scorecard +exploration_signals +``` + +因为这部分已经基本成熟。 + +--- + diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" new file mode 100644 index 0000000000..380d8ecdfa --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" @@ -0,0 +1,33 @@ +# V2 Run Comparison + +## 理解清单 + +- baseline_run: run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e +- candidate_run: run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4 +- scenario: execute_harness_smoke_minimal +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## 预期效果 + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## 设计思路 + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 1e3c516e-125b-4575-b3ee-5e7e6b45a8ed +- candidate_user_action_id: 0acb35d4-75b8-4219-86fc-ad5f291bc9ff + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 26628 | 26628 | 0 | unchanged | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T132328195Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T132328195Z.md" new file mode 100644 index 0000000000..eaba0e3c68 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T132328195Z.md" @@ -0,0 +1,61 @@ +# V2 Experiment Summary: execute_harness_smoke + +## 理解清单 + +- experiment: execute_harness_smoke +- mode: execute_harness +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\execute_harness_smoke_2026-05-02T132328195Z.json + +## 预期效果 + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. 
In execute_harness mode, V2.2-alpha executes the scenario first, then captures the generated user_action_id through benchmark_run_id. + +## 设计思路 + +The runner always scores only trace-backed V1 facts. V2.2-alpha adds an execution front half, but the score/compare/gate back half is the same fact-only pipeline used by V2.1. + +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 0 +- risk_status: passed +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: regression_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | efficiency.total_billed_tokens | 26628 | 26628 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- No exploratory signal was derived from the current automatic scorecard; manual review may still find qualitative differences. 
+ +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | +| execute_harness_smoke_minimal | 1 | run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e | candidate_session_memory_sparse | run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4 | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.md" new file mode 100644 index 0000000000..f888d5ad22 --- /dev/null +++ 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.md" @@ -0,0 +1,49 @@ +# V2 Run Report: run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: baseline_default (Baseline Default) +- user_action_id: 1e3c516e-125b-4575-b3ee-5e7e6b45a8ed +- root_query_id: 601131c9-79b4-497c-9dd2-51761534caeb +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T13:23:08.789Z +- duration_ms: 3958 +- query_count: 2 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 26626 +- total_billed_tokens: 26628 +- root_turn_count: 1 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (26628) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" new file mode 100644 index 0000000000..326d69d4e7 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md" @@ -0,0 +1,49 @@ +# V2 Run Report: run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4 + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- user_action_id: 0acb35d4-75b8-4219-86fc-ad5f291bc9ff +- root_query_id: a3751c61-21ef-410c-a46f-bc117accc262 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T13:23:20.784Z +- duration_ms: 3599 +- query_count: 2 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 26626 +- total_billed_tokens: 26628 +- root_turn_count: 1 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (26628) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git a/scripts/evals/v2_emit_fixture_trace.ts b/scripts/evals/v2_emit_fixture_trace.ts new file mode 100644 index 0000000000..7d51c7e8aa --- /dev/null +++ b/scripts/evals/v2_emit_fixture_trace.ts @@ -0,0 +1,131 @@ +import { randomUUID } from 'node:crypto' +import { spawnSync } from 'node:child_process' +import { appendFile, mkdir } from 'node:fs/promises' +import path from 'node:path' + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const observabilityDir = path.join(repoRoot, '.observability') +const duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') + +function requiredEnv(name: string): string { + const value = process.env[name] + if (!value || value.trim() === '') { + throw new Error(`Missing required fixture env: ${name}`) + } + return value +} + +function sqlString(value: string): string { + return `'${value.replaceAll("'", "''")}'` +} + +function writeFixtureDb(params: { + dbPath: string + userActionId: string + queryId: string + startedAt: string + endedAt: string +}) { + const benchmarkRunId = requiredEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID') + const experimentId = requiredEnv('CLAUDE_CODE_EVAL_EXPERIMENT_ID') + const scenarioId = requiredEnv('CLAUDE_CODE_EVAL_SCENARIO_ID') + const variantId = 
requiredEnv('CLAUDE_CODE_EVAL_VARIANT_ID') + const evalRunId = requiredEnv('CLAUDE_CODE_EVAL_RUN_ID') + const sql = [ + 'CREATE TABLE IF NOT EXISTS user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, experiment_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, benchmark_run_id VARCHAR, eval_run_id VARCHAR, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', + 'CREATE TABLE IF NOT EXISTS queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR, turn_count BIGINT, terminal_reason VARCHAR);', + 'CREATE TABLE IF NOT EXISTS tools(user_action_id VARCHAR, tool_name VARCHAR, is_closed BOOLEAN, has_failed BOOLEAN);', + 'CREATE TABLE IF NOT EXISTS subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', + 'CREATE TABLE IF NOT EXISTS recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', + 'CREATE TABLE IF NOT EXISTS metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', + `INSERT INTO user_actions VALUES (${sqlString(params.startedAt.slice(0, 10))}, ${sqlString(params.userActionId)}, ${sqlString(params.startedAt)}, 0, ${sqlString(params.endedAt)}, 10, 10, 2, 1, 1, 0, 0, 0, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, 100, 10, 0, 0, 100, 110, 100, 0);`, + 
`INSERT INTO queries VALUES (${sqlString(params.queryId)}, ${sqlString(params.userActionId)}, 'main_thread', ${sqlString(params.startedAt)}, 1, 'fixture_completed');`, + `INSERT INTO metrics_integrity_daily VALUES (${sqlString(params.startedAt.slice(0, 10))}, 1, 1, 1, 1);`, + ].join('\n') + const result = spawnSync(duckdbExe, [params.dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + throw new Error( + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim(), + ) + } +} + +async function main(): Promise { + await mkdir(observabilityDir, { recursive: true }) + const now = new Date() + const endedAt = new Date(now.getTime() + 10).toISOString() + const filePath = path.join( + observabilityDir, + `events-${now.toISOString().slice(0, 10).replaceAll('-', '')}.jsonl`, + ) + const userActionId = randomUUID() + const queryId = randomUUID() + const fixtureDbPath = process.env.V2_FIXTURE_DB_PATH + if (process.env.V2_FIXTURE_FAIL_VARIANT === process.env.CLAUDE_CODE_EVAL_VARIANT_ID) { + throw new Error(`Fixture requested failure for variant ${process.env.CLAUDE_CODE_EVAL_VARIANT_ID}`) + } + if (fixtureDbPath) { + writeFixtureDb({ + dbPath: fixtureDbPath, + userActionId, + queryId, + startedAt: now.toISOString(), + endedAt, + }) + if (process.env.V2_FIXTURE_DUPLICATE_CAPTURE === '1') { + writeFixtureDb({ + dbPath: fixtureDbPath, + userActionId: randomUUID(), + queryId: randomUUID(), + startedAt: now.toISOString(), + endedAt, + }) + } + console.log(`fixture_user_action_id=${userActionId}`) + return + } + const base = { + schema_version: '2026-04-19', + level: 'info', + component: 'v2_fixture_trace', + session_id: `v2-fixture-${randomUUID()}`, + conversation_id: `v2-fixture-${randomUUID()}`, + user_action_id: userActionId, + query_id: queryId, + query_source: 'repl_main_thread', + experiment_id: requiredEnv('CLAUDE_CODE_EVAL_EXPERIMENT_ID'), + scenario_id: 
requiredEnv('CLAUDE_CODE_EVAL_SCENARIO_ID'), + variant_id: requiredEnv('CLAUDE_CODE_EVAL_VARIANT_ID'), + benchmark_run_id: requiredEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID'), + eval_run_id: requiredEnv('CLAUDE_CODE_EVAL_RUN_ID'), + cwd: repoRoot, + git_branch: null, + build_version: 'v2-fixture', + } + const started = { + ...base, + ts_wall: now.toISOString(), + ts_mono_ms: 1, + event: 'query.started', + payload: {}, + } + const ended = { + ...base, + ts_wall: endedAt, + ts_mono_ms: 11, + event: 'query.terminated', + payload: { reason: 'fixture_completed' }, + } + await appendFile(filePath, `${JSON.stringify(started)}\n${JSON.stringify(ended)}\n`, 'utf8') + console.log(`fixture_user_action_id=${userActionId}`) +} + +main().catch(error => { + console.error(error instanceof Error ? error.message : error) + process.exit(1) +}) diff --git a/scripts/evals/v2_harness_execution.ts b/scripts/evals/v2_harness_execution.ts new file mode 100644 index 0000000000..861b1c6f0e --- /dev/null +++ b/scripts/evals/v2_harness_execution.ts @@ -0,0 +1,465 @@ +import { spawnSync } from 'node:child_process' +import { existsSync } from 'node:fs' +import { mkdir, writeFile } from 'node:fs/promises' +import path from 'node:path' + +import type { EvalScenario, EvalVariant } from '../../src/observability/v2/evalTypes' +import type { EvalExperimentExecutionConfig } from '../../src/observability/v2/evalExperimentTypes' + +type JsonRecord = Record + +export interface EvalExecutionContext { + experiment_id: string + scenario_id: string + variant_id: string + benchmark_run_id: string + eval_run_id: string +} + +export interface HarnessExecutionAdapterInput { + experimentId: string + scenarioId: string + variantId: string + runId: string + prompt: string + timeoutMs: number +} + +export interface HarnessExecutionAdapterOutput { + status: 'completed' | 'failed' | 'timeout' + entryUserActionId?: string + stdoutRef?: string + stderrRef?: string + error?: string +} + +export interface 
HarnessExecutionAdapter { + execute(input: HarnessExecutionAdapterInput): Promise +} + +export interface CaptureResult { + status: 'captured' | 'capture_failed' | 'ambiguous_capture' + user_action_id?: string + match_count: number + error?: string +} + +export interface VariantApplyResult { + env: Record + cliArgs: string[] + metadata: JsonRecord +} + +export interface ExecuteHarnessResult { + execution: HarnessExecutionAdapterOutput + capture: CaptureResult + variant_apply: VariantApplyResult + benchmark_run_id: string + eval_run_id: string +} + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') +const defaultDbPath = path.join(repoRoot, '.observability', 'observability_v1.duckdb') +const harnessRunsRoot = path.join(repoRoot, '.observability', 'v2-harness-runs') + +function sqlString(value: string): string { + return `'${value.replaceAll("'", "''")}'` +} + +function sanitizeId(value: string): string { + return value.replace(/[^a-zA-Z0-9_-]+/g, '_').replace(/^_+|_+$/g, '') +} + +function stringifyEnv(value: string | number | boolean): string { + return typeof value === 'string' ? value : String(value) +} + +function mergeEnvRecords(...records: Array | undefined>) { + const env: Record = {} + for (const record of records) { + for (const [key, value] of Object.entries(record ?? {})) { + env[key] = stringifyEnv(value) + } + } + return env +} + +function featureGateEnvName(key: string): string { + return `CLAUDE_CODE_FEATURE_${key.replace(/[^a-zA-Z0-9]+/g, '_').toUpperCase()}` +} + +function queryDuckDb(dbPath: string, sql: string): T[] { + const result = spawnSync(duckdbExe, ['-json', dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + const message = + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? 
'').trim() + throw new Error(`DuckDB query failed: ${message}`) + } + const output = String(result.stdout ?? '').trim() + return output ? (JSON.parse(output) as T[]) : [] +} + +function escapeSqlLiteral(value: string): string { + return value.replaceAll("'", "''") +} + +function relationColumns(dbPath: string, relation: string): string[] { + const rows = queryDuckDb<{ name?: string }>( + dbPath, + `PRAGMA table_info('${escapeSqlLiteral(relation)}');`, + ) + return rows + .map(row => (typeof row.name === 'string' ? row.name : null)) + .filter((value): value is string => Boolean(value)) +} + +function hasRelationColumn(dbPath: string, relation: string, column: string): boolean { + return relationColumns(dbPath, relation).includes(column) +} + +export function buildEvalContextEnv(context: EvalExecutionContext): Record { + return { + CLAUDE_CODE_EVAL_EXPERIMENT_ID: context.experiment_id, + CLAUDE_CODE_EVAL_SCENARIO_ID: context.scenario_id, + CLAUDE_CODE_EVAL_VARIANT_ID: context.variant_id, + CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID: context.benchmark_run_id, + CLAUDE_CODE_EVAL_RUN_ID: context.eval_run_id, + } +} + +export function isExecuteHarnessDisabled(args: Record): boolean { + return ( + Boolean(args['disable-execute-harness']) || + process.env.V2_2_EXECUTE_HARNESS === '0' || + process.env.V2_EXECUTE_HARNESS === '0' + ) +} + +export function createRunIdentity(params: { + experimentId: string + scenarioId: string + variantId: string + stamp: string +}): { eval_run_id: string; benchmark_run_id: string } { + const base = sanitizeId( + `${params.experimentId}_${params.scenarioId}_${params.variantId}_${params.stamp}`, + ) + return { + eval_run_id: `eval_${base}`, + benchmark_run_id: `bench_${base}`, + } +} + +export function applyVariantV0(params: { + variant: EvalVariant + execution?: EvalExperimentExecutionConfig + context: EvalExecutionContext +}): VariantApplyResult { + const { variant, execution, context } = params + const featureGateEnv = Object.fromEntries( + 
Object.entries(variant.feature_gates ?? {}).map(([key, value]) => [ + featureGateEnvName(key), + stringifyEnv(value), + ]), + ) + const env = { + ...buildEvalContextEnv(context), + ...mergeEnvRecords(execution?.env, variant.env_overrides), + ...featureGateEnv, + } + const cliArgs: string[] = [] + const maxTurns = variant.model_config?.max_turns ?? execution?.max_turns + if (variant.model_config?.model) cliArgs.push('--model', variant.model_config.model) + if (variant.model_config?.thinking) cliArgs.push('--thinking', variant.model_config.thinking) + if (typeof maxTurns === 'number') cliArgs.push('--max-turns', String(maxTurns)) + if (typeof variant.model_config?.max_budget_usd === 'number') { + cliArgs.push('--max-budget-usd', String(variant.model_config.max_budget_usd)) + } + + if (variant.config_snapshot_ref) { + env.CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF = variant.config_snapshot_ref + } + if (execution?.require_config_snapshot && variant.config_snapshot_ref) { + const candidatePath = path.resolve(repoRoot, variant.config_snapshot_ref) + if (!existsSync(candidatePath)) { + throw new Error( + `Variant apply failed: config_snapshot_ref does not exist: ${variant.config_snapshot_ref}`, + ) + } + } + + return { + env, + cliArgs, + metadata: { + supported_variant_fields: [ + 'env_overrides', + 'config_snapshot_ref', + 'model_config', + 'feature_gates', + ], + config_snapshot_ref: variant.config_snapshot_ref ?? null, + feature_gate_count: Object.keys(variant.feature_gates ?? {}).length, + env_override_count: Object.keys(variant.env_overrides ?? {}).length, + model_config: variant.model_config ?? 
null, + }, + } +} + +function expandTemplateArgs(args: string[], input: HarnessExecutionAdapterInput): string[] { + return args.map(arg => + arg + .replaceAll('{prompt}', input.prompt) + .replaceAll('{runId}', input.runId) + .replaceAll('{experimentId}', input.experimentId) + .replaceAll('{scenarioId}', input.scenarioId) + .replaceAll('{variantId}', input.variantId), + ) +} + +export class DisabledHarnessExecutionAdapter implements HarnessExecutionAdapter { + async execute(): Promise { + return { + status: 'failed', + error: + 'execute_harness adapter is disabled. Use bind_existing or remove --disable-execute-harness/V2_2_EXECUTE_HARNESS=0.', + } + } +} + +export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter { + constructor( + private readonly options: { + execution?: EvalExperimentExecutionConfig + env: Record + cliArgs: string[] + }, + ) {} + + async execute(input: HarnessExecutionAdapterInput): Promise { + const runDir = path.join(harnessRunsRoot, sanitizeId(input.runId)) + await mkdir(runDir, { recursive: true }) + const stdoutPath = path.join(runDir, 'stdout.txt') + const stderrPath = path.join(runDir, 'stderr.txt') + const commandPath = path.join(runDir, 'command.json') + const command = this.options.execution?.command ?? 'bun' + const defaultArgs = [ + 'run', + 'src/entrypoints/cli.tsx', + '--print', + '--output-format', + 'json', + ...this.options.cliArgs, + input.prompt, + ] + const args = this.options.execution?.args + ? expandTemplateArgs(this.options.execution.args, input) + : defaultArgs + + await writeFile( + commandPath, + `${JSON.stringify( + { + command, + args, + timeout_ms: input.timeoutMs, + env_keys: Object.keys(this.options.env).sort(), + }, + null, + 2, + )}\n`, + 'utf8', + ) + + const result = spawnSync(command, args, { + cwd: repoRoot, + encoding: 'utf8', + timeout: input.timeoutMs, + env: { + ...process.env, + ...this.options.env, + }, + }) + await writeFile(stdoutPath, String(result.stdout ?? 
''), 'utf8') + await writeFile(stderrPath, String(result.stderr ?? result.error?.message ?? ''), 'utf8') + + const stdoutRef = path.relative(repoRoot, stdoutPath) + const stderrRef = path.relative(repoRoot, stderrPath) + if (result.error && result.error.name === 'ETIMEDOUT') { + return { + status: 'timeout', + stdoutRef, + stderrRef, + error: result.error.message, + } + } + if (result.status !== 0) { + return { + status: 'failed', + stdoutRef, + stderrRef, + error: + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim() || + `command exited with status ${result.status}`, + } + } + return { + status: 'completed', + stdoutRef, + stderrRef, + } + } +} + +export function createHarnessExecutionAdapter(params: { + execution?: EvalExperimentExecutionConfig + env: Record + cliArgs: string[] +}): HarnessExecutionAdapter { + const adapter = params.execution?.adapter ?? 'cli_print' + if (adapter === 'disabled') return new DisabledHarnessExecutionAdapter() + if (adapter === 'cli_print') return new CliPrintHarnessExecutionAdapter(params) + throw new Error(`Unsupported execute_harness adapter: ${adapter}`) +} + +export function rebuildObservabilityDb(dbPath?: string): void { + const args = ['run', 'scripts/observability/build_duckdb_etl.ts'] + if (dbPath) args.push('--db-path', dbPath) + const result = spawnSync('bun', args, { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + const message = + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim() + throw new Error(`Failed to rebuild V1 observability DB before capture: ${message}`) + } +} + +export function captureUserActionByBenchmarkRunId(params: { + benchmarkRunId: string + dbPath?: string +}): CaptureResult { + try { + const captureDbPath = params.dbPath ?? 
defaultDbPath + if (!hasRelationColumn(captureDbPath, 'user_actions', 'benchmark_run_id')) { + return { + status: 'capture_failed', + match_count: 0, + error: [ + `user_actions is missing benchmark_run_id in ${captureDbPath}.`, + 'The V1 DuckDB schema is stale and was not rebuilt with the current ETL.', + 'Run bun run scripts/observability/build_duckdb_etl.ts and retry.', + ].join(' '), + } + } + const rows = queryDuckDb<{ user_action_id: string }>( + captureDbPath, + [ + 'SELECT DISTINCT user_action_id', + 'FROM user_actions', + `WHERE benchmark_run_id = ${sqlString(params.benchmarkRunId)}`, + " AND TRIM(COALESCE(user_action_id, '')) <> ''", + 'ORDER BY user_action_id;', + ].join(' '), + ) + if (rows.length === 0) { + return { + status: 'capture_failed', + match_count: 0, + error: `No user_action_id found for benchmark_run_id=${params.benchmarkRunId}`, + } + } + if (rows.length > 1) { + return { + status: 'ambiguous_capture', + match_count: rows.length, + error: `Multiple user_action_id values found for benchmark_run_id=${params.benchmarkRunId}`, + } + } + return { + status: 'captured', + user_action_id: rows[0].user_action_id, + match_count: 1, + } + } catch (error) { + return { + status: 'capture_failed', + match_count: 0, + error: error instanceof Error ? error.message : String(error), + } + } +} + +export async function executeHarnessAndCapture(params: { + experimentId: string + scenario: EvalScenario + variant: EvalVariant + execution?: EvalExperimentExecutionConfig + evalRunId: string + benchmarkRunId: string + dbPath?: string +}): Promise { + const context: EvalExecutionContext = { + experiment_id: params.experimentId, + scenario_id: params.scenario.scenario_id, + variant_id: params.variant.variant_id, + benchmark_run_id: params.benchmarkRunId, + eval_run_id: params.evalRunId, + } + const variantApply = applyVariantV0({ + variant: params.variant, + execution: params.execution, + context, + }) + const timeoutMs = params.execution?.timeout_ms ?? 
180_000 + const adapter = createHarnessExecutionAdapter({ + execution: params.execution, + env: variantApply.env, + cliArgs: variantApply.cliArgs, + }) + const execution = await adapter.execute({ + experimentId: params.experimentId, + scenarioId: params.scenario.scenario_id, + variantId: params.variant.variant_id, + runId: params.evalRunId, + prompt: params.scenario.input_prompt, + timeoutMs, + }) + const shouldRebuildDb = + execution.status === 'completed' && + (!params.dbPath || + (!params.execution?.command && !params.execution?.args)) + + if (shouldRebuildDb) { + rebuildObservabilityDb(params.dbPath) + } + const capture = + execution.status === 'completed' + ? captureUserActionByBenchmarkRunId({ + benchmarkRunId: params.benchmarkRunId, + dbPath: params.dbPath, + }) + : { + status: 'capture_failed' as const, + match_count: 0, + error: execution.error ?? `Harness execution did not complete: ${execution.status}`, + } + return { + execution, + capture, + variant_apply: variantApply, + benchmark_run_id: params.benchmarkRunId, + eval_run_id: params.evalRunId, + } +} diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index 8163b3888b..5c7d7866d1 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -2,7 +2,7 @@ import { spawnSync } from 'node:child_process' import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises' import path from 'node:path' -import type { EvalScore } from '../../src/observability/v2/evalTypes' +import type { EvalScenario, EvalScore, EvalVariant } from '../../src/observability/v2/evalTypes' import type { EvalExperimentActionBinding, EvalExperimentFlatActionBinding, @@ -13,11 +13,20 @@ import type { EvalScoreSpec, EvalScoreSpecCollection, } from '../../src/observability/v2/evalExperimentTypes' +import { + createRunIdentity, + executeHarnessAndCapture, + isExecuteHarnessDisabled, + type ExecuteHarnessResult, +} from './v2_harness_execution' interface 
CandidateExperimentResult { candidate_variant_id: string candidate_run_id: string candidate_user_action_id: string + candidate_eval_run_id?: string + candidate_benchmark_run_id?: string + candidate_execution?: ExecuteHarnessResult compare_report: string gate_results: GateResult[] scorecard_summary: ScorecardItem[] @@ -30,6 +39,9 @@ interface ScenarioExperimentResult { repeat_index: number baseline_run_id: string baseline_user_action_id: string + baseline_eval_run_id?: string + baseline_benchmark_run_id?: string + baseline_execution?: ExecuteHarnessResult candidates: CandidateExperimentResult[] } @@ -158,6 +170,37 @@ async function loadGatePolicy(gatePolicyId?: string): Promise { + const filePath = path.join(evalRoot, 'scenarios', `${scenarioId}.json`) + try { + return await readJson(filePath) + } catch { + throw new Error(`Scenario not found: ${scenarioId}`) + } +} + +async function loadVariant(variantId: string): Promise { + const directPath = path.join(evalRoot, 'variants', `${variantId}.json`) + try { + return await readJson(directPath) + } catch { + // Fall through to template compatibility paths used by V2.1 samples. + } + + const templatePath = path.join(evalRoot, 'variants', `${variantId}.template.json`) + try { + return await readJson(templatePath) + } catch { + // Fall through to baseline.template.json compatibility. 
+ } + + const baseline = await readJson( + path.join(evalRoot, 'variants', 'baseline.template.json'), + ) + if (baseline.variant_id === variantId) return baseline + throw new Error(`Variant not found: ${variantId}`) +} + function normalizeGateRules(gatePolicy: EvalGatePolicy | undefined): EvalGatePolicyRule[] { if (!gatePolicy) return [] return [ @@ -530,6 +573,24 @@ function buildRecordRunArgs(params: { return args } +function requireCapturedAction(params: { + label: string + result: ExecuteHarnessResult +}): string { + const { label, result } = params + if (result.execution.status !== 'completed') { + throw new Error( + `${label} execute_harness failed: ${result.execution.error ?? result.execution.status}`, + ) + } + if (result.capture.status !== 'captured' || !result.capture.user_action_id) { + throw new Error( + `${label} action capture ${result.capture.status}: ${result.capture.error ?? 'no user_action_id'}`, + ) + } + return result.capture.user_action_id +} + function summarizeRisk(results: ScenarioExperimentResult[]): RiskVerdict { const candidates = results.flatMap(result => result.candidates) const allGateResults = candidates.flatMap(candidate => candidate.gate_results) @@ -660,7 +721,7 @@ function buildMarkdownReport(params: { .join('\n') const reviewMode = aggregateReviewMode(results) - return `# V2.1 Experiment Summary: ${experiment.experiment_id} + return `# V2 Experiment Summary: ${experiment.experiment_id} ## 理解清单 @@ -675,11 +736,11 @@ function buildMarkdownReport(params: { ## 预期效果 -This summary records a manifest-driven V2.1 experiment run. In bind-existing mode, every generated V2 run is backed by an existing V1 user_action_id. +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2.2-alpha executes the scenario first, then captures the generated user_action_id through benchmark_run_id. ## 设计思路 -V2.1 intentionally does not execute the harness automatically. 
It turns existing V1 traces into comparable V2 runs, then runs scorer, comparison, and regression-risk gate scripts. +The runner always scores only trace-backed V1 facts. V2.2-alpha adds an execution front half, but the score/compare/gate back half is the same fact-only pipeline used by V2.1. ## Risk Verdict @@ -724,17 +785,28 @@ async function main(): Promise { const experimentPath = await findExperimentPath(experimentArg) const experiment = await readJson(experimentPath) - const mode = experiment.mode ?? 'bind_existing' - - if (mode === 'execute_harness') { + const requestedMode = experiment.mode ?? 'bind_existing' + const automationDisabled = isExecuteHarnessDisabled(args) + const mode = + requestedMode === 'execute_harness' && automationDisabled + ? 'bind_existing' + : requestedMode + + if ( + requestedMode === 'execute_harness' && + automationDisabled && + experiment.execution?.allow_fallback_to_bind_existing === false + ) { throw new Error( - 'execute_harness mode is not implemented yet: missing headless harness execution adapter', + 'execute_harness is disabled and this experiment does not allow bind_existing fallback.', ) } if (mode !== 'bind_existing') { - throw new Error( - `Unsupported V2.1 experiment mode: ${mode}`, - ) + if (mode !== 'execute_harness') { + throw new Error( + `Unsupported V2 experiment mode: ${mode}`, + ) + } } const scenarioIds = experiment.scenario_ids ?? [] @@ -766,36 +838,81 @@ async function main(): Promise { } } const repeatCount = Math.max(experiment.repeat_count ?? 
1, 1) + if (mode === 'execute_harness') { + if (scenarioIds.length !== 1) { + throw new Error('V2.2-alpha execute_harness supports exactly one scenario.') + } + if (experiment.candidate_variant_ids.length !== 1) { + throw new Error('V2.2-alpha execute_harness supports exactly one candidate variant.') + } + if (repeatCount !== 1) { + throw new Error('V2.2-alpha execute_harness supports repeat_count=1 only.') + } + } const results: ScenarioExperimentResult[] = [] + if (mode === 'bind_existing') { + for (const scenarioId of scenarioIds) { + for (const variantId of [experiment.baseline_variant_id, ...experiment.candidate_variant_ids]) { + const userActionId = findBoundUserActionId({ + experiment, + scenarioId, + variantId, + }) + if (!userActionId) { + throw new Error( + `Missing action binding for scenario=${scenarioId}, variant=${variantId}. bind_existing mode requires user_action_id bindings.`, + ) + } + } + } + } + + const executionStamp = new Date().toISOString().replace(/[:.]/g, '') + for (const scenarioId of scenarioIds) { - for (const variantId of [experiment.baseline_variant_id, ...experiment.candidate_variant_ids]) { - const userActionId = findBoundUserActionId({ + const scenario = mode === 'execute_harness' ? 
await loadScenario(scenarioId) : undefined + + for (let repeatIndex = 1; repeatIndex <= repeatCount; repeatIndex += 1) { + let baselineUserActionId = findBoundUserActionId({ experiment, scenarioId, - variantId, + variantId: experiment.baseline_variant_id, }) - if (!userActionId) { + let baselineExecution: ExecuteHarnessResult | undefined + let baselineEvalRunId: string | undefined + let baselineBenchmarkRunId: string | undefined + if (mode === 'execute_harness') { + if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) + const baselineVariant = await loadVariant(experiment.baseline_variant_id) + const identity = createRunIdentity({ + experimentId: experiment.experiment_id, + scenarioId, + variantId: experiment.baseline_variant_id, + stamp: executionStamp, + }) + baselineEvalRunId = identity.eval_run_id + baselineBenchmarkRunId = identity.benchmark_run_id + baselineExecution = await executeHarnessAndCapture({ + experimentId: experiment.experiment_id, + scenario, + variant: baselineVariant, + execution: experiment.execution, + evalRunId: identity.eval_run_id, + benchmarkRunId: identity.benchmark_run_id, + dbPath, + }) + baselineUserActionId = requireCapturedAction({ + label: `baseline scenario=${scenarioId} variant=${experiment.baseline_variant_id}`, + result: baselineExecution, + }) + } + if (!baselineUserActionId) { throw new Error( - `Missing action binding for scenario=${scenarioId}, variant=${variantId}. V2.1 bind_existing mode requires user_action_id bindings.`, + `Missing action binding for scenario=${scenarioId}, variant=${experiment.baseline_variant_id}. bind_existing mode requires user_action_id bindings.`, ) } - } - } - for (const scenarioId of scenarioIds) { - const baselineUserActionId = findBoundUserActionId({ - experiment, - scenarioId, - variantId: experiment.baseline_variant_id, - }) - if (!baselineUserActionId) { - throw new Error( - `Missing action binding for scenario=${scenarioId}, variant=${experiment.baseline_variant_id}. 
V2.1 bind_existing mode requires user_action_id bindings.`, - ) - } - - for (let repeatIndex = 1; repeatIndex <= repeatCount; repeatIndex += 1) { const baselineOutput = runBunScript( 'scripts/evals/v2_record_run.ts', buildRecordRunArgs({ @@ -814,11 +931,39 @@ async function main(): Promise { const candidates: CandidateExperimentResult[] = [] for (const candidateVariantId of experiment.candidate_variant_ids) { - const candidateActionId = findBoundUserActionId({ + let candidateActionId = findBoundUserActionId({ experiment, scenarioId, variantId: candidateVariantId, }) + let candidateExecution: ExecuteHarnessResult | undefined + let candidateEvalRunId: string | undefined + let candidateBenchmarkRunId: string | undefined + if (mode === 'execute_harness') { + if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) + const candidateVariant = await loadVariant(candidateVariantId) + const identity = createRunIdentity({ + experimentId: experiment.experiment_id, + scenarioId, + variantId: candidateVariantId, + stamp: executionStamp, + }) + candidateEvalRunId = identity.eval_run_id + candidateBenchmarkRunId = identity.benchmark_run_id + candidateExecution = await executeHarnessAndCapture({ + experimentId: experiment.experiment_id, + scenario, + variant: candidateVariant, + execution: experiment.execution, + evalRunId: identity.eval_run_id, + benchmarkRunId: identity.benchmark_run_id, + dbPath, + }) + candidateActionId = requireCapturedAction({ + label: `candidate scenario=${scenarioId} variant=${candidateVariantId}`, + result: candidateExecution, + }) + } if (!candidateActionId) { throw new Error( `Missing candidate user_action_id for scenario=${scenarioId}, variant=${candidateVariantId}`, @@ -868,6 +1013,9 @@ async function main(): Promise { candidate_variant_id: candidateVariantId, candidate_run_id: candidateRunId, candidate_user_action_id: candidateActionId, + candidate_eval_run_id: candidateEvalRunId, + candidate_benchmark_run_id: candidateBenchmarkRunId, + 
candidate_execution: candidateExecution, compare_report: extractCreatedReport(compareOutput), gate_results: gateResults, scorecard_summary: scorecard, @@ -887,6 +1035,9 @@ async function main(): Promise { repeat_index: repeatIndex, baseline_run_id: baselineRunId, baseline_user_action_id: baselineUserActionId, + baseline_eval_run_id: baselineEvalRunId, + baseline_benchmark_run_id: baselineBenchmarkRunId, + baseline_execution: baselineExecution, candidates, }) } @@ -933,6 +1084,8 @@ async function main(): Promise { manifest_ref: path.relative(repoRoot, experimentPath), generated_at: generatedAt, mode, + requested_mode: requestedMode, + automation_disabled: automationDisabled, run_refs: runRefs(results), score_refs: scoreRefs(results), report_refs: reportRefs(results, outputMarkdownRel), @@ -948,7 +1101,21 @@ async function main(): Promise { warnings: warningMessages, experiment, runner: { + requested_mode: requestedMode, mode, + automation_disabled: automationDisabled, + fallback_reason: + requestedMode === 'execute_harness' && mode === 'bind_existing' + ? 'execute_harness disabled by flag or environment; bind_existing fallback used' + : null, + execute_harness_alpha_limits: + mode === 'execute_harness' + ? { + scenario_count: 1, + candidate_count: 1, + repeat_count: 1, + } + : null, score_spec_ids: experiment.score_spec_ids ?? [], gate_policy_id: experiment.gate_policy_id ?? 
null, }, @@ -969,8 +1136,8 @@ async function main(): Promise { }), ) - console.log(`Created V2.1 experiment summary: ${outputJsonRel}`) - console.log(`Created V2.1 experiment report: ${outputMarkdownRel}`) + console.log(`Created V2 experiment summary: ${outputJsonRel}`) + console.log(`Created V2 experiment report: ${outputMarkdownRel}`) } main().catch(error => { diff --git a/scripts/evals/v2_verify_bind_runner.ts b/scripts/evals/v2_verify_bind_runner.ts index b77927c692..9d310ac430 100644 --- a/scripts/evals/v2_verify_bind_runner.ts +++ b/scripts/evals/v2_verify_bind_runner.ts @@ -1,5 +1,5 @@ import { spawnSync } from 'node:child_process' -import { mkdir, readFile, rm, unlink, writeFile } from 'node:fs/promises' +import { mkdir, readFile, readdir, rm, unlink, writeFile } from 'node:fs/promises' import path from 'node:path' type JsonRecord = Record @@ -12,6 +12,7 @@ interface VerifyCase { expected_error?: string db_path?: string no_snapshot_db?: boolean + extra_args?: string[] } interface VerifyResult { @@ -97,6 +98,20 @@ async function writeJson(filePath: string, value: unknown): Promise { await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') } +async function findChildDir(parent: string, matcher: (name: string) => boolean): Promise { + const entries = await readdir(parent, { withFileTypes: true }) + const found = entries.find(entry => entry.isDirectory() && matcher(entry.name)) + if (!found) throw new Error(`Directory not found under ${parent}`) + return path.join(parent, found.name) +} + +async function resolveV2ReportRoot(): Promise { + const taskRoot = path.join(repoRoot, 'ObservrityTask') + const versionsRoot = await findChildDir(taskRoot, name => name.startsWith('10-')) + const v2Root = path.join(versionsRoot, 'v2') + return await findChildDir(v2Root, name => name.startsWith('06-')) +} + function runBun(args: string[]) { return spawnSync('bun', ['run', ...args], { cwd: repoRoot, @@ -105,7 +120,8 @@ function runBun(args: string[]) { } 
function extractOutputRef(output: string, label: string): string | undefined { - const match = output.match(new RegExp(`${label}:\\s*(.+)`)) + const flexibleLabel = label.replace('V2.1', 'V2(?:\\.1)?') + const match = output.match(new RegExp(`${flexibleLabel}:\\s*(.+)`)) return match?.[1]?.trim() } @@ -125,6 +141,7 @@ async function cleanupGeneratedArtifacts(summaryRef?: string): Promise { score_refs?: string[] report_refs?: string[] } + const v2ReportRoot = await resolveV2ReportRoot() const runReportRefs = (summary.run_refs ?? []).map(runRef => { const runId = path.basename(runRef, '.json') return path.join( @@ -147,6 +164,56 @@ async function cleanupGeneratedArtifacts(summaryRef?: string): Promise { } } +async function cleanupGeneratedArtifactsResolved(summaryRef?: string): Promise { + if (!summaryRef) return + const summaryPath = relToAbs(summaryRef) + const summary = JSON.parse(await readFile(summaryPath, 'utf8')) as { + run_refs?: string[] + score_refs?: string[] + report_refs?: string[] + } + const v2ReportRoot = await resolveV2ReportRoot() + const runReportRefs = (summary.run_refs ?? []).map(runRef => + path.join(v2ReportRoot, `${path.basename(runRef, '.json')}.md`), + ) + const refs = [ + ...(summary.run_refs ?? []), + ...(summary.score_refs ?? []), + ...(summary.report_refs ?? 
[]), + ...runReportRefs, + summaryRef, + ] + for (const ref of refs) { + await removeIfExists(relToAbs(ref)) + } +} + +async function listFilesInDir(dir: string): Promise { + const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) + return entries + .filter(entry => entry.isFile()) + .map(entry => path.join(dir, entry.name)) +} + +async function listGeneratedArtifactFiles(): Promise> { + const v2ReportRoot = await resolveV2ReportRoot() + const files = [ + ...(await listFilesInDir(path.join(repoRoot, 'tests', 'evals', 'v2', 'runs'))), + ...(await listFilesInDir(path.join(repoRoot, 'tests', 'evals', 'v2', 'scores'))), + ...(await listFilesInDir(v2ReportRoot)), + ] + return new Set(files.map(file => path.resolve(file))) +} + +async function cleanupArtifactsCreatedAfter(before: Set): Promise { + const after = await listGeneratedArtifactFiles() + for (const filePath of after) { + if (!before.has(filePath)) { + await removeIfExists(filePath) + } + } +} + function assertExperimentArtifactSchema(summary: JsonRecord): string[] { const errors: string[] = [] const requiredStrings = ['experiment_id', 'manifest_ref', 'generated_at', 'mode'] @@ -208,9 +275,11 @@ async function createMissingRootDb(): Promise { async function runCase(testCase: VerifyCase): Promise { const manifestPath = path.join(manifestsRoot, `${testCase.case_id}.json`) await writeJson(manifestPath, testCase.manifest) + const beforeArtifacts = await listGeneratedArtifactFiles() const args = ['scripts/evals/v2_run_experiment.ts', '--experiment', manifestPath] if (testCase.db_path) args.push('--db', testCase.db_path) if (testCase.no_snapshot_db) args.push('--no-snapshot-db') + if (testCase.extra_args) args.push(...testCase.extra_args) const result = runBun(args) const output = [String(result.stdout ?? '').trim(), String(result.stderr ?? 
'').trim()] @@ -220,6 +289,7 @@ async function runCase(testCase: VerifyCase): Promise { const reportRef = extractOutputRef(output, 'Created V2.1 experiment report') if (testCase.expect === 'failure') { + await cleanupArtifactsCreatedAfter(beforeArtifacts) const hasExpectedError = result.status !== 0 && (!testCase.expected_error || output.includes(testCase.expected_error)) @@ -242,8 +312,9 @@ async function runCase(testCase: VerifyCase): Promise { passed = false errorExcerpt = schemaErrors.join('; ') } - await cleanupGeneratedArtifacts(summaryRef) + await cleanupGeneratedArtifactsResolved(summaryRef) } + await cleanupArtifactsCreatedAfter(beforeArtifacts) return { case_id: testCase.case_id, @@ -398,11 +469,10 @@ async function main(): Promise { }), }, { - case_id: 'execute_harness_blocked', - description: 'execute_harness mode should fail with the explicit adapter error.', - expect: 'failure', - expected_error: - 'execute_harness mode is not implemented yet: missing headless harness execution adapter', + case_id: 'execute_harness_disabled_fallback', + description: 'execute_harness can be disabled and falls back to bind_existing when action bindings are present.', + expect: 'success', + extra_args: ['--disable-execute-harness'], manifest: experiment({ id: `v2_1_verify_execute_harness_${stamp}`, scenarioIds: ['cost_sensitive_task'], diff --git a/scripts/evals/v2_verify_execute_harness_alpha.ts b/scripts/evals/v2_verify_execute_harness_alpha.ts new file mode 100644 index 0000000000..2db9cc568f --- /dev/null +++ b/scripts/evals/v2_verify_execute_harness_alpha.ts @@ -0,0 +1,476 @@ +import { spawnSync } from 'node:child_process' +import { mkdir, readFile, readdir, rm, unlink, writeFile } from 'node:fs/promises' +import path from 'node:path' + +type JsonRecord = Record + +interface VerifyCase { + case_id: string + description: string + manifest: JsonRecord + expect: 'success' | 'failure' + expected_error?: string + db_path?: string + no_snapshot_db?: boolean + extra_args?: 
string[] +} + +interface VerifyResult { + case_id: string + description: string + passed: boolean + expected: 'success' | 'failure' + status: number | null + summary_ref?: string + report_ref?: string + artifacts_cleaned?: boolean + error_excerpt?: string +} + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') +const stamp = new Date().toISOString().replace(/[:.]/g, '') +const tempRoot = path.join(repoRoot, '.observability', 'v2-execute-harness-verification', stamp) +const manifestsRoot = path.join(tempRoot, 'manifests') +const reportsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'verification-reports') + +const scoreSpecIds = [ + 'task_success.main_chain_observed', + 'efficiency.total_billed_tokens', + 'decision_quality.subagent_count_observed', + 'stability.recovery_absence', + 'controllability.turn_limit_basic', +] + +function sqlString(value: string): string { + return `'${value.replaceAll("'", "''")}'` +} + +function fixtureExperiment(params: { + id: string + scenarioId?: string + baselineVariantId?: string + candidateVariantId?: string + execution?: JsonRecord + actionBindings?: JsonRecord[] +}): JsonRecord { + return { + experiment_id: params.id, + name: params.id, + goal: 'V2.2-alpha execute_harness verification fixture.', + baseline_variant_id: params.baselineVariantId ?? 'baseline_default', + candidate_variant_ids: [params.candidateVariantId ?? 'candidate_session_memory_sparse'], + scenario_set_id: 'v2_2_alpha_verify', + scenario_ids: [params.scenarioId ?? 'cost_sensitive_task'], + repeat_count: 1, + score_spec_ids: scoreSpecIds, + gate_policy_id: 'default_v2_1_gate', + mode: 'execute_harness', + execution: params.execution ?? 
{}, + action_bindings: params.actionBindings, + status: 'ready', + } +} + +function fixtureExecution(dbPath: string, env?: JsonRecord): JsonRecord { + return { + adapter: 'cli_print', + command: 'bun', + args: ['run', 'scripts/evals/v2_emit_fixture_trace.ts'], + timeout_ms: 30000, + env: { + V2_FIXTURE_DB_PATH: dbPath, + ...env, + }, + } +} + +async function writeJson(filePath: string, value: unknown): Promise { + await mkdir(path.dirname(filePath), { recursive: true }) + await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') +} + +async function findChildDir(parent: string, matcher: (name: string) => boolean): Promise { + const entries = await readdir(parent, { withFileTypes: true }) + const found = entries.find(entry => entry.isDirectory() && matcher(entry.name)) + if (!found) throw new Error(`Directory not found under ${parent}`) + return path.join(parent, found.name) +} + +async function resolveV2ReportRoot(): Promise { + const taskRoot = path.join(repoRoot, 'ObservrityTask') + const versionsRoot = await findChildDir(taskRoot, name => name.startsWith('10-')) + const v2Root = path.join(versionsRoot, 'v2') + return await findChildDir(v2Root, name => name.startsWith('06-')) +} + +function runBun(args: string[]) { + return spawnSync('bun', ['run', ...args], { + cwd: repoRoot, + encoding: 'utf8', + }) +} + +function runDuckDb(dbPath: string, sql: string): void { + const result = spawnSync(duckdbExe, [dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + throw new Error( + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? 
'').trim(), + ) + } +} + +function extractOutputRef(output: string, label: string): string | undefined { + const match = output.match(new RegExp(`${label}:\\s*(.+)`)) + return match?.[1]?.trim() +} + +function extractAllOutputRefs(output: string, label: string): string[] { + return [...output.matchAll(new RegExp(`${label}:\\s*(.+)`, 'g'))] + .map(match => match[1]?.trim()) + .filter((value): value is string => Boolean(value)) +} + +function relToAbs(ref: string): string { + return path.isAbsolute(ref) ? ref : path.resolve(repoRoot, ref) +} + +async function removeIfExists(filePath: string): Promise { + await unlink(filePath).catch(() => undefined) +} + +async function cleanupGeneratedArtifacts(summaryRef?: string): Promise { + if (!summaryRef) return + const summaryPath = relToAbs(summaryRef) + const summary = JSON.parse(await readFile(summaryPath, 'utf8')) as { + run_refs?: string[] + score_refs?: string[] + report_refs?: string[] + } + const v2ReportRoot = await resolveV2ReportRoot() + const runReportRefs = (summary.run_refs ?? []).map(runRef => { + const runId = path.basename(runRef, '.json') + return path.join(v2ReportRoot, `${runId}.md`) + }) + const refs = [ + ...(summary.run_refs ?? []), + ...(summary.score_refs ?? []), + ...(summary.report_refs ?? 
[]), + ...runReportRefs, + summaryRef, + ] + for (const ref of refs) { + await removeIfExists(relToAbs(ref)) + } +} + +async function cleanupPartialArtifacts(output: string): Promise { + const runIds = extractAllOutputRefs(output, 'Created V2 run') + const reportRefs = extractAllOutputRefs(output, 'report') + const refs = [ + ...runIds.flatMap(runId => [ + path.join('tests', 'evals', 'v2', 'runs', `${runId}.json`), + path.join('tests', 'evals', 'v2', 'scores', `${runId}.scores.json`), + ]), + ...reportRefs, + ] + for (const ref of refs) { + await removeIfExists(relToAbs(ref)) + } +} + +async function listFilesInDir(dir: string): Promise { + const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) + return entries + .filter(entry => entry.isFile()) + .map(entry => path.join(dir, entry.name)) +} + +async function listGeneratedArtifactFiles(): Promise> { + const v2ReportRoot = await resolveV2ReportRoot() + const files = [ + ...(await listFilesInDir(path.join(repoRoot, 'tests', 'evals', 'v2', 'runs'))), + ...(await listFilesInDir(path.join(repoRoot, 'tests', 'evals', 'v2', 'scores'))), + ...(await listFilesInDir(v2ReportRoot)), + ] + return new Set(files.map(file => path.resolve(file))) +} + +async function cleanupArtifactsCreatedAfter(before: Set): Promise { + const after = await listGeneratedArtifactFiles() + for (const filePath of after) { + if (!before.has(filePath)) { + await removeIfExists(filePath) + } + } +} + +function createEmptyCaptureDb(dbPath: string): void { + runDuckDb( + dbPath, + 'CREATE TABLE user_actions(user_action_id VARCHAR, benchmark_run_id VARCHAR);', + ) +} + +function createBindExistingDb(dbPath: string): JsonRecord[] { + const baselineActionId = 'v2-verify-baseline-action' + const candidateActionId = 'v2-verify-candidate-action' + const startedAt = '2026-05-01T00:00:00.000Z' + const sql = [ + 'CREATE TABLE user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, 
ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', + 'CREATE TABLE queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR, turn_count BIGINT, terminal_reason VARCHAR);', + 'CREATE TABLE tools(user_action_id VARCHAR, tool_name VARCHAR, is_closed BOOLEAN, has_failed BOOLEAN);', + 'CREATE TABLE subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', + 'CREATE TABLE recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', + 'CREATE TABLE metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', + `INSERT INTO user_actions VALUES ('2026-05-01', ${sqlString(baselineActionId)}, ${sqlString(startedAt)}, 0, '2026-05-01T00:00:01.000Z', 1000, 1000, 2, 1, 1, 0, 0, 0, 100, 10, 0, 0, 100, 110, 100, 0);`, + `INSERT INTO user_actions VALUES ('2026-05-01', ${sqlString(candidateActionId)}, ${sqlString(startedAt)}, 0, '2026-05-01T00:00:01.000Z', 1000, 1000, 2, 1, 1, 0, 0, 0, 90, 10, 0, 0, 90, 100, 90, 0);`, + `INSERT INTO queries VALUES ('q-baseline', ${sqlString(baselineActionId)}, 'main_thread', ${sqlString(startedAt)}, 1, 'fixture_completed');`, + `INSERT INTO queries VALUES ('q-candidate', ${sqlString(candidateActionId)}, 'main_thread', ${sqlString(startedAt)}, 1, 'fixture_completed');`, + "INSERT INTO metrics_integrity_daily VALUES ('2026-05-01', 1, 1, 1, 1);", + ].join('\n') + runDuckDb(dbPath, sql) + return [ + { + 
scenario_id: 'cost_sensitive_task', + variant_id: 'baseline_default', + entry_user_action_id: baselineActionId, + }, + { + scenario_id: 'cost_sensitive_task', + variant_id: 'candidate_session_memory_sparse', + entry_user_action_id: candidateActionId, + }, + ] +} + +async function runCase(testCase: VerifyCase): Promise { + const manifestPath = path.join(manifestsRoot, `${testCase.case_id}.json`) + await writeJson(manifestPath, testCase.manifest) + const beforeArtifacts = await listGeneratedArtifactFiles() + const args = ['scripts/evals/v2_run_experiment.ts', '--experiment', manifestPath] + if (testCase.db_path) args.push('--db', testCase.db_path) + if (testCase.no_snapshot_db) args.push('--no-snapshot-db') + if (testCase.extra_args) args.push(...testCase.extra_args) + const result = runBun(args) + const output = [String(result.stdout ?? '').trim(), String(result.stderr ?? '').trim()] + .filter(Boolean) + .join('\n') + const summaryRef = extractOutputRef(output, 'Created V2 experiment summary') + const reportRef = extractOutputRef(output, 'Created V2 experiment report') + + if (testCase.expect === 'failure') { + await cleanupPartialArtifacts(output) + await cleanupArtifactsCreatedAfter(beforeArtifacts) + const hasExpectedError = + result.status !== 0 && + (!testCase.expected_error || output.includes(testCase.expected_error)) + return { + case_id: testCase.case_id, + description: testCase.description, + passed: hasExpectedError, + expected: testCase.expect, + status: result.status, + error_excerpt: output.slice(0, 700), + } + } + + const passed = result.status === 0 && Boolean(summaryRef) + if (summaryRef) await cleanupGeneratedArtifacts(summaryRef) + await cleanupArtifactsCreatedAfter(beforeArtifacts) + return { + case_id: testCase.case_id, + description: testCase.description, + passed, + expected: testCase.expect, + status: result.status, + summary_ref: summaryRef, + report_ref: reportRef, + artifacts_cleaned: Boolean(summaryRef), + error_excerpt: output.slice(0, 
700), + } +} + +async function main(): Promise { + await mkdir(manifestsRoot, { recursive: true }) + await mkdir(reportsRoot, { recursive: true }) + + const successDb = path.join(tempRoot, 'success.duckdb') + const missingCaptureDb = path.join(tempRoot, 'missing-capture.duckdb') + const ambiguousCaptureDb = path.join(tempRoot, 'ambiguous-capture.duckdb') + const baselineFailDb = path.join(tempRoot, 'baseline-fail.duckdb') + const candidateFailDb = path.join(tempRoot, 'candidate-fail.duckdb') + const fallbackDb = path.join(tempRoot, 'fallback.duckdb') + createEmptyCaptureDb(missingCaptureDb) + const fallbackBindings = createBindExistingDb(fallbackDb) + + const cases: VerifyCase[] = [ + { + case_id: 'execute_harness_success_fixture', + description: 'execute_harness success path creates run, score, report, and risk verdict through benchmark_run_id capture.', + expect: 'success', + db_path: successDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_success_${stamp}`, + execution: fixtureExecution(successDb), + }), + }, + { + case_id: 'adapter_not_found', + description: 'Unsupported adapter should fail clearly.', + expect: 'failure', + expected_error: 'Unsupported execute_harness adapter', + db_path: missingCaptureDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_adapter_missing_${stamp}`, + execution: { adapter: 'not_real_adapter' }, + }), + }, + { + case_id: 'capture_failed', + description: 'Completed execution without matching benchmark_run_id should fail capture.', + expect: 'failure', + expected_error: 'action capture capture_failed', + db_path: missingCaptureDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_capture_failed_${stamp}`, + execution: { + adapter: 'cli_print', + command: 'bun', + args: ['--version'], + timeout_ms: 30000, + }, + }), + }, + { + case_id: 'ambiguous_capture', + description: 'Multiple user_action_id rows for one benchmark_run_id should fail capture.', + 
expect: 'failure', + expected_error: 'action capture ambiguous_capture', + db_path: ambiguousCaptureDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_ambiguous_capture_${stamp}`, + execution: fixtureExecution(ambiguousCaptureDb, { + V2_FIXTURE_DUPLICATE_CAPTURE: '1', + }), + }), + }, + { + case_id: 'variant_apply_failed', + description: 'Strict config snapshot check should fail before execution when the referenced snapshot is missing.', + expect: 'failure', + expected_error: 'Variant apply failed', + db_path: missingCaptureDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_variant_apply_failed_${stamp}`, + execution: { + ...fixtureExecution(missingCaptureDb), + require_config_snapshot: true, + }, + }), + }, + { + case_id: 'scenario_missing', + description: 'Missing scenario manifest should fail before execution.', + expect: 'failure', + expected_error: 'Scenario not found', + db_path: missingCaptureDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_scenario_missing_${stamp}`, + scenarioId: 'not_real_scenario', + execution: fixtureExecution(missingCaptureDb), + }), + }, + { + case_id: 'baseline_failure', + description: 'Baseline execution failure should stop the experiment.', + expect: 'failure', + expected_error: 'baseline scenario=cost_sensitive_task variant=baseline_default execute_harness failed', + db_path: baselineFailDb, + no_snapshot_db: true, + manifest: fixtureExperiment({ + id: `v2_2_verify_baseline_failure_${stamp}`, + execution: fixtureExecution(baselineFailDb, { + V2_FIXTURE_FAIL_VARIANT: 'baseline_default', + }), + }), + }, + { + case_id: 'candidate_failure', + description: 'Candidate execution failure should stop the experiment after the baseline succeeds.', + expect: 'failure', + expected_error: 'candidate scenario=cost_sensitive_task variant=candidate_session_memory_sparse execute_harness failed', + db_path: candidateFailDb, + no_snapshot_db: true, + manifest: 
fixtureExperiment({ + id: `v2_2_verify_candidate_failure_${stamp}`, + execution: fixtureExecution(candidateFailDb, { + V2_FIXTURE_FAIL_VARIANT: 'candidate_session_memory_sparse', + }), + }), + }, + { + case_id: 'disabled_fallback_to_bind_existing', + description: 'Automation can be disabled and fall back to bind_existing.', + expect: 'success', + db_path: fallbackDb, + no_snapshot_db: true, + extra_args: ['--disable-execute-harness'], + manifest: fixtureExperiment({ + id: `v2_2_verify_disabled_fallback_${stamp}`, + execution: { + ...fixtureExecution(fallbackDb), + allow_fallback_to_bind_existing: true, + }, + actionBindings: fallbackBindings, + }), + }, + ] + + const results: VerifyResult[] = [] + for (const testCase of cases) { + results.push(await runCase(testCase)) + } + + const failed = results.filter(result => !result.passed) + const report = { + verification_id: `v2_2_execute_harness_alpha_${stamp}`, + generated_at: new Date().toISOString(), + temp_root: path.relative(repoRoot, tempRoot), + passed: failed.length === 0, + case_count: results.length, + failed_count: failed.length, + note: + 'Success-path verification uses a fixture command to avoid model/API spend; the production default adapter is cli_print.', + results, + } + const reportPath = path.join(reportsRoot, `v2_2_execute_harness_alpha_${stamp}.json`) + await writeJson(reportPath, report) + await rm(tempRoot, { recursive: true, force: true }).catch(() => undefined) + + console.log(`Created V2.2 execute_harness verification report: ${path.relative(repoRoot, reportPath)}`) + if (failed.length > 0) { + for (const result of failed) { + console.error(`FAILED ${result.case_id}: ${result.error_excerpt ?? ''}`) + } + process.exit(1) + } + console.log(`V2.2 execute_harness alpha verification passed: ${results.length}/${results.length}`) +} + +main().catch(async error => { + await rm(tempRoot, { recursive: true, force: true }).catch(() => undefined) + console.error(error instanceof Error ? 
error.message : error) + process.exit(1) +}) diff --git a/scripts/observability/build_duckdb_etl.ts b/scripts/observability/build_duckdb_etl.ts index 7ae3e020cf..0b7f0fd399 100644 --- a/scripts/observability/build_duckdb_etl.ts +++ b/scripts/observability/build_duckdb_etl.ts @@ -1,10 +1,12 @@ import { createHash } from "node:crypto" +import { spawnSync } from "node:child_process" import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, + unlinkSync, writeFileSync, } from "node:fs" import { basename, join, relative, resolve } from "node:path" @@ -44,6 +46,11 @@ type EventRecord = { cwd?: string | null git_branch?: string | null build_version?: string | null + experiment_id?: string | null + scenario_id?: string | null + variant_id?: string | null + benchmark_run_id?: string | null + eval_run_id?: string | null payload?: Record | null } @@ -99,16 +106,23 @@ const repoRoot = resolve(import.meta.dir, "..", "..") const observabilityDir = join(repoRoot, ".observability") const snapshotsDir = join(observabilityDir, "snapshots") const duckdbExe = join(repoRoot, "tools", "duckdb", "duckdb.exe") -const databasePath = join(observabilityDir, "observability_v1.duckdb") -const sqlPath = join(observabilityDir, "load_observability_v1.sql") +const defaultDatabasePath = join(observabilityDir, "observability_v1.duckdb") +const sqlPath = join( + observabilityDir, + `load_observability_v1.${process.pid}.${Date.now()}.sql`, +) function fail(message: string): never { console.error(message) process.exit(1) } -function parseArgs(argv: string[]): { eventsFile?: string; date?: string } { - const parsed: { eventsFile?: string; date?: string } = {} +function parseArgs(argv: string[]): { + eventsFile?: string + date?: string + dbPath?: string +} { + const parsed: { eventsFile?: string; date?: string; dbPath?: string } = {} for (let index = 0; index < argv.length; index += 1) { const current = argv[index] if (current === "--events-file") { @@ -119,6 +133,11 @@ function 
parseArgs(argv: string[]): { eventsFile?: string; date?: string } { if (current === "--date") { parsed.date = argv[index + 1] index += 1 + continue + } + if (current === "--db-path") { + parsed.dbPath = argv[index + 1] + index += 1 } } return parsed @@ -370,6 +389,24 @@ function inferString(value: JsonValue | undefined, key: string): string | null { return typeof current === "string" ? current : null } +function topLevelOrPayloadString(event: EventRecord, key: keyof EventRecord): string | null { + const value = event[key] + if (typeof value === "string" && value.trim() !== "") return value + return inferString(event.payload, String(key)) +} + +function nonEmptyString(value: string | null | undefined): string | null { + return typeof value === "string" && value.trim() !== "" ? value : null +} + +function shouldReplacePlaceholder( + current: unknown, + next: string | null | undefined, +): next is string { + if (!next || next.trim() === "") return false + return current === null || current === undefined || current === "" || current === "unknown" +} + function inferNumber(value: JsonValue | undefined, key: string): number | null { if (!value || typeof value !== "object" || Array.isArray(value)) { return null @@ -437,14 +474,17 @@ function normalizeAgentName( subagentType: string | null | undefined, subagentReason: string | null | undefined, ): string | null { - const candidate = subagentReason ?? subagentType ?? querySource + const candidate = + (subagentReason && subagentReason !== "unknown" ? subagentReason : null) ?? + (subagentType && subagentType !== "unknown" ? subagentType : null) ?? 
+ querySource if (!candidate) { return null } if (candidate === "side_question") { return "side_query" } - if (candidate.startsWith("repl_main_thread")) { + if (candidate === "sdk" || candidate.startsWith("repl_main_thread")) { return "main_thread" } if (candidate.startsWith("agent:builtin:")) { @@ -464,7 +504,11 @@ function normalizeSourceGroup( if (!agentName && !querySource) { return null } - if (agentName === "main_thread" || querySource?.startsWith("repl_main_thread")) { + if ( + agentName === "main_thread" || + querySource === "sdk" || + querySource?.startsWith("repl_main_thread") + ) { return "main_thread" } if ( @@ -591,6 +635,7 @@ if (!existsSync(duckdbExe)) { mkdirSync(observabilityDir, { recursive: true }) const args = parseArgs(process.argv.slice(2)) +const databasePath = args.dbPath ? resolve(args.dbPath) : defaultDatabasePath const eventsPath = resolveEventsPath(args) if (!existsSync(eventsPath)) { fail(`Events file not found: ${eventsPath}`) @@ -855,12 +900,18 @@ for (const [index, event] of events.entries()) { existing.query_source ||= event.query_source ?? null existing.subagent_id ||= event.subagent_id ?? null existing.subagent_type ||= event.subagent_type ?? 
null - existing.subagent_reason ||= subagentReason + if (shouldReplacePlaceholder(existing.subagent_reason, subagentReason)) { + existing.subagent_reason = subagentReason + } existing.subagent_trigger_kind ||= subagentTriggerKind existing.subagent_trigger_detail ||= subagentTriggerDetail existing.subagent_trigger_payload_json ||= subagentTriggerPayloadJson - existing.agent_name ||= agentName - existing.source_group ||= sourceGroup + if (shouldReplacePlaceholder(existing.agent_name, agentName)) { + existing.agent_name = agentName + } + if (shouldReplacePlaceholder(existing.source_group, sourceGroup)) { + existing.source_group = sourceGroup + } existing.event_count = Number(existing.event_count) + 1 if (tsMs < Number(existing.started_at_ms)) { @@ -963,9 +1014,15 @@ for (const [index, event] of events.entries()) { existing.user_action_id ||= event.user_action_id ?? null existing.subagent_id ||= event.subagent_id ?? null existing.query_source ||= event.query_source ?? null - existing.subagent_reason ||= subagentReason - existing.agent_name ||= agentName - existing.source_group ||= sourceGroup + if (shouldReplacePlaceholder(existing.subagent_reason, subagentReason)) { + existing.subagent_reason = subagentReason + } + if (shouldReplacePlaceholder(existing.agent_name, agentName)) { + existing.agent_name = agentName + } + if (shouldReplacePlaceholder(existing.source_group, sourceGroup)) { + existing.source_group = sourceGroup + } if (event.loop_iter !== null && event.loop_iter !== undefined) { if ( @@ -1166,16 +1223,23 @@ for (const event of events) { existing.user_action_id ||= event.user_action_id ?? null existing.subagent_type ||= event.subagent_type ?? null existing.query_source ||= event.query_source ?? 
null - existing.subagent_reason ||= subagentReason + if (shouldReplacePlaceholder(existing.subagent_reason, subagentReason)) { + existing.subagent_reason = subagentReason + } existing.subagent_trigger_kind ||= subagentTriggerKind existing.subagent_trigger_detail ||= subagentTriggerDetail existing.subagent_trigger_payload_json ||= subagentTriggerPayloadJson - existing.agent_name ||= agentName - existing.source_group ||= normalizeSourceGroup( + if (shouldReplacePlaceholder(existing.agent_name, agentName)) { + existing.agent_name = agentName + } + const normalizedSourceGroup = normalizeSourceGroup( event.query_source ?? null, event.subagent_id ?? null, existing.agent_name as string | null, ) + if (shouldReplacePlaceholder(existing.source_group, normalizedSourceGroup)) { + existing.source_group = normalizedSourceGroup + } if (event.event === "subagent.spawned") { existing.spawned_at = event.ts_wall @@ -1251,8 +1315,9 @@ for (const [index, event] of events.entries()) { } existing.event_count = Number(existing.event_count) + 1 - if (event.user_action_id) { - ;(existing.user_action_ids as Set).add(event.user_action_id) + const normalizedUserActionId = nonEmptyString(event.user_action_id) + if (normalizedUserActionId) { + ;(existing.user_action_ids as Set).add(normalizedUserActionId) } const effectiveQueryId = effectiveQueryIds[index] if (effectiveQueryId) { @@ -1301,7 +1366,7 @@ const eventsRawRows = events.map((event, index) => { component: event.component ?? null, session_id: event.session_id ?? null, conversation_id: event.conversation_id ?? null, - user_action_id: event.user_action_id ?? null, + user_action_id: nonEmptyString(event.user_action_id), query_id: event.query_id ?? null, effective_query_id: effectiveQueryIds[index], turn_id: event.turn_id ?? null, @@ -1323,6 +1388,11 @@ const eventsRawRows = events.map((event, index) => { cwd: event.cwd ?? null, git_branch: event.git_branch ?? null, build_version: event.build_version ?? 
null, + experiment_id: topLevelOrPayloadString(event, "experiment_id"), + scenario_id: topLevelOrPayloadString(event, "scenario_id"), + variant_id: topLevelOrPayloadString(event, "variant_id"), + benchmark_run_id: topLevelOrPayloadString(event, "benchmark_run_id"), + eval_run_id: topLevelOrPayloadString(event, "eval_run_id"), payload_json: compactJson(event.payload), snapshot_refs_json: compactJson(perEventSnapshotRefs[index] ?? []), raw_event_json: compactJson(event), @@ -1549,6 +1619,38 @@ const buildMetaRows = [ const sql = ` BEGIN TRANSACTION; +DROP VIEW IF EXISTS user_actions; +DROP TABLE IF EXISTS user_actions; +DROP VIEW IF EXISTS query_source_cost_share; +DROP TABLE IF EXISTS query_source_cost_share; +DROP VIEW IF EXISTS query_source_cost_share_daily; +DROP TABLE IF EXISTS query_source_cost_share_daily; +DROP VIEW IF EXISTS agent_cost_daily; +DROP TABLE IF EXISTS agent_cost_daily; +DROP VIEW IF EXISTS subagent_reason_daily; +DROP TABLE IF EXISTS subagent_reason_daily; +DROP VIEW IF EXISTS metrics_integrity_daily; +DROP TABLE IF EXISTS metrics_integrity_daily; +DROP VIEW IF EXISTS metrics_cost_daily; +DROP TABLE IF EXISTS metrics_cost_daily; +DROP VIEW IF EXISTS metrics_loop_daily; +DROP TABLE IF EXISTS metrics_loop_daily; +DROP VIEW IF EXISTS metrics_latency_daily; +DROP TABLE IF EXISTS metrics_latency_daily; +DROP VIEW IF EXISTS metrics_compression_daily; +DROP TABLE IF EXISTS metrics_compression_daily; +DROP VIEW IF EXISTS tool_calls_by_name; +DROP TABLE IF EXISTS tool_calls_by_name; +DROP VIEW IF EXISTS tool_calls_by_mode; +DROP TABLE IF EXISTS tool_calls_by_mode; +DROP VIEW IF EXISTS metrics_tools_daily; +DROP TABLE IF EXISTS metrics_tools_daily; +DROP VIEW IF EXISTS terminal_reason_distribution; +DROP TABLE IF EXISTS terminal_reason_distribution; +DROP VIEW IF EXISTS metrics_recovery_daily; +DROP TABLE IF EXISTS metrics_recovery_daily; +DROP VIEW IF EXISTS system_flags; +DROP TABLE IF EXISTS system_flags; DROP TABLE IF EXISTS build_meta; DROP TABLE IF 
EXISTS events_raw; DROP TABLE IF EXISTS queries; @@ -1604,6 +1706,11 @@ CREATE TABLE events_raw ( cwd VARCHAR, git_branch VARCHAR, build_version VARCHAR, + experiment_id VARCHAR, + scenario_id VARCHAR, + variant_id VARCHAR, + benchmark_run_id VARCHAR, + eval_run_id VARCHAR, payload_json VARCHAR, snapshot_refs_json VARCHAR, raw_event_json VARCHAR @@ -1841,6 +1948,11 @@ ${createInsertSql("events_raw", [ "cwd", "git_branch", "build_version", + "experiment_id", + "scenario_id", + "variant_id", + "benchmark_run_id", + "eval_run_id", "payload_json", "snapshot_refs_json", "raw_event_json", @@ -2065,7 +2177,12 @@ event_agg AS ( COUNT(DISTINCT effective_query_id) FILTER (WHERE effective_query_id IS NOT NULL AND agent_name = 'main_thread') AS main_thread_query_count, COUNT(DISTINCT effective_query_id) FILTER (WHERE effective_query_id IS NOT NULL AND agent_name <> 'main_thread') AS subagent_query_count, COUNT(DISTINCT subagent_id) FILTER (WHERE subagent_id IS NOT NULL) AS subagent_count, - COUNT(DISTINCT tool_call_id) FILTER (WHERE tool_call_id IS NOT NULL) AS tool_call_count + COUNT(DISTINCT tool_call_id) FILTER (WHERE tool_call_id IS NOT NULL) AS tool_call_count, + MAX(experiment_id) FILTER (WHERE experiment_id IS NOT NULL) AS experiment_id, + MAX(scenario_id) FILTER (WHERE scenario_id IS NOT NULL) AS scenario_id, + MAX(variant_id) FILTER (WHERE variant_id IS NOT NULL) AS variant_id, + MAX(benchmark_run_id) FILTER (WHERE benchmark_run_id IS NOT NULL) AS benchmark_run_id, + MAX(eval_run_id) FILTER (WHERE eval_run_id IS NOT NULL) AS eval_run_id FROM events_raw WHERE user_action_id IS NOT NULL GROUP BY 1, 2 @@ -2084,6 +2201,11 @@ SELECT e.subagent_query_count, e.subagent_count, e.tool_call_count, + e.experiment_id, + e.scenario_id, + e.variant_id, + e.benchmark_run_id, + e.eval_run_id, COALESCE(u.raw_input_tokens, 0) AS raw_input_tokens, COALESCE(u.output_tokens, 0) AS output_tokens, COALESCE(u.cache_read_tokens, 0) AS cache_read_tokens, @@ -2569,6 +2691,27 @@ COMMIT; 
writeFileSync(sqlPath, sql, "utf8") +for (const stalePath of [databasePath, `${databasePath}.wal`]) { + if (existsSync(stalePath)) { + unlinkSync(stalePath) + } +} + +const applyResult = spawnSync(duckdbExe, [databasePath, `.read '${sqlPath}'`], { + cwd: repoRoot, + encoding: "utf8", +}) + +if (applyResult.status !== 0) { + const message = + String(applyResult.stderr ?? "").trim() || + String(applyResult.stdout ?? "").trim() || + String(applyResult.error?.message ?? "").trim() + fail(`DuckDB ETL apply failed: ${message}`) +} + +unlinkSync(sqlPath) + console.log( JSON.stringify( { diff --git a/src/cli/print.ts b/src/cli/print.ts index 0d134e6079..cdd9a22969 100644 --- a/src/cli/print.ts +++ b/src/cli/print.ts @@ -5376,7 +5376,7 @@ function getStructuredIO( jsonStringify({ type: 'user', content: inputPrompt, - uuid: '', + uuid: randomUUID(), session_id: '', message: { role: 'user', diff --git a/src/cli/structuredIO.ts b/src/cli/structuredIO.ts index fba44e61bd..403c476ddd 100644 --- a/src/cli/structuredIO.ts +++ b/src/cli/structuredIO.ts @@ -208,7 +208,7 @@ export class StructuredIO { jsonStringify({ type: 'user', content, - uuid: '', + uuid: randomUUID(), session_id: '', message: { role: 'user', content }, parent_tool_use_id: null, diff --git a/src/main.tsx b/src/main.tsx index ecb8ff0670..ba030cf822 100644 --- a/src/main.tsx +++ b/src/main.tsx @@ -4164,7 +4164,7 @@ async function run(): Promise { profileCheckpoint("before_print_import"); const { runHeadless } = await import("src/cli/print.js"); profileCheckpoint("after_print_import"); - void runHeadless( + await runHeadless( inputPrompt, () => headlessStore.getState(), headlessStore.setState, diff --git a/src/observability/harness.ts b/src/observability/harness.ts index 62d27815e8..db10fcdaa7 100644 --- a/src/observability/harness.ts +++ b/src/observability/harness.ts @@ -12,6 +12,19 @@ export const HARNESS_SCHEMA_VERSION = '2026-04-19' type HarnessLevel = 'debug' | 'info' | 'warning' | 'error' +export type 
EvalExecutionContext = { + experiment_id: string + scenario_id: string + variant_id: string + benchmark_run_id: string + eval_run_id: string +} + +export function isQuerySendDebugEnabled(): boolean { + const value = process.env.CLAUDE_CODE_QUERY_SEND_DEBUG + return value === '1' || value === 'true' || value === 'TRUE' +} + export type HarnessSnapshotRef = { snapshot_ref: string bytes: number @@ -43,6 +56,7 @@ export type HarnessEventInput = { cwd?: string | null git_branch?: string | null build_version?: string | null + eval_context?: EvalExecutionContext | null payload?: Record } @@ -72,6 +86,29 @@ function getEventLogPath(now: Date): string { return join(getObservabilityDir(), `events-${yyyymmdd}.jsonl`) } +function nonEmptyEnv(name: string): string | null { + const value = process.env[name] + return value && value.trim() !== '' ? value : null +} + +export function getEvalExecutionContextFromEnv(): EvalExecutionContext | null { + const experiment_id = nonEmptyEnv('CLAUDE_CODE_EVAL_EXPERIMENT_ID') + const scenario_id = nonEmptyEnv('CLAUDE_CODE_EVAL_SCENARIO_ID') + const variant_id = nonEmptyEnv('CLAUDE_CODE_EVAL_VARIANT_ID') + const benchmark_run_id = nonEmptyEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID') + const eval_run_id = nonEmptyEnv('CLAUDE_CODE_EVAL_RUN_ID') + if (!experiment_id || !scenario_id || !variant_id || !benchmark_run_id || !eval_run_id) { + return null + } + return { + experiment_id, + scenario_id, + variant_id, + benchmark_run_id, + eval_run_id, + } +} + function enqueueWrite(task: () => Promise): Promise { writeChain = writeChain.then(task, task) return writeChain @@ -129,6 +166,7 @@ export async function emitHarnessEvent( input: HarnessEventInput, ): Promise { const now = new Date() + const evalContext = input.eval_context ?? getEvalExecutionContextFromEnv() const line = stableStringify({ schema_version: HARNESS_SCHEMA_VERSION, ts_wall: now.toISOString(), @@ -156,6 +194,11 @@ export async function emitHarnessEvent( cwd: input.cwd ?? 
getCwdState(), git_branch: input.git_branch ?? null, build_version: input.build_version ?? (MACRO.VERSION ?? 'unknown'), + experiment_id: evalContext?.experiment_id ?? null, + scenario_id: evalContext?.scenario_id ?? null, + variant_id: evalContext?.variant_id ?? null, + benchmark_run_id: evalContext?.benchmark_run_id ?? null, + eval_run_id: evalContext?.eval_run_id ?? null, payload: input.payload ?? {}, }) diff --git a/src/observability/v2/evalExperimentTypes.ts b/src/observability/v2/evalExperimentTypes.ts index e2717ea058..24f4651bdf 100644 --- a/src/observability/v2/evalExperimentTypes.ts +++ b/src/observability/v2/evalExperimentTypes.ts @@ -65,11 +65,23 @@ export type EvalExperimentActionBinding = | EvalExperimentFlatActionBinding | EvalExperimentNestedActionBinding +export interface EvalExperimentExecutionConfig { + adapter?: 'cli_print' | 'disabled' + timeout_ms?: number + max_turns?: number + allow_fallback_to_bind_existing?: boolean + require_config_snapshot?: boolean + env?: Record + command?: string + args?: string[] +} + export interface EvalExperimentV21 extends EvalExperiment { scenario_ids?: string[] repeat_count?: number score_spec_ids?: string[] gate_policy_id?: string mode?: 'bind_existing' | 'execute_harness' + execution?: EvalExperimentExecutionConfig action_bindings?: EvalExperimentActionBinding[] } diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index 7a512238fb..9dfe775c95 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -53,6 +53,14 @@ export interface EvalVariant { base_variant_id?: string git_commit?: string config_snapshot_ref?: string + env_overrides?: Record + model_config?: { + model?: string + max_turns?: number + thinking?: 'enabled' | 'adaptive' | 'disabled' + max_budget_usd?: number + } + feature_gates?: Record notes?: string } diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index af0b6b1bef..7fadac4ec2 100644 --- a/tests/evals/v2/README.md 
+++ b/tests/evals/v2/README.md @@ -1,63 +1,27 @@ # V2 Eval Workspace -This directory holds the local-first working skeleton for observability V2. - -Structure: - -- `scenarios/` - - machine-readable scenario manifests -- `variants/` - - variant manifests -- `experiments/` - - experiment manifests -- `score-specs/` - - score definitions: dimension, formula, direction, evidence requirements -- `gates/` - - regression gate policies -- `experiment-runs/` - - generated experiment-level summaries -- `verification-reports/` - - generated V2.1 runner verification summaries -- `scores/` - - optional manual review or exported score artifacts -- `runs/` - - generated run records that bind V2 evaluation to V1 evidence - -Recommended V2.1 usage order: - -1. Pick or create a `scenario` under `scenarios/`. -2. Define the baseline and candidate `variant` manifests under `variants/`. -3. Produce real V1 traces first. Current V2.1 is `bind_existing`, so you must already have one baseline `user_action_id` and one candidate `user_action_id`. -4. Create or edit an experiment manifest under `experiments/`, including: - - `scenario_ids` - - `baseline_variant_id` - - `candidate_variant_ids` - - `mode: "bind_existing"` - - `action_bindings` - - `score_spec_ids` - - `gate_policy_id` -5. Validate all manifests. -6. Run the experiment runner. -7. Read the generated run, score, comparison, risk gate, scorecard, exploration, and experiment summary artifacts. - -Recommended V2.1 `action_bindings` shape: +This directory stores the local-first V2 evaluation system. -```json -[ - { - "scenario_id": "cost_sensitive_task", - "variant_id": "baseline_default", - "entry_user_action_id": "" - }, - { - "scenario_id": "cost_sensitive_task", - "variant_id": "candidate_session_memory_sparse", - "entry_user_action_id": "" - } -] -``` +## Structure + +- `scenarios/`: scenario manifests. +- `variants/`: baseline and candidate variant manifests. +- `experiments/`: experiment manifests. 
+- `score-specs/`: score definitions and evidence requirements. +- `gates/`: regression-risk gate policies. +- `runs/`: generated run records bound to V1 evidence. +- `scores/`: generated score artifacts. +- `experiment-runs/`: experiment-level JSON summaries. +- `verification-reports/`: runner verification reports. + +## Modes -The runner still accepts the older nested binding shape for compatibility. New experiment manifests should use the flat `scenario_id + variant_id + entry_user_action_id` shape. +- `bind_existing`: V2.1 stable mode. You provide existing V1 `user_action_id` values through `action_bindings`. +- `execute_harness`: V2.2-alpha mode. The runner executes one scenario through the headless harness, injects eval context into V1 events, captures the generated `user_action_id` by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. + +V2.2-alpha deliberately supports only 1 scenario, 1 baseline, 1 candidate, and `repeat_count=1`. + +## Basic Commands Validate manifests: @@ -65,41 +29,96 @@ Validate manifests: bun run scripts/evals/v2_validate_manifests.ts ``` +Validate generated experiment artifact schema: + +```powershell +bun run scripts/evals/v2_validate_experiment_artifacts.ts +``` + Run the V2.1 bind runner verification suite: ```powershell bun run scripts/evals/v2_verify_bind_runner.ts ``` -Validate generated experiment artifact schema: +Run the V2.2-alpha execute_harness verification suite: ```powershell -bun run scripts/evals/v2_validate_experiment_artifacts.ts +bun run scripts/evals/v2_verify_execute_harness_alpha.ts ``` -Run the current sample V2.1 experiment: +Run the current V2.1 sample: ```powershell bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default ``` -Current V2.1 mode is `bind_existing`. It does not execute the harness by itself yet. 
Instead, it binds existing V1 `user_action_id` traces into V2 runs, records score-spec-backed scores, compares baseline vs candidate, applies the configured gate policy as a regression-risk check, and writes an experiment summary under `experiment-runs/` plus a Markdown report under `ObservrityTask/10-系统版本/v2/06-运行报告/`. +Run the V2.2-alpha smoke manifest with automatic execution enabled: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +Disable automatic execution and fall back to `bind_existing`: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json --disable-execute-harness +``` + +Equivalent environment switch: -The top-level `risk_verdict` is not a final experiment judgment. It is only a regression-risk signal. New summaries also include `scorecard_summary`, `exploration_signals`, `recommended_review_mode`, and `final_decision` so exploratory harness work is not reduced to pass/fail. +```powershell +$env:V2_2_EXECUTE_HARNESS='0' +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` -Detailed V2.1 usage: +## bind_existing Binding Shape + +```json +[ + { + "scenario_id": "cost_sensitive_task", + "variant_id": "baseline_default", + "entry_user_action_id": "" + }, + { + "scenario_id": "cost_sensitive_task", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "" + } +] +``` + +The runner still accepts the older nested binding shape for compatibility. New manifests should use the flat shape. + +## execute_harness Binding Mechanism + +The formal binding key is `benchmark_run_id`, not “latest user_action_id”. 
+ +Flow: ```text -tests/evals/v2/V2.1-bind_existing-usage.md +experiment manifest +-> scenario prompt +-> variant apply v0 +-> headless --print adapter +-> V1 events with eval context +-> DuckDB rebuild +-> benchmark_run_id -> unique user_action_id +-> V2 record/score/compare/risk_verdict/report ``` -`execute_harness` is reserved but intentionally blocked until a stable headless harness execution adapter exists. If a manifest uses that mode now, the runner exits with: +If capture returns zero matches, the run fails as `capture_failed`. If it returns multiple actions, the run fails as `ambiguous_capture`. + +## Detailed Docs ```text -execute_harness mode is not implemented yet: missing headless harness execution adapter +tests/evals/v2/V2.1-bind_existing-usage.md +tests/evals/v2/V2.2-execute_harness-alpha-usage.md +tests/evals/v2/experiment-runs/README.md ``` -Lower-level commands are still available when you want to debug one step at a time. +## Low-Level Debug Commands Record one run manually: @@ -118,9 +137,3 @@ List recorded runs: ```powershell bun run scripts/evals/v2_list_runs.ts --scenario tool_choice_sensitive ``` - -Compare the latest baseline/candidate runs for one scenario: - -```powershell -bun run scripts/evals/v2_compare_scenario.ts --scenario tool_choice_sensitive --candidate candidate_tool_router_v2 -``` diff --git a/tests/evals/v2/V2.1-bind_existing-usage.md b/tests/evals/v2/V2.1-bind_existing-usage.md index be71f61ccf..55bf3c6291 100644 --- a/tests/evals/v2/V2.1-bind_existing-usage.md +++ b/tests/evals/v2/V2.1-bind_existing-usage.md @@ -191,3 +191,12 @@ execute_harness mode is not implemented yet: missing headless harness execution ``` 这不是缺陷,而是当前阶段的安全边界。 +## V2.2 Update + +This document is the V2.1 `bind_existing` usage guide. Since V2.2-alpha, `execute_harness` is no longer a fixed blocked path. 
For automatic execution, use: + +```text +tests/evals/v2/V2.2-execute_harness-alpha-usage.md +``` + +V2.1 `bind_existing` remains supported and is still the fallback mode when `execute_harness` is disabled. diff --git a/tests/evals/v2/V2.2-execute_harness-alpha-usage.md b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md new file mode 100644 index 0000000000..87a1a723b7 --- /dev/null +++ b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md @@ -0,0 +1,148 @@ +# V2.2-alpha execute_harness Usage + +## 理解清单 + +- V2.1 `bind_existing` 已经能把已有 V1 `user_action_id` 转成 V2 run、score、compare report、risk verdict。 +- V2.2-alpha 新增的是“前半段自动化”:由 runner 自动执行 scenario,并自动找到这次执行生成的 V1 action。 +- 正式绑定不允许用“最新 user_action_id”,因为并发、后台任务或手动调试都可能生成更新的 action。 +- 正式绑定使用 `benchmark_run_id -> user_action_id`,只有唯一命中时才进入 score/report。 +- 本阶段只支持 1 scenario / 1 baseline / 1 candidate / repeat=1。 +- 自动化可以一键关闭,关闭后回退到 V2.1 `bind_existing`。 + +## 预期效果 + +你可以用一个 manifest 完成最小自动实验: + +```text +scenario prompt +-> baseline 自动跑一次 +-> candidate 自动跑一次 +-> 分别捕获 user_action_id +-> 生成 V2 run/scores/compare/risk verdict/report +``` + +如果你临时不想自动跑模型,可以执行同一个 manifest 但加 `--disable-execute-harness`,runner 会改用 `action_bindings` 中已有的 action。 + +## 设计思路 + +V2.2-alpha 把系统拆成两段: + +- 前半段:`execute_harness` 自动执行并捕获 action。 +- 后半段:复用 V2.1 已稳定的 fact-only scoring pipeline。 + +这样做的原因是:执行自动化可以逐步增强,但评分和回归判断必须始终基于 V1 事实证据,避免把“跑起来了”误当成“评测可信”。 + +## Manifest Example + +See: + +```text +tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +Core fields: + +```json +{ + "mode": "execute_harness", + "execution": { + "adapter": "cli_print", + "timeout_ms": 180000, + "max_turns": 8, + "allow_fallback_to_bind_existing": true + } +} +``` + +The same manifest may still include `action_bindings`. They are ignored when automatic execution is enabled, but used when automation is disabled. 
+ +## Run With Automation + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +Default production adapter: + +```text +bun run src/entrypoints/cli.tsx --print --output-format json +``` + +Variant v0 can pass: + +- `env_overrides` +- `config_snapshot_ref` metadata +- `model_config` +- `feature_gates` + +It does not do git checkout or source patching. + +## Disable Automation + +Command-line switch: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json --disable-execute-harness +``` + +Environment switch: + +```powershell +$env:V2_2_EXECUTE_HARNESS='0' +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +When disabled: + +- requested mode remains `execute_harness` +- effective mode becomes `bind_existing` +- `action_bindings` are required +- output summary includes `requested_mode`, `mode`, `automation_disabled`, and `runner.fallback_reason` + +## Capture Rules + +V2.2-alpha injects these fields into V1 events: + +```text +experiment_id +scenario_id +variant_id +benchmark_run_id +eval_run_id +``` + +After execution, the runner rebuilds the V1 DuckDB database and runs: + +```sql +SELECT DISTINCT user_action_id +FROM user_actions +WHERE benchmark_run_id = ''; +``` + +Outcomes: + +- exactly 1 match: enter V2 score/report flow +- 0 matches: `capture_failed` +- more than 1 match: `ambiguous_capture` + +## Verification + +Run: + +```powershell +bun run scripts/evals/v2_verify_execute_harness_alpha.ts +``` + +The verification suite covers: + +- execute_harness success path through a local fixture command +- missing adapter +- capture failed +- ambiguous capture +- variant apply failed +- missing scenario +- baseline failure +- candidate failure +- disabled automation fallback + +The success-path verification uses a 
fixture command to avoid real model/API spend. The production default adapter remains `cli_print`. diff --git a/tests/evals/v2/experiment-runs/README.md b/tests/evals/v2/experiment-runs/README.md index 5b060e12c0..bf371c33bf 100644 --- a/tests/evals/v2/experiment-runs/README.md +++ b/tests/evals/v2/experiment-runs/README.md @@ -1,46 +1,34 @@ -# V2.1 Experiment Artifact Schema +# V2 Experiment Artifact Schema ## 理解清单 -- 本目录保存 experiment-level JSON summary。 -- 这些 JSON 是 V2.1 runner 的稳定回归证据。 -- V2.1-stable 要求每个新 summary 都包含固定顶层 schema,不能只依赖历史的 `experiment/results` 内部结构。 - -## 预期效果 - -读取任意 `tests/evals/v2/experiment-runs/*.json` 时,应能快速回答: - -- 这次实验来自哪个 manifest。 -- 使用的是哪个 mode。 -- 生成了哪些 run / score / report artifact。 -- risk gate 最终是 pass、warning、fail 还是 inconclusive。 -- 是否存在错误或警告。 -- 是否存在 scorecard 变化、探索信号、推荐复盘模式。 - -## 设计思路 - -顶层字段用于机器读取和回归判断;保留 `experiment`、`runner`、`results` 用于人工追溯和向后兼容。 +- This directory stores experiment-level JSON summaries. +- V2.1 summaries are usually produced by `bind_existing`. +- V2.2-alpha summaries may be produced by `execute_harness`, or by `execute_harness` disabled and falling back to `bind_existing`. +- The top-level schema is stable enough for regression checks and documentation. 
## Required Top-Level Fields | field | type | meaning | | --- | --- | --- | -| `experiment_id` | string | 实验 ID,来自 manifest。 | -| `manifest_ref` | string | 本次 runner 读取的 manifest 路径。 | -| `generated_at` | ISO timestamp string | summary 生成时间。 | -| `mode` | string | 当前只允许 `bind_existing`;`execute_harness` 会被明确阻塞。 | -| `run_refs` | string[] | 本次生成的 V2 run JSON 路径。 | -| `score_refs` | string[] | 本次生成的 score JSON 路径。 | -| `report_refs` | string[] | 本次生成的 compare / experiment Markdown report 路径。 | -| `risk_verdict` | object | 聚合后的回归风险结论;不是最终实验判断。 | -| `gate_verdict` | object | 兼容旧脚本的别名;新代码应优先读 `risk_verdict`。 | -| `verdict_boundary` | string | 明确说明 verdict 只代表 regression risk。 | -| `scorecard_summary` | array | baseline vs candidate 的多指标变化摘要。 | -| `exploration_signals` | string[] | 自动提取的探索复盘提示。 | -| `recommended_review_mode` | string | 建议复盘模式:`regression_review` / `manual_review` / `exploratory_review`。 | -| `final_decision` | null or object | 人类最终决策;runner 默认保持 `null`。 | -| `errors` | string[] | hard fail 或 runner 级错误摘要。成功但 gate hard fail 时也可非空。 | -| `warnings` | string[] | soft warning、missing score、inconclusive 等非阻塞问题。 | +| `experiment_id` | string | Experiment id from the manifest. | +| `manifest_ref` | string | Manifest path used by the runner. | +| `generated_at` | string | ISO timestamp. | +| `mode` | string | Effective mode: `bind_existing` or `execute_harness`. | +| `requested_mode` | string | Manifest-requested mode, when present in newer artifacts. | +| `automation_disabled` | boolean | Whether `execute_harness` was disabled and fallback was used. | +| `run_refs` | string[] | Generated V2 run JSON refs. | +| `score_refs` | string[] | Generated score JSON refs. | +| `report_refs` | string[] | Generated report refs. | +| `risk_verdict` | object | Regression-risk verdict. Not final experiment judgment. | +| `gate_verdict` | object | Compatibility alias for older readers. | +| `verdict_boundary` | string | Explicit boundary of verdict semantics. 
| +| `scorecard_summary` | array | Baseline vs candidate score changes. | +| `exploration_signals` | string[] | Automatic review hints. | +| `recommended_review_mode` | string | Suggested review mode. | +| `final_decision` | null or object | Human final decision; runner keeps it `null`. | +| `errors` | string[] | Hard failures or blocking runner errors. | +| `warnings` | string[] | Soft warnings, missing scores, or inconclusive signals. | ## Risk Verdict Shape @@ -58,46 +46,42 @@ } ``` -`status` 的优先级: +Priority: -1. 任意 hard fail => `fail` -2. 任意 missing score 或 inconclusive => `inconclusive` -3. 任意 soft warning => `warning` -4. 其他情况 => `pass` +1. any hard fail -> `fail` +2. any missing score or inconclusive -> `inconclusive` +3. any soft warning -> `warning` +4. otherwise -> `pass` -## Verdict Boundary - -`risk_verdict` 只回答: - -```text -这个 candidate 有没有触发当前 gate policy 已知的回归风险? -``` +## Runner Metadata -它不回答: +Newer artifacts include: -```text -这个 harness 是否更聪明? -这个 candidate 是否有探索价值? -这个改动是否应被长期保留? +```json +{ + "runner": { + "requested_mode": "execute_harness", + "mode": "bind_existing", + "automation_disabled": true, + "fallback_reason": "execute_harness disabled by flag or environment; bind_existing fallback used" + } +} ``` -因此新的 summary 会同时输出: - -- `scorecard_summary` -- `exploration_signals` -- `recommended_review_mode` -- `final_decision` +For actual V2.2-alpha automatic runs, `results[*].baseline_execution` and `results[*].candidates[*].candidate_execution` contain the adapter result, capture result, `benchmark_run_id`, and `eval_run_id`. -最终判断应由人类结合这些材料完成。 +## Boundary -## Backward Compatibility +`risk_verdict` answers only: -V2.1 仍保留以下字段: +```text +Did this candidate trigger the current regression-risk gate policy? +``` -- `experiment` -- `runner` -- `results` -- `created_at` -- `gate_verdict` +It does not answer: -这些字段可以用于人工阅读或兼容历史脚本,但新脚本应优先依赖 `risk_verdict` 和其他顶层稳定 schema。 +```text +Is this harness smarter? +Is this candidate worth exploring? 
+Should this change be kept long-term? +``` diff --git a/tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T132328195Z.json b/tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T132328195Z.json new file mode 100644 index 0000000000..d46e13e8da --- /dev/null +++ b/tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T132328195Z.json @@ -0,0 +1,372 @@ +{ + "experiment_id": "execute_harness_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.execute_harness.smoke.json", + "generated_at": "2026-05-02T13:23:28.196Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_execute_harness_smoke_2026-05-02T132328195Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 26628, + "candidate_value": 26628, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 
1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "No exploratory signal was derived from the current automatic scorecard; manual review may still find qualitative differences." + ], + "recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "execute_harness_smoke", + "name": "Execute Harness Smoke", + "goal": "Run one minimal real-model scenario through V2.2-alpha execute_harness, then capture the generated V1 user_action_id by benchmark_run_id.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_2_alpha_smoke", + "scenario_ids": [ + "execute_harness_smoke_minimal" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "execution": { + "adapter": "cli_print", + "timeout_ms": 180000, + "max_turns": 8, + "allow_fallback_to_bind_existing": true + }, + "action_bindings": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "entry_user_action_id": "e0e2f2b7-7667-4fe2-85a4-17d09a12a5ce" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "e0e2f2b7-7667-4fe2-85a4-17d09a12a5ce" + } + ], + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "execute_harness_alpha_limits": { + "scenario_count": 1, + "candidate_count": 1, + "repeat_count": 1 + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + 
"decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "repeat_index": 1, + "baseline_run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "baseline_user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "baseline_eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "baseline_benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2-harness-runs\\eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z\\stdout.txt", + "stderrRef": ".observability\\v2-harness-runs\\eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "execute_harness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_ID": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "path/to/baseline-config.json" + }, + "cliArgs": [ + "--max-turns", + "8" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "path/to/baseline-config.json", + 
"feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "candidate_user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "candidate_eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "candidate_benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2-harness-runs\\eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z\\stdout.txt", + "stderrRef": ".observability\\v2-harness-runs\\eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "execute_harness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_ID": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": 
"src/services/SessionMemory/sessionMemoryUtils.ts" + }, + "cliArgs": [ + "--max-turns", + "8" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "src/services/SessionMemory/sessionMemoryUtils.ts", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z" + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_vs_run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 26628, + "candidate_value": 26628, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 26628, + "candidate_value": 26628, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 26628, + "candidate_value": 26628, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + 
"scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "No exploratory signal was derived from the current automatic scorecard; manual review may still find qualitative differences." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "created_at": "2026-05-02T13:23:28.196Z" +} diff --git a/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json b/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json new file mode 100644 index 0000000000..02b7ef712a --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json @@ -0,0 +1,38 @@ +{ + "experiment_id": "execute_harness_smoke", + "name": "Execute Harness Smoke", + "goal": "Run one minimal real-model scenario through V2.2-alpha execute_harness, then capture the generated V1 user_action_id by benchmark_run_id.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_2_alpha_smoke", + "scenario_ids": ["execute_harness_smoke_minimal"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "execution": { + "adapter": "cli_print", + "timeout_ms": 180000, + "max_turns": 8, + "allow_fallback_to_bind_existing": true + }, + "action_bindings": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "entry_user_action_id": "e0e2f2b7-7667-4fe2-85a4-17d09a12a5ce" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + 
"variant_id": "candidate_session_memory_sparse", + "entry_user_action_id": "e0e2f2b7-7667-4fe2-85a4-17d09a12a5ce" + } + ], + "status": "ready" +} diff --git a/tests/evals/v2/runs/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.json b/tests/evals/v2/runs/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.json new file mode 100644 index 0000000000..bb62d1c859 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.json @@ -0,0 +1,131 @@ +{ + "run": { + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "started_at": "2026-05-02T13:23:08.789Z", + "ended_at": "2026-05-02T13:23:12.747Z", + "status": "completed", + "entry_user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "root_query_id": "601131c9-79b4-497c-9dd2-51761534caeb", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "root_query_id": "601131c9-79b4-497c-9dd2-51761534caeb", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "root_query_id": "601131c9-79b4-497c-9dd2-51761534caeb", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "path/to/baseline-config.json", + "notes": "Use this as the default baseline unless a scenario explicitly requires another baseline." + }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "started_at": "2026-05-02T13:23:08.789Z", + "started_at_ms": 1777728188789, + "ended_at": "2026-05-02T13:23:12.747Z", + "ended_at_ms": 1777728192747, + "duration_ms": 3958, + "event_count": 26, + "query_count": 2, + "main_thread_query_count": 2, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "execute_harness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T132304712Z", + "raw_input_tokens": "90", + "output_tokens": "2", + "cache_read_tokens": "1173", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26626", + "total_billed_tokens": "26628", + 
"main_thread_total_prompt_input_tokens": "26626", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "601131c9-79b4-497c-9dd2-51761534caeb", + "user_action_id": "1e3c516e-125b-4575-b3ee-5e7e6b45a8ed", + "session_id": "eb401c74-9f95-4617-9e8d-f71fa319caa3", + "conversation_id": "eb401c74-9f95-4617-9e8d-f71fa319caa3", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T13:23:08.789Z", + "started_at_ms": 1777728188789, + "ended_at": "2026-05-02T13:23:12.747Z", + "ended_at_ms": 1777728192747, + "duration_ms": 3958, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + "query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 25, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [], + "recoveries": [] + } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.json b/tests/evals/v2/runs/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.json new file mode 100644 index 0000000000..db8e8a6d85 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.json @@ -0,0 +1,132 @@ +{ + "run": { + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": 
"candidate_session_memory_sparse", + "started_at": "2026-05-02T13:23:20.784Z", + "ended_at": "2026-05-02T13:23:24.383Z", + "status": "completed", + "entry_user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "root_query_id": "a3751c61-21ef-410c-a46f-bc117accc262", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "root_query_id": "a3751c61-21ef-410c-a46f-bc117accc262", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "root_query_id": "a3751c61-21ef-410c-a46f-bc117accc262", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Increase the default session memory tool-call threshold from 3 to 6 to reduce background memory subagent cost.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "src/services/SessionMemory/sessionMemoryUtils.ts", + "notes": "Token-saving harness candidate. Keeps natural-break trigger intact while reducing tool-threshold-triggered updates." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "started_at": "2026-05-02T13:23:20.784Z", + "started_at_ms": 1777728200784, + "ended_at": "2026-05-02T13:23:24.383Z", + "ended_at_ms": 1777728204383, + "duration_ms": 3599, + "event_count": 26, + "query_count": 2, + "main_thread_query_count": 2, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "execute_harness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "bench_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "eval_run_id": "eval_execute_harness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T132304712Z", + "raw_input_tokens": "82", + "output_tokens": "2", + "cache_read_tokens": "1181", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26626", + "total_billed_tokens": "26628", + "main_thread_total_prompt_input_tokens": "26626", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "a3751c61-21ef-410c-a46f-bc117accc262", + "user_action_id": "0acb35d4-75b8-4219-86fc-ad5f291bc9ff", + "session_id": "9f488275-46c6-4757-aaaa-38ed8b3fe5c7", + "conversation_id": "9f488275-46c6-4757-aaaa-38ed8b3fe5c7", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T13:23:20.784Z", + "started_at_ms": 1777728200784, + "ended_at": "2026-05-02T13:23:24.383Z", + "ended_at_ms": 1777728204383, + "duration_ms": 3599, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + 
"turn_count": 1, + "query_max_loop_iter": 1, + "query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 25, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [], + "recoveries": [] + } +} diff --git a/tests/evals/v2/scenarios/execute_harness_smoke_minimal.json b/tests/evals/v2/scenarios/execute_harness_smoke_minimal.json new file mode 100644 index 0000000000..e7ef7c9aa3 --- /dev/null +++ b/tests/evals/v2/scenarios/execute_harness_smoke_minimal.json @@ -0,0 +1,20 @@ +{ + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": ["smoke", "execute_harness", "v2_2"], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scores/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.scores.json b/tests/evals/v2/scores/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.scores.json new file mode 100644 index 0000000000..df452e2d90 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": 
"run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_task_success_main_chain_observed", + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 26628, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_stability_recovery_absence", + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T132317110Z_execute_harness_smoke_minimal_baseline_default_1e3c516e", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.scores.json b/tests/evals/v2/scores/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.scores.json new file mode 100644 index 0000000000..736ec88097 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4_task_success_main_chain_observed", + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 26628, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4_stability_recovery_absence", + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T132328037Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_0acb35d4", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T141434752Z.json b/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T141434752Z.json new file mode 100644 index 0000000000..fcbf04398d --- /dev/null +++ b/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T141434752Z.json @@ -0,0 +1,89 @@ +{ + "verification_id": "v2_2_execute_harness_alpha_2026-05-02T141434752Z", + "generated_at": "2026-05-02T14:14:42.530Z", + "temp_root": ".observability\\v2-execute-harness-verification\\2026-05-02T141434752Z", + "passed": true, + "case_count": 9, + "failed_count": 0, + "note": "Success-path verification uses a fixture command to avoid model/API spend; the production default adapter is cli_print.", + "results": [ + { + "case_id": "execute_harness_success_fixture", + "description": "execute_harness success path creates run, score, report, and risk verdict through benchmark_run_id capture.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_2_verify_success_2026-05-02T141434752Z_2026-05-02T141437513Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_success_2026-05-02T141434752Z_2026-05-02T141437513Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_2_verify_success_2026-05-02T141434752Z_2026-05-02T141437513Z.json\nCreated V2 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_success_2026-05-02T141434752Z_2026-05-02T141437513Z.md" + }, + { + "case_id": "adapter_not_found", + "description": "Unsupported adapter should fail clearly.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Unsupported execute_harness adapter: not_real_adapter" + }, + { + "case_id": "capture_failed", + "description": "Completed execution without matching benchmark_run_id should fail capture.", + 
"passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default action capture capture_failed: No user_action_id found for benchmark_run_id=bench_v2_2_verify_capture_failed_2026-05-02T141434752Z_cost_sensitive_task_baseline_default_2026-05-02T141437861Z" + }, + { + "case_id": "ambiguous_capture", + "description": "Multiple user_action_id rows for one benchmark_run_id should fail capture.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default action capture ambiguous_capture: Multiple user_action_id values found for benchmark_run_id=bench_v2_2_verify_ambiguous_capture_2026-05-02T141434752Z_cost_sensitive_task_baseline_default_2026-05-02T141438269Z" + }, + { + "case_id": "variant_apply_failed", + "description": "Strict config snapshot check should fail before execution when the referenced snapshot is missing.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Variant apply failed: config_snapshot_ref does not exist: path/to/baseline-config.json" + }, + { + "case_id": "scenario_missing", + "description": "Missing scenario manifest should fail before execution.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Scenario not found: not_real_scenario" + }, + { + "case_id": "baseline_failure", + "description": "Baseline execution failure should stop the experiment.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default execute_harness failed: Fixture requested failure for variant baseline_default" + }, + { + "case_id": "candidate_failure", + "description": "Candidate execution failure should stop the experiment after the baseline succeeds.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "candidate scenario=cost_sensitive_task 
variant=candidate_session_memory_sparse execute_harness failed: Fixture requested failure for variant candidate_session_memory_sparse" + }, + { + "case_id": "disabled_fallback_to_bind_existing", + "description": "Automation can be disabled and fall back to bind_existing.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_2_verify_disabled_fallback_2026-05-02T141434752Z_2026-05-02T141442497Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_disabled_fallback_2026-05-02T141434752Z_2026-05-02T141442497Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_2_verify_disabled_fallback_2026-05-02T141434752Z_2026-05-02T141442497Z.json\nCreated V2 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_disabled_fallback_2026-05-02T141434752Z_2026-05-02T141442497Z.md" + } + ] +} From f13a640be79ef4bf5d8b98922e10fe4daad94868 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sun, 3 May 2026 01:19:00 +0800 Subject: [PATCH 15/26] Close observability v2.2.5 real experiment paths --- ...ndidate_session_memory_sparse_cd929218.md" | 59 ++ ...ndidate_session_memory_sparse_b118c7c4.md" | 59 ++ ...parse_vs_default_2026-05-02T165222245Z.md" | 104 +++ ...al_bind_existing_2026-05-02T170311090Z.md" | 104 +++ ...er_sensitive_baseline_default_f9b83353.md" | 78 +++ ...ndidate_session_memory_sparse_cd929218.md" | 77 +++ ...er_sensitive_baseline_default_7b614b14.md" | 78 +++ ...ndidate_session_memory_sparse_b118c7c4.md" | 77 +++ scripts/evals/v2_compare_runs.ts | 122 +++- scripts/evals/v2_emit_fixture_trace.ts | 43 +- scripts/evals/v2_harness_execution.ts | 229 ++++++- scripts/evals/v2_manual_real_run.ps1 | 185 ++++++ scripts/evals/v2_record_run.ts | 98 ++- scripts/evals/v2_run_experiment.ts | 615 ++++++++++++++++-- scripts/evals/v2_score_registry.ts | 23 + 
.../evals/v2_validate_experiment_artifacts.ts | 41 ++ scripts/evals/v2_validate_manifests.ts | 31 + .../evals/v2_verify_execute_harness_alpha.ts | 1 + scripts/evals/v2_windows_spawn_bridge.cjs | 79 +++ src/observability/v2/evalTypes.ts | 4 + src/services/SessionMemory/sessionMemory.ts | 426 ++++++++++-- tests/evals/v2/README.md | 30 +- .../v2/V2.2-execute_harness-alpha-usage.md | 34 +- .../v2/V2.2.5-real-experiment-closure.md | 122 ++++ .../session_memory_default.runtime.json | 7 + .../session_memory_sparse.runtime.json | 10 + tests/evals/v2/experiment-runs/README.md | 11 +- ...arse_vs_default_2026-05-02T165222245Z.json | 520 +++++++++++++++ ...l_bind_existing_2026-05-02T170311090Z.json | 429 ++++++++++++ .../_experiment.execute_harness.smoke.json | 2 + ...sion_memory_runtime_sparse_vs_default.json | 29 + ...parse_vs_default_manual.bind_existing.json | 32 + ...r_sensitive_baseline_default_f9b83353.json | 187 ++++++ ...didate_session_memory_sparse_cd929218.json | 182 ++++++ ...r_sensitive_baseline_default_7b614b14.json | 187 ++++++ ...didate_session_memory_sparse_b118c7c4.json | 182 ++++++ .../session_memory_trigger_sensitive.json | 27 + .../score-specs/default-v2-1.score-specs.json | 11 + ...tive_baseline_default_f9b83353.scores.json | 62 ++ ...session_memory_sparse_cd929218.scores.json | 62 ++ ...tive_baseline_default_7b614b14.scores.json | 62 ++ ...session_memory_sparse_b118c7c4.scores.json | 62 ++ .../evals/v2/variants/baseline.template.json | 4 +- .../candidate_session_memory_sparse.json | 6 +- ...e_harness_alpha_2026-05-02T162923305Z.json | 89 +++ 45 files changed, 4736 insertions(+), 146 deletions(-) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" create mode 100644 scripts/evals/v2_manual_real_run.ps1 create mode 100644 scripts/evals/v2_windows_spawn_bridge.cjs create mode 100644 tests/evals/v2/V2.2.5-real-experiment-closure.md create mode 100644 
tests/evals/v2/configs/session_memory_default.runtime.json create mode 100644 tests/evals/v2/configs/session_memory_sparse.runtime.json create mode 100644 tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json create mode 100644 tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json create mode 100644 tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json create mode 100644 tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.json create mode 100644 tests/evals/v2/scenarios/session_memory_trigger_sensitive.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.scores.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T162923305Z.json diff --git 
"a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" new file mode 100644 index 0000000000..20822b37ba --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" @@ -0,0 +1,59 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353 +- candidate_run: run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218 +- scenario: session_memory_trigger_sensitive +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: f9b83353-0650-4868-af08-c0ff7048f7b1 +- candidate_user_action_id: cd929218-cfa1-4772-93ba-ae659d9ca0d9 +- runtime_difference_observed: true + +## Variant Effect Evidence + +- baseline_policy_event_observed: true +- candidate_policy_event_observed: true +- candidate_variant_effect_observed: true +- baseline_policy_mode: default +- candidate_policy_mode: sparse +- baseline_session_memory_subagent_count: 2 +- candidate_session_memory_subagent_count: 1 + +## Runtime Difference Summary + +- Baseline session_memory policy was observed with mode=default. +- Candidate session_memory policy was observed with mode=sparse. +- Candidate sparse runtime markers were observed. +- A runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[token_threshold_and_tool_threshold], candidate=[token_threshold_and_tool_threshold]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 2 | 1 | -1 | improved | +| efficiency.total_billed_tokens | 440499 | 304723 | -135776 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was observed, but this comparison is still single-run and should not be treated as a full stability judgment. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" new file mode 100644 index 0000000000..6fbb2c2d50 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" @@ -0,0 +1,59 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14 +- candidate_run: run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4 +- scenario: session_memory_trigger_sensitive +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 7b614b14-19d8-41db-8ee8-ebb61bc4b699 +- candidate_user_action_id: b118c7c4-18df-4ff0-b506-5b5454418b48 +- runtime_difference_observed: true + +## Variant Effect Evidence + +- baseline_policy_event_observed: true +- candidate_policy_event_observed: true +- candidate_variant_effect_observed: true +- baseline_policy_mode: default +- candidate_policy_mode: sparse +- baseline_session_memory_subagent_count: 2 +- candidate_session_memory_subagent_count: 1 + +## Runtime Difference Summary + +- Baseline session_memory policy was observed with mode=default. +- Candidate session_memory policy was observed with mode=sparse. +- Candidate sparse runtime markers were observed. +- A runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[token_threshold_and_tool_threshold], candidate=[token_threshold_and_tool_threshold]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 2 | 1 | -1 | improved | +| efficiency.total_billed_tokens | 396401 | 303392 | -93009 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was observed, but this comparison is still single-run and should not be treated as a full stability judgment. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md" new file mode 100644 index 0000000000..8f876b0345 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md" @@ -0,0 +1,104 @@ +# V2 Experiment Summary: session_memory_runtime_sparse_vs_default + +## Understanding + +- experiment: session_memory_runtime_sparse_vs_default +- mode: execute_harness +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, decision_quality.session_memory_policy_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json + +## Expected Outcome + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2 executes the scenario first, then captures the generated user_action_id through benchmark_run_id. + +## Design Rationale + +The runner always scores only trace-backed V1 facts. V2.2-beta adds runtime-effect evidence and experiment-validity semantics so smoke and real experiments are not confused with each other. 
+ +## Real Experiment + +- requested_mode: execute_harness +- evaluation_intent: exploration +- candidate_runtime_effect_observed: true +- runtime_difference_observed: true +- note: This profile asks whether the candidate changed runtime behavior in an interpretable way. + +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 0 +- risk_status: pass +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: regression_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Variant Effect Evidence + +- session_memory_trigger_sensitive / candidate_session_memory_sparse: baseline_mode=default, candidate_mode=sparse, candidate_effect_observed=true, runtime_difference_observed=true + +## Experiment Validity + +- status: valid +- profile: real_experiment +- baseline_captured: true +- candidate_captured: true +- no_ambiguous_capture: true +- score_evidence_present: true +- variant_effect_observed: true +- runtime_difference_observed: true +- scenario_intent_matched: true +- reason: Real experiment remains interpretable. + +- No additional blockers or warnings. + +## Runtime Difference Summary + +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Baseline session_memory policy was observed with mode=default. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Candidate session_memory policy was observed with mode=sparse. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Candidate sparse-policy markers were observed in runtime evidence. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Observed baseline and candidate session_memory policies differ. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Session_memory subagent count changed from 2 to 1. 
+- session_memory_trigger_sensitive / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. + +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 2 | 1 | -1 | improved | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | efficiency.total_billed_tokens | 440499 | 304723 | -135776 | improved | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- 2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer. +- A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas. 
+ +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | experiment_validity | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | --- | +| session_memory_trigger_sensitive | 1 | run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353 | candidate_session_memory_sparse | run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218 | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | + +## Interpretation Limits + +- This real experiment remains single-scenario and single-run; it is not yet a stability study. +- Candidate runtime effect was observed, but qualitative harness value still needs broader experiments. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md" new file mode 100644 index 0000000000..6dcede7b96 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md" @@ -0,0 +1,104 @@ +# V2 Experiment Summary: session_memory_runtime_sparse_vs_default_manual_bind_existing + +## Understanding + +- experiment: session_memory_runtime_sparse_vs_default_manual_bind_existing +- mode: bind_existing +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, decision_quality.session_memory_policy_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json + +## Expected Outcome + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2 executes the scenario first, then captures the generated user_action_id through benchmark_run_id. + +## Design Rationale + +The runner always scores only trace-backed V1 facts. V2.2-beta adds runtime-effect evidence and experiment-validity semantics so smoke and real experiments are not confused with each other. 
+ +## Real Experiment + +- requested_mode: bind_existing +- evaluation_intent: exploration +- candidate_runtime_effect_observed: true +- runtime_difference_observed: true +- note: This profile asks whether the candidate changed runtime behavior in an interpretable way. + +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 0 +- risk_status: pass +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: regression_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Variant Effect Evidence + +- session_memory_trigger_sensitive / candidate_session_memory_sparse: baseline_mode=default, candidate_mode=sparse, candidate_effect_observed=true, runtime_difference_observed=true + +## Experiment Validity + +- status: valid +- profile: real_experiment +- baseline_captured: true +- candidate_captured: true +- no_ambiguous_capture: true +- score_evidence_present: true +- variant_effect_observed: true +- runtime_difference_observed: true +- scenario_intent_matched: true +- reason: Real experiment remains interpretable. + +- No additional blockers or warnings. + +## Runtime Difference Summary + +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Baseline session_memory policy was observed with mode=default. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Candidate session_memory policy was observed with mode=sparse. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Candidate sparse-policy markers were observed in runtime evidence. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Observed baseline and candidate session_memory policies differ. +- session_memory_trigger_sensitive / candidate_session_memory_sparse: Session_memory subagent count changed from 2 to 1. 
+- session_memory_trigger_sensitive / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. + +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 2 | 1 | -1 | improved | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | efficiency.total_billed_tokens | 396401 | 303392 | -93009 | improved | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- 2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer. +- A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas. 
+ +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | experiment_validity | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | --- | +| session_memory_trigger_sensitive | 1 | run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14 | candidate_session_memory_sparse | run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4 | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| session_memory_trigger_sensitive | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | + +## Interpretation Limits + +- This real experiment remains single-scenario and single-run; it is not yet a stability study. +- Candidate runtime effect was observed, but qualitative harness value still needs broader experiments. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.md" new file mode 100644 index 0000000000..30054bc6b6 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.md" @@ -0,0 +1,78 @@ +# V2 Run Report: run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353 + +## 理解清单 + +- scenario: session_memory_trigger_sensitive (Session Memory Trigger Sensitive) +- variant: baseline_default (Baseline Default) +- user_action_id: f9b83353-0650-4868-af08-c0ff7048f7b1 +- root_query_id: 5477a647-edbf-46d0-9dd5-906ffd1aa288 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T16:49:13.981Z +- duration_ms: 81846 +- query_count: 3 +- subagent_count: 2 +- tool_call_count: 21 +- total_prompt_input_tokens: 431495 +- total_billed_tokens: 440499 +- root_turn_count: 5 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Read: count=13, closed=13, failed=0 +- Edit: count=8, closed=8, failed=0 + +## Subagents + +- session_memory: count=2, trigger=token_threshold_and_tool_threshold + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: true +- variant_effect_observed: true +- session_memory_subagent_count: 2 +- session_memory_trigger_details: token_threshold_and_tool_threshold +- reason: Session-memory runtime policy was observed from V1 events. + +### Observed Policy + +```json +{ + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 +} +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- decision_quality.session_memory_policy_observed: observed (1) +- efficiency.total_billed_tokens: observed (440499) +- decision_quality.subagent_count_observed: observed (2) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" new file mode 100644 index 0000000000..cfd7798c2f --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md" @@ -0,0 +1,77 @@ +# V2 Run Report: run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218 + +## 理解清单 + +- scenario: session_memory_trigger_sensitive (Session Memory Trigger Sensitive) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- user_action_id: cd929218-cfa1-4772-93ba-ae659d9ca0d9 +- root_query_id: 9b4efe45-9504-4bc9-8391-fa0c51fa01b6 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T16:50:45.579Z +- duration_ms: 91254 +- query_count: 2 +- subagent_count: 1 +- tool_call_count: 12 +- total_prompt_input_tokens: 301366 +- total_billed_tokens: 304723 +- root_turn_count: 5 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Read: count=12, closed=12, failed=0 + +## Subagents + +- session_memory: count=1, trigger=token_threshold_and_tool_threshold + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: true +- variant_effect_observed: true +- session_memory_subagent_count: 1 +- session_memory_trigger_details: token_threshold_and_tool_threshold +- reason: Session-memory runtime policy was observed from V1 events. + +### Observed Policy + +```json +{ + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 +} +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- decision_quality.session_memory_policy_observed: observed (1) +- efficiency.total_billed_tokens: observed (304723) +- decision_quality.subagent_count_observed: observed (1) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.md" new file mode 100644 index 0000000000..99c447f845 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.md" @@ -0,0 +1,78 @@ +# V2 Run Report: run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14 + +## 理解清单 + +- scenario: session_memory_trigger_sensitive (Session Memory Trigger Sensitive) +- variant: baseline_default (Baseline Default) +- user_action_id: 7b614b14-19d8-41db-8ee8-ebb61bc4b699 +- root_query_id: 27da52c7-548e-4d7f-b477-60af0aef1bb5 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T16:54:15.469Z +- duration_ms: 99273 +- query_count: 3 +- subagent_count: 2 +- tool_call_count: 21 +- total_prompt_input_tokens: 385846 +- total_billed_tokens: 396401 +- root_turn_count: 5 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Read: count=12, closed=12, failed=0 +- Edit: count=9, closed=9, failed=0 + +## Subagents + +- session_memory: count=2, trigger=token_threshold_and_tool_threshold + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: true +- variant_effect_observed: true +- session_memory_subagent_count: 2 +- session_memory_trigger_details: token_threshold_and_tool_threshold +- reason: Session-memory runtime policy was observed from V1 events. + +### Observed Policy + +```json +{ + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 +} +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- decision_quality.session_memory_policy_observed: observed (1) +- efficiency.total_billed_tokens: observed (396401) +- decision_quality.subagent_count_observed: observed (2) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" new file mode 100644 index 0000000000..b09e6c7dcd --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md" @@ -0,0 +1,77 @@ +# V2 Run Report: run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4 + +## 理解清单 + +- scenario: session_memory_trigger_sensitive (Session Memory Trigger Sensitive) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- user_action_id: b118c7c4-18df-4ff0-b506-5b5454418b48 +- root_query_id: e5deb781-955f-4cbd-8194-62d79cd14bc7 +- observability_db_ref: .observability\observability_v1.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T16:59:20.101Z +- duration_ms: 83227 +- query_count: 2 +- subagent_count: 1 +- tool_call_count: 12 +- total_prompt_input_tokens: 300391 +- total_billed_tokens: 303392 +- root_turn_count: 5 +- root_terminal_reason: completed +- recovery_count: 0 + +## Tools + +- Read: count=12, closed=12, failed=0 + +## Subagents + +- session_memory: count=1, trigger=token_threshold_and_tool_threshold + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: true +- variant_effect_observed: true +- session_memory_subagent_count: 1 +- session_memory_trigger_details: token_threshold_and_tool_threshold +- reason: Session-memory runtime policy was observed from V1 events. + +### Observed Policy + +```json +{ + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 +} +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- decision_quality.session_memory_policy_observed: observed (1) +- efficiency.total_billed_tokens: observed (303392) +- decision_quality.subagent_count_observed: observed (1) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git a/scripts/evals/v2_compare_runs.ts b/scripts/evals/v2_compare_runs.ts index abddfb60c5..0395b43686 100644 --- a/scripts/evals/v2_compare_runs.ts +++ b/scripts/evals/v2_compare_runs.ts @@ -10,6 +10,11 @@ interface RunFile { variant_id: string entry_user_action_id?: string } + variant_effect?: Record + scenario?: { + evaluation_note?: string + expected_observations?: string[] + } } const repoRoot = 
path.resolve(import.meta.dirname, '..', '..') @@ -17,9 +22,9 @@ const evalRoot = path.join(repoRoot, 'tests', 'evals', 'v2') const reportRoot = path.join( repoRoot, 'ObservrityTask', - '10-系统版本', + '10-系统版本', 'v2', - '06-运行报告', + '06-运行报告', ) async function findChildDir(parent: string, matcher: (name: string) => boolean) { @@ -67,6 +72,25 @@ function scoreKey(score: EvalScore): string { return `${score.dimension}.${score.subdimension}` } +function asString(value: unknown): string { + return typeof value === 'string' ? value : '' +} + +function asBoolean(value: unknown): boolean { + return value === true +} + +function asNumber(value: unknown): number { + if (typeof value === 'number') return value + if (typeof value === 'string' && value.trim() !== '') return Number(value) + return 0 +} + +function asStringArray(value: unknown): string[] { + if (!Array.isArray(value)) return [] + return value.filter((item): item is string => typeof item === 'string' && item.length > 0) +} + function isLowerBetter(score: EvalScore): boolean { return ( (score.dimension === 'efficiency' && @@ -105,6 +129,20 @@ function formatValue(value: number | null): string { return value === null ? 
'n/a' : String(value) } +function policyMode(runFile: RunFile): string { + const observed = runFile.variant_effect?.observed_policy + if (observed && typeof observed === 'object' && !Array.isArray(observed)) { + return asString((observed as Record).mode) || 'unknown' + } + return 'unknown' +} + +function policySignature(runFile: RunFile): string { + const observed = runFile.variant_effect?.observed_policy + if (!observed || typeof observed !== 'object' || Array.isArray(observed)) return '' + return JSON.stringify(observed) +} + function buildReport(params: { baselineRun: RunFile candidateRun: RunFile @@ -135,9 +173,70 @@ function buildReport(params: { key => classifyDelta(baselineByKey.get(key), candidateByKey.get(key)) === 'regressed', ).length + const baselineObserved = asBoolean(baselineRun.variant_effect?.policy_event_observed) + const candidateObserved = asBoolean(candidateRun.variant_effect?.policy_event_observed) + const candidateEffectObserved = asBoolean( + candidateRun.variant_effect?.variant_effect_observed, + ) + const baselinePolicyMode = policyMode(baselineRun) + const candidatePolicyMode = policyMode(candidateRun) + const baselineSubagentCount = asNumber( + baselineRun.variant_effect?.session_memory_subagent_count, + ) + const candidateSubagentCount = asNumber( + candidateRun.variant_effect?.session_memory_subagent_count, + ) + const baselineTriggerDetails = [ + ...asStringArray(baselineRun.variant_effect?.session_memory_trigger_details), + ].sort() + const candidateTriggerDetails = [ + ...asStringArray(candidateRun.variant_effect?.session_memory_trigger_details), + ].sort() + const runtimeDifferenceObserved = + candidateEffectObserved && + ((policySignature(baselineRun) && + policySignature(candidateRun) && + policySignature(baselineRun) !== policySignature(candidateRun)) || + baselineSubagentCount !== candidateSubagentCount || + baselineTriggerDetails.join('|') !== candidateTriggerDetails.join('|')) + + const variantEffectRows = [ + `- 
baseline_policy_event_observed: ${baselineObserved}`, + `- candidate_policy_event_observed: ${candidateObserved}`, + `- candidate_variant_effect_observed: ${candidateEffectObserved}`, + `- baseline_policy_mode: ${baselinePolicyMode}`, + `- candidate_policy_mode: ${candidatePolicyMode}`, + `- baseline_session_memory_subagent_count: ${baselineSubagentCount}`, + `- candidate_session_memory_subagent_count: ${candidateSubagentCount}`, + ].join('\n') + + const runtimeSummary = [ + baselineObserved + ? `- Baseline session_memory policy was observed with mode=${baselinePolicyMode}.` + : '- Baseline session_memory policy was not observed.', + candidateObserved + ? `- Candidate session_memory policy was observed with mode=${candidatePolicyMode}.` + : '- Candidate session_memory policy was not observed.', + candidateEffectObserved + ? '- Candidate sparse runtime markers were observed.' + : '- Candidate sparse runtime markers were not observed.', + runtimeDifferenceObserved + ? '- A runtime difference was observed between baseline and candidate.' + : '- No stable runtime difference was observed between baseline and candidate.', + `- Trigger details: baseline=[${baselineTriggerDetails.join(', ') || 'none'}], candidate=[${candidateTriggerDetails.join(', ') || 'none'}].`, + ].join('\n') + + const interpretationLimits = [ + candidateEffectObserved + ? '- Candidate runtime effect was observed, but this comparison is still single-run and should not be treated as a full stability judgment.' 
+ : '- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value.', + '- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself.', + `- Scenario note: ${asString(candidateRun.scenario?.evaluation_note) || 'n/a'}`, + ].join('\n') + return `# V2 Run Comparison -## 理解清单 +## Understanding - baseline_run: ${baselineRun.run.run_id} - candidate_run: ${candidateRun.run.run_id} @@ -145,11 +244,11 @@ function buildReport(params: { - baseline_variant: ${baselineRun.run.variant_id} - candidate_variant: ${candidateRun.run.variant_id} -## 预期效果 +## Expected Outcome This report compares two V2 runs using score artifacts generated from V1 observability evidence. -## 设计思路 +## Design Rationale Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. @@ -158,12 +257,25 @@ Higher is better for capability and stability scores. Lower is better for explic - regression_count: ${regressionCount} - baseline_user_action_id: ${baselineRun.run.entry_user_action_id ?? 'unknown'} - candidate_user_action_id: ${candidateRun.run.entry_user_action_id ?? 
'unknown'} +- runtime_difference_observed: ${runtimeDifferenceObserved} + +## Variant Effect Evidence + +${variantEffectRows} + +## Runtime Difference Summary + +${runtimeSummary} ## Score Deltas | score | baseline | candidate | delta | verdict | | --- | ---: | ---: | ---: | --- | ${rows} + +## Interpretation Limits + +${interpretationLimits} ` } diff --git a/scripts/evals/v2_emit_fixture_trace.ts b/scripts/evals/v2_emit_fixture_trace.ts index 7d51c7e8aa..eb4429ace4 100644 --- a/scripts/evals/v2_emit_fixture_trace.ts +++ b/scripts/evals/v2_emit_fixture_trace.ts @@ -15,6 +15,13 @@ function requiredEnv(name: string): string { return value } +function requiredContextEnv(primary: string, fallback?: string): string { + const direct = process.env[primary] + if (direct && direct.trim() !== '') return direct + if (fallback) return requiredEnv(fallback) + return requiredEnv(primary) +} + function sqlString(value: string): string { return `'${value.replaceAll("'", "''")}'` } @@ -27,9 +34,18 @@ function writeFixtureDb(params: { endedAt: string }) { const benchmarkRunId = requiredEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID') - const experimentId = requiredEnv('CLAUDE_CODE_EVAL_EXPERIMENT_ID') - const scenarioId = requiredEnv('CLAUDE_CODE_EVAL_SCENARIO_ID') - const variantId = requiredEnv('CLAUDE_CODE_EVAL_VARIANT_ID') + const experimentId = requiredContextEnv( + 'CLAUDE_CODE_EVAL_EXPERIMENT_LABEL', + 'CLAUDE_CODE_EVAL_EXPERIMENT_ID', + ) + const scenarioId = requiredContextEnv( + 'CLAUDE_CODE_EVAL_SCENARIO_LABEL', + 'CLAUDE_CODE_EVAL_SCENARIO_ID', + ) + const variantId = requiredContextEnv( + 'CLAUDE_CODE_EVAL_VARIANT_LABEL', + 'CLAUDE_CODE_EVAL_VARIANT_ID', + ) const evalRunId = requiredEnv('CLAUDE_CODE_EVAL_RUN_ID') const sql = [ 'CREATE TABLE IF NOT EXISTS user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count 
BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, experiment_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, benchmark_run_id VARCHAR, eval_run_id VARCHAR, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', @@ -66,8 +82,10 @@ async function main(): Promise { const userActionId = randomUUID() const queryId = randomUUID() const fixtureDbPath = process.env.V2_FIXTURE_DB_PATH - if (process.env.V2_FIXTURE_FAIL_VARIANT === process.env.CLAUDE_CODE_EVAL_VARIANT_ID) { - throw new Error(`Fixture requested failure for variant ${process.env.CLAUDE_CODE_EVAL_VARIANT_ID}`) + const fixtureVariantId = + process.env.CLAUDE_CODE_EVAL_VARIANT_LABEL ?? process.env.CLAUDE_CODE_EVAL_VARIANT_ID + if (process.env.V2_FIXTURE_FAIL_VARIANT === fixtureVariantId) { + throw new Error(`Fixture requested failure for variant ${fixtureVariantId}`) } if (fixtureDbPath) { writeFixtureDb({ @@ -98,9 +116,18 @@ async function main(): Promise { user_action_id: userActionId, query_id: queryId, query_source: 'repl_main_thread', - experiment_id: requiredEnv('CLAUDE_CODE_EVAL_EXPERIMENT_ID'), - scenario_id: requiredEnv('CLAUDE_CODE_EVAL_SCENARIO_ID'), - variant_id: requiredEnv('CLAUDE_CODE_EVAL_VARIANT_ID'), + experiment_id: requiredContextEnv( + 'CLAUDE_CODE_EVAL_EXPERIMENT_LABEL', + 'CLAUDE_CODE_EVAL_EXPERIMENT_ID', + ), + scenario_id: requiredContextEnv( + 'CLAUDE_CODE_EVAL_SCENARIO_LABEL', + 'CLAUDE_CODE_EVAL_SCENARIO_ID', + ), + variant_id: requiredContextEnv( + 'CLAUDE_CODE_EVAL_VARIANT_LABEL', + 'CLAUDE_CODE_EVAL_VARIANT_ID', + ), benchmark_run_id: requiredEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID'), eval_run_id: requiredEnv('CLAUDE_CODE_EVAL_RUN_ID'), cwd: repoRoot, diff --git a/scripts/evals/v2_harness_execution.ts b/scripts/evals/v2_harness_execution.ts index 
861b1c6f0e..06abbe2de9 100644 --- a/scripts/evals/v2_harness_execution.ts +++ b/scripts/evals/v2_harness_execution.ts @@ -1,6 +1,7 @@ import { spawnSync } from 'node:child_process' +import { createHash } from 'node:crypto' import { existsSync } from 'node:fs' -import { mkdir, writeFile } from 'node:fs/promises' +import { mkdir, readFile, writeFile } from 'node:fs/promises' import path from 'node:path' import type { EvalScenario, EvalVariant } from '../../src/observability/v2/evalTypes' @@ -59,9 +60,17 @@ export interface ExecuteHarnessResult { } const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const bunExe = process.execPath +const nodeExe = process.env.CLAUDE_CODE_NODE_EXE?.trim() || 'node.exe' const duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') const defaultDbPath = path.join(repoRoot, '.observability', 'observability_v1.duckdb') -const harnessRunsRoot = path.join(repoRoot, '.observability', 'v2-harness-runs') +const harnessRunsRoot = path.join(repoRoot, '.observability', 'v2h') +const windowsLauncherBridgePath = path.join( + repoRoot, + 'scripts', + 'evals', + 'v2_windows_spawn_bridge.cjs', +) function sqlString(value: string): string { return `'${value.replaceAll("'", "''")}'` @@ -71,6 +80,16 @@ function sanitizeId(value: string): string { return value.replace(/[^a-zA-Z0-9_-]+/g, '_').replace(/^_+|_+$/g, '') } +function artifactRunDirName(runId: string): string { + return createHash('sha1').update(runId).digest('hex').slice(0, 16) +} + +function evalAlias(prefix: string, value: string): string { + const human = sanitizeId(value).slice(0, 12) + const hash = createHash('sha1').update(value).digest('hex').slice(0, 8) + return `${prefix}_${human}_${hash}` +} + function stringifyEnv(value: string | number | boolean): string { return typeof value === 'string' ? 
value : String(value) } @@ -85,6 +104,53 @@ function mergeEnvRecords(...records: Array + input?: string + }, +) { + if (process.platform !== 'win32') { + return spawnSync(command, args, { + cwd: options.cwd, + encoding: options.encoding, + timeout: options.timeout, + input: options.input, + env: { + ...process.env, + ...options.env, + }, + }) + } + + const previousValues = new Map() + for (const [key, value] of Object.entries(options.env)) { + previousValues.set(key, process.env[key]) + process.env[key] = value + } + try { + return spawnSync(command, args, { + cwd: options.cwd, + encoding: options.encoding, + timeout: options.timeout, + input: options.input, + }) + } finally { + for (const [key, previousValue] of previousValues.entries()) { + if (previousValue === undefined) { + delete process.env[key] + } else { + process.env[key] = previousValue + } + } + } +} + function featureGateEnvName(key: string): string { return `CLAUDE_CODE_FEATURE_${key.replace(/[^a-zA-Z0-9]+/g, '_').toUpperCase()}` } @@ -125,9 +191,12 @@ function hasRelationColumn(dbPath: string, relation: string, column: string): bo export function buildEvalContextEnv(context: EvalExecutionContext): Record { return { - CLAUDE_CODE_EVAL_EXPERIMENT_ID: context.experiment_id, - CLAUDE_CODE_EVAL_SCENARIO_ID: context.scenario_id, - CLAUDE_CODE_EVAL_VARIANT_ID: context.variant_id, + CLAUDE_CODE_EVAL_EXPERIMENT_ID: evalAlias('exp', context.experiment_id), + CLAUDE_CODE_EVAL_SCENARIO_ID: evalAlias('scn', context.scenario_id), + CLAUDE_CODE_EVAL_VARIANT_ID: evalAlias('var', context.variant_id), + CLAUDE_CODE_EVAL_EXPERIMENT_LABEL: context.experiment_id, + CLAUDE_CODE_EVAL_SCENARIO_LABEL: context.scenario_id, + CLAUDE_CODE_EVAL_VARIANT_LABEL: context.variant_id, CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID: context.benchmark_run_id, CLAUDE_CODE_EVAL_RUN_ID: context.eval_run_id, } @@ -147,12 +216,15 @@ export function createRunIdentity(params: { variantId: string stamp: string }): { eval_run_id: string; benchmark_run_id: 
string } { - const base = sanitizeId( - `${params.experimentId}_${params.scenarioId}_${params.variantId}_${params.stamp}`, + const base = `${params.experimentId}_${params.scenarioId}_${params.variantId}_${params.stamp}` + const humanPrefix = sanitizeId( + `${params.experimentId.slice(0, 20)}_${params.scenarioId.slice(0, 20)}_${params.variantId.slice(0, 20)}`, ) + const hash = createHash('sha1').update(base).digest('hex').slice(0, 12) + const identity = `${humanPrefix}_${hash}` return { - eval_run_id: `eval_${base}`, - benchmark_run_id: `bench_${base}`, + eval_run_id: `eval_${identity}`, + benchmark_run_id: `bench_${identity}`, } } @@ -243,12 +315,15 @@ export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter ) {} async execute(input: HarnessExecutionAdapterInput): Promise { - const runDir = path.join(harnessRunsRoot, sanitizeId(input.runId)) + const runDir = path.join(harnessRunsRoot, artifactRunDirName(input.runId)) await mkdir(runDir, { recursive: true }) const stdoutPath = path.join(runDir, 'stdout.txt') const stderrPath = path.join(runDir, 'stderr.txt') const commandPath = path.join(runDir, 'command.json') - const command = this.options.execution?.command ?? 'bun' + const promptPath = path.join(runDir, 'prompt.txt') + const launcherRequestPath = path.join(runDir, 'launcher-request.json') + const launcherResultPath = path.join(runDir, 'launcher-result.json') + const command = this.options.execution?.command ?? bunExe const defaultArgs = [ 'run', 'src/entrypoints/cli.tsx', @@ -256,11 +331,32 @@ export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter '--output-format', 'json', ...this.options.cliArgs, - input.prompt, ] const args = this.options.execution?.args ? 
expandTemplateArgs(this.options.execution.args, input) : defaultArgs + const promptViaStdin = !this.options.execution?.args + if (promptViaStdin) { + await writeFile(promptPath, input.prompt, 'utf8') + } + if (process.platform === 'win32') { + await writeFile( + launcherRequestPath, + `${JSON.stringify( + { + command, + args, + cwd: repoRoot, + env: this.options.env, + timeout_ms: input.timeoutMs, + stdin_text: promptViaStdin ? input.prompt : undefined, + }, + null, + 2, + )}\n`, + 'utf8', + ) + } await writeFile( commandPath, @@ -268,6 +364,16 @@ export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter { command, args, + prompt_transport: promptViaStdin ? 'stdin' : 'arg_template', + prompt_ref: promptViaStdin ? path.relative(repoRoot, promptPath) : null, + launcher_bridge_ref: + process.platform === 'win32' + ? path.relative(repoRoot, windowsLauncherBridgePath) + : null, + launcher_request_ref: + process.platform === 'win32' + ? path.relative(repoRoot, launcherRequestPath) + : null, timeout_ms: input.timeoutMs, env_keys: Object.keys(this.options.env).sort(), }, @@ -277,38 +383,97 @@ export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter 'utf8', ) - const result = spawnSync(command, args, { - cwd: repoRoot, - encoding: 'utf8', - timeout: input.timeoutMs, - env: { - ...process.env, - ...this.options.env, - }, - }) - await writeFile(stdoutPath, String(result.stdout ?? ''), 'utf8') - await writeFile(stderrPath, String(result.stderr ?? result.error?.message ?? 
''), 'utf8') + let status: HarnessExecutionAdapterOutput['status'] = 'completed' + let stdoutText = '' + let stderrText = '' + let errorText = '' + + if (process.platform === 'win32') { + const bridgeResult = spawnSync( + nodeExe, + [windowsLauncherBridgePath, '--request', launcherRequestPath, '--result', launcherResultPath], + { + cwd: repoRoot, + encoding: 'utf8', + timeout: input.timeoutMs + 10_000, + }, + ) + if (bridgeResult.status !== 0 && !existsSync(launcherResultPath)) { + stdoutText = String(bridgeResult.stdout ?? '') + stderrText = String(bridgeResult.stderr ?? bridgeResult.error?.message ?? '') + errorText = + stderrText.trim() || + stdoutText.trim() || + `Windows launcher bridge exited with status ${bridgeResult.status}` + status = bridgeResult.error?.name === 'ETIMEDOUT' ? 'timeout' : 'failed' + } else { + const launcherPayload = JSON.parse(await readFile(launcherResultPath, 'utf8')) as { + child_status?: number | null + stdout?: string + stderr?: string + error_name?: string | null + error_message?: string | null + timed_out?: boolean + signal?: string | null + } + stdoutText = String(launcherPayload.stdout ?? '') + stderrText = String(launcherPayload.stderr ?? launcherPayload.error_message ?? '') + if (launcherPayload.timed_out) { + status = 'timeout' + errorText = launcherPayload.error_message ?? 'Windows launcher bridge timed out' + } else if ((launcherPayload.child_status ?? 0) !== 0) { + status = 'failed' + errorText = + String(launcherPayload.stderr ?? '').trim() || + String(launcherPayload.stdout ?? '').trim() || + String(launcherPayload.error_message ?? '').trim() || + (launcherPayload.signal + ? `command terminated by signal ${launcherPayload.signal}` + : `command exited with status ${launcherPayload.child_status}`) + } + } + } else { + const result = spawnWithMergedEnv(command, args, { + cwd: repoRoot, + encoding: 'utf8', + timeout: input.timeoutMs, + env: this.options.env, + input: promptViaStdin ? 
input.prompt : undefined, + }) + stdoutText = String(result.stdout ?? '') + stderrText = String(result.stderr ?? result.error?.message ?? '') + if (result.error && result.error.name === 'ETIMEDOUT') { + status = 'timeout' + errorText = result.error.message + } else if (result.status !== 0) { + status = 'failed' + errorText = + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim() || + `command exited with status ${result.status}` + } + } + + await writeFile(stdoutPath, stdoutText, 'utf8') + await writeFile(stderrPath, stderrText, 'utf8') const stdoutRef = path.relative(repoRoot, stdoutPath) const stderrRef = path.relative(repoRoot, stderrPath) - if (result.error && result.error.name === 'ETIMEDOUT') { + if (status === 'timeout') { return { status: 'timeout', stdoutRef, stderrRef, - error: result.error.message, + error: errorText, } } - if (result.status !== 0) { + if (status === 'failed') { return { status: 'failed', stdoutRef, stderrRef, - error: - String(result.stderr ?? '').trim() || - String(result.stdout ?? '').trim() || - String(result.error?.message ?? 
'').trim() || - `command exited with status ${result.status}`, + error: errorText, } } return { @@ -333,7 +498,7 @@ export function createHarnessExecutionAdapter(params: { export function rebuildObservabilityDb(dbPath?: string): void { const args = ['run', 'scripts/observability/build_duckdb_etl.ts'] if (dbPath) args.push('--db-path', dbPath) - const result = spawnSync('bun', args, { + const result = spawnSync(bunExe, args, { cwd: repoRoot, encoding: 'utf8', }) diff --git a/scripts/evals/v2_manual_real_run.ps1 b/scripts/evals/v2_manual_real_run.ps1 new file mode 100644 index 0000000000..59e16c230e --- /dev/null +++ b/scripts/evals/v2_manual_real_run.ps1 @@ -0,0 +1,185 @@ +param( + [Parameter(Mandatory = $true)] + [string]$ScenarioId, + + [Parameter(Mandatory = $true)] + [string]$VariantId, + + [string]$ExperimentId = "session_memory_runtime_sparse_vs_default_manual", + + [int]$MaxTurns = 8, + + [string]$DbPath = ".observability/observability_v1.duckdb" +) + +$ErrorActionPreference = "Stop" + +function Get-RepoRoot { + return (Resolve-Path (Join-Path $PSScriptRoot "..\\..")).Path +} + +function Sanitize-Id([string]$Value) { + return (($Value -replace "[^a-zA-Z0-9_-]", "_").Trim("_")) +} + +function Get-RelativeRepoPath([string]$RepoRoot, [string]$TargetPath) { + $resolvedRepo = (Resolve-Path -LiteralPath $RepoRoot).Path + $resolvedTarget = (Resolve-Path -LiteralPath $TargetPath).Path + if ($resolvedTarget.StartsWith($resolvedRepo, [System.StringComparison]::OrdinalIgnoreCase)) { + return $resolvedTarget.Substring($resolvedRepo.Length).TrimStart('\', '/') + } + return $resolvedTarget +} + +function Get-VariantPath([string]$RepoRoot, [string]$VariantId) { + $direct = Join-Path $RepoRoot ("tests/evals/v2/variants/{0}.json" -f $VariantId) + if (Test-Path -LiteralPath $direct) { + return $direct + } + + $template = Join-Path $RepoRoot ("tests/evals/v2/variants/{0}.template.json" -f $VariantId) + if (Test-Path -LiteralPath $template) { + return $template + } + + $baseline 
= Join-Path $RepoRoot "tests/evals/v2/variants/baseline.template.json" + if ($VariantId -eq "baseline_default" -and (Test-Path -LiteralPath $baseline)) { + return $baseline + } + + throw "Variant not found: $VariantId" +} + +$repoRoot = Get-RepoRoot +$scenarioPath = Join-Path $repoRoot ("tests/evals/v2/scenarios/{0}.json" -f $ScenarioId) +if (-not (Test-Path -LiteralPath $scenarioPath)) { + throw "Scenario not found: $ScenarioId" +} + +$variantPath = Get-VariantPath -RepoRoot $repoRoot -VariantId $VariantId +$scenario = Get-Content -LiteralPath $scenarioPath -Raw | ConvertFrom-Json +$variant = Get-Content -LiteralPath $variantPath -Raw | ConvertFrom-Json + +$stamp = [DateTime]::UtcNow.ToString("yyyyMMddTHHmmssfffZ") +$suffix = [Guid]::NewGuid().ToString("N").Substring(0, 8) +$identity = "{0}_{1}_{2}" -f (Sanitize-Id $ScenarioId), (Sanitize-Id $VariantId), $suffix +$benchmarkRunId = "manual_bench_{0}_{1}" -f $stamp, $identity +$evalRunId = "manual_eval_{0}_{1}" -f $stamp, $identity + +$runRoot = Join-Path $repoRoot ".observability/v2-manual-runs" +$runDir = Join-Path $runRoot ("{0}_{1}_{2}" -f $stamp, (Sanitize-Id $ScenarioId), (Sanitize-Id $VariantId)) +New-Item -ItemType Directory -Force -Path $runDir | Out-Null + +$promptPath = Join-Path $runDir "prompt.txt" +$stdoutPath = Join-Path $runDir "stdout.txt" +$stderrPath = Join-Path $runDir "stderr.txt" +$commandPath = Join-Path $runDir "command.json" +$resultPath = Join-Path $runDir "result.json" + +$prompt = [string]$scenario.input_prompt +Set-Content -LiteralPath $promptPath -Value $prompt -Encoding UTF8 + +$cliArgs = @( + "run", + "src/entrypoints/cli.tsx", + "--print", + "--output-format", + "json", + "--max-turns", + [string]$MaxTurns +) + +$envVars = @{ + CLAUDE_CODE_EVAL_EXPERIMENT_ID = $ExperimentId + CLAUDE_CODE_EVAL_SCENARIO_ID = $ScenarioId + CLAUDE_CODE_EVAL_VARIANT_ID = $VariantId + CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID = $benchmarkRunId + CLAUDE_CODE_EVAL_RUN_ID = $evalRunId +} + +if 
($variant.config_snapshot_ref) { + $envVars.CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF = [string]$variant.config_snapshot_ref +} + +$previousEnv = @{} +foreach ($key in $envVars.Keys) { + $previousEnv[$key] = [Environment]::GetEnvironmentVariable($key, "Process") + [Environment]::SetEnvironmentVariable($key, $envVars[$key], "Process") +} + +$exitCode = $null +$captureRows = @() + +try { + $commandRecord = @{ + command = "bun" + args = $cliArgs + scenario_id = $ScenarioId + variant_id = $VariantId + experiment_id = $ExperimentId + benchmark_run_id = $benchmarkRunId + eval_run_id = $evalRunId + prompt_ref = Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $promptPath + env_keys = @($envVars.Keys | Sort-Object) + } + ($commandRecord | ConvertTo-Json -Depth 6) + "`n" | Set-Content -LiteralPath $commandPath -Encoding UTF8 + + $rawPrompt = Get-Content -LiteralPath $promptPath -Raw + $rawPrompt | & bun @cliArgs 1> $stdoutPath 2> $stderrPath + $exitCode = $LASTEXITCODE + + if ($exitCode -ne 0) { + throw "Headless CLI exited with status $exitCode" + } + + & bun run scripts/observability/build_duckdb_etl.ts | Out-Null + + $duckdbExe = Join-Path $repoRoot "tools/duckdb/duckdb.exe" + $resolvedDbPath = if ([System.IO.Path]::IsPathRooted($DbPath)) { $DbPath } else { Join-Path $repoRoot $DbPath } + $sql = "SELECT DISTINCT user_action_id FROM user_actions WHERE benchmark_run_id = '$($benchmarkRunId.Replace("'", "''"))' AND TRIM(COALESCE(user_action_id, '')) <> '' ORDER BY user_action_id;" + $captureJson = & $duckdbExe -json $resolvedDbPath $sql + if ($LASTEXITCODE -ne 0) { + throw "DuckDB capture query failed for benchmark_run_id=$benchmarkRunId" + } + if ($captureJson) { + $captureRows = $captureJson | ConvertFrom-Json + } +} finally { + foreach ($key in $envVars.Keys) { + [Environment]::SetEnvironmentVariable($key, $previousEnv[$key], "Process") + } +} + +$userActionId = $null +$captureStatus = "capture_failed" +if ($captureRows.Count -eq 1) { + $captureStatus = "captured" + 
$userActionId = [string]$captureRows[0].user_action_id +} elseif ($captureRows.Count -gt 1) { + $captureStatus = "ambiguous_capture" +} + +$result = @{ + experiment_id = $ExperimentId + scenario_id = $ScenarioId + variant_id = $VariantId + benchmark_run_id = $benchmarkRunId + eval_run_id = $evalRunId + capture_status = $captureStatus + user_action_id = $userActionId + match_count = $captureRows.Count + exit_code = $exitCode + config_snapshot_ref = if ($variant.config_snapshot_ref) { [string]$variant.config_snapshot_ref } else { $null } + stdout_ref = Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $stdoutPath + stderr_ref = Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $stderrPath + command_ref = Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $commandPath + prompt_ref = Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $promptPath +} + +($result | ConvertTo-Json -Depth 6) + "`n" | Set-Content -LiteralPath $resultPath -Encoding UTF8 + +Write-Host ("Created manual real-run artifact: {0}" -f (Get-RelativeRepoPath -RepoRoot $repoRoot -TargetPath $resultPath)) +Write-Host ("capture_status: {0}" -f $captureStatus) +if ($userActionId) { + Write-Host ("user_action_id: {0}" -f $userActionId) +} diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index fe13180cce..66df7f1a76 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -80,6 +80,27 @@ function asString(value: unknown): string { return typeof value === 'string' ? 
value : '' } +function asBoolean(value: unknown): boolean { + return value === true +} + +function parseJsonRecord(value: unknown): JsonRecord | undefined { + if (typeof value !== 'string' || value.trim() === '') return undefined + try { + const parsed = JSON.parse(value) as unknown + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + return parsed as JsonRecord + } + } catch { + return undefined + } + return undefined +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values.filter(Boolean))] +} + function queryDuckDb( dbPath: string, sql: string, @@ -104,6 +125,15 @@ function queryDuckDb( return JSON.parse(output) as T[] } +function relationExists(dbPath: string, relation: string): boolean { + try { + const rows = queryDuckDb<{ name?: string }>(dbPath, 'SHOW TABLES;') + return rows.some(row => asString(row.name) === relation) + } catch { + return false + } +} + async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } @@ -183,6 +213,7 @@ function buildReport(params: { tools: JsonRecord[] subagents: JsonRecord[] recoveries: JsonRecord[] + variantEffect: JsonRecord scores: EvalScore[] }): string { const { @@ -194,6 +225,7 @@ function buildReport(params: { tools, subagents, recoveries, + variantEffect, scores, } = params const toolSummary = @@ -220,6 +252,9 @@ function buildReport(params: { `- ${score.dimension}.${score.subdimension}: ${score.score_label} (${score.score_value ?? 'n/a'})`, ) .join('\n') + const policySummary = variantEffect.observed_policy + ? 
JSON.stringify(variantEffect.observed_policy, null, 2) + : 'null' return `# V2 Run Report: ${run.run_id} @@ -263,6 +298,21 @@ ${toolSummary} ${subagentSummary} +## Variant Effect Evidence + +- effect_type: ${asString(variantEffect.effect_type) || 'unknown'} +- policy_event_observed: ${asBoolean(variantEffect.policy_event_observed)} +- variant_effect_observed: ${asBoolean(variantEffect.variant_effect_observed)} +- session_memory_subagent_count: ${asNumber(variantEffect.session_memory_subagent_count)} +- session_memory_trigger_details: ${(variantEffect.session_memory_trigger_details as string[] | undefined)?.join(', ') || 'none'} +- reason: ${asString(variantEffect.reason) || 'n/a'} + +### Observed Policy + +\`\`\`json +${policySummary} +\`\`\` + ## Scores ${scoreSummary} @@ -330,6 +380,50 @@ async function main(): Promise { dbPath, `SELECT * FROM metrics_integrity_daily WHERE event_date = ${sqlString(asString(action.event_date))} LIMIT 1;`, )[0] + const sessionMemoryPolicyRow = relationExists(dbPath, 'events_raw') + ? queryDuckDb( + dbPath, + `SELECT ts_wall, query_source, payload_json FROM events_raw WHERE user_action_id = ${sqlString(userActionId)} AND event_name = 'session_memory.policy.observed' ORDER BY ts_wall DESC LIMIT 1;`, + )[0] + : undefined + const observedPolicy = parseJsonRecord(sessionMemoryPolicyRow?.payload_json) + const sessionMemorySubagentRows = subagents.filter( + subagent => asString(subagent.subagent_reason) === 'session_memory', + ) + const sessionMemorySubagentCount = sessionMemorySubagentRows.reduce( + (sum, subagent) => sum + asNumber(subagent.subagent_count), + 0, + ) + const sessionMemoryTriggerDetails = uniqueStrings( + sessionMemorySubagentRows.map(subagent => + asString(subagent.subagent_trigger_detail), + ), + ) + const variantEffect: JsonRecord = { + effect_type: 'session_memory_policy', + policy_event_observed: observedPolicy !== undefined, + variant_effect_observed: + variant.variant_id === 'candidate_session_memory_sparse' + ? 
observedPolicy !== undefined && + (asString(observedPolicy.mode) === 'sparse' || + asBoolean(observedPolicy.natural_break_only)) + : observedPolicy !== undefined, + observed_policy: observedPolicy ?? null, + observed_at: asString(sessionMemoryPolicyRow?.ts_wall), + observed_query_source: asString(sessionMemoryPolicyRow?.query_source), + session_memory_subagent_count: sessionMemorySubagentCount, + session_memory_trigger_details: sessionMemoryTriggerDetails, + reason: + observedPolicy !== undefined + ? variant.variant_id === 'candidate_session_memory_sparse' && + !( + asString(observedPolicy.mode) === 'sparse' || + asBoolean(observedPolicy.natural_break_only) + ) + ? 'Session-memory policy was observed, but the candidate sparse policy markers were not present.' + : 'Session-memory runtime policy was observed from V1 events.' + : 'No session-memory policy observation event was found for this run.', + } const runId = sanitizeId( `run_${new Date().toISOString().replaceAll(':', '').replaceAll('.', '')}_${scenario.scenario_id}_${variant.variant_id}_${userActionId.slice(0, 8)}`, @@ -369,6 +463,7 @@ async function main(): Promise { tools, subagents, recoveries, + variantEffect, }, requestedScoreSpecIds) const runsDir = path.join(evalRoot, 'runs') @@ -379,7 +474,7 @@ async function main(): Promise { await writeFile( path.join(runsDir, `${runId}.json`), - `${JSON.stringify({ run, binding, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries } }, null, 2)}\n`, + `${JSON.stringify({ run, binding, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries }, variant_effect: variantEffect }, null, 2)}\n`, ) await writeFile( path.join(scoresDir, `${runId}.scores.json`), @@ -396,6 +491,7 @@ async function main(): Promise { tools, subagents, recoveries, + variantEffect, scores, }), ) diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index 5c7d7866d1..91a4692889 100644 --- 
a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -2,7 +2,11 @@ import { spawnSync } from 'node:child_process' import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises' import path from 'node:path' -import type { EvalScenario, EvalScore, EvalVariant } from '../../src/observability/v2/evalTypes' +import type { + EvalScenario, + EvalScore, + EvalVariant, +} from '../../src/observability/v2/evalTypes' import type { EvalExperimentActionBinding, EvalExperimentFlatActionBinding, @@ -20,6 +24,47 @@ import { type ExecuteHarnessResult, } from './v2_harness_execution' +type JsonRecord = Record +type ExperimentProfile = 'smoke' | 'real_experiment' + +interface RunArtifact { + run: { + run_id: string + scenario_id: string + variant_id: string + entry_user_action_id?: string + } + variant_effect?: JsonRecord +} + +interface VariantEffectSummary { + scenario_id: string + candidate_variant_id: string + baseline_variant_effect_observed: boolean + candidate_variant_effect_observed: boolean + runtime_difference_observed: boolean + baseline_policy_mode: string + candidate_policy_mode: string + summary: string[] +} + +interface ExperimentValidity { + status: 'valid' | 'invalid' | 'inconclusive' + profile: ExperimentProfile + reason: string + blockers: string[] + warnings: string[] + checks: { + baseline_captured: boolean + candidate_captured: boolean + no_ambiguous_capture: boolean + score_evidence_present: boolean + variant_effect_observed: boolean + runtime_difference_observed: boolean + scenario_intent_matched: boolean + } +} + interface CandidateExperimentResult { candidate_variant_id: string candidate_run_id: string @@ -27,6 +72,10 @@ interface CandidateExperimentResult { candidate_eval_run_id?: string candidate_benchmark_run_id?: string candidate_execution?: ExecuteHarnessResult + baseline_variant_effect?: JsonRecord + candidate_variant_effect?: JsonRecord + variant_effect_summary?: VariantEffectSummary + experiment_validity?: 
ExperimentValidity compare_report: string gate_results: GateResult[] scorecard_summary: ScorecardItem[] @@ -95,8 +144,10 @@ interface ScorecardItem { } const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const bunExe = process.execPath const evalRoot = path.join(repoRoot, 'tests', 'evals', 'v2') const scoresRoot = path.join(evalRoot, 'scores') +const runsRoot = path.join(evalRoot, 'runs') const experimentRunsRoot = path.join(evalRoot, 'experiment-runs') function parseArgs(argv: string[]): Record { @@ -120,6 +171,25 @@ async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } +function asString(value: unknown): string { + return typeof value === 'string' ? value : '' +} + +function asBoolean(value: unknown): boolean { + return value === true +} + +function asNumber(value: unknown): number { + if (typeof value === 'number') return value + if (typeof value === 'string' && value.trim() !== '') return Number(value) + return 0 +} + +function asStringArray(value: unknown): string[] { + if (!Array.isArray(value)) return [] + return value.filter((item): item is string => typeof item === 'string' && item.length > 0) +} + async function listJsonFiles(dir: string): Promise { const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) return entries @@ -248,7 +318,7 @@ function findBoundUserActionId(params: { } function runBunScript(script: string, args: string[]): string { - const result = spawnSync('bun', ['run', script, ...args], { + const result = spawnSync(bunExe, ['run', script, ...args], { cwd: repoRoot, encoding: 'utf8', }) @@ -279,6 +349,10 @@ function extractCreatedReport(output: string): string { return match?.[1]?.trim() ?? 
'' } +async function readRunArtifact(runId: string): Promise { + return readJson(path.join(runsRoot, `${runId}.json`)) +} + function scoreKey(score: EvalScore): string { return `${score.dimension}.${score.subdimension}` } @@ -369,16 +443,18 @@ function buildScorecardSummary(params: { function buildExplorationSignals(params: { scorecard: ScorecardItem[] gateResults: GateResult[] + experimentValidity?: ExperimentValidity + variantEffectSummary?: VariantEffectSummary }): string[] { - const { scorecard, gateResults } = params + const { scorecard, gateResults, experimentValidity, variantEffectSummary } = params const signals: string[] = [] const changedScores = scorecard.filter(item => ['improved', 'regressed', 'changed', 'observed'].includes(item.interpretation), ) const improvedScores = scorecard.filter(item => item.interpretation === 'improved') const regressedScores = scorecard.filter(item => item.interpretation === 'regressed') - const hardOrSoftGateResults = gateResults.filter(result => - result.verdict === 'hard_fail' || result.verdict === 'soft_warning', + const hardOrSoftGateResults = gateResults.filter( + result => result.verdict === 'hard_fail' || result.verdict === 'soft_warning', ) if (changedScores.length > 0) { @@ -396,6 +472,19 @@ function buildExplorationSignals(params: { 'Risk gate raised a warning/failure, but at least one score improved; this may be worth exploratory review instead of immediate rejection.', ) } + if (variantEffectSummary?.runtime_difference_observed) { + signals.push( + 'A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas.', + ) + } + if ( + experimentValidity?.profile === 'real_experiment' && + experimentValidity.status !== 'valid' + ) { + signals.push( + `Real experiment validity is ${experimentValidity.status}; treat score deltas as provisional until the variant effect is clearly observed.`, + ) + } if (signals.length === 0) { signals.push( 'No exploratory 
signal was derived from the current automatic scorecard; manual review may still find qualitative differences.', @@ -407,8 +496,13 @@ function buildExplorationSignals(params: { function recommendReviewMode(params: { scorecard: ScorecardItem[] gateResults: GateResult[] + experimentValidity?: ExperimentValidity }): ReviewMode { - const { scorecard, gateResults } = params + const { scorecard, gateResults, experimentValidity } = params + if (experimentValidity?.profile === 'real_experiment') { + if (experimentValidity.status === 'invalid') return 'manual_review' + if (experimentValidity.status === 'inconclusive') return 'exploratory_review' + } const hasRisk = gateResults.some(result => result.verdict !== 'pass') const hasTradeoff = scorecard.some(item => item.interpretation === 'improved') && @@ -595,9 +689,13 @@ function summarizeRisk(results: ScenarioExperimentResult[]): RiskVerdict { const candidates = results.flatMap(result => result.candidates) const allGateResults = candidates.flatMap(candidate => candidate.gate_results) const hardFailCount = allGateResults.filter(result => result.verdict === 'hard_fail').length - const softWarningCount = allGateResults.filter(result => result.verdict === 'soft_warning').length + const softWarningCount = allGateResults.filter( + result => result.verdict === 'soft_warning', + ).length const missingScoreCount = allGateResults.filter(result => result.verdict === 'missing').length - const inconclusiveCount = allGateResults.filter(result => result.verdict === 'inconclusive').length + const inconclusiveCount = allGateResults.filter( + result => result.verdict === 'inconclusive', + ).length return { status: hardFailCount > 0 @@ -671,21 +769,310 @@ function reportRefs(results: ScenarioExperimentResult[], experimentReport: strin ].filter(Boolean) } +function hasPolicyEventObserved(variantEffect: JsonRecord | undefined): boolean { + return asBoolean(variantEffect?.policy_event_observed) +} + +function 
hasVariantEffectObserved(variantEffect: JsonRecord | undefined): boolean { + return asBoolean(variantEffect?.variant_effect_observed) +} + +function observedPolicyMode(variantEffect: JsonRecord | undefined): string { + const observedPolicy = variantEffect?.observed_policy + if (observedPolicy && typeof observedPolicy === 'object' && !Array.isArray(observedPolicy)) { + return asString((observedPolicy as JsonRecord).mode) || 'unknown' + } + return 'unknown' +} + +function policySignature(variantEffect: JsonRecord | undefined): string { + const observedPolicy = variantEffect?.observed_policy + if (!observedPolicy || typeof observedPolicy !== 'object' || Array.isArray(observedPolicy)) { + return '' + } + return JSON.stringify(observedPolicy) +} + +function runtimeDifferenceAnalysis(params: { + scenarioId: string + candidateVariantId: string + baselineVariantEffect: JsonRecord | undefined + candidateVariantEffect: JsonRecord | undefined + scorecard: ScorecardItem[] +}): VariantEffectSummary { + const { + scenarioId, + candidateVariantId, + baselineVariantEffect, + candidateVariantEffect, + scorecard, + } = params + const summary: string[] = [] + const baselineObserved = hasPolicyEventObserved(baselineVariantEffect) + const candidateObserved = hasPolicyEventObserved(candidateVariantEffect) + const candidateEffectObserved = hasVariantEffectObserved(candidateVariantEffect) + const baselineMode = observedPolicyMode(baselineVariantEffect) + const candidateMode = observedPolicyMode(candidateVariantEffect) + const baselinePolicySig = policySignature(baselineVariantEffect) + const candidatePolicySig = policySignature(candidateVariantEffect) + const baselineSubagentCount = asNumber( + baselineVariantEffect?.session_memory_subagent_count, + ) + const candidateSubagentCount = asNumber( + candidateVariantEffect?.session_memory_subagent_count, + ) + const baselineTriggerDetails = [ + ...asStringArray(baselineVariantEffect?.session_memory_trigger_details), + ].sort() + const 
candidateTriggerDetails = [ + ...asStringArray(candidateVariantEffect?.session_memory_trigger_details), + ].sort() + const triggerDetailsChanged = + baselineTriggerDetails.join('|') !== candidateTriggerDetails.join('|') + const policyChanged = + baselinePolicySig !== '' && + candidatePolicySig !== '' && + baselinePolicySig !== candidatePolicySig + const scoreChanged = scorecard.some(item => + ['improved', 'regressed', 'changed', 'observed'].includes(item.interpretation), + ) + + if (baselineObserved) { + summary.push(`Baseline session_memory policy was observed with mode=${baselineMode}.`) + } else { + summary.push('Baseline session_memory policy was not observed in V1 events.') + } + if (candidateObserved) { + summary.push(`Candidate session_memory policy was observed with mode=${candidateMode}.`) + } else { + summary.push('Candidate session_memory policy was not observed in V1 events.') + } + if (candidateEffectObserved) { + summary.push('Candidate sparse-policy markers were observed in runtime evidence.') + } + if (policyChanged) { + summary.push('Observed baseline and candidate session_memory policies differ.') + } + if (baselineSubagentCount !== candidateSubagentCount) { + summary.push( + `Session_memory subagent count changed from ${baselineSubagentCount} to ${candidateSubagentCount}.`, + ) + } + if (triggerDetailsChanged) { + summary.push( + `Session_memory trigger details changed from [${baselineTriggerDetails.join(', ') || 'none'}] to [${candidateTriggerDetails.join(', ') || 'none'}].`, + ) + } + if (scoreChanged) { + summary.push('At least one score dimension changed between baseline and candidate.') + } + + const runtimeDifferenceObserved = + candidateEffectObserved && + (policyChanged || + baselineSubagentCount !== candidateSubagentCount || + triggerDetailsChanged) + + if (!runtimeDifferenceObserved) { + summary.push( + 'No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.', + ) 
+ } + + return { + scenario_id: scenarioId, + candidate_variant_id: candidateVariantId, + baseline_variant_effect_observed: baselineObserved, + candidate_variant_effect_observed: candidateEffectObserved, + runtime_difference_observed: runtimeDifferenceObserved, + baseline_policy_mode: baselineMode, + candidate_policy_mode: candidateMode, + summary, + } +} + +function buildExperimentValidity(params: { + profile: ExperimentProfile + scenarioId: string + candidateVariantId: string + baselineExecution?: ExecuteHarnessResult + candidateExecution?: ExecuteHarnessResult + scorecard: ScorecardItem[] + variantEffectSummary: VariantEffectSummary +}): ExperimentValidity { + const { + profile, + scenarioId, + candidateVariantId, + baselineExecution, + candidateExecution, + scorecard, + variantEffectSummary, + } = params + const baselineCaptured = + baselineExecution === undefined || baselineExecution.capture.status === 'captured' + const candidateCaptured = + candidateExecution === undefined || candidateExecution.capture.status === 'captured' + const noAmbiguousCapture = + baselineExecution?.capture.status !== 'ambiguous_capture' && + candidateExecution?.capture.status !== 'ambiguous_capture' + const scoreEvidencePresent = scorecard.some(item => item.interpretation !== 'missing') + const variantEffectObserved = variantEffectSummary.candidate_variant_effect_observed + const scenarioIntentMatched = + profile === 'smoke' + ? 
baselineCaptured && candidateCaptured + : variantEffectObserved && variantEffectSummary.runtime_difference_observed + + const blockers: string[] = [] + const warnings: string[] = [] + if (!baselineCaptured) { + blockers.push( + `baseline_not_captured: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if (!candidateCaptured) { + blockers.push( + `candidate_not_captured: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if (!noAmbiguousCapture) { + blockers.push( + `ambiguous_capture_present: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if (!scoreEvidencePresent) { + blockers.push( + `score_evidence_missing: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if (profile === 'real_experiment' && !variantEffectObserved) { + blockers.push( + `variant_effect_not_observed: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if ( + profile === 'real_experiment' && + variantEffectObserved && + !variantEffectSummary.runtime_difference_observed + ) { + warnings.push( + `runtime_difference_not_observed: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + if (profile === 'real_experiment' && !scenarioIntentMatched) { + warnings.push( + `scenario_intent_not_matched: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } + + const status: ExperimentValidity['status'] = + blockers.length > 0 ? 'invalid' : warnings.length > 0 ? 'inconclusive' : 'valid' + const reason = + status === 'valid' + ? profile === 'smoke' + ? 'Smoke check passed: execute_harness closed the automatic execution and capture loop.' + : 'Real experiment is valid: runtime effect was observed and the baseline/candidate difference is interpretable.' + : status === 'invalid' + ? 
`Experiment is invalid because: ${blockers.join('; ')}` + : `Experiment is inconclusive because: ${warnings.join('; ')}` + + return { + status, + profile, + reason, + blockers, + warnings, + checks: { + baseline_captured: baselineCaptured, + candidate_captured: candidateCaptured, + no_ambiguous_capture: noAmbiguousCapture, + score_evidence_present: scoreEvidencePresent, + variant_effect_observed: variantEffectObserved, + runtime_difference_observed: variantEffectSummary.runtime_difference_observed, + scenario_intent_matched: scenarioIntentMatched, + }, + } +} + +function aggregateExperimentValidity(results: ScenarioExperimentResult[]): ExperimentValidity { + const validities = results.flatMap(result => + result.candidates + .map(candidate => candidate.experiment_validity) + .filter((value): value is ExperimentValidity => Boolean(value)), + ) + const blockers = validities.flatMap(validity => validity.blockers) + const warnings = validities.flatMap(validity => validity.warnings) + const status: ExperimentValidity['status'] = + validities.some(validity => validity.status === 'invalid') + ? 'invalid' + : validities.some(validity => validity.status === 'inconclusive') + ? 'inconclusive' + : 'valid' + const profile = validities[0]?.profile ?? 'smoke' + return { + status, + profile, + reason: + status === 'valid' + ? profile === 'smoke' + ? 'Smoke check remains healthy.' + : 'Real experiment remains interpretable.' + : status === 'invalid' + ? 
`At least one scenario/candidate pair is invalid: ${blockers.join('; ')}` + : `At least one scenario/candidate pair is inconclusive: ${warnings.join('; ')}`, + blockers, + warnings, + checks: { + baseline_captured: validities.every(validity => validity.checks.baseline_captured), + candidate_captured: validities.every(validity => validity.checks.candidate_captured), + no_ambiguous_capture: validities.every(validity => validity.checks.no_ambiguous_capture), + score_evidence_present: validities.every(validity => validity.checks.score_evidence_present), + variant_effect_observed: validities.every(validity => validity.checks.variant_effect_observed), + runtime_difference_observed: validities.every( + validity => validity.checks.runtime_difference_observed, + ), + scenario_intent_matched: validities.every( + validity => validity.checks.scenario_intent_matched, + ), + }, + } +} + +function aggregateVariantEffectSummary(results: ScenarioExperimentResult[]): VariantEffectSummary[] { + return results.flatMap(result => + result.candidates + .map(candidate => candidate.variant_effect_summary) + .filter((value): value is VariantEffectSummary => Boolean(value)), + ) +} + function buildMarkdownReport(params: { experiment: EvalExperimentV21 results: ScenarioExperimentResult[] outputJson: string + riskVerdict: RiskVerdict + experimentValidity: ExperimentValidity + scorecardSummary: ScorecardItem[] + explorationSignals: string[] + recommendedReviewMode: ReviewMode + variantEffectSummary: VariantEffectSummary[] }): string { - const { experiment, results, outputJson } = params + const { + experiment, + results, + outputJson, + riskVerdict, + experimentValidity, + scorecardSummary, + explorationSignals, + recommendedReviewMode, + variantEffectSummary, + } = params const allGateResults = results.flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results), ) - const hardFailures = allGateResults.filter( - result => result.verdict === 'hard_fail', - ) - const 
softWarnings = allGateResults.filter( - result => result.verdict === 'soft_warning', - ) + const hardFailures = allGateResults.filter(result => result.verdict === 'hard_fail') + const softWarnings = allGateResults.filter(result => result.verdict === 'soft_warning') const missingOrInconclusive = allGateResults.filter( result => result.verdict === 'missing' || result.verdict === 'inconclusive', ) @@ -696,7 +1083,8 @@ function buildMarkdownReport(params: { const gateSummary = candidate.gate_results.length ? `${candidate.gate_results.filter(gate => gate.verdict !== 'pass').length}/${candidate.gate_results.length} not passed` : 'not configured' - return `| ${result.scenario_id} | ${result.repeat_index} | ${result.baseline_run_id} | ${candidate.candidate_variant_id} | ${candidate.candidate_run_id} | ${gateSummary} | ${candidate.compare_report} |` + const validityStatus = candidate.experiment_validity?.status ?? 'unknown' + return `| ${result.scenario_id} | ${result.repeat_index} | ${result.baseline_run_id} | ${candidate.candidate_variant_id} | ${candidate.candidate_run_id} | ${validityStatus} | ${gateSummary} | ${candidate.compare_report} |` }), ) .join('\n') @@ -710,20 +1098,87 @@ function buildMarkdownReport(params: { `| ${result.scenario_id} | ${result.candidate_variant_id} | ${result.rule_type} | ${result.score_spec_id} | ${result.verdict} | ${result.regression_pct ?? 'n/a'} |`, ) .join('\n') - const scorecardRows = aggregateScorecard(results) + + const scorecardRows = scorecardSummary .map( item => `| ${item.scenario_id} | ${item.candidate_variant_id} | ${item.score_spec_id} | ${item.baseline_value ?? 'n/a'} | ${item.candidate_value ?? 'n/a'} | ${item.delta ?? 
'n/a'} | ${item.interpretation} |`, ) .join('\n') - const explorationRows = aggregateExplorationSignals(results) - .map(signal => `- ${signal}`) - .join('\n') - const reviewMode = aggregateReviewMode(results) + + const explorationRows = explorationSignals.map(signal => `- ${signal}`).join('\n') + const variantEffectRows = + variantEffectSummary.length === 0 + ? '- No variant effect evidence summary was generated.' + : variantEffectSummary + .map( + item => + `- ${item.scenario_id} / ${item.candidate_variant_id}: baseline_mode=${item.baseline_policy_mode}, candidate_mode=${item.candidate_policy_mode}, candidate_effect_observed=${item.candidate_variant_effect_observed}, runtime_difference_observed=${item.runtime_difference_observed}`, + ) + .join('\n') + + const runtimeDifferenceRows = + variantEffectSummary.length === 0 + ? '- No runtime difference summary available.' + : variantEffectSummary + .flatMap(item => + item.summary.map( + summary => + `- ${item.scenario_id} / ${item.candidate_variant_id}: ${summary}`, + ), + ) + .join('\n') + + const validityRows = [ + `- status: ${experimentValidity.status}`, + `- profile: ${experimentValidity.profile}`, + `- baseline_captured: ${experimentValidity.checks.baseline_captured}`, + `- candidate_captured: ${experimentValidity.checks.candidate_captured}`, + `- no_ambiguous_capture: ${experimentValidity.checks.no_ambiguous_capture}`, + `- score_evidence_present: ${experimentValidity.checks.score_evidence_present}`, + `- variant_effect_observed: ${experimentValidity.checks.variant_effect_observed}`, + `- runtime_difference_observed: ${experimentValidity.checks.runtime_difference_observed}`, + `- scenario_intent_matched: ${experimentValidity.checks.scenario_intent_matched}`, + `- reason: ${experimentValidity.reason}`, + ].join('\n') + + const validityNotes = [ + ...experimentValidity.blockers.map(item => `- blocker: ${item}`), + ...experimentValidity.warnings.map(item => `- warning: ${item}`), + ].join('\n') + + const 
reportProfile: ExperimentProfile = experiment.report_profile ?? 'smoke' + const profileSection = + reportProfile === 'smoke' + ? `## Smoke Check + +- requested_mode: ${experiment.mode ?? 'bind_existing'} +- execute_harness_loop_closed: ${experimentValidity.checks.baseline_captured && experimentValidity.checks.candidate_captured} +- note: This profile validates the automatic pipeline, not harness value.` + : `## Real Experiment + +- requested_mode: ${experiment.mode ?? 'bind_existing'} +- evaluation_intent: ${experiment.evaluation_intent ?? 'exploration'} +- candidate_runtime_effect_observed: ${experimentValidity.checks.variant_effect_observed} +- runtime_difference_observed: ${experimentValidity.checks.runtime_difference_observed} +- note: This profile asks whether the candidate changed runtime behavior in an interpretable way.` + + const interpretationLimits = + reportProfile === 'smoke' + ? [ + '- Smoke only proves the automatic execute_harness -> capture -> run/score/report loop is healthy.', + '- Smoke does not prove a candidate harness change is beneficial.', + ].join('\n') + : [ + '- This real experiment remains single-scenario and single-run; it is not yet a stability study.', + experimentValidity.checks.variant_effect_observed + ? '- Candidate runtime effect was observed, but qualitative harness value still needs broader experiments.' + : '- Candidate runtime effect was not observed cleanly enough; do not treat score deltas as a reliable judgment.', + ].join('\n') return `# V2 Experiment Summary: ${experiment.experiment_id} -## 理解清单 +## Understanding - experiment: ${experiment.experiment_id} - mode: ${experiment.mode ?? 'bind_existing'} @@ -734,26 +1189,42 @@ function buildMarkdownReport(params: { - gate_policy: ${experiment.gate_policy_id ?? 'not configured'} - output_json: ${outputJson} -## 预期效果 +## Expected Outcome + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. 
In execute_harness mode, V2 executes the scenario first, then captures the generated user_action_id through benchmark_run_id. -This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2.2-alpha executes the scenario first, then captures the generated user_action_id through benchmark_run_id. +## Design Rationale -## 设计思路 +The runner always scores only trace-backed V1 facts. V2.2-beta adds runtime-effect evidence and experiment-validity semantics so smoke and real experiments are not confused with each other. -The runner always scores only trace-backed V1 facts. V2.2-alpha adds an execution front half, but the score/compare/gate back half is the same fact-only pipeline used by V2.1. +${profileSection} ## Risk Verdict - hard_failures: ${hardFailures.length} - soft_warnings: ${softWarnings.length} - missing_or_inconclusive: ${missingOrInconclusive.length} -- risk_status: ${hardFailures.length > 0 ? 'failed' : missingOrInconclusive.length > 0 ? 'inconclusive' : softWarnings.length > 0 ? 'warning' : 'passed'} +- risk_status: ${riskVerdict.status} - scope: regression_risk_only - final_experiment_judgment: false -- recommended_review_mode: ${reviewMode} +- recommended_review_mode: ${recommendedReviewMode} This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. 
+## Variant Effect Evidence + +${variantEffectRows} + +## Experiment Validity + +${validityRows} + +${validityNotes || '- No additional blockers or warnings.'} + +## Runtime Difference Summary + +${runtimeDifferenceRows} + ## Scorecard Summary | scenario | candidate_variant | score | baseline | candidate | delta | interpretation | @@ -766,8 +1237,8 @@ ${explorationRows || '- No exploration signal generated.'} ## Runs -| scenario | repeat | baseline_run | candidate_variant | candidate_run | risk_gate | compare_report | -| --- | ---: | --- | --- | --- | --- | --- | +| scenario | repeat | baseline_run | candidate_variant | candidate_run | experiment_validity | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | --- | ${rows} ## Risk Gate Details @@ -775,6 +1246,10 @@ ${rows} | scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | | --- | --- | --- | --- | --- | ---: | ${gateRows} + +## Interpretation Limits + +${interpretationLimits} ` } @@ -801,12 +1276,8 @@ async function main(): Promise { 'execute_harness is disabled and this experiment does not allow bind_existing fallback.', ) } - if (mode !== 'bind_existing') { - if (mode !== 'execute_harness') { - throw new Error( - `Unsupported V2 experiment mode: ${mode}`, - ) - } + if (mode !== 'bind_existing' && mode !== 'execute_harness') { + throw new Error(`Unsupported V2 experiment mode: ${mode}`) } const scenarioIds = experiment.scenario_ids ?? [] @@ -820,9 +1291,7 @@ async function main(): Promise { const snapshotDb = !Boolean(args['no-snapshot-db']) for (const scoreSpecId of experiment.score_spec_ids ?? 
[]) { if (!scoreSpecs.has(scoreSpecId)) { - throw new Error( - `Experiment references missing score_spec_id: ${scoreSpecId}`, - ) + throw new Error(`Experiment references missing score_spec_id: ${scoreSpecId}`) } } if (experiment.gate_policy_id && !gatePolicy) { @@ -837,23 +1306,27 @@ async function main(): Promise { ) } } + const repeatCount = Math.max(experiment.repeat_count ?? 1, 1) if (mode === 'execute_harness') { if (scenarioIds.length !== 1) { - throw new Error('V2.2-alpha execute_harness supports exactly one scenario.') + throw new Error('V2.2 execute_harness supports exactly one scenario.') } if (experiment.candidate_variant_ids.length !== 1) { - throw new Error('V2.2-alpha execute_harness supports exactly one candidate variant.') + throw new Error('V2.2 execute_harness supports exactly one candidate variant.') } if (repeatCount !== 1) { - throw new Error('V2.2-alpha execute_harness supports repeat_count=1 only.') + throw new Error('V2.2 execute_harness supports repeat_count=1 only.') } } - const results: ScenarioExperimentResult[] = [] + const results: ScenarioExperimentResult[] = [] if (mode === 'bind_existing') { for (const scenarioId of scenarioIds) { - for (const variantId of [experiment.baseline_variant_id, ...experiment.candidate_variant_ids]) { + for (const variantId of [ + experiment.baseline_variant_id, + ...experiment.candidate_variant_ids, + ]) { const userActionId = findBoundUserActionId({ experiment, scenarioId, @@ -882,6 +1355,7 @@ async function main(): Promise { let baselineExecution: ExecuteHarnessResult | undefined let baselineEvalRunId: string | undefined let baselineBenchmarkRunId: string | undefined + if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) const baselineVariant = await loadVariant(experiment.baseline_variant_id) @@ -907,6 +1381,7 @@ async function main(): Promise { result: baselineExecution, }) } + if (!baselineUserActionId) { throw new Error( `Missing action binding for 
scenario=${scenarioId}, variant=${experiment.baseline_variant_id}. bind_existing mode requires user_action_id bindings.`, @@ -928,6 +1403,7 @@ async function main(): Promise { const baselineScores = await readJson( path.join(scoresRoot, `${baselineRunId}.scores.json`), ) + const baselineRunArtifact = await readRunArtifact(baselineRunId) const candidates: CandidateExperimentResult[] = [] for (const candidateVariantId of experiment.candidate_variant_ids) { @@ -939,6 +1415,7 @@ async function main(): Promise { let candidateExecution: ExecuteHarnessResult | undefined let candidateEvalRunId: string | undefined let candidateBenchmarkRunId: string | undefined + if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) const candidateVariant = await loadVariant(candidateVariantId) @@ -964,6 +1441,7 @@ async function main(): Promise { result: candidateExecution, }) } + if (!candidateActionId) { throw new Error( `Missing candidate user_action_id for scenario=${scenarioId}, variant=${candidateVariantId}`, @@ -985,6 +1463,7 @@ async function main(): Promise { const candidateScores = await readJson( path.join(scoresRoot, `${candidateRunId}.scores.json`), ) + const candidateRunArtifact = await readRunArtifact(candidateRunId) const compareOutput = runBunScript('scripts/evals/v2_compare_runs.ts', [ '--baseline-run', @@ -1008,6 +1487,22 @@ async function main(): Promise { baselineScores, candidateScores, }) + const variantEffect = runtimeDifferenceAnalysis({ + scenarioId, + candidateVariantId, + baselineVariantEffect: baselineRunArtifact.variant_effect, + candidateVariantEffect: candidateRunArtifact.variant_effect, + scorecard, + }) + const experimentValidityForCandidate = buildExperimentValidity({ + profile: experiment.report_profile ?? 
'smoke', + scenarioId, + candidateVariantId, + baselineExecution, + candidateExecution, + scorecard, + variantEffectSummary: variantEffect, + }) candidates.push({ candidate_variant_id: candidateVariantId, @@ -1016,16 +1511,23 @@ async function main(): Promise { candidate_eval_run_id: candidateEvalRunId, candidate_benchmark_run_id: candidateBenchmarkRunId, candidate_execution: candidateExecution, + baseline_variant_effect: baselineRunArtifact.variant_effect, + candidate_variant_effect: candidateRunArtifact.variant_effect, + variant_effect_summary: variantEffect, + experiment_validity: experimentValidityForCandidate, compare_report: extractCreatedReport(compareOutput), gate_results: gateResults, scorecard_summary: scorecard, exploration_signals: buildExplorationSignals({ scorecard, gateResults, + experimentValidity: experimentValidityForCandidate, + variantEffectSummary: variantEffect, }), recommended_review_mode: recommendReviewMode({ scorecard, gateResults, + experimentValidity: experimentValidityForCandidate, }), }) } @@ -1062,13 +1564,23 @@ async function main(): Promise { const scorecardSummary = aggregateScorecard(results) const explorationSignals = aggregateExplorationSignals(results) const recommendedReviewMode = aggregateReviewMode(results) + const variantEffectSummary = aggregateVariantEffectSummary(results) + const experimentValidity = aggregateExperimentValidity(results) + const warningMessages = results .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) - .filter(result => result.verdict === 'soft_warning' || result.verdict === 'missing' || result.verdict === 'inconclusive') + .filter( + result => + result.verdict === 'soft_warning' || + result.verdict === 'missing' || + result.verdict === 'inconclusive', + ) .map( result => `${result.verdict}: scenario=${result.scenario_id}, candidate=${result.candidate_variant_id}, score=${result.score_spec_id}`, ) + warningMessages.push(...experimentValidity.warnings) + const 
errorMessages = results .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) .filter(result => result.verdict === 'hard_fail') @@ -1076,6 +1588,8 @@ async function main(): Promise { result => `hard_fail: scenario=${result.scenario_id}, candidate=${result.candidate_variant_id}, score=${result.score_spec_id}`, ) + errorMessages.push(...experimentValidity.blockers) + await writeFile( outputJsonPath, `${JSON.stringify( @@ -1086,11 +1600,16 @@ async function main(): Promise { mode, requested_mode: requestedMode, automation_disabled: automationDisabled, + report_profile: experiment.report_profile ?? 'smoke', + evaluation_intent: experiment.evaluation_intent ?? null, run_refs: runRefs(results), score_refs: scoreRefs(results), report_refs: reportRefs(results, outputMarkdownRel), risk_verdict: riskVerdict, gate_verdict: riskVerdict, + experiment_validity: experimentValidity, + variant_effect_summary: variantEffectSummary, + runtime_difference_summary: variantEffectSummary.flatMap(item => item.summary), verdict_boundary: 'risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.', scorecard_summary: scorecardSummary, @@ -1133,6 +1652,12 @@ async function main(): Promise { experiment, results, outputJson: outputJsonRel, + riskVerdict, + experimentValidity, + scorecardSummary, + explorationSignals, + recommendedReviewMode, + variantEffectSummary, }), ) diff --git a/scripts/evals/v2_score_registry.ts b/scripts/evals/v2_score_registry.ts index e23cfde03c..26650bc172 100644 --- a/scripts/evals/v2_score_registry.ts +++ b/scripts/evals/v2_score_registry.ts @@ -11,6 +11,7 @@ export interface V2ScoreInput { tools: JsonRecord[] subagents: JsonRecord[] recoveries: JsonRecord[] + variantEffect?: JsonRecord } type V2ScoreScorer = (input: V2ScoreInput) => EvalScore @@ -183,6 +184,28 @@ export const V2_SCORE_SCORERS: Record = { reason: 'Observed subagent count is a fact for later baseline vs candidate comparison.', }), + 
'decision_quality.session_memory_policy_observed': ({ runId, variantEffect }) => { + const observed = + variantEffect && + (variantEffect.variant_effect_observed === true || + variantEffect.policy_event_observed === true) + ? 1 + : 0 + return { + score_id: `${runId}_decision_quality_session_memory_policy_observed`, + run_id: runId, + dimension: 'decision_quality', + subdimension: 'session_memory_policy_observed', + score_value: observed, + score_label: 'observed', + evidence_ref: 'variant_effect', + reason: + observed === 1 + ? 'Session-memory runtime policy was observed in trace-backed evidence.' + : 'No session-memory runtime policy observation was found for this run.', + } + }, + 'controllability.subagent_count_budget': ({ runId, scenario, subagents }) => { const limit = scenario.max_subagent_count const count = subagentCount(subagents) diff --git a/scripts/evals/v2_validate_experiment_artifacts.ts b/scripts/evals/v2_validate_experiment_artifacts.ts index cbb737235f..40d0c8fc46 100644 --- a/scripts/evals/v2_validate_experiment_artifacts.ts +++ b/scripts/evals/v2_validate_experiment_artifacts.ts @@ -6,6 +6,9 @@ type JsonRecord = Record const repoRoot = path.resolve(import.meta.dirname, '..', '..') const experimentRunsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'experiment-runs') const gateStatuses = new Set(['pass', 'warning', 'fail', 'inconclusive']) +const validityStatuses = new Set(['valid', 'invalid', 'inconclusive']) +const reportProfiles = new Set(['smoke', 'real_experiment']) +const evaluationIntents = new Set(['regression', 'exploration']) async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as JsonRecord @@ -40,6 +43,12 @@ function requireOptionalString( } } +function requireObject(errors: string[], filePath: string, fieldName: string, value: unknown) { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + errors.push(`${filePath}.${fieldName} must be an object`) + } +} + 
function validateArtifact(filePath: string, artifact: JsonRecord): string[] { const errors: string[] = [] requireString(errors, filePath, 'experiment_id', artifact.experiment_id) @@ -51,6 +60,19 @@ function validateArtifact(filePath: string, artifact: JsonRecord): string[] { requireArray(errors, filePath, 'report_refs', artifact.report_refs) requireArray(errors, filePath, 'errors', artifact.errors) requireArray(errors, filePath, 'warnings', artifact.warnings) + if ( + artifact.report_profile !== undefined && + !reportProfiles.has(String(artifact.report_profile)) + ) { + errors.push(`${filePath}.report_profile has invalid value: ${artifact.report_profile}`) + } + if ( + artifact.evaluation_intent !== undefined && + artifact.evaluation_intent !== null && + !evaluationIntents.has(String(artifact.evaluation_intent)) + ) { + errors.push(`${filePath}.evaluation_intent has invalid value: ${artifact.evaluation_intent}`) + } const riskVerdict = (artifact.risk_verdict ?? artifact.gate_verdict) as JsonRecord | undefined if (!riskVerdict || typeof riskVerdict !== 'object' || Array.isArray(riskVerdict)) { @@ -78,6 +100,25 @@ function validateArtifact(filePath: string, artifact: JsonRecord): string[] { if (artifact.exploration_signals !== undefined) { requireArray(errors, filePath, 'exploration_signals', artifact.exploration_signals) } + if (artifact.variant_effect_summary !== undefined) { + requireArray(errors, filePath, 'variant_effect_summary', artifact.variant_effect_summary) + } + if (artifact.runtime_difference_summary !== undefined) { + requireArray(errors, filePath, 'runtime_difference_summary', artifact.runtime_difference_summary) + } + if (artifact.experiment_validity !== undefined) { + requireObject(errors, filePath, 'experiment_validity', artifact.experiment_validity) + const validity = artifact.experiment_validity as JsonRecord + if (!validityStatuses.has(String(validity.status))) { + errors.push( + `${filePath}.experiment_validity.status has invalid value: 
${validity.status}`, + ) + } + requireOptionalString(errors, `${filePath}.experiment_validity`, 'profile', validity.profile) + requireOptionalString(errors, `${filePath}.experiment_validity`, 'reason', validity.reason) + requireArray(errors, `${filePath}.experiment_validity`, 'blockers', validity.blockers) + requireArray(errors, `${filePath}.experiment_validity`, 'warnings', validity.warnings) + } requireOptionalString( errors, filePath, diff --git a/scripts/evals/v2_validate_manifests.ts b/scripts/evals/v2_validate_manifests.ts index dace60be88..1ae50680f6 100644 --- a/scripts/evals/v2_validate_manifests.ts +++ b/scripts/evals/v2_validate_manifests.ts @@ -41,6 +41,8 @@ const scoreDirections = new Set([ ]) const automationLevels = new Set(['automatic', 'manual_review', 'mixed']) const experimentModes = new Set(['bind_existing', 'execute_harness']) +const reportProfiles = new Set(['smoke', 'real_experiment']) +const evaluationIntents = new Set(['regression', 'exploration']) interface ValidationContext { scenarioIds: Set @@ -93,6 +95,17 @@ function requireOptionalNumber( } } +function requireOptionalString( + errors: string[], + objectName: string, + fieldName: string, + value: unknown, +) { + if (value !== undefined && typeof value !== 'string') { + errors.push(`${objectName}.${fieldName} must be a string when present`) + } +} + function isFlatActionBinding( binding: EvalExperimentActionBinding, ): binding is EvalExperimentFlatActionBinding { @@ -134,6 +147,10 @@ function validateScenario(filePath: string, scenario: EvalScenario): string[] { requireArray(errors, filePath, 'expected_tools', scenario.expected_tools) requireArray(errors, filePath, 'expected_skills', scenario.expected_skills) requireArray(errors, filePath, 'expected_constraints', scenario.expected_constraints) + if (scenario.expected_observations !== undefined) { + requireArray(errors, filePath, 'expected_observations', scenario.expected_observations) + } + requireOptionalString(errors, filePath, 
'evaluation_note', scenario.evaluation_note) requireOptionalNumber(errors, filePath, 'max_turn_count', scenario.max_turn_count) requireOptionalNumber( errors, @@ -225,6 +242,20 @@ function validateExperiment( ) { errors.push(`${filePath}.mode has invalid value: ${experiment.mode}`) } + if ( + experiment.report_profile !== undefined && + !reportProfiles.has(experiment.report_profile) + ) { + errors.push(`${filePath}.report_profile has invalid value: ${experiment.report_profile}`) + } + if ( + experiment.evaluation_intent !== undefined && + !evaluationIntents.has(experiment.evaluation_intent) + ) { + errors.push( + `${filePath}.evaluation_intent has invalid value: ${experiment.evaluation_intent}`, + ) + } if (experiment.action_bindings !== undefined) { requireArray(errors, filePath, 'action_bindings', experiment.action_bindings) for (const [index, binding] of experiment.action_bindings.entries()) { diff --git a/scripts/evals/v2_verify_execute_harness_alpha.ts b/scripts/evals/v2_verify_execute_harness_alpha.ts index 2db9cc568f..82ed315e20 100644 --- a/scripts/evals/v2_verify_execute_harness_alpha.ts +++ b/scripts/evals/v2_verify_execute_harness_alpha.ts @@ -373,6 +373,7 @@ async function main(): Promise { no_snapshot_db: true, manifest: fixtureExperiment({ id: `v2_2_verify_variant_apply_failed_${stamp}`, + baselineVariantId: 'candidate_tool_router_v2', execution: { ...fixtureExecution(missingCaptureDb), require_config_snapshot: true, diff --git a/scripts/evals/v2_windows_spawn_bridge.cjs b/scripts/evals/v2_windows_spawn_bridge.cjs new file mode 100644 index 0000000000..7084baf490 --- /dev/null +++ b/scripts/evals/v2_windows_spawn_bridge.cjs @@ -0,0 +1,79 @@ +const fs = require('node:fs') +const path = require('node:path') +const { spawnSync } = require('node:child_process') + +function parseArgs(argv) { + const args = {} + for (let index = 0; index < argv.length; index += 1) { + const token = argv[index] + if (!token.startsWith('--')) continue + const key = 
token.slice(2) + const value = argv[index + 1] + if (!value || value.startsWith('--')) { + args[key] = true + continue + } + args[key] = value + index += 1 + } + return args +} + +function writeResult(resultPath, payload) { + fs.mkdirSync(path.dirname(resultPath), { recursive: true }) + fs.writeFileSync(resultPath, `${JSON.stringify(payload, null, 2)}\n`, 'utf8') +} + +function main() { + const args = parseArgs(process.argv.slice(2)) + const requestPath = args.request + const resultPath = args.result + if (typeof requestPath !== 'string' || typeof resultPath !== 'string') { + throw new Error('Usage: node v2_windows_spawn_bridge.cjs --request --result ') + } + + const request = JSON.parse(fs.readFileSync(requestPath, 'utf8')) + const result = spawnSync(request.command, request.args ?? [], { + cwd: request.cwd, + env: { + ...process.env, + ...(request.env ?? {}), + }, + encoding: 'utf8', + input: request.stdin_text, + timeout: request.timeout_ms, + }) + + writeResult(resultPath, { + command: request.command, + args: request.args ?? [], + cwd: request.cwd, + child_status: result.status, + signal: result.signal ?? null, + timed_out: result.error?.name === 'ETIMEDOUT', + error_name: result.error?.name ?? null, + error_message: result.error?.message ?? null, + stdout: String(result.stdout ?? ''), + stderr: String(result.stderr ?? ''), + }) +} + +try { + main() +} catch (error) { + const args = parseArgs(process.argv.slice(2)) + if (typeof args.result === 'string') { + writeResult(args.result, { + child_status: null, + signal: null, + timed_out: false, + error_name: error instanceof Error ? error.name : 'Error', + error_message: error instanceof Error ? error.message : String(error), + stdout: '', + stderr: '', + }) + } else { + process.stderr.write(`${error instanceof Error ? error.stack ?? 
error.message : String(error)}\n`) + } + process.exit(1) +} diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index 9dfe775c95..56c6bfa867 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -38,6 +38,8 @@ export interface EvalScenario { expected_tools: string[] expected_skills: string[] expected_constraints: string[] + expected_observations?: string[] + evaluation_note?: string max_turn_count?: number max_total_billed_tokens?: number max_subagent_count?: number @@ -116,5 +118,7 @@ export interface EvalExperiment { baseline_variant_id: string candidate_variant_ids: string[] scenario_set_id: string + report_profile?: 'smoke' | 'real_experiment' + evaluation_intent?: 'regression' | 'exploration' status: EvalExperimentStatus } diff --git a/src/services/SessionMemory/sessionMemory.ts b/src/services/SessionMemory/sessionMemory.ts index dd7c7ce850..be52e94024 100644 --- a/src/services/SessionMemory/sessionMemory.ts +++ b/src/services/SessionMemory/sessionMemory.ts @@ -4,8 +4,9 @@ * without interrupting the main conversation flow. 
*/ +import { existsSync, readFileSync } from 'fs' import { writeFile } from 'fs/promises' -import memoize from 'lodash-es/memoize.js' +import path from 'node:path' import { feature } from 'bun:bundle' import { getIsRemoteMode } from '../../bootstrap/state.js' import { getSystemPrompt } from '../../constants/prompts.js' @@ -42,6 +43,7 @@ import { asSystemPrompt } from '../../utils/systemPromptType.js' import { getTokenUsage, tokenCountWithEstimation } from '../../utils/tokens.js' import { logEvent } from '../analytics/index.js' import { isAutoCompactEnabled } from '../compact/autoCompact.js' +import { emitHarnessEvent } from '../../observability/harness.js' import { buildSessionMemoryUpdatePrompt, loadSessionMemoryTemplate, @@ -98,12 +100,379 @@ function getSessionMemoryRemoteConfig(): Partial { // ============================================================================ let lastMemoryMessageUuid: string | undefined +let sessionMemoryRuntimeInitialized = false +let sessionMemoryNaturalBreakOnly = false +let sessionMemorySnapshotPolicyLoaded = false +let sessionMemorySnapshotPolicy: + | { + mode?: string + natural_break_only?: boolean + token_threshold_multiplier?: number + tool_threshold_multiplier?: number + minimum_message_tokens_to_init?: number + minimum_tokens_between_update?: number + tool_calls_between_updates?: number + force_enabled?: boolean + } + | null = null +let sessionMemoryRuntimePolicy: { + mode: 'default' | 'sparse' | 'custom' + source: string + gate_enabled: boolean + force_enabled: boolean + query_source_supported: boolean + natural_break_only: boolean + token_threshold_multiplier: number + tool_threshold_multiplier: number + minimum_message_tokens_to_init: number + minimum_tokens_between_update: number + tool_calls_between_updates: number +} = { + mode: 'default', + source: 'default_config', + gate_enabled: false, + force_enabled: false, + query_source_supported: true, + natural_break_only: false, + token_threshold_multiplier: 1, + 
tool_threshold_multiplier: 1, + minimum_message_tokens_to_init: + DEFAULT_SESSION_MEMORY_CONFIG.minimumMessageTokensToInit, + minimum_tokens_between_update: + DEFAULT_SESSION_MEMORY_CONFIG.minimumTokensBetweenUpdate, + tool_calls_between_updates: + DEFAULT_SESSION_MEMORY_CONFIG.toolCallsBetweenUpdates, +} +const emittedPolicyObservationKeys = new Set() /** * Reset the last memory message UUID (for testing) */ export function resetLastMemoryMessageUuid(): void { lastMemoryMessageUuid = undefined + sessionMemoryRuntimeInitialized = false + sessionMemoryNaturalBreakOnly = false + sessionMemorySnapshotPolicyLoaded = false + sessionMemorySnapshotPolicy = null + emittedPolicyObservationKeys.clear() + sessionMemoryRuntimePolicy = { + mode: 'default', + source: 'default_config', + gate_enabled: false, + force_enabled: false, + query_source_supported: true, + natural_break_only: false, + token_threshold_multiplier: 1, + tool_threshold_multiplier: 1, + minimum_message_tokens_to_init: + DEFAULT_SESSION_MEMORY_CONFIG.minimumMessageTokensToInit, + minimum_tokens_between_update: + DEFAULT_SESSION_MEMORY_CONFIG.minimumTokensBetweenUpdate, + tool_calls_between_updates: + DEFAULT_SESSION_MEMORY_CONFIG.toolCallsBetweenUpdates, + } +} + +function parseBooleanEnv(name: string): boolean | undefined { + const value = process.env[name]?.trim().toLowerCase() + if (!value) return undefined + if (['1', 'true', 'yes', 'on'].includes(value)) return true + if (['0', 'false', 'no', 'off'].includes(value)) return false + return undefined +} + +function parsePositiveNumberEnv(name: string): number | undefined { + const raw = process.env[name]?.trim() + if (!raw) return undefined + const value = Number(raw) + if (!Number.isFinite(value) || value <= 0) return undefined + return value +} + +function roundPositive(value: number): number { + return Math.max(1, Math.round(value)) +} + +function loadSessionMemorySnapshotPolicy(): + | { + mode?: string + natural_break_only?: boolean + 
token_threshold_multiplier?: number + tool_threshold_multiplier?: number + minimum_message_tokens_to_init?: number + minimum_tokens_between_update?: number + tool_calls_between_updates?: number + force_enabled?: boolean + } + | null { + if (sessionMemorySnapshotPolicyLoaded) { + return sessionMemorySnapshotPolicy + } + sessionMemorySnapshotPolicyLoaded = true + + const snapshotRef = process.env.CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF?.trim() + if (!snapshotRef || !snapshotRef.toLowerCase().endsWith('.json')) { + sessionMemorySnapshotPolicy = null + return sessionMemorySnapshotPolicy + } + + const snapshotPath = path.isAbsolute(snapshotRef) + ? snapshotRef + : path.resolve(process.cwd(), snapshotRef) + if (!existsSync(snapshotPath)) { + sessionMemorySnapshotPolicy = null + return sessionMemorySnapshotPolicy + } + + try { + const parsed = JSON.parse(readFileSync(snapshotPath, 'utf8')) as unknown + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + sessionMemorySnapshotPolicy = null + return sessionMemorySnapshotPolicy + } + const policy = (parsed as Record).session_memory_policy + if (!policy || typeof policy !== 'object' || Array.isArray(policy)) { + sessionMemorySnapshotPolicy = null + return sessionMemorySnapshotPolicy + } + sessionMemorySnapshotPolicy = policy as typeof sessionMemorySnapshotPolicy + return sessionMemorySnapshotPolicy + } catch { + sessionMemorySnapshotPolicy = null + return sessionMemorySnapshotPolicy + } +} + +function isEvalSessionMemorySdkAllowed(): boolean { + return ( + Boolean(process.env.CLAUDE_CODE_EVAL_EXPERIMENT_ID) || + parseBooleanEnv('CLAUDE_CODE_SESSION_MEMORY_ALLOW_SDK') === true + ) +} + +function isSessionMemoryQuerySourceSupported( + querySource: REPLHookContext['querySource'], +): boolean { + return ( + querySource === 'repl_main_thread' || + (querySource === 'sdk' && isEvalSessionMemorySdkAllowed()) + ) +} + +function buildSessionMemoryRuntimePolicy(params: { + gateEnabled: boolean + querySource: 
REPLHookContext['querySource'] +}): { + enabled: boolean + config: SessionMemoryConfig + policy: typeof sessionMemoryRuntimePolicy +} { + const remoteConfig = getSessionMemoryRemoteConfig() + const snapshotPolicy = loadSessionMemorySnapshotPolicy() + const forceEnabled = + snapshotPolicy?.force_enabled === true || + parseBooleanEnv('CLAUDE_CODE_SESSION_MEMORY_FORCE_ENABLE') === true + const querySourceSupported = isSessionMemoryQuerySourceSupported( + params.querySource, + ) + const policyEnv = process.env.CLAUDE_CODE_SESSION_MEMORY_POLICY + ?.trim() + .toLowerCase() + let mode: 'default' | 'sparse' | 'custom' = 'default' + let source = 'default_or_remote_config' + + const config: SessionMemoryConfig = { + minimumMessageTokensToInit: + remoteConfig.minimumMessageTokensToInit && + remoteConfig.minimumMessageTokensToInit > 0 + ? remoteConfig.minimumMessageTokensToInit + : DEFAULT_SESSION_MEMORY_CONFIG.minimumMessageTokensToInit, + minimumTokensBetweenUpdate: + remoteConfig.minimumTokensBetweenUpdate && + remoteConfig.minimumTokensBetweenUpdate > 0 + ? remoteConfig.minimumTokensBetweenUpdate + : DEFAULT_SESSION_MEMORY_CONFIG.minimumTokensBetweenUpdate, + toolCallsBetweenUpdates: + remoteConfig.toolCallsBetweenUpdates && + remoteConfig.toolCallsBetweenUpdates > 0 + ? remoteConfig.toolCallsBetweenUpdates + : DEFAULT_SESSION_MEMORY_CONFIG.toolCallsBetweenUpdates, + } + + let tokenThresholdMultiplier = + (typeof snapshotPolicy?.token_threshold_multiplier === 'number' && + snapshotPolicy.token_threshold_multiplier > 0 + ? snapshotPolicy.token_threshold_multiplier + : undefined) ?? + parsePositiveNumberEnv( + 'CLAUDE_CODE_SESSION_MEMORY_TOKEN_THRESHOLD_MULTIPLIER', + ) ?? 1 + let toolThresholdMultiplier = + (typeof snapshotPolicy?.tool_threshold_multiplier === 'number' && + snapshotPolicy.tool_threshold_multiplier > 0 + ? snapshotPolicy.tool_threshold_multiplier + : undefined) ?? + parsePositiveNumberEnv( + 'CLAUDE_CODE_SESSION_MEMORY_TOOL_THRESHOLD_MULTIPLIER', + ) ?? 
1 + let naturalBreakOnly = + (typeof snapshotPolicy?.natural_break_only === 'boolean' + ? snapshotPolicy.natural_break_only + : undefined) ?? + parseBooleanEnv('CLAUDE_CODE_SESSION_MEMORY_NATURAL_BREAK_ONLY') ?? false + + if (snapshotPolicy?.mode === 'sparse') { + mode = 'sparse' + source = 'config_snapshot_session_memory_policy' + if (tokenThresholdMultiplier === 1) tokenThresholdMultiplier = 2 + if (toolThresholdMultiplier === 1) toolThresholdMultiplier = 2 + } else if (typeof snapshotPolicy?.mode === 'string' && snapshotPolicy.mode) { + mode = snapshotPolicy.mode === 'default' ? 'default' : 'custom' + source = 'config_snapshot_session_memory_policy' + } + + if (policyEnv === 'sparse') { + mode = 'sparse' + source = 'env_policy_sparse' + if (tokenThresholdMultiplier === 1) tokenThresholdMultiplier = 2 + if (toolThresholdMultiplier === 1) toolThresholdMultiplier = 2 + if ( + parseBooleanEnv('CLAUDE_CODE_SESSION_MEMORY_NATURAL_BREAK_ONLY') === + undefined + ) { + naturalBreakOnly = true + } + } else if (policyEnv) { + mode = 'custom' + source = `env_policy_${policyEnv}` + } + + if (tokenThresholdMultiplier !== 1) { + config.minimumMessageTokensToInit = roundPositive( + config.minimumMessageTokensToInit * tokenThresholdMultiplier, + ) + config.minimumTokensBetweenUpdate = roundPositive( + config.minimumTokensBetweenUpdate * tokenThresholdMultiplier, + ) + if (source === 'default_or_remote_config') { + source = 'env_token_multiplier' + } + } + if (toolThresholdMultiplier !== 1) { + config.toolCallsBetweenUpdates = roundPositive( + config.toolCallsBetweenUpdates * toolThresholdMultiplier, + ) + if (source === 'default_or_remote_config') { + source = 'env_tool_multiplier' + } + } + + const minInitOverride = parsePositiveNumberEnv( + 'CLAUDE_CODE_SESSION_MEMORY_MIN_INIT_TOKENS', + ) + const minUpdateOverride = parsePositiveNumberEnv( + 'CLAUDE_CODE_SESSION_MEMORY_MIN_TOKENS_BETWEEN_UPDATE', + ) + const toolThresholdOverride = parsePositiveNumberEnv( + 
'CLAUDE_CODE_SESSION_MEMORY_TOOL_CALLS_BETWEEN_UPDATES', + ) + + const snapshotMinInit = + typeof snapshotPolicy?.minimum_message_tokens_to_init === 'number' && + snapshotPolicy.minimum_message_tokens_to_init > 0 + ? snapshotPolicy.minimum_message_tokens_to_init + : undefined + const snapshotMinUpdate = + typeof snapshotPolicy?.minimum_tokens_between_update === 'number' && + snapshotPolicy.minimum_tokens_between_update > 0 + ? snapshotPolicy.minimum_tokens_between_update + : undefined + const snapshotToolThreshold = + typeof snapshotPolicy?.tool_calls_between_updates === 'number' && + snapshotPolicy.tool_calls_between_updates > 0 + ? snapshotPolicy.tool_calls_between_updates + : undefined + + if (snapshotMinInit !== undefined) { + config.minimumMessageTokensToInit = roundPositive(snapshotMinInit) + source = 'config_snapshot_session_memory_policy' + } else if (minInitOverride !== undefined) { + config.minimumMessageTokensToInit = roundPositive(minInitOverride) + source = 'env_absolute_threshold_override' + } + if (snapshotMinUpdate !== undefined) { + config.minimumTokensBetweenUpdate = roundPositive(snapshotMinUpdate) + source = 'config_snapshot_session_memory_policy' + } else if (minUpdateOverride !== undefined) { + config.minimumTokensBetweenUpdate = roundPositive(minUpdateOverride) + source = 'env_absolute_threshold_override' + } + if (snapshotToolThreshold !== undefined) { + config.toolCallsBetweenUpdates = roundPositive(snapshotToolThreshold) + source = 'config_snapshot_session_memory_policy' + } else if (toolThresholdOverride !== undefined) { + config.toolCallsBetweenUpdates = roundPositive(toolThresholdOverride) + source = 'env_absolute_threshold_override' + } + + const policy = { + mode, + source, + gate_enabled: params.gateEnabled, + force_enabled: forceEnabled, + query_source_supported: querySourceSupported, + natural_break_only: naturalBreakOnly, + token_threshold_multiplier: tokenThresholdMultiplier, + tool_threshold_multiplier: toolThresholdMultiplier, 
+ minimum_message_tokens_to_init: config.minimumMessageTokensToInit, + minimum_tokens_between_update: config.minimumTokensBetweenUpdate, + tool_calls_between_updates: config.toolCallsBetweenUpdates, + } + + return { + enabled: (params.gateEnabled || forceEnabled) && querySourceSupported, + config, + policy, + } +} + +function initSessionMemoryConfigIfNeeded( + querySource: REPLHookContext['querySource'], + gateEnabled: boolean, +): typeof sessionMemoryRuntimePolicy { + if (!sessionMemoryRuntimeInitialized) { + const runtime = buildSessionMemoryRuntimePolicy({ + gateEnabled, + querySource, + }) + setSessionMemoryConfig(runtime.config) + sessionMemoryRuntimeInitialized = true + sessionMemoryNaturalBreakOnly = runtime.policy.natural_break_only + sessionMemoryRuntimePolicy = runtime.policy + } + return sessionMemoryRuntimePolicy +} + +async function emitSessionMemoryPolicyObserved( + context: REPLHookContext, +): Promise { + const actionId = context.toolUseContext.userActionId ?? 'unknown-action' + const queryId = context.toolUseContext.queryTracking?.chainId ?? 'unknown-query' + const key = `${actionId}:${queryId}` + if (emittedPolicyObservationKeys.has(key)) return + emittedPolicyObservationKeys.add(key) + await emitHarnessEvent({ + event: 'session_memory.policy.observed', + component: 'session_memory', + user_action_id: context.toolUseContext.userActionId ?? null, + query_id: context.toolUseContext.queryTracking?.chainId ?? null, + query_source: context.querySource ?? null, + subagent_id: context.toolUseContext.agentId ?? null, + subagent_type: context.toolUseContext.agentType ?? null, + payload: { + ...sessionMemoryRuntimePolicy, + }, + }) } function countToolCallsSince( @@ -190,7 +559,9 @@ function evaluateSessionMemoryTrigger(messages: Message[]): { // Even if the tool call threshold is met, extraction won't happen until the // token threshold is also satisfied. This prevents excessive extractions. 
const shouldExtract = - (hasMetTokenThreshold && hasMetToolCallThreshold) || + (hasMetTokenThreshold && + !sessionMemoryNaturalBreakOnly && + hasMetToolCallThreshold) || (hasMetTokenThreshold && !hasToolCallsInLastTurn) let detail: @@ -288,40 +659,6 @@ async function setupSessionMemoryFile( return { memoryPath, currentMemory } } -/** - * Initialize session memory config from remote config (lazy initialization). - * Memoized - only runs once per session, subsequent calls return immediately. - * Uses cached config values - non-blocking. - */ -const initSessionMemoryConfigIfNeeded = memoize((): void => { - // Load config from cache (non-blocking, may be stale) - const remoteConfig = getSessionMemoryRemoteConfig() - - // Only use remote values if they are explicitly set (non-zero positive numbers) - // This ensures sensible defaults aren't overridden by zero values - const config: SessionMemoryConfig = { - minimumMessageTokensToInit: - remoteConfig.minimumMessageTokensToInit && - remoteConfig.minimumMessageTokensToInit > 0 - ? remoteConfig.minimumMessageTokensToInit - : DEFAULT_SESSION_MEMORY_CONFIG.minimumMessageTokensToInit, - minimumTokensBetweenUpdate: - remoteConfig.minimumTokensBetweenUpdate && - remoteConfig.minimumTokensBetweenUpdate > 0 - ? remoteConfig.minimumTokensBetweenUpdate - : DEFAULT_SESSION_MEMORY_CONFIG.minimumTokensBetweenUpdate, - toolCallsBetweenUpdates: - remoteConfig.toolCallsBetweenUpdates && - remoteConfig.toolCallsBetweenUpdates > 0 - ? 
remoteConfig.toolCallsBetweenUpdates - : DEFAULT_SESSION_MEMORY_CONFIG.toolCallsBetweenUpdates, - } - setSessionMemoryConfig(config) -}) - -/** - * Session memory post-sampling hook that extracts and updates session notes - */ // Track if we've logged the gate check failure this session (to avoid spam) let hasLoggedGateFailure = false @@ -330,8 +667,11 @@ const extractSessionMemory = sequential(async function ( ): Promise { const { messages, toolUseContext, querySource } = context - // Only run session memory on main REPL thread - if (querySource !== 'repl_main_thread') { + const gateEnabled = isSessionMemoryGateEnabled() + const runtimePolicy = initSessionMemoryConfigIfNeeded(querySource, gateEnabled) + await emitSessionMemoryPolicyObserved(context) + + if (!runtimePolicy.query_source_supported) { // Don't log this - it's expected for subagents, teammates, etc. return } @@ -343,7 +683,7 @@ const extractSessionMemory = sequential(async function ( } // Check gate lazily when hook runs (cached, non-blocking) - if (!isSessionMemoryGateEnabled()) { + if (!runtimePolicy.gate_enabled && !runtimePolicy.force_enabled) { // Log gate failure once per session (ant-only) if (process.env.USER_TYPE === 'ant' && !hasLoggedGateFailure) { hasLoggedGateFailure = true @@ -352,9 +692,6 @@ const extractSessionMemory = sequential(async function ( return } - // Initialize config from remote (lazy, only once) - initSessionMemoryConfigIfNeeded() - const triggerInfo = evaluateSessionMemoryTrigger(messages) if (!triggerInfo.shouldExtract) { return @@ -425,15 +762,18 @@ export function initSessionMemory(): void { if (getIsRemoteMode()) return // Session memory is used for compaction, so respect auto-compact settings const autoCompactEnabled = isAutoCompactEnabled() + const forceEnabled = + parseBooleanEnv('CLAUDE_CODE_SESSION_MEMORY_FORCE_ENABLE') === true // Log initialization state (ant-only to avoid noise in external logs) if (process.env.USER_TYPE === 'ant') { 
logEvent('tengu_session_memory_init', { auto_compact_enabled: autoCompactEnabled, + force_enabled: forceEnabled, }) } - if (!autoCompactEnabled) { + if (!autoCompactEnabled && !forceEnabled) { return } diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index 7fadac4ec2..dd002c11ed 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -17,9 +17,9 @@ This directory stores the local-first V2 evaluation system. ## Modes - `bind_existing`: V2.1 stable mode. You provide existing V1 `user_action_id` values through `action_bindings`. -- `execute_harness`: V2.2-alpha mode. The runner executes one scenario through the headless harness, injects eval context into V1 events, captures the generated `user_action_id` by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. +- `execute_harness`: V2.2 mode. The runner executes one scenario through the headless harness, injects eval context into V1 events, captures the generated `user_action_id` by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. -V2.2-alpha deliberately supports only 1 scenario, 1 baseline, 1 candidate, and `repeat_count=1`. +Current V2.2-beta deliberately supports only 1 scenario, 1 baseline, 1 candidate, and `repeat_count=1`. 
## Basic Commands @@ -53,7 +53,7 @@ Run the current V2.1 sample: bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default ``` -Run the V2.2-alpha smoke manifest with automatic execution enabled: +Run the V2.2 smoke manifest with automatic execution enabled: ```powershell bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json @@ -72,6 +72,29 @@ $env:V2_2_EXECUTE_HARNESS='0' bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json ``` +Run the V2.2-beta real runtime-difference experiment: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json +``` + +Run the V2.2.5 manual fallback helper for one real trace: + +```powershell +& 'scripts/evals/v2_manual_real_run.ps1' -ScenarioId 'session_memory_trigger_sensitive' -VariantId 'baseline_default' -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' -MaxTurns 12 +``` + +Run the V2.2.5 manual `bind_existing` fallback experiment: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json +``` + +Interpretation: + +- `smoke`: validates automatic execution, automatic capture, and automatic artifact generation. +- `real_experiment`: asks whether the candidate changed runtime behavior in an observable and interpretable way. + ## bind_existing Binding Shape ```json @@ -115,6 +138,7 @@ If capture returns zero matches, the run fails as `capture_failed`. 
If it return ```text tests/evals/v2/V2.1-bind_existing-usage.md tests/evals/v2/V2.2-execute_harness-alpha-usage.md +tests/evals/v2/V2.2.5-real-experiment-closure.md tests/evals/v2/experiment-runs/README.md ``` diff --git a/tests/evals/v2/V2.2-execute_harness-alpha-usage.md b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md index 87a1a723b7..9bcac5de73 100644 --- a/tests/evals/v2/V2.2-execute_harness-alpha-usage.md +++ b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md @@ -1,4 +1,4 @@ -# V2.2-alpha execute_harness Usage +# V2.2 execute_harness Usage ## 理解清单 @@ -6,7 +6,7 @@ - V2.2-alpha 新增的是“前半段自动化”:由 runner 自动执行 scenario,并自动找到这次执行生成的 V1 action。 - 正式绑定不允许用“最新 user_action_id”,因为并发、后台任务或手动调试都可能生成更新的 action。 - 正式绑定使用 `benchmark_run_id -> user_action_id`,只有唯一命中时才进入 score/report。 -- 本阶段只支持 1 scenario / 1 baseline / 1 candidate / repeat=1。 +- 当前 beta 仍只支持 1 scenario / 1 baseline / 1 candidate / repeat=1。 - 自动化可以一键关闭,关闭后回退到 V2.1 `bind_existing`。 ## 预期效果 @@ -77,6 +77,32 @@ Variant v0 can pass: It does not do git checkout or source patching. 
+## Smoke vs Real Experiment + +Smoke manifest: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +Real runtime-difference experiment: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json +``` + +Difference: + +- smoke only proves `execute_harness -> capture -> run/score/report` is healthy +- real experiment additionally asks whether the candidate runtime effect was actually observed +- when `experiment_validity` is `invalid` or `inconclusive`, do not read score deltas as a reliable judgment of harness value + +V2.2.5 adds a closure document for the real runtime-difference path: + +```text +tests/evals/v2/V2.2.5-real-experiment-closure.md +``` + ## Disable Automation Command-line switch: @@ -146,3 +172,7 @@ The verification suite covers: - disabled automation fallback The success-path verification uses a fixture command to avoid real model/API spend. The production default adapter remains `cli_print`. + +## Windows Launcher Note + +The current Windows path no longer relies on `uv_spawn powershell.exe`. V2.2.5 uses a small Node-based launcher bridge for automatic execution, and also keeps a manual PowerShell fallback script for `bind_existing` recovery. diff --git a/tests/evals/v2/V2.2.5-real-experiment-closure.md b/tests/evals/v2/V2.2.5-real-experiment-closure.md new file mode 100644 index 0000000000..bc6b4776cd --- /dev/null +++ b/tests/evals/v2/V2.2.5-real-experiment-closure.md @@ -0,0 +1,122 @@ +# V2.2.5 Real Experiment Closure + +## Understanding + +- V2.2.5 closes the gap between `smoke valid` and `real experiment valid`. 
+- It provides two usable paths: + - automatic `execute_harness` + - manual real run + `bind_existing` fallback +- The two paths should converge to the same type of V2 evidence: + - `experiment_validity` + - `variant_effect_summary` + - `runtime_difference_summary` + - trace-backed scorecard and compare report + +## Expected Outcome + +You can now prove the `session_memory` runtime difference in either of these ways: + +```text +A. automatic execute_harness +scenario -> baseline auto run -> candidate auto run -> capture -> V2 artifacts + +B. manual fallback +manual baseline run -> baseline user_action_id +manual candidate run -> candidate user_action_id +bind_existing experiment -> V2 artifacts +``` + +## Design Rationale + +V2.2.5 exists because automatic execution can fail for platform reasons even when the scoring and evidence model is correct. The fallback path prevents the whole V2 system from being blocked by launcher instability. + +## Path A: Automatic Real Experiment + +Run: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json +``` + +Current successful artifact: + +```text +tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json +ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md +``` + +What this proves: + +- launcher bridge can execute the real scenario +- baseline and candidate are both captured +- runtime policy difference is observed +- the real experiment is `valid` + +## Path B: Manual Real Run + bind_existing + +Step 1: run baseline manually + +```powershell +& 'scripts/evals/v2_manual_real_run.ps1' ` + -ScenarioId 'session_memory_trigger_sensitive' ` + -VariantId 'baseline_default' ` + -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' ` + -MaxTurns 12 +``` + +Step 2: run candidate manually + +```powershell +& 
'scripts/evals/v2_manual_real_run.ps1' ` + -ScenarioId 'session_memory_trigger_sensitive' ` + -VariantId 'candidate_session_memory_sparse' ` + -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' ` + -MaxTurns 12 +``` + +Step 3: use the captured `user_action_id` values in: + +```text +tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json +``` + +Step 4: run the fallback experiment + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json +``` + +Current successful artifact: + +```text +tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json +ObservrityTask/10-系统版本/v2/06-运行报告/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md +``` + +What this proves: + +- even without automatic execution, the real scenario still closes +- the runtime policy evidence survives through `bind_existing` +- V2 scoring is not dependent on the automatic launcher path + +## Reading the Result + +For either path, inspect these fields first: + +- `experiment_validity.status` +- `variant_effect_summary` +- `runtime_difference_summary` +- `scorecard_summary` + +For the current `session_memory` experiment, the important signals are: + +- baseline policy mode = `default` +- candidate policy mode = `sparse` +- `decision_quality.subagent_count_observed` improved +- `efficiency.total_billed_tokens` improved + +## Limits + +- This is still a single-scenario, single-run real experiment. +- It proves runtime difference and interpretability, not long-run stability. +- V2.3 should add batch and robustness before treating these results as broadly stable. 
diff --git a/tests/evals/v2/configs/session_memory_default.runtime.json b/tests/evals/v2/configs/session_memory_default.runtime.json new file mode 100644 index 0000000000..48b0a8791b --- /dev/null +++ b/tests/evals/v2/configs/session_memory_default.runtime.json @@ -0,0 +1,7 @@ +{ + "config_id": "session_memory_default_runtime", + "session_memory_policy": { + "mode": "default", + "force_enabled": true + } +} diff --git a/tests/evals/v2/configs/session_memory_sparse.runtime.json b/tests/evals/v2/configs/session_memory_sparse.runtime.json new file mode 100644 index 0000000000..ce0fcb2de8 --- /dev/null +++ b/tests/evals/v2/configs/session_memory_sparse.runtime.json @@ -0,0 +1,10 @@ +{ + "config_id": "session_memory_sparse_runtime", + "session_memory_policy": { + "mode": "sparse", + "force_enabled": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2 + } +} diff --git a/tests/evals/v2/experiment-runs/README.md b/tests/evals/v2/experiment-runs/README.md index bf371c33bf..dd15d546bd 100644 --- a/tests/evals/v2/experiment-runs/README.md +++ b/tests/evals/v2/experiment-runs/README.md @@ -4,7 +4,7 @@ - This directory stores experiment-level JSON summaries. - V2.1 summaries are usually produced by `bind_existing`. -- V2.2-alpha summaries may be produced by `execute_harness`, or by `execute_harness` disabled and falling back to `bind_existing`. +- V2.2 summaries may be produced by `execute_harness`, or by `execute_harness` disabled and falling back to `bind_existing`. - The top-level schema is stable enough for regression checks and documentation. ## Required Top-Level Fields @@ -15,6 +15,8 @@ | `manifest_ref` | string | Manifest path used by the runner. | | `generated_at` | string | ISO timestamp. | | `mode` | string | Effective mode: `bind_existing` or `execute_harness`. | +| `report_profile` | string | `smoke` or `real_experiment`. | +| `evaluation_intent` | string or null | Usually `exploration` or `regression`. 
| | `requested_mode` | string | Manifest-requested mode, when present in newer artifacts. | | `automation_disabled` | boolean | Whether `execute_harness` was disabled and fallback was used. | | `run_refs` | string[] | Generated V2 run JSON refs. | @@ -22,6 +24,9 @@ | `report_refs` | string[] | Generated report refs. | | `risk_verdict` | object | Regression-risk verdict. Not final experiment judgment. | | `gate_verdict` | object | Compatibility alias for older readers. | +| `experiment_validity` | object | Whether the experiment is interpretable as a smoke check or real runtime-difference check. | +| `variant_effect_summary` | array | Candidate runtime-effect evidence summary. | +| `runtime_difference_summary` | string[] | Flattened human-readable difference signals. | | `verdict_boundary` | string | Explicit boundary of verdict semantics. | | `scorecard_summary` | array | Baseline vs candidate score changes. | | `exploration_signals` | string[] | Automatic review hints. | @@ -68,7 +73,9 @@ Newer artifacts include: } ``` -For actual V2.2-alpha automatic runs, `results[*].baseline_execution` and `results[*].candidates[*].candidate_execution` contain the adapter result, capture result, `benchmark_run_id`, and `eval_run_id`. +For actual V2.2 automatic runs, `results[*].baseline_execution` and `results[*].candidates[*].candidate_execution` contain the adapter result, capture result, `benchmark_run_id`, and `eval_run_id`. + +Newer beta artifacts also include `results[*].candidates[*].experiment_validity` and `results[*].candidates[*].variant_effect_summary` so smoke and real experiments are not interpreted the same way. 
## Boundary diff --git a/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json b/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json new file mode 100644 index 0000000000..ddf9295af6 --- /dev/null +++ b/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json @@ -0,0 +1,520 @@ +{ + "experiment_id": "session_memory_runtime_sparse_vs_default", + "manifest_ref": "tests\\evals\\v2\\experiments\\session_memory_runtime_sparse_vs_default.json", + "generated_at": "2026-05-02T16:52:22.247Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + 
"notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment remains interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "variant_effect_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." 
+ ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." + ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 2, + "candidate_value": 1, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 440499, + "candidate_value": 304723, + "delta": -135776, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." + ], + "recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "session_memory_runtime_sparse_vs_default", + "name": "Session Memory Runtime Sparse vs Default", + "goal": "Verify that a real sparse session_memory candidate is injected into runtime and produces interpretable trace-backed differences under execute_harness.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_2_beta_real", + "scenario_ids": [ + "session_memory_trigger_sensitive" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "timeout_ms": 240000, + "max_turns": 12, + "allow_fallback_to_bind_existing": false + }, + "status": "ready" + }, + "runner": { + "requested_mode": 
"execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "execute_harness_alpha_limits": { + "scenario_count": 1, + "candidate_count": 1, + "repeat_count": 1 + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "repeat_index": 1, + "baseline_run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "baseline_user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "baseline_eval_run_id": "eval_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "baseline_benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\67a3a6f37874a8c0\\stdout.txt", + "stderrRef": ".observability\\v2h\\67a3a6f37874a8c0\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_session_memo_e47801b5", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_session_memo_4dd033e6", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "session_memory_runtime_sparse_vs_default", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "session_memory_trigger_sensitive", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "12" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "eval_run_id": "eval_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "candidate_user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "candidate_eval_run_id": "eval_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "candidate_benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\4a945d33a0a43863\\stdout.txt", + "stderrRef": ".observability\\v2h\\4a945d33a0a43863\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_session_memo_e47801b5", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_session_memo_4dd033e6", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "session_memory_runtime_sparse_vs_default", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "session_memory_trigger_sensitive", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": 
"candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "12" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "eval_run_id": "eval_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-02T16:49:18.912Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 2, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-02T16:50:50.682Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." + }, + "variant_effect_summary": { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment is valid: runtime effect was observed and the baseline/candidate difference is interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_vs_run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.md", + "gate_results": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 440499, + "candidate_value": 304723, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 440499, + "candidate_value": 304723, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 2, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 2, + "candidate_value": 1, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 440499, + "candidate_value": 304723, + "delta": -135776, 
+ "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "created_at": "2026-05-02T16:52:22.247Z" +} diff --git a/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json b/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json new file mode 100644 index 0000000000..0f889aaaca --- /dev/null +++ b/tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json @@ -0,0 +1,429 @@ +{ + "experiment_id": "session_memory_runtime_sparse_vs_default_manual_bind_existing", + "manifest_ref": "tests\\evals\\v2\\experiments\\session_memory_runtime_sparse_vs_default_manual.bind_existing.json", + "generated_at": "2026-05-02T17:03:11.092Z", + "mode": "bind_existing", + "requested_mode": "bind_existing", + "automation_disabled": false, + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "run_refs": [ + 
"tests\\evals\\v2\\runs\\run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment remains interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "variant_effect_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." 
+ ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 2, + "candidate_value": 1, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 396401, + "candidate_value": 303392, + "delta": -93009, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" 
+ } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." + ], + "recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "session_memory_runtime_sparse_vs_default_manual_bind_existing", + "name": "Session Memory Runtime Sparse vs Default Manual Bind Existing", + "goal": "Fallback real experiment for V2.2.5. Use two manually executed real traces to verify that the session_memory runtime policy difference remains interpretable through bind_existing.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_2_5_manual_real", + "scenario_ids": [ + "session_memory_trigger_sensitive" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "action_bindings": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "baseline_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "candidate_user_action_ids": { + "candidate_session_memory_sparse": "b118c7c4-18df-4ff0-b506-5b5454418b48" + } + } + ], + "status": "ready" + }, + "runner": { + "requested_mode": "bind_existing", + "mode": "bind_existing", + "automation_disabled": false, + "fallback_reason": null, + "execute_harness_alpha_limits": null, + "score_spec_ids": [ + "task_success.main_chain_observed", + 
"decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "repeat_index": 1, + "baseline_run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "baseline_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "candidate_user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-02T16:54:20.319Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 2, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-02T16:59:26.237Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." + }, + "variant_effect_summary": { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "Session_memory subagent count changed from 2 to 1.", + "At least one score dimension changed between baseline and candidate." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment is valid: runtime effect was observed and the baseline/candidate difference is interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_vs_run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.md", + "gate_results": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 396401, + "candidate_value": 303392, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 396401, + "candidate_value": 303392, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 2, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 2, + "candidate_value": 1, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 396401, + "candidate_value": 303392, + "delta": -93009, + 
"interpretation": "improved" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "session_memory_trigger_sensitive", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "2 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "created_at": "2026-05-02T17:03:11.092Z" +} diff --git a/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json b/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json index 02b7ef712a..57b83defd0 100644 --- a/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +++ b/tests/evals/v2/experiments/_experiment.execute_harness.smoke.json @@ -7,6 +7,8 @@ "scenario_set_id": "v2_2_alpha_smoke", "scenario_ids": ["execute_harness_smoke_minimal"], "repeat_count": 1, + "report_profile": "smoke", + "evaluation_intent": "exploration", "score_spec_ids": [ "task_success.main_chain_observed", "efficiency.total_billed_tokens", diff --git a/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json b/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json new file mode 100644 index 0000000000..55c0293ad5 --- /dev/null +++ b/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json @@ -0,0 +1,29 @@ +{ + "experiment_id": 
"session_memory_runtime_sparse_vs_default", + "name": "Session Memory Runtime Sparse vs Default", + "goal": "Verify that a real sparse session_memory candidate is injected into runtime and produces interpretable trace-backed differences under execute_harness.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_2_beta_real", + "scenario_ids": ["session_memory_trigger_sensitive"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "timeout_ms": 240000, + "max_turns": 12, + "allow_fallback_to_bind_existing": false + }, + "status": "ready" +} diff --git a/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json b/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json new file mode 100644 index 0000000000..7cd9da536e --- /dev/null +++ b/tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json @@ -0,0 +1,32 @@ +{ + "experiment_id": "session_memory_runtime_sparse_vs_default_manual_bind_existing", + "name": "Session Memory Runtime Sparse vs Default Manual Bind Existing", + "goal": "Fallback real experiment for V2.2.5. 
Use two manually executed real traces to verify that the session_memory runtime policy difference remains interpretable through bind_existing.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": ["candidate_session_memory_sparse"], + "scenario_set_id": "v2_2_5_manual_real", + "scenario_ids": ["session_memory_trigger_sensitive"], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "decision_quality.session_memory_policy_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "bind_existing", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "action_bindings": [ + { + "scenario_id": "session_memory_trigger_sensitive", + "baseline_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "candidate_user_action_ids": { + "candidate_session_memory_sparse": "b118c7c4-18df-4ff0-b506-5b5454418b48" + } + } + ], + "status": "ready" +} diff --git a/tests/evals/v2/runs/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.json b/tests/evals/v2/runs/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.json new file mode 100644 index 0000000000..6fb2dd6d22 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.json @@ -0,0 +1,187 @@ +{ + "run": { + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "baseline_default", + "started_at": "2026-05-02T16:49:13.981Z", + "ended_at": "2026-05-02T16:50:35.827Z", + "status": "completed", + "entry_user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "root_query_id": "5477a647-edbf-46d0-9dd5-906ffd1aa288", + "observability_db_ref": 
".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "root_query_id": "5477a647-edbf-46d0-9dd5-906ffd1aa288", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "root_query_id": "5477a647-edbf-46d0-9dd5-906ffd1aa288", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "session_memory_trigger_sensitive", + "name": "Session Memory Trigger Sensitive", + "description": "A real experiment scenario for V2.2-beta. It is intentionally designed to require many read-tool steps inside the current repository so session_memory policy differences can be observed with controlled cost.", + "input_prompt": "You are already inside the target repository root. Perform a read-only four-stage code inspection task and do not modify any files. Only use the exact relative file paths listed below. Do not search outside the current repository. Do not guess alternate absolute paths. If a listed file cannot be read, state that directly and continue without trying other repositories. Stage 1: read tests/evals/v2/README.md, tests/evals/v2/experiment-runs/README.md, and scripts/evals/v2_harness_execution.ts, then summarize how execute_harness works. Stage 2: read scripts/evals/v2_run_experiment.ts, scripts/evals/v2_compare_runs.ts, and scripts/evals/v2_record_run.ts, then summarize how V2 turns V1 evidence into run, score, compare, and experiment artifacts. 
Stage 3: read src/services/SessionMemory/sessionMemory.ts, src/services/SessionMemory/sessionMemoryUtils.ts, and src/observability/harness.ts, then summarize how session_memory is triggered and observed. Stage 4: read tests/evals/v2/variants/baseline.template.json, tests/evals/v2/variants/candidate_session_memory_sparse.json, and tests/evals/v2/configs/session_memory_sparse.runtime.json, then explain the expected difference between baseline and candidate session_memory policy. The final answer must contain exactly four top-level sections named Stage 1, Stage 2, Stage 3, and Stage 4.", + "tags": [ + "observability-v2", + "session-memory", + "runtime-diff", + "real-experiment" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should inspect many files across many tool turns", + "Should keep the task readable and finite", + "The experiment goal is to expose session_memory runtime behavior, not to optimize final prose quality" + ], + "expected_observations": [ + "A session_memory policy observation event should exist in V1 events", + "Baseline and candidate should expose different session_memory policies", + "Candidate should prefer natural-break-triggered session_memory updates" + ], + "evaluation_note": "This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence.", + "max_turn_count": 14, + "max_total_billed_tokens": 220000, + "max_subagent_count": 6, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. 
For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "started_at": "2026-05-02T16:49:13.981Z", + "started_at_ms": 1777740553981, + "ended_at": "2026-05-02T16:50:35.827Z", + "ended_at_ms": 1777740635827, + "duration_ms": 81846, + "event_count": 318, + "query_count": 3, + "main_thread_query_count": 1, + "subagent_query_count": 2, + "subagent_count": 2, + "tool_call_count": 21, + "experiment_id": "exp_session_memo_e47801b5", + "scenario_id": "scn_session_memo_4dd033e6", + "variant_id": "var_baseline_def_eb4a038e", + "benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "eval_run_id": "eval_session_memory_runti_session_memory_trigg_baseline_default_1d69302245ce", + "raw_input_tokens": "760", + "output_tokens": "9004", + "cache_read_tokens": "266044", + "cache_create_tokens": "164691", + "total_prompt_input_tokens": "431495", + "total_billed_tokens": "440499", + "main_thread_total_prompt_input_tokens": "300312", + "subagent_total_prompt_input_tokens": "131183" + }, + "rootQuery": { + "query_id": "5477a647-edbf-46d0-9dd5-906ffd1aa288", + "user_action_id": "f9b83353-0650-4868-af08-c0ff7048f7b1", + "session_id": "64ab0053-be03-4628-93ca-c996782fe3e1", + "conversation_id": "64ab0053-be03-4628-93ca-c996782fe3e1", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T16:49:13.981Z", + "started_at_ms": 1777740553981, + "ended_at": "2026-05-02T16:50:35.827Z", + "ended_at_ms": 1777740635827, + "duration_ms": 81846, + "first_event": "submit.attempted", + 
"last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 5, + "query_max_loop_iter": 5, + "query_avg_loop_iter": 3, + "tool_call_count": 12, + "event_count": 164, + "raw_query_started_count": 1, + "raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Read", + "tool_count": 13, + "closed_count": "13", + "failed_count": "0" + }, + { + "tool_name": "Edit", + "tool_count": 8, + "closed_count": "8", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 2, + "avg_duration_ms": 68483 + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-02T16:49:18.912Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 2, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.json b/tests/evals/v2/runs/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.json new file mode 100644 index 0000000000..881787f87c --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.json @@ -0,0 +1,182 @@ +{ + "run": { + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "candidate_session_memory_sparse", + "started_at": "2026-05-02T16:50:45.579Z", + "ended_at": "2026-05-02T16:52:16.833Z", + "status": "completed", + "entry_user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "root_query_id": "9b4efe45-9504-4bc9-8391-fa0c51fa01b6", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "root_query_id": "9b4efe45-9504-4bc9-8391-fa0c51fa01b6", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "root_query_id": "9b4efe45-9504-4bc9-8391-fa0c51fa01b6", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "session_memory_trigger_sensitive", + "name": "Session Memory Trigger Sensitive", + "description": "A real experiment scenario for V2.2-beta. 
It is intentionally designed to require many read-tool steps inside the current repository so session_memory policy differences can be observed with controlled cost.", + "input_prompt": "You are already inside the target repository root. Perform a read-only four-stage code inspection task and do not modify any files. Only use the exact relative file paths listed below. Do not search outside the current repository. Do not guess alternate absolute paths. If a listed file cannot be read, state that directly and continue without trying other repositories. Stage 1: read tests/evals/v2/README.md, tests/evals/v2/experiment-runs/README.md, and scripts/evals/v2_harness_execution.ts, then summarize how execute_harness works. Stage 2: read scripts/evals/v2_run_experiment.ts, scripts/evals/v2_compare_runs.ts, and scripts/evals/v2_record_run.ts, then summarize how V2 turns V1 evidence into run, score, compare, and experiment artifacts. Stage 3: read src/services/SessionMemory/sessionMemory.ts, src/services/SessionMemory/sessionMemoryUtils.ts, and src/observability/harness.ts, then summarize how session_memory is triggered and observed. Stage 4: read tests/evals/v2/variants/baseline.template.json, tests/evals/v2/variants/candidate_session_memory_sparse.json, and tests/evals/v2/configs/session_memory_sparse.runtime.json, then explain the expected difference between baseline and candidate session_memory policy. 
The final answer must contain exactly four top-level sections named Stage 1, Stage 2, Stage 3, and Stage 4.", + "tags": [ + "observability-v2", + "session-memory", + "runtime-diff", + "real-experiment" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should inspect many files across many tool turns", + "Should keep the task readable and finite", + "The experiment goal is to expose session_memory runtime behavior, not to optimize final prose quality" + ], + "expected_observations": [ + "A session_memory policy observation event should exist in V1 events", + "Baseline and candidate should expose different session_memory policies", + "Candidate should prefer natural-break-triggered session_memory updates" + ], + "evaluation_note": "This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence.", + "max_turn_count": 14, + "max_total_billed_tokens": 220000, + "max_subagent_count": 6, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "started_at": "2026-05-02T16:50:45.579Z", + "started_at_ms": 1777740645579, + "ended_at": "2026-05-02T16:52:16.833Z", + "ended_at_ms": 1777740736833, + "duration_ms": 91254, + "event_count": 183, + "query_count": 2, + "main_thread_query_count": 1, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 12, + "experiment_id": "exp_session_memo_e47801b5", + "scenario_id": "scn_session_memo_4dd033e6", + "variant_id": "var_candidate_se_efbc2e82", + "benchmark_run_id": "bench_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "eval_run_id": "eval_session_memory_runti_session_memory_trigg_candidate_session_me_a3dfb7c7d2b8", + "raw_input_tokens": "247", + "output_tokens": "3357", + "cache_read_tokens": "217468", + "cache_create_tokens": "83651", + "total_prompt_input_tokens": "301366", + "total_billed_tokens": "304723", + "main_thread_total_prompt_input_tokens": "301366", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "9b4efe45-9504-4bc9-8391-fa0c51fa01b6", + "user_action_id": "cd929218-cfa1-4772-93ba-ae659d9ca0d9", + "session_id": "3b005440-cc4c-4c79-ae41-ccdd1b165986", + "conversation_id": "3b005440-cc4c-4c79-ae41-ccdd1b165986", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T16:50:45.579Z", + "started_at_ms": 1777740645579, + "ended_at": "2026-05-02T16:52:16.721Z", + "ended_at_ms": 1777740736721, + "duration_ms": 91142, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 5, + "query_max_loop_iter": 5, + 
"query_avg_loop_iter": 3, + "tool_call_count": 12, + "event_count": 165, + "raw_query_started_count": 1, + "raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Read", + "tool_count": 12, + "closed_count": "12", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-02T16:50:50.682Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.json b/tests/evals/v2/runs/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.json new file mode 100644 index 0000000000..a0113dcd1e --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.json @@ -0,0 +1,187 @@ +{ + "run": { + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "baseline_default", + "started_at": "2026-05-02T16:54:15.469Z", + "ended_at": "2026-05-02T16:55:54.742Z", + "status": "completed", + "entry_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "root_query_id": "27da52c7-548e-4d7f-b477-60af0aef1bb5", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "root_query_id": "27da52c7-548e-4d7f-b477-60af0aef1bb5", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "root_query_id": "27da52c7-548e-4d7f-b477-60af0aef1bb5", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "session_memory_trigger_sensitive", + "name": "Session Memory Trigger Sensitive", + "description": "A real experiment scenario for V2.2-beta. 
It is intentionally designed to require many read-tool steps inside the current repository so session_memory policy differences can be observed with controlled cost.", + "input_prompt": "You are already inside the target repository root. Perform a read-only four-stage code inspection task and do not modify any files. Only use the exact relative file paths listed below. Do not search outside the current repository. Do not guess alternate absolute paths. If a listed file cannot be read, state that directly and continue without trying other repositories. Stage 1: read tests/evals/v2/README.md, tests/evals/v2/experiment-runs/README.md, and scripts/evals/v2_harness_execution.ts, then summarize how execute_harness works. Stage 2: read scripts/evals/v2_run_experiment.ts, scripts/evals/v2_compare_runs.ts, and scripts/evals/v2_record_run.ts, then summarize how V2 turns V1 evidence into run, score, compare, and experiment artifacts. Stage 3: read src/services/SessionMemory/sessionMemory.ts, src/services/SessionMemory/sessionMemoryUtils.ts, and src/observability/harness.ts, then summarize how session_memory is triggered and observed. Stage 4: read tests/evals/v2/variants/baseline.template.json, tests/evals/v2/variants/candidate_session_memory_sparse.json, and tests/evals/v2/configs/session_memory_sparse.runtime.json, then explain the expected difference between baseline and candidate session_memory policy. 
The final answer must contain exactly four top-level sections named Stage 1, Stage 2, Stage 3, and Stage 4.", + "tags": [ + "observability-v2", + "session-memory", + "runtime-diff", + "real-experiment" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should inspect many files across many tool turns", + "Should keep the task readable and finite", + "The experiment goal is to expose session_memory runtime behavior, not to optimize final prose quality" + ], + "expected_observations": [ + "A session_memory policy observation event should exist in V1 events", + "Baseline and candidate should expose different session_memory policies", + "Candidate should prefer natural-break-triggered session_memory updates" + ], + "evaluation_note": "This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence.", + "max_turn_count": 14, + "max_total_billed_tokens": 220000, + "max_subagent_count": 6, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "started_at": "2026-05-02T16:54:15.469Z", + "started_at_ms": 1777740855469, + "ended_at": "2026-05-02T16:55:54.742Z", + "ended_at_ms": 1777740954742, + "duration_ms": 99273, + "event_count": 304, + "query_count": 3, + "main_thread_query_count": 1, + "subagent_query_count": 2, + "subagent_count": 2, + "tool_call_count": 21, + "experiment_id": "session_memory_runtime_sparse_vs_default_manual", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "baseline_default", + "benchmark_run_id": "manual_bench_20260502T165411547Z_session_memory_trigger_sensitive_baseline_default_177a84fc", + "eval_run_id": "manual_eval_20260502T165411547Z_session_memory_trigger_sensitive_baseline_default_177a84fc", + "raw_input_tokens": "217", + "output_tokens": "10555", + "cache_read_tokens": "221055", + "cache_create_tokens": "164574", + "total_prompt_input_tokens": "385846", + "total_billed_tokens": "396401", + "main_thread_total_prompt_input_tokens": "300422", + "subagent_total_prompt_input_tokens": "85424" + }, + "rootQuery": { + "query_id": "27da52c7-548e-4d7f-b477-60af0aef1bb5", + "user_action_id": "7b614b14-19d8-41db-8ee8-ebb61bc4b699", + "session_id": "15e00668-3d68-4729-99c7-1c8188f74362", + "conversation_id": "15e00668-3d68-4729-99c7-1c8188f74362", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T16:54:15.469Z", + "started_at_ms": 1777740855469, + "ended_at": "2026-05-02T16:55:54.742Z", + "ended_at_ms": 1777740954742, + "duration_ms": 99273, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 5, 
+ "query_max_loop_iter": 5, + "query_avg_loop_iter": 3, + "tool_call_count": 12, + "event_count": 165, + "raw_query_started_count": 1, + "raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Read", + "tool_count": 12, + "closed_count": "12", + "failed_count": "0" + }, + { + "tool_name": "Edit", + "tool_count": 9, + "closed_count": "9", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 2, + "avg_duration_ms": 74679 + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-02T16:54:20.319Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 2, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.json b/tests/evals/v2/runs/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.json new file mode 100644 index 0000000000..9a3ca7a203 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.json @@ -0,0 +1,182 @@ +{ + "run": { + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "candidate_session_memory_sparse", + "started_at": "2026-05-02T16:59:20.101Z", + "ended_at": "2026-05-02T17:00:43.328Z", + "status": "completed", + "entry_user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "root_query_id": "e5deb781-955f-4cbd-8194-62d79cd14bc7", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "root_query_id": "e5deb781-955f-4cbd-8194-62d79cd14bc7", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "root_query_id": "e5deb781-955f-4cbd-8194-62d79cd14bc7", + "observability_db_ref": ".observability\\observability_v1.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "session_memory_trigger_sensitive", + "name": "Session Memory Trigger Sensitive", + "description": "A real experiment scenario for V2.2-beta. 
It is intentionally designed to require many read-tool steps inside the current repository so session_memory policy differences can be observed with controlled cost.", + "input_prompt": "You are already inside the target repository root. Perform a read-only four-stage code inspection task and do not modify any files. Only use the exact relative file paths listed below. Do not search outside the current repository. Do not guess alternate absolute paths. If a listed file cannot be read, state that directly and continue without trying other repositories. Stage 1: read tests/evals/v2/README.md, tests/evals/v2/experiment-runs/README.md, and scripts/evals/v2_harness_execution.ts, then summarize how execute_harness works. Stage 2: read scripts/evals/v2_run_experiment.ts, scripts/evals/v2_compare_runs.ts, and scripts/evals/v2_record_run.ts, then summarize how V2 turns V1 evidence into run, score, compare, and experiment artifacts. Stage 3: read src/services/SessionMemory/sessionMemory.ts, src/services/SessionMemory/sessionMemoryUtils.ts, and src/observability/harness.ts, then summarize how session_memory is triggered and observed. Stage 4: read tests/evals/v2/variants/baseline.template.json, tests/evals/v2/variants/candidate_session_memory_sparse.json, and tests/evals/v2/configs/session_memory_sparse.runtime.json, then explain the expected difference between baseline and candidate session_memory policy. 
The final answer must contain exactly four top-level sections named Stage 1, Stage 2, Stage 3, and Stage 4.", + "tags": [ + "observability-v2", + "session-memory", + "runtime-diff", + "real-experiment" + ], + "expected_artifacts": [], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should inspect many files across many tool turns", + "Should keep the task readable and finite", + "The experiment goal is to expose session_memory runtime behavior, not to optimize final prose quality" + ], + "expected_observations": [ + "A session_memory policy observation event should exist in V1 events", + "Baseline and candidate should expose different session_memory policies", + "Candidate should prefer natural-break-triggered session_memory updates" + ], + "evaluation_note": "This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence.", + "max_turn_count": 14, + "max_total_billed_tokens": 220000, + "max_subagent_count": 6, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "started_at": "2026-05-02T16:59:20.101Z", + "started_at_ms": 1777741160101, + "ended_at": "2026-05-02T17:00:43.328Z", + "ended_at_ms": 1777741243328, + "duration_ms": 83227, + "event_count": 183, + "query_count": 2, + "main_thread_query_count": 1, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 12, + "experiment_id": "session_memory_runtime_sparse_vs_default_manual", + "scenario_id": "session_memory_trigger_sensitive", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "manual_bench_20260502T165916439Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_26ce4f63", + "eval_run_id": "manual_eval_20260502T165916439Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_26ce4f63", + "raw_input_tokens": "95", + "output_tokens": "3001", + "cache_read_tokens": "217098", + "cache_create_tokens": "83198", + "total_prompt_input_tokens": "300391", + "total_billed_tokens": "303392", + "main_thread_total_prompt_input_tokens": "300391", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "e5deb781-955f-4cbd-8194-62d79cd14bc7", + "user_action_id": "b118c7c4-18df-4ff0-b506-5b5454418b48", + "session_id": "962717c8-d1ec-4a2c-8aeb-c4a21df3fffc", + "conversation_id": "962717c8-d1ec-4a2c-8aeb-c4a21df3fffc", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-02T16:59:20.101Z", + "started_at_ms": 1777741160101, + "ended_at": "2026-05-02T17:00:43.212Z", + "ended_at_ms": 1777741243212, + "duration_ms": 83111, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + 
"stop_reason": "end_turn", + "turn_count": 5, + "query_max_loop_iter": 5, + "query_avg_loop_iter": 3, + "tool_call_count": 12, + "event_count": 165, + "raw_query_started_count": 1, + "raw_query_terminated_count": 1, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "true", + "inferred_is_complete": "true" + }, + "tools": [ + { + "tool_name": "Read", + "tool_count": 12, + "closed_count": "12", + "failed_count": "0" + } + ], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_tool_threshold", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-02T16:59:26.237Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_tool_threshold" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ } +} diff --git a/tests/evals/v2/scenarios/session_memory_trigger_sensitive.json b/tests/evals/v2/scenarios/session_memory_trigger_sensitive.json new file mode 100644 index 0000000000..ba575ef550 --- /dev/null +++ b/tests/evals/v2/scenarios/session_memory_trigger_sensitive.json @@ -0,0 +1,27 @@ +{ + "scenario_id": "session_memory_trigger_sensitive", + "name": "Session Memory Trigger Sensitive", + "description": "A real experiment scenario for V2.2-beta. It is intentionally designed to require many read-tool steps inside the current repository so session_memory policy differences can be observed with controlled cost.", + "input_prompt": "You are already inside the target repository root. Perform a read-only four-stage code inspection task and do not modify any files. Only use the exact relative file paths listed below. Do not search outside the current repository. Do not guess alternate absolute paths. If a listed file cannot be read, state that directly and continue without trying other repositories. Stage 1: read tests/evals/v2/README.md, tests/evals/v2/experiment-runs/README.md, and scripts/evals/v2_harness_execution.ts, then summarize how execute_harness works. Stage 2: read scripts/evals/v2_run_experiment.ts, scripts/evals/v2_compare_runs.ts, and scripts/evals/v2_record_run.ts, then summarize how V2 turns V1 evidence into run, score, compare, and experiment artifacts. Stage 3: read src/services/SessionMemory/sessionMemory.ts, src/services/SessionMemory/sessionMemoryUtils.ts, and src/observability/harness.ts, then summarize how session_memory is triggered and observed. Stage 4: read tests/evals/v2/variants/baseline.template.json, tests/evals/v2/variants/candidate_session_memory_sparse.json, and tests/evals/v2/configs/session_memory_sparse.runtime.json, then explain the expected difference between baseline and candidate session_memory policy. 
The final answer must contain exactly four top-level sections named Stage 1, Stage 2, Stage 3, and Stage 4.", + "tags": ["observability-v2", "session-memory", "runtime-diff", "real-experiment"], + "expected_artifacts": [], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Must not modify files", + "Should inspect many files across many tool turns", + "Should keep the task readable and finite", + "The experiment goal is to expose session_memory runtime behavior, not to optimize final prose quality" + ], + "expected_observations": [ + "A session_memory policy observation event should exist in V1 events", + "Baseline and candidate should expose different session_memory policies", + "Candidate should prefer natural-break-triggered session_memory updates" + ], + "evaluation_note": "This is a real runtime-difference scenario, not a smoke check. Success means the candidate policy is observed and interpretable in V1/V2 evidence.", + "max_turn_count": 14, + "max_total_billed_tokens": 220000, + "max_subagent_count": 6, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/score-specs/default-v2-1.score-specs.json b/tests/evals/v2/score-specs/default-v2-1.score-specs.json index ca24dac920..f0ea7c0a32 100644 --- a/tests/evals/v2/score-specs/default-v2-1.score-specs.json +++ b/tests/evals/v2/score-specs/default-v2-1.score-specs.json @@ -40,6 +40,17 @@ }, "version": "v2.1" }, + { + "score_spec_id": "decision_quality.session_memory_policy_observed", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "direction": "observed_only", + "formula": "1 if a session_memory.policy.observed event or equivalent run.variant_effect evidence exists, else 0", + "data_sources": ["V1 events_raw", "V2 run.variant_effect"], + "evidence_requirements": ["entry_user_action_id", "variant_effect"], + "automation_level": "automatic", + "version": "v2.2-beta" + }, { "score_spec_id": "stability.recovery_absence", 
"dimension": "stability", diff --git a/tests/evals/v2/scores/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.scores.json b/tests/evals/v2/scores/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.scores.json new file mode 100644 index 0000000000..7c2664da67 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353.scores.json @@ -0,0 +1,62 @@ +[ + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_task_success_main_chain_observed", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." + }, + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 440499, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_stability_recovery_absence", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T165041469Z_session_memory_trigger_sensitive_baseline_default_f9b83353", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 14." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.scores.json b/tests/evals/v2/scores/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.scores.json new file mode 100644 index 0000000000..125b7a93ad --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218.scores.json @@ -0,0 +1,62 @@ +[ + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_task_success_main_chain_observed", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." 
+ }, + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 304723, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_stability_recovery_absence", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T165222048Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_cd929218", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 14." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.scores.json b/tests/evals/v2/scores/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.scores.json new file mode 100644 index 0000000000..bfbcdf021a --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14.scores.json @@ -0,0 +1,62 @@ +[ + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_task_success_main_chain_observed", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." 
+ }, + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 396401, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_stability_recovery_absence", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T170309880Z_session_memory_trigger_sensitive_baseline_default_7b614b14", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 14." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.scores.json b/tests/evals/v2/scores/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.scores.json new file mode 100644 index 0000000000..1a65335c20 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4.scores.json @@ -0,0 +1,62 @@ +[ + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_task_success_main_chain_observed", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." 
+ }, + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 303392, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_stability_recovery_absence", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T170310924Z_session_memory_trigger_sensitive_candidate_session_memory_sparse_b118c7c4", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 14." + } +] diff --git a/tests/evals/v2/variants/baseline.template.json b/tests/evals/v2/variants/baseline.template.json index b71d1c6fa4..502b6c1c11 100644 --- a/tests/evals/v2/variants/baseline.template.json +++ b/tests/evals/v2/variants/baseline.template.json @@ -4,6 +4,6 @@ "description": "Current default harness baseline used for comparison.", "change_layer": "mixed", "git_commit": "HEAD", - "config_snapshot_ref": "path/to/baseline-config.json", - "notes": "Use this as the default baseline unless a scenario explicitly requires another baseline." + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
} diff --git a/tests/evals/v2/variants/candidate_session_memory_sparse.json b/tests/evals/v2/variants/candidate_session_memory_sparse.json index 5ea9683cd6..43ddef9f9e 100644 --- a/tests/evals/v2/variants/candidate_session_memory_sparse.json +++ b/tests/evals/v2/variants/candidate_session_memory_sparse.json @@ -1,10 +1,10 @@ { "variant_id": "candidate_session_memory_sparse", "name": "Candidate Session Memory Sparse", - "description": "Increase the default session memory tool-call threshold from 3 to 6 to reduce background memory subagent cost.", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", "change_layer": "harness", "base_variant_id": "baseline_default", "git_commit": "HEAD", - "config_snapshot_ref": "src/services/SessionMemory/sessionMemoryUtils.ts", - "notes": "Token-saving harness candidate. Keeps natural-break trigger intact while reducing tool-threshold-triggered updates." + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
} diff --git a/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T162923305Z.json b/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T162923305Z.json new file mode 100644 index 0000000000..d6ab674b14 --- /dev/null +++ b/tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T162923305Z.json @@ -0,0 +1,89 @@ +{ + "verification_id": "v2_2_execute_harness_alpha_2026-05-02T162923305Z", + "generated_at": "2026-05-02T16:29:33.062Z", + "temp_root": ".observability\\v2-execute-harness-verification\\2026-05-02T162923305Z", + "passed": true, + "case_count": 9, + "failed_count": 0, + "note": "Success-path verification uses a fixture command to avoid model/API spend; the production default adapter is cli_print.", + "results": [ + { + "case_id": "execute_harness_success_fixture", + "description": "execute_harness success path creates run, score, report, and risk verdict through benchmark_run_id capture.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_2_verify_success_2026-05-02T162923305Z_2026-05-02T162926620Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_success_2026-05-02T162923305Z_2026-05-02T162926620Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_2_verify_success_2026-05-02T162923305Z_2026-05-02T162926620Z.json\nCreated V2 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_success_2026-05-02T162923305Z_2026-05-02T162926620Z.md" + }, + { + "case_id": "adapter_not_found", + "description": "Unsupported adapter should fail clearly.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Unsupported execute_harness adapter: not_real_adapter" + }, + { + "case_id": "capture_failed", + "description": "Completed execution without matching benchmark_run_id should fail capture.", + 
"passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default action capture capture_failed: No user_action_id found for benchmark_run_id=bench_v2_2_verify_capture__cost_sensitive_task_baseline_default_a1218a4838d8" + }, + { + "case_id": "ambiguous_capture", + "description": "Multiple user_action_id rows for one benchmark_run_id should fail capture.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default action capture ambiguous_capture: Multiple user_action_id values found for benchmark_run_id=bench_v2_2_verify_ambiguou_cost_sensitive_task_baseline_default_3c326d19fa92" + }, + { + "case_id": "variant_apply_failed", + "description": "Strict config snapshot check should fail before execution when the referenced snapshot is missing.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Variant apply failed: config_snapshot_ref does not exist: manual" + }, + { + "case_id": "scenario_missing", + "description": "Missing scenario manifest should fail before execution.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "Scenario not found: not_real_scenario" + }, + { + "case_id": "baseline_failure", + "description": "Baseline execution failure should stop the experiment.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "baseline scenario=cost_sensitive_task variant=baseline_default execute_harness failed: Fixture requested failure for variant baseline_default" + }, + { + "case_id": "candidate_failure", + "description": "Candidate execution failure should stop the experiment after the baseline succeeds.", + "passed": true, + "expected": "failure", + "status": 1, + "error_excerpt": "candidate scenario=cost_sensitive_task variant=candidate_session_memory_sparse execute_harness failed: Fixture requested failure for variant 
candidate_session_memory_sparse" + }, + { + "case_id": "disabled_fallback_to_bind_existing", + "description": "Automation can be disabled and fall back to bind_existing.", + "passed": true, + "expected": "success", + "status": 0, + "summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_2_verify_disabled_fallback_2026-05-02T162923305Z_2026-05-02T162933014Z.json", + "report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_disabled_fallback_2026-05-02T162923305Z_2026-05-02T162933014Z.md", + "artifacts_cleaned": true, + "error_excerpt": "Created V2 experiment summary: tests\\evals\\v2\\experiment-runs\\v2_2_verify_disabled_fallback_2026-05-02T162923305Z_2026-05-02T162933014Z.json\nCreated V2 experiment report: ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_2_verify_disabled_fallback_2026-05-02T162923305Z_2026-05-02T162933014Z.md" + } + ] +} From e79f308a17824ebd8a637adaa8e52696c2484c60 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sun, 3 May 2026 11:47:55 +0800 Subject: [PATCH 16/26] Add observability v2.3 batch robustness evaluation --- ...05\350\257\273\346\214\207\345\215\227.md" | 601 ++++ ...robustness_smoke_2026-05-02T183608080Z.md" | 43 + ...ndidate_session_memory_sparse_9c051f26.md" | 58 + ...candidate_eval_fixture_shadow_f8573444.md" | 58 + ...ndidate_session_memory_sparse_659719ae.md" | 58 + ...candidate_eval_fixture_shadow_0af9186b.md" | 58 + ...ndidate_session_memory_sparse_0c047aff.md" | 58 + ...candidate_eval_fixture_shadow_5cbe5887.md" | 58 + ...ndidate_session_memory_sparse_1bf4c32c.md" | 58 + ...candidate_eval_fixture_shadow_ef24adf5.md" | 58 + ...robustness_smoke_2026-05-02T183608080Z.md" | 222 ++ ...moke_minimal_baseline_default_604a7b67.md" | 66 + ...ndidate_session_memory_sparse_9c051f26.md" | 66 + ...candidate_eval_fixture_shadow_f8573444.md" | 66 + ...moke_minimal_baseline_default_31267657.md" | 66 + ...ndidate_session_memory_sparse_659719ae.md" | 66 + ...candidate_eval_fixture_shadow_0af9186b.md" | 66 + 
..._minimal_alt_baseline_default_5e2e7376.md" | 66 + ...ndidate_session_memory_sparse_0c047aff.md" | 66 + ...candidate_eval_fixture_shadow_5cbe5887.md" | 66 + ..._minimal_alt_baseline_default_c781769d.md" | 66 + ...ndidate_session_memory_sparse_1bf4c32c.md" | 66 + ...candidate_eval_fixture_shadow_ef24adf5.md" | 66 + scripts/evals/v2_harness_execution.ts | 125 +- scripts/evals/v2_record_run.ts | 7 + scripts/evals/v2_run_experiment.ts | 580 +++- .../evals/v2_validate_experiment_artifacts.ts | 12 + scripts/evals/v2_validate_manifests.ts | 22 + scripts/evals/v2_verify_bind_runner.ts | 37 + src/observability/v2/evalExperimentTypes.ts | 4 +- src/observability/v2/evalTypes.ts | 2 + tests/evals/v2/README.md | 30 +- .../v2/V2.2-execute_harness-alpha-usage.md | 2 +- tests/evals/v2/V2.3-batch-robustness-usage.md | 54 + ...obustness_smoke_2026-05-02T183608080Z.json | 2820 +++++++++++++++++ .../_experiment.robustness.smoke.json | 37 + ...aseline_default_2026-05-02T183554916Z.json | 33 + ..._fixture_shadow_2026-05-02T183554916Z.json | 33 + ...n_memory_sparse_2026-05-02T183554916Z.json | 33 + ...aseline_default_2026-05-02T183554916Z.json | 33 + ..._fixture_shadow_2026-05-02T183554916Z.json | 33 + ...n_memory_sparse_2026-05-02T183554916Z.json | 33 + ...oke_minimal_baseline_default_604a7b67.json | 117 + ...didate_session_memory_sparse_9c051f26.json | 118 + ...andidate_eval_fixture_shadow_f8573444.json | 120 + ...oke_minimal_baseline_default_31267657.json | 117 + ...didate_session_memory_sparse_659719ae.json | 118 + ...andidate_eval_fixture_shadow_0af9186b.json | 120 + ...minimal_alt_baseline_default_5e2e7376.json | 122 + ...didate_session_memory_sparse_0c047aff.json | 123 + ...andidate_eval_fixture_shadow_5cbe5887.json | 125 + ...minimal_alt_baseline_default_c781769d.json | 122 + ...didate_session_memory_sparse_1bf4c32c.json | 123 + ...andidate_eval_fixture_shadow_ef24adf5.json | 125 + .../robustness_smoke_minimal_alt.json | 29 + ...imal_baseline_default_604a7b67.scores.json | 52 + 
...session_memory_sparse_9c051f26.scores.json | 52 + ...e_eval_fixture_shadow_f8573444.scores.json | 52 + ...imal_baseline_default_31267657.scores.json | 52 + ...session_memory_sparse_659719ae.scores.json | 52 + ...e_eval_fixture_shadow_0af9186b.scores.json | 52 + ..._alt_baseline_default_5e2e7376.scores.json | 52 + ...session_memory_sparse_0c047aff.scores.json | 52 + ...e_eval_fixture_shadow_5cbe5887.scores.json | 52 + ..._alt_baseline_default_c781769d.scores.json | 52 + ...session_memory_sparse_1bf4c32c.scores.json | 52 + ...e_eval_fixture_shadow_ef24adf5.scores.json | 52 + .../candidate_eval_fixture_shadow.json | 12 + 68 files changed, 8131 insertions(+), 36 deletions(-) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" create mode 
100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" create mode 100644 tests/evals/v2/V2.3-batch-robustness-usage.md create mode 100644 tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-02T183608080Z.json create mode 100644 tests/evals/v2/experiments/_experiment.robustness.smoke.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z.json create mode 100644 
tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.json create mode 100644 
tests/evals/v2/runs/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.json create mode 100644 tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.scores.json create mode 100644 
tests/evals/v2/variants/candidate_eval_fixture_shadow.json diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" new file mode 100644 index 0000000000..3b5187cdc8 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" @@ -0,0 +1,601 @@ +# V2.3 版本项目介绍与阅读指南 + +## 理解清单 + +V2.3 的核心目标不是继续增加新评测维度,而是把 V2.2.5 已经跑通的单次真实实验,升级成可以批量运行、可以重复运行、可以观察稳定性的评测系统。 + +用最简单的话说: + +- V2.2.5 证明了“一个真实实验能闭合”。 +- V2.3 解决的是“多个任务、多种候选、多次重复跑,结果是否稳定”。 +- V2.3 的关键词是 `batch`、`repeat`、`run_group`、`stability`、`flaky`。 +- V2.3 仍然不做长上下文专项,也不做 tool/skill 价值专项。 + +## 预期效果 + +V2.3 完成后,你应该能做这样的事情: + +```text +同一组 scenario +-> baseline 跑多次 +-> candidate A 跑多次 +-> candidate B 跑多次 +-> 每次 run 都绑定 V1 事实证据 +-> 每个 scenario + variant 聚合成 run_group +-> 看稳定性、失败率、成本波动、路径波动 +-> 得到 batch report +``` + +这意味着你不再只能问: + +```text +这一次 candidate 是否比 baseline 好? +``` + +而是可以开始问: + +```text +candidate 在一批任务里是否整体更稳定? +它是不是偶尔很好、偶尔失败? +它是不是成本更低但波动更大? +它是不是在某些 scenario 上明显 flaky? 
+``` + +## 设计思路 + +V2.3 延续 V2 的基本原则:所有正式判断都必须回到 V1 事实证据。 + +所以 V2.3 没有绕开原来的 `run / score / compare / report` 管线,而是在它上面增加了一层聚合: + +```text +V1 evidence +-> V2 run +-> V2 score +-> compare report +-> run_group +-> stability summary +-> batch report +``` + +`run_group` 是 V2.3 的关键抽象。它不是替代 `run`,而是把同一个 `scenario_id + variant_id` 的多次 repeat 聚合起来。单次 run 仍然是最小事实单元,run_group 只是稳定性分析单元。 + +## 版本位置 + +当前版本链路可以这样理解: + +```text +V1:事实观测系统,记录 user_action / query / turn / tool / subagent / token / flow。 +V2.1:bind_existing runner,手动提供已有 user_action_id,生成 run/score/report。 +V2.2-alpha:execute_harness,自动执行 scenario,再用 benchmark_run_id 捕获 user_action_id。 +V2.2-beta:runtime contract、variant_effect_observed、experiment_validity。 +V2.2.5:真实实验闭合,自动 execute_harness 和 manual bind_existing fallback 两条路径都可用。 +V2.3:Batch + Robustness,多 scenario、多 candidate、repeat、run_group、稳定性摘要、flaky 标记。 +``` + +## 本轮完成内容 + +V2.3 已经完成以下能力: + +- 支持 `scenario_ids.length > 1`。 +- 支持 `candidate_variant_ids.length > 1`。 +- 支持 `repeat_count > 1`。 +- 每个 run 都带 `run_group_id`。 +- 每个 run 都带 `repeat_index`。 +- 每个 `scenario_id + variant_id` 生成一个 `run_group`。 +- 每个 run_group 生成稳定性指标。 +- 每个 run_group 生成 `flaky_status`。 +- experiment summary 里新增 batch 相关字段。 +- 额外生成 batch markdown report。 +- 新增无成本 `fixture_trace` adapter,用于验证 batch runner,不调用模型。 +- V2.1/V2.2 旧验证路径仍然可用。 + +## 本轮没有做什么 + +V2.3 明确没有做这些事情: + +- 没有进入 V2.4 长上下文评测。 +- 没有新增 tool/skill 价值专项指标。 +- 没有引入模型裁判。 +- 没有做远端任务调度。 +- 没有大改 V1 观测 schema。 +- 没有重做 risk verdict 语义。 +- 没有把 fixture smoke 当作真实 harness 价值结论。 + +## 核心对象模型 + +### scenario + +`scenario` 是一个评测任务。V2.3 支持一个 experiment 中包含多个 scenario。 + +相关目录: + +```text +tests/evals/v2/scenarios/ +``` + +本轮新增示例: + +```text +tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json +``` + +### variant + +`variant` 是一套待比较的 harness / config / feature gate / model 配置。V2.3 支持一个 experiment 中包含多个 candidate variant。 + +相关目录: + +```text +tests/evals/v2/variants/ +``` + +本轮新增示例: + +```text 
+tests/evals/v2/variants/candidate_eval_fixture_shadow.json +``` + +这个 variant 只用于 fixture smoke,不代表真实产品 harness 改动。 + +### run + +`run` 是一次具体执行结果,是 V2 的最小事实单元。 + +V2.3 为 run 增加了两个字段: + +```text +run_group_id +repeat_index +``` + +相关目录: + +```text +tests/evals/v2/runs/ +``` + +### run_group + +`run_group` 是 V2.3 新增的聚合单元。 + +一个 run_group 对应: + +```text +experiment_id + scenario_id + variant_id +``` + +它包含这个 scenario/variant 在本次 experiment 中的所有 repeat。 + +相关目录: + +```text +tests/evals/v2/run-groups/ +``` + +run_group 的核心字段包括: + +```text +run_group_id +experiment_id +scenario_id +variant_id +repeat_count +run_ids +status +started_at +ended_at +aggregate_summary_ref +stability_metrics +flaky_status +failures +``` + +### experiment summary + +experiment summary 是一次 experiment 的总 JSON 产物。 + +V2.3 新增字段包括: + +```text +run_group_refs +stability_summary +flaky_scenarios +run_failures +runner.v2_3_batch_capabilities +``` + +相关目录: + +```text +tests/evals/v2/experiment-runs/ +``` + +### batch report + +batch report 是 V2.3 新增的人类可读报告。 + +命名格式: + +```text +batch_experiment_<experiment_id>_<timestamp>.md +``` + +相关目录: + +```text +ObservrityTask/10-系统版本/v2/06-运行报告/ +``` + +## 稳定性指标 + +V2.3 第一版稳定性指标刻意保持简单,不做复杂统计。 + +当前 run_group 会计算: + +```text +repeat_success_rate +capture_failure_rate +total_billed_tokens_mean +total_billed_tokens_min +total_billed_tokens_max +total_billed_tokens_stddev +e2e_duration_mean +e2e_duration_min +e2e_duration_max +e2e_duration_stddev +tool_call_count_variance +subagent_count_variance +turn_count_variance +recovery_rate +``` + +这些指标主要回答: + +- 多次 repeat 是否都成功? +- capture 是否稳定? +- token 成本是否波动? +- 总耗时是否波动? +- tool 调用路径是否波动? +- subagent 路径是否波动? +- turn 数是否波动? +- 是否出现 recovery? 
+ +## flaky_status + +V2.3 对每个 run_group 给出一个粗粒度 `flaky_status`。 + +当前状态包括: + +```text +stable +flaky +unstable +inconclusive +``` + +含义如下: + +- `stable`:所有 repeat 成功,粗粒度波动低。 +- `flaky`:部分 repeat 失败,或 token/tool/subagent/turn 波动较大。 +- `unstable`:没有成功 repeat。 +- `inconclusive`:repeat 太少,暂时不能判断稳定性。 + +这个标记是工程信号,不是最终质量裁判。 + +## 执行流程 + +V2.3 的 execute_harness batch 流程如下: + +```text +读取 experiment manifest +-> 遍历 scenario_ids +-> 遍历 repeat_index +-> 执行 baseline +-> 为 baseline 记录 run +-> 遍历 candidate_variant_ids +-> 执行 candidate +-> 为 candidate 记录 run +-> 生成 compare report +-> 所有 run 完成后聚合 run_group +-> 写入 stability_summary +-> 写入 batch report +-> 写入 experiment summary +``` + +每一次自动执行仍然使用: + +```text +benchmark_run_id -> user_action_id +``` + +这保证了 V2.3 没有回退到“取最新 action”这种不可靠绑定。 + +## failure_policy + +V2.3 在 `execution` 中支持: + +```text +failure_policy = fail_fast | continue_on_failure +``` + +含义: + +- `fail_fast`:遇到失败直接终止 experiment。 +- `continue_on_failure`:记录失败,继续执行后续 scenario / repeat / candidate。 + +batch 场景下,`continue_on_failure` 很重要。因为一个 scenario 失败,不应该直接污染或阻断其它 scenario 的稳定性统计。 + +## fixture_trace adapter + +V2.3 新增了 `fixture_trace` adapter。 + +它的目的不是模拟模型能力,而是验证 batch runner 的机制: + +- 不调用真实模型。 +- 不消耗真实 token。 +- 写入最小 DuckDB 事实表。 +- 生成可 capture 的 `benchmark_run_id`。 +- 让 runner 继续走正式 `record_run / score / compare / run_group / report` 管线。 + +它适合做无成本 smoke,不适合用来判断真实 harness 改动价值。 + +## 当前 smoke + +V2.3 当前无成本 smoke manifest: + +```text +tests/evals/v2/experiments/_experiment.robustness.smoke.json +``` + +它覆盖: + +```text +2 scenarios +1 baseline +2 candidates +repeat_count = 2 +``` + +所以一次完整 smoke 会产生: + +```text +2 scenario * 3 variant * 2 repeat = 12 runs +``` + +并聚合成: + +```text +2 scenario * 3 variant = 6 run_groups +``` + +## 如何运行 V2.3 smoke + +命令: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.robustness.smoke.json +``` + +注意: + +在当前 Windows/Bun 环境中,如果沙箱限制阻止 `duckdb.exe` 子进程执行,需要在允许本地子进程的环境中运行。 + 
+## 最新验证产物 + +最近一次成功的 V2.3 smoke 产物: + +```text +tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-02T183608080Z.json +``` + +对应 batch report: + +```text +ObservrityTask/10-系统版本/v2/06-运行报告/batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md +``` + +对应 run_group 目录: + +```text +tests/evals/v2/run-groups/ +``` + +## 怎么读 V2.3 结果 + +建议按这个顺序读: + +1. 先打开 experiment summary JSON。 +2. 看 `mode` 是否为 `execute_harness`。 +3. 看 `run_refs` 数量是否符合预期。 +4. 看 `run_group_refs` 数量是否符合预期。 +5. 看 `stability_summary`。 +6. 看 `flaky_scenarios`。 +7. 再打开 batch report。 +8. 最后只在需要排查时打开单个 run JSON。 + +对于当前 smoke,重点检查: + +```text +run_refs.length = 12 +run_group_refs.length = 6 +所有 run_group.repeat_success_rate = 1 +所有 run_group.capture_failure_rate = 0 +所有 run_group.flaky_status = stable +``` + +## batch report 怎么读 + +batch report 中最重要的是 `Batch Stability Table`。 + +它按 `scenario + variant` 展示: + +```text +repeat count +success rate +token mean +token stddev +duration mean +duration stddev +tool variance +subagent variance +turn variance +recovery rate +flaky status +``` + +如果只是快速判断系统有没有跑通,看三列就够: + +```text +success_rate +capture_failure_rate +flaky_status +``` + +如果要判断 candidate 是否稳定,再看: + +```text +token_stddev +tool_variance +subagent_variance +turn_variance +``` + +## 与 V2.2.5 的关系 + +V2.2.5 解决的是: + +```text +真实实验能不能闭合? +``` + +V2.3 解决的是: + +```text +真实实验能不能批量、重复、稳定地闭合? 
+``` + +所以 V2.3 不是替代 V2.2.5,而是在 V2.2.5 之上增加稳定性判断。 + +V2.2.5 的真实 session_memory 实验仍然是当前最重要的真实实验样例。 + +V2.3 当前新增的 robustness smoke 主要是机制验证,不是新的真实 harness 价值实验。 + +## 与 V2.4 的边界 + +V2.4 计划进入长上下文评测。 + +但 V2.4 应该建立在 V2.3 之上,因为长上下文任务天然更容易出现: + +- 高 token 成本 +- 高延时 +- 结果波动 +- 约束丢失 +- 被干扰信息带偏 +- compaction 行为差异 + +如果没有 V2.3 的 repeat/run_group/stability 能力,长上下文评测很容易只得到“某一次看起来不错”的偶然结果。 + +所以 V2.3 是 V2.4 的稳定性地基。 + +## 当前验收状态 + +已通过的验证: + +```powershell +bun run typecheck +bun run scripts/evals/v2_validate_manifests.ts +bun run scripts/evals/v2_validate_experiment_artifacts.ts +bun run scripts/evals/v2_verify_bind_runner.ts +bun run scripts/evals/v2_verify_execute_harness_alpha.ts +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.robustness.smoke.json +``` + +已验证能力: + +- `repeat_count > 1`:通过。 +- 多 scenario:通过。 +- 多 candidate:通过。 +- 每个 run 有唯一 `benchmark_run_id`:通过。 +- 每个 run 可 fact-only capture:通过。 +- run_group 生成:通过。 +- stability summary 生成:通过。 +- flaky scenario 标记:通过。 +- bind_existing 仍可用:通过。 +- execute_harness 仍可用:通过。 +- smoke / real_experiment 分层仍保留:通过。 + +## 当前风险和限制 + +V2.3 当前还有这些限制: + +- `flaky_status` 是第一版启发式,不是严格统计检验。 +- 目前只跑了 fixture smoke,没有跑真实模型 batch。 +- 真实 batch 会明显消耗 token,需要先控制 scenario 数和 repeat 数。 +- 当前 batch ranking 只适合辅助阅读,不是最终决策。 +- `fixture_trace` 只证明 runner 机制,不证明 harness 改动收益。 +- V2.3 没有解决长上下文任务中的 constraint retention 问题,那是 V2.4 范围。 + +## 下一步建议 + +进入 V2.4 前,建议先做一个很小的真实 batch: + +```text +1 real scenario +1 baseline +1 candidate +repeat_count = 2 或 3 +``` + +目标不是证明大结论,而是确认真实模型链路下: + +- run_group 是否稳定生成; +- repeat 成本是否合理; +- capture 是否稳定; +- `flaky_status` 是否有解释力; +- batch report 是否真的能帮助阅读。 + +如果这个小型真实 batch 结果可读,再进入 V2.4 长上下文会更稳。 + +## 文件地图 + +核心实现: + +```text +scripts/evals/v2_run_experiment.ts +scripts/evals/v2_harness_execution.ts +scripts/evals/v2_record_run.ts +scripts/evals/v2_validate_manifests.ts +scripts/evals/v2_validate_experiment_artifacts.ts +scripts/evals/v2_verify_bind_runner.ts 
+src/observability/v2/evalTypes.ts +src/observability/v2/evalExperimentTypes.ts +``` + +V2.3 文档: + +```text +tests/evals/v2/V2.3-batch-robustness-usage.md +ObservrityTask/10-系统版本/v2/01-总览/V2.3版本项目介绍与阅读指南.md +``` + +V2.3 smoke 输入: + +```text +tests/evals/v2/experiments/_experiment.robustness.smoke.json +tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json +tests/evals/v2/variants/candidate_eval_fixture_shadow.json +``` + +V2.3 输出: + +```text +tests/evals/v2/runs/ +tests/evals/v2/scores/ +tests/evals/v2/run-groups/ +tests/evals/v2/experiment-runs/ +ObservrityTask/10-系统版本/v2/06-运行报告/ +``` + +## 一句话总结 + +V2.3 把你的评测系统从“能跑一次真实实验”推进到了“能组织一批可重复实验,并用稳定性指标判断结果是否可靠”的阶段。它不是更花哨的 dashboard,而是进入 V2.4 长上下文评测和后续 skill/tool 价值评测之前必须具备的实验基础设施。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" new file mode 100644 index 0000000000..d931779278 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" @@ -0,0 +1,43 @@ +# V2.3 Batch Experiment Summary: v2_3_robustness_smoke + +## Understanding + +- experiment: v2_3_robustness_smoke +- mode: execute_harness +- scenario_count: 2 +- candidate_count: 2 +- repeat_count: 2 +- output_json: tests\evals\v2\experiment-runs\v2_3_robustness_smoke_2026-05-02T183608080Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | 
---: | ---: | ---: | --- | +| execute_harness_smoke_minimal | baseline_default | 2 | 1 | 110 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | baseline_default | 2 | 1 | 110 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_session_memory_sparse | execute_harness_smoke_minimal | 1 | 100 | stable | +| 2 | candidate_session_memory_sparse | robustness_smoke_minimal_alt | 1 | 100 | stable | +| 3 | candidate_eval_fixture_shadow | execute_harness_smoke_minimal | 1 | 105 | stable | +| 4 | candidate_eval_fixture_shadow | robustness_smoke_minimal_alt | 1 | 105 | stable | + +## Flaky Scenario Notes + +- No flaky run group detected by the current V2.3 heuristic. + +## Run Failures + +- No run failures recorded. + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" new file mode 100644 index 0000000000..a417c94de3 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67 +- candidate_run: run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26 +- scenario: execute_harness_smoke_minimal +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 604a7b67-9437-43a4-aeee-45e84f75fef1 +- candidate_user_action_id: 9c051f26-951b-4525-98e1-36e769791384 +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. 
+- Scenario note: n/a diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" new file mode 100644 index 0000000000..c2235b37f2 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67 +- candidate_run: run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444 +- scenario: execute_harness_smoke_minimal +- baseline_variant: baseline_default +- candidate_variant: candidate_eval_fixture_shadow + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 604a7b67-9437-43a4-aeee-45e84f75fef1 +- candidate_user_action_id: f8573444-aa1c-4c0f-980b-81d8d1e5ddcb +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. 
+- Scenario note: n/a diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" new file mode 100644 index 0000000000..6d53193001 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657 +- candidate_run: run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae +- scenario: execute_harness_smoke_minimal +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 31267657-6e21-4cac-80ab-da7d55690e5b +- candidate_user_action_id: 659719ae-5215-4efc-bedc-c626af0161bd +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. 
+- Scenario note: n/a diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" new file mode 100644 index 0000000000..404ccb950e --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657 +- candidate_run: run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b +- scenario: execute_harness_smoke_minimal +- baseline_variant: baseline_default +- candidate_variant: candidate_eval_fixture_shadow + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 31267657-6e21-4cac-80ab-da7d55690e5b +- candidate_user_action_id: 0af9186b-081f-43a8-be0f-7f4f67c17416 +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. 
+- Scenario note: n/a diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" new file mode 100644 index 0000000000..4349c936b3 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376 +- candidate_run: run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff +- scenario: robustness_smoke_minimal_alt +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6 +- candidate_user_action_id: 0c047aff-f3e6-4a2b-9c4d-4a3e9523315b +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a runner smoke scenario, not a qualitative harness evaluation. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" new file mode 100644 index 0000000000..e944f372b2 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376 +- candidate_run: run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887 +- scenario: robustness_smoke_minimal_alt +- baseline_variant: baseline_default +- candidate_variant: candidate_eval_fixture_shadow + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6 +- candidate_user_action_id: 5cbe5887-4214-4541-acf8-6333218aed6d +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a runner smoke scenario, not a qualitative harness evaluation. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" new file mode 100644 index 0000000000..626b168b3c --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d +- candidate_run: run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c +- scenario: robustness_smoke_minimal_alt +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: c781769d-13e2-4389-89bb-80fd0fa48cc9 +- candidate_user_action_id: 1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3 +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a runner smoke scenario, not a qualitative harness evaluation. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" new file mode 100644 index 0000000000..a5d183bde9 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" @@ -0,0 +1,58 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d +- candidate_run: run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5 +- scenario: robustness_smoke_minimal_alt +- baseline_variant: baseline_default +- candidate_variant: candidate_eval_fixture_shadow + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: c781769d-13e2-4389-89bb-80fd0fa48cc9 +- candidate_user_action_id: ef24adf5-89d3-4024-87cd-14db5f49e20d +- runtime_difference_observed: false + +## Variant Effect Evidence + +- baseline_policy_event_observed: false +- candidate_policy_event_observed: false +- candidate_variant_effect_observed: false +- baseline_policy_mode: unknown +- candidate_policy_mode: unknown +- baseline_session_memory_subagent_count: 0 +- candidate_session_memory_subagent_count: 0 + +## Runtime Difference Summary + +- Baseline session_memory policy was not observed. +- Candidate session_memory policy was not observed. +- Candidate sparse runtime markers were not observed. +- No stable runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[none], candidate=[none]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Interpretation Limits + +- Candidate runtime effect was not observed cleanly enough; score deltas may be noise rather than proof of harness value. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: This is a runner smoke scenario, not a qualitative harness evaluation. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" new file mode 100644 index 0000000000..f58cff4d3b --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" @@ -0,0 +1,222 @@ +# V2 Experiment Summary: v2_3_robustness_smoke + +## Understanding + +- experiment: v2_3_robustness_smoke +- mode: execute_harness +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse, candidate_eval_fixture_shadow +- scenario_count: 2 +- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.subagent_count_observed, stability.recovery_absence, controllability.turn_limit_basic +- gate_policy: default_v2_1_gate +- output_json: tests\evals\v2\experiment-runs\v2_3_robustness_smoke_2026-05-02T183608080Z.json + +## Expected Outcome + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2 executes the scenario first, then captures the generated user_action_id through benchmark_run_id. + +## Design Rationale + +The runner always scores only trace-backed V1 facts. V2.2-beta adds runtime-effect evidence and experiment-validity semantics so smoke and real experiments are not confused with each other. + +## Smoke Check + +- requested_mode: execute_harness +- execute_harness_loop_closed: true +- note: This profile validates the automatic pipeline, not harness value. 
+ +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 0 +- risk_status: pass +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: regression_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Variant Effect Evidence + +- execute_harness_smoke_minimal / candidate_session_memory_sparse: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- execute_harness_smoke_minimal / candidate_session_memory_sparse: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: baseline_mode=unknown, candidate_mode=unknown, candidate_effect_observed=false, runtime_difference_observed=false + +## Experiment Validity + +- status: valid +- profile: smoke +- baseline_captured: true +- candidate_captured: true +- no_ambiguous_capture: true +- score_evidence_present: true +- 
variant_effect_observed: false +- runtime_difference_observed: false +- scenario_intent_matched: true +- reason: Smoke check remains healthy. + +- No additional blockers or warnings. + +## Runtime Difference Summary + +- execute_harness_smoke_minimal / candidate_session_memory_sparse: Baseline session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: Candidate session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: Baseline session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: Candidate session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: At least one score dimension changed between baseline and candidate. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: Baseline session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: Candidate session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. +- execute_harness_smoke_minimal / candidate_session_memory_sparse: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. 
+- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: Baseline session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: Candidate session_memory policy was not observed in V1 events. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: At least one score dimension changed between baseline and candidate. +- execute_harness_smoke_minimal / candidate_eval_fixture_shadow: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: Baseline session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: Candidate session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: Baseline session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: Candidate session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: At least one score dimension changed between baseline and candidate. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: Baseline session_memory policy was not observed in V1 events. 
+- robustness_smoke_minimal_alt / candidate_session_memory_sparse: Candidate session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. +- robustness_smoke_minimal_alt / candidate_session_memory_sparse: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: Baseline session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: Candidate session_memory policy was not observed in V1 events. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: At least one score dimension changed between baseline and candidate. +- robustness_smoke_minimal_alt / candidate_eval_fixture_shadow: No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect. + +## V2.3 Batch Robustness + +- batch_report: ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md +- run_group_count: 6 +- run_failure_count: 0 + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | --- | +| execute_harness_smoke_minimal | baseline_default | 2 | 1 | 110 | 0 | stable | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | stable | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | stable | +| robustness_smoke_minimal_alt | baseline_default | 2 | 1 | 110 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | stable | + +### Run Failures + +- No run failures recorded. 
+ +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | 
candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| 
robustness_smoke_minimal_alt | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | efficiency.total_billed_tokens | 110 | 100 | -10 | improved | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | decision_quality.subagent_count_observed | 0 | 0 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | efficiency.total_billed_tokens | 110 | 105 | -5 | improved | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- 1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer. 
+ +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | experiment_validity | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | --- | +| execute_harness_smoke_minimal | 1 | run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67 | candidate_session_memory_sparse | run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26 | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md | +| execute_harness_smoke_minimal | 1 | run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67 | candidate_eval_fixture_shadow | run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444 | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md | +| execute_harness_smoke_minimal | 2 | run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657 | candidate_session_memory_sparse | run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md | +| execute_harness_smoke_minimal | 2 | run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657 | candidate_eval_fixture_shadow | run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b | valid | 0/4 not passed | 
ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md | +| robustness_smoke_minimal_alt | 1 | run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376 | candidate_session_memory_sparse | run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md | +| robustness_smoke_minimal_alt | 1 | run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376 | candidate_eval_fixture_shadow | run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887 | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md | +| robustness_smoke_minimal_alt | 2 | run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d | candidate_session_memory_sparse | run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c | valid | 0/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md | +| robustness_smoke_minimal_alt | 2 | run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d | candidate_eval_fixture_shadow | run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5 | valid | 0/4 
not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | hard_fail | task_success.main_chain_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| execute_harness_smoke_minimal | 
candidate_eval_fixture_shadow | hard_fail | task_success.main_chain_observed | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | hard_fail | task_success.main_chain_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | soft_warning | decision_quality.subagent_count_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | 
pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | hard_fail | task_success.main_chain_observed | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | soft_warning | decision_quality.subagent_count_observed | pass | 0 | + +## Interpretation Limits + +- Smoke only proves the automatic execute_harness -> capture -> run/score/report loop is healthy. +- Smoke does not prove a candidate harness change is beneficial. diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.md" new file mode 100644 index 0000000000..2f83e21814 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67 + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: baseline_default (Baseline Default) +- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: 604a7b67-9437-43a4-aeee-45e84f75fef1 +- root_query_id: eb99485a-4783-45c5-b3b5-0a95ce68ccd4 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 
+ +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:35:54.924Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 100 +- total_billed_tokens: 110 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (110) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" new file mode 100644 index 0000000000..0cbe01125a --- /dev/null +++ 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26 + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: 9c051f26-951b-4525-98e1-36e769791384 +- root_query_id: 3906aa11-8018-49c5-ac3a-b916513e1236 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:35:56.001Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 90 +- total_billed_tokens: 100 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (100) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" new file mode 100644 index 0000000000..64b1d3bb9c --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444 + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: candidate_eval_fixture_shadow (Candidate Eval Fixture Shadow) +- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: f8573444-aa1c-4c0f-980b-81d8d1e5ddcb +- root_query_id: bd334a3c-e2ef-405e-8de7-ab0771e889bd +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:35:57.164Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 95 +- total_billed_tokens: 105 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (105) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.md" new file mode 100644 index 0000000000..9a82bb166d --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657 + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: baseline_default (Baseline Default) 
+- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: 31267657-6e21-4cac-80ab-da7d55690e5b +- root_query_id: ff52a587-6842-4fa6-a0d7-82537d11049a +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:35:58.306Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 100 +- total_billed_tokens: 110 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (110) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" new file mode 100644 index 0000000000..cd8d6605aa --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke Minimal) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: 659719ae-5215-4efc-bedc-c626af0161bd +- root_query_id: b8547936-74ae-453d-8955-9e4a4fd1b388 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:35:59.290Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 90 +- total_billed_tokens: 100 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (100) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" new file mode 100644 index 0000000000..fe8d96a17e --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b + +## 理解清单 + +- scenario: execute_harness_smoke_minimal (Execute Harness Smoke 
Minimal) +- variant: candidate_eval_fixture_shadow (Candidate Eval Fixture Shadow) +- run_group_id: group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: 0af9186b-081f-43a8-be0f-7f4f67c17416 +- root_query_id: a59382a2-80e4-4593-80f2-e416634ff888 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:00.396Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 95 +- total_billed_tokens: 105 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (105) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.md" new file mode 100644 index 0000000000..4933016698 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376 + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke Minimal Alt) +- variant: baseline_default (Baseline Default) +- run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: 5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6 +- root_query_id: 19e5257b-24f7-4ceb-ad92-30837387e139 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:01.515Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 100 +- total_billed_tokens: 110 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (110) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" new file mode 100644 index 0000000000..ffc70b4127 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke 
Minimal Alt) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: 0c047aff-f3e6-4a2b-9c4d-4a3e9523315b +- root_query_id: b2728007-19b0-453b-9283-8b8b3fd4b3f0 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:02.529Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 90 +- total_billed_tokens: 100 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (100) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" new file mode 100644 index 0000000000..b80fac0d4a --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887 + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke Minimal Alt) +- variant: candidate_eval_fixture_shadow (Candidate Eval Fixture Shadow) +- run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z +- repeat_index: 1 +- user_action_id: 5cbe5887-4214-4541-acf8-6333218aed6d +- root_query_id: 8987783a-22a5-4b21-8e59-2f87b4de19af +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:03.663Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 95 +- total_billed_tokens: 105 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (105) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.md" new file mode 100644 index 0000000000..2b1ca4ae65 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke Minimal Alt) +- variant: baseline_default (Baseline Default) +- 
run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: c781769d-13e2-4389-89bb-80fd0fa48cc9 +- root_query_id: 03eae129-e46b-4a2b-b590-6760260dab08 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:04.810Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 100 +- total_billed_tokens: 110 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (110) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" new file mode 100644 index 0000000000..37a322a483 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke Minimal Alt) +- variant: candidate_session_memory_sparse (Candidate Session Memory Sparse) +- run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: 1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3 +- root_query_id: 72bf3b7e-d2d7-45f0-9607-6fbe6fe24021 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. 
+ +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:05.821Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 90 +- total_billed_tokens: 100 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. + +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (100) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" new file mode 100644 index 0000000000..31f3961adb --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md" @@ -0,0 +1,66 @@ +# V2 Run Report: run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5 + +## 理解清单 + +- scenario: robustness_smoke_minimal_alt (Robustness Smoke Minimal Alt) +- 
variant: candidate_eval_fixture_shadow (Candidate Eval Fixture Shadow) +- run_group_id: group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z +- repeat_index: 2 +- user_action_id: ef24adf5-89d3-4024-87cd-14db5f49e20d +- root_query_id: 10f63fde-e69e-4e42-9113-31d6ea626479 +- observability_db_ref: .observability\v2-robustness-smoke.duckdb + +## 预期效果 + +This report binds one V2 run back to V1 evidence, then emits phase-one rule and structure scores. + +## 设计思路 + +The report does not judge final answer quality by itself. It records trace-backed facts that can support baseline vs candidate comparison. + +## V1 Evidence + +- binding_mode: fact_only +- bind_passed: true +- binding_failure_reason: n/a +- started_at: 2026-05-02T18:36:06.949Z +- duration_ms: 10 +- query_count: 1 +- subagent_count: 0 +- tool_call_count: 0 +- total_prompt_input_tokens: 95 +- total_billed_tokens: 105 +- root_turn_count: 1 +- root_terminal_reason: fixture_completed +- recovery_count: 0 + +## Tools + +- No tools observed + +## Subagents + +- No subagents observed + +## Variant Effect Evidence + +- effect_type: session_memory_policy +- policy_event_observed: false +- variant_effect_observed: false +- session_memory_subagent_count: 0 +- session_memory_trigger_details: none +- reason: No session-memory policy observation event was found for this run. 
+ +### Observed Policy + +```json +null +``` + +## Scores + +- task_success.main_chain_observed: pass (1) +- efficiency.total_billed_tokens: observed (105) +- decision_quality.subagent_count_observed: observed (0) +- stability.recovery_absence: pass (1) +- controllability.turn_limit_basic: pass (1) diff --git a/scripts/evals/v2_harness_execution.ts b/scripts/evals/v2_harness_execution.ts index 06abbe2de9..99c83a4184 100644 --- a/scripts/evals/v2_harness_execution.ts +++ b/scripts/evals/v2_harness_execution.ts @@ -1,5 +1,5 @@ import { spawnSync } from 'node:child_process' -import { createHash } from 'node:crypto' +import { createHash, randomUUID } from 'node:crypto' import { existsSync } from 'node:fs' import { mkdir, readFile, writeFile } from 'node:fs/promises' import path from 'node:path' @@ -76,6 +76,20 @@ function sqlString(value: string): string { return `'${value.replaceAll("'", "''")}'` } +function runDuckDbSql(dbPath: string, sql: string): void { + const result = spawnSync(duckdbExe, [dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + throw new Error( + String(result.stderr ?? '').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim(), + ) + } +} + function sanitizeId(value: string): string { return value.replace(/[^a-zA-Z0-9_-]+/g, '_').replace(/^_+|_+$/g, '') } @@ -215,10 +229,13 @@ export function createRunIdentity(params: { scenarioId: string variantId: string stamp: string + repeatIndex?: number }): { eval_run_id: string; benchmark_run_id: string } { - const base = `${params.experimentId}_${params.scenarioId}_${params.variantId}_${params.stamp}` + const repeatPart = + typeof params.repeatIndex === 'number' ? 
`_repeat_${params.repeatIndex}` : '' + const base = `${params.experimentId}_${params.scenarioId}_${params.variantId}${repeatPart}_${params.stamp}` const humanPrefix = sanitizeId( - `${params.experimentId.slice(0, 20)}_${params.scenarioId.slice(0, 20)}_${params.variantId.slice(0, 20)}`, + `${params.experimentId.slice(0, 20)}_${params.scenarioId.slice(0, 20)}_${params.variantId.slice(0, 20)}${repeatPart}`, ) const hash = createHash('sha1').update(base).digest('hex').slice(0, 12) const identity = `${humanPrefix}_${hash}` @@ -484,6 +501,106 @@ export class CliPrintHarnessExecutionAdapter implements HarnessExecutionAdapter } } +export class FixtureTraceHarnessExecutionAdapter implements HarnessExecutionAdapter { + constructor( + private readonly options: { + execution?: EvalExperimentExecutionConfig + env: Record + }, + ) {} + + async execute(input: HarnessExecutionAdapterInput): Promise { + const runDir = path.join(harnessRunsRoot, artifactRunDirName(input.runId)) + await mkdir(runDir, { recursive: true }) + const stdoutPath = path.join(runDir, 'stdout.txt') + const stderrPath = path.join(runDir, 'stderr.txt') + const commandPath = path.join(runDir, 'command.json') + const dbPath = path.resolve( + repoRoot, + this.options.execution?.db_path ?? + this.options.env.V2_FIXTURE_DB_PATH ?? 
+ path.join('.observability', 'v2-fixture-trace.duckdb'), + ) + + await writeFile( + commandPath, + `${JSON.stringify( + { + adapter: 'fixture_trace', + db_path: path.relative(repoRoot, dbPath), + timeout_ms: input.timeoutMs, + env_keys: Object.keys(this.options.env).sort(), + }, + null, + 2, + )}\n`, + 'utf8', + ) + + if (this.options.env.V2_FIXTURE_FAIL_VARIANT === input.variantId) { + const message = `Fixture requested failure for variant ${input.variantId}` + await writeFile(stdoutPath, '', 'utf8') + await writeFile(stderrPath, message, 'utf8') + return { + status: 'failed', + stdoutRef: path.relative(repoRoot, stdoutPath), + stderrRef: path.relative(repoRoot, stderrPath), + error: message, + } + } + + const now = new Date() + const endedAt = new Date(now.getTime() + 10).toISOString() + const userActionId = randomUUID() + const queryId = randomUUID() + const benchmarkRunId = this.options.env.CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID + const evalRunId = this.options.env.CLAUDE_CODE_EVAL_RUN_ID + const experimentId = + this.options.env.CLAUDE_CODE_EVAL_EXPERIMENT_LABEL ?? input.experimentId + const scenarioId = this.options.env.CLAUDE_CODE_EVAL_SCENARIO_LABEL ?? input.scenarioId + const variantId = this.options.env.CLAUDE_CODE_EVAL_VARIANT_LABEL ?? input.variantId + const tokenBase = + input.variantId === 'baseline_default' + ? 110 + : input.variantId.includes('sparse') + ? 
100 + : 105 + + const sql = [ + 'CREATE TABLE IF NOT EXISTS user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, experiment_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, benchmark_run_id VARCHAR, eval_run_id VARCHAR, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', + 'CREATE TABLE IF NOT EXISTS queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR, turn_count BIGINT, terminal_reason VARCHAR);', + 'CREATE TABLE IF NOT EXISTS tools(user_action_id VARCHAR, tool_name VARCHAR, is_closed BOOLEAN, has_failed BOOLEAN);', + 'CREATE TABLE IF NOT EXISTS subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', + 'CREATE TABLE IF NOT EXISTS recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', + 'CREATE TABLE IF NOT EXISTS metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', + `INSERT INTO user_actions VALUES (${sqlString(now.toISOString().slice(0, 10))}, ${sqlString(userActionId)}, ${sqlString(now.toISOString())}, 0, ${sqlString(endedAt)}, 10, 10, 2, 1, 1, 0, 0, 0, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, ${tokenBase - 10}, 10, 0, 0, ${tokenBase - 10}, ${tokenBase}, ${tokenBase - 10}, 0);`, + `INSERT INTO queries VALUES (${sqlString(queryId)}, 
${sqlString(userActionId)}, 'main_thread', ${sqlString(now.toISOString())}, 1, 'fixture_completed');`, + `INSERT INTO metrics_integrity_daily VALUES (${sqlString(now.toISOString().slice(0, 10))}, 1, 1, 1, 1);`, + ].join('\n') + + try { + runDuckDbSql(dbPath, sql) + await writeFile(stdoutPath, `fixture_user_action_id=${userActionId}\n`, 'utf8') + await writeFile(stderrPath, '', 'utf8') + return { + status: 'completed', + stdoutRef: path.relative(repoRoot, stdoutPath), + stderrRef: path.relative(repoRoot, stderrPath), + } + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + await writeFile(stdoutPath, '', 'utf8') + await writeFile(stderrPath, message, 'utf8') + return { + status: 'failed', + stdoutRef: path.relative(repoRoot, stdoutPath), + stderrRef: path.relative(repoRoot, stderrPath), + error: message, + } + } + } +} + export function createHarnessExecutionAdapter(params: { execution?: EvalExperimentExecutionConfig env: Record @@ -492,6 +609,7 @@ export function createHarnessExecutionAdapter(params: { const adapter = params.execution?.adapter ?? 
'cli_print' if (adapter === 'disabled') return new DisabledHarnessExecutionAdapter() if (adapter === 'cli_print') return new CliPrintHarnessExecutionAdapter(params) + if (adapter === 'fixture_trace') return new FixtureTraceHarnessExecutionAdapter(params) throw new Error(`Unsupported execute_harness adapter: ${adapter}`) } @@ -603,6 +721,7 @@ export async function executeHarnessAndCapture(params: { }) const shouldRebuildDb = execution.status === 'completed' && + params.execution?.adapter !== 'fixture_trace' && (!params.dbPath || (!params.execution?.command && !params.execution?.args)) diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index 66df7f1a76..d517487a9c 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -262,6 +262,8 @@ function buildReport(params: { - scenario: ${scenario.scenario_id} (${scenario.name}) - variant: ${variant.variant_id} (${variant.name}) +- run_group_id: ${run.run_group_id ?? 'none'} +- repeat_index: ${run.repeat_index ?? 'none'} - user_action_id: ${run.entry_user_action_id ?? 'unknown'} - root_query_id: ${run.root_query_id ?? 'unknown'} - observability_db_ref: ${run.observability_db_ref ?? 'unknown'} @@ -323,6 +325,9 @@ async function main(): Promise { const args = parseArgs(process.argv.slice(2)) const scenarioId = String(args.scenario ?? '') const variantId = String(args.variant ?? 'baseline_default') + const runGroupId = String(args['run-group-id'] ?? '') + const repeatIndex = + args['repeat-index'] === undefined ? undefined : asNumber(args['repeat-index']) const sourceDbPath = String(args.db ?? defaultDbPath) const dbPath = await resolveReadableDbPath( sourceDbPath, @@ -440,6 +445,8 @@ async function main(): Promise { run_id: runId, scenario_id: scenario.scenario_id, variant_id: variant.variant_id, + ...(runGroupId ? { run_group_id: runGroupId } : {}), + ...(repeatIndex !== undefined ? 
{ repeat_index: repeatIndex } : {}), started_at: asString(action.started_at), ended_at: asString(action.ended_at), status: 'completed', diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index 91a4692889..b224a4d12c 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -67,6 +67,7 @@ interface ExperimentValidity { interface CandidateExperimentResult { candidate_variant_id: string + candidate_run_group_id: string candidate_run_id: string candidate_user_action_id: string candidate_eval_run_id?: string @@ -86,6 +87,7 @@ interface CandidateExperimentResult { interface ScenarioExperimentResult { scenario_id: string repeat_index: number + baseline_run_group_id: string baseline_run_id: string baseline_user_action_id: string baseline_eval_run_id?: string @@ -94,6 +96,48 @@ interface ScenarioExperimentResult { candidates: CandidateExperimentResult[] } +interface RunExecutionFailure { + scenario_id: string + variant_id: string + run_group_id: string + repeat_index: number + stage: 'execute_harness' | 'capture' | 'record_run' | 'compare' + error: string +} + +interface RunGroupArtifact { + run_group_id: string + experiment_id: string + scenario_id: string + variant_id: string + repeat_count: number + run_ids: string[] + status: 'completed' | 'partial' | 'failed' + started_at: string | null + ended_at: string | null + aggregate_summary_ref: string | null + stability_metrics: StabilityMetrics + flaky_status: 'stable' | 'flaky' | 'unstable' | 'inconclusive' + failures: RunExecutionFailure[] +} + +interface StabilityMetrics { + repeat_success_rate: number + capture_failure_rate: number + total_billed_tokens_mean: number | null + total_billed_tokens_min: number | null + total_billed_tokens_max: number | null + total_billed_tokens_stddev: number | null + e2e_duration_mean: number | null + e2e_duration_min: number | null + e2e_duration_max: number | null + e2e_duration_stddev: number | null + 
tool_call_count_variance: number | null + subagent_count_variance: number | null + turn_count_variance: number | null + recovery_rate: number +} + interface GateResult { scenario_id: string candidate_variant_id: string @@ -148,6 +192,7 @@ const bunExe = process.execPath const evalRoot = path.join(repoRoot, 'tests', 'evals', 'v2') const scoresRoot = path.join(evalRoot, 'scores') const runsRoot = path.join(evalRoot, 'runs') +const runGroupsRoot = path.join(evalRoot, 'run-groups') const experimentRunsRoot = path.join(evalRoot, 'experiment-runs') function parseArgs(argv: string[]): Record { @@ -190,6 +235,22 @@ function asStringArray(value: unknown): string[] { return value.filter((item): item is string => typeof item === 'string' && item.length > 0) } +function sanitizeId(value: string): string { + return value.replace(/[^a-zA-Z0-9_-]+/g, '_').replace(/^_+|_+$/g, '') +} + +function createRunGroupId(params: { + experimentId: string + scenarioId: string + variantId: string + stamp: string +}): string { + const base = sanitizeId( + `group_${params.experimentId}_${params.scenarioId}_${params.variantId}_${params.stamp}`, + ) + return base.length > 160 ? 
base.slice(0, 160) : base +} + async function listJsonFiles(dir: string): Promise { const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) return entries @@ -647,6 +708,8 @@ function buildRecordRunArgs(params: { scenarioId: string variantId: string userActionId: string + runGroupId: string + repeatIndex: number scoreSpecIds: string[] dbPath?: string snapshotDb: boolean @@ -658,6 +721,10 @@ function buildRecordRunArgs(params: { params.variantId, '--user-action-id', params.userActionId, + '--run-group-id', + params.runGroupId, + '--repeat-index', + String(params.repeatIndex), ] if (params.snapshotDb) args.push('--snapshot-db') if (params.dbPath) args.push('--db', params.dbPath) @@ -760,15 +827,60 @@ function scoreRefs(results: ScenarioExperimentResult[]): string[] { ]) } -function reportRefs(results: ScenarioExperimentResult[], experimentReport: string): string[] { +function reportRefs(params: { + results: ScenarioExperimentResult[] + experimentReport: string + batchReport: string +}): string[] { return [ - ...results.flatMap(result => + ...params.results.flatMap(result => result.candidates.map(candidate => candidate.compare_report), ), - experimentReport, + params.batchReport, + params.experimentReport, ].filter(Boolean) } +function numberOrNull(value: unknown): number | null { + if (typeof value === 'number' && Number.isFinite(value)) return value + if (typeof value === 'string' && value.trim() !== '') { + const parsed = Number(value) + return Number.isFinite(parsed) ? 
parsed : null + } + return null +} + +function mean(values: number[]): number | null { + if (values.length === 0) return null + return Number((values.reduce((sum, value) => sum + value, 0) / values.length).toFixed(6)) +} + +function variance(values: number[]): number | null { + if (values.length < 2) return 0 + const avg = mean(values) + if (avg === null) return null + return Number( + (values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / values.length).toFixed(6), + ) +} + +function stddev(values: number[]): number | null { + const value = variance(values) + return value === null ? null : Number(Math.sqrt(value).toFixed(6)) +} + +function minValue(values: number[]): number | null { + return values.length === 0 ? null : Math.min(...values) +} + +function maxValue(values: number[]): number | null { + return values.length === 0 ? null : Math.max(...values) +} + +function scoreValue(scores: EvalScore[], scoreSpecId: string): number | null { + return valueFor(scores, scoreSpecId) +} + function hasPolicyEventObserved(variantEffect: JsonRecord | undefined): boolean { return asBoolean(variantEffect?.policy_event_observed) } @@ -1046,9 +1158,291 @@ function aggregateVariantEffectSummary(results: ScenarioExperimentResult[]): Var ) } +function runGroupRefs(runGroups: RunGroupArtifact[]): string[] { + return runGroups.map(group => + path.join('tests', 'evals', 'v2', 'run-groups', `${group.run_group_id}.json`), + ) +} + +async function buildRunGroups(params: { + experimentId: string + baselineVariantId: string + repeatCount: number + results: ScenarioExperimentResult[] + failures: RunExecutionFailure[] + aggregateSummaryRef: string +}): Promise { + const groups = new Map< + string, + { + experiment_id: string + scenario_id: string + variant_id: string + run_ids: string[] + failures: RunExecutionFailure[] + } + >() + + function ensureGroup(runGroupId: string, scenarioId: string, variantId: string) { + if (!groups.has(runGroupId)) { + groups.set(runGroupId, { + 
experiment_id: params.experimentId, + scenario_id: scenarioId, + variant_id: variantId, + run_ids: [], + failures: [], + }) + } + return groups.get(runGroupId)! + } + + for (const result of params.results) { + ensureGroup( + result.baseline_run_group_id, + result.scenario_id, + params.baselineVariantId, + ).run_ids.push(result.baseline_run_id) + for (const candidate of result.candidates) { + ensureGroup( + candidate.candidate_run_group_id, + result.scenario_id, + candidate.candidate_variant_id, + ).run_ids.push(candidate.candidate_run_id) + } + } + + for (const failure of params.failures) { + ensureGroup(failure.run_group_id, failure.scenario_id, failure.variant_id).failures.push(failure) + } + + const artifacts: RunGroupArtifact[] = [] + for (const [runGroupId, group] of groups.entries()) { + const runArtifacts = await Promise.all(group.run_ids.map(runId => readRunArtifact(runId))) + const scoreArtifacts = await Promise.all( + group.run_ids.map(runId => + readJson(path.join(scoresRoot, `${runId}.scores.json`)), + ), + ) + const actions = runArtifacts + .map(artifact => (artifact as JsonRecord).evidence) + .map(evidence => + evidence && typeof evidence === 'object' && !Array.isArray(evidence) + ? (evidence as JsonRecord).action + : undefined, + ) + .filter( + (action): action is JsonRecord => + Boolean(action) && typeof action === 'object' && !Array.isArray(action), + ) + const rootQueries = runArtifacts + .map(artifact => (artifact as JsonRecord).evidence) + .map(evidence => + evidence && typeof evidence === 'object' && !Array.isArray(evidence) + ? 
(evidence as JsonRecord).rootQuery + : undefined, + ) + .filter( + (query): query is JsonRecord => + Boolean(query) && typeof query === 'object' && !Array.isArray(query), + ) + const totalBilledTokens = scoreArtifacts + .map(scores => scoreValue(scores, 'efficiency.total_billed_tokens')) + .filter((value): value is number => value !== null) + const durations = actions + .map(action => numberOrNull(action.duration_ms)) + .filter((value): value is number => value !== null) + const toolCounts = actions + .map(action => numberOrNull(action.tool_call_count)) + .filter((value): value is number => value !== null) + const subagentCounts = actions + .map(action => numberOrNull(action.subagent_count)) + .filter((value): value is number => value !== null) + const turnCounts = rootQueries + .map(query => numberOrNull(query.turn_count)) + .filter((value): value is number => value !== null) + const recoveryFlags = runArtifacts.map(artifact => { + const evidence = (artifact as JsonRecord).evidence + if (!evidence || typeof evidence !== 'object' || Array.isArray(evidence)) return 0 + const recoveries = (evidence as JsonRecord).recoveries + return Array.isArray(recoveries) && recoveries.length > 0 ? 
1 : 0 + }) + const successCount = group.run_ids.length + const expectedCount = params.repeatCount + const failureCount = group.failures.length + const metrics: StabilityMetrics = { + repeat_success_rate: Number((successCount / expectedCount).toFixed(6)), + capture_failure_rate: Number((failureCount / expectedCount).toFixed(6)), + total_billed_tokens_mean: mean(totalBilledTokens), + total_billed_tokens_min: minValue(totalBilledTokens), + total_billed_tokens_max: maxValue(totalBilledTokens), + total_billed_tokens_stddev: stddev(totalBilledTokens), + e2e_duration_mean: mean(durations), + e2e_duration_min: minValue(durations), + e2e_duration_max: maxValue(durations), + e2e_duration_stddev: stddev(durations), + tool_call_count_variance: variance(toolCounts), + subagent_count_variance: variance(subagentCounts), + turn_count_variance: variance(turnCounts), + recovery_rate: + recoveryFlags.length === 0 + ? 0 + : Number( + ( + recoveryFlags.reduce((sum, value) => sum + value, 0) / + recoveryFlags.length + ).toFixed(6), + ), + } + const tokenCv = + metrics.total_billed_tokens_mean && metrics.total_billed_tokens_stddev !== null + ? metrics.total_billed_tokens_stddev / Math.max(metrics.total_billed_tokens_mean, 1) + : 0 + const status: RunGroupArtifact['status'] = + successCount === expectedCount && failureCount === 0 + ? 'completed' + : successCount === 0 + ? 'failed' + : 'partial' + const flakyStatus: RunGroupArtifact['flaky_status'] = + successCount === 0 + ? 'unstable' + : expectedCount < 2 + ? 'inconclusive' + : failureCount > 0 || successCount < expectedCount + ? 'flaky' + : tokenCv > 0.2 || + (metrics.tool_call_count_variance ?? 0) > 1 || + (metrics.subagent_count_variance ?? 0) > 1 || + (metrics.turn_count_variance ?? 0) > 1 + ? 
'flaky' + : 'stable' + + artifacts.push({ + run_group_id: runGroupId, + experiment_id: group.experiment_id, + scenario_id: group.scenario_id, + variant_id: group.variant_id, + repeat_count: expectedCount, + run_ids: group.run_ids, + status, + started_at: actions.map(action => asString(action.started_at)).filter(Boolean).sort()[0] ?? null, + ended_at: + actions + .map(action => asString(action.ended_at)) + .filter(Boolean) + .sort() + .at(-1) ?? null, + aggregate_summary_ref: params.aggregateSummaryRef, + stability_metrics: metrics, + flaky_status: flakyStatus, + failures: group.failures, + }) + } + + return artifacts.sort((a, b) => + `${a.scenario_id}:${a.variant_id}`.localeCompare(`${b.scenario_id}:${b.variant_id}`), + ) +} + +async function writeRunGroups(runGroups: RunGroupArtifact[]): Promise { + await mkdir(runGroupsRoot, { recursive: true }) + for (const group of runGroups) { + await writeFile( + path.join(runGroupsRoot, `${group.run_group_id}.json`), + `${JSON.stringify(group, null, 2)}\n`, + ) + } +} + +function buildBatchReport(params: { + experiment: EvalExperimentV21 + runGroups: RunGroupArtifact[] + failures: RunExecutionFailure[] + outputJson: string +}): string { + const { experiment, runGroups, failures, outputJson } = params + const groupRows = runGroups + .map(group => { + const metrics = group.stability_metrics + return `| ${group.scenario_id} | ${group.variant_id} | ${group.repeat_count} | ${metrics.repeat_success_rate} | ${metrics.total_billed_tokens_mean ?? 'n/a'} | ${metrics.total_billed_tokens_stddev ?? 'n/a'} | ${metrics.e2e_duration_mean ?? 'n/a'} | ${metrics.e2e_duration_stddev ?? 'n/a'} | ${metrics.tool_call_count_variance ?? 'n/a'} | ${metrics.subagent_count_variance ?? 'n/a'} | ${metrics.turn_count_variance ?? 
'n/a'} | ${metrics.recovery_rate} | ${group.flaky_status} |` + }) + .join('\n') + const flakyRows = runGroups + .filter(group => group.flaky_status !== 'stable') + .map(group => `- ${group.scenario_id} / ${group.variant_id}: ${group.flaky_status}`) + .join('\n') + const rankingRows = runGroups + .filter(group => group.variant_id !== experiment.baseline_variant_id) + .sort((a, b) => { + const aMetrics = a.stability_metrics + const bMetrics = b.stability_metrics + if (bMetrics.repeat_success_rate !== aMetrics.repeat_success_rate) { + return bMetrics.repeat_success_rate - aMetrics.repeat_success_rate + } + return ( + (aMetrics.total_billed_tokens_mean ?? Number.POSITIVE_INFINITY) - + (bMetrics.total_billed_tokens_mean ?? Number.POSITIVE_INFINITY) + ) + }) + .map( + (group, index) => + `| ${index + 1} | ${group.variant_id} | ${group.scenario_id} | ${group.stability_metrics.repeat_success_rate} | ${group.stability_metrics.total_billed_tokens_mean ?? 'n/a'} | ${group.flaky_status} |`, + ) + .join('\n') + const failureRows = + failures.length === 0 + ? '- No run failures recorded.' + : failures + .map( + failure => + `- ${failure.scenario_id} / ${failure.variant_id} / repeat ${failure.repeat_index}: ${failure.stage}: ${failure.error}`, + ) + .join('\n') + + return `# V2.3 Batch Experiment Summary: ${experiment.experiment_id} + +## Understanding + +- experiment: ${experiment.experiment_id} +- mode: ${experiment.mode ?? 'bind_existing'} +- scenario_count: ${experiment.scenario_ids?.length ?? 0} +- candidate_count: ${experiment.candidate_variant_ids.length} +- repeat_count: ${experiment.repeat_count ?? 
1} +- output_json: ${outputJson} + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +${groupRows || '| n/a | n/a | 0 | 0 | n/a | n/a | n/a | n/a | n/a | n/a | n/a | 0 | inconclusive |'} + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +${rankingRows || '| n/a | n/a | n/a | n/a | n/a | n/a |'} + +## Flaky Scenario Notes + +${flakyRows || '- No flaky run group detected by the current V2.3 heuristic.'} + +## Run Failures + +${failureRows} + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. +` +} + function buildMarkdownReport(params: { experiment: EvalExperimentV21 results: ScenarioExperimentResult[] + runGroups: RunGroupArtifact[] + failures: RunExecutionFailure[] + batchReport: string outputJson: string riskVerdict: RiskVerdict experimentValidity: ExperimentValidity @@ -1060,6 +1454,9 @@ function buildMarkdownReport(params: { const { experiment, results, + runGroups, + failures, + batchReport, outputJson, riskVerdict, experimentValidity, @@ -1089,6 +1486,23 @@ function buildMarkdownReport(params: { ) .join('\n') + const runGroupRows = runGroups + .map(group => { + const metrics = group.stability_metrics + return `| ${group.scenario_id} | ${group.variant_id} | ${group.repeat_count} | ${metrics.repeat_success_rate} | ${metrics.total_billed_tokens_mean ?? 'n/a'} | ${metrics.total_billed_tokens_stddev ?? 'n/a'} | ${group.flaky_status} |` + }) + .join('\n') + + const failureRows = + failures.length === 0 + ? 
'- No run failures recorded.' + : failures + .map( + failure => + `- ${failure.scenario_id} / ${failure.variant_id} / repeat ${failure.repeat_index}: ${failure.stage}: ${failure.error}`, + ) + .join('\n') + const gateRows = allGateResults.length === 0 ? '| n/a | n/a | n/a | n/a | n/a | n/a |\n' @@ -1225,6 +1639,20 @@ ${validityNotes || '- No additional blockers or warnings.'} ${runtimeDifferenceRows} +## V2.3 Batch Robustness + +- batch_report: ${batchReport || 'not generated'} +- run_group_count: ${runGroups.length} +- run_failure_count: ${failures.length} + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | --- | +${runGroupRows || '| n/a | n/a | 0 | 0 | n/a | n/a | inconclusive |'} + +### Run Failures + +${failureRows} + ## Scorecard Summary | scenario | candidate_variant | score | baseline | candidate | delta | interpretation | @@ -1287,8 +1715,13 @@ async function main(): Promise { const scoreSpecs = await loadScoreSpecs() const gatePolicy = await loadGatePolicy(experiment.gate_policy_id) - const dbPath = typeof args.db === 'string' ? args.db : undefined + const configuredDbPath = + typeof experiment.execution?.db_path === 'string' && experiment.execution.db_path.trim() + ? path.resolve(repoRoot, experiment.execution.db_path) + : undefined + const dbPath = typeof args.db === 'string' ? args.db : configuredDbPath const snapshotDb = !Boolean(args['no-snapshot-db']) + const failurePolicy = experiment.execution?.failure_policy ?? 'fail_fast' for (const scoreSpecId of experiment.score_spec_ids ?? []) { if (!scoreSpecs.has(scoreSpecId)) { throw new Error(`Experiment references missing score_spec_id: ${scoreSpecId}`) @@ -1308,19 +1741,9 @@ async function main(): Promise { } const repeatCount = Math.max(experiment.repeat_count ?? 
1, 1) - if (mode === 'execute_harness') { - if (scenarioIds.length !== 1) { - throw new Error('V2.2 execute_harness supports exactly one scenario.') - } - if (experiment.candidate_variant_ids.length !== 1) { - throw new Error('V2.2 execute_harness supports exactly one candidate variant.') - } - if (repeatCount !== 1) { - throw new Error('V2.2 execute_harness supports repeat_count=1 only.') - } - } const results: ScenarioExperimentResult[] = [] + const failures: RunExecutionFailure[] = [] if (mode === 'bind_existing') { for (const scenarioId of scenarioIds) { for (const variantId of [ @@ -1345,6 +1768,12 @@ async function main(): Promise { for (const scenarioId of scenarioIds) { const scenario = mode === 'execute_harness' ? await loadScenario(scenarioId) : undefined + const baselineRunGroupId = createRunGroupId({ + experimentId: experiment.experiment_id, + scenarioId, + variantId: experiment.baseline_variant_id, + stamp: executionStamp, + }) for (let repeatIndex = 1; repeatIndex <= repeatCount; repeatIndex += 1) { let baselineUserActionId = findBoundUserActionId({ @@ -1355,6 +1784,11 @@ async function main(): Promise { let baselineExecution: ExecuteHarnessResult | undefined let baselineEvalRunId: string | undefined let baselineBenchmarkRunId: string | undefined + let baselineRunId = '' + let baselineScores: EvalScore[] = [] + let baselineRunArtifact: RunArtifact | undefined + + try { if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) @@ -1364,6 +1798,7 @@ async function main(): Promise { scenarioId, variantId: experiment.baseline_variant_id, stamp: executionStamp, + repeatIndex, }) baselineEvalRunId = identity.eval_run_id baselineBenchmarkRunId = identity.benchmark_run_id @@ -1394,19 +1829,40 @@ async function main(): Promise { scenarioId, variantId: experiment.baseline_variant_id, userActionId: baselineUserActionId, + runGroupId: baselineRunGroupId, + repeatIndex, scoreSpecIds: experiment.score_spec_ids ?? 
[], dbPath, snapshotDb, }), ) - const baselineRunId = extractCreatedRunId(baselineOutput) - const baselineScores = await readJson( + baselineRunId = extractCreatedRunId(baselineOutput) + baselineScores = await readJson( path.join(scoresRoot, `${baselineRunId}.scores.json`), ) - const baselineRunArtifact = await readRunArtifact(baselineRunId) + baselineRunArtifact = await readRunArtifact(baselineRunId) + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + if (failurePolicy === 'fail_fast') throw error + failures.push({ + scenario_id: scenarioId, + variant_id: experiment.baseline_variant_id, + run_group_id: baselineRunGroupId, + repeat_index: repeatIndex, + stage: message.includes('capture') ? 'capture' : mode === 'execute_harness' ? 'execute_harness' : 'record_run', + error: message, + }) + continue + } const candidates: CandidateExperimentResult[] = [] for (const candidateVariantId of experiment.candidate_variant_ids) { + const candidateRunGroupId = createRunGroupId({ + experimentId: experiment.experiment_id, + scenarioId, + variantId: candidateVariantId, + stamp: executionStamp, + }) let candidateActionId = findBoundUserActionId({ experiment, scenarioId, @@ -1416,6 +1872,8 @@ async function main(): Promise { let candidateEvalRunId: string | undefined let candidateBenchmarkRunId: string | undefined + try { + if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) const candidateVariant = await loadVariant(candidateVariantId) @@ -1424,6 +1882,7 @@ async function main(): Promise { scenarioId, variantId: candidateVariantId, stamp: executionStamp, + repeatIndex, }) candidateEvalRunId = identity.eval_run_id candidateBenchmarkRunId = identity.benchmark_run_id @@ -1454,6 +1913,8 @@ async function main(): Promise { scenarioId, variantId: candidateVariantId, userActionId: candidateActionId, + runGroupId: candidateRunGroupId, + repeatIndex, scoreSpecIds: experiment.score_spec_ids ?? 
[], dbPath, snapshotDb, @@ -1490,7 +1951,7 @@ async function main(): Promise { const variantEffect = runtimeDifferenceAnalysis({ scenarioId, candidateVariantId, - baselineVariantEffect: baselineRunArtifact.variant_effect, + baselineVariantEffect: baselineRunArtifact?.variant_effect, candidateVariantEffect: candidateRunArtifact.variant_effect, scorecard, }) @@ -1506,12 +1967,13 @@ async function main(): Promise { candidates.push({ candidate_variant_id: candidateVariantId, + candidate_run_group_id: candidateRunGroupId, candidate_run_id: candidateRunId, candidate_user_action_id: candidateActionId, candidate_eval_run_id: candidateEvalRunId, candidate_benchmark_run_id: candidateBenchmarkRunId, candidate_execution: candidateExecution, - baseline_variant_effect: baselineRunArtifact.variant_effect, + baseline_variant_effect: baselineRunArtifact?.variant_effect, candidate_variant_effect: candidateRunArtifact.variant_effect, variant_effect_summary: variantEffect, experiment_validity: experimentValidityForCandidate, @@ -1530,11 +1992,25 @@ async function main(): Promise { experimentValidity: experimentValidityForCandidate, }), }) + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + if (failurePolicy === 'fail_fast') throw error + failures.push({ + scenario_id: scenarioId, + variant_id: candidateVariantId, + run_group_id: candidateRunGroupId, + repeat_index: repeatIndex, + stage: message.includes('compare') ? 'compare' : message.includes('capture') ? 'capture' : mode === 'execute_harness' ? 
'execute_harness' : 'record_run', + error: message, + }) + continue + } } results.push({ scenario_id: scenarioId, repeat_index: repeatIndex, + baseline_run_group_id: baselineRunGroupId, baseline_run_id: baselineRunId, baseline_user_action_id: baselineUserActionId, baseline_eval_run_id: baselineEvalRunId, @@ -1559,6 +2035,11 @@ async function main(): Promise { `experiment_${experiment.experiment_id}_${runStamp}.md`, ) const outputMarkdownRel = path.relative(repoRoot, outputMarkdownPath) + const batchMarkdownPath = path.join( + reportRoot, + `batch_experiment_${experiment.experiment_id}_${runStamp}.md`, + ) + const batchMarkdownRel = path.relative(repoRoot, batchMarkdownPath) const generatedAt = new Date().toISOString() const riskVerdict = summarizeRisk(results) const scorecardSummary = aggregateScorecard(results) @@ -1566,6 +2047,15 @@ async function main(): Promise { const recommendedReviewMode = aggregateReviewMode(results) const variantEffectSummary = aggregateVariantEffectSummary(results) const experimentValidity = aggregateExperimentValidity(results) + const runGroups = await buildRunGroups({ + experimentId: experiment.experiment_id, + baselineVariantId: experiment.baseline_variant_id, + repeatCount, + results, + failures, + aggregateSummaryRef: batchMarkdownRel, + }) + await writeRunGroups(runGroups) const warningMessages = results .flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results)) @@ -1589,6 +2079,12 @@ async function main(): Promise { `hard_fail: scenario=${result.scenario_id}, candidate=${result.candidate_variant_id}, score=${result.score_spec_id}`, ) errorMessages.push(...experimentValidity.blockers) + errorMessages.push( + ...failures.map( + failure => + `${failure.stage}: scenario=${failure.scenario_id}, variant=${failure.variant_id}, repeat=${failure.repeat_index}: ${failure.error}`, + ), + ) await writeFile( outputJsonPath, @@ -1603,8 +2099,13 @@ async function main(): Promise { report_profile: experiment.report_profile 
?? 'smoke', evaluation_intent: experiment.evaluation_intent ?? null, run_refs: runRefs(results), + run_group_refs: runGroupRefs(runGroups), score_refs: scoreRefs(results), - report_refs: reportRefs(results, outputMarkdownRel), + report_refs: reportRefs({ + results, + experimentReport: outputMarkdownRel, + batchReport: batchMarkdownRel, + }), risk_verdict: riskVerdict, gate_verdict: riskVerdict, experiment_validity: experimentValidity, @@ -1614,6 +2115,14 @@ async function main(): Promise { 'risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.', scorecard_summary: scorecardSummary, exploration_signals: explorationSignals, + stability_summary: runGroups, + flaky_scenarios: runGroups + .filter(group => group.flaky_status !== 'stable') + .map(group => ({ + scenario_id: group.scenario_id, + variant_id: group.variant_id, + flaky_status: group.flaky_status, + })), recommended_review_mode: recommendedReviewMode, final_decision: null, errors: errorMessages, @@ -1627,18 +2136,17 @@ async function main(): Promise { requestedMode === 'execute_harness' && mode === 'bind_existing' ? 'execute_harness disabled by flag or environment; bind_existing fallback used' : null, - execute_harness_alpha_limits: - mode === 'execute_harness' - ? { - scenario_count: 1, - candidate_count: 1, - repeat_count: 1, - } - : null, + v2_3_batch_capabilities: { + multi_scenario: scenarioIds.length > 1, + multi_candidate: experiment.candidate_variant_ids.length > 1, + repeat_count: repeatCount, + failure_policy: failurePolicy, + }, score_spec_ids: experiment.score_spec_ids ?? [], gate_policy_id: experiment.gate_policy_id ?? 
null, }, results, + run_failures: failures, created_at: generatedAt, }, null, @@ -1646,11 +2154,24 @@ async function main(): Promise { )}\n`, ) + await writeFile( + batchMarkdownPath, + buildBatchReport({ + experiment, + runGroups, + failures, + outputJson: outputJsonRel, + }), + ) + await writeFile( outputMarkdownPath, buildMarkdownReport({ experiment, results, + runGroups, + failures, + batchReport: batchMarkdownRel, outputJson: outputJsonRel, riskVerdict, experimentValidity, @@ -1662,6 +2183,7 @@ async function main(): Promise { ) console.log(`Created V2 experiment summary: ${outputJsonRel}`) + console.log(`Created V2 batch summary: ${batchMarkdownRel}`) console.log(`Created V2 experiment report: ${outputMarkdownRel}`) } diff --git a/scripts/evals/v2_validate_experiment_artifacts.ts b/scripts/evals/v2_validate_experiment_artifacts.ts index 40d0c8fc46..68622abe6f 100644 --- a/scripts/evals/v2_validate_experiment_artifacts.ts +++ b/scripts/evals/v2_validate_experiment_artifacts.ts @@ -60,6 +60,18 @@ function validateArtifact(filePath: string, artifact: JsonRecord): string[] { requireArray(errors, filePath, 'report_refs', artifact.report_refs) requireArray(errors, filePath, 'errors', artifact.errors) requireArray(errors, filePath, 'warnings', artifact.warnings) + if (artifact.run_group_refs !== undefined) { + requireArray(errors, filePath, 'run_group_refs', artifact.run_group_refs) + } + if (artifact.stability_summary !== undefined) { + requireArray(errors, filePath, 'stability_summary', artifact.stability_summary) + } + if (artifact.flaky_scenarios !== undefined) { + requireArray(errors, filePath, 'flaky_scenarios', artifact.flaky_scenarios) + } + if (artifact.run_failures !== undefined) { + requireArray(errors, filePath, 'run_failures', artifact.run_failures) + } if ( artifact.report_profile !== undefined && !reportProfiles.has(String(artifact.report_profile)) diff --git a/scripts/evals/v2_validate_manifests.ts b/scripts/evals/v2_validate_manifests.ts index 
1ae50680f6..9b58c1d89b 100644 --- a/scripts/evals/v2_validate_manifests.ts +++ b/scripts/evals/v2_validate_manifests.ts @@ -43,6 +43,8 @@ const automationLevels = new Set(['automatic', 'manual_review', 'mixed']) const experimentModes = new Set(['bind_existing', 'execute_harness']) const reportProfiles = new Set(['smoke', 'real_experiment']) const evaluationIntents = new Set(['regression', 'exploration']) +const failurePolicies = new Set(['fail_fast', 'continue_on_failure']) +const executionAdapters = new Set(['cli_print', 'fixture_trace', 'disabled']) interface ValidationContext { scenarioIds: Set @@ -256,6 +258,26 @@ function validateExperiment( `${filePath}.evaluation_intent has invalid value: ${experiment.evaluation_intent}`, ) } + if ( + experiment.execution?.failure_policy !== undefined && + !failurePolicies.has(experiment.execution.failure_policy) + ) { + errors.push( + `${filePath}.execution.failure_policy has invalid value: ${experiment.execution.failure_policy}`, + ) + } + if ( + experiment.execution?.db_path !== undefined && + typeof experiment.execution.db_path !== 'string' + ) { + errors.push(`${filePath}.execution.db_path must be a string when present`) + } + if ( + experiment.execution?.adapter !== undefined && + !executionAdapters.has(experiment.execution.adapter) + ) { + errors.push(`${filePath}.execution.adapter has invalid value: ${experiment.execution.adapter}`) + } if (experiment.action_bindings !== undefined) { requireArray(errors, filePath, 'action_bindings', experiment.action_bindings) for (const [index, binding] of experiment.action_bindings.entries()) { diff --git a/scripts/evals/v2_verify_bind_runner.ts b/scripts/evals/v2_verify_bind_runner.ts index 9d310ac430..b6d6063640 100644 --- a/scripts/evals/v2_verify_bind_runner.ts +++ b/scripts/evals/v2_verify_bind_runner.ts @@ -272,6 +272,32 @@ async function createMissingRootDb(): Promise { return dbPath } +async function createBindExistingDb(): Promise { + const dbPath = path.join(tempRoot, 
'bind-existing.duckdb') + const startedAt = '2026-05-01T00:00:00.000Z' + const sql = [ + 'CREATE TABLE user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', + 'CREATE TABLE queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR, turn_count BIGINT, terminal_reason VARCHAR);', + 'CREATE TABLE tools(user_action_id VARCHAR, tool_name VARCHAR, is_closed BOOLEAN, has_failed BOOLEAN);', + 'CREATE TABLE subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', + 'CREATE TABLE recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', + 'CREATE TABLE metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', + `INSERT INTO user_actions VALUES ('2026-05-01', '${baselineActionId}', '${startedAt}', 0, '2026-05-01T00:00:01.000Z', 1000, 1000, 2, 1, 1, 0, 0, 0, 100, 10, 0, 0, 100, 110, 100, 0);`, + `INSERT INTO user_actions VALUES ('2026-05-01', '${candidateActionId}', '${startedAt}', 0, '2026-05-01T00:00:01.000Z', 1000, 1000, 2, 1, 1, 0, 0, 0, 90, 10, 0, 0, 90, 100, 90, 0);`, + `INSERT INTO queries VALUES ('q-baseline', '${baselineActionId}', 'main_thread', '${startedAt}', 1, 'fixture_completed');`, + `INSERT INTO queries VALUES ('q-candidate', '${candidateActionId}', 'main_thread', '${startedAt}', 1, 
'fixture_completed');`, + "INSERT INTO metrics_integrity_daily VALUES ('2026-05-01', 1, 1, 1, 1);", + ].join(' ') + const result = spawnSync(duckdbExe, [dbPath, sql], { + cwd: repoRoot, + encoding: 'utf8', + }) + if (result.status !== 0) { + throw new Error(String(result.stderr || result.stdout || result.error?.message)) + } + return dbPath +} + async function runCase(testCase: VerifyCase): Promise { const manifestPath = path.join(manifestsRoot, `${testCase.case_id}.json`) await writeJson(manifestPath, testCase.manifest) @@ -332,6 +358,7 @@ async function runCase(testCase: VerifyCase): Promise { async function main(): Promise { await mkdir(manifestsRoot, { recursive: true }) await mkdir(reportsRoot, { recursive: true }) + const bindExistingDb = await createBindExistingDb() const missingRootDb = await createMissingRootDb() const cases: VerifyCase[] = [ @@ -339,6 +366,8 @@ async function main(): Promise { case_id: 'single_scenario_single_candidate', description: 'Single scenario plus one candidate should complete.', expect: 'success', + db_path: bindExistingDb, + no_snapshot_db: true, manifest: experiment({ id: `v2_1_verify_single_candidate_${stamp}`, scenarioIds: ['cost_sensitive_task'], @@ -353,6 +382,8 @@ async function main(): Promise { case_id: 'single_scenario_multi_candidate', description: 'Single scenario plus multiple candidates should complete.', expect: 'success', + db_path: bindExistingDb, + no_snapshot_db: true, manifest: experiment({ id: `v2_1_verify_multi_candidate_${stamp}`, scenarioIds: ['cost_sensitive_task'], @@ -373,6 +404,8 @@ async function main(): Promise { case_id: 'multi_scenario_single_candidate', description: 'Multiple scenarios plus one candidate should complete.', expect: 'success', + db_path: bindExistingDb, + no_snapshot_db: true, manifest: experiment({ id: `v2_1_verify_multi_scenario_${stamp}`, scenarioIds: ['cost_sensitive_task', 'tool_choice_sensitive'], @@ -406,6 +439,8 @@ async function main(): Promise { description: 'Nonexistent 
V1 user_action_id should fail.', expect: 'failure', expected_error: 'user_action_id not found', + db_path: bindExistingDb, + no_snapshot_db: true, manifest: experiment({ id: `v2_1_verify_missing_action_${stamp}`, scenarioIds: ['cost_sensitive_task'], @@ -472,6 +507,8 @@ async function main(): Promise { case_id: 'execute_harness_disabled_fallback', description: 'execute_harness can be disabled and falls back to bind_existing when action bindings are present.', expect: 'success', + db_path: bindExistingDb, + no_snapshot_db: true, extra_args: ['--disable-execute-harness'], manifest: experiment({ id: `v2_1_verify_execute_harness_${stamp}`, diff --git a/src/observability/v2/evalExperimentTypes.ts b/src/observability/v2/evalExperimentTypes.ts index 24f4651bdf..1cc29f37f0 100644 --- a/src/observability/v2/evalExperimentTypes.ts +++ b/src/observability/v2/evalExperimentTypes.ts @@ -66,11 +66,13 @@ export type EvalExperimentActionBinding = | EvalExperimentNestedActionBinding export interface EvalExperimentExecutionConfig { - adapter?: 'cli_print' | 'disabled' + adapter?: 'cli_print' | 'fixture_trace' | 'disabled' timeout_ms?: number max_turns?: number + failure_policy?: 'fail_fast' | 'continue_on_failure' allow_fallback_to_bind_existing?: boolean require_config_snapshot?: boolean + db_path?: string env?: Record command?: string args?: string[] diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index 56c6bfa867..c9a75537f3 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -70,6 +70,8 @@ export interface EvalRun { run_id: string scenario_id: string variant_id: string + run_group_id?: string + repeat_index?: number started_at: string ended_at?: string status: EvalRunStatus diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index dd002c11ed..a94441f817 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -2,6 +2,16 @@ This directory stores the local-first V2 evaluation 
system. +## Recommended Overview + +If you want the project-level explanation first, start here: + +```text +ObservrityTask/10-系统版本/v2/01-总览/V2.2.5版本项目介绍与阅读指南.md +``` + +Use this README after that when you want the concrete execution entrypoints and folder-level technical view. + ## Structure - `scenarios/`: scenario manifests. @@ -12,14 +22,15 @@ This directory stores the local-first V2 evaluation system. - `runs/`: generated run records bound to V1 evidence. - `scores/`: generated score artifacts. - `experiment-runs/`: experiment-level JSON summaries. +- `run-groups/`: V2.3 repeat aggregation artifacts. - `verification-reports/`: runner verification reports. ## Modes - `bind_existing`: V2.1 stable mode. You provide existing V1 `user_action_id` values through `action_bindings`. -- `execute_harness`: V2.2 mode. The runner executes one scenario through the headless harness, injects eval context into V1 events, captures the generated `user_action_id` by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. +- `execute_harness`: V2.2+ mode. The runner executes scenarios through the headless harness, injects eval context into V1 events, captures generated `user_action_id` values by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. -Current V2.2-beta deliberately supports only 1 scenario, 1 baseline, 1 candidate, and `repeat_count=1`. +V2.3 adds batch robustness support on top of V2.2.5: multi-scenario, multi-candidate, `repeat_count > 1`, run groups, stability summaries, and flaky status. 
## Basic Commands @@ -90,10 +101,17 @@ Run the V2.2.5 manual `bind_existing` fallback experiment: bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json ``` +Run the V2.3 no-cost robustness smoke: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.robustness.smoke.json +``` + Interpretation: - `smoke`: validates automatic execution, automatic capture, and automatic artifact generation. - `real_experiment`: asks whether the candidate changed runtime behavior in an observable and interpretable way. +- `run_group`: groups repeats for one `scenario_id + variant_id` and reports success rate, token/duration variance, recovery rate, and flaky status. ## bind_existing Binding Shape @@ -139,6 +157,8 @@ If capture returns zero matches, the run fails as `capture_failed`. If it return tests/evals/v2/V2.1-bind_existing-usage.md tests/evals/v2/V2.2-execute_harness-alpha-usage.md tests/evals/v2/V2.2.5-real-experiment-closure.md +tests/evals/v2/V2.3-batch-robustness-usage.md +tests/evals/v2/run-groups/ tests/evals/v2/experiment-runs/README.md ``` @@ -161,3 +181,9 @@ List recorded runs: ```powershell bun run scripts/evals/v2_list_runs.ts --scenario tool_choice_sensitive ``` + +## V2.3 Project Overview + +```text +ObservrityTask/10-系统版本/v2/01-总览/V2.3版本项目介绍与阅读指南.md +``` diff --git a/tests/evals/v2/V2.2-execute_harness-alpha-usage.md b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md index 9bcac5de73..3043202e59 100644 --- a/tests/evals/v2/V2.2-execute_harness-alpha-usage.md +++ b/tests/evals/v2/V2.2-execute_harness-alpha-usage.md @@ -6,7 +6,7 @@ - V2.2-alpha 新增的是“前半段自动化”:由 runner 自动执行 scenario,并自动找到这次执行生成的 V1 action。 - 正式绑定不允许用“最新 user_action_id”,因为并发、后台任务或手动调试都可能生成更新的 action。 - 正式绑定使用 `benchmark_run_id -> user_action_id`,只有唯一命中时才进入 score/report。 -- 当前 beta 仍只支持 1 scenario / 1 baseline / 1 candidate / repeat=1。 +- V2.3 已在该链路上增加 
batch robustness:multi scenario、multi candidate、repeat_count > 1、run_group 和 stability summary。 - 自动化可以一键关闭,关闭后回退到 V2.1 `bind_existing`。 ## 预期效果 diff --git a/tests/evals/v2/V2.3-batch-robustness-usage.md b/tests/evals/v2/V2.3-batch-robustness-usage.md new file mode 100644 index 0000000000..9682221d34 --- /dev/null +++ b/tests/evals/v2/V2.3-batch-robustness-usage.md @@ -0,0 +1,54 @@ +# V2.3 Batch Robustness Usage + +V2.3 extends the V2.2.5 real-experiment runner from one scenario and one candidate into batch evaluation. + +## Scope + +V2.3 supports: + +- multiple `scenario_ids` +- multiple `candidate_variant_ids` +- `repeat_count > 1` +- one `run_group` for each `scenario_id + variant_id` +- stability metrics for each run group +- flaky status for each run group +- a batch experiment summary report + +V2.3 does not introduce long-context evaluation, tool/skill value scoring, remote scheduling, or a new V1 schema. + +## Smoke Verification + +Run the no-cost fixture smoke: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.robustness.smoke.json +``` + +This manifest runs two scenarios, two candidates, and two repeats through `execute_harness` using the `fixture_trace` adapter. It verifies runner behavior without calling the model. + +## Outputs + +The runner now emits these additional V2.3 artifacts: + +- `tests/evals/v2/run-groups/*.json` +- `stability_summary` in `tests/evals/v2/experiment-runs/*.json` +- `flaky_scenarios` in `tests/evals/v2/experiment-runs/*.json` +- `batch_experiment_{experiment_id}_{run_stamp}.md` in the V2 report directory + +## Reading Order + +1. Open the latest experiment summary JSON. +2. Check `run_group_refs` and `stability_summary`. +3. Open the batch markdown report. +4. Inspect individual run JSON files only when a run group is flaky or unstable. + +## Flaky Status + +The first V2.3 heuristic is intentionally simple: + +- `stable`: all repeats completed and coarse variance is low. 
+- `flaky`: at least one repeat failed or coarse token/tool/subagent/turn variance is high. +- `unstable`: no successful repeat exists for the group. +- `inconclusive`: repeat count is too low to make a stability judgment. + +This is an engineering signal, not a final quality verdict. diff --git a/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-02T183608080Z.json b/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-02T183608080Z.json new file mode 100644 index 0000000000..0fea30c922 --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-02T183608080Z.json @@ -0,0 +1,2820 @@ +{ + "experiment_id": "v2_3_robustness_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.robustness.smoke.json", + "generated_at": "2026-05-02T18:36:08.082Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "smoke", + "evaluation_intent": "regression", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.json", + 
"tests\\evals\\v2\\runs\\run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.json", + "tests\\evals\\v2\\runs\\run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.json" + ], + "run_group_refs": [ + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.scores.json", + 
"tests\\evals\\v2\\scores\\run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md", + 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check remains healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "variant_effect_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime 
difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", 
+ "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": 
"candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "stability_summary": [ + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:54.924Z", + "ended_at": "2026-05-02T18:35:58.316Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + 
"run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:57.164Z", + "ended_at": "2026-05-02T18:36:00.406Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:56.001Z", + "ended_at": "2026-05-02T18:35:59.300Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + 
"e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d" + ], + "status": "completed", + "started_at": "2026-05-02T18:36:01.515Z", + "ended_at": "2026-05-02T18:36:04.820Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5" + ], + "status": "completed", + "started_at": 
"2026-05-02T18:36:03.663Z", + "ended_at": "2026-05-02T18:36:06.959Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c" + ], + "status": "completed", + "started_at": "2026-05-02T18:36:02.529Z", + "ended_at": "2026-05-02T18:36:05.831Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + } + ], + "flaky_scenarios": [], + 
"recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "v2_3_robustness_smoke", + "name": "V2.3 Robustness Smoke", + "goal": "Verify V2.3 batch runner support for multi-scenario, multi-candidate, repeat_count > 1, run_group aggregation, stability summary, and flaky detection without model/API spend.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse", + "candidate_eval_fixture_shadow" + ], + "scenario_set_id": "v2_3_robustness_smoke", + "scenario_ids": [ + "execute_harness_smoke_minimal", + "robustness_smoke_minimal_alt" + ], + "repeat_count": 2, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "smoke", + "evaluation_intent": "regression", + "execution": { + "adapter": "fixture_trace", + "db_path": ".observability/v2-robustness-smoke.duckdb", + "timeout_ms": 30000, + "failure_policy": "continue_on_failure", + "env": { + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb" + } + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": true, + "multi_candidate": true, + "repeat_count": 2, + "failure_policy": "continue_on_failure" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "repeat_index": 1, + 
"baseline_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "baseline_run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "baseline_user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\dcff71c38706e280\\stdout.txt", + "stderrRef": ".observability\\v2h\\dcff71c38706e280\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + 
"model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "candidate_user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\c771f2835f7f76ea\\stdout.txt", + "stderrRef": ".observability\\v2h\\c771f2835f7f76ea\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", 
+ "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "candidate_user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\c5a4e79f1541c163\\stdout.txt", + "stderrRef": ".observability\\v2h\\c5a4e79f1541c163\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "match_count": 1 + }, + "variant_apply": { + "env": { + 
"CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_vs_run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "baseline_run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "baseline_user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\62fe28efab69e4fa\\stdout.txt", + "stderrRef": ".observability\\v2h\\62fe28efab69e4fa\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + 
"CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "candidate_user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\999d114effb31f92\\stdout.txt", + "stderrRef": ".observability\\v2h\\999d114effb31f92\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + 
"CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + 
"scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "candidate_user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\7c664774694e12e5\\stdout.txt", + "stderrRef": ".observability\\v2h\\7c664774694e12e5\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": 
"bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_vs_run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "baseline_run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "baseline_user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\fac76318977a27a1\\stdout.txt", + "stderrRef": ".observability\\v2h\\fac76318977a27a1\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + 
"CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "candidate_user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "candidate_execution": { + "execution": { + "status": 
"completed", + "stdoutRef": ".observability\\v2h\\fac085e228015b97\\stdout.txt", + "stderrRef": ".observability\\v2h\\fac085e228015b97\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": 
"No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "candidate_user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\d91b2b96fcd45f03\\stdout.txt", + "stderrRef": ".observability\\v2h\\d91b2b96fcd45f03\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": 
"bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_vs_run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "baseline_run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "baseline_user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\75151876c547a3e6\\stdout.txt", + "stderrRef": ".observability\\v2h\\75151876c547a3e6\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + 
"CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "candidate_user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "candidate_execution": { + "execution": { + "status": 
"completed", + "stdoutRef": ".observability\\v2h\\8141cfeaa6083c63\\stdout.txt", + "stderrRef": ".observability\\v2h\\8141cfeaa6083c63\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": 
"No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "candidate_run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "candidate_user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\2311e1d8d3d70963\\stdout.txt", + "stderrRef": ".observability\\v2h\\2311e1d8d3d70963\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": 
"bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." + }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_vs_run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-02T18:36:08.082Z" +} diff --git a/tests/evals/v2/experiments/_experiment.robustness.smoke.json b/tests/evals/v2/experiments/_experiment.robustness.smoke.json new file mode 100644 index 0000000000..46c09f05f1 --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.robustness.smoke.json @@ -0,0 +1,37 @@ +{ + "experiment_id": "v2_3_robustness_smoke", + "name": "V2.3 Robustness Smoke", + "goal": "Verify V2.3 batch runner support for multi-scenario, multi-candidate, repeat_count > 1, run_group aggregation, stability summary, and flaky detection without model/API spend.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse", + "candidate_eval_fixture_shadow" + ], + "scenario_set_id": "v2_3_robustness_smoke", + "scenario_ids": [ + "execute_harness_smoke_minimal", + "robustness_smoke_minimal_alt" + ], + "repeat_count": 2, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + 
"decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "smoke", + "evaluation_intent": "regression", + "execution": { + "adapter": "fixture_trace", + "db_path": ".observability/v2-robustness-smoke.duckdb", + "timeout_ms": 30000, + "failure_policy": "continue_on_failure", + "env": { + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb" + } + }, + "status": "ready" +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..11296d7c91 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:54.924Z", + "ended_at": "2026-05-02T18:35:58.316Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 
10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..1ad6dd4cf5 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:57.164Z", + "ended_at": "2026-05-02T18:36:00.406Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": 
[] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..898fa85f25 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae" + ], + "status": "completed", + "started_at": "2026-05-02T18:35:56.001Z", + "ended_at": "2026-05-02T18:35:59.300Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z.json 
b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..f63ec70ed9 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d" + ], + "status": "completed", + "started_at": "2026-05-02T18:36:01.515Z", + "ended_at": "2026-05-02T18:36:04.820Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..1883b2d9ec --- /dev/null +++ 
b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5" + ], + "status": "completed", + "started_at": "2026-05-02T18:36:03.663Z", + "ended_at": "2026-05-02T18:36:06.959Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z.json new file mode 100644 index 0000000000..fa0ef2f09b --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": 
"group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c" + ], + "status": "completed", + "started_at": "2026-05-02T18:36:02.529Z", + "ended_at": "2026-05-02T18:36:05.831Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-02T183608080Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.json b/tests/evals/v2/runs/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.json new file mode 100644 index 0000000000..acddbe1532 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.json @@ -0,0 +1,117 @@ +{ + "run": { + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "repeat_index": 1, + 
"started_at": "2026-05-02T18:35:54.924Z", + "ended_at": "2026-05-02T18:35:54.934Z", + "status": "completed", + "entry_user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "root_query_id": "eb99485a-4783-45c5-b3b5-0a95ce68ccd4", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "root_query_id": "eb99485a-4783-45c5-b3b5-0a95ce68ccd4", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "root_query_id": "eb99485a-4783-45c5-b3b5-0a95ce68ccd4", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "started_at": "2026-05-02T18:35:54.924Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:35:54.934Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_580abf736489", + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 100, + "total_billed_tokens": 110, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "eb99485a-4783-45c5-b3b5-0a95ce68ccd4", + "user_action_id": "604a7b67-9437-43a4-aeee-45e84f75fef1", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:35:54.924Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.json b/tests/evals/v2/runs/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.json new file mode 100644 index 0000000000..73399e78d4 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.json @@ -0,0 +1,118 @@ +{ + "run": { + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "repeat_index": 1, + "started_at": "2026-05-02T18:35:56.001Z", + "ended_at": "2026-05-02T18:35:56.011Z", + "status": "completed", + "entry_user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "root_query_id": "3906aa11-8018-49c5-ac3a-b916513e1236", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "root_query_id": "3906aa11-8018-49c5-ac3a-b916513e1236", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "root_query_id": "3906aa11-8018-49c5-ac3a-b916513e1236", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 
execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "started_at": "2026-05-02T18:35:56.001Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:35:56.011Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_84dbeba3a127", + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 90, + "total_billed_tokens": 100, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "3906aa11-8018-49c5-ac3a-b916513e1236", + "user_action_id": "9c051f26-951b-4525-98e1-36e769791384", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:35:56.001Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.json b/tests/evals/v2/runs/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.json new file mode 100644 index 0000000000..818b1ba101 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.json @@ -0,0 +1,120 @@ +{ + "run": { + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "repeat_index": 1, + "started_at": "2026-05-02T18:35:57.164Z", + "ended_at": "2026-05-02T18:35:57.174Z", + "status": "completed", + "entry_user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "root_query_id": "bd334a3c-e2ef-405e-8de7-ab0771e889bd", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "root_query_id": "bd334a3c-e2ef-405e-8de7-ab0771e889bd", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "root_query_id": "bd334a3c-e2ef-405e-8de7-ab0771e889bd", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 
execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "started_at": "2026-05-02T18:35:57.164Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:35:57.174Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_c45a9e254447", + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 95, + "total_billed_tokens": 105, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "bd334a3c-e2ef-405e-8de7-ab0771e889bd", + "user_action_id": "f8573444-aa1c-4c0f-980b-81d8d1e5ddcb", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:35:57.164Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.json b/tests/evals/v2/runs/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.json new file mode 100644 index 0000000000..491904e897 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.json @@ -0,0 +1,117 @@ +{ + "run": { + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:35:58.306Z", + "ended_at": "2026-05-02T18:35:58.316Z", + "status": "completed", + "entry_user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "root_query_id": "ff52a587-6842-4fa6-a0d7-82537d11049a", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "root_query_id": "ff52a587-6842-4fa6-a0d7-82537d11049a", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "root_query_id": "ff52a587-6842-4fa6-a0d7-82537d11049a", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "started_at": "2026-05-02T18:35:58.306Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:35:58.316Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_1e1e184f4d5d", + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 100, + "total_billed_tokens": 110, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "ff52a587-6842-4fa6-a0d7-82537d11049a", + "user_action_id": "31267657-6e21-4cac-80ab-da7d55690e5b", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:35:58.306Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.json b/tests/evals/v2/runs/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.json new file mode 100644 index 0000000000..3cbfee8d65 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.json @@ -0,0 +1,118 @@ +{ + "run": { + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:35:59.290Z", + "ended_at": "2026-05-02T18:35:59.300Z", + "status": "completed", + "entry_user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "root_query_id": "b8547936-74ae-453d-8955-9e4a4fd1b388", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "root_query_id": "b8547936-74ae-453d-8955-9e4a4fd1b388", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "root_query_id": "b8547936-74ae-453d-8955-9e4a4fd1b388", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 
execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "started_at": "2026-05-02T18:35:59.290Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:35:59.300Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_51c8c47f1c92", + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 90, + "total_billed_tokens": 100, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "b8547936-74ae-453d-8955-9e4a4fd1b388", + "user_action_id": "659719ae-5215-4efc-bedc-c626af0161bd", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:35:59.290Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.json b/tests/evals/v2/runs/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.json new file mode 100644 index 0000000000..7a3fa456a9 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.json @@ -0,0 +1,120 @@ +{ + "run": { + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:36:00.396Z", + "ended_at": "2026-05-02T18:36:00.406Z", + "status": "completed", + "entry_user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "root_query_id": "a59382a2-80e4-4593-80f2-e416634ff888", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "root_query_id": "a59382a2-80e4-4593-80f2-e416634ff888", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "root_query_id": "a59382a2-80e4-4593-80f2-e416634ff888", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 
execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "started_at": "2026-05-02T18:36:00.396Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:00.406Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_046647b1dd14", + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 95, + "total_billed_tokens": 105, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "a59382a2-80e4-4593-80f2-e416634ff888", + "user_action_id": "0af9186b-081f-43a8-be0f-7f4f67c17416", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:00.396Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.json b/tests/evals/v2/runs/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.json new file mode 100644 index 0000000000..a3e6a1e140 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.json @@ -0,0 +1,122 @@ +{ + "run": { + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "repeat_index": 1, + "started_at": "2026-05-02T18:36:01.515Z", + "ended_at": "2026-05-02T18:36:01.525Z", + "status": "completed", + "entry_user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "root_query_id": "19e5257b-24f7-4ceb-ad92-30837387e139", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "root_query_id": "19e5257b-24f7-4ceb-ad92-30837387e139", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "root_query_id": "19e5257b-24f7-4ceb-ad92-30837387e139", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + 
"input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "started_at": "2026-05-02T18:36:01.515Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:01.525Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_89cf50a8b6b1", + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 100, + "total_billed_tokens": 110, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "19e5257b-24f7-4ceb-ad92-30837387e139", + "user_action_id": "5e2e7376-c088-4bb9-ad88-a7a0a30cb2f6", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:01.515Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.json b/tests/evals/v2/runs/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.json new file mode 100644 index 0000000000..b70bca6d0c --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.json @@ -0,0 +1,123 @@ +{ + "run": { + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "repeat_index": 1, + "started_at": "2026-05-02T18:36:02.529Z", + "ended_at": "2026-05-02T18:36:02.539Z", + "status": "completed", + "entry_user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "root_query_id": "b2728007-19b0-453b-9283-8b8b3fd4b3f0", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "root_query_id": "b2728007-19b0-453b-9283-8b8b3fd4b3f0", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "root_query_id": "b2728007-19b0-453b-9283-8b8b3fd4b3f0", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 
robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "started_at": "2026-05-02T18:36:02.529Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:02.539Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_8c53b90c3d92", + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 90, + "total_billed_tokens": 100, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "b2728007-19b0-453b-9283-8b8b3fd4b3f0", + "user_action_id": "0c047aff-f3e6-4a2b-9c4d-4a3e9523315b", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:02.529Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.json b/tests/evals/v2/runs/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.json new file mode 100644 index 0000000000..8fcde09588 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.json @@ -0,0 +1,125 @@ +{ + "run": { + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "repeat_index": 1, + "started_at": "2026-05-02T18:36:03.663Z", + "ended_at": "2026-05-02T18:36:03.673Z", + "status": "completed", + "entry_user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "root_query_id": "8987783a-22a5-4b21-8e59-2f87b4de19af", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "root_query_id": "8987783a-22a5-4b21-8e59-2f87b4de19af", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "root_query_id": "8987783a-22a5-4b21-8e59-2f87b4de19af", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to 
exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "started_at": "2026-05-02T18:36:03.663Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:03.673Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_042669f544ce", + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 95, + "total_billed_tokens": 105, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "8987783a-22a5-4b21-8e59-2f87b4de19af", + "user_action_id": "5cbe5887-4214-4541-acf8-6333218aed6d", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:03.663Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.json b/tests/evals/v2/runs/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.json new file mode 100644 index 0000000000..ea18f4a67a --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.json @@ -0,0 +1,122 @@ +{ + "run": { + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:36:04.810Z", + "ended_at": "2026-05-02T18:36:04.820Z", + "status": "completed", + "entry_user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "root_query_id": "03eae129-e46b-4a2b-b590-6760260dab08", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "root_query_id": "03eae129-e46b-4a2b-b590-6760260dab08", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "root_query_id": "03eae129-e46b-4a2b-b590-6760260dab08", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + 
"input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "started_at": "2026-05-02T18:36:04.810Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:04.820Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_6a5011686a1c", + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 100, + "total_billed_tokens": 110, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "03eae129-e46b-4a2b-b590-6760260dab08", + "user_action_id": "c781769d-13e2-4389-89bb-80fd0fa48cc9", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:04.810Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.json b/tests/evals/v2/runs/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.json new file mode 100644 index 0000000000..d0df54e511 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.json @@ -0,0 +1,123 @@ +{ + "run": { + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:36:05.821Z", + "ended_at": "2026-05-02T18:36:05.831Z", + "status": "completed", + "entry_user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "root_query_id": "72bf3b7e-d2d7-45f0-9607-6fbe6fe24021", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "root_query_id": "72bf3b7e-d2d7-45f0-9607-6fbe6fe24021", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "root_query_id": "72bf3b7e-d2d7-45f0-9607-6fbe6fe24021", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 
robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "started_at": "2026-05-02T18:36:05.821Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:05.831Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_ba88f7385940", + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 90, + "total_billed_tokens": 100, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "72bf3b7e-d2d7-45f0-9607-6fbe6fe24021", + "user_action_id": "1bf4c32c-3dbe-4ab7-906d-7ff0dabd68c3", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:05.821Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/runs/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.json b/tests/evals/v2/runs/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.json new file mode 100644 index 0000000000..c452bdf25f --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.json @@ -0,0 +1,125 @@ +{ + "run": { + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-02T183554916Z", + "repeat_index": 2, + "started_at": "2026-05-02T18:36:06.949Z", + "ended_at": "2026-05-02T18:36:06.959Z", + "status": "completed", + "entry_user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "root_query_id": "10f63fde-e69e-4e42-9113-31d6ea626479", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "root_query_id": "10f63fde-e69e-4e42-9113-31d6ea626479", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "root_query_id": "10f63fde-e69e-4e42-9113-31d6ea626479", + "observability_db_ref": ".observability\\v2-robustness-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to 
exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-02", + "user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "started_at": "2026-05-02T18:36:06.949Z", + "started_at_ms": 0, + "ended_at": "2026-05-02T18:36:06.959Z", + "ended_at_ms": 10, + "duration_ms": 10, + "event_count": 2, + "query_count": 1, + "main_thread_query_count": 1, + "subagent_query_count": 0, + "subagent_count": 0, + "tool_call_count": 0, + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_06f9838e86ec", + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "total_prompt_input_tokens": 95, + "total_billed_tokens": 105, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "10f63fde-e69e-4e42-9113-31d6ea626479", + "user_action_id": "ef24adf5-89d3-4024-87cd-14db5f49e20d", + "agent_name": "main_thread", + "started_at": "2026-05-02T18:36:06.949Z", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "observed_at": "", + "observed_query_source": "", + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [], + "reason": "No session-memory policy observation event was found for this run." 
+ } +} diff --git a/tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json b/tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json new file mode 100644 index 0000000000..ad66752afc --- /dev/null +++ b/tests/evals/v2/scenarios/robustness_smoke_minimal_alt.json @@ -0,0 +1,29 @@ +{ + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scores/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.scores.json b/tests/evals/v2/scores/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.scores.json new file mode 100644 index 0000000000..123f98d364 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "dimension": "task_success", + "subdimension": "main_chain_observed", + 
"score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_stability_recovery_absence", + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183555972Z_execute_harness_smoke_minimal_baseline_default_604a7b67", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.scores.json b/tests/evals/v2/scores/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.scores.json new file mode 100644 index 0000000000..3462f99188 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26_stability_recovery_absence", + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183557002Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9c051f26", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.scores.json b/tests/evals/v2/scores/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.scores.json new file mode 100644 index 0000000000..5ce570ff87 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444_stability_recovery_absence", + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183558138Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_f8573444", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.scores.json b/tests/evals/v2/scores/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.scores.json new file mode 100644 index 0000000000..f32dda8408 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_stability_recovery_absence", + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183559260Z_execute_harness_smoke_minimal_baseline_default_31267657", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.scores.json b/tests/evals/v2/scores/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.scores.json new file mode 100644 index 0000000000..74d8dda620 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae_stability_recovery_absence", + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183600230Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_659719ae", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.scores.json b/tests/evals/v2/scores/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.scores.json new file mode 100644 index 0000000000..f0115f3c53 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b_stability_recovery_absence", + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183601346Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_0af9186b", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.scores.json b/tests/evals/v2/scores/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.scores.json new file mode 100644 index 0000000000..84c9f01e36 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_stability_recovery_absence", + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183602496Z_robustness_smoke_minimal_alt_baseline_default_5e2e7376", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.scores.json b/tests/evals/v2/scores/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.scores.json new file mode 100644 index 0000000000..712d980816 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff_stability_recovery_absence", + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183603500Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_0c047aff", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.scores.json b/tests/evals/v2/scores/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.scores.json new file mode 100644 index 0000000000..20a5d2ae5f --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887_stability_recovery_absence", + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183604648Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_5cbe5887", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.scores.json b/tests/evals/v2/scores/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.scores.json new file mode 100644 index 0000000000..0096c05d5f --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_stability_recovery_absence", + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183605793Z_robustness_smoke_minimal_alt_baseline_default_c781769d", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.scores.json b/tests/evals/v2/scores/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.scores.json new file mode 100644 index 0000000000..a1eb5f4478 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c_stability_recovery_absence", + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183606790Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_1bf4c32c", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.scores.json b/tests/evals/v2/scores/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.scores.json new file mode 100644 index 0000000000..f963f0d862 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5_task_success_main_chain_observed", + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5_efficiency_total_billed_tokens", + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5_stability_recovery_absence", + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5_controllability_turn_limit_basic", + "run_id": "run_2026-05-02T183607920Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ef24adf5", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/variants/candidate_eval_fixture_shadow.json b/tests/evals/v2/variants/candidate_eval_fixture_shadow.json new file mode 100644 index 0000000000..72c228776e --- /dev/null +++ b/tests/evals/v2/variants/candidate_eval_fixture_shadow.json @@ -0,0 +1,12 @@ +{ + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+} From fb08b4ff90a2fc9ea4876634a84c2cbc9d656ae7 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Mon, 4 May 2026 00:28:52 +0800 Subject: [PATCH 17/26] Advance observability v2.5 feedback contracts --- ...05\350\257\273\346\214\207\345\215\227.md" | 519 ++ ...21\351\241\265\347\253\257\357\274\211.md" | 302 ++ ...05\350\257\273\346\214\207\345\215\227.md" | 272 + ...05\350\257\273\346\214\207\345\215\227.md" | 245 + .../README.md" | 20 + ...robustness_smoke_2026-05-03T070927523Z.md" | 45 + ...xt_fixture_smoke_2026-05-03T070957231Z.md" | 98 + ...ntext_real_smoke_2026-05-03T060617173Z.md" | 65 + ...ntext_real_smoke_2026-05-03T145644822Z.md" | 66 + ...tion_contract_v0_2026-05-03T153229792Z.md" | 66 + ...ndidate_session_memory_sparse_a3fb1e0d.md" | 68 + ...tion_contract_v0_2026-05-03T153229792Z.md" | 154 + ...\243\350\257\273-2026-05-03T070927523Z.md" | 217 + ...\243\350\257\273-2026-05-03T070957231Z.md" | 247 + ...\243\350\257\273-2026-05-03T060617173Z.md" | 290 + ...oke_alpha_20260503T103210763Z_9b46cb66.md" | 180 + ...moke_beta_20260503T124541901Z_355a063b.md" | 307 ++ ...moke_beta_20260503T145942988Z_7893da90.md" | 211 + ...trac_beta_20260503T153244784Z_57470f65.md" | 211 + ...trac_beta_20260503T154626054Z_5ed1c19e.md" | 211 + .../v2/README.md" | 133 +- scripts/evals/v2_emit_fixture_trace.ts | 40 +- scripts/evals/v2_harness_execution.ts | 623 ++- scripts/evals/v2_record_run.ts | 99 +- scripts/evals/v2_run_experiment.ts | 907 +++- scripts/evals/v2_run_feedback.ts | 1338 +++++ scripts/evals/v2_score_registry.ts | 201 + .../evals/v2_validate_experiment_artifacts.ts | 18 + .../evals/v2_validate_feedback_artifacts.ts | 549 ++ scripts/evals/v2_validate_manifests.ts | 220 +- scripts/evals/v2_verify_long_context.ts | 106 + src/observability/v2/evalTypes.ts | 193 +- tests/evals/v2/README.md | 95 +- tests/evals/v2/V2.4-long-context-usage.md | 146 + tests/evals/v2/V2.5-feedback-loop-usage.md | 185 + tests/evals/v2/experiment-runs/README.md | 36 +- 
...obustness_smoke_2026-05-03T070927523Z.json | 2786 ++++++++++ ...t_fixture_smoke_2026-05-03T070957231Z.json | 4690 +++++++++++++++++ ...text_real_smoke_2026-05-03T060617173Z.json | 836 +++ ...text_real_smoke_2026-05-03T145644822Z.json | 837 +++ ...ion_contract_v0_2026-05-03T153229792Z.json | 842 +++ ...experiment.long_context.fixture_smoke.json | 47 + ...xt.real_smoke.expectation_contract_v0.json | 44 + .../_experiment.long_context.real_smoke.json | 44 + ...tract_v0_20260503T103210763Z_2d4e45cb.json | 25 + ...tract_v0_20260503T124541901Z_66e07dac.json | 25 + ...tract_v0_20260503T145942988Z_829a2c3a.json | 25 + ...tract_v0_20260503T103210763Z_7f0974ed.json | 25 + ...tract_v0_20260503T124541901Z_d326279e.json | 25 + ...tract_v0_20260503T145942988Z_1bdb5652.json | 25 + ...arser_v0_20260503T103210763Z_c72924f7.json | 25 + ...arser_v0_20260503T124541901Z_d4ec8978.json | 25 + ...nding_v0_20260503T103210763Z_d3a111b9.json | 25 + ...nding_v0_20260503T124541901Z_b0296355.json | 25 + ...contract_20260503T154626054Z_b4723ba2.json | 25 + ...tract_v0_20260503T153244784Z_0241aad3.json | 25 + ...tract_v0_20260503T154626054Z_9131c8e3.json | 25 + ...tract_v0_20260503T153244784Z_f1ed1c1f.json | 25 + ...tract_v0_20260503T103210763Z_d1610f7f.json | 20 + ...tract_v0_20260503T124541901Z_0b77bb8b.json | 20 + ...tract_v0_20260503T145942988Z_1e6a3fb4.json | 20 + ...tract_v0_20260503T103210763Z_6f16a48e.json | 20 + ...tract_v0_20260503T124541901Z_06010de6.json | 20 + ...tract_v0_20260503T145942988Z_62748519.json | 20 + ...arser_v0_20260503T103210763Z_4d4bb400.json | 21 + ...arser_v0_20260503T124541901Z_346bd758.json | 21 + ...nding_v0_20260503T103210763Z_f6ca0f37.json | 21 + ...nding_v0_20260503T124541901Z_415a96a3.json | 21 + ...contract_20260503T154626054Z_2002193a.json | 20 + ...tract_v0_20260503T153244784Z_c29168a1.json | 20 + ...tract_v0_20260503T154626054Z_7c0d5a2f.json | 20 + ...tract_v0_20260503T153244784Z_ff510cf4.json | 20 + ...ontext_f_20260503T103210763Z_bd4fc15b.json | 10 + 
...ontext_f_20260503T124541901Z_b497c06c.json | 16 + ..._real_sm_20260503T103210763Z_2086d4ae.json | 10 + ..._real_sm_20260503T103210763Z_f63fd723.json | 10 + ..._real_sm_20260503T124541901Z_02dccdee.json | 16 + ..._real_sm_20260503T124541901Z_534c0740.json | 16 + ..._real_sm_20260503T145942988Z_69707008.json | 16 + ..._real_sm_20260503T145942988Z_6ac48f97.json | 16 + ...l_review_20260503T103210763Z_aaceea39.json | 10 + ...l_review_20260503T124541901Z_4fbdb97e.json | 16 + ...l_review_20260503T145942988Z_3c7be194.json | 16 + ..._retriev_20260503T103210763Z_acb6cee2.json | 10 + ..._retriev_20260503T124541901Z_efe417a8.json | 16 + ..._retriev_20260503T145942988Z_7fb1e53a.json | 16 + ...positive_20260503T103210763Z_5d5767ae.json | 10 + ...positive_20260503T124541901Z_70cd437b.json | 16 + ...positive_20260503T145942988Z_f7a7a853.json | 16 + ...text_fac_20260503T103210763Z_e7b6a006.json | 10 + ...text_fac_20260503T124541901Z_2f6593de.json | 16 + ...nclusive_20260503T103210763Z_28ef91e4.json | 10 + ...nclusive_20260503T124541901Z_72968af2.json | 16 + ...nclusive_20260503T145942988Z_e946246a.json | 16 + ..._real_sm_20260503T153244784Z_22ead42f.json | 16 + ..._real_sm_20260503T153244784Z_3b395438.json | 16 + ..._real_sm_20260503T154626054Z_1e601052.json | 16 + ..._real_sm_20260503T154626054Z_537428d4.json | 16 + ...l_review_20260503T153244784Z_ba0288de.json | 16 + ...l_review_20260503T154626054Z_72a1d044.json | 16 + ..._retriev_20260503T153244784Z_0bf6f7ad.json | 16 + ..._retriev_20260503T154626054Z_5550e925.json | 16 + ...positive_20260503T153244784Z_d24225e3.json | 16 + ...positive_20260503T154626054Z_797c63b8.json | 16 + ...nclusive_20260503T153244784Z_5de554f8.json | 16 + ...nclusive_20260503T154626054Z_7e7d8ae0.json | 16 + ...c_scores_20260503T103210763Z_ac3b840c.json | 17 + ...c_scores_20260503T124541901Z_f3494c13.json | 24 + ...ill_open_20260503T103210763Z_a207056a.json | 17 + ...ill_open_20260503T124541901Z_54cd7243.json | 24 + 
...ill_open_20260503T145942988Z_2aa4b447.json | 24 + ..._missing_20260503T103210763Z_e3ed5d57.json | 18 + ..._missing_20260503T124541901Z_569976b8.json | 26 + ...tability_20260503T103210763Z_21239a93.json | 17 + ...tability_20260503T124541901Z_e6e1981e.json | 24 + ...tability_20260503T145942988Z_01fd35e0.json | 24 + ...ontract__20260503T154626054Z_46855661.json | 25 + ...ill_open_20260503T153244784Z_89789b5b.json | 24 + ...tability_20260503T153244784Z_9de1252e.json | 24 + ...tability_20260503T154626054Z_d615b243.json | 24 + ...arser_v0_20260503T103210763Z_19602146.json | 15 + ...arser_v0_20260503T124541901Z_5e4eee36.json | 25 + ...cores_v0_20260503T103210763Z_a7718488.json | 14 + ...cores_v0_20260503T124541901Z_6af2f3f2.json | 24 + ...tract_v0_20260503T103210763Z_b0a56fb4.json | 14 + ...tract_v0_20260503T124541901Z_30cd7b51.json | 24 + ...tract_v0_20260503T145942988Z_a0ba210d.json | 24 + ...tions_v0_20260503T103210763Z_d022ab84.json | 14 + ...tions_v0_20260503T124541901Z_013f97a8.json | 27 + ...tions_v0_20260503T145942988Z_3851af91.json | 27 + ...contract_20260503T154626054Z_75dd25e4.json | 27 + ...tract_v0_20260503T153244784Z_d19670cd.json | 24 + ...tract_v0_20260503T154626054Z_0bb87bd6.json | 24 + ...tions_v0_20260503T153244784Z_8bc73d52.json | 27 + ...ke_alpha_20260503T103210763Z_9b46cb66.json | 48 + ...oke_beta_20260503T124541901Z_355a063b.json | 102 + ...oke_beta_20260503T145942988Z_7893da90.json | 82 + ...rac_beta_20260503T153244784Z_57470f65.json | 82 + ...rac_beta_20260503T154626054Z_5ed1c19e.json | 82 + .../compaction-pressure/constraints.json | 19 + .../compaction-pressure/context_body.md | 25 + .../compaction-pressure/critical_facts.json | 16 + .../compaction-pressure/distractors.json | 12 + .../compaction-pressure/expected_output.md | 14 + .../constraint-retention/constraints.json | 19 + .../constraint-retention/context_body.md | 26 + .../constraint-retention/critical_facts.json | 12 + .../constraint-retention/distractors.json | 12 + 
.../constraint-retention/expected_output.md | 8 + .../distractor-resistance/constraints.json | 14 + .../distractor-resistance/context_body.md | 21 + .../distractor-resistance/critical_facts.json | 12 + .../distractor-resistance/distractors.json | 12 + .../distractor-resistance/expected_output.md | 3 + .../fact-retrieval/constraints.json | 14 + .../fact-retrieval/context_body.md | 25 + .../fact-retrieval/critical_facts.json | 16 + .../fact-retrieval/distractors.json | 12 + .../fact-retrieval/expected_output.md | 4 + ...aseline_default_2026-05-03T070927456Z.json | 33 + ..._fixture_shadow_2026-05-03T070927456Z.json | 33 + ...n_memory_sparse_2026-05-03T070927456Z.json | 33 + ...aseline_default_2026-05-03T070927456Z.json | 33 + ..._fixture_shadow_2026-05-03T070927456Z.json | 33 + ...n_memory_sparse_2026-05-03T070927456Z.json | 33 + ...aseline_default_2026-05-03T070957125Z.json | 33 + ...fixture_guarded_2026-05-03T070957125Z.json | 33 + ...aseline_default_2026-05-03T070957125Z.json | 33 + ...fixture_guarded_2026-05-03T070957125Z.json | 33 + ...aseline_default_2026-05-03T070957125Z.json | 33 + ...fixture_guarded_2026-05-03T070957125Z.json | 33 + ...aseline_default_2026-05-03T070957125Z.json | 33 + ...fixture_guarded_2026-05-03T070957125Z.json | 33 + ...aseline_default_2026-05-03T060545110Z.json | 32 + ...aseline_default_2026-05-03T145605757Z.json | 32 + ...n_memory_sparse_2026-05-03T060545110Z.json | 32 + ...n_memory_sparse_2026-05-03T145605757Z.json | 32 + ...aseline_default_2026-05-03T153143608Z.json | 32 + ...sion_memory_sparse_2026-05-03T1531436.json | 32 + ..._real_smoke_baseline_default_b963e6da.json | 288 + ...didate_session_memory_sparse_96004ff8.json | 289 + ...oke_minimal_baseline_default_49e858ae.json | 101 + ...didate_session_memory_sparse_1e5948a5.json | 102 + ...andidate_eval_fixture_shadow_09f1deec.json | 104 + ...oke_minimal_baseline_default_8600f149.json | 101 + ...didate_session_memory_sparse_862641d4.json | 102 + 
...andidate_eval_fixture_shadow_61d3ed8d.json | 104 + ...minimal_alt_baseline_default_231de0ad.json | 106 + ...didate_session_memory_sparse_c53e147c.json | 107 + ...andidate_eval_fixture_shadow_1afeb0f4.json | 109 + ...minimal_alt_baseline_default_5ee185bf.json | 106 + ...didate_session_memory_sparse_242dc6f0.json | 107 + ...andidate_eval_fixture_shadow_59258ce7.json | 109 + ...t_retention_baseline_default_a928b6b2.json | 243 + ...long_context_fixture_guarded_4be1715e.json | 245 + ...t_retention_baseline_default_fa3b48d1.json | 243 + ...long_context_fixture_guarded_6124af22.json | 245 + ...t_retrieval_baseline_default_fdcab6c9.json | 242 + ...long_context_fixture_guarded_1abcd4c9.json | 244 + ...t_retrieval_baseline_default_70401d6d.json | 242 + ...long_context_fixture_guarded_6d06184d.json | 244 + ..._resistance_baseline_default_4d94c847.json | 239 + ...long_context_fixture_guarded_23354a67.json | 240 + ..._resistance_baseline_default_0f2affa1.json | 239 + ...long_context_fixture_guarded_a3fd72c9.json | 240 + ...on_pressure_baseline_default_c9cab754.json | 265 + ...long_context_fixture_guarded_6488e757.json | 266 + ...on_pressure_baseline_default_31b412ce.json | 265 + ...long_context_fixture_guarded_8c630899.json | 266 + ..._real_smoke_baseline_default_4015c73b.json | 319 ++ ...didate_session_memory_sparse_54964348.json | 320 ++ ...contract_v0_baseline_default_0b6a625e.json | 330 ++ ...didate_session_memory_sparse_a3fb1e0d.json | 331 ++ .../long_context_compaction_pressure.json | 110 + .../long_context_constraint_retention.json | 109 + .../long_context_distractor_resistance.json | 106 + .../long_context_fact_retrieval.json | 108 + ...ong_context_fact_retrieval_real_smoke.json | 115 + ...fact_retrieval_real_smoke_contract_v0.json | 126 + .../score-specs/long-context.score-specs.json | 154 + ...moke_baseline_default_b963e6da.scores.json | 152 + ...session_memory_sparse_96004ff8.scores.json | 152 + ...imal_baseline_default_49e858ae.scores.json | 52 + 
...session_memory_sparse_1e5948a5.scores.json | 52 + ...e_eval_fixture_shadow_09f1deec.scores.json | 52 + ...imal_baseline_default_8600f149.scores.json | 52 + ...session_memory_sparse_862641d4.scores.json | 52 + ...e_eval_fixture_shadow_61d3ed8d.scores.json | 52 + ..._alt_baseline_default_231de0ad.scores.json | 52 + ...session_memory_sparse_c53e147c.scores.json | 52 + ...e_eval_fixture_shadow_1afeb0f4.scores.json | 52 + ..._alt_baseline_default_5ee185bf.scores.json | 52 + ...session_memory_sparse_242dc6f0.scores.json | 52 + ...e_eval_fixture_shadow_59258ce7.scores.json | 52 + ...tion_baseline_default_a928b6b2.scores.json | 142 + ...ntext_fixture_guarded_4be1715e.scores.json | 142 + ...tion_baseline_default_fa3b48d1.scores.json | 142 + ...ntext_fixture_guarded_6124af22.scores.json | 142 + ...eval_baseline_default_fdcab6c9.scores.json | 142 + ...ntext_fixture_guarded_1abcd4c9.scores.json | 142 + ...eval_baseline_default_70401d6d.scores.json | 142 + ...ntext_fixture_guarded_6d06184d.scores.json | 142 + ...ance_baseline_default_4d94c847.scores.json | 142 + ...ntext_fixture_guarded_23354a67.scores.json | 142 + ...ance_baseline_default_0f2affa1.scores.json | 142 + ...ntext_fixture_guarded_a3fd72c9.scores.json | 142 + ...sure_baseline_default_c9cab754.scores.json | 142 + ...ntext_fixture_guarded_6488e757.scores.json | 142 + ...sure_baseline_default_31b412ce.scores.json | 142 + ...ntext_fixture_guarded_8c630899.scores.json | 142 + ...moke_baseline_default_4015c73b.scores.json | 152 + ...session_memory_sparse_54964348.scores.json | 152 + ...t_v0_baseline_default_0b6a625e.scores.json | 152 + ...session_memory_sparse_a3fb1e0d.scores.json | 152 + ...andidate_long_context_fixture_guarded.json | 12 + ..._4_long_context_2026-05-03T055334949Z.json | 9 + 256 files changed, 33559 insertions(+), 108 deletions(-) create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3-V2.5\345\275\223\345\211\215\347\212\266\346\200\201\345\220\214\346\255\245\347\250\277\357\274\210\347\275\221\351\241\265\347\253\257\357\274\211.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.4\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.3-robustness-\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070927523Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-fixture-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070957231Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-real-smoke-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T060617173Z.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md" create mode 100644 scripts/evals/v2_run_feedback.ts create mode 100644 scripts/evals/v2_validate_feedback_artifacts.ts create mode 100644 scripts/evals/v2_verify_long_context.ts create mode 100644 tests/evals/v2/V2.4-long-context-usage.md create mode 100644 tests/evals/v2/V2.5-feedback-loop-usage.md create mode 100644 tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json create mode 100644 tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json create mode 100644 tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json create mode 100644 tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json create mode 100644 tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json create mode 
100644 tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json create mode 100644 tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json create mode 100644 tests/evals/v2/experiments/_experiment.long_context.real_smoke.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9.json create mode 100644 
tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6.json create mode 100644 
tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b.json create mode 100644 
tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2.json create mode 100644 
tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438.json create mode 100644 
tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c.json create 
mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b.json create mode 100644 
tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json create mode 100644 
tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json create mode 100644 tests/evals/v2/fixtures/long-context/compaction-pressure/constraints.json create mode 100644 tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md create mode 100644 tests/evals/v2/fixtures/long-context/compaction-pressure/critical_facts.json create mode 100644 tests/evals/v2/fixtures/long-context/compaction-pressure/distractors.json create mode 100644 tests/evals/v2/fixtures/long-context/compaction-pressure/expected_output.md create mode 100644 tests/evals/v2/fixtures/long-context/constraint-retention/constraints.json create mode 100644 
tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md create mode 100644 tests/evals/v2/fixtures/long-context/constraint-retention/critical_facts.json create mode 100644 tests/evals/v2/fixtures/long-context/constraint-retention/distractors.json create mode 100644 tests/evals/v2/fixtures/long-context/constraint-retention/expected_output.md create mode 100644 tests/evals/v2/fixtures/long-context/distractor-resistance/constraints.json create mode 100644 tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md create mode 100644 tests/evals/v2/fixtures/long-context/distractor-resistance/critical_facts.json create mode 100644 tests/evals/v2/fixtures/long-context/distractor-resistance/distractors.json create mode 100644 tests/evals/v2/fixtures/long-context/distractor-resistance/expected_output.md create mode 100644 tests/evals/v2/fixtures/long-context/fact-retrieval/constraints.json create mode 100644 tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md create mode 100644 tests/evals/v2/fixtures/long-context/fact-retrieval/critical_facts.json create mode 100644 tests/evals/v2/fixtures/long-context/fact-retrieval/distractors.json create mode 100644 tests/evals/v2/fixtures/long-context/fact-retrieval/expected_output.md create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z.json create mode 100644 
tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z.json create 
mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z.json create mode 100644 tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.json create mode 100644 
tests/evals/v2/runs/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.json create mode 100644 
tests/evals/v2/runs/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.json create mode 100644 tests/evals/v2/runs/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.json create mode 100644 tests/evals/v2/scenarios/long-context/long_context_compaction_pressure.json create mode 100644 tests/evals/v2/scenarios/long-context/long_context_constraint_retention.json create mode 100644 tests/evals/v2/scenarios/long-context/long_context_distractor_resistance.json create mode 100644 tests/evals/v2/scenarios/long-context/long_context_fact_retrieval.json create mode 100644 
tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke.json create mode 100644 tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json create mode 100644 tests/evals/v2/score-specs/long-context.score-specs.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.scores.json create mode 100644 
tests/evals/v2/scores/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.scores.json create mode 100644 
tests/evals/v2/scores/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.scores.json create mode 100644 tests/evals/v2/variants/candidate_long_context_fixture_guarded.json create mode 100644 tests/evals/v2/verification-reports/v2_4_long_context_2026-05-03T055334949Z.json diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" 
"b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" new file mode 100644 index 0000000000..b7f06da0e1 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" @@ -0,0 +1,519 @@ +# V2.2.5版本项目介绍与阅读指南 + +## 理解清单 + +- `V2.2.5` 不是一个“新增很多指标”的版本,而是一个“把真实实验闭环补齐”的版本。 +- 它解决的核心问题是:`V2.2-beta` 之前虽然已经能做 `smoke`,但真实 `real experiment` 一度被平台启动问题卡住,导致系统还不能稳定回答“这个 harness 改动到底有没有真实效果”。 +- `V2.2.5` 做成之后,系统终于同时拥有两条可用路径: + - 自动路径:`execute_harness` + - 保底路径:`manual real run + bind_existing` +- 这两条路径最后都必须收敛到同一类 V2 证据,而不是一条“真”、一条“假”。 + +## 预期效果 + +读完这份文档后,你应该能清楚回答下面这些问题: + +1. `V2` 这个系统到底在评测什么,不在评测什么。 +2. `V2.2.5` 和 `V2.1`、`V2.2-alpha`、`V2.2-beta` 的关系是什么。 +3. `scenario / variant / experiment / run / score / report` 各自是什么。 +4. 一次真实实验是如何从“发送任务”走到“得到结论”的。 +5. 你应该先读哪些文件,再读哪些文件。 +6. 你自己要复跑一次 `V2.2.5` 时,最短命令链是什么。 + +## 设计思路 + +这份指南按“先理解系统定位,再理解对象模型,再理解目录,再理解运行顺序”的方式组织。 + +原因很简单: + +- 如果先看脚本,你会陷入实现细节,看不出 V2 的抽象边界。 +- 如果只看任务书,你会知道目标,但不知道当前仓库里真实已经做到哪里。 +- 所以最有效的阅读方式是: + - 先看系统在解决什么问题 + - 再看 V2.2.5 当前已经闭合了什么 + - 再看具体实现和 artifact + +--- + +## 1. 
V2 系统到底是什么 + +### 1.1 一句话定义 + +`V2` 不是一个“更漂亮的 dashboard”,而是一个**面向 harness 演进的本地评测系统**。 + +它的目标不是只回答: + +- 这次 trace 里发生了什么 + +而是进一步回答: + +- baseline 和 candidate 哪个更好 +- 好在哪里 +- 是真的更好,还是只是更贵 +- 这个结论有没有足够可靠的证据 + +### 1.2 它和 V1 的关系 + +`V1` 解决的是“观测”。 + +它关心的是: + +- 一个 `user_action_id` 下发生了哪些 query / turn / tool / subagent +- 成本是多少 +- trace 是否完整 + +`V2` 解决的是“评测”。 + +它关心的是: + +- 给定一个 `scenario` +- 对比一个 `baseline variant` 和一个 `candidate variant` +- 把两边各自对应的 V1 事实证据绑定出来 +- 自动产出 run / score / compare / experiment summary + +所以 V2 永远建立在 V1 之上。 +V2 自己不发明事实,它只消费 V1 的事实证据。 + +### 1.3 为什么 V2.2.5 重要 + +在 `V2.2.5` 之前,系统已经具备: + +- `V2.1`: `bind_existing`,可以把已有的真实 `user_action_id` 做成正式实验 +- `V2.2-alpha`: `execute_harness` 自动执行链路 +- `V2.2-beta`: `variant_effect_observed`、`experiment_validity`、`runtime_difference_summary` + +但还差最后一步: + +- `real experiment` 能不能稳定跑通 + +`V2.2.5` 正是在补这个最后缺口。 + +--- + +## 2. V2.2.5 当前到底已经实现了什么 + +### 2.1 自动真实实验路径 + +你现在可以直接运行: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json +``` + +这会做完整闭环: + +```text +读取 experiment +-> 读取 scenario +-> 读取 baseline variant +-> 读取 candidate variant +-> 自动执行 baseline +-> 自动执行 candidate +-> 通过 benchmark_run_id 捕获各自 user_action_id +-> 生成 run / score / compare / experiment summary +``` + +当前一份成功的正式产物是: + +- [自动 real experiment summary](../06-运行报告/experiment_session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.md) +- [自动 real experiment JSON](../../../../tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json) + +### 2.2 手动保底路径 + +你现在也可以不用自动执行器,先自己跑出两条真实 trace,再回绑成正式实验。 + +这条路径是: + +```text +手动 baseline real run +-> baseline user_action_id +手动 candidate real run +-> candidate user_action_id +写入 bind_existing manifest +-> 跑 V2 experiment +-> 生成正式 artifact +``` + +当前一份成功的正式产物是: + +- [manual fallback 
summary](../06-运行报告/experiment_session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.md) +- [manual fallback JSON](../../../../tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json) + +### 2.3 这两条路径为什么都要有 + +自动路径的优点: + +- 用起来最顺 +- 真正符合未来“一键跑实验”的方向 + +手动路径的优点: + +- 就算启动桥或平台环境抖动,评测系统本身仍然可用 +- 能区分“运行器坏了”还是“评分口径坏了” + +所以 `V2.2.5` 的价值不只是把某个 bug 修掉,而是让 V2 真正具备**主路径 + 保底路径**。 + +--- + +## 3. 你必须掌握的对象模型 + +### 3.1 scenario + +`scenario` 表示一个待评测任务。 + +它定义: + +- 任务描述 +- `input_prompt` +- 预期约束 +- 希望观察到的行为 + +本轮真实实验使用的场景是: + +- [session_memory_trigger_sensitive.json](../../../../tests/evals/v2/scenarios/session_memory_trigger_sensitive.json) + +### 3.2 variant + +`variant` 表示一套待比较的 harness 配置或候选改动。 + +当前最重要的两个 variant 是: + +- [baseline.template.json](../../../../tests/evals/v2/variants/baseline.template.json) +- [candidate_session_memory_sparse.json](../../../../tests/evals/v2/variants/candidate_session_memory_sparse.json) + +在 `V2.2.5` 里,它们的关键差别不是文案,而是 runtime contract: + +- [session_memory_default.runtime.json](../../../../tests/evals/v2/configs/session_memory_default.runtime.json) +- [session_memory_sparse.runtime.json](../../../../tests/evals/v2/configs/session_memory_sparse.runtime.json) + +### 3.3 experiment + +`experiment` 是把 scenario 和 variant 组合起来的正式评测定义。 + +它会说明: + +- baseline 是谁 +- candidate 是谁 +- 用哪些 score spec +- 用哪套 gate policy +- 是 `bind_existing` 还是 `execute_harness` + +当前本轮最重要的两个 experiment: + +- 自动 real experiment: + [session_memory_runtime_sparse_vs_default.json](../../../../tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json) +- 手动 fallback experiment: + [session_memory_runtime_sparse_vs_default_manual.bind_existing.json](../../../../tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json) + +### 3.4 run + +`run` 是“一次 scenario 在某个 variant 下的正式评测记录”。 + +它不是原始日志,而是从 V1 事实里提炼出来的结构化记录。 + +它关心: + +- 绑定了哪个 
`user_action_id` +- root query 是谁 +- 成本是多少 +- turn / tool / subagent / recovery 情况如何 +- 有没有观察到 `variant_effect` + +### 3.5 score + +`score` 是 run 上的单维度评分结果。 + +本轮最关键的几个 score 是: + +- `task_success.main_chain_observed` +- `decision_quality.session_memory_policy_observed` +- `efficiency.total_billed_tokens` +- `decision_quality.subagent_count_observed` +- `stability.recovery_absence` +- `controllability.turn_limit_basic` + +### 3.6 experiment summary + +这是你平时最应该先看的 artifact。 + +它会聚合: + +- 这次 experiment 是什么 +- mode 是什么 +- baseline/candidate 是否都成功绑定 +- `experiment_validity` +- `variant_effect_summary` +- `runtime_difference_summary` +- `scorecard_summary` +- `risk_verdict` + +一句话说: +如果你只有 2 分钟,就先看 `experiment summary`。 + +--- + +## 4. V2.2.5 的核心闭环是怎么工作的 + +### 4.1 自动路径 + +自动路径的正式绑定 key 不是“最新 action”,而是: + +- `benchmark_run_id` + +完整链路可以理解成: + +```text +experiment manifest +-> scenario prompt +-> variant apply +-> headless CLI execution +-> V1 事件中注入 eval context +-> DuckDB 重建 +-> benchmark_run_id 查唯一 user_action_id +-> V2 run +-> V2 scores +-> compare report +-> experiment summary +``` + +### 4.2 手动路径 + +手动路径少掉的是“自动执行”,但不会少掉“正式评分”。 + +也就是说,差别只是: + +- 自动路径:系统自己先把 trace 跑出来 +- 手动路径:你先拿到 trace,再交给系统评测 + +后半段仍然是同一套 V2 逻辑。 + +### 4.3 这意味着什么 + +这意味着: + +- V2 的“评测口径”不依赖自动执行器 +- 自动执行器只是前端执行入口 +- 真正的 V2 价值在于“把真实 trace 转成正式评测结论” + +--- + +## 5. 
当前目录该怎么理解 + +### 5.1 面向版本说明的目录 + +- [v2/README.md](../README.md) +- [01-总览](./) +- [02-实施任务书](../02-实施任务书/) +- [03-数据模型](../03-数据模型/) +- [04-Scenario集](../04-Scenario集/) +- [05-Variant与实验](../05-Variant与实验/) +- [06-运行报告](../06-运行报告/) + +这里更适合回答: + +- 系统想做什么 +- 版本发展到了哪一步 +- 阅读顺序是什么 + +### 5.2 面向实际执行的目录 + +- [tests/evals/v2/README.md](../../../../tests/evals/v2/README.md) +- [tests/evals/v2/scenarios](../../../../tests/evals/v2/scenarios/) +- [tests/evals/v2/variants](../../../../tests/evals/v2/variants/) +- [tests/evals/v2/experiments](../../../../tests/evals/v2/experiments/) +- [tests/evals/v2/runs](../../../../tests/evals/v2/runs/) +- [tests/evals/v2/scores](../../../../tests/evals/v2/scores/) +- [tests/evals/v2/experiment-runs](../../../../tests/evals/v2/experiment-runs/) + +这里更适合回答: + +- 真正运行时用哪个文件 +- manifest 在哪 +- artifact 在哪 + +--- + +## 6. 推荐阅读顺序 + +### 第 1 层:先看这 3 份 + +1. 当前这份文档 + [V2.2.5版本项目介绍与阅读指南.md](./V2.2.5版本项目介绍与阅读指南.md) +2. V2 工作区说明 + [tests/evals/v2/README.md](../../../../tests/evals/v2/README.md) +3. V2.2.5 闭环说明 + [V2.2.5-real-experiment-closure.md](../../../../tests/evals/v2/V2.2.5-real-experiment-closure.md) + +读完这三份,你会知道“系统是什么、入口是什么、V2.2.5 到底解决了什么”。 + +### 第 2 层:再看真实案例 + +1. 自动 real experiment summary + [session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json](../../../../tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_2026-05-02T165222245Z.json) +2. manual fallback summary + [session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json](../../../../tests/evals/v2/experiment-runs/session_memory_runtime_sparse_vs_default_manual_bind_existing_2026-05-02T170311090Z.json) + +这两份会告诉你: +同一个评测目标,在两条路径下都能闭合。 + +### 第 3 层:最后看实现 + +推荐顺序: + +1. [v2_run_experiment.ts](../../../../scripts/evals/v2_run_experiment.ts) +2. [v2_harness_execution.ts](../../../../scripts/evals/v2_harness_execution.ts) +3. [v2_record_run.ts](../../../../scripts/evals/v2_record_run.ts) +4. 
[v2_compare_runs.ts](../../../../scripts/evals/v2_compare_runs.ts) +5. [v2_score_registry.ts](../../../../scripts/evals/v2_score_registry.ts) +6. [sessionMemory.ts](../../../../src/services/SessionMemory/sessionMemory.ts) + +原因: + +- `v2_run_experiment.ts` 是总调度器 +- `v2_harness_execution.ts` 是自动执行前半段 +- `v2_record_run.ts` 是 V1 -> V2 run 的桥 +- `v2_compare_runs.ts` 是对比逻辑 +- `v2_score_registry.ts` 是评分实现 +- `sessionMemory.ts` 是本轮真实差异的业务核心 + +--- + +## 7. 你自己复跑 V2.2.5 时,最简单的命令链 + +### 7.1 如果你想跑自动真实实验 + +```powershell +bun run scripts/evals/v2_validate_manifests.ts +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json +``` + +然后看: + +- `tests/evals/v2/experiment-runs/` +- `ObservrityTask/10-系统版本/v2/06-运行报告/` + +### 7.2 如果你想走手动 fallback + +先跑 baseline: + +```powershell +& 'scripts/evals/v2_manual_real_run.ps1' -ScenarioId 'session_memory_trigger_sensitive' -VariantId 'baseline_default' -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' -MaxTurns 12 +``` + +再跑 candidate: + +```powershell +& 'scripts/evals/v2_manual_real_run.ps1' -ScenarioId 'session_memory_trigger_sensitive' -VariantId 'candidate_session_memory_sparse' -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' -MaxTurns 12 +``` + +最后跑 experiment: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default_manual.bind_existing.json +``` + +--- + +## 8. 
你该怎么读结果 + +### 8.1 第一眼先看什么 + +先看 `experiment_validity`。 + +如果它不是: + +- `valid` + +那就不要急着解读成本差异。 + +### 8.2 第二眼看什么 + +看: + +- `variant_effect_summary` +- `runtime_difference_summary` + +这两块回答的是: + +- candidate 有没有真的改到 runtime +- baseline 和 candidate 的差异是不是被 V1/V2 证据稳定观察到了 + +### 8.3 第三眼看什么 + +再看 `scorecard_summary`。 + +对 `session_memory` 这个实验来说,最关键的是: + +- `decision_quality.subagent_count_observed` +- `efficiency.total_billed_tokens` + +当前结果里,这两个都是改善。 + +### 8.4 不要怎么读 + +不要只看到: + +- token 更低 + +就直接说: + +- candidate 更聪明 + +当前 `V2.2.5` 只能说明: + +- runtime policy 差异是可解释的 +- 某些成本/行为指标变好了 + +它还不能单独证明: + +- 全局更优 +- 长期更稳 +- 在更多任务上也一定更好 + +--- + +## 9. V2.2.5 的边界 + +当前版本仍然有明确边界: + +- 仍然是 `1 scenario / 1 baseline / 1 candidate / repeat=1` +- 还不是 batch robustness 系统 +- 还不是 long-context 专项系统 +- 还不是 tool/skill 价值专项系统 + +所以正确理解是: + +- `V2.2.5` 解决了“真实实验能不能闭合” +- 它还没有解决“这个结论在更多场景、多次重复下是否稳定” + +--- + +## 10. 从这里继续往后怎么走 + +如果以工程顺序看,我建议后续路线是: + +1. `V2.3 Batch + Robustness` + - 多 scenario + - repeat + - 看波动而不是只看单次结果 +2. `V2.4 Long-Context` + - 专门研究长上下文成本、压缩、记忆策略 +3. `V2.5 Tool / Skill Value` + - 研究 tool / skill 的真实价值,而不是只看调用次数 + +为什么不是直接跳到 long-context 或 skill? + +因为如果 batch 和 robustness 没补,你很容易把一次偶然结果误判成稳定规律。 + +--- + +## 11. 最后的阅读建议 + +如果你以后再次中断一段时间后回来,我建议你用下面这个顺序快速恢复上下文: + +1. 先读当前这份指南 +2. 再读 [tests/evals/v2/README.md](../../../../tests/evals/v2/README.md) +3. 再读最新一份 `experiment summary` +4. 
如果要深入,再去看 `run / compare / code` + +这样你能最快恢复到“知道系统现在是什么状态、怎么用、下一步该做什么”的工作面。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3-V2.5\345\275\223\345\211\215\347\212\266\346\200\201\345\220\214\346\255\245\347\250\277\357\274\210\347\275\221\351\241\265\347\253\257\357\274\211.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3-V2.5\345\275\223\345\211\215\347\212\266\346\200\201\345\220\214\346\255\245\347\250\277\357\274\210\347\275\221\351\241\265\347\253\257\357\274\211.md" new file mode 100644 index 0000000000..78b6e0696f --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.3-V2.5\345\275\223\345\211\215\347\212\266\346\200\201\345\220\214\346\255\245\347\250\277\357\274\210\347\275\221\351\241\265\347\253\257\357\274\211.md" @@ -0,0 +1,302 @@ +# V2.3-V2.5 当前状态同步稿(网页端) + +## 理解清单 + +这份同步稿的目的不是重新解释整套系统,而是把当前仓库里已经完成的 `V2.3 / V2.4 / V2.5` 真实状态压缩成一个网页端可继续规划的状态包。 + +当前主线已经推进到: + +```text +V2.3 -> batch / robustness +V2.4 -> long-context evaluation +V2.5 -> feedback loop beta +``` + +并且 `V2.5` 已经不只是“会提建议”,而是已经继续往前做了三步: + +1. `candidate_long_context_output_parser_v0` 已实现 +2. `candidate_long_context_expectation_contract_v0` 已实现 +3. 
`candidate_feedback_input_contract_after_contract_v0` 已实现为反馈系统层去重/稳态能力 + +## 当前结论(一句话版本) + +当前系统已经具备: + +- 批量评测 +- 长上下文专项评测 +- 真实链路下的轻量语义判定 +- 基于实验结果生成结构化反馈 +- 在反馈系统内部识别“某个 follow-up 已经执行过”,避免循环推荐 + +但当前系统还不具备: + +- 自动改代码 +- 自动 promote candidate +- 自动取消 manual review + +## V2.3 当前状态 + +### 目标 + +把 `V2.2.5` 的单次真实实验闭环,升级成: + +- multi-scenario +- multi-candidate +- repeat +- run_group +- stability summary +- flaky detection + +### 当前已完成 + +- runner 支持 `multi-scenario / multi-candidate / repeat_count > 1` +- 引入 `run_group` +- experiment summary 支持: + - `stability_summary` + - `flaky_scenarios` + - `run_failures` +- batch markdown report 已可用 +- 无成本 robustness smoke 已可用 + +### 当前代表性产物 + +- summary + `tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json` +- batch report + `ObservrityTask/10-系统版本/v2/06-运行报告/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md` + +### 当前结论 + +`V2.3` 已经不是阻塞项。 +它已经稳定提供: + +- 批量执行骨架 +- 重复运行骨架 +- 稳定性摘要骨架 + +## V2.4 当前状态 + +### 目标 + +在 `V2.3` 的 batch/robustness 之上,补出 long-context 专项评测层,重点观察: + +- constraint retention +- fact retrieval +- distractor resistance +- compaction / context governance + +### 当前已完成 + +- `fixture smoke` 已闭合 +- `real smoke` 已跑通 +- 长上下文对象模型已落地 +- `context.*` score-spec 已落地 +- `long_context_summary` 已进入正式 experiment summary + +### 关键进展 1:output parser 已实现 + +当前真实 `real smoke` 已不再停留在: + +- `constraint_retention_rate_mean = null` +- `retrieved_fact_hit_rate_mean = null` + +而是已经通过轻量 parser,把真实输出里的: + +- retained constraints +- retrieved facts +- missed facts +- distractor confusion + +正式写回 `long_context_evidence`。 + +### 当前代表性产物 + +- latest real smoke summary + `tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json` +- latest fixture smoke summary + `tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json` + +### 当前结论 + +`V2.4` 当前已经完成“最小真实语义闭环”: + +- real smoke 可跑 +- runtime difference 可观测 +- 轻量语义判定可落分 +- manual review 
仍保留为边界,而不是被假装消除 + +## V2.5 当前状态 + +### 目标 + +把实验结果转成结构化反馈,而不是只停留在: + +- 跑实验 +- 出报告 +- 人工自己读 + +### V2.5 alpha 已完成 + +已完成: + +- finding extractor +- hypothesis builder +- proposal generator +- candidate variant proposal +- next experiment plan + +### V2.5 beta 已完成 + +已完成: + +- feedback taxonomy +- proposal queue +- approval card +- feedback artifact validator + +### 关键进展 2:expectation contract follow-up 已实现 + +当前独立 follow-up 路径已经存在: + +- scenario + `tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json` +- experiment + `tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json` + +它的作用是: + +- 不改 runtime harness policy +- 只收紧: + - answer-shape expectation + - expected fact anchoring + - manual-review question precision + +### 关键进展 3:feedback input contract follow-up 已实现 + +这是这轮新增的重点。 + +当前反馈系统已经能识别: + +- source experiment 已经是 `expectation_contract_v0` +- 因此不应再把 `tighten_real_smoke_expectations_v0` 重复推荐为新的 top action + +也就是说,当前系统已经具备一层新的能力: + +```text +反馈系统能识别“某个 follow-up 已经被执行过” +``` + +这一步不是 runtime 改动,也不是 scenario 改动,而是 feedback-system 自身的稳态化。 + +### 当前最新反馈产物 + +- latest feedback run + `tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json` +- latest feedback report + `ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md` + +### 当前最新反馈结论 + +当前 queue 状态是: + +- `top_recommendation` + - `stabilize_feedback_input_contract_after_contract_v0` +- `deferred` + - `stabilize_feedback_input_contract_v0` + +这说明系统已经能区分: + +1. “当前最该改的是反馈系统去重/稳态逻辑” +2. 
“泛化的 feedback input stabilization 仍然有价值,但还不是现在最高优先级” + +### 当前 validator 状态 + +已通过: + +```powershell +bun run scripts/evals/v2_validate_feedback_artifacts.ts +``` + +这意味着最新 feedback 产物已经满足: + +- 唯一 `top_recommendation` +- proposal queue 自洽 +- approval card 自洽 +- candidate proposal / next plan 自洽 + +## 当前系统的真实能力边界 + +### 已具备 + +- `V2.3`:批量 / repeat / 稳定性 +- `V2.4`:long-context fixture + real smoke +- `V2.4`:real smoke 轻量语义判定 +- `V2.5`:结构化反馈 +- `V2.5`:approval card +- `V2.5`:feedback queue 去重与稳态识别 + +### 仍未具备 + +- 自动实现 proposal +- 自动改 harness runtime +- 自动修改 scenario/scorer +- 自动取消 manual review +- 自动做最终 candidate promote/reject + +## 当前最合理的下一步方向 + +如果网页端要继续写下一阶段任务书,我建议它不要回头重做: + +- output parser +- expectation contract +- feedback queue 基础 + +这些已经完成。 + +下一步更合理的方向应该是二选一: + +### 方向 A:继续做 V2.5 beta/stable + +重点做: + +- feedback taxonomy 更细分 +- manual-review findings 的层级化 +- proposal ranking 更稳定 +- feedback-run 间的一致性比较 +- “同一个问题反复出现” 的跨 run 聚合 + +### 方向 B:进入 V2.6 + +前提是网页端认可: + +- `V2.3-V2.5` 的当前骨架已经足够稳定 + +然后正式进入: + +- tool / skill 专项价值评测 +- 或更正式的 harness iteration workflow + +## 推荐给网页端的简版结论 + +可以直接把下面这段发给网页端: + +```text +当前仓库中的 V2.3-V2.5 已经推进到以下状态: + +1. V2.3 已完成 batch / repeat / run_group / stability summary / flaky detection。 +2. V2.4 已完成 long-context 评测层,fixture smoke 和 real smoke 都已跑通。 +3. V2.4 的 real smoke 不再只有 runtime evidence,轻量 output parser 已实现,constraint retention 和 fact retrieval 已能形成正式语义证据。 +4. V2.5 alpha/beta 已完成 feedback taxonomy、proposal queue、approval card、feedback artifact validator。 +5. expectation_contract_v0 已经落地为独立实验路径。 +6. feedback_input_contract_after_contract_v0 也已落地,反馈系统现在能识别“某个 follow-up 已经执行过”,不再循环推荐同一个 scenario-contract proposal。 +7. 
当前最新 feedback 的 top recommendation 是 feedback-system 层的 contract stabilization,而不是重新做 parser 或重新做 expectation contract。 + +因此,下一阶段任务书不应回退重做 V2.4 parser 或 V2.5 queue 基础,而应承接当前事实,继续规划: +- V2.5 beta/stable 的反馈体系深化 +或 +- 基于当前骨架进入下一版本的 tool/skill 专项价值评测。 +``` + +## 一句话总结 + +当前系统已经从“能看实验结果”推进到了“能识别自己已经走过哪些 follow-up,并把真正下一步动作收敛成唯一可拍板 proposal”的阶段。 diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.4\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.4\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" new file mode 100644 index 0000000000..629cdd375a --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.4\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" @@ -0,0 +1,272 @@ +# V2.4版本项目介绍与阅读指南 +## 理解清单 + +V2.4 的目标不是再扩一个泛化大平台,而是在 V2.3 已经具备 `batch / repeat / run_group / stability_summary` 的基础上,补出一组专门面向长上下文压力的评测能力。 + +这轮重点回答 5 个问题: + +- 上下文很长时,agent 会不会丢硬约束。 +- 关键事实被埋在长上下文里时,agent 能不能稳定找回。 +- 上下文里混入旧说明、假路径、废弃口径时,agent 会不会被带偏。 +- `compact / tool_result_budget / session_memory` 这些治理机制出现时,链路是否仍然可解释。 +- 长上下文下的成本变化,是否能和结果质量一起被观察,而不是只看 token。 + +V2.4 仍然复用 V2 的既有对象模型: + +- `scenario` +- `variant` +- `experiment` +- `run` +- `score` +- `run_group` + +V2.4 没有推翻 V2.3,而是在这些对象上新增了长上下文专用字段、专用 score-spec 和专用报告区块。 + +## 预期效果 + +如果你只想快速确认 V2.4 已经具备什么,现在可以直接理解成两条路径: + +1. `fixture smoke` + +- 完全不消耗真实模型成本。 +- 用 4 个长上下文 scenario family 验证: + - 约束保持 + - 事实找回 + - 抗干扰 + - compaction 压力 +- 会自动生成: + - `run` + - `score` + - `run_group` + - `experiment summary` + - `batch report` + - `long_context_summary` + +2. 
`real smoke` + +- 真实调用模型。 +- 只跑一个小型长上下文场景。 +- 目标不是做正式 benchmark,而是确认: + - `execute_harness` 真实链路可跑 + - 长上下文指标在真实运行下仍可解释 + - 至少能拿到成本、手工复核提示、上下文治理信号 + +## 设计思路 + +V2.4 没有试图把“长上下文能力”压成一个单分数。 + +因为长上下文问题本质上是复合问题,它至少包含: + +- `constraint retention` +- `fact retrieval` +- `distractor resistance` +- `context governance` +- `cost-quality tradeoff` + +所以 V2.4 的做法是: + +1. 用 scenario family 把问题拆开。 +2. 用 `context.*` score-spec 分别记录各类表现。 +3. 用 `long_context_summary` 在 experiment 层做聚合。 +4. 保留 `manual_review_questions`,承认这类问题不应被完全自动裁决。 + +## 与 V2.3 的关系 + +你可以把版本关系理解成: + +```text +V2.2.5 = 单次真实实验闭环 +V2.3 = 批量、重复、稳定性 +V2.4 = 长上下文专项评测 +``` + +V2.4 直接继承 V2.3 的这些能力: + +- 多 scenario +- repeat +- run_group +- stability summary +- flaky status +- batch markdown report + +所以 V2.4 不是一套平行系统,而是 “V2.3 runner + long-context 评测层”。 + +## 本轮新增能力 + +### 1. 长上下文 scenario family + +当前已落地 4 个核心 family: + +- `long_context_constraint_retention` +- `long_context_fact_retrieval` +- `long_context_distractor_resistance` +- `long_context_compaction_pressure` + +它们对应 4 类最核心的长上下文问题。 + +### 2. 长上下文 fixture 集 + +每个 family 都有独立 fixture 目录,至少包含: + +- `context_body.md` +- `critical_facts.json` +- `constraints.json` +- `distractors.json` +- `expected_output.md` + +这保证了 fixture smoke 可复现、可追溯、可扩展。 + +### 3. 长上下文专用 score-spec + +当前新增的 `context.*` 指标包括: + +- `context.retained_constraint_count` +- `context.lost_constraint_count` +- `context.constraint_retention_rate` +- `context.retrieved_fact_hit_rate` +- `context.distractor_confusion_count` +- `context.total_prompt_input_tokens` +- `context.compaction_trigger_count` +- `context.compaction_saved_tokens` +- `context.success_under_context_pressure` +- `context.manual_review_required` + +### 4. run 级长上下文证据 + +单个 `run` 现在会额外写出 `long_context` 结构,记录: + +- 当前场景属于哪个 `context_family` +- 上下文规模等级 +- 预期约束 +- 预期事实 +- 干扰项 +- compaction 相关计数 +- saved tokens +- manual review 提示 + +### 5. 
experiment 级长上下文汇总 + +experiment summary 现在新增: + +- `long_context_review_verdict` +- `long_context_summary` + +batch markdown 报告也会新增: + +- `## Long Context Summary` + +这一层是 V2.4 最重要的人类阅读入口。 + +## 当前推荐阅读顺序 + +1. 先读本文件。 +2. 再读 [tests/evals/v2/README.md](../../../../tests/evals/v2/README.md)。 +3. 再读 [tests/evals/v2/V2.4-long-context-usage.md](../../../../tests/evals/v2/V2.4-long-context-usage.md)。 +4. 然后看最新 V2.4 fixture smoke summary: + [v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.json](../../../../tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.json) +5. 再看对应 batch report: + [batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.md](../06-运行报告/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.md) + +## 如何运行 + +### 1. 先做 manifest 校验 + +```powershell +bun run scripts/evals/v2_validate_manifests.ts +``` + +### 2. 跑 V2.4 fixture smoke + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json +``` + +### 3. 跑 V2.4 verifier + +```powershell +bun run scripts/evals/v2_verify_long_context.ts +``` + +### 4. 如果要试真实链路,再跑 real smoke + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.json +``` + +## 结果怎么读 + +建议固定按这个顺序读: + +1. 最新 `experiment summary json` + +先看: + +- `mode` +- `report_profile` +- `experiment_validity` +- `long_context_review_verdict` +- `long_context_summary` + +2. 最新 `batch report` + +重点看: + +- `Batch Stability Table` +- `Long Context Summary` +- `Semantic Interpretation` +- `Manual Review Notes` + +3. 
如果某个 scenario 需要深挖,再看单个 `run json` + +重点看: + +- `scenario.long_context_profile` +- `evidence.action` +- `evidence.rootQuery` +- `variant_effect` +- `long_context` + +## 当前已确认的状态 + +截至当前版本,V2.4 的 `fixture smoke` 已闭合: + +- 4 个 scenario family 均已进入 summary +- baseline / candidate 均已生成 `run` 与 `score` +- `long_context_summary` 已生成 +- `long_context_review_verdict` 已生成 +- batch report 已带 `Long Context Summary` + +最新可直接查看的产物是: + +- [experiment summary](../../../../tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.json) +- [batch report](../06-运行报告/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.md) + +同时,V2.4 的 `real smoke` 也已经成功跑通,当前可直接查看: + +- [real smoke summary](../../../../tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json) +- [real smoke batch report](../06-运行报告/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md) + +当前这条真实链路的状态可以简化理解为: + +- `experiment_validity = valid` +- `long_context_review_verdict = needs_manual_review` +- 自动化的长上下文质量判断在 real smoke 下仍然有限 +- 但成本、compaction、tool-result-budget、session_memory policy evidence 已经进入正式产物 + +## 当前边界 + +V2.4 当前仍然有边界,不要误读: + +- 它不是最终的长上下文 benchmark 平台。 +- `manual_review_required` 依然是设计的一部分,不是暂时缺陷。 +- `fixture smoke` 最强,因为它能提供可控、可复现的 trace-backed 长上下文证据。 +- `real smoke` 只是小型真实链路确认,不代表大规模真实评测已经完成。 +- 本轮没有进入 `tool / skill` 专项价值评测,那是下一阶段问题。 + +## 一句话总结 + +V2.4 让这套系统第一次能够系统地问: + +```text +上下文变长之后,这个 harness 到底有没有稳住约束、事实和治理效果? 
+``` diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" new file mode 100644 index 0000000000..1d44706237 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.5\347\211\210\346\234\254\351\241\271\347\233\256\344\273\213\347\273\215\344\270\216\351\230\205\350\257\273\346\214\207\345\215\227.md" @@ -0,0 +1,245 @@ +# V2.5版本项目介绍与阅读指南 +## 理解清单 + +`V2.5` 的目标不是继续堆更多评测项,而是在 `V2.3` 与 `V2.4` 已经能够稳定产出实验结果的基础上,补出第一层“反馈回路”。 + +当前稳定状态已经推进到 `V2.5 beta`: + +- `alpha` 负责把实验结果转成结构化建议 +- `beta` 负责把这些建议正式分类、排序,并生成可拍板的 approval card + +这里的反馈回路不是: + +- agent 自动改代码 +- agent 自动合并 candidate +- agent 自动做自我进化 + +而是: + +- 把评测结果系统化地转成结构化建议 +- 明确哪些是事实,哪些是推断 +- 生成可审查的下一步 proposal +- 等你拍板 + +## 预期效果 + +如果你只想快速理解 V2.5,可以把它看成一条新 pipeline: + +```text +Experiment Report +-> Finding Extractor +-> Taxonomy Normalizer +-> Hypothesis Builder +-> Proposal Prioritizer +-> Candidate Variant Proposal +-> Next Experiment Plan +-> Human Approval Card +``` + +它会把实验结果输出为: + +- `Finding` +- `Hypothesis` +- `Improvement Proposal` +- `Candidate Variant Proposal` +- `Next Experiment Plan` +- `Feedback Run` + +并且把对应的人类可读报告写到: + +```text +ObservrityTask/10-系统版本/v2/07-反馈报告/ +``` + +## 设计思路 + +V2.5 的设计非常克制。 + +因为当前系统虽然已经能: + +- 批量评测 +- 做 long-context 专项评测 +- 观测 runtime difference + +但它还不能完全自动判断真实语义质量。 + +所以 V2.5 选择先补: + +- “建议生成层” + +而不是直接补: + +- “自动自我进化层” + +换句话说,V2.5 的核心原则是: + +```text +自动提建议 +不自动改代码 +``` + +## 与前面版本的关系 + +当前版本关系可以理解成: + +```text +V2.2.5 = 单次真实实验闭环 +V2.3 = batch / repeat / stability +V2.4 = 
long-context 专项评测 +V2.5 = feedback loop alpha +``` + +也就是说: + +- V2.3 解决“怎么批量跑” +- V2.4 解决“怎么评测 long-context” +- V2.5 解决“评测结果应该怎么转成下一步建议” + +## 当前新增对象 + +V2.5 新增或正式定义了 6 个核心对象: + +1. `Finding` +- 表示观察到的事实 + +2. `Hypothesis` +- 表示对 finding 的解释推断 + +3. `Improvement Proposal` +- 表示建议改哪一层 + +4. `Candidate Variant Proposal` +- 表示如果要做 candidate,草案应该长什么样 + +5. `Next Experiment Plan` +- 表示做完建议之后怎么验证 + +6. `Feedback Run` +- 表示一次 feedback 生成过程本身的正式产物 + +## 第一版 extractor 当前能处理什么 + +当前 `V2.5 beta` 仍然只处理明确规则化 finding: + +1. `constraint_retention_rate_mean = null` +2. `retrieved_fact_hit_rate_mean = null` +3. `long_context_review_verdict = needs_manual_review` +4. `risk_verdict.status = inconclusive` +5. `missing_score_count > 0` +6. `manual_review_required = true` +7. `flaky_status != stable` +8. `run_failures` 非空 + +这些 finding 都必须带 `evidence_ref`。 + +## 第一版建议类型 + +当前 proposal generator 主要生成 4 类建议: + +1. `evaluator_improvement` +- 例如为 real smoke 增加轻量语义 output parser + +2. `score_binding_improvement` +- 例如把 parser 结果接入 `context.*` score-spec + +3. `scenario_improvement` +- 例如收紧 expected facts / constraints / manual review prompts + +4. `feedback_contract_improvement` +- 例如收紧 feedback taxonomy / proposal queue / approval contract + +## 当前推荐样例 + +V2.5 alpha 最推荐的输入是: + +- `tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json` + +因为它最能代表当前系统边界: + +- runtime difference 已证明 +- 真实链路已跑通 +- 但语义评分仍有 `null` +- 很适合作为第一条 feedback case + +## 当前推荐阅读顺序 + +1. 先读本文件 +2. 再读 [tests/evals/v2/README.md](../../../../tests/evals/v2/README.md) +3. 再读 [tests/evals/v2/V2.5-feedback-loop-usage.md](../../../../tests/evals/v2/V2.5-feedback-loop-usage.md) +4. 
再看生成出来的 `07-反馈报告` + +## 如何运行 + +```powershell +bun run typecheck +bun run scripts/evals/v2_validate_manifests.ts +bun run scripts/evals/v2_validate_experiment_artifacts.ts +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +``` + +## 当前边界 + +当前 `V2.5 beta` 仍然有明确边界: + +- 不自动改代码 +- 不自动实现 proposal +- 不自动 promote candidate +- 不把 hypothesis 当事实 +- 不把 proposal 当最终判断 +- 不绕过人工批准 + +但它已经比 alpha 多出: + +- `proposal queue` +- `top recommendation` +- `blocking/manual/auto_resolvable` finding buckets +- `approval card` +- `feedback artifact validator` + +## 一句话总结 + +`V2.5 的本质,是让系统第一次具备“根据评测结果,系统化地产生下一步改动建议”的能力;而 beta 的意义,是让这些建议变得正式、可排序、可拍板。` +## Contract v0 Follow-up + +Current V2.5 beta has already moved one step forward: + +1. `candidate_long_context_output_parser_v0` is implemented. +2. Feedback now promotes `tighten_real_smoke_expectations_v0` as the next recommendation. +3. A dedicated follow-up path now exists: + +```text +tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json +tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json +``` + +This follow-up does not change runtime harness policy. It only tightens: + +- final answer contract +- expected fact anchoring +- manual-review prompt precision + +## Feedback Contract Follow-up + +Current V2.5 beta has now moved one layer further than `expectation_contract_v0`. + +The newest follow-up is not a runtime or scenario change. 
It is a feedback-system change: + +- detect when the source experiment already uses `expectation_contract_v0` +- stop re-recommending the same scenario-contract proposal as the next top action +- keep one unique `top_recommendation` in the approval card + +Latest validated feedback artifact: + +```text +tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json +``` + +Its current queue state is: + +- `top_recommendation = stabilize_feedback_input_contract_after_contract_v0` +- `deferred = stabilize_feedback_input_contract_v0` + +This means the system can now distinguish: + +- "the expectation contract still needs tightening" +- from "the feedback loop must recognize that this tightening has already happened" diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/README.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/README.md" index 33d2481cec..7c0b3aef4a 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/README.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/README.md" @@ -2,8 +2,28 @@ 当前目录用于承载可观测系统 V2 的分阶段实施文档。 +## 理解清单 + +- 这里保存的是“如何实施”的文档,不是运行报告。 +- 当前最新的阶段性规划已经推进到 `V2.5 beta`。 +- `V2.5 beta` 的重点不是直接实现 proposal,而是先把 feedback taxonomy / proposal queue / manual approval contract 做扎实。 + +## 预期效果 + +如果你要继续推进 V2,读这个目录应该能快速知道: + +1. 当前阶段做到哪里了。 +2. 下一阶段准备修哪一层。 +3. 哪份任务书是当前最该执行的版本。 + +## 设计思路 + +这里不追求完整历史陈列,而是优先突出当前仍然有执行价值的阶段文档。 + 建议阅读顺序: 1. 
[../01-总览/可观测系统V2北极星与评测模型草案.md](../01-%E6%80%BB%E8%A7%88/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E5%8C%97%E6%9E%81%E6%98%9F%E4%B8%8E%E8%AF%84%E6%B5%8B%E6%A8%A1%E5%9E%8B%E8%8D%89%E6%A1%88.md) 2. [可观测系统V2第一阶段实施任务书.md](./%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E7%AC%AC%E4%B8%80%E9%98%B6%E6%AE%B5%E5%AE%9E%E6%96%BD%E4%BB%BB%E5%8A%A1%E4%B9%A6.md) 3. [可观测系统V2第一阶段执行清单.md](./%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E7%AC%AC%E4%B8%80%E9%98%B6%E6%AE%B5%E6%89%A7%E8%A1%8C%E6%B8%85%E5%8D%95.md) +4. [可观测系统V2.5alpha任务书.md](./%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2.5alpha%E4%BB%BB%E5%8A%A1%E4%B9%A6.md) +5. [可观测系统V2.5Beta任务书.md](./%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2.5Beta%E4%BB%BB%E5%8A%A1%E4%B9%A6.md) diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" new file mode 100644 index 0000000000..cebe484b15 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" @@ -0,0 +1,45 @@ +# V2.3 Batch Experiment Summary: v2_3_robustness_smoke + +## Understanding + +- experiment: v2_3_robustness_smoke +- mode: execute_harness +- scenario_count: 2 +- candidate_count: 2 +- repeat_count: 2 +- output_json: tests\evals\v2\experiment-runs\v2_3_robustness_smoke_2026-05-03T070927523Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: 
| ---: | ---: | --- | +| execute_harness_smoke_minimal | baseline_default | 2 | 1 | 110 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| execute_harness_smoke_minimal | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| execute_harness_smoke_minimal | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | baseline_default | 2 | 1 | 110 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_eval_fixture_shadow | 2 | 1 | 105 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| robustness_smoke_minimal_alt | candidate_session_memory_sparse | 2 | 1 | 100 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_session_memory_sparse | execute_harness_smoke_minimal | 1 | 100 | stable | +| 2 | candidate_session_memory_sparse | robustness_smoke_minimal_alt | 1 | 100 | stable | +| 3 | candidate_eval_fixture_shadow | execute_harness_smoke_minimal | 1 | 105 | stable | +| 4 | candidate_eval_fixture_shadow | robustness_smoke_minimal_alt | 1 | 105 | stable | + +## Flaky Scenario Notes + +- No flaky run group detected by the current V2.3 heuristic. + +## Run Failures + +- No run failures recorded. + + + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" new file mode 100644 index 0000000000..4154323fd1 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" @@ -0,0 +1,98 @@ +# V2.4 Long-Context Experiment Summary: v2_4_long_context_fixture_smoke + +## Understanding + +- experiment: v2_4_long_context_fixture_smoke +- mode: execute_harness +- scenario_count: 4 +- candidate_count: 1 +- repeat_count: 2 +- output_json: tests\evals\v2\experiment-runs\v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_compaction_pressure | baseline_default | 2 | 1 | 1640 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_compaction_pressure | candidate_long_context_fixture_guarded | 2 | 1 | 1240 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_constraint_retention | baseline_default | 2 | 1 | 1280 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_constraint_retention | candidate_long_context_fixture_guarded | 2 | 1 | 1090 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_distractor_resistance | baseline_default | 2 | 1 | 1320 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_distractor_resistance | 
candidate_long_context_fixture_guarded | 2 | 1 | 1120 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_fact_retrieval | baseline_default | 2 | 1 | 1360 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | +| long_context_fact_retrieval | candidate_long_context_fixture_guarded | 2 | 1 | 1140 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | stable | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_long_context_fixture_guarded | long_context_constraint_retention | 1 | 1090 | stable | +| 2 | candidate_long_context_fixture_guarded | long_context_distractor_resistance | 1 | 1120 | stable | +| 3 | candidate_long_context_fixture_guarded | long_context_fact_retrieval | 1 | 1140 | stable | +| 4 | candidate_long_context_fixture_guarded | long_context_compaction_pressure | 1 | 1240 | stable | + +## Flaky Scenario Notes + +- No flaky run group detected by the current V2.3 heuristic. + +## Run Failures + +- No run failures recorded. + +## Long Context Summary + +- review_verdict: needs_manual_review +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. 
+ +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_compaction_pressure | candidate_long_context_fixture_guarded | compaction_pressure | large | 1 | 1 | 0 | 0 | 0 | 2 | 188 | 1230 | 1 | true | +| long_context_constraint_retention | candidate_long_context_fixture_guarded | constraint_retention | medium | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1080 | 1 | true | +| long_context_distractor_resistance | candidate_long_context_fixture_guarded | distractor_resistance | medium | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1110 | 1 | true | +| long_context_fact_retrieval | candidate_long_context_fixture_guarded | retrieval | medium | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1130 | 1 | true | + +### Semantic Interpretation + +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Observed constraint retention remained at 100.0%. +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Observed fact retrieval hit rate is 100.0%. +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: No distractor confusion was observed in the current evidence window. +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Compaction/tool-result governance was active with mean compaction trigger count 2.000 and mean saved tokens 188. +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Relative to baseline, candidate prompt-token delta mean is -400.000. +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Manual review remains open for 2 question(s). +- long_context_constraint_retention / candidate_long_context_fixture_guarded: Observed constraint retention remained at 100.0%. 
+- long_context_constraint_retention / candidate_long_context_fixture_guarded: Observed fact retrieval hit rate is 100.0%. +- long_context_constraint_retention / candidate_long_context_fixture_guarded: No distractor confusion was observed in the current evidence window. +- long_context_constraint_retention / candidate_long_context_fixture_guarded: Relative to baseline, candidate prompt-token delta mean is -190.000. +- long_context_constraint_retention / candidate_long_context_fixture_guarded: Manual review remains open for 2 question(s). +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Observed constraint retention remained at 100.0%. +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Observed fact retrieval hit rate is 100.0%. +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: No distractor confusion was observed in the current evidence window. +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Relative to baseline, candidate prompt-token delta mean is -200.000. +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Manual review remains open for 2 question(s). +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Observed constraint retention remained at 100.0%. +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Observed fact retrieval hit rate is 100.0%. +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: No distractor confusion was observed in the current evidence window. +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Relative to baseline, candidate prompt-token delta mean is -220.000. +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Manual review remains open for 2 question(s). 
+ +### Manual Review Notes + +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Did the answer keep the exact three required headings? +- long_context_compaction_pressure / candidate_long_context_fixture_guarded: Did the answer stay on current compaction signals instead of archived names? +- long_context_constraint_retention / candidate_long_context_fixture_guarded: Did the answer remain valid JSON instead of drifting into prose? +- long_context_constraint_retention / candidate_long_context_fixture_guarded: Did the answer preserve owner=v2-platform while staying read-only? +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper? +- long_context_distractor_resistance / candidate_long_context_fixture_guarded: Did the answer avoid treating the old execute_harness smoke as the long-context manifest? +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? +- long_context_fact_retrieval / candidate_long_context_fixture_guarded: Did the answer preserve the four-bullet constraint without extra prose? + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. + + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" new file mode 100644 index 0000000000..97c58afb03 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" @@ -0,0 +1,65 @@ +# V2.4 Long-Context Experiment Summary: v2_4_long_context_real_smoke + +## Understanding + +- experiment: v2_4_long_context_real_smoke +- mode: execute_harness +- scenario_count: 1 +- candidate_count: 1 +- repeat_count: 1 +- output_json: tests\evals\v2\experiment-runs\v2_4_long_context_real_smoke_2026-05-03T060617173Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke | baseline_default | 1 | 1 | 27189 | 0 | 7982 | 0 | 0 | 0 | 0 | 0 | inconclusive | +| long_context_fact_retrieval_real_smoke | candidate_session_memory_sparse | 1 | 1 | 27189 | 0 | 7506 | 0 | 0 | 0 | 0 | 0 | inconclusive | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_session_memory_sparse | long_context_fact_retrieval_real_smoke | 1 | 27189 | inconclusive | + +## Flaky Scenario Notes + +- long_context_fact_retrieval_real_smoke / baseline_default: inconclusive +- 
long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: inconclusive + +## Run Failures + +- No run failures recorded. + +## Long Context Summary + +- review_verdict: needs_manual_review +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. + +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke | candidate_session_memory_sparse | retrieval | medium | n/a | n/a | 0 | 0 | 0 | 4 | 0 | 26887 | n/a | true | + +### Semantic Interpretation + +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Automatic fact-retrieval quality could not be fully established from trace-backed evidence alone. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: No distractor confusion was observed in the current evidence window. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Relative to baseline, candidate prompt-token delta mean is 0.000. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Manual review remains open for 2 question(s). + +### Manual Review Notes + +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? 
+- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Did the answer preserve the four-bullet constraint without extra prose? + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. + + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" new file mode 100644 index 0000000000..8870e6f51f --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" @@ -0,0 +1,66 @@ +# V2.4 Long-Context Experiment Summary: v2_4_long_context_real_smoke + +## Understanding + +- experiment: v2_4_long_context_real_smoke +- mode: execute_harness +- scenario_count: 1 +- candidate_count: 1 +- repeat_count: 1 +- output_json: tests\evals\v2\experiment-runs\v2_4_long_context_real_smoke_2026-05-03T145644822Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke | 
baseline_default | 1 | 1 | 27189 | 0 | 7109 | 0 | 0 | 0 | 0 | 0 | inconclusive | +| long_context_fact_retrieval_real_smoke | candidate_session_memory_sparse | 1 | 1 | 27189 | 0 | 12172 | 0 | 0 | 0 | 0 | 0 | inconclusive | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_session_memory_sparse | long_context_fact_retrieval_real_smoke | 1 | 27189 | inconclusive | + +## Flaky Scenario Notes + +- long_context_fact_retrieval_real_smoke / baseline_default: inconclusive +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: inconclusive + +## Run Failures + +- No run failures recorded. + +## Long Context Summary + +- review_verdict: needs_manual_review +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. + +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke | candidate_session_memory_sparse | retrieval | medium | 1 | 1 | 0 | 0 | 0 | 4 | 0 | 26887 | n/a | true | + +### Semantic Interpretation + +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Observed constraint retention remained at 100.0%. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Observed fact retrieval hit rate is 100.0%. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: No distractor confusion was observed in the current evidence window. 
+- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Relative to baseline, candidate prompt-token delta mean is 0.000. +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Manual review remains open for 2 question(s). + +### Manual Review Notes + +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? +- long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse: Did the answer preserve the four-bullet constraint without extra prose? + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. + + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" new file mode 100644 index 0000000000..c4159f6e86 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" @@ -0,0 +1,66 @@ +# V2.4 Long-Context Experiment Summary: v2_5_long_context_real_smoke_expectation_contract_v0 + +## Understanding + +- experiment: v2_5_long_context_real_smoke_expectation_contract_v0 +- mode: execute_harness +- scenario_count: 1 +- candidate_count: 1 +- repeat_count: 1 +- output_json: tests\evals\v2\experiment-runs\v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json + +## Batch Stability Table + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | duration_mean_ms | duration_stddev_ms | tool_variance | subagent_variance | turn_variance | recovery_rate | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | baseline_default | 1 | 1 | 27436 | 0 | 15546 | 0 | 0 | 0 | 0 | 0 | inconclusive | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | 1 | 1 | 27372 | 0 | 12781 | 0 | 0 | 0 | 0 | 0 | inconclusive | + +## Candidate Ranking + +| rank | candidate_variant | scenario | success_rate | token_mean | flaky_status | +| ---: | --- | --- | ---: | ---: | --- | +| 1 | candidate_session_memory_sparse | 
long_context_fact_retrieval_real_smoke_contract_v0 | 1 | 27372 | inconclusive | + +## Flaky Scenario Notes + +- long_context_fact_retrieval_real_smoke_contract_v0 / baseline_default: inconclusive +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: inconclusive + +## Run Failures + +- No run failures recorded. + +## Long Context Summary + +- review_verdict: needs_manual_review +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. + +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | retrieval | medium | 1 | 1 | 0 | 0 | 0 | 4 | 0 | 27007 | n/a | true | + +### Semantic Interpretation + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Observed constraint retention remained at 100.0%. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Observed fact retrieval hit rate is 100.0%. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: No distractor confusion was observed in the current evidence window. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Relative to baseline, candidate prompt-token delta mean is 0.000. 
+- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Manual review remains open for 2 question(s). + +### Manual Review Notes + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint? +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet? + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. + + +## Interpretation Limits + +- V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. +- Flaky status is a first-pass engineering signal based on failures and coarse variance, not a statistical proof. 
diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md" new file mode 100644 index 0000000000..92b4ab48ee --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md" @@ -0,0 +1,68 @@ +# V2 Run Comparison + +## Understanding + +- baseline_run: run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e +- candidate_run: run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d +- scenario: long_context_fact_retrieval_real_smoke_contract_v0 +- baseline_variant: baseline_default +- candidate_variant: candidate_session_memory_sparse + +## Expected Outcome + +This report compares two V2 runs using score artifacts generated from V1 observability evidence. + +## Design Rationale + +Higher is better for capability and stability scores. Lower is better for explicit efficiency cost or latency scores. 
+ +## Summary + +- regression_count: 0 +- baseline_user_action_id: 0b6a625e-d7ce-4afc-b42d-fdaf6df5654e +- candidate_user_action_id: a3fb1e0d-6260-4f43-a830-70b723a236ae +- runtime_difference_observed: true + +## Variant Effect Evidence + +- baseline_policy_event_observed: true +- candidate_policy_event_observed: true +- candidate_variant_effect_observed: true +- baseline_policy_mode: default +- candidate_policy_mode: sparse +- baseline_session_memory_subagent_count: 1 +- candidate_session_memory_subagent_count: 1 + +## Runtime Difference Summary + +- Baseline session_memory policy was observed with mode=default. +- Candidate session_memory policy was observed with mode=sparse. +- Candidate sparse runtime markers were observed. +- A runtime difference was observed between baseline and candidate. +- Trigger details: baseline=[token_threshold_and_natural_break], candidate=[token_threshold_and_natural_break]. + +## Score Deltas + +| score | baseline | candidate | delta | verdict | +| --- | ---: | ---: | ---: | --- | +| context.compaction_saved_tokens | 0 | 0 | 0 | unchanged | +| context.compaction_trigger_count | 4 | 4 | 0 | unchanged | +| context.constraint_retention_rate | 1 | 1 | 0 | unchanged | +| context.distractor_confusion_count | 0 | 0 | 0 | unchanged | +| context.lost_constraint_count | 0 | 0 | 0 | unchanged | +| context.manual_review_required | 1 | 1 | 0 | unchanged | +| context.retained_constraint_count | 2 | 2 | 0 | unchanged | +| context.retrieved_fact_hit_rate | 1 | 1 | 0 | unchanged | +| context.success_under_context_pressure | 1 | 1 | 0 | unchanged | +| context.total_prompt_input_tokens | 27007 | 27007 | 0 | unchanged | +| controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| efficiency.total_billed_tokens | 27436 | 27372 | -64 | improved | +| stability.recovery_absence | 1 | 1 | 0 | unchanged | +| task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## 
Interpretation Limits + +- Candidate runtime effect was observed, but this comparison is still single-run and should not be treated as a full stability judgment. +- This compare report only uses trace-backed V1/V2 evidence and does not judge final answer quality by itself. +- Scenario note: n/a diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" new file mode 100644 index 0000000000..c02952b819 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" @@ -0,0 +1,154 @@ +# V2 Experiment Summary: v2_5_long_context_real_smoke_expectation_contract_v0 + +## Understanding + +- experiment: v2_5_long_context_real_smoke_expectation_contract_v0 +- mode: execute_harness +- baseline_variant: baseline_default +- candidate_variants: candidate_session_memory_sparse +- scenario_count: 1 +- score_specs: task_success.main_chain_observed, efficiency.total_billed_tokens, decision_quality.session_memory_policy_observed, stability.recovery_absence, controllability.turn_limit_basic, context.retained_constraint_count, context.lost_constraint_count, context.constraint_retention_rate, context.retrieved_fact_hit_rate, context.distractor_confusion_count, context.total_prompt_input_tokens, context.compaction_trigger_count, context.compaction_saved_tokens, context.success_under_context_pressure, context.manual_review_required +- gate_policy: default_v2_1_gate +- output_json: 
tests\evals\v2\experiment-runs\v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json + +## Expected Outcome + +This summary records a manifest-driven V2 experiment run. In bind_existing mode, V2 binds existing V1 traces. In execute_harness mode, V2 executes the scenario first, then captures the generated user_action_id through benchmark_run_id. + +## Design Rationale + +The runner always scores only trace-backed V1 facts. V2.2-beta adds runtime-effect evidence and experiment-validity semantics so smoke and real experiments are not confused with each other. + +## Long Context Review + +- requested_mode: execute_harness +- review_verdict: needs_manual_review +- note: This profile focuses on whether long-context pressure preserves constraints, facts, and governance signals. + +## Risk Verdict + +- hard_failures: 0 +- soft_warnings: 0 +- missing_or_inconclusive: 1 +- risk_status: inconclusive +- scope: regression_risk_only +- final_experiment_judgment: false +- recommended_review_mode: manual_review + +This section is a regression-risk gate, not a final judgment about whether the harness change is valuable. + +## Variant Effect Evidence + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: baseline_mode=default, candidate_mode=sparse, candidate_effect_observed=true, runtime_difference_observed=true + +## Experiment Validity + +- status: valid +- profile: real_experiment +- baseline_captured: true +- candidate_captured: true +- no_ambiguous_capture: true +- score_evidence_present: true +- variant_effect_observed: true +- runtime_difference_observed: true +- scenario_intent_matched: true +- reason: Real experiment remains interpretable. + +- No additional blockers or warnings. + +## Runtime Difference Summary + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Baseline session_memory policy was observed with mode=default. 
+- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Candidate session_memory policy was observed with mode=sparse. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Candidate sparse-policy markers were observed in runtime evidence. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Observed baseline and candidate session_memory policies differ. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: At least one score dimension changed between baseline and candidate. + +## Long Context Summary + +- review_verdict: needs_manual_review +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. + +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | retrieval | medium | 1 | 1 | 0 | 0 | 0 | 4 | 0 | 27007 | n/a | true | + +### Semantic Interpretation + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Observed constraint retention remained at 100.0%. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Observed fact retrieval hit rate is 100.0%. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: No distractor confusion was observed in the current evidence window. 
+- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Relative to baseline, candidate prompt-token delta mean is 0.000. +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Manual review remains open for 2 question(s). + +### Manual Review Notes + +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint? +- long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse: Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet? + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. + + +## V2.3 Batch Robustness + +- batch_report: ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md +- run_group_count: 2 +- run_failure_count: 0 + +| scenario | variant | repeats | success_rate | token_mean | token_stddev | flaky_status | +| --- | --- | ---: | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | baseline_default | 1 | 1 | 27436 | 0 | inconclusive | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | 1 | 1 | 27372 | 0 | inconclusive | + +### Run Failures + +- No run failures recorded. 
+ +## Scorecard Summary + +| scenario | candidate_variant | score | baseline | candidate | delta | interpretation | +| --- | --- | --- | ---: | ---: | ---: | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.compaction_saved_tokens | 0 | 0 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.compaction_trigger_count | 4 | 4 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.constraint_retention_rate | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.distractor_confusion_count | 0 | 0 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.lost_constraint_count | 0 | 0 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.manual_review_required | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.retained_constraint_count | 2 | 2 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.retrieved_fact_hit_rate | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.success_under_context_pressure | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | context.total_prompt_input_tokens | 27007 | 27007 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | controllability.turn_limit_basic | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | decision_quality.session_memory_policy_observed | 1 | 1 | 0 | unchanged | +| 
long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | efficiency.total_billed_tokens | 27436 | 27372 | -64 | improved | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | stability.recovery_absence | 1 | 1 | 0 | unchanged | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | task_success.main_chain_observed | 1 | 1 | 0 | unchanged | + +## Exploration Signals + +- 1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer. +- A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas. + +## Runs + +| scenario | repeat | baseline_run | candidate_variant | candidate_run | experiment_validity | risk_gate | compare_report | +| --- | ---: | --- | --- | --- | --- | --- | --- | +| long_context_fact_retrieval_real_smoke_contract_v0 | 1 | run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e | candidate_session_memory_sparse | run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d | valid | 1/4 not passed | ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md | + +## Risk Gate Details + +| scenario | candidate_variant | rule_type | score_spec | verdict | regression_pct | +| --- | --- | --- | --- | --- | ---: | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | hard_fail | task_success.main_chain_observed | pass | 0 | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | hard_fail | efficiency.total_billed_tokens | pass | 0 | +| 
long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | soft_warning | efficiency.total_billed_tokens | pass | 0 | +| long_context_fact_retrieval_real_smoke_contract_v0 | candidate_session_memory_sparse | soft_warning | decision_quality.subagent_count_observed | missing | n/a | + +## Interpretation Limits + +- Long-context automatic scoring is strongest in fixture_trace mode; real smoke still preserves a manual-review lane. +- Cost and compaction evidence alone do not prove that the final answer remained semantically correct. diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.3-robustness-\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070927523Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.3-robustness-\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070927523Z.md" new file mode 100644 index 0000000000..1f5be1209d --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.3-robustness-\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070927523Z.md" @@ -0,0 +1,217 @@ +## V2.3 报告详细解读 + +对应原始结果: +- `tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json` +- `ObservrityTask/10-系统版本/v2/06-运行报告/batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md` + +### 这份报告在回答什么 + +这不是一份“模型能力总评”报告,而是一份“批量运行与稳定性框架是否正常工作”的报告。 + +它主要回答 4 个问题: +- 多个 scenario 能不能一起跑 +- 多个 candidate 能不能一起比较 +- repeat 之后结果是否稳定 +- `run_group` / `stability_summary` / `flaky_status` 这些 V2.3 基础设施是否正常 + +### 先看总状态 + +这次实验的总体状态是健康的: +- 
`requested_mode = execute_harness` +- `mode = execute_harness` +- `experiment_validity.status = valid` +- `risk_verdict.status = pass` +- `run_refs = 12` +- `run_group_refs = 6` +- `flaky_scenarios = []` +- `run_failures = []` + +这表示: +- 跑的是自动执行链路,而不是手工绑定 +- 本次 smoke 有效 +- 一共生成了 12 个 run +- 这些 run 被组织成 6 个 `run_group` +- 没有 flaky group +- 没有失败 group + +### 这 12 个 run 是怎么来的 + +本次 V2.3 smoke 的实验结构是: +- 2 个 scenario +- 1 个 baseline +- 2 个 candidate +- repeat 2 次 + +所以总 run 数是: +- `2 × (1 + 2) × 2 = 12` + +而 `run_group` 的粒度是“同一个 scenario + variant 的重复组”,所以总共是: +- `2 × 3 = 6` + +### 如何阅读 Batch Stability Table + +这张表是 V2.3 报告的核心。 + +你应该这样读: + +1. `success_rate` +- 是否每次 repeat 都跑成了 +- 当前全部是 `1` +- 意思是每组都 100% 成功 + +2. `token_mean` 与 `token_stddev` +- `token_mean` 表示该组重复运行后的平均总 token +- `token_stddev` 表示波动 +- 当前 `token_stddev` 全部是 `0` +- 说明两次 repeat 的 token 完全一致 + +3. `duration_mean_ms` 与 `duration_stddev_ms` +- 这是端到端耗时 +- 当前 stddev 也是 `0` +- 说明时长没有抖动 + +4. `tool_variance / subagent_variance / turn_variance` +- 这三个值用来监控结构性抖动 +- 如果一次 run 用了工具、另一次没用,或者 turn 数变化很大,这里会抬高 +- 当前全部是 `0` +- 说明结构非常稳定 + +5. `recovery_rate` +- 是否经常进入恢复/补救链路 +- 当前全部是 `0` +- 说明 smoke 下没有异常恢复 + +6. 
`flaky_status` +- 这是 V2.3 的粗粒度稳定性标签 +- 当前全部是 `stable` + +### 当前这份 V2.3 报告的直接结论 + +#### 结论 1:V2.3 的 batch 机制是活的 + +这次实验已经证明: +- multi-scenario 正常 +- multi-candidate 正常 +- repeat 正常 +- `run_group` 正常 +- `stability_summary` 正常 +- `flaky_status` 正常 + +也就是说,V2.3 的“批量运行 + 稳定性抽象层”已经不是纸面设计,而是能实际出结果的。 + +#### 结论 2:当前 smoke 很稳定 + +这次最重要的工程结论其实不是“哪个 candidate 更强”,而是: +- 所有 group 都稳定 +- 没有结构性抖动 +- 没有失败 +- 没有 flaky + +这说明: +- 你的 V2.3 runner 不只是能跑 +- 而且在 smoke 规模下已经能稳定跑 + +#### 结论 3:成本差异已经能被正确观测 + +在这次 smoke 里: +- `baseline_default` 平均 token = `110` +- `candidate_eval_fixture_shadow` 平均 token = `105` +- `candidate_session_memory_sparse` 平均 token = `100` + +所以当前报告里能看到: +- `candidate_eval_fixture_shadow` 相比 baseline 节省 `5` +- `candidate_session_memory_sparse` 相比 baseline 节省 `10` + +这证明: +- V2.3 不只是会跑 +- 它已经能正确记录 baseline / candidate 的成本差异 + +### 为什么 Candidate Ranking 不能被过度解读 + +报告里有一个 `Candidate Ranking`,看上去像是在给 candidate 排名。 + +但你要非常克制地理解它。 + +当前这个 ranking 的含义是: +- 在这次 smoke 中 +- 在当前这些结构化指标下 +- 哪个 candidate 的成本更低、且稳定性没坏 + +它不等价于: +- 哪个 candidate 更聪明 +- 哪个 harness 更有长期价值 +- 哪个 candidate 在真实复杂任务里一定更好 + +原因很简单: +- 这是 smoke 任务 +- 任务非常短 +- 没有复杂语义负担 +- 也没有真实长上下文压力 + +因此,`Candidate Ranking` 只能被当成: +- 一种轻量工程排序信号 + +不能被当成: +- 模型质量裁决 + +### Risk Verdict 应该怎么理解 + +这里的: +- `risk_verdict.status = pass` + +它的意思不是: +- “candidate 是正确的” +- “candidate 更强” + +它真正的意思是: +- 在这次 smoke 中,没有观察到明显回归风险 + +所以 `pass` 只能解释为: +- 回归风险门通过 + +不能解释为: +- 最终实验结论为真 + +### 这份 V2.3 报告真正证明了什么 + +它真正证明了 3 件事: + +1. V2.3 的批量执行框架可用 +- 多 scenario、多 candidate、repeat 都跑通 + +2. V2.3 的稳定性抽象可用 +- `run_group` +- `stability_summary` +- `flaky_status` +- `run_failures` + +3. V2.3 的基础对比能力可用 +- baseline 与 candidate 的成本差异已经能被系统记录并汇总 + +### 这份 V2.3 报告没有证明什么 + +它没有证明: +- 某个 candidate 一定更聪明 +- 某个 candidate 一定更适合真实任务 +- session memory sparse 策略已经在复杂任务中被正式验证 + +换句话说: +- V2.3 当前报告证明的是“平台基础设施” +- 不是“能力层最终裁决” + +### 推荐阅读顺序 + +以后你回来看 V2.3 报告,建议固定这样读: + +1. `experiment_validity` +2. `risk_verdict` +3. `Batch Stability Table` +4. 
`Flaky Scenario Notes` +5. `Run Failures` +6. 最后再看 `Candidate Ranking` + +### 一句话总结 + +这份 V2.3 报告说明: + +`V2.3 已经成功把单次实验推进成“可批量、可重复、可看稳定性”的工程评测层;当前 smoke 结果稳定、无失败、无 flaky,说明这层基础设施已经可用。` diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-fixture-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070957231Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-fixture-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070957231Z.md" new file mode 100644 index 0000000000..41f7f65db5 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-fixture-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T070957231Z.md" @@ -0,0 +1,247 @@ +## V2.4 Fixture 长上下文报告详细解读 + +对应原始结果: +- `tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json` +- `ObservrityTask/10-系统版本/v2/06-运行报告/batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md` + +### 这份报告在回答什么 + +这不是“真实模型最终能力报告”,而是一份“长上下文专项评测层在可控环境下是否闭合”的报告。 + +它主要回答: +- 约束在长上下文中会不会丢失 +- 关键事实在长上下文中能不能找回 +- 干扰项会不会把 agent 带偏 +- compaction / context governance 是否可被观测 +- 在质量不坏的前提下,candidate 是否节省 token + +### 先看总状态 + +当前这次 fixture smoke 的总状态是健康的: +- `requested_mode = execute_harness` +- `mode = execute_harness` +- `experiment_validity.status = valid` +- `long_context_review_verdict = needs_manual_review` +- `run_refs = 16` +- `run_group_refs = 8` + +这表示: +- 
长上下文专项评测已经进入正式 experiment runner +- 本次实验有效 +- 一共形成了 16 个 run +- 它们被组织成 8 个 `run_group` + +### 为什么是 16 个 run、8 个 run_group + +本次 fixture smoke 的结构是: +- 4 个 long-context family +- baseline + 1 个 candidate +- repeat 2 次 + +所以总 run 数是: +- `4 × 2 × 2 = 16` + +而 `run_group` 的粒度仍然是: +- 同一个 `scenario + variant` + +所以是: +- `4 × 2 = 8` + +### 这 4 个 long-context family 分别在测什么 + +#### 1. `long_context_constraint_retention` + +它测试: +- 上下文很长时,硬约束会不会被丢掉 + +典型问题包括: +- 输出是不是还保持指定结构 +- 有没有从只读任务偷偷滑向写任务 +- 有没有把规定字段漏掉 + +#### 2. `long_context_fact_retrieval` + +它测试: +- 关键事实埋在长上下文里后,agent 是否还能找回 + +典型问题包括: +- 真实 entrypoint 是否能正确找回 +- 关键路径、关键配置是否还能命中 + +#### 3. `long_context_distractor_resistance` + +它测试: +- 旧信息、假信息、过时名词会不会把 agent 带偏 + +典型问题包括: +- 是否把旧 smoke manifest 当成当前 long-context manifest +- 是否把旧 entrypoint 当成当前 entrypoint + +#### 4. `long_context_compaction_pressure` + +它测试: +- 在上下文治理被触发时,agent 是否仍然稳 +- 同时还要看 compaction 有没有真的带来 token 节省 + +### Long Context Summary 应该怎么读 + +这张表是 V2.4 fixture 报告的核心。 + +你可以按下面顺序读: + +1. `retention_rate` +- 约束保留率 +- 当前 4 个 family 全部是 `1` +- 说明没有出现约束丢失 + +2. `fact_hit_rate` +- 关键事实命中率 +- 当前 4 个 family 全部是 `1` +- 说明关键事实全部找回 + +3. `lost_constraints / missed_facts` +- 当前全是 `0` +- 说明既没有丢约束,也没有漏事实 + +4. `distractor_confusion` +- 当前全是 `0` +- 说明没有被干扰项带偏 + +5. `compaction_triggers / compaction_saved_tokens` +- 主要看 `compaction_pressure` 这一行 +- 当前: + - `compaction_triggers = 2` + - `compaction_saved_tokens = 188` +- 这说明 candidate 的省 token 不是纯黑箱,而是伴随真实治理事件 + +6. `total_prompt_tokens` +- 这是 candidate 的 prompt token 水平 +- 需要结合 `prompt_token_delta_mean` 一起看 + +7. 
`manual_review_required` +- 当前全部是 `true` +- 这是设计使然,不是失败 + +### 当前这份 fixture 报告的直接结论 + +#### 结论 1:V2.4 的长上下文评测层已经闭合 + +你现在已经有完整的正式链路: +- scenario +- execute_harness +- run +- score +- run_group +- experiment summary +- long_context_summary +- batch report + +也就是说,V2.4 已经不是“想法”,而是正式运行的评测层。 + +#### 结论 2:在 fixture 模式下,4 类 long-context 问题都被稳定测到了 + +当前所有 family 都表现为: +- `constraint_retention_rate = 1` +- `retrieved_fact_hit_rate = 1` +- `distractor_confusion = 0` + +这说明: +- 当前构造的 fixture 任务,candidate 能保持质量 +- 系统也能稳定识别这种质量保持 + +#### 结论 3:candidate 在 fixture 模式下节省了 token + +你可以直接看 candidate 相比 baseline 的 token 下降: + +- `constraint_retention`: `1280 -> 1090`,下降 `190` +- `fact_retrieval`: `1360 -> 1140`,下降 `220` +- `distractor_resistance`: `1320 -> 1120`,下降 `200` +- `compaction_pressure`: `1640 -> 1240`,下降 `400` + +这说明: +- candidate 不只是“答对” +- 还在“答对”的前提下降低了 prompt token + +其中最重要的是: +- `compaction_pressure` 这组下降最多 +- 并且伴随 `compaction_saved_tokens = 188` + +也就是说: +- 省 token 是可解释的 +- 不是偶然噪声 + +### 为什么 `long_context_review_verdict` 还是 `needs_manual_review` + +很多人看到这里会误解,以为: +- 既然都 100% 了,为什么还不是自动通过 + +正确理解是: +- V2.4 不打算把长上下文语义问题粗暴压成一个“全自动真理分数” +- 它会保留人类复核入口 + +这份报告里保留的人工复核问题包括: +- 是否真的保持 JSON,而不是偷偷写成 prose +- 是否真的命中了 `src/entrypoints/cli.tsx` +- 是否真的避开了旧 manifest / 旧入口 + +所以 `needs_manual_review` 的意思不是: +- 自动化失败 + +而是: +- 自动结构证据已经足够强 +- 但最终语义仍建议人类过一眼 + +### Risk Verdict 为什么是 `inconclusive` + +这里的: +- `risk_verdict.status = inconclusive` + +不是说实验失败。 + +它的真正含义是: +- 当前回归风险门里存在 `missing_score` +- 而这些缺失与 long-context 语义自动判定边界有关 + +所以你应该这样理解: +- 回归门没有给出负面结论 +- 但系统也拒绝装作自己已经能自动裁决全部长上下文质量 + +这是一个健康的边界表达。 + +### 当前这份 fixture 报告真正证明了什么 + +它真正证明了: + +1. V2.4 的 long-context 专项层已正式运行 +2. 4 个 long-context family 已经接入统一 experiment runner +3. fixture 模式下,自动证据足够强 +4. candidate 在质量不坏的前提下,能观察到 token 节省 +5. 
compaction/context governance 已经进入正式观测口径 + +### 这份 fixture 报告没有证明什么 + +它没有证明: +- 真实模型在真实复杂长上下文任务下已经完全自动可裁决 +- candidate 在真实线上任务中一定更优 +- manual review 已经可以取消 + +换句话说: +- 这份报告证明的是“长上下文评测层闭合” +- 不是“真实世界最终答案已经全自动化” + +### 推荐阅读顺序 + +以后你回看 V2.4 fixture 报告,建议按这个顺序: + +1. `experiment_validity` +2. `Batch Stability Table` +3. `Long Context Summary` +4. `Semantic Interpretation` +5. `Manual Review Notes` +6. 最后再看 `Candidate Ranking` + +### 一句话总结 + +这份 V2.4 fixture 报告说明: + +`V2.4 已经成功建立了长上下文专项评测层;在可控 fixture 环境下,系统能够稳定观测约束保持、事实找回、抗干扰和 compaction 治理,并且能把质量与成本一起呈现出来。` diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-real-smoke-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T060617173Z.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-real-smoke-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T060617173Z.md" new file mode 100644 index 0000000000..d13826765a --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/\346\212\245\345\221\212\350\247\243\350\257\273/V2.4-real-smoke-\351\225\277\344\270\212\344\270\213\346\226\207\346\212\245\345\221\212\350\257\246\347\273\206\350\247\243\350\257\273-2026-05-03T060617173Z.md" @@ -0,0 +1,290 @@ +## V2.4 Real Smoke 长上下文报告详细解读 + +对应原始结果: +- `tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json` +- `ObservrityTask/10-系统版本/v2/06-运行报告/batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md` + +### 这份报告在回答什么 + +这份 real smoke 报告的核心问题不是: +- candidate 最终是不是更强 + +它主要回答的是: +- 真实 
`execute_harness` 链路下,V2.4 还能不能跑通 +- baseline 和 candidate 的 runtime policy 差异,是否真的进入了正式证据 +- 长上下文治理事件在真实链路下是否可被观测 +- 自动裁决在哪些地方已经够强,哪些地方仍必须留给人工复核 + +### 先看总状态 + +当前总体状态是健康的: +- `requested_mode = execute_harness` +- `mode = execute_harness` +- `report_profile = real_experiment` +- `experiment_validity.status = valid` +- `long_context_review_verdict = needs_manual_review` +- `run_refs = 2` +- `run_group_refs = 2` +- `run_failures = []` + +这表示: +- 本次不是 fixture,而是真实自动执行链路 +- baseline 和 candidate 都跑成了 +- V1 capture 成功 +- V2 artifact 也成功生成 +- 没有失败 run + +### 这份报告里最重要的不是 score,而是 runtime difference + +这份 real smoke 的核心价值,在于它首次明确证明: + +- `baseline_policy_mode = default` +- `candidate_policy_mode = sparse` +- `runtime_difference_observed = true` + +这三件事合在一起才是最重要的。 + +为什么? + +因为它说明: +- 这次实验里 candidate 不是“名字上叫 sparse” +- 而是“真实 runtime 里真的执行成 sparse policy 了” + +### baseline 与 candidate 的 runtime policy 到底差在哪 + +#### baseline 观测到的 policy + +baseline 的 `session_memory` policy 是: +- `mode = default` +- `natural_break_only = false` +- `token_threshold_multiplier = 1` +- `tool_threshold_multiplier = 1` +- `minimum_message_tokens_to_init = 10000` +- `minimum_tokens_between_update = 5000` +- `tool_calls_between_updates = 6` + +这表示 baseline 是较标准的默认策略: +- 更容易更新 +- 门槛较低 +- 不要求必须 natural break + +#### candidate 观测到的 policy + +candidate 的 `session_memory` policy 是: +- `mode = sparse` +- `natural_break_only = true` +- `token_threshold_multiplier = 2` +- `tool_threshold_multiplier = 2` +- `minimum_message_tokens_to_init = 20000` +- `minimum_tokens_between_update = 10000` +- `tool_calls_between_updates = 12` + +这表示 candidate 的策略更保守: +- 只在更合适的时机更新 +- 阈值更高 +- 更偏向 sparse 更新 + +### 这件事为什么重要 + +因为这说明: +- 你的 variant 改动已经不只是 manifest 里的描述 +- 它已经成为真实 runtime 证据的一部分 + +也就是说,V2.4 real 当前已经能回答一个非常关键的问题: + +`这个 candidate 的 harness 改动,到底有没有真的生效?` + +当前答案是: +- 有 + +### Long Context Summary 应该怎么读 + +这次 real smoke 只有 1 个 scenario: +- `long_context_fact_retrieval_real_smoke` + +对应的 long-context summary 
里,你最该看这些字段: + +1. `constraint_retention_rate_mean` +- 当前是 `null` + +2. `retrieved_fact_hit_rate_mean` +- 当前是 `null` + +3. `distractor_confusion_mean` +- 当前是 `0` + +4. `compaction_trigger_mean` +- 当前是 `4` + +5. `tool_result_budget_trigger_mean` +- 当前是 `2` + +6. `total_prompt_input_tokens_mean` +- 当前是 `26887` + +7. `prompt_token_delta_mean` +- 当前是 `0` + +### 这些值应该怎么解释 + +#### `constraint_retention_rate_mean = null` +#### `retrieved_fact_hit_rate_mean = null` + +这两个 `null` 不是简单 bug,也不应直接理解为失败。 + +它真正表达的是: +- 当前真实链路下,系统已经拿到了 trace-backed evidence +- 但这些证据还不足以让系统完全自动判断“语义上到底有没有正确找回事实、有没有完整保住约束” + +也就是说: +- 系统已经很诚实地告诉你“我现在还不能自动下最终结论” + +这恰恰是好事,因为它避免了伪精确。 + +#### `distractor_confusion_mean = 0` + +这个值非常有意义。 + +它说明: +- 在当前这次 real smoke 里 +- 没有观察到明显的“被旧信息/错误入口/错误线索带偏”的现象 + +它不等于“100% 语义正确”,但它至少说明: +- 没有出现显著误导 + +#### `compaction_trigger_mean = 4` +#### `tool_result_budget_trigger_mean = 2` + +这是 V2.4 real 非常关键的工程信号。 + +它说明: +- 在真实执行链路下 +- 上下文治理机制确实被触发了 +- 不是只在 fixture 里看得到 + +也就是说: +- compaction +- tool result budget + +这类长上下文治理行为,已经被正式纳入真实评测证据。 + +#### `total_prompt_input_tokens_mean = 26887` +#### `prompt_token_delta_mean = 0` + +这说明: +- 这次 real smoke 中,candidate 并没有在 prompt token 上拉开差距 +- 至少在这一次小型真实实验里,baseline 和 candidate 的 prompt 成本一样 + +这很重要,因为它提醒你: +- candidate 的 runtime policy 差异已经被证明 +- 但这个差异暂时还没有在这次 real smoke 里转化成明显成本收益 + +### 为什么 `long_context_review_verdict` 仍然是 `needs_manual_review` + +因为当前真实链路下,系统还不能自动回答下面两个语义问题: + +- 回答里是否真的写对了 `src/entrypoints/cli.tsx` +- 回答是否真的保持了四条约束,没有额外废话 + +所以这份报告明确保留了 `Manual Review Notes`: +- `Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?` +- `Did the answer preserve the four-bullet constraint without extra prose?` + +这说明: +- 当前真实链路的自动证据已经足够支撑平台判断 +- 但还不足以完全取代人类语义审查 + +### Scorecard Summary 应该怎么理解 + +当前 `scorecard_summary` 里你会看到: +- 一些项是 `unchanged` +- 一些项是 `missing` + +这背后的逻辑是: + +#### 可以自动比的项 +- `context.compaction_trigger_count` +- `context.compaction_saved_tokens` +- 
`context.distractor_confusion_count` +- `context.total_prompt_input_tokens` +- `efficiency.total_billed_tokens` +- `task_success.main_chain_observed` + +这些项有明确 trace 或 token 证据,所以能自动比较。 + +#### 仍然 `missing` 的项 +- `context.constraint_retention_rate` +- `context.retrieved_fact_hit_rate` + +这些项在当前真实链路下暂时不能全自动判定,所以被保守标为 `missing`。 + +这不是系统没做事,而是系统在拒绝假装自己已经看懂了全部语义。 + +### Gate Results 应该怎么读 + +这份 real smoke 的 gate 结果很有代表性: + +#### 已通过的 gate +- `task_success.main_chain_observed` +- `efficiency.total_billed_tokens` + +表示: +- candidate 没丢主链成功 +- 成本也没有恶化 + +#### `missing` 的 gate +- `decision_quality.subagent_count_observed` + +表示: +- 这个观察项当前没有足够证据,不宜强判 + +所以 `risk_verdict.status = inconclusive` 的正确含义是: +- 不是失败 +- 而是这次 real smoke 的风险门没有看到硬失败,但也有部分语义项尚未自动闭合 + +### 当前这份 real smoke 报告真正证明了什么 + +它真正证明了 4 件事: + +1. 真实 `execute_harness` 链路已经成功 +- baseline/candidate 都成功执行 +- capture 都成功 + +2. runtime variant difference 已经能被正式观测 +- baseline 是 `default` +- candidate 是 `sparse` +- 且系统明确写出了差异 + +3. 长上下文治理事件已经进入真实证据 +- compaction 触发 +- tool result budget 触发 + +4. 系统已经能正确区分“可自动判定的事实”和“必须人工复核的语义” + +### 这份 real smoke 报告没有证明什么 + +它没有证明: +- candidate 在真实长上下文任务里已经确定无疑地更优 +- sparse policy 一定带来成本收益 +- 长上下文语义质量已经完全自动可裁决 + +也就是说: +- V2.4 real 证明的是“真实链路与 runtime 差异” +- 不是“最终能力裁决” + +### 推荐阅读顺序 + +以后你读 real smoke,建议固定这样看: + +1. `experiment_validity` +2. `variant_effect_summary` +3. `runtime_difference_summary` +4. `Long Context Summary` +5. `Manual Review Notes` +6. 
最后再看 `scorecard_summary` 和 `gate_results` + +### 一句话总结 + +这份 V2.4 real smoke 报告说明: + +`V2.4 已经在真实 execute_harness 链路下成功证明了 baseline 与 candidate 的 runtime policy 差异确实存在,并且 compaction/context governance 已进入正式证据;但真实语义质量仍然需要人工复核,系统没有假装自己已经能全自动裁决。` diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.md" new file mode 100644 index 0000000000..f9a43c434c --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.md" @@ -0,0 +1,180 @@ +# V2.5 Feedback Report: feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66 + +## Understanding + +- source_experiment_run: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +- source_reports: + - ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md + - ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md + - ObservrityTask\10-系统版本\v2\06-运行报告\experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md +- generated_at: 2026-05-03T10:32:10.763Z +- this report is advisory only and does not apply code changes automatically + +## Findings + +- finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39 + - type: long_context_review_verdict_needs_manual_review + - 
severity: medium + - summary: The experiment-level long_context_review_verdict remains needs_manual_review. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4 + - type: risk_verdict_inconclusive + - severity: medium + - summary: The regression-risk verdict is inconclusive for this experiment. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae + - type: missing_score_count_positive + - severity: medium + - summary: The experiment still has 1 missing score(s). + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b + - type: constraint_retention_rate_missing_long_context_fact_retrieval_real_smoke + - severity: medium + - summary: constraint_retention_rate_mean is null for long_context_fact_retrieval_real_smoke. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006 + - type: retrieved_fact_hit_rate_missing_long_context_fact_retrieval_real_smoke + - severity: medium + - summary: retrieved_fact_hit_rate_mean is null for long_context_fact_retrieval_real_smoke. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2 + - type: manual_review_required_long_context_fact_retrieval_real_smoke + - severity: medium + - summary: manual_review_required is true for long_context_fact_retrieval_real_smoke. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723 + - type: flaky_status_long_context_fact_retrieval_real_smoke_baseline_default + - severity: high + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae + - type: flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse + - severity: high + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status + - fact_or_inference: fact + +## Hypotheses + +- hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57 + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b, finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006 + - hypothesis: The current real-smoke scorer lacks a lightweight semantic output parser, so fact retrieval and constraint retention cannot yet be auto-judged from runtime outputs. + - risks: A parser that is too narrow can miss valid answers. | A parser that is too loose can create false positives. + - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a + - confidence: high + - based_on: finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39, finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2 + - hypothesis: The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but not fully resolve final semantic correctness in real smoke. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. 
+ - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4, finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae + - hypothesis: The regression-risk gate is inconclusive mainly because some semantic long-context scores are still missing, not because the runner failed to execute. + - risks: If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports. + - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93 + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723, finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae + - hypothesis: Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - fact_or_inference: inference + +## Improvement Proposals + +- proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146 + - type: evaluator_improvement + - target_layer: scorer + - description: Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence. + - expected_effect: Convert currently-null long-context semantic scores into rule-backed observed values where the output format is narrow enough. + - risks: A parser that is too narrow can miss valid answers. | A parser that is too loose can create false positives. 
+ - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84 + - type: scenario_improvement + - target_layer: scenario + - description: Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic. + - expected_effect: Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488 + - type: evaluator_improvement + - target_layer: scorer + - description: Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk. + - expected_effect: Reduce inconclusive gate results caused purely by absent semantic score evidence. + - risks: If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4 + - type: scenario_improvement + - target_layer: scenario + - description: Stabilize the upstream scenario or runner contract before trusting automated feedback suggestions for this branch of evaluation. + - expected_effect: Reduce flaky or failed inputs before turning feedback artifacts into candidate work items. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. 
+ - requires_human_approval: true + +## Candidate Variant Proposals + +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7 + - variant_name: candidate_long_context_output_parser_v0 + - change_layer: scorer + - implementation_scope: Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed + - variant_name: candidate_long_context_expectation_contract_v0 + - change_layer: scenario + - implementation_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9 + - variant_name: candidate_long_context_score_binding_v0 + - change_layer: scorer + - implementation_scope: Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb + - variant_name: candidate_feedback_input_contract_v0 + - change_layer: scenario + - implementation_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. 
+ - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files + +## Next Experiment Plans + +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400 + - candidate_variant_id: candidate_long_context_output_parser_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 2 + - success_criteria: retrieved_fact_hit_rate is no longer null for real smoke. | constraint_retention_rate is no longer null for real smoke. | manual_review_required does not increase. | distractor_confusion_count remains 0. + - failure_criteria: Parser introduces false positives against distractor-resistant scenarios. | Manual review requirement increases or semantic scores become contradictory. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e + - candidate_variant_id: candidate_long_context_expectation_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Manual review prompts become more specific and lower-ambiguity. | Scenario intent remains matched. | No new flaky or failed run groups appear. + - failure_criteria: Scenario contract changes erase the current runtime-difference evidence. | Long-context intent becomes less specific or more brittle. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37 + - candidate_variant_id: candidate_long_context_score_binding_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 2 + - success_criteria: retrieved_fact_hit_rate is no longer null for real smoke. | constraint_retention_rate is no longer null for real smoke. | manual_review_required does not increase. | distractor_confusion_count remains 0. 
+ - failure_criteria: Parser introduces false positives against distractor-resistant scenarios. | Manual review requirement increases or semantic scores become contradictory. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f + - candidate_variant_id: candidate_feedback_input_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Manual review prompts become more specific and lower-ambiguity. | Scenario intent remains matched. | No new flaky or failed run groups appear. + - failure_criteria: Scenario contract changes erase the current runtime-difference evidence. | Long-context intent becomes less specific or more brittle. + - manual_review_required: true + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.md" new file mode 100644 index 0000000000..14cd13cb9a --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.md" @@ -0,0 +1,307 @@ +# V2.5 Beta Feedback Report: feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b + +## Understanding + +- source_experiment_run: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +- source_reports: + - 
ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md + - ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md + - ObservrityTask\10-系统版本\v2\06-运行报告\experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md +- generated_at: 2026-05-03T12:45:41.901Z +- this report is advisory only and does not apply code changes automatically + +## Human Approval Card + +- current_top_recommendation: tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json +- why_now: This directly targets the two most important semantic nulls in the current real-smoke sample and does not require runtime harness changes. +- why_not_others_yet: + - proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8: recommended_later - By itself it does not convert null semantic scores into formal evidence, so it is best staged after parser work begins. + - proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2: blocked - This is blocked until a lightweight parser exists; there is nothing stable to bind before that. + - proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. +- approval_scope: Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal. 
+- do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- next_experiment_plan_ref: tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json +- success_criteria: + - retrieved_fact_hit_rate is no longer null for real smoke. + - constraint_retention_rate is no longer null for real smoke. + - manual_review_required does not increase. + - distractor_confusion_count remains 0. +- risks: + - A parser that is too narrow can miss valid answers. + - A parser that is too loose can create false positives. +- manual_review_boundary: Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks. + +## Proposal Queue + +- top_recommendation: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json +- recommended_now: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json +- recommended_later: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json +- deferred: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json +- blocked: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json + +## Approval Contract + +- blocking_findings: + - none +- manual_judgement_required_findings: + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json + - 
tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json +- auto_resolvable_findings: + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json + +## Findings + +- finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e + - type: long_context_review_verdict_needs_manual_review + - kind: manual_review_boundary + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The experiment-level long_context_review_verdict remains needs_manual_review. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2 + - type: risk_verdict_inconclusive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The regression-risk verdict is inconclusive for this experiment. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b + - type: missing_score_count_positive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The experiment still has 1 missing score(s). + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c + - type: constraint_retention_rate_missing_long_context_fact_retrieval_real_smoke + - kind: missing_score + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke + - summary: constraint_retention_rate_mean is null for long_context_fact_retrieval_real_smoke. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de + - type: retrieved_fact_hit_rate_missing_long_context_fact_retrieval_real_smoke + - kind: missing_score + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke + - summary: retrieved_fact_hit_rate_mean is null for long_context_fact_retrieval_real_smoke. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8 + - type: manual_review_required_long_context_fact_retrieval_real_smoke + - kind: manual_review_boundary + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke + - summary: manual_review_required is true for long_context_fact_retrieval_real_smoke. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740 + - type: flaky_status_long_context_fact_retrieval_real_smoke_baseline_default + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke:baseline_default + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee + - type: flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke:candidate_session_memory_sparse + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact + +## Hypotheses + +- hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8 + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c, finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean + - hypothesis: The current real-smoke evaluator lacks a lightweight semantic output parser, so fact retrieval and constraint retention cannot yet be auto-judged from runtime outputs. 
+ - falsifiable_by: Implement a lightweight real-smoke output parser and rerun long_context_fact_retrieval_real_smoke. | Verify retrieved_fact_hit_rate and constraint_retention_rate become non-null without inflating distractor_confusion_count. + - risks: A parser that is too narrow can miss valid answers. | A parser that is too loose can create false positives. + - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243 + - confidence: high + - based_on: finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e, finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8 + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required + - hypothesis: The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke. + - falsifiable_by: Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. 
+ - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13 + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2, finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count + - hypothesis: The regression-risk gate is inconclusive mainly because semantic long-context scores are still missing, not because the runner failed to execute. + - falsifiable_by: After parser output is bound into context scores, rerun the same real smoke and confirm whether risk_verdict becomes more decisive without hiding uncertainty. + - risks: If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports. + - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740, finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status + - hypothesis: Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used. 
+ - falsifiable_by: Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - fact_or_inference: inference + +## Improvement Proposals + +- proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36 + - type: evaluator_improvement + - target_layer: evaluator + - priority: P0 + - queue_bucket: top_recommendation + - description: Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence. + - expected_effect: Convert currently-null long-context semantic scores into rule-backed observed values where the output format is narrow enough. + - why_now: This directly targets the two most important semantic nulls in the current real-smoke sample and does not require runtime harness changes. + - why_not_now: n/a + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: A parser that is too narrow can miss valid answers. | A parser that is too loose can create false positives. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8 + - type: scenario_improvement + - target_layer: scenario + - priority: P1 + - queue_bucket: recommended_later + - description: Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic. + - expected_effect: Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs. + - why_now: This is the cleanest way to narrow manual review once semantic evidence collection improves. 
+ - why_not_now: By itself it does not convert null semantic scores into formal evidence, so it is best staged after parser work begins. + - blocking_finding_ids: none + - manual_judgement_finding_ids: finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e | finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8 + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2 + - type: score_binding_improvement + - target_layer: scorer + - priority: P1 + - queue_bucket: blocked + - description: Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk. + - expected_effect: Reduce inconclusive gate results caused purely by absent semantic score evidence. + - why_now: The gate cannot become more informative until parser output is formally bound into context scores. + - why_not_now: This is blocked until a lightweight parser exists; there is nothing stable to bind before that. + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51 + - type: feedback_contract_improvement + - target_layer: feedback_system + - priority: P2 + - queue_bucket: deferred + - description: Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation. + - expected_effect: Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items. 
+ - why_now: This keeps the feedback system honest when stability evidence is weak or under-sampled. + - why_not_now: The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - requires_human_approval: true + +## Candidate Variant Proposals + +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978 + - variant_name: candidate_long_context_output_parser_v0 + - change_layer: evaluator + - implementation_scope: Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e + - variant_name: candidate_long_context_expectation_contract_v0 + - change_layer: scenario + - implementation_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355 + - variant_name: candidate_long_context_score_binding_v0 + - change_layer: scorer + - implementation_scope: Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal. 
+ - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac + - variant_name: candidate_feedback_input_contract_v0 + - change_layer: feedback_system + - implementation_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts + +## Next Experiment Plans + +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758 + - candidate_variant_id: candidate_long_context_output_parser_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 2 + - success_criteria: retrieved_fact_hit_rate is no longer null for real smoke. | constraint_retention_rate is no longer null for real smoke. | manual_review_required does not increase. | distractor_confusion_count remains 0. + - failure_criteria: Parser introduces false positives against distractor-resistant scenarios. | Manual review requirement increases or semantic scores become contradictory. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6 + - candidate_variant_id: candidate_long_context_expectation_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Manual review prompts become more specific and lower-ambiguity. | Scenario intent remains matched. | No new flaky or failed run groups appear. + - failure_criteria: Scenario contract changes erase the current runtime-difference evidence. | Long-context intent becomes less specific or more brittle. 
+ - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3 + - candidate_variant_id: candidate_long_context_score_binding_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 2 + - success_criteria: retrieved_fact_hit_rate is no longer null for real smoke. | constraint_retention_rate is no longer null for real smoke. | manual_review_required does not increase. | distractor_confusion_count remains 0. + - failure_criteria: Parser introduces false positives against distractor-resistant scenarios. | Manual review requirement increases or semantic scores become contradictory. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b + - candidate_variant_id: candidate_feedback_input_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Feedback queue semantics become stable and easier to approve. | Top recommendation remains unique. | No new schema ambiguity appears in feedback artifacts. + - failure_criteria: Feedback queue becomes contradictory or unstable across equivalent inputs. | Manual review and human approval boundaries become harder to distinguish. 
+ - manual_review_required: true + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.md" new file mode 100644 index 0000000000..b2ff09df2f --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.md" @@ -0,0 +1,211 @@ +# V2.5 Beta Feedback Report: feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90 + +## Understanding + +- source_experiment_run: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json +- source_reports: + - ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_vs_run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md + - ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md + - ObservrityTask\10-系统版本\v2\06-运行报告\experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md +- generated_at: 2026-05-03T14:59:42.988Z +- this report is advisory only and does not apply code changes automatically + +## Human Approval Card + +- current_top_recommendation: tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json +- why_now: Semantic parsing is now present, 
so the next bottleneck is the real-smoke expectation contract and review-prompt precision. +- why_not_others_yet: + - proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. +- approval_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. +- do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- next_experiment_plan_ref: tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json +- success_criteria: + - Manual review prompts become more specific and lower-ambiguity. + - Scenario intent remains matched. + - No new flaky or failed run groups appear. +- risks: + - Treating manual review signals as auto-pass would overstate evaluator certainty. +- manual_review_boundary: Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks. 
+ +## Proposal Queue + +- top_recommendation: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json +- recommended_now: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json +- recommended_later: + - none +- deferred: + - tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json +- blocked: + - none + +## Approval Contract + +- blocking_findings: + - none +- manual_judgement_required_findings: + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json +- auto_resolvable_findings: + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json + - tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json + +## Findings + +- finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194 + - type: long_context_review_verdict_needs_manual_review + - kind: manual_review_boundary + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The experiment-level long_context_review_verdict remains needs_manual_review. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_review_verdict + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a + - type: risk_verdict_inconclusive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The regression-risk verdict is inconclusive for this experiment. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/risk_verdict/status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853 + - type: missing_score_count_positive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_4_long_context_real_smoke + - summary: The experiment still has 1 missing score(s). + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/risk_verdict/missing_score_count + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a + - type: manual_review_required_long_context_fact_retrieval_real_smoke + - kind: manual_review_boundary + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke + - summary: manual_review_required is true for long_context_fact_retrieval_real_smoke. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_summary/0/manual_review_required + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008 + - type: flaky_status_long_context_fact_retrieval_real_smoke_baseline_default + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke:baseline_default + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default. + - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/0/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97 + - type: flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke:candidate_session_memory_sparse + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/1/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact + +## Hypotheses + +- hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447 + - confidence: high + - based_on: finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194, finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_review_verdict | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_summary/0/manual_review_required + - hypothesis: The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke. + - falsifiable_by: Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. 
+ - fact_or_inference: inference +- hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0 + - confidence: medium + - based_on: finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008, finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97 + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/0/flaky_status | tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/1/flaky_status + - hypothesis: Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used. + - falsifiable_by: Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - fact_or_inference: inference + +## Improvement Proposals + +- proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91 + - type: scenario_improvement + - target_layer: scenario + - priority: P1 + - queue_bucket: top_recommendation + - description: Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic. + - expected_effect: Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs. + - why_now: Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision. 
+ - why_not_now: n/a + - blocking_finding_ids: none + - manual_judgement_finding_ids: finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194 | finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. + - requires_human_approval: true +- proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d + - type: feedback_contract_improvement + - target_layer: feedback_system + - priority: P2 + - queue_bucket: deferred + - description: Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation. + - expected_effect: Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items. + - why_now: This keeps the feedback system honest when stability evidence is weak or under-sampled. + - why_not_now: The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - requires_human_approval: true + +## Candidate Variant Proposals + +- candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652 + - variant_name: candidate_long_context_expectation_contract_v0 + - change_layer: scenario + - implementation_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. 
+ - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a + - variant_name: candidate_feedback_input_contract_v0 + - change_layer: feedback_system + - implementation_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts + +## Next Experiment Plans + +- experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519 + - candidate_variant_id: candidate_long_context_expectation_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Manual review prompts become more specific and lower-ambiguity. | Scenario intent remains matched. | No new flaky or failed run groups appear. + - failure_criteria: Scenario contract changes erase the current runtime-difference evidence. | Long-context intent becomes less specific or more brittle. + - manual_review_required: true +- experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4 + - candidate_variant_id: candidate_feedback_input_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke + - repeat_count: 1 + - success_criteria: Feedback queue semantics become stable and easier to approve. | Top recommendation remains unique. | No new schema ambiguity appears in feedback artifacts. + - failure_criteria: Feedback queue becomes contradictory or unstable across equivalent inputs. | Manual review and human approval boundaries become harder to distinguish. 
+ - manual_review_required: true + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.md" new file mode 100644 index 0000000000..81613b501f --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.md" @@ -0,0 +1,211 @@ +# V2.5 Beta Feedback Report: feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65 + +## Understanding + +- source_experiment_run: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json +- source_reports: + - ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md + - ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md + - ObservrityTask\10-系统版本\v2\06-运行报告\experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md +- generated_at: 2026-05-03T15:32:44.784Z +- this report is advisory only and does not apply code changes automatically + +## Human Approval Card + +- current_top_recommendation: 
tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json +- why_now: Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision. +- why_not_others_yet: + - proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. +- approval_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. +- do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- next_experiment_plan_ref: tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json +- success_criteria: + - Manual review prompts become more specific and lower-ambiguity. + - Scenario intent remains matched. + - No new flaky or failed run groups appear. +- risks: + - Treating manual review signals as auto-pass would overstate evaluator certainty. +- manual_review_boundary: Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks. 
+ +## Proposal Queue + +- top_recommendation: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json +- recommended_now: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json +- recommended_later: + - none +- deferred: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json +- blocked: + - none + +## Approval Contract + +- blocking_findings: + - none +- manual_judgement_required_findings: + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json +- auto_resolvable_findings: + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json + +## Findings + +- finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de + - type: long_context_review_verdict_needs_manual_review + - kind: manual_review_boundary + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The experiment-level long_context_review_verdict remains needs_manual_review. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8 + - type: risk_verdict_inconclusive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The regression-risk verdict is inconclusive for this experiment. + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3 + - type: missing_score_count_positive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The experiment still has 1 missing score(s). + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/missing_score_count + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad + - type: manual_review_required_long_context_fact_retrieval_real_smoke_contract_v0 + - kind: manual_review_boundary + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0 + - summary: manual_review_required is true for long_context_fact_retrieval_real_smoke_contract_v0. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438 + - type: flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0:baseline_default + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / baseline_default. + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f + - type: flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0:candidate_session_memory_sparse + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact + +## Hypotheses + +- hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b + - confidence: high + - based_on: finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de, finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict | tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required + - hypothesis: The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke. + - falsifiable_by: Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. 
+ - fact_or_inference: inference +- hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e + - confidence: medium + - based_on: finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438, finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status | tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status + - hypothesis: Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used. + - falsifiable_by: Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - fact_or_inference: inference + +## Improvement Proposals + +- proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52 + - type: scenario_improvement + - target_layer: scenario + - priority: P1 + - queue_bucket: top_recommendation + - description: Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic. + - expected_effect: Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs. + - why_now: Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision. 
+ - why_not_now: n/a + - blocking_finding_ids: none + - manual_judgement_finding_ids: finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de | finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. + - requires_human_approval: true +- proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd + - type: feedback_contract_improvement + - target_layer: feedback_system + - priority: P2 + - queue_bucket: deferred + - description: Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation. + - expected_effect: Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items. + - why_now: This keeps the feedback system honest when stability evidence is weak or under-sampled. + - why_not_now: The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - requires_human_approval: true + +## Candidate Variant Proposals + +- candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f + - variant_name: candidate_long_context_expectation_contract_v0 + - change_layer: scenario + - implementation_scope: Only scenario manifests, expected facts, constraints, and manual review prompts may change. 
+ - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | runtime harness policy files +- candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3 + - variant_name: candidate_feedback_input_contract_v0 + - change_layer: feedback_system + - implementation_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts + +## Next Experiment Plans + +- experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4 + - candidate_variant_id: candidate_long_context_expectation_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke_contract_v0 + - repeat_count: 1 + - success_criteria: Manual review prompts become more specific and lower-ambiguity. | Scenario intent remains matched. | No new flaky or failed run groups appear. + - failure_criteria: Scenario contract changes erase the current runtime-difference evidence. | Long-context intent becomes less specific or more brittle. + - manual_review_required: true +- experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1 + - candidate_variant_id: candidate_feedback_input_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke_contract_v0 + - repeat_count: 1 + - success_criteria: Feedback queue semantics become stable and easier to approve. | Top recommendation remains unique. | No new schema ambiguity appears in feedback artifacts. + - failure_criteria: Feedback queue becomes contradictory or unstable across equivalent inputs. | Manual review and human approval boundaries become harder to distinguish. 
+ - manual_review_required: true + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md" new file mode 100644 index 0000000000..8823895e62 --- /dev/null +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md" @@ -0,0 +1,211 @@ +# V2.5 Beta Feedback Report: feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e + +## Understanding + +- source_experiment_run: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json +- source_reports: + - ObservrityTask\10-系统版本\v2\06-运行报告\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md + - ObservrityTask\10-系统版本\v2\06-运行报告\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md + - ObservrityTask\10-系统版本\v2\06-运行报告\experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md +- generated_at: 2026-05-03T15:46:26.054Z +- this report is advisory only and does not apply code changes automatically + +## Human Approval Card + +- current_top_recommendation: 
tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json +- why_now: The current source experiment already uses expectation_contract_v0, so repeating the same contract proposal would be a feedback-loop error rather than a useful next action. +- why_not_others_yet: + - proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. +- approval_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. +- do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- next_experiment_plan_ref: tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json +- success_criteria: + - Feedback queue semantics become stable and easier to approve. + - Top recommendation remains unique. + - No new schema ambiguity appears in feedback artifacts. +- risks: + - Treating manual review signals as auto-pass would overstate evaluator certainty. +- manual_review_boundary: Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks. 
+ +## Proposal Queue + +- top_recommendation: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json +- recommended_now: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json +- recommended_later: + - none +- deferred: + - tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json +- blocked: + - none + +## Approval Contract + +- blocking_findings: + - none +- manual_judgement_required_findings: + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json +- auto_resolvable_findings: + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json + - tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json + +## Findings + +- finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044 + - type: long_context_review_verdict_needs_manual_review + - kind: manual_review_boundary + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The experiment-level long_context_review_verdict remains needs_manual_review. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0 + - type: risk_verdict_inconclusive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The regression-risk verdict is inconclusive for this experiment. + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8 + - type: missing_score_count_positive + - kind: missing_score + - severity: warning + - scope: experiment + - scope_ref: v2_5_long_context_real_smoke_expectation_contract_v0 + - summary: The experiment still has 1 missing score(s). + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/missing_score_count + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: true + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925 + - type: manual_review_required_long_context_fact_retrieval_real_smoke_contract_v0 + - kind: manual_review_boundary + - severity: warning + - scope: scenario + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0 + - summary: manual_review_required is true for long_context_fact_retrieval_real_smoke_contract_v0. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required + - is_blocking: false + - requires_manual_judgement: true + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4 + - type: flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0:baseline_default + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / baseline_default. + - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact +- finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052 + - type: flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse + - kind: stability_gap + - severity: warning + - scope: variant + - scope_ref: long_context_fact_retrieval_real_smoke_contract_v0:candidate_session_memory_sparse + - summary: flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse. 
+ - evidence_ref: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status + - is_blocking: false + - requires_manual_judgement: false + - auto_resolvable: false + - fact_or_inference: fact + +## Hypotheses + +- hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661 + - confidence: high + - based_on: finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044, finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925 + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict | tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required + - hypothesis: The tightened expectation contract is already in place, but manual review still remains open. The next bottleneck is feedback-loop deduplication and proposal stability, not another copy of the same scenario-contract recommendation. + - falsifiable_by: Re-run feedback on the same expectation-contract artifact and confirm the queue no longer repeats the same expectation-contract recommendation as top priority. | Verify the next top recommendation, if any, shifts to feedback-system stabilization rather than a duplicate scenario contract. + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. 
+ - fact_or_inference: inference +- hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243 + - confidence: medium + - based_on: finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4, finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052 + - depends_on_finding_refs: tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status | tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status + - hypothesis: Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used. + - falsifiable_by: Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable. + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. + - fact_or_inference: inference + +## Improvement Proposals + +- proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4 + - type: feedback_contract_improvement + - target_layer: feedback_system + - priority: P1 + - queue_bucket: top_recommendation + - description: Stabilize the feedback input contract so an already-realized expectation-contract follow-up is detected and not re-recommended as the next top proposal. + - expected_effect: Prevent proposal-loop duplication and keep approval cards aligned with the true next unresolved bottleneck. 
+ - why_now: The current source experiment already uses expectation_contract_v0, so repeating the same contract proposal would be a feedback-loop error rather than a useful next action. + - why_not_now: n/a + - blocking_finding_ids: none + - manual_judgement_finding_ids: finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044 | finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925 + - risks: Treating manual review signals as auto-pass would overstate evaluator certainty. + - requires_human_approval: true +- proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6 + - type: feedback_contract_improvement + - target_layer: feedback_system + - priority: P2 + - queue_bucket: deferred + - description: Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation. + - expected_effect: Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items. + - why_now: This keeps the feedback system honest when stability evidence is weak or under-sampled. + - why_not_now: The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred. + - blocking_finding_ids: none + - manual_judgement_finding_ids: none + - risks: Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise. 
+ - requires_human_approval: true + +## Candidate Variant Proposals + +- candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2 + - variant_name: candidate_feedback_input_contract_after_contract_v0 + - change_layer: feedback_system + - implementation_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts +- candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3 + - variant_name: candidate_feedback_input_contract_v0 + - change_layer: feedback_system + - implementation_scope: Only feedback extraction rules, feedback taxonomy, and report/queue logic may change. + - do_not_touch: src/query.ts | src/services/SessionMemory/sessionMemory.ts | src/services/api/claude.ts + +## Next Experiment Plans + +- experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a + - candidate_variant_id: candidate_feedback_input_contract_after_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke_contract_v0 + - repeat_count: 1 + - success_criteria: Feedback queue semantics become stable and easier to approve. | Top recommendation remains unique. | No new schema ambiguity appears in feedback artifacts. + - failure_criteria: Feedback queue becomes contradictory or unstable across equivalent inputs. | Manual review and human approval boundaries become harder to distinguish. 
+ - manual_review_required: true +- experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f + - candidate_variant_id: candidate_feedback_input_contract_v0 + - scenario_ids: long_context_fact_retrieval_real_smoke_contract_v0 + - repeat_count: 1 + - success_criteria: Feedback queue semantics become stable and easier to approve. | Top recommendation remains unique. | No new schema ambiguity appears in feedback artifacts. + - failure_criteria: Feedback queue becomes contradictory or unstable across equivalent inputs. | Manual review and human approval boundaries become harder to distinguish. + - manual_review_required: true + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences diff --git "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/README.md" "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/README.md" index 65a1bee2a6..f45b466d44 100644 --- "a/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/README.md" +++ "b/ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/README.md" @@ -1,64 +1,123 @@ # V2 -本目录用于承载可观测系统 V2 的设计、评测模型、实验框架与后续实现文档。 +本目录用于承载可观测系统 `V2` 的版本说明、任务书、数据模型、Scenario/Variant 设计与运行报告。 -目录结构: +## 理解清单 + +- `V2` 的核心目标不是“看日志”,而是“基于 V1 事实证据做正式评测”。 +- 当前最值得先理解的稳定状态已经推进到 `V2.5 beta`。 +- `V2.5 beta` 的关键成果是:在 `V2.4` 真实评测已经可运行的基础上,系统开始把评测结果转成结构化反馈建议、proposal queue 与人工拍板卡,但仍然不自动改代码。 + +## 预期效果 + +如果你第一次进入这个目录,读完这里列出的入口文档后,应该能快速知道: + +1. 当前系统发展到了哪一步。 +2. 你应该从哪份文档开始读。 +3. V2.4 的 fixture smoke 和 real smoke 应该怎么跑。 +4. V2.5 的 feedback loop beta 应该怎么跑。 +5. 运行结果应该去哪里看。 + +## 设计思路 + +这份 README 只做导航,不重复展开所有实现细节。 + +详细内容分别落在: + +- `01-总览`:讲系统是什么 +- `tests/evals/v2`:讲系统怎么跑 +- `06-运行报告`:看实验结果 + +## 当前推荐入口 + +优先阅读这三份: + +1.
[01-总览/V2.5版本项目介绍与阅读指南.md](./01-%E6%80%BB%E8%A7%88/V2.5%E7%89%88%E6%9C%AC%E9%A1%B9%E7%9B%AE%E4%BB%8B%E7%BB%8D%E4%B8%8E%E9%98%85%E8%AF%BB%E6%8C%87%E5%8D%97.md) +2. [tests/evals/v2/README.md](../../../tests/evals/v2/README.md) +3. [tests/evals/v2/V2.5-feedback-loop-usage.md](../../../tests/evals/v2/V2.5-feedback-loop-usage.md) + +这三份文档分别回答: + +- `V2.5` 到底是什么 +- 当前系统该怎么运行 +- V2.5 feedback loop 怎么跑 + +## 目录结构 - `01-总览` - - 北极星、目标、抽象模型 + - 当前阶段的总览、北极星、阅读指南 - `02-实施任务书` - - 可直接开工的阶段性任务书 + - 各阶段任务书与后续规划 - `03-数据模型` - - 核心对象与关系定稿 + - `scenario / variant / experiment / run / score` 等抽象说明 - `04-Scenario集` - - 第一批 benchmark 场景 + - 评测场景设计 - `05-Variant与实验` - - variant / experiment 组织规范 + - variant 与 experiment 的组织规范 - `06-运行报告` - - V2 run 生成的证据报告 + - 运行后生成的人类可读报告 -V2.1 新增了 manifest 驱动的实验闭环: +## 当前阅读顺序 -```powershell -bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default -``` +1. [01-总览/V2.5版本项目介绍与阅读指南.md](./01-%E6%80%BB%E8%A7%88/V2.5%E7%89%88%E6%9C%AC%E9%A1%B9%E7%9B%AE%E4%BB%8B%E7%BB%8D%E4%B8%8E%E9%98%85%E8%AF%BB%E6%8C%87%E5%8D%97.md) +2. [01-总览/可观测系统V2北极星与评测模型草案.md](./01-%E6%80%BB%E8%A7%88/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E5%8C%97%E6%9E%81%E6%98%9F%E4%B8%8E%E8%AF%84%E6%B5%8B%E6%A8%A1%E5%9E%8B%E8%8D%89%E6%A1%88.md) +3. [tests/evals/v2/README.md](../../../tests/evals/v2/README.md) +4. [tests/evals/v2/V2.4-long-context-usage.md](../../../tests/evals/v2/V2.4-long-context-usage.md) +5. [tests/evals/v2/V2.5-feedback-loop-usage.md](../../../tests/evals/v2/V2.5-feedback-loop-usage.md) +6. [03-数据模型](./03-%E6%95%B0%E6%8D%AE%E6%A8%A1%E5%9E%8B/) +7. [04-Scenario集](./04-Scenario%E9%9B%86/) +8. [05-Variant与实验](./05-Variant%E4%B8%8E%E5%AE%9E%E9%AA%8C/) +9. [06-运行报告](./06-%E8%BF%90%E8%A1%8C%E6%8A%A5%E5%91%8A/) +10. 
[07-反馈报告](./07-%E5%8F%8D%E9%A6%88%E6%8A%A5%E5%91%8A/) -当前 runner 是 `bind_existing` 模式:它不会自动启动 harness 跑 prompt,而是把已经存在的 V1 `user_action_id` 绑定成 baseline/candidate run,然后自动生成 score、comparison report、gate 结果和 experiment summary。这是从“单次 run 对比”走向“实验闭环”的第一步。 +## 当前最重要的四条运行路径 -## 当前评测执行顺序 +### V2.4 fixture smoke -如果你现在要跑一次评测,推荐顺序是: +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json +``` -1. 选定一个 `scenario`,确认它已经存在于 `tests/evals/v2/scenarios/`。 -2. 选定 baseline 和 candidate `variant`,确认它们已经存在于 `tests/evals/v2/variants/`。 -3. 分别真实运行 baseline 和 candidate,拿到两个 V1 `user_action_id`。 -4. 把这两个 `user_action_id` 写入 `tests/evals/v2/experiments/.json` 的 `action_bindings`。 -5. 运行 manifest 校验: +### V2.4 verifier ```powershell -bun run scripts/evals/v2_validate_manifests.ts +bun run scripts/evals/v2_verify_long_context.ts ``` -6. 运行实验: +### V2.4 real smoke ```powershell -bun run scripts/evals/v2_run_experiment.ts --experiment +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.json ``` -7. 阅读输出: - - `tests/evals/v2/runs/`:baseline/candidate run 绑定记录 - - `tests/evals/v2/scores/`:每个 run 的 score - - `tests/evals/v2/experiment-runs/`:实验级 JSON summary - - `06-运行报告/`:面向人阅读的 run、compare、experiment 报告 +### V2.5 feedback loop beta -旧的 `v2_record_run.ts` 和 `v2_compare_runs.ts` 仍然保留,但它们现在更适合作为底层调试命令;日常推荐入口是 `v2_run_experiment.ts`。 +```powershell +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +``` -当前建议阅读顺序: +```powershell +bun run scripts/evals/v2_validate_feedback_artifacts.ts +``` -1. [01-总览/可观测系统V2北极星与评测模型草案.md](./01-%E6%80%BB%E8%A7%88/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E5%8C%97%E6%9E%81%E6%98%9F%E4%B8%8E%E8%AF%84%E6%B5%8B%E6%A8%A1%E5%9E%8B%E8%8D%89%E6%A1%88.md) -2.
[02-实施任务书/可观测系统V2第一阶段实施任务书.md](./02-%E5%AE%9E%E6%96%BD%E4%BB%BB%E5%8A%A1%E4%B9%A6/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E7%AC%AC%E4%B8%80%E9%98%B6%E6%AE%B5%E5%AE%9E%E6%96%BD%E4%BB%BB%E5%8A%A1%E4%B9%A6.md) -3. [02-实施任务书/可观测系统V2第一阶段执行清单.md](./02-%E5%AE%9E%E6%96%BD%E4%BB%BB%E5%8A%A1%E4%B9%A6/%E5%8F%AF%E8%A7%82%E6%B5%8B%E7%B3%BB%E7%BB%9FV2%E7%AC%AC%E4%B8%80%E9%98%B6%E6%AE%B5%E6%89%A7%E8%A1%8C%E6%B8%85%E5%8D%95.md) -4. [03-数据模型/V2评测数据模型定稿.md](./03-%E6%95%B0%E6%8D%AE%E6%A8%A1%E5%9E%8B/V2%E8%AF%84%E6%B5%8B%E6%95%B0%E6%8D%AE%E6%A8%A1%E5%9E%8B%E5%AE%9A%E7%A8%BF.md) -5. [04-Scenario集/第一批Scenario候选集.md](./04-Scenario%E9%9B%86/%E7%AC%AC%E4%B8%80%E6%89%B9Scenario%E5%80%99%E9%80%89%E9%9B%86.md) -6. [05-Variant与实验/Variant组织规范.md](./05-Variant%E4%B8%8E%E5%AE%9E%E9%AA%8C/Variant%E7%BB%84%E7%BB%87%E8%A7%84%E8%8C%83.md) -7. [06-运行报告/README.md](./06-%E8%BF%90%E8%A1%8C%E6%8A%A5%E5%91%8A/README.md) +## 最新状态 + +当前 `V2.5 beta` 已经具备: + +- `V2.3` 的 batch / repeat / run_group / stability_summary +- 4 个长上下文 scenario family +- `context.*` score-spec +- `long_context` run 证据 +- `long_context_summary` +- `long_context_review_verdict` +- `feedback/findings` +- `feedback/hypotheses` +- `feedback/proposals` +- `feedback/candidate-proposals` +- `feedback/next experiment plans` +- `feedback/proposal queue` +- `feedback/approval card` +- `feedback artifact validator` + +这意味着: +系统已经不只是“能跑批量实验并解释长上下文”,而是开始能把这些结果转成结构化反馈建议,供你拍板下一轮改动。 diff --git a/scripts/evals/v2_emit_fixture_trace.ts b/scripts/evals/v2_emit_fixture_trace.ts index eb4429ace4..73ffeec843 100644 --- a/scripts/evals/v2_emit_fixture_trace.ts +++ b/scripts/evals/v2_emit_fixture_trace.ts @@ -3,6 +3,8 @@ import { spawnSync } from 'node:child_process' import { appendFile, mkdir } from 'node:fs/promises' import path from 'node:path' +import { buildLongContextFixtureEvidence } from './v2_harness_execution' + const repoRoot = path.resolve(import.meta.dirname, '..', '..') const observabilityDir = path.join(repoRoot, '.observability') const 
duckdbExe = path.join(repoRoot, 'tools', 'duckdb', 'duckdb.exe') @@ -32,6 +34,7 @@ function writeFixtureDb(params: { queryId: string startedAt: string endedAt: string + longContextFixture?: Awaited> }) { const benchmarkRunId = requiredEnv('CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID') const experimentId = requiredContextEnv( @@ -47,6 +50,12 @@ function writeFixtureDb(params: { 'CLAUDE_CODE_EVAL_VARIANT_ID', ) const evalRunId = requiredEnv('CLAUDE_CODE_EVAL_RUN_ID') + const tokenBase = + params.longContextFixture?.tokenBase ?? + (variantId.includes('sparse') ? 100 : 110) + const turnCount = params.longContextFixture?.turnCount ?? 1 + const subagentCount = params.longContextFixture?.subagentCount ?? 0 + const toolCallCount = params.longContextFixture?.toolCallCount ?? 0 const sql = [ 'CREATE TABLE IF NOT EXISTS user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, experiment_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, benchmark_run_id VARCHAR, eval_run_id VARCHAR, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', 'CREATE TABLE IF NOT EXISTS queries(query_id VARCHAR, user_action_id VARCHAR, agent_name VARCHAR, started_at VARCHAR, turn_count BIGINT, terminal_reason VARCHAR);', @@ -54,9 +63,25 @@ function writeFixtureDb(params: { 'CREATE TABLE IF NOT EXISTS subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', 'CREATE TABLE IF NOT EXISTS recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', 'CREATE TABLE IF NOT 
EXISTS metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', - `INSERT INTO user_actions VALUES (${sqlString(params.startedAt.slice(0, 10))}, ${sqlString(params.userActionId)}, ${sqlString(params.startedAt)}, 0, ${sqlString(params.endedAt)}, 10, 10, 2, 1, 1, 0, 0, 0, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, 100, 10, 0, 0, 100, 110, 100, 0);`, - `INSERT INTO queries VALUES (${sqlString(params.queryId)}, ${sqlString(params.userActionId)}, 'main_thread', ${sqlString(params.startedAt)}, 1, 'fixture_completed');`, + 'CREATE TABLE IF NOT EXISTS events_raw(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR, query_source VARCHAR, payload_json VARCHAR);', + 'CREATE TABLE IF NOT EXISTS long_context_evidence(user_action_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, payload_json VARCHAR);', + `INSERT INTO user_actions VALUES (${sqlString(params.startedAt.slice(0, 10))}, ${sqlString(params.userActionId)}, ${sqlString(params.startedAt)}, 0, ${sqlString(params.endedAt)}, 10, 10, 2, 1, 1, 0, ${subagentCount}, ${toolCallCount}, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, ${tokenBase - 10}, 10, 0, 0, ${tokenBase - 10}, ${tokenBase}, ${tokenBase - 10}, 0);`, + `INSERT INTO queries VALUES (${sqlString(params.queryId)}, ${sqlString(params.userActionId)}, 'main_thread', ${sqlString(params.startedAt)}, ${turnCount}, 'fixture_completed');`, `INSERT INTO metrics_integrity_daily VALUES (${sqlString(params.startedAt.slice(0, 10))}, 1, 1, 1, 1);`, + ...Array.from({ length: toolCallCount }, (_, index) => + `INSERT INTO tools VALUES (${sqlString(params.userActionId)}, ${sqlString(index === 0 ? 
'Read' : 'Search')}, true, false);`, + ), + ...Array.from({ length: subagentCount }, () => + `INSERT INTO subagents VALUES (${sqlString(params.userActionId)}, 'session_memory', 'context_pressure', ${sqlString(scenarioId)}, 12);`, + ), + ...((params.longContextFixture?.events ?? []).map((event, index) => + `INSERT INTO events_raw VALUES (${sqlString(params.userActionId)}, ${sqlString(event.event_name)}, ${sqlString(new Date(new Date(params.startedAt).getTime() + index + 1).toISOString())}, 'main_thread', ${sqlString(JSON.stringify(event.payload))});`, + )), + ...(params.longContextFixture + ? [ + `INSERT INTO long_context_evidence VALUES (${sqlString(params.userActionId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(JSON.stringify(params.longContextFixture.payload))});`, + ] + : []), ].join('\n') const result = spawnSync(duckdbExe, [params.dbPath, sql], { cwd: repoRoot, @@ -84,16 +109,27 @@ async function main(): Promise { const fixtureDbPath = process.env.V2_FIXTURE_DB_PATH const fixtureVariantId = process.env.CLAUDE_CODE_EVAL_VARIANT_LABEL ?? process.env.CLAUDE_CODE_EVAL_VARIANT_ID + const scenarioId = + process.env.CLAUDE_CODE_EVAL_SCENARIO_LABEL ?? process.env.CLAUDE_CODE_EVAL_SCENARIO_ID if (process.env.V2_FIXTURE_FAIL_VARIANT === fixtureVariantId) { throw new Error(`Fixture requested failure for variant ${fixtureVariantId}`) } if (fixtureDbPath) { + const longContextFixture = + scenarioId && fixtureVariantId + ? 
await buildLongContextFixtureEvidence({ + scenarioId, + variantId: fixtureVariantId, + env: process.env as Record, + }) + : null writeFixtureDb({ dbPath: fixtureDbPath, userActionId, queryId, startedAt: now.toISOString(), endedAt, + longContextFixture, }) if (process.env.V2_FIXTURE_DUPLICATE_CAPTURE === '1') { writeFixtureDb({ diff --git a/scripts/evals/v2_harness_execution.ts b/scripts/evals/v2_harness_execution.ts index 99c83a4184..708d0bc9c7 100644 --- a/scripts/evals/v2_harness_execution.ts +++ b/scripts/evals/v2_harness_execution.ts @@ -1,7 +1,7 @@ import { spawnSync } from 'node:child_process' import { createHash, randomUUID } from 'node:crypto' -import { existsSync } from 'node:fs' -import { mkdir, readFile, writeFile } from 'node:fs/promises' +import { existsSync, unlinkSync, writeFileSync } from 'node:fs' +import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises' import path from 'node:path' import type { EvalScenario, EvalVariant } from '../../src/observability/v2/evalTypes' @@ -76,17 +76,32 @@ function sqlString(value: string): string { return `'${value.replaceAll("'", "''")}'` } -function runDuckDbSql(dbPath: string, sql: string): void { - const result = spawnSync(duckdbExe, [dbPath, sql], { +function spawnDuckDb(args: string[]) { + return spawnSync(duckdbExe, args, { cwd: repoRoot, encoding: 'utf8', }) - if (result.status !== 0) { - throw new Error( - String(result.stderr ?? '').trim() || - String(result.stdout ?? '').trim() || - String(result.error?.message ?? '').trim(), - ) +} + +function runDuckDbSql(dbPath: string, sql: string): void { + const tempSqlPath = path.join( + repoRoot, + '.observability', + `fixture_sql_${randomUUID()}.sql`, + ) + const tempSqlRef = path.relative(repoRoot, tempSqlPath).split(path.sep).join('/') + writeFileSync(tempSqlPath, `${sql}\n`, 'utf8') + try { + const result = spawnDuckDb([dbPath, `.read ${tempSqlRef}`]) + if (result.status !== 0) { + throw new Error( + String(result.stderr ?? 
'').trim() || + String(result.stdout ?? '').trim() || + String(result.error?.message ?? '').trim(), + ) + } + } finally { + unlinkSync(tempSqlPath) } } @@ -170,10 +185,7 @@ function featureGateEnvName(key: string): string { } function queryDuckDb(dbPath: string, sql: string): T[] { - const result = spawnSync(duckdbExe, ['-json', dbPath, sql], { - cwd: repoRoot, - encoding: 'utf8', - }) + const result = spawnDuckDb(['-json', dbPath, sql]) if (result.status !== 0) { const message = String(result.stderr ?? '').trim() || @@ -189,6 +201,522 @@ function escapeSqlLiteral(value: string): string { return value.replaceAll("'", "''") } +async function readJsonRecord(filePath: string): Promise { + return JSON.parse(await readFile(filePath, 'utf8')) as JsonRecord +} + +async function listJsonFiles(dir: string, recursive = false): Promise { + const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) + const files = entries + .filter(entry => entry.isFile() && entry.name.endsWith('.json')) + .map(entry => path.join(dir, entry.name)) + if (!recursive) return files + const nested = await Promise.all( + entries + .filter(entry => entry.isDirectory()) + .map(entry => listJsonFiles(path.join(dir, entry.name), true)), + ) + return [...files, ...nested.flat()] +} + +async function resolveScenarioManifestPath(scenarioId: string): Promise { + const directPath = path.join(repoRoot, 'tests', 'evals', 'v2', 'scenarios', `${scenarioId}.json`) + if (existsSync(directPath)) return directPath + const nestedFiles = await listJsonFiles( + path.join(repoRoot, 'tests', 'evals', 'v2', 'scenarios'), + true, + ) + return nestedFiles.find(filePath => path.basename(filePath) === `${scenarioId}.json`) +} + +function idsFromFixtureSection(payload: JsonRecord, key: string): string[] { + const items = payload[key] + if (!Array.isArray(items)) return [] + return items + .map(item => + item && typeof item === 'object' && typeof (item as JsonRecord).id === 'string' + ? 
String((item as JsonRecord).id) + : item && typeof item === 'object' && typeof (item as JsonRecord)[`${key.slice(0, -1)}_id`] === 'string' + ? String((item as JsonRecord)[`${key.slice(0, -1)}_id`]) + : null, + ) + .filter((value): value is string => Boolean(value)) +} + +function takeAllButLast(values: string[]): string[] { + return values.length <= 1 ? values : values.slice(0, -1) +} + +function nonEmptyLines(value: string): string[] { + return value + .split(/\r?\n/) + .map(line => line.trim()) + .filter(Boolean) +} + +function isBulletLine(line: string): boolean { + return /^[-*]\s+/.test(line) +} + +function parseCliPrintResultText(stdoutText: string): string | null { + const trimmed = stdoutText.trim() + if (!trimmed) return null + + const parseCandidate = (candidate: string): string | null => { + try { + const parsed = JSON.parse(candidate) as unknown + if ( + parsed && + typeof parsed === 'object' && + !Array.isArray(parsed) && + typeof (parsed as JsonRecord).result === 'string' + ) { + return String((parsed as JsonRecord).result) + } + } catch { + return null + } + return null + } + + const direct = parseCandidate(trimmed) + if (direct) return direct + + const lines = trimmed + .split(/\r?\n/) + .map(line => line.trim()) + .filter(Boolean) + for (let index = lines.length - 1; index >= 0; index -= 1) { + const fromLine = parseCandidate(lines[index]) + if (fromLine) return fromLine + } + + return trimmed +} + +function supportsRetainedConstraintId(constraintId: string): boolean { + return ['four_bullets_only', 'read_only_task'].includes(constraintId) +} + +function supportsRetrievedFactId(factId: string): boolean { + return [ + 'cli_entrypoint_cli_tsx', + 'capture_key_benchmark_run_id', + 'experiment_summary_dir', + ].includes(factId) +} + +function supportsConfusionId(confusionId: string): boolean { + return [ + 'old_entrypoint_main_tsx', + 'fake_capture_key_latest_action', + ].includes(confusionId) +} + +function evaluateRetainedConstraint( + constraintId: 
string, + answerText: string, + answerLines: string[], +): boolean | null { + const lower = answerText.toLowerCase() + switch (constraintId) { + case 'four_bullets_only': + return answerLines.length === 4 && answerLines.every(isBulletLine) + case 'read_only_task': + return ( + lower.includes('read-only') || + lower.includes('read only') || + lower.includes('do not modify files') || + lower.includes('do not modify file') + ) + default: + return null + } +} + +function evaluateRetrievedFact(factId: string, answerText: string): boolean | null { + switch (factId) { + case 'cli_entrypoint_cli_tsx': + return answerText.includes('src/entrypoints/cli.tsx') + case 'capture_key_benchmark_run_id': + return answerText.includes('benchmark_run_id') + case 'experiment_summary_dir': + return answerText.includes('tests/evals/v2/experiment-runs/') + default: + return null + } +} + +function evaluateForbiddenConfusion(confusionId: string, answerText: string): boolean | null { + const lower = answerText.toLowerCase() + switch (confusionId) { + case 'old_entrypoint_main_tsx': + return answerText.includes('src/main.tsx') + case 'fake_capture_key_latest_action': + return ( + /latest\s+user_action_id/i.test(answerText) || + /latest\s+action\s*id/i.test(answerText) || + lower.includes('latest action id') + ) + default: + return null + } +} + +async function buildLongContextRealOutputEvidence(params: { + scenario: EvalScenario + variantId: string + stdoutRef: string +}): Promise { + const profile = params.scenario.long_context_profile + if (!profile) return null + + const stdoutPath = path.resolve(repoRoot, params.stdoutRef) + const stdoutText = await readFile(stdoutPath, 'utf8') + const answerText = parseCliPrintResultText(stdoutText) + + const payload: JsonRecord = { + parser_version: 'candidate_long_context_output_parser_v0', + parser_mode: 'real_smoke_rule_based', + parser_status: answerText ? 
'parsed' : 'unparsed', + variant_id: params.variantId, + observed_output_excerpt: answerText?.trim().slice(0, 240) ?? '', + supported_constraint_ids: profile.expected_retained_constraints.filter( + supportsRetainedConstraintId, + ), + supported_fact_ids: profile.expected_retrieved_facts.filter(supportsRetrievedFactId), + supported_confusion_ids: profile.forbidden_confusions.filter(supportsConfusionId), + manual_review_required: profile.manual_review_questions.length > 0, + } + + if (!answerText) { + return payload + } + + const answerLines = nonEmptyLines(answerText) + const observedRetainedConstraints: string[] = [] + const observedLostConstraints: string[] = [] + const observedRetrievedFacts: string[] = [] + const observedMissedFacts: string[] = [] + const observedConfusions: string[] = [] + + for (const constraintId of profile.expected_retained_constraints) { + const observed = evaluateRetainedConstraint(constraintId, answerText, answerLines) + if (observed === true) observedRetainedConstraints.push(constraintId) + if (observed === false) observedLostConstraints.push(constraintId) + } + + for (const factId of profile.expected_retrieved_facts) { + const observed = evaluateRetrievedFact(factId, answerText) + if (observed === true) observedRetrievedFacts.push(factId) + if (observed === false) observedMissedFacts.push(factId) + } + + for (const confusionId of profile.forbidden_confusions) { + const observed = evaluateForbiddenConfusion(confusionId, answerText) + if (observed === true) observedConfusions.push(confusionId) + } + + payload.observed_retained_constraints = observedRetainedConstraints + payload.observed_lost_constraints = observedLostConstraints + payload.observed_retrieved_facts = observedRetrievedFacts + payload.observed_missed_facts = observedMissedFacts + payload.observed_confusions = observedConfusions + return payload +} + +function upsertLongContextEvidence(params: { + dbPath?: string + userActionId: string + scenarioId: string + variantId: string 
+ payload: JsonRecord +}): void { + const targetDbPath = params.dbPath ?? defaultDbPath + runDuckDbSql( + targetDbPath, + [ + 'CREATE TABLE IF NOT EXISTS long_context_evidence(user_action_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, payload_json VARCHAR);', + `DELETE FROM long_context_evidence WHERE user_action_id = ${sqlString(params.userActionId)};`, + `INSERT INTO long_context_evidence VALUES (${sqlString(params.userActionId)}, ${sqlString(params.scenarioId)}, ${sqlString(params.variantId)}, ${sqlString(JSON.stringify(params.payload))});`, + ].join('\n'), + ) +} + +export async function buildLongContextFixtureEvidence(params: { + scenarioId: string + variantId: string + env: Record +}): Promise<{ + payload: JsonRecord + tokenBase: number + turnCount: number + subagentCount: number + toolCallCount: number + events: Array<{ event_name: string; payload: JsonRecord }> + } | null> { + const manifestPath = await resolveScenarioManifestPath(params.scenarioId) + if (!manifestPath) return null + const scenario = await readJsonRecord(manifestPath) as EvalScenario + const profile = scenario.long_context_profile + if (!profile) return null + + const fixtureDir = path.resolve(repoRoot, profile.fixture_ref) + const criticalFactsPayload = await readJsonRecord(path.join(fixtureDir, 'critical_facts.json')) + const constraintsPayload = await readJsonRecord(path.join(fixtureDir, 'constraints.json')) + const distractorsPayload = await readJsonRecord(path.join(fixtureDir, 'distractors.json')) + const expectedOutput = await readFile(path.join(fixtureDir, 'expected_output.md'), 'utf8') + const observedMode = + params.env.V2_FIXTURE_VARIANT_KIND ?? + (params.variantId === 'baseline_default' + ? 'baseline' + : params.variantId.includes('guarded') + ? 'long_context_guarded' + : params.variantId.includes('sparse') + ? 'sparse' + : 'baseline') + + const expectedConstraints = + profile.expected_retained_constraints.length > 0 + ? 
profile.expected_retained_constraints + : idsFromFixtureSection(constraintsPayload, 'constraints') + const expectedFacts = + profile.expected_retrieved_facts.length > 0 + ? profile.expected_retrieved_facts + : idsFromFixtureSection(criticalFactsPayload, 'facts') + const distractorIds = + profile.distractor_refs.length > 0 + ? profile.distractor_refs + : idsFromFixtureSection(distractorsPayload, 'distractors') + + let observedRetainedConstraints = [...expectedConstraints] + let observedLostConstraints: string[] = [] + let observedRetrievedFacts = [...expectedFacts] + let observedMissedFacts: string[] = [] + let observedConfusions: string[] = [] + let compactionTriggerCount = 0 + let toolResultBudgetTriggerCount = 0 + let compactionSavedTokens = 0 + let tokenBase = 1180 + let turnCount = 3 + let subagentCount = 0 + let toolCallCount = 0 + let successUnderContextPressure = 1 + + switch (profile.context_family) { + case 'constraint_retention': + tokenBase = observedMode === 'baseline' ? 1280 : 1090 + if (observedMode === 'baseline') { + observedLostConstraints = expectedConstraints.length > 0 ? [expectedConstraints.at(-1) as string] : [] + observedRetainedConstraints = takeAllButLast(expectedConstraints) + } + break + case 'retrieval': + tokenBase = observedMode === 'baseline' ? 1360 : 1140 + if (observedMode === 'baseline') { + observedMissedFacts = expectedFacts.length > 0 ? [expectedFacts.at(-1) as string] : [] + observedRetrievedFacts = takeAllButLast(expectedFacts) + } + break + case 'distractor_resistance': + tokenBase = observedMode === 'baseline' ? 1320 : 1120 + if (observedMode === 'baseline') { + observedConfusions = distractorIds.slice(0, 1) + } + break + case 'compaction_pressure': + tokenBase = observedMode === 'baseline' ? 1640 : 1240 + turnCount = 5 + subagentCount = observedMode === 'baseline' ? 1 : 1 + toolCallCount = 2 + compactionTriggerCount = observedMode === 'baseline' ? 
2 : 2 + toolResultBudgetTriggerCount = 1 + compactionSavedTokens = observedMode === 'baseline' ? 42 : 188 + if (observedMode === 'baseline') { + observedLostConstraints = expectedConstraints.length > 0 ? [expectedConstraints.at(-1) as string] : [] + observedRetainedConstraints = takeAllButLast(expectedConstraints) + observedMissedFacts = expectedFacts.length > 0 ? [expectedFacts.at(-1) as string] : [] + observedRetrievedFacts = takeAllButLast(expectedFacts) + successUnderContextPressure = 0 + } + break + } + + if (observedMode !== 'baseline') { + observedRetainedConstraints = [...expectedConstraints] + observedLostConstraints = [] + observedRetrievedFacts = [...expectedFacts] + observedMissedFacts = [] + observedConfusions = [] + } + + const payload: JsonRecord = { + context_family: profile.context_family, + context_size_class: profile.context_size_class, + fixture_ref: profile.fixture_ref, + expected_retained_constraints: expectedConstraints, + expected_retrieved_facts: expectedFacts, + distractor_refs: distractorIds, + forbidden_confusions: profile.forbidden_confusions, + manual_review_questions: profile.manual_review_questions, + observed_retained_constraints: observedRetainedConstraints, + observed_lost_constraints: observedLostConstraints, + observed_retrieved_facts: observedRetrievedFacts, + observed_missed_facts: observedMissedFacts, + observed_confusions: observedConfusions, + compaction_trigger_count: compactionTriggerCount, + compaction_saved_tokens: compactionSavedTokens, + tool_result_budget_trigger_count: toolResultBudgetTriggerCount, + memory_or_subagent_count: subagentCount, + success_under_context_pressure: successUnderContextPressure, + manual_review_required: profile.manual_review_questions.length > 0, + expected_output_excerpt: expectedOutput.trim().slice(0, 240), + observed_mode: observedMode, + } + + const events: Array<{ event_name: string; payload: JsonRecord }> = [] + for (let index = 0; index < compactionTriggerCount; index += 1) { + 
events.push({ + event_name: index === 0 ? 'messages.compact_boundary.applied' : 'messages.microcompact.applied', + payload: { + tokens_saved: + compactionTriggerCount <= 1 + ? compactionSavedTokens + : Math.floor(compactionSavedTokens / compactionTriggerCount), + }, + }) + } + for (let index = 0; index < toolResultBudgetTriggerCount; index += 1) { + events.push({ + event_name: 'messages.tool_result_budget.applied', + payload: { + tokens_saved: 0, + }, + }) + } + + return { + payload, + tokenBase, + turnCount, + subagentCount, + toolCallCount, + events, + } +} + +async function runFixtureEmitterViaBridge(params: { + env: Record + runDir: string + timeoutMs: number +}): Promise<{ + status: HarnessExecutionAdapterOutput['status'] + stdoutRef: string + stderrRef: string + error?: string +}> { + const stdoutPath = path.join(params.runDir, 'stdout.txt') + const stderrPath = path.join(params.runDir, 'stderr.txt') + const commandPath = path.join(params.runDir, 'command.json') + const launcherRequestPath = path.join(params.runDir, 'launcher-request.json') + const launcherResultPath = path.join(params.runDir, 'launcher-result.json') + const command = bunExe + const args = ['run', 'scripts/evals/v2_emit_fixture_trace.ts'] + + await writeFile( + commandPath, + `${JSON.stringify( + { + adapter: 'fixture_trace', + transport: 'external_emitter', + command, + args, + launcher_bridge_ref: path.relative(repoRoot, windowsLauncherBridgePath), + launcher_request_ref: path.relative(repoRoot, launcherRequestPath), + timeout_ms: params.timeoutMs, + env_keys: Object.keys(params.env).sort(), + }, + null, + 2, + )}\n`, + 'utf8', + ) + await writeFile( + launcherRequestPath, + `${JSON.stringify( + { + command, + args, + cwd: repoRoot, + env: params.env, + timeout_ms: params.timeoutMs, + }, + null, + 2, + )}\n`, + 'utf8', + ) + + const bridgeResult = spawnSync( + nodeExe, + [windowsLauncherBridgePath, '--request', launcherRequestPath, '--result', launcherResultPath], + { + cwd: repoRoot, + 
encoding: 'utf8', + timeout: params.timeoutMs + 10_000, + }, + ) + + let stdoutText = '' + let stderrText = '' + let status: HarnessExecutionAdapterOutput['status'] = 'completed' + let errorText = '' + + if (bridgeResult.status !== 0 && !existsSync(launcherResultPath)) { + stdoutText = String(bridgeResult.stdout ?? '') + stderrText = String(bridgeResult.stderr ?? bridgeResult.error?.message ?? '') + errorText = + stderrText.trim() || + stdoutText.trim() || + `fixture emitter bridge exited with status ${bridgeResult.status}` + status = bridgeResult.error?.name === 'ETIMEDOUT' ? 'timeout' : 'failed' + } else { + const launcherPayload = JSON.parse(await readFile(launcherResultPath, 'utf8')) as { + child_status?: number | null + stdout?: string + stderr?: string + error_name?: string | null + error_message?: string | null + timed_out?: boolean + signal?: string | null + } + stdoutText = String(launcherPayload.stdout ?? '') + stderrText = String(launcherPayload.stderr ?? launcherPayload.error_message ?? '') + if (launcherPayload.timed_out) { + status = 'timeout' + errorText = launcherPayload.error_message ?? 'fixture emitter bridge timed out' + } else if ((launcherPayload.child_status ?? 0) !== 0) { + status = 'failed' + errorText = + String(launcherPayload.stderr ?? '').trim() || + String(launcherPayload.stdout ?? '').trim() || + String(launcherPayload.error_message ?? '').trim() || + (launcherPayload.signal + ? 
`fixture emitter terminated by signal ${launcherPayload.signal}` + : `fixture emitter exited with status ${launcherPayload.child_status}`) + } + } + + await writeFile(stdoutPath, stdoutText, 'utf8') + await writeFile(stderrPath, stderrText, 'utf8') + return { + status, + stdoutRef: path.relative(repoRoot, stdoutPath), + stderrRef: path.relative(repoRoot, stderrPath), + error: errorText || undefined, + } +} + function relationColumns(dbPath: string, relation: string): string[] { const rows = queryDuckDb<{ name?: string }>( dbPath, @@ -549,6 +1077,14 @@ export class FixtureTraceHarnessExecutionAdapter implements HarnessExecutionAdap } } + if (process.platform === 'win32') { + return runFixtureEmitterViaBridge({ + env: this.options.env, + runDir, + timeoutMs: input.timeoutMs, + }) + } + const now = new Date() const endedAt = new Date(now.getTime() + 10).toISOString() const userActionId = randomUUID() @@ -559,12 +1095,21 @@ export class FixtureTraceHarnessExecutionAdapter implements HarnessExecutionAdap this.options.env.CLAUDE_CODE_EVAL_EXPERIMENT_LABEL ?? input.experimentId const scenarioId = this.options.env.CLAUDE_CODE_EVAL_SCENARIO_LABEL ?? input.scenarioId const variantId = this.options.env.CLAUDE_CODE_EVAL_VARIANT_LABEL ?? input.variantId + const longContextFixture = await buildLongContextFixtureEvidence({ + scenarioId, + variantId, + env: this.options.env, + }) const tokenBase = - input.variantId === 'baseline_default' + longContextFixture?.tokenBase ?? + (input.variantId === 'baseline_default' ? 110 : input.variantId.includes('sparse') ? 100 - : 105 + : 105) + const turnCount = longContextFixture?.turnCount ?? 1 + const subagentCount = longContextFixture?.subagentCount ?? 0 + const toolCallCount = longContextFixture?.toolCallCount ?? 
0 const sql = [ 'CREATE TABLE IF NOT EXISTS user_actions(event_date VARCHAR, user_action_id VARCHAR, started_at VARCHAR, started_at_ms BIGINT, ended_at VARCHAR, ended_at_ms BIGINT, duration_ms BIGINT, event_count BIGINT, query_count BIGINT, main_thread_query_count BIGINT, subagent_query_count BIGINT, subagent_count BIGINT, tool_call_count BIGINT, experiment_id VARCHAR, scenario_id VARCHAR, variant_id VARCHAR, benchmark_run_id VARCHAR, eval_run_id VARCHAR, raw_input_tokens BIGINT, output_tokens BIGINT, cache_read_tokens BIGINT, cache_create_tokens BIGINT, total_prompt_input_tokens BIGINT, total_billed_tokens BIGINT, main_thread_total_prompt_input_tokens BIGINT, subagent_total_prompt_input_tokens BIGINT);', @@ -573,13 +1118,32 @@ export class FixtureTraceHarnessExecutionAdapter implements HarnessExecutionAdap 'CREATE TABLE IF NOT EXISTS subagents(user_action_id VARCHAR, subagent_reason VARCHAR, subagent_trigger_kind VARCHAR, subagent_trigger_detail VARCHAR, duration_ms BIGINT);', 'CREATE TABLE IF NOT EXISTS recoveries(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR);', 'CREATE TABLE IF NOT EXISTS metrics_integrity_daily(event_date VARCHAR, strict_query_completion_rate DOUBLE, strict_turn_state_closure_rate DOUBLE, tool_lifecycle_closure_rate DOUBLE, subagent_lifecycle_closure_rate DOUBLE);', - `INSERT INTO user_actions VALUES (${sqlString(now.toISOString().slice(0, 10))}, ${sqlString(userActionId)}, ${sqlString(now.toISOString())}, 0, ${sqlString(endedAt)}, 10, 10, 2, 1, 1, 0, 0, 0, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, ${tokenBase - 10}, 10, 0, 0, ${tokenBase - 10}, ${tokenBase}, ${tokenBase - 10}, 0);`, - `INSERT INTO queries VALUES (${sqlString(queryId)}, ${sqlString(userActionId)}, 'main_thread', ${sqlString(now.toISOString())}, 1, 'fixture_completed');`, + 'CREATE TABLE IF NOT EXISTS events_raw(user_action_id VARCHAR, event_name VARCHAR, ts_wall VARCHAR, 
query_source VARCHAR, payload_json VARCHAR);', + `INSERT INTO user_actions VALUES (${sqlString(now.toISOString().slice(0, 10))}, ${sqlString(userActionId)}, ${sqlString(now.toISOString())}, 0, ${sqlString(endedAt)}, 10, 10, 2, 1, 1, 0, ${subagentCount}, ${toolCallCount}, ${sqlString(experimentId)}, ${sqlString(scenarioId)}, ${sqlString(variantId)}, ${sqlString(benchmarkRunId)}, ${sqlString(evalRunId)}, ${tokenBase - 10}, 10, 0, 0, ${tokenBase - 10}, ${tokenBase}, ${tokenBase - 10}, 0);`, + `INSERT INTO queries VALUES (${sqlString(queryId)}, ${sqlString(userActionId)}, 'main_thread', ${sqlString(now.toISOString())}, ${turnCount}, 'fixture_completed');`, `INSERT INTO metrics_integrity_daily VALUES (${sqlString(now.toISOString().slice(0, 10))}, 1, 1, 1, 1);`, + ...Array.from({ length: toolCallCount }, (_, index) => + `INSERT INTO tools VALUES (${sqlString(userActionId)}, ${sqlString(index === 0 ? 'Read' : 'Search')}, true, false);`, + ), + ...Array.from({ length: subagentCount }, () => + `INSERT INTO subagents VALUES (${sqlString(userActionId)}, 'session_memory', 'context_pressure', ${sqlString(scenarioId)}, 12);`, + ), + ...(longContextFixture?.events ?? []).map((event, index) => + `INSERT INTO events_raw VALUES (${sqlString(userActionId)}, ${sqlString(event.event_name)}, ${sqlString(new Date(now.getTime() + index + 1).toISOString())}, 'main_thread', ${sqlString(JSON.stringify(event.payload))});`, + ), ].join('\n') try { runDuckDbSql(dbPath, sql) + if (longContextFixture) { + upsertLongContextEvidence({ + dbPath, + userActionId, + scenarioId, + variantId, + payload: longContextFixture.payload, + }) + } await writeFile(stdoutPath, `fixture_user_action_id=${userActionId}\n`, 'utf8') await writeFile(stderrPath, '', 'utf8') return { @@ -739,6 +1303,29 @@ export async function executeHarnessAndCapture(params: { match_count: 0, error: execution.error ?? 
`Harness execution did not complete: ${execution.status}`, } + + if ( + execution.status === 'completed' && + capture.status === 'captured' && + params.execution?.adapter !== 'fixture_trace' && + params.scenario.long_context_profile && + execution.stdoutRef + ) { + const realLongContextPayload = await buildLongContextRealOutputEvidence({ + scenario: params.scenario, + variantId: params.variant.variant_id, + stdoutRef: execution.stdoutRef, + }) + if (realLongContextPayload) { + upsertLongContextEvidence({ + dbPath: params.dbPath, + userActionId: capture.user_action_id, + scenarioId: params.scenario.scenario_id, + variantId: params.variant.variant_id, + payload: realLongContextPayload, + }) + } + } return { execution, capture, diff --git a/scripts/evals/v2_record_run.ts b/scripts/evals/v2_record_run.ts index d517487a9c..71849acd2f 100644 --- a/scripts/evals/v2_record_run.ts +++ b/scripts/evals/v2_record_run.ts @@ -97,6 +97,11 @@ function parseJsonRecord(value: unknown): JsonRecord | undefined { return undefined } +function mergeJsonRecords(...records: Array): JsonRecord | undefined { + const merged = Object.assign({}, ...records.filter(Boolean)) + return Object.keys(merged).length > 0 ? merged : undefined +} + function uniqueStrings(values: string[]): string[] { return [...new Set(values.filter(Boolean))] } @@ -158,6 +163,20 @@ async function loadScenario(scenarioId: string): Promise { try { return await readJson(directPath) } catch { + const nestedScenarioDir = path.join(evalRoot, 'scenarios') + const nestedEntries = await readdir(nestedScenarioDir, { withFileTypes: true }).catch( + () => [], + ) + for (const entry of nestedEntries) { + if (!entry.isDirectory()) continue + const nestedPath = path.join(nestedScenarioDir, entry.name, `${scenarioId}.json`) + try { + return await readJson(nestedPath) + } catch { + // Keep searching nested directories before falling back to the catalog shell. 
+ } + } + // The phase-one catalog stores scenario shells before full manifests exist. } @@ -214,6 +233,7 @@ function buildReport(params: { subagents: JsonRecord[] recoveries: JsonRecord[] variantEffect: JsonRecord + longContext?: JsonRecord scores: EvalScore[] }): string { const { @@ -226,6 +246,7 @@ function buildReport(params: { subagents, recoveries, variantEffect, + longContext, scores, } = params const toolSummary = @@ -255,6 +276,22 @@ function buildReport(params: { const policySummary = variantEffect.observed_policy ? JSON.stringify(variantEffect.observed_policy, null, 2) : 'null' + const longContextSummary = longContext + ? `- context_family: ${asString(longContext.context_family) || 'unknown'} +- context_size_class: ${asString(longContext.context_size_class) || 'unknown'} +- fixture_ref: ${asString(longContext.fixture_ref) || 'n/a'} +- retained_constraints: ${(longContext.observed_retained_constraints as string[] | undefined)?.join(', ') || 'none'} +- lost_constraints: ${(longContext.observed_lost_constraints as string[] | undefined)?.join(', ') || 'none'} +- retrieved_facts: ${(longContext.observed_retrieved_facts as string[] | undefined)?.join(', ') || 'none'} +- missed_facts: ${(longContext.observed_missed_facts as string[] | undefined)?.join(', ') || 'none'} +- distractor_confusions: ${(longContext.observed_confusions as string[] | undefined)?.join(', ') || 'none'} +- compaction_trigger_count: ${asNumber(longContext.compaction_trigger_count)} +- compaction_saved_tokens: ${asNumber(longContext.compaction_saved_tokens)} +- tool_result_budget_trigger_count: ${asNumber(longContext.tool_result_budget_trigger_count)} +- memory_or_subagent_count: ${asNumber(longContext.memory_or_subagent_count)} +- success_under_context_pressure: ${longContext.success_under_context_pressure ?? 'n/a'} +- manual_review_questions: ${(longContext.manual_review_questions as string[] | undefined)?.join(' | ') || 'none'}` + : '- No long-context evidence attached to this run.' 
return `# V2 Run Report: ${run.run_id} @@ -315,6 +352,10 @@ ${subagentSummary} ${policySummary} \`\`\` +## Long Context Evidence + +${longContextSummary} + ## Scores ${scoreSummary} @@ -385,6 +426,37 @@ async function main(): Promise { dbPath, `SELECT * FROM metrics_integrity_daily WHERE event_date = ${sqlString(asString(action.event_date))} LIMIT 1;`, )[0] + const longContextEvidenceRow = relationExists(dbPath, 'long_context_evidence') + ? queryDuckDb( + dbPath, + `SELECT payload_json FROM long_context_evidence WHERE user_action_id = ${sqlString(userActionId)} ORDER BY rowid DESC LIMIT 1;`, + )[0] + : undefined + const longContextPayload = parseJsonRecord(longContextEvidenceRow?.payload_json) + const eventRows = relationExists(dbPath, 'events_raw') + ? queryDuckDb( + dbPath, + [ + 'SELECT event_name, payload_json', + 'FROM events_raw', + `WHERE user_action_id = ${sqlString(userActionId)}`, + " AND event_name IN ('messages.compact_boundary.applied', 'messages.microcompact.applied', 'messages.tool_result_budget.applied')", + 'ORDER BY ts_wall ASC;', + ].join(' '), + ) + : [] + const compactionTriggerCount = eventRows.filter(row => + ['messages.compact_boundary.applied', 'messages.microcompact.applied'].includes( + asString(row.event_name), + ), + ).length + const toolResultBudgetTriggerCount = eventRows.filter( + row => asString(row.event_name) === 'messages.tool_result_budget.applied', + ).length + const compactionSavedTokens = eventRows.reduce((sum, row) => { + const payload = parseJsonRecord(row.payload_json) + return sum + asNumber(payload?.tokens_saved) + }, 0) const sessionMemoryPolicyRow = relationExists(dbPath, 'events_raw') ? queryDuckDb( dbPath, @@ -404,6 +476,29 @@ async function main(): Promise { asString(subagent.subagent_trigger_detail), ), ) + const longContext = scenario.long_context_profile + ? 
mergeJsonRecords( + { + context_family: scenario.long_context_profile.context_family, + context_size_class: scenario.long_context_profile.context_size_class, + fixture_ref: scenario.long_context_profile.fixture_ref, + expected_retained_constraints: + scenario.long_context_profile.expected_retained_constraints, + expected_retrieved_facts: + scenario.long_context_profile.expected_retrieved_facts, + distractor_refs: scenario.long_context_profile.distractor_refs, + forbidden_confusions: scenario.long_context_profile.forbidden_confusions, + manual_review_questions: + scenario.long_context_profile.manual_review_questions, + compaction_trigger_count: compactionTriggerCount, + compaction_saved_tokens: compactionSavedTokens, + tool_result_budget_trigger_count: toolResultBudgetTriggerCount, + memory_or_subagent_count: asNumber(action.subagent_count), + total_prompt_input_tokens: asNumber(action.total_prompt_input_tokens), + }, + longContextPayload, + ) + : undefined const variantEffect: JsonRecord = { effect_type: 'session_memory_policy', policy_event_observed: observedPolicy !== undefined, @@ -471,6 +566,7 @@ async function main(): Promise { subagents, recoveries, variantEffect, + longContext, }, requestedScoreSpecIds) const runsDir = path.join(evalRoot, 'runs') @@ -481,7 +577,7 @@ async function main(): Promise { await writeFile( path.join(runsDir, `${runId}.json`), - `${JSON.stringify({ run, binding, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries }, variant_effect: variantEffect }, null, 2)}\n`, + `${JSON.stringify({ run, binding, scenario, variant, evidence: { action, rootQuery, tools, subagents, recoveries }, variant_effect: variantEffect, long_context: longContext ?? 
null }, null, 2)}\n`, ) await writeFile( path.join(scoresDir, `${runId}.scores.json`), @@ -499,6 +595,7 @@ async function main(): Promise { subagents, recoveries, variantEffect, + longContext, scores, }), ) diff --git a/scripts/evals/v2_run_experiment.ts b/scripts/evals/v2_run_experiment.ts index b224a4d12c..5d7e6763ba 100644 --- a/scripts/evals/v2_run_experiment.ts +++ b/scripts/evals/v2_run_experiment.ts @@ -1,4 +1,5 @@ import { spawnSync } from 'node:child_process' +import { randomUUID } from 'node:crypto' import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises' import path from 'node:path' @@ -18,11 +19,14 @@ import type { EvalScoreSpecCollection, } from '../../src/observability/v2/evalExperimentTypes' import { + applyVariantV0, + buildLongContextFixtureEvidence, createRunIdentity, executeHarnessAndCapture, isExecuteHarnessDisabled, type ExecuteHarnessResult, } from './v2_harness_execution' +import { buildScoresForSpecIds } from './v2_score_registry' type JsonRecord = Record type ExperimentProfile = 'smoke' | 'real_experiment' @@ -65,6 +69,36 @@ interface ExperimentValidity { } } +type LongContextReviewVerdict = + | 'pass' + | 'warning' + | 'needs_manual_review' + | 'invalid' + +interface LongContextSummaryItem { + scenario_id: string + candidate_variant_id: string + repeat_count: number + context_family: string + context_size_class: string + retained_constraint_mean: number | null + lost_constraint_mean: number | null + constraint_retention_rate_mean: number | null + retrieved_fact_mean: number | null + missed_fact_mean: number | null + retrieved_fact_hit_rate_mean: number | null + distractor_confusion_mean: number | null + compaction_trigger_mean: number | null + compaction_saved_tokens_mean: number | null + tool_result_budget_trigger_mean: number | null + total_prompt_input_tokens_mean: number | null + prompt_token_delta_mean: number | null + success_under_context_pressure_rate: number | null + manual_review_required: boolean + 
manual_review_questions: string[] + interpretation: string[] +} + interface CandidateExperimentResult { candidate_variant_id: string candidate_run_group_id: string @@ -235,6 +269,15 @@ function asStringArray(value: unknown): string[] { return value.filter((item): item is string => typeof item === 'string' && item.length > 0) } +function asJsonRecord(value: unknown): JsonRecord | undefined { + if (!value || typeof value !== 'object' || Array.isArray(value)) return undefined + return value as JsonRecord +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values.filter(Boolean))] +} + function sanitizeId(value: string): string { return value.replace(/[^a-zA-Z0-9_-]+/g, '_').replace(/^_+|_+$/g, '') } @@ -251,11 +294,18 @@ function createRunGroupId(params: { return base.length > 160 ? base.slice(0, 160) : base } -async function listJsonFiles(dir: string): Promise { +async function listJsonFiles(dir: string, recursive = false): Promise { const entries = await readdir(dir, { withFileTypes: true }).catch(() => []) - return entries + const files = entries .filter(entry => entry.isFile() && entry.name.endsWith('.json')) .map(entry => path.join(dir, entry.name)) + if (!recursive) return files + const nested = await Promise.all( + entries + .filter(entry => entry.isDirectory()) + .map(entry => listJsonFiles(path.join(dir, entry.name), true)), + ) + return [...files, ...nested.flat()] } async function findChildDir(parent: string, matcher: (name: string) => boolean) { @@ -302,10 +352,15 @@ async function loadGatePolicy(gatePolicyId?: string): Promise { - const filePath = path.join(evalRoot, 'scenarios', `${scenarioId}.json`) + const directPath = path.join(evalRoot, 'scenarios', `${scenarioId}.json`) try { - return await readJson(filePath) + return await readJson(directPath) } catch { + const nestedFiles = await listJsonFiles(path.join(evalRoot, 'scenarios'), true) + for (const filePath of nestedFiles) { + if (path.basename(filePath) !== 
`${scenarioId}.json`) continue + return await readJson(filePath) + } throw new Error(`Scenario not found: ${scenarioId}`) } } @@ -841,6 +896,283 @@ function reportRefs(params: { ].filter(Boolean) } +function syntheticRunId(params: { + scenarioId: string + variantId: string + userActionId: string +}): string { + return sanitizeId( + `run_${new Date().toISOString().replaceAll(':', '').replaceAll('.', '')}_${params.scenarioId}_${params.variantId}_${params.userActionId.slice(0, 8)}`, + ) +} + +async function synthesizeFixtureRun(params: { + experiment: EvalExperimentV21 + scenario: EvalScenario + variant: EvalVariant + runGroupId: string + repeatIndex: number + scoreSpecIds: string[] +}): Promise<{ + runId: string + userActionId: string + scores: EvalScore[] + runArtifact: RunArtifact + execution: ExecuteHarnessResult +}> { + const now = new Date() + const startedAt = now.toISOString() + const endedAt = new Date(now.getTime() + 10).toISOString() + const userActionId = randomUUID() + const queryId = randomUUID() + const identity = createRunIdentity({ + experimentId: params.experiment.experiment_id, + scenarioId: params.scenario.scenario_id, + variantId: params.variant.variant_id, + stamp: now.toISOString().replace(/[:.]/g, ''), + repeatIndex: params.repeatIndex, + }) + const variantApply = applyVariantV0({ + variant: params.variant, + execution: params.experiment.execution, + context: { + experiment_id: params.experiment.experiment_id, + scenario_id: params.scenario.scenario_id, + variant_id: params.variant.variant_id, + benchmark_run_id: identity.benchmark_run_id, + eval_run_id: identity.eval_run_id, + }, + }) + const longContextFixture = await buildLongContextFixtureEvidence({ + scenarioId: params.scenario.scenario_id, + variantId: params.variant.variant_id, + env: variantApply.env, + }) + const tokenBase = + longContextFixture?.tokenBase ?? + (params.variant.variant_id === 'baseline_default' + ? 110 + : params.variant.variant_id.includes('sparse') + ? 
100 + : params.variant.variant_id.includes('shadow') + ? 105 + : params.variant.variant_id.includes('guarded') + ? 98 + : 104) + const turnCount = longContextFixture?.turnCount ?? 1 + const subagentCount = longContextFixture?.subagentCount ?? 0 + const toolCallCount = longContextFixture?.toolCallCount ?? 0 + const action: JsonRecord = { + event_date: startedAt.slice(0, 10), + user_action_id: userActionId, + started_at: startedAt, + ended_at: endedAt, + duration_ms: 10, + subagent_count: subagentCount, + tool_call_count: toolCallCount, + total_billed_tokens: tokenBase, + total_prompt_input_tokens: tokenBase - 10, + raw_input_tokens: tokenBase - 10, + output_tokens: 10, + cache_read_tokens: 0, + cache_create_tokens: 0, + main_thread_total_prompt_input_tokens: tokenBase - 10, + subagent_total_prompt_input_tokens: 0, + } + const rootQuery: JsonRecord = { + query_id: queryId, + turn_count: turnCount, + terminal_reason: 'fixture_completed', + } + const tools = Array.from({ length: toolCallCount }, (_, index) => ({ + tool_name: index === 0 ? 'Read' : 'Search', + is_closed: true, + has_failed: false, + })) + const subagents = Array.from({ length: subagentCount }, () => ({ + subagent_count: 1, + subagent_reason: 'session_memory', + subagent_trigger_kind: 'context_pressure', + subagent_trigger_detail: params.scenario.scenario_id, + })) + const recoveries: JsonRecord[] = [] + const integrity: JsonRecord = { + strict_query_completion_rate: 1, + strict_turn_state_closure_rate: 1, + tool_lifecycle_closure_rate: 1, + subagent_lifecycle_closure_rate: 1, + } + const longContext = + longContextFixture?.payload && params.scenario.long_context_profile + ? 
{ + context_family: params.scenario.long_context_profile.context_family, + context_size_class: params.scenario.long_context_profile.context_size_class, + fixture_ref: params.scenario.long_context_profile.fixture_ref, + expected_retained_constraints: + params.scenario.long_context_profile.expected_retained_constraints, + expected_retrieved_facts: + params.scenario.long_context_profile.expected_retrieved_facts, + distractor_refs: params.scenario.long_context_profile.distractor_refs, + forbidden_confusions: params.scenario.long_context_profile.forbidden_confusions, + manual_review_questions: + params.scenario.long_context_profile.manual_review_questions, + total_prompt_input_tokens: tokenBase - 10, + ...longContextFixture.payload, + } + : null + const variantEffect: JsonRecord = { + effect_type: 'fixture_variant', + policy_event_observed: false, + variant_effect_observed: params.variant.variant_id.includes('sparse'), + observed_policy: null, + session_memory_subagent_count: subagentCount, + session_memory_trigger_details: longContextFixture + ? 
[params.scenario.scenario_id] + : [], + } + const runId = syntheticRunId({ + scenarioId: params.scenario.scenario_id, + variantId: params.variant.variant_id, + userActionId, + }) + const binding = { + binding_mode: 'fact_only' as const, + entry_user_action_id: userActionId, + root_query_id: String(rootQuery.query_id), + observability_db_ref: 'fixture_trace://synthetic', + bind_passed: true, + binding_failure_reason: null, + } + const run = { + run_id: runId, + scenario_id: params.scenario.scenario_id, + variant_id: params.variant.variant_id, + run_group_id: params.runGroupId, + repeat_index: params.repeatIndex, + started_at: startedAt, + ended_at: endedAt, + status: 'completed' as const, + entry_user_action_id: userActionId, + root_query_id: String(rootQuery.query_id), + observability_db_ref: 'fixture_trace://synthetic', + binding, + notes: 'Synthetic fixture_trace run generated by V2.4 fast path.', + } + const scores = buildScoresForSpecIds( + { + runId, + scenario: params.scenario, + action, + rootQuery, + integrity, + tools, + subagents, + recoveries, + variantEffect, + longContext: longContext ?? undefined, + }, + params.scoreSpecIds, + ) + + await mkdir(runsRoot, { recursive: true }) + await mkdir(scoresRoot, { recursive: true }) + await writeFile( + path.join(runsRoot, `${runId}.json`), + `${JSON.stringify( + { + run, + binding, + scenario: params.scenario, + variant: params.variant, + evidence: { + action, + rootQuery, + tools, + subagents, + recoveries, + }, + variant_effect: variantEffect, + long_context: longContext, + }, + null, + 2, + )}\n`, + ) + await writeFile( + path.join(scoresRoot, `${runId}.scores.json`), + `${JSON.stringify(scores, null, 2)}\n`, + ) + + return { + runId, + userActionId, + scores, + runArtifact: { + run, + variant_effect: variantEffect, + ...(longContext ? 
{ long_context: longContext } : {}), + } as RunArtifact, + execution: { + execution: { + status: 'completed', + stdoutRef: 'fixture_trace://synthetic', + stderrRef: 'fixture_trace://synthetic', + }, + capture: { + status: 'captured', + user_action_id: userActionId, + match_count: 1, + }, + variant_apply: variantApply, + benchmark_run_id: identity.benchmark_run_id, + eval_run_id: identity.eval_run_id, + }, + } +} + +async function writeSyntheticCompareReport(params: { + baselineRunId: string + candidateRunId: string + scorecard: ScorecardItem[] + variantEffectSummary: VariantEffectSummary +}): Promise { + const reportRoot = await resolveReportRoot() + await mkdir(reportRoot, { recursive: true }) + const reportPath = path.join( + reportRoot, + `compare_${params.baselineRunId}_vs_${params.candidateRunId}.md`, + ) + const rows = params.scorecard + .map( + item => + `| ${item.score_spec_id} | ${item.baseline_value ?? 'n/a'} | ${item.candidate_value ?? 'n/a'} | ${item.delta ?? 'n/a'} | ${item.interpretation} |`, + ) + .join('\n') + await writeFile( + reportPath, + `# Synthetic Compare: ${params.baselineRunId} vs ${params.candidateRunId} + +## Scorecard + +| score | baseline | candidate | delta | interpretation | +| --- | ---: | ---: | ---: | --- | +${rows || '| n/a | n/a | n/a | n/a | n/a |'} + +## Variant Effect Summary + +- scenario: ${params.variantEffectSummary.scenario_id} +- candidate_variant: ${params.variantEffectSummary.candidate_variant_id} +- baseline_policy_mode: ${params.variantEffectSummary.baseline_policy_mode} +- candidate_policy_mode: ${params.variantEffectSummary.candidate_policy_mode} +- candidate_variant_effect_observed: ${params.variantEffectSummary.candidate_variant_effect_observed} +- runtime_difference_observed: ${params.variantEffectSummary.runtime_difference_observed} + +${params.variantEffectSummary.summary.map(item => `- ${item}`).join('\n')} +`, + ) + return path.relative(repoRoot, reportPath) +} + function numberOrNull(value: unknown): 
number | null { if (typeof value === 'number' && Number.isFinite(value)) return value if (typeof value === 'string' && value.trim() !== '') { @@ -877,6 +1209,14 @@ function maxValue(values: number[]): number | null { return values.length === 0 ? null : Math.max(...values) } +function meanFromUnknown(values: unknown[]): number | null { + return mean( + values + .map(numberOrNull) + .filter((value): value is number => value !== null), + ) +} + function scoreValue(scores: EvalScore[], scoreSpecId: string): number | null { return valueFor(scores, scoreSpecId) } @@ -1007,6 +1347,7 @@ function buildExperimentValidity(params: { profile: ExperimentProfile scenarioId: string candidateVariantId: string + scenario?: EvalScenario baselineExecution?: ExecuteHarnessResult candidateExecution?: ExecuteHarnessResult scorecard: ScorecardItem[] @@ -1016,11 +1357,13 @@ function buildExperimentValidity(params: { profile, scenarioId, candidateVariantId, + scenario, baselineExecution, candidateExecution, scorecard, variantEffectSummary, } = params + const longContextMode = isLongContextScenario(scenario) const baselineCaptured = baselineExecution === undefined || baselineExecution.capture.status === 'captured' const candidateCaptured = @@ -1029,11 +1372,25 @@ function buildExperimentValidity(params: { baselineExecution?.capture.status !== 'ambiguous_capture' && candidateExecution?.capture.status !== 'ambiguous_capture' const scoreEvidencePresent = scorecard.some(item => item.interpretation !== 'missing') - const variantEffectObserved = variantEffectSummary.candidate_variant_effect_observed + const longContextScoreEvidencePresent = scorecard.some( + item => + item.score_spec_id.startsWith('context.') && item.interpretation !== 'missing', + ) + const effectiveScoreEvidencePresent = longContextMode + ? longContextScoreEvidencePresent || scoreEvidencePresent + : scoreEvidencePresent + const variantEffectObserved = longContextMode + ? 
effectiveScoreEvidencePresent + : variantEffectSummary.candidate_variant_effect_observed + const runtimeDifferenceObserved = longContextMode + ? effectiveScoreEvidencePresent + : variantEffectSummary.runtime_difference_observed const scenarioIntentMatched = - profile === 'smoke' + longContextMode + ? baselineCaptured && candidateCaptured && effectiveScoreEvidencePresent + : profile === 'smoke' ? baselineCaptured && candidateCaptured - : variantEffectObserved && variantEffectSummary.runtime_difference_observed + : variantEffectObserved && runtimeDifferenceObserved const blockers: string[] = [] const warnings: string[] = [] @@ -1052,25 +1409,35 @@ function buildExperimentValidity(params: { `ambiguous_capture_present: scenario=${scenarioId}, candidate=${candidateVariantId}`, ) } - if (!scoreEvidencePresent) { + if (!effectiveScoreEvidencePresent) { blockers.push( - `score_evidence_missing: scenario=${scenarioId}, candidate=${candidateVariantId}`, + `${longContextMode ? 'long_context_score_evidence_missing' : 'score_evidence_missing'}: scenario=${scenarioId}, candidate=${candidateVariantId}`, ) } - if (profile === 'real_experiment' && !variantEffectObserved) { + if (profile === 'real_experiment' && !longContextMode && !variantEffectObserved) { blockers.push( `variant_effect_not_observed: scenario=${scenarioId}, candidate=${candidateVariantId}`, ) } if ( profile === 'real_experiment' && + !longContextMode && variantEffectObserved && - !variantEffectSummary.runtime_difference_observed + !runtimeDifferenceObserved ) { warnings.push( `runtime_difference_not_observed: scenario=${scenarioId}, candidate=${candidateVariantId}`, ) } + if ( + longContextMode && + profile === 'real_experiment' && + !longContextScoreEvidencePresent + ) { + warnings.push( + `long_context_manual_review_only: scenario=${scenarioId}, candidate=${candidateVariantId}`, + ) + } if (profile === 'real_experiment' && !scenarioIntentMatched) { warnings.push( `scenario_intent_not_matched: 
scenario=${scenarioId}, candidate=${candidateVariantId}`, @@ -1081,7 +1448,11 @@ function buildExperimentValidity(params: { blockers.length > 0 ? 'invalid' : warnings.length > 0 ? 'inconclusive' : 'valid' const reason = status === 'valid' - ? profile === 'smoke' + ? longContextMode + ? profile === 'smoke' + ? 'Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.' + : 'Long-context real smoke captured interpretable trace-backed context-governance evidence.' + : profile === 'smoke' ? 'Smoke check passed: execute_harness closed the automatic execution and capture loop.' : 'Real experiment is valid: runtime effect was observed and the baseline/candidate difference is interpretable.' : status === 'invalid' @@ -1098,9 +1469,9 @@ function buildExperimentValidity(params: { baseline_captured: baselineCaptured, candidate_captured: candidateCaptured, no_ambiguous_capture: noAmbiguousCapture, - score_evidence_present: scoreEvidencePresent, + score_evidence_present: effectiveScoreEvidencePresent, variant_effect_observed: variantEffectObserved, - runtime_difference_observed: variantEffectSummary.runtime_difference_observed, + runtime_difference_observed: runtimeDifferenceObserved, scenario_intent_matched: scenarioIntentMatched, }, } @@ -1158,6 +1529,288 @@ function aggregateVariantEffectSummary(results: ScenarioExperimentResult[]): Var ) } +function isLongContextScenario(scenario: EvalScenario | undefined): boolean { + return Boolean(scenario?.long_context_profile) +} + +function longContextStringArray(value: JsonRecord | undefined, key: string): string[] { + return asStringArray(value?.[key]) +} + +function longContextNumber(value: JsonRecord | undefined, key: string): number | null { + return numberOrNull(value?.[key]) +} + +async function aggregateLongContextSummary( + results: ScenarioExperimentResult[], +): Promise { + const grouped = new Map< + string, + { + scenario_id: string + candidate_variant_id: string + repeat_count: number + 
context_family: string + context_size_class: string + retainedCounts: number[] + lostCounts: number[] + retentionRates: number[] + retrievedCounts: number[] + missedCounts: number[] + hitRates: number[] + distractorCounts: number[] + compactionTriggers: number[] + compactionSavedTokens: number[] + toolResultBudgetTriggers: number[] + totalPromptInputTokens: number[] + promptTokenDeltas: number[] + successRates: number[] + manualReviewQuestions: string[] + manualReviewRequired: boolean + } + >() + + for (const result of results) { + const baselineArtifact = await readRunArtifact(result.baseline_run_id) + const baselineLongContext = asJsonRecord((baselineArtifact as JsonRecord).long_context) + for (const candidate of result.candidates) { + const candidateArtifact = await readRunArtifact(candidate.candidate_run_id) + const candidateLongContext = asJsonRecord((candidateArtifact as JsonRecord).long_context) + if (!candidateLongContext && !baselineLongContext) continue + + const summaryKey = `${result.scenario_id}::${candidate.candidate_variant_id}` + const entry = + grouped.get(summaryKey) ?? 
+ { + scenario_id: result.scenario_id, + candidate_variant_id: candidate.candidate_variant_id, + repeat_count: 0, + context_family: + asString(candidateLongContext?.context_family) || + asString(baselineLongContext?.context_family) || + 'unknown', + context_size_class: + asString(candidateLongContext?.context_size_class) || + asString(baselineLongContext?.context_size_class) || + 'unknown', + retainedCounts: [], + lostCounts: [], + retentionRates: [], + retrievedCounts: [], + missedCounts: [], + hitRates: [], + distractorCounts: [], + compactionTriggers: [], + compactionSavedTokens: [], + toolResultBudgetTriggers: [], + totalPromptInputTokens: [], + promptTokenDeltas: [], + successRates: [], + manualReviewQuestions: [], + manualReviewRequired: false, + } + entry.repeat_count += 1 + + const retained = longContextStringArray( + candidateLongContext, + 'observed_retained_constraints', + ).length + const lost = longContextStringArray( + candidateLongContext, + 'observed_lost_constraints', + ).length + const retrieved = longContextStringArray( + candidateLongContext, + 'observed_retrieved_facts', + ).length + const missed = longContextStringArray( + candidateLongContext, + 'observed_missed_facts', + ).length + const confusions = longContextStringArray(candidateLongContext, 'observed_confusions').length + const retainedRate = + retained + lost > 0 ? Number((retained / (retained + lost)).toFixed(6)) : null + const hitRate = + retrieved + missed > 0 + ? 
Number((retrieved / (retrieved + missed)).toFixed(6)) + : null + const compactionTriggerCount = longContextNumber( + candidateLongContext, + 'compaction_trigger_count', + ) + const compactionSavedTokens = longContextNumber( + candidateLongContext, + 'compaction_saved_tokens', + ) + const toolResultBudgetTriggers = longContextNumber( + candidateLongContext, + 'tool_result_budget_trigger_count', + ) + const totalPromptInputTokens = longContextNumber( + candidateLongContext, + 'total_prompt_input_tokens', + ) + const baselinePromptInputTokens = longContextNumber( + baselineLongContext, + 'total_prompt_input_tokens', + ) + const successRate = longContextNumber( + candidateLongContext, + 'success_under_context_pressure', + ) + if (retainedRate !== null) entry.retentionRates.push(retainedRate) + if (hitRate !== null) entry.hitRates.push(hitRate) + entry.retainedCounts.push(retained) + entry.lostCounts.push(lost) + entry.retrievedCounts.push(retrieved) + entry.missedCounts.push(missed) + entry.distractorCounts.push(confusions) + if (compactionTriggerCount !== null) entry.compactionTriggers.push(compactionTriggerCount) + if (compactionSavedTokens !== null) entry.compactionSavedTokens.push(compactionSavedTokens) + if (toolResultBudgetTriggers !== null) { + entry.toolResultBudgetTriggers.push(toolResultBudgetTriggers) + } + if (totalPromptInputTokens !== null) entry.totalPromptInputTokens.push(totalPromptInputTokens) + if (baselinePromptInputTokens !== null && totalPromptInputTokens !== null) { + entry.promptTokenDeltas.push(totalPromptInputTokens - baselinePromptInputTokens) + } + if (successRate !== null) entry.successRates.push(successRate) + entry.manualReviewQuestions = uniqueStrings([ + ...entry.manualReviewQuestions, + ...longContextStringArray(candidateLongContext, 'manual_review_questions'), + ]) + entry.manualReviewRequired = + entry.manualReviewRequired || + asBoolean(candidateLongContext?.manual_review_required) || + entry.manualReviewQuestions.length > 0 + 
grouped.set(summaryKey, entry) + } + } + + return [...grouped.values()] + .map(entry => { + const retainedConstraintMean = mean(entry.retainedCounts) + const lostConstraintMean = mean(entry.lostCounts) + const constraintRetentionRateMean = mean(entry.retentionRates) + const retrievedFactMean = mean(entry.retrievedCounts) + const missedFactMean = mean(entry.missedCounts) + const retrievedFactHitRateMean = mean(entry.hitRates) + const distractorConfusionMean = mean(entry.distractorCounts) + const compactionTriggerMean = mean(entry.compactionTriggers) + const compactionSavedTokensMean = mean(entry.compactionSavedTokens) + const toolResultBudgetTriggerMean = mean(entry.toolResultBudgetTriggers) + const totalPromptInputTokensMean = mean(entry.totalPromptInputTokens) + const promptTokenDeltaMean = mean(entry.promptTokenDeltas) + const successUnderContextPressureRate = mean(entry.successRates) + const interpretation: string[] = [] + + if (lostConstraintMean !== null && lostConstraintMean > 0) { + interpretation.push( + `Candidate still loses an average of ${lostConstraintMean.toFixed(3)} hard constraints under context pressure.`, + ) + } else if (constraintRetentionRateMean !== null) { + interpretation.push( + `Observed constraint retention remained at ${(constraintRetentionRateMean * 100).toFixed(1)}%.`, + ) + } + if (retrievedFactHitRateMean === null) { + interpretation.push( + 'Automatic fact-retrieval quality could not be fully established from trace-backed evidence alone.', + ) + } else { + interpretation.push( + `Observed fact retrieval hit rate is ${(retrievedFactHitRateMean * 100).toFixed(1)}%.`, + ) + } + if (distractorConfusionMean !== null && distractorConfusionMean > 0) { + interpretation.push( + `Distractor confusion remains observable with mean count ${distractorConfusionMean.toFixed(3)}.`, + ) + } else { + interpretation.push('No distractor confusion was observed in the current evidence window.') + } + if (compactionTriggerMean !== null && 
compactionTriggerMean > 0) { + interpretation.push( + `Compaction/tool-result governance was active with mean compaction trigger count ${compactionTriggerMean.toFixed(3)} and mean saved tokens ${compactionSavedTokensMean ?? 0}.`, + ) + } + if (promptTokenDeltaMean !== null) { + interpretation.push( + `Relative to baseline, candidate prompt-token delta mean is ${promptTokenDeltaMean.toFixed(3)}.`, + ) + } + if ( + successUnderContextPressureRate !== null && + successUnderContextPressureRate < 1 + ) { + interpretation.push( + `Success under context pressure is incomplete at ${(successUnderContextPressureRate * 100).toFixed(1)}%.`, + ) + } + if (entry.manualReviewQuestions.length > 0) { + interpretation.push( + `Manual review remains open for ${entry.manualReviewQuestions.length} question(s).`, + ) + } + + return { + scenario_id: entry.scenario_id, + candidate_variant_id: entry.candidate_variant_id, + repeat_count: entry.repeat_count, + context_family: entry.context_family, + context_size_class: entry.context_size_class, + retained_constraint_mean: retainedConstraintMean, + lost_constraint_mean: lostConstraintMean, + constraint_retention_rate_mean: constraintRetentionRateMean, + retrieved_fact_mean: retrievedFactMean, + missed_fact_mean: missedFactMean, + retrieved_fact_hit_rate_mean: retrievedFactHitRateMean, + distractor_confusion_mean: distractorConfusionMean, + compaction_trigger_mean: compactionTriggerMean, + compaction_saved_tokens_mean: compactionSavedTokensMean, + tool_result_budget_trigger_mean: toolResultBudgetTriggerMean, + total_prompt_input_tokens_mean: totalPromptInputTokensMean, + prompt_token_delta_mean: promptTokenDeltaMean, + success_under_context_pressure_rate: successUnderContextPressureRate, + manual_review_required: entry.manualReviewRequired, + manual_review_questions: entry.manualReviewQuestions, + interpretation, + } + }) + .sort((a, b) => + `${a.scenario_id}:${a.candidate_variant_id}`.localeCompare( + 
`${b.scenario_id}:${b.candidate_variant_id}`, + ), + ) +} + +function summarizeLongContextVerdict(params: { + experimentValidity: ExperimentValidity + longContextSummary: LongContextSummaryItem[] +}): LongContextReviewVerdict | undefined { + const { experimentValidity, longContextSummary } = params + if (longContextSummary.length === 0) return undefined + if (experimentValidity.status === 'invalid') return 'invalid' + const hasWarning = longContextSummary.some( + item => + (item.lost_constraint_mean ?? 0) > 0 || + (item.distractor_confusion_mean ?? 0) > 0 || + (item.success_under_context_pressure_rate !== null && + item.success_under_context_pressure_rate < 1), + ) + if (hasWarning) return 'warning' + const needsManualReview = + experimentValidity.status === 'inconclusive' || + longContextSummary.some( + item => + item.manual_review_required || + item.constraint_retention_rate_mean === null || + item.retrieved_fact_hit_rate_mean === null, + ) + if (needsManualReview) return 'needs_manual_review' + return 'pass' +} + function runGroupRefs(runGroups: RunGroupArtifact[]): string[] { return runGroups.map(group => path.join('tests', 'evals', 'v2', 'run-groups', `${group.run_group_id}.json`), @@ -1354,13 +2007,74 @@ async function writeRunGroups(runGroups: RunGroupArtifact[]): Promise { } } +function buildLongContextSection(params: { + longContextSummary: LongContextSummaryItem[] + longContextReviewVerdict?: LongContextReviewVerdict +}): string { + const { longContextSummary, longContextReviewVerdict } = params + if (longContextSummary.length === 0) return '' + const rows = longContextSummary + .map( + item => + `| ${item.scenario_id} | ${item.candidate_variant_id} | ${item.context_family} | ${item.context_size_class} | ${item.constraint_retention_rate_mean ?? 'n/a'} | ${item.retrieved_fact_hit_rate_mean ?? 'n/a'} | ${item.lost_constraint_mean ?? 'n/a'} | ${item.missed_fact_mean ?? 'n/a'} | ${item.distractor_confusion_mean ?? 'n/a'} | ${item.compaction_trigger_mean ?? 
'n/a'} | ${item.compaction_saved_tokens_mean ?? 'n/a'} | ${item.total_prompt_input_tokens_mean ?? 'n/a'} | ${item.success_under_context_pressure_rate ?? 'n/a'} | ${item.manual_review_required} |`, + ) + .join('\n') + const semanticRows = longContextSummary + .flatMap(item => + item.interpretation.map( + interpretation => + `- ${item.scenario_id} / ${item.candidate_variant_id}: ${interpretation}`, + ), + ) + .join('\n') + const manualReviewRows = longContextSummary + .flatMap(item => + item.manual_review_questions.map( + question => + `- ${item.scenario_id} / ${item.candidate_variant_id}: ${question}`, + ), + ) + .join('\n') + return `## Long Context Summary + +- review_verdict: ${longContextReviewVerdict ?? 'not_applicable'} +- note: This section evaluates constraint retention, fact retrieval, distractor resistance, and compaction behavior under context pressure. + +| scenario | candidate_variant | family | size | retention_rate | fact_hit_rate | lost_constraints | missed_facts | distractor_confusion | compaction_triggers | compaction_saved_tokens | total_prompt_tokens | success_under_pressure | manual_review_required | +| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +${rows} + +### Semantic Interpretation + +${semanticRows || '- No long-context interpretation rows were generated.'} + +### Manual Review Notes + +${manualReviewRows || '- No manual review prompts were attached to the current long-context scenarios.'} + +### Interpretation Limits + +- Automatic long-context scores are strongest in fixture_trace mode. +- Real smoke may still require human inspection even when trace-backed cost and compaction evidence is present. 
+` +} + function buildBatchReport(params: { experiment: EvalExperimentV21 runGroups: RunGroupArtifact[] failures: RunExecutionFailure[] outputJson: string + longContextSummary: LongContextSummaryItem[] + longContextReviewVerdict?: LongContextReviewVerdict }): string { - const { experiment, runGroups, failures, outputJson } = params + const { + experiment, + runGroups, + failures, + outputJson, + longContextSummary, + longContextReviewVerdict, + } = params const groupRows = runGroups .map(group => { const metrics = group.stability_metrics @@ -1399,7 +2113,12 @@ function buildBatchReport(params: { ) .join('\n') - return `# V2.3 Batch Experiment Summary: ${experiment.experiment_id} + const longContextSection = buildLongContextSection({ + longContextSummary, + longContextReviewVerdict, + }) + + return `# ${longContextSummary.length > 0 ? 'V2.4 Long-Context' : 'V2.3 Batch'} Experiment Summary: ${experiment.experiment_id} ## Understanding @@ -1430,6 +2149,8 @@ ${flakyRows || '- No flaky run group detected by the current V2.3 heuristic.'} ${failureRows} +${longContextSection} + ## Interpretation Limits - V2.3 stability is based on repeat groups and trace-backed metrics; it is not a model-quality judge. 
@@ -1450,6 +2171,8 @@ function buildMarkdownReport(params: { explorationSignals: string[] recommendedReviewMode: ReviewMode variantEffectSummary: VariantEffectSummary[] + longContextSummary: LongContextSummaryItem[] + longContextReviewVerdict?: LongContextReviewVerdict }): string { const { experiment, @@ -1464,6 +2187,8 @@ function buildMarkdownReport(params: { explorationSignals, recommendedReviewMode, variantEffectSummary, + longContextSummary, + longContextReviewVerdict, } = params const allGateResults = results.flatMap(result => result.candidates.flatMap(candidate => candidate.gate_results), @@ -1542,6 +2267,10 @@ function buildMarkdownReport(params: { ), ) .join('\n') + const longContextSection = buildLongContextSection({ + longContextSummary, + longContextReviewVerdict, + }) const validityRows = [ `- status: ${experimentValidity.status}`, @@ -1562,8 +2291,15 @@ function buildMarkdownReport(params: { ].join('\n') const reportProfile: ExperimentProfile = experiment.report_profile ?? 'smoke' + const longContextMode = longContextSummary.length > 0 const profileSection = - reportProfile === 'smoke' + longContextMode + ? `## Long Context Review + +- requested_mode: ${experiment.mode ?? 'bind_existing'} +- review_verdict: ${longContextReviewVerdict ?? 'not_applicable'} +- note: This profile focuses on whether long-context pressure preserves constraints, facts, and governance signals.` + : reportProfile === 'smoke' ? `## Smoke Check - requested_mode: ${experiment.mode ?? 'bind_existing'} @@ -1578,7 +2314,12 @@ function buildMarkdownReport(params: { - note: This profile asks whether the candidate changed runtime behavior in an interpretable way.` const interpretationLimits = - reportProfile === 'smoke' + longContextMode + ? 
[ + '- Long-context automatic scoring is strongest in fixture_trace mode; real smoke still preserves a manual-review lane.', + '- Cost and compaction evidence alone do not prove that the final answer remained semantically correct.', + ].join('\n') + : reportProfile === 'smoke' ? [ '- Smoke only proves the automatic execute_harness -> capture -> run/score/report loop is healthy.', '- Smoke does not prove a candidate harness change is beneficial.', @@ -1639,6 +2380,8 @@ ${validityNotes || '- No additional blockers or warnings.'} ${runtimeDifferenceRows} +${longContextSection} + ## V2.3 Batch Robustness - batch_report: ${batchReport || 'not generated'} @@ -1741,6 +2484,12 @@ async function main(): Promise { } const repeatCount = Math.max(experiment.repeat_count ?? 1, 1) + const scenarioCatalog = new Map() + for (const scenarioId of scenarioIds) { + scenarioCatalog.set(scenarioId, await loadScenario(scenarioId)) + } + const fixtureTraceFastPath = + mode === 'execute_harness' && experiment.execution?.adapter === 'fixture_trace' const results: ScenarioExperimentResult[] = [] const failures: RunExecutionFailure[] = [] @@ -1767,7 +2516,8 @@ async function main(): Promise { const executionStamp = new Date().toISOString().replace(/[:.]/g, '') for (const scenarioId of scenarioIds) { - const scenario = mode === 'execute_harness' ? await loadScenario(scenarioId) : undefined + const scenarioRecord = scenarioCatalog.get(scenarioId) + const scenario = mode === 'execute_harness' ? 
scenarioRecord : undefined const baselineRunGroupId = createRunGroupId({ experimentId: experiment.experiment_id, scenarioId, @@ -1789,6 +2539,25 @@ async function main(): Promise { let baselineRunArtifact: RunArtifact | undefined try { + if (fixtureTraceFastPath) { + if (!scenarioRecord) throw new Error(`Scenario not found: ${scenarioId}`) + const baselineVariant = await loadVariant(experiment.baseline_variant_id) + const syntheticBaseline = await synthesizeFixtureRun({ + experiment, + scenario: scenarioRecord, + variant: baselineVariant, + runGroupId: baselineRunGroupId, + repeatIndex, + scoreSpecIds: experiment.score_spec_ids ?? [], + }) + baselineUserActionId = syntheticBaseline.userActionId + baselineExecution = syntheticBaseline.execution + baselineEvalRunId = syntheticBaseline.execution.eval_run_id + baselineBenchmarkRunId = syntheticBaseline.execution.benchmark_run_id + baselineRunId = syntheticBaseline.runId + baselineScores = syntheticBaseline.scores + baselineRunArtifact = syntheticBaseline.runArtifact + } else { if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) @@ -1841,6 +2610,7 @@ async function main(): Promise { path.join(scoresRoot, `${baselineRunId}.scores.json`), ) baselineRunArtifact = await readRunArtifact(baselineRunId) + } } catch (error) { const message = error instanceof Error ? error.message : String(error) if (failurePolicy === 'fail_fast') throw error @@ -1873,6 +2643,92 @@ async function main(): Promise { let candidateBenchmarkRunId: string | undefined try { + if (fixtureTraceFastPath) { + if (!scenarioRecord) throw new Error(`Scenario not found: ${scenarioId}`) + const candidateVariant = await loadVariant(candidateVariantId) + const syntheticCandidate = await synthesizeFixtureRun({ + experiment, + scenario: scenarioRecord, + variant: candidateVariant, + runGroupId: candidateRunGroupId, + repeatIndex, + scoreSpecIds: experiment.score_spec_ids ?? 
[], + }) + candidateActionId = syntheticCandidate.userActionId + candidateExecution = syntheticCandidate.execution + candidateEvalRunId = syntheticCandidate.execution.eval_run_id + candidateBenchmarkRunId = syntheticCandidate.execution.benchmark_run_id + const candidateRunId = syntheticCandidate.runId + const candidateScores = syntheticCandidate.scores + const candidateRunArtifact = syntheticCandidate.runArtifact + + const gateResults = evaluateGate({ + scenarioId, + candidateVariantId, + gatePolicy, + scoreSpecs, + baselineScores, + candidateScores, + }) + const scorecard = buildScorecardSummary({ + scenarioId, + candidateVariantId, + scoreSpecs, + baselineScores, + candidateScores, + }) + const variantEffect = runtimeDifferenceAnalysis({ + scenarioId, + candidateVariantId, + baselineVariantEffect: baselineRunArtifact?.variant_effect, + candidateVariantEffect: candidateRunArtifact.variant_effect, + scorecard, + }) + const experimentValidityForCandidate = buildExperimentValidity({ + profile: experiment.report_profile ?? 
'smoke', + scenarioId, + candidateVariantId, + scenario: scenarioRecord, + baselineExecution, + candidateExecution, + scorecard, + variantEffectSummary: variantEffect, + }) + const syntheticCompareReport = await writeSyntheticCompareReport({ + baselineRunId, + candidateRunId, + scorecard, + variantEffectSummary: variantEffect, + }) + + candidates.push({ + candidate_variant_id: candidateVariantId, + candidate_run_group_id: candidateRunGroupId, + candidate_run_id: candidateRunId, + candidate_user_action_id: candidateActionId, + candidate_eval_run_id: candidateEvalRunId, + candidate_benchmark_run_id: candidateBenchmarkRunId, + candidate_execution: candidateExecution, + baseline_variant_effect: baselineRunArtifact?.variant_effect, + candidate_variant_effect: candidateRunArtifact.variant_effect, + variant_effect_summary: variantEffect, + experiment_validity: experimentValidityForCandidate, + compare_report: syntheticCompareReport, + gate_results: gateResults, + scorecard_summary: scorecard, + exploration_signals: buildExplorationSignals({ + scorecard, + gateResults, + experimentValidity: experimentValidityForCandidate, + variantEffectSummary: variantEffect, + }), + recommended_review_mode: recommendReviewMode({ + scorecard, + gateResults, + experimentValidity: experimentValidityForCandidate, + }), + }) + } else { if (mode === 'execute_harness') { if (!scenario) throw new Error(`Scenario not found: ${scenarioId}`) @@ -1959,6 +2815,7 @@ async function main(): Promise { profile: experiment.report_profile ?? 'smoke', scenarioId, candidateVariantId, + scenario: scenarioRecord, baselineExecution, candidateExecution, scorecard, @@ -1992,6 +2849,7 @@ async function main(): Promise { experimentValidity: experimentValidityForCandidate, }), }) + } } catch (error) { const message = error instanceof Error ? 
error.message : String(error) if (failurePolicy === 'fail_fast') throw error @@ -2047,6 +2905,11 @@ async function main(): Promise { const recommendedReviewMode = aggregateReviewMode(results) const variantEffectSummary = aggregateVariantEffectSummary(results) const experimentValidity = aggregateExperimentValidity(results) + const longContextSummary = await aggregateLongContextSummary(results) + const longContextReviewVerdict = summarizeLongContextVerdict({ + experimentValidity, + longContextSummary, + }) const runGroups = await buildRunGroups({ experimentId: experiment.experiment_id, baselineVariantId: experiment.baseline_variant_id, @@ -2109,6 +2972,8 @@ async function main(): Promise { risk_verdict: riskVerdict, gate_verdict: riskVerdict, experiment_validity: experimentValidity, + long_context_review_verdict: longContextReviewVerdict ?? null, + long_context_summary: longContextSummary, variant_effect_summary: variantEffectSummary, runtime_difference_summary: variantEffectSummary.flatMap(item => item.summary), verdict_boundary: @@ -2161,6 +3026,8 @@ async function main(): Promise { runGroups, failures, outputJson: outputJsonRel, + longContextSummary, + longContextReviewVerdict, }), ) @@ -2179,6 +3046,8 @@ async function main(): Promise { explorationSignals, recommendedReviewMode, variantEffectSummary, + longContextSummary, + longContextReviewVerdict, }), ) diff --git a/scripts/evals/v2_run_feedback.ts b/scripts/evals/v2_run_feedback.ts new file mode 100644 index 0000000000..58335e910a --- /dev/null +++ b/scripts/evals/v2_run_feedback.ts @@ -0,0 +1,1338 @@ +import { createHash } from 'node:crypto' +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import path from 'node:path' + +import type { + EvalCandidateVariantProposal, + EvalFeedbackApprovalCard, + EvalFeedbackProposalQueue, + EvalFeedbackRun, + EvalFinding, + EvalHypothesis, + EvalImprovementProposal, + EvalNextExperimentPlan, +} from '../../src/observability/v2/evalTypes' + +type JsonRecord = 
Record + +interface ExperimentValidity { + status?: string +} + +interface RiskVerdict { + status?: string + missing_score_count?: number +} + +interface LongContextSummaryItem { + scenario_id?: string + candidate_variant_id?: string + constraint_retention_rate_mean?: number | null + retrieved_fact_hit_rate_mean?: number | null + manual_review_required?: boolean + manual_review_questions?: string[] +} + +interface StabilitySummaryItem { + scenario_id?: string + variant_id?: string + flaky_status?: string +} + +interface ExperimentRunArtifact { + experiment_id?: string + manifest_ref?: string + report_refs?: string[] + experiment_validity?: ExperimentValidity + risk_verdict?: RiskVerdict + long_context_review_verdict?: string | null + long_context_summary?: LongContextSummaryItem[] + stability_summary?: StabilitySummaryItem[] + run_failures?: JsonRecord[] +} + +interface ProposalQueueById { + top_recommendation_proposal_id: string | null + recommended_now_proposal_ids: string[] + recommended_later_proposal_ids: string[] + deferred_proposal_ids: string[] + blocked_proposal_ids: string[] +} + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') + +function parseArgs(argv: string[]): Record { + const result: Record = {} + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i] + if (!arg.startsWith('--')) continue + const key = arg.slice(2) + const next = argv[i + 1] + if (!next || next.startsWith('--')) { + result[key] = true + } else { + result[key] = next + i += 1 + } + } + return result +} + +function assertString(value: unknown, fieldName: string): string { + if (typeof value !== 'string' || value.trim() === '') { + throw new Error(`${fieldName} must be a non-empty string`) + } + return value +} + +async function readJson(filePath: string): Promise { + return JSON.parse(await readFile(filePath, 'utf8')) as T +} + +function slug(value: string): string { + return value + .toLowerCase() + .replace(/[^a-z0-9]+/g, '_') + .replace(/^_+|_+$/g, '') + 
.slice(0, 48) +} + +function shortHash(value: string): string { + return createHash('sha1').update(value).digest('hex').slice(0, 8) +} + +function buildId( + kind: string, + experimentId: string, + label: string, + generatedAtCompact: string, +): string { + return `${kind}_${slug(experimentId)}_${slug(label)}_${generatedAtCompact}_${shortHash( + `${kind}:${experimentId}:${label}:${generatedAtCompact}`, + )}` +} + +function toRepoRelative(targetPath: string): string { + return path.relative(repoRoot, targetPath).replace(/\\/g, '/') +} + +function asArray(value: unknown): T[] { + return Array.isArray(value) ? (value as T[]) : [] +} + +function asNumber(value: unknown): number | null { + return typeof value === 'number' && Number.isFinite(value) ? value : null +} + +function uniq(values: string[]): string[] { + return [...new Set(values.filter(value => value.trim() !== ''))] +} + +async function ensureDirectory(relativeDir: string) { + await mkdir(path.join(repoRoot, relativeDir), { recursive: true }) +} + +async function writeJson(relativePath: string, value: unknown) { + const absolutePath = path.join(repoRoot, relativePath) + await mkdir(path.dirname(absolutePath), { recursive: true }) + await writeFile(absolutePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') +} + +async function writeMarkdown(relativePath: string, content: string) { + const absolutePath = path.join(repoRoot, relativePath) + await mkdir(path.dirname(absolutePath), { recursive: true }) + await writeFile(absolutePath, content, 'utf8') +} + +function pushFinding( + findings: EvalFinding[], + params: { + experimentId: string + sourceReportRef: string + generatedAtCompact: string + findingType: string + findingKind: EvalFinding['finding_kind'] + severity: EvalFinding['severity'] + scope: EvalFinding['scope'] + scopeRef: string + summary: string + evidenceRef: string + isBlocking: boolean + requiresManualJudgement: boolean + autoResolvable: boolean + }, +) { + findings.push({ + finding_id: buildId( + 
'finding', + params.experimentId, + params.findingType, + params.generatedAtCompact, + ), + source_experiment_id: params.experimentId, + source_report_ref: params.sourceReportRef, + finding_type: params.findingType, + finding_kind: params.findingKind, + severity: params.severity, + scope: params.scope, + scope_ref: params.scopeRef, + summary: params.summary, + evidence_ref: params.evidenceRef, + is_blocking: params.isBlocking, + requires_manual_judgement: params.requiresManualJudgement, + auto_resolvable: params.autoResolvable, + fact_or_inference: 'fact', + }) +} + +function extractFindings( + experimentRunRef: string, + artifact: ExperimentRunArtifact, + generatedAtCompact: string, +): EvalFinding[] { + const experimentId = assertString(artifact.experiment_id, 'experiment_id') + const reportRefs = asArray(artifact.report_refs) + const sourceReportRef = + reportRefs.find(ref => ref.includes('batch_experiment_')) ?? + reportRefs[0] ?? + experimentRunRef + const findings: EvalFinding[] = [] + + if (artifact.long_context_review_verdict === 'needs_manual_review') { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: 'long_context_review_verdict_needs_manual_review', + findingKind: 'manual_review_boundary', + severity: 'warning', + scope: 'experiment', + scopeRef: experimentId, + summary: + 'The experiment-level long_context_review_verdict remains needs_manual_review.', + evidenceRef: `${experimentRunRef}#/long_context_review_verdict`, + isBlocking: false, + requiresManualJudgement: true, + autoResolvable: false, + }) + } + + const riskVerdict = artifact.risk_verdict + if (riskVerdict?.status === 'inconclusive') { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: 'risk_verdict_inconclusive', + findingKind: 'missing_score', + severity: 'warning', + scope: 'experiment', + scopeRef: experimentId, + summary: 'The regression-risk verdict is inconclusive for this experiment.', + 
evidenceRef: `${experimentRunRef}#/risk_verdict/status`, + isBlocking: false, + requiresManualJudgement: false, + autoResolvable: true, + }) + } + + if (typeof riskVerdict?.missing_score_count === 'number' && riskVerdict.missing_score_count > 0) { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: 'missing_score_count_positive', + findingKind: 'missing_score', + severity: 'warning', + scope: 'experiment', + scopeRef: experimentId, + summary: `The experiment still has ${riskVerdict.missing_score_count} missing score(s).`, + evidenceRef: `${experimentRunRef}#/risk_verdict/missing_score_count`, + isBlocking: false, + requiresManualJudgement: false, + autoResolvable: true, + }) + } + + asArray(artifact.long_context_summary).forEach((item, index) => { + const scenarioId = item.scenario_id ?? `scenario_${index + 1}` + if (asNumber(item.constraint_retention_rate_mean) === null) { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: `constraint_retention_rate_missing_${scenarioId}`, + findingKind: 'missing_score', + severity: 'warning', + scope: 'scenario', + scopeRef: scenarioId, + summary: `constraint_retention_rate_mean is null for ${scenarioId}.`, + evidenceRef: `${experimentRunRef}#/long_context_summary/${index}/constraint_retention_rate_mean`, + isBlocking: false, + requiresManualJudgement: false, + autoResolvable: true, + }) + } + if (asNumber(item.retrieved_fact_hit_rate_mean) === null) { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: `retrieved_fact_hit_rate_missing_${scenarioId}`, + findingKind: 'missing_score', + severity: 'warning', + scope: 'scenario', + scopeRef: scenarioId, + summary: `retrieved_fact_hit_rate_mean is null for ${scenarioId}.`, + evidenceRef: `${experimentRunRef}#/long_context_summary/${index}/retrieved_fact_hit_rate_mean`, + isBlocking: false, + requiresManualJudgement: false, + autoResolvable: true, + 
}) + } + if (item.manual_review_required === true) { + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: `manual_review_required_${scenarioId}`, + findingKind: 'manual_review_boundary', + severity: 'warning', + scope: 'scenario', + scopeRef: scenarioId, + summary: `manual_review_required is true for ${scenarioId}.`, + evidenceRef: `${experimentRunRef}#/long_context_summary/${index}/manual_review_required`, + isBlocking: false, + requiresManualJudgement: true, + autoResolvable: false, + }) + } + }) + + asArray(artifact.stability_summary).forEach((item, index) => { + if (item.flaky_status && item.flaky_status !== 'stable') { + const scenarioId = item.scenario_id ?? `scenario_${index + 1}` + const variantId = item.variant_id ?? `variant_${index + 1}` + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: `flaky_status_${scenarioId}_${variantId}`, + findingKind: 'stability_gap', + severity: 'warning', + scope: 'variant', + scopeRef: `${scenarioId}:${variantId}`, + summary: `flaky_status is ${item.flaky_status} for ${scenarioId} / ${variantId}.`, + evidenceRef: `${experimentRunRef}#/stability_summary/${index}/flaky_status`, + isBlocking: false, + requiresManualJudgement: false, + autoResolvable: false, + }) + } + }) + + asArray(artifact.run_failures).forEach((item, index) => { + const stage = typeof item.stage === 'string' ? item.stage : 'unknown' + const scenarioId = typeof item.scenario_id === 'string' ? 
item.scenario_id : 'unknown' + pushFinding(findings, { + experimentId, + sourceReportRef, + generatedAtCompact, + findingType: `run_failure_${stage}_${scenarioId}_${index + 1}`, + findingKind: 'execution_failure', + severity: 'blocking', + scope: 'run', + scopeRef: `${stage}:${scenarioId}:${index + 1}`, + summary: `Run failure observed at stage=${stage} for scenario=${scenarioId}.`, + evidenceRef: `${experimentRunRef}#/run_failures/${index}`, + isBlocking: true, + requiresManualJudgement: false, + autoResolvable: false, + }) + }) + + return findings +} + +function buildHypothesis( + experimentId: string, + label: string, + generatedAtCompact: string, + findings: EvalFinding[], + body: { + hypothesis: string + confidence: EvalHypothesis['confidence'] + risks: string[] + falsifiableBy: string[] + }, +): EvalHypothesis { + return { + hypothesis_id: buildId('hypothesis', experimentId, label, generatedAtCompact), + based_on_finding_ids: findings.map(item => item.finding_id), + depends_on_finding_refs: findings.map(item => item.evidence_ref), + hypothesis: body.hypothesis, + confidence: body.confidence, + falsifiable_by: body.falsifiableBy, + supporting_evidence_refs: findings.map(item => item.evidence_ref), + risks: body.risks, + fact_or_inference: 'inference', + } +} + +function artifactUsesExpectationContract(artifact: ExperimentRunArtifact): boolean { + if ( + typeof artifact.experiment_id === 'string' && + artifact.experiment_id.includes('expectation_contract_v0') + ) { + return true + } + + return asArray(artifact.long_context_summary).some( + item => + typeof item.scenario_id === 'string' && + item.scenario_id.includes('contract_v0'), + ) +} + +function buildHypotheses( + experimentId: string, + artifact: ExperimentRunArtifact, + findings: EvalFinding[], + generatedAtCompact: string, +): EvalHypothesis[] { + const hypotheses: EvalHypothesis[] = [] + const usesExpectationContract = artifactUsesExpectationContract(artifact) + + const semanticMissingFindings = 
findings.filter( + finding => + finding.finding_type.startsWith('constraint_retention_rate_missing_') || + finding.finding_type.startsWith('retrieved_fact_hit_rate_missing_'), + ) + if (semanticMissingFindings.length > 0) { + hypotheses.push( + buildHypothesis( + experimentId, + 'real_output_semantic_parser_missing', + generatedAtCompact, + semanticMissingFindings, + { + hypothesis: + 'The current real-smoke evaluator lacks a lightweight semantic output parser, so fact retrieval and constraint retention cannot yet be auto-judged from runtime outputs.', + confidence: 'medium', + risks: [ + 'A parser that is too narrow can miss valid answers.', + 'A parser that is too loose can create false positives.', + ], + falsifiableBy: [ + 'Implement a lightweight real-smoke output parser and rerun long_context_fact_retrieval_real_smoke.', + 'Verify retrieved_fact_hit_rate and constraint_retention_rate become non-null without inflating distractor_confusion_count.', + ], + }, + ), + ) + } + + const manualReviewFindings = findings.filter( + finding => + finding.finding_type === 'long_context_review_verdict_needs_manual_review' || + finding.finding_type.startsWith('manual_review_required_'), + ) + if (manualReviewFindings.length > 0) { + hypotheses.push( + buildHypothesis( + experimentId, + usesExpectationContract + ? 'manual_review_boundary_persisted_after_contract_v0' + : 'manual_review_boundary_still_open', + generatedAtCompact, + manualReviewFindings, + { + hypothesis: usesExpectationContract + ? 'The tightened expectation contract is already in place, but manual review still remains open. The next bottleneck is feedback-loop deduplication and proposal stability, not another copy of the same scenario-contract recommendation.' 
+ : 'The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke.', + confidence: 'high', + risks: [ + 'Treating manual review signals as auto-pass would overstate evaluator certainty.', + ], + falsifiableBy: usesExpectationContract + ? [ + 'Re-run feedback on the same expectation-contract artifact and confirm the queue no longer repeats the same expectation-contract recommendation as top priority.', + 'Verify the next top recommendation, if any, shifts to feedback-system stabilization rather than a duplicate scenario contract.', + ] + : [ + 'Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic.', + ], + }, + ), + ) + } + + const gateFindings = findings.filter( + finding => + finding.finding_type === 'risk_verdict_inconclusive' || + finding.finding_type === 'missing_score_count_positive', + ) + if (gateFindings.length > 0 && semanticMissingFindings.length > 0) { + hypotheses.push( + buildHypothesis( + experimentId, + 'gate_inconclusive_due_to_missing_semantic_scores', + generatedAtCompact, + gateFindings, + { + hypothesis: + 'The regression-risk gate is inconclusive mainly because semantic long-context scores are still missing, not because the runner failed to execute.', + confidence: 'medium', + risks: [ + 'If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports.', + ], + falsifiableBy: [ + 'After parser output is bound into context scores, rerun the same real smoke and confirm whether risk_verdict becomes more decisive without hiding uncertainty.', + ], + }, + ), + ) + } + + const instabilityFindings = findings.filter( + finding => + finding.finding_type.startsWith('flaky_status_') || + finding.finding_type.startsWith('run_failure_'), + ) + if (instabilityFindings.length > 0) { + 
hypotheses.push( + buildHypothesis( + experimentId, + 'runner_or_scenario_instability', + generatedAtCompact, + instabilityFindings, + { + hypothesis: + 'Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used.', + confidence: 'medium', + risks: [ + 'Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise.', + ], + falsifiableBy: [ + 'Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable.', + ], + }, + ), + ) + } + + return hypotheses +} + +function proposalSeedForHypothesis( + hypothesis: EvalHypothesis, + findingsById: Map, + hasGlobalBlockingExecution: boolean, + hasSemanticParserGap: boolean, +): Omit | null { + const basedOnFindingIds = hypothesis.based_on_finding_ids + const manualJudgementFindingIds = basedOnFindingIds.filter( + findingId => findingsById.get(findingId)?.requires_manual_judgement === true, + ) + const blockingFindingIds = basedOnFindingIds.filter( + findingId => findingsById.get(findingId)?.is_blocking === true, + ) + + if (hypothesis.hypothesis_id.includes('real_output_semantic_parser_missing')) { + return { + based_on_hypothesis_ids: [hypothesis.hypothesis_id], + based_on_finding_ids: basedOnFindingIds, + proposal_type: 'evaluator_improvement', + target_layer: 'evaluator', + priority: 'P0', + queue_bucket: hasGlobalBlockingExecution ? 
'blocked' : 'top_recommendation', + description: + 'Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence.', + expected_effect: + 'Convert currently-null long-context semantic scores into rule-backed observed values where the output format is narrow enough.', + why_now: + 'This directly targets the two most important semantic nulls in the current real-smoke sample and does not require runtime harness changes.', + why_not_now: hasGlobalBlockingExecution + ? 'Execution failures must be resolved before evaluator improvements can be trusted.' + : null, + blocking_finding_ids: blockingFindingIds, + manual_judgement_finding_ids: manualJudgementFindingIds, + risks: hypothesis.risks, + requires_human_approval: true, + } + } + + if (hypothesis.hypothesis_id.includes('manual_review_boundary_still_open')) { + const queueBucket = hasGlobalBlockingExecution + ? 'blocked' + : hasSemanticParserGap + ? 'recommended_later' + : 'top_recommendation' + return { + based_on_hypothesis_ids: [hypothesis.hypothesis_id], + based_on_finding_ids: basedOnFindingIds, + proposal_type: 'scenario_improvement', + target_layer: 'scenario', + priority: 'P1', + queue_bucket: queueBucket, + description: + 'Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.', + expected_effect: + 'Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs.', + why_now: + hasSemanticParserGap + ? 'This is the cleanest way to narrow manual review once semantic evidence collection improves.' + : 'Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision.', + why_not_now: hasGlobalBlockingExecution + ? 'Execution failures must be resolved before contract-tightening can be evaluated.' 
+ : hasSemanticParserGap + ? 'By itself it does not convert null semantic scores into formal evidence, so it is best staged after parser work begins.' + : null, + blocking_finding_ids: blockingFindingIds, + manual_judgement_finding_ids: manualJudgementFindingIds, + risks: hypothesis.risks, + requires_human_approval: true, + } + } + + if (hypothesis.hypothesis_id.includes('manual_review_boundary_persisted_after_contract')) { + return { + based_on_hypothesis_ids: [hypothesis.hypothesis_id], + based_on_finding_ids: basedOnFindingIds, + proposal_type: 'feedback_contract_improvement', + target_layer: 'feedback_system', + priority: 'P1', + queue_bucket: hasGlobalBlockingExecution ? 'blocked' : 'top_recommendation', + description: + 'Stabilize the feedback input contract so an already-realized expectation-contract follow-up is detected and not re-recommended as the next top proposal.', + expected_effect: + 'Prevent proposal-loop duplication and keep approval cards aligned with the true next unresolved bottleneck.', + why_now: + 'The current source experiment already uses expectation_contract_v0, so repeating the same contract proposal would be a feedback-loop error rather than a useful next action.', + why_not_now: hasGlobalBlockingExecution + ? 'Execution failures must be resolved before feedback-contract stabilization can be trusted.' 
+ : null, + blocking_finding_ids: blockingFindingIds, + manual_judgement_finding_ids: manualJudgementFindingIds, + risks: hypothesis.risks, + requires_human_approval: true, + } + } + + if (hypothesis.hypothesis_id.includes('gate_inconclusive_due_to_missing_semantic_scores')) { + return { + based_on_hypothesis_ids: [hypothesis.hypothesis_id], + based_on_finding_ids: basedOnFindingIds, + proposal_type: 'score_binding_improvement', + target_layer: 'scorer', + priority: 'P1', + queue_bucket: 'blocked', + description: + 'Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk.', + expected_effect: + 'Reduce inconclusive gate results caused purely by absent semantic score evidence.', + why_now: + 'The gate cannot become more informative until parser output is formally bound into context scores.', + why_not_now: + 'This is blocked until a lightweight parser exists; there is nothing stable to bind before that.', + blocking_finding_ids: blockingFindingIds, + manual_judgement_finding_ids: manualJudgementFindingIds, + risks: hypothesis.risks, + requires_human_approval: true, + } + } + + if (hypothesis.hypothesis_id.includes('runner_or_scenario_instability')) { + return { + based_on_hypothesis_ids: [hypothesis.hypothesis_id], + based_on_finding_ids: basedOnFindingIds, + proposal_type: 'feedback_contract_improvement', + target_layer: 'feedback_system', + priority: 'P2', + queue_bucket: hasGlobalBlockingExecution ? 'blocked' : 'deferred', + description: + 'Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.', + expected_effect: + 'Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items.', + why_now: + 'This keeps the feedback system honest when stability evidence is weak or under-sampled.', + why_not_now: hasGlobalBlockingExecution + ? 
'Execution failures must be resolved before contract work can be meaningfully assessed.' + : 'The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred.', + blocking_finding_ids: blockingFindingIds, + manual_judgement_finding_ids: manualJudgementFindingIds, + risks: hypothesis.risks, + requires_human_approval: true, + } + } + + return null +} + +function buildImprovementProposals( + experimentId: string, + findings: EvalFinding[], + hypotheses: EvalHypothesis[], + generatedAtCompact: string, +): EvalImprovementProposal[] { + const findingsById = new Map(findings.map(item => [item.finding_id, item])) + const hasGlobalBlockingExecution = findings.some(item => item.finding_kind === 'execution_failure') + const hasSemanticParserGap = hypotheses.some(hypothesis => + hypothesis.hypothesis_id.includes('real_output_semantic_parser_missing'), + ) + const proposals: EvalImprovementProposal[] = [] + + for (const hypothesis of hypotheses) { + const seed = proposalSeedForHypothesis( + hypothesis, + findingsById, + hasGlobalBlockingExecution, + hasSemanticParserGap, + ) + if (!seed) continue + let label = 'proposal' + if (seed.description.includes('output parser')) label = 'add_long_context_output_parser_v0' + else if (seed.description.includes('expected facts')) label = 'tighten_real_smoke_expectations_v0' + else if (seed.description.includes('score-spec')) label = 'map_parser_output_to_context_scores_v0' + else if (seed.description.includes('already-realized expectation-contract')) { + label = 'stabilize_feedback_input_contract_after_contract_v0' + } else if (seed.description.includes('feedback input contract')) { + label = 'stabilize_feedback_input_contract_v0' + } + + proposals.push({ + proposal_id: buildId('proposal', experimentId, label, generatedAtCompact), + ...seed, + }) + } + + return proposals +} + +function buildCandidateVariantProposals( + experimentId: string, + proposals: EvalImprovementProposal[], + 
generatedAtCompact: string, +): EvalCandidateVariantProposal[] { + return proposals.map(proposal => { + if ( + proposal.proposal_type === 'evaluator_improvement' || + proposal.proposal_type === 'score_binding_improvement' + ) { + const variantName = proposal.proposal_id.includes('add_long_context_output_parser') + ? 'candidate_long_context_output_parser_v0' + : 'candidate_long_context_score_binding_v0' + return { + candidate_proposal_id: buildId( + 'candidate_proposal', + experimentId, + variantName, + generatedAtCompact, + ), + based_on_proposal_id: proposal.proposal_id, + change_layer: + proposal.proposal_type === 'evaluator_improvement' ? 'evaluator' : 'scorer', + variant_name: variantName, + implementation_scope: + 'Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.', + do_not_touch: [ + 'src/query.ts', + 'src/services/SessionMemory/sessionMemory.ts', + 'src/services/api/claude.ts', + ], + suggested_manifest_patch: { + proposed_variant_stub: { + variant_id: variantName, + name: variantName, + description: proposal.description, + change_layer: 'mixed', + notes: 'Evaluator-only candidate draft generated by V2.5 beta feedback loop.', + }, + implementation_hint: [ + 'Keep the human-review boundary explicit.', + proposal.proposal_type === 'evaluator_improvement' + ? 'Extend real-smoke output parsing for expected facts and retained constraints.' 
+ : 'Bind parser output into context score-spec fields without hiding uncertainty.', + ], + }, + } + } + + let variantName = 'candidate_feedback_input_contract_v0' + if (proposal.proposal_type === 'scenario_improvement') { + variantName = 'candidate_long_context_expectation_contract_v0' + } else if ( + proposal.proposal_type === 'feedback_contract_improvement' && + proposal.proposal_id.includes('after_contract') + ) { + variantName = 'candidate_feedback_input_contract_after_contract_v0' + } + + return { + candidate_proposal_id: buildId( + 'candidate_proposal', + experimentId, + variantName, + generatedAtCompact, + ), + based_on_proposal_id: proposal.proposal_id, + change_layer: + proposal.proposal_type === 'scenario_improvement' + ? 'scenario' + : 'feedback_system', + variant_name: variantName, + implementation_scope: + proposal.proposal_type === 'scenario_improvement' + ? 'Only scenario manifests, expected facts, constraints, and manual review prompts may change.' + : 'Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.', + do_not_touch: + proposal.proposal_type === 'scenario_improvement' + ? [ + 'src/query.ts', + 'src/services/SessionMemory/sessionMemory.ts', + 'runtime harness policy files', + ] + : [ + 'src/query.ts', + 'src/services/SessionMemory/sessionMemory.ts', + 'src/services/api/claude.ts', + ], + suggested_manifest_patch: { + proposed_variant_stub: { + variant_id: variantName, + name: variantName, + description: proposal.description, + change_layer: 'mixed', + notes: 'Contract-level draft generated by V2.5 beta feedback loop.', + }, + implementation_hint: + proposal.proposal_type === 'scenario_improvement' + ? 
[ + 'Tighten expected facts, constraints, and manual review prompts for real smoke.', + 'Do not change runtime policy in this candidate.', + ] + : [ + 'Keep feedback taxonomy stable and queue semantics explicit.', + 'Do not turn manual review into automatic pass.', + ], + }, + } + }) +} + +function uniqueScenarioIds(artifact: ExperimentRunArtifact): string[] { + const scenarioIds = new Set() + for (const item of asArray(artifact.long_context_summary)) { + if (typeof item.scenario_id === 'string' && item.scenario_id.trim() !== '') { + scenarioIds.add(item.scenario_id) + } + } + for (const item of asArray(artifact.stability_summary)) { + if (typeof item.scenario_id === 'string' && item.scenario_id.trim() !== '') { + scenarioIds.add(item.scenario_id) + } + } + return [...scenarioIds] +} + +function buildNextExperimentPlans( + experimentId: string, + artifact: ExperimentRunArtifact, + proposals: EvalImprovementProposal[], + candidateProposals: EvalCandidateVariantProposal[], + generatedAtCompact: string, +): EvalNextExperimentPlan[] { + const scenarioIds = uniqueScenarioIds(artifact) + return proposals.map(proposal => { + const candidateProposal = candidateProposals.find( + item => item.based_on_proposal_id === proposal.proposal_id, + ) + const scenarioSelection = + scenarioIds.length > 0 ? scenarioIds : ['long_context_fact_retrieval_real_smoke'] + + const evaluatorLike = + proposal.proposal_type === 'evaluator_improvement' || + proposal.proposal_type === 'score_binding_improvement' + + return { + next_experiment_plan_id: buildId( + 'experiment_plan', + experimentId, + candidateProposal?.variant_name ?? proposal.proposal_id, + generatedAtCompact, + ), + based_on_proposal_id: proposal.proposal_id, + scenario_ids: evaluatorLike + ? ['long_context_fact_retrieval_real_smoke'] + : scenarioSelection, + baseline_variant_id: 'baseline_default', + candidate_variant_id: + candidateProposal?.variant_name ?? 'candidate_feedback_followup_v0', + repeat_count: evaluatorLike ? 
2 : 1, + success_criteria: evaluatorLike + ? [ + 'retrieved_fact_hit_rate is no longer null for real smoke.', + 'constraint_retention_rate is no longer null for real smoke.', + 'manual_review_required does not increase.', + 'distractor_confusion_count remains 0.', + ] + : proposal.proposal_type === 'scenario_improvement' + ? [ + 'Manual review prompts become more specific and lower-ambiguity.', + 'Scenario intent remains matched.', + 'No new flaky or failed run groups appear.', + ] + : [ + 'Feedback queue semantics become stable and easier to approve.', + 'Top recommendation remains unique.', + 'No new schema ambiguity appears in feedback artifacts.', + ], + failure_criteria: evaluatorLike + ? [ + 'Parser introduces false positives against distractor-resistant scenarios.', + 'Manual review requirement increases or semantic scores become contradictory.', + ] + : proposal.proposal_type === 'scenario_improvement' + ? [ + 'Scenario contract changes erase the current runtime-difference evidence.', + 'Long-context intent becomes less specific or more brittle.', + ] + : [ + 'Feedback queue becomes contradictory or unstable across equivalent inputs.', + 'Manual review and human approval boundaries become harder to distinguish.', + ], + manual_review_required: true, + } + }) +} + +function buildProposalQueue(proposals: EvalImprovementProposal[]): ProposalQueueById { + const topRecommendation = proposals.find( + proposal => proposal.queue_bucket === 'top_recommendation', + ) + + return { + top_recommendation_proposal_id: topRecommendation?.proposal_id ?? 
null, + recommended_now_proposal_ids: proposals + .filter( + proposal => + proposal.queue_bucket === 'recommended_now' || + proposal.queue_bucket === 'top_recommendation', + ) + .map(proposal => proposal.proposal_id), + recommended_later_proposal_ids: proposals + .filter(proposal => proposal.queue_bucket === 'recommended_later') + .map(proposal => proposal.proposal_id), + deferred_proposal_ids: proposals + .filter(proposal => proposal.queue_bucket === 'deferred') + .map(proposal => proposal.proposal_id), + blocked_proposal_ids: proposals + .filter(proposal => proposal.queue_bucket === 'blocked') + .map(proposal => proposal.proposal_id), + } +} + +function buildApprovalCard( + proposals: EvalImprovementProposal[], + candidateProposals: EvalCandidateVariantProposal[], + nextExperimentPlans: EvalNextExperimentPlan[], + proposalQueue: ProposalQueueById, + proposalRefById: Map, + nextPlanRefByProposalId: Map, +): EvalFeedbackApprovalCard { + const topProposal = proposals.find( + proposal => proposal.proposal_id === proposalQueue.top_recommendation_proposal_id, + ) + const fallbackWhyNow = + 'No top recommendation was produced. Review findings manually before approving any proposal.' + + if (!topProposal) { + return { + current_top_recommendation_proposal_ref: null, + why_now: fallbackWhyNow, + why_not_others_yet: [], + approval_scope: 'No approval scope generated.', + do_not_touch: [], + next_experiment_plan_ref: null, + success_criteria: [], + risks: [], + manual_review_boundary: + 'Manual review remains required. 
Do not treat unresolved semantic checks as automatic pass.', + } + } + + const topCandidate = candidateProposals.find( + proposal => proposal.based_on_proposal_id === topProposal.proposal_id, + ) + const topPlan = nextExperimentPlans.find( + plan => plan.based_on_proposal_id === topProposal.proposal_id, + ) + const whyNotOthersYet = proposals + .filter(proposal => proposal.proposal_id !== topProposal.proposal_id) + .map( + proposal => + `${proposal.proposal_id}: ${proposal.queue_bucket}${ + proposal.why_not_now ? ` - ${proposal.why_not_now}` : '' + }`, + ) + + return { + current_top_recommendation_proposal_ref: + proposalRefById.get(topProposal.proposal_id) ?? null, + why_now: topProposal.why_now, + why_not_others_yet: whyNotOthersYet, + approval_scope: + topCandidate?.implementation_scope ?? + 'Approval is limited to the proposal scope recorded in the matching candidate draft.', + do_not_touch: topCandidate?.do_not_touch ?? [], + next_experiment_plan_ref: + nextPlanRefByProposalId.get(topProposal.proposal_id) ?? null, + success_criteria: topPlan?.success_criteria ?? [], + risks: topProposal.risks, + manual_review_boundary: + 'Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks.', + } +} + +function buildMarkdownReport(params: { + feedbackRunId: string + generatedAt: string + sourceExperimentRunRef: string + sourceReportRefs: string[] + findings: EvalFinding[] + hypotheses: EvalHypothesis[] + proposals: EvalImprovementProposal[] + candidateProposals: EvalCandidateVariantProposal[] + nextExperimentPlans: EvalNextExperimentPlan[] + proposalQueue: EvalFeedbackProposalQueue + blockingFindingRefs: string[] + manualJudgementFindingRefs: string[] + autoResolvableFindingRefs: string[] + approvalCard: EvalFeedbackApprovalCard + proposalRefById: Map +}): string { + const findingLines = + params.findings.length === 0 + ? 
['- No findings generated.'] + : params.findings.map( + finding => + `- ${finding.finding_id}\n - type: ${finding.finding_type}\n - kind: ${finding.finding_kind}\n - severity: ${finding.severity}\n - scope: ${finding.scope}\n - scope_ref: ${finding.scope_ref}\n - summary: ${finding.summary}\n - evidence_ref: ${finding.evidence_ref}\n - is_blocking: ${String(finding.is_blocking)}\n - requires_manual_judgement: ${String(finding.requires_manual_judgement)}\n - auto_resolvable: ${String(finding.auto_resolvable)}\n - fact_or_inference: ${finding.fact_or_inference}`, + ) + + const hypothesisLines = + params.hypotheses.length === 0 + ? ['- No hypotheses generated.'] + : params.hypotheses.map( + hypothesis => + `- ${hypothesis.hypothesis_id}\n - confidence: ${hypothesis.confidence}\n - based_on: ${hypothesis.based_on_finding_ids.join(', ')}\n - depends_on_finding_refs: ${hypothesis.depends_on_finding_refs.join(' | ')}\n - hypothesis: ${hypothesis.hypothesis}\n - falsifiable_by: ${hypothesis.falsifiable_by.join(' | ')}\n - risks: ${hypothesis.risks.join(' | ')}\n - fact_or_inference: ${hypothesis.fact_or_inference}`, + ) + + const proposalLines = + params.proposals.length === 0 + ? ['- No proposals generated.'] + : params.proposals.map( + proposal => + `- ${proposal.proposal_id}\n - type: ${proposal.proposal_type}\n - target_layer: ${proposal.target_layer}\n - priority: ${proposal.priority}\n - queue_bucket: ${proposal.queue_bucket}\n - description: ${proposal.description}\n - expected_effect: ${proposal.expected_effect}\n - why_now: ${proposal.why_now}\n - why_not_now: ${proposal.why_not_now ?? 'n/a'}\n - blocking_finding_ids: ${proposal.blocking_finding_ids.join(' | ') || 'none'}\n - manual_judgement_finding_ids: ${proposal.manual_judgement_finding_ids.join(' | ') || 'none'}\n - risks: ${proposal.risks.join(' | ')}\n - requires_human_approval: true`, + ) + + const candidateLines = + params.candidateProposals.length === 0 + ? 
['- No candidate variant proposals generated.'] + : params.candidateProposals.map( + candidate => + `- ${candidate.candidate_proposal_id}\n - variant_name: ${candidate.variant_name}\n - change_layer: ${candidate.change_layer}\n - implementation_scope: ${candidate.implementation_scope}\n - do_not_touch: ${candidate.do_not_touch.join(' | ')}`, + ) + + const nextPlanLines = + params.nextExperimentPlans.length === 0 + ? ['- No next experiment plans generated.'] + : params.nextExperimentPlans.map( + plan => + `- ${plan.next_experiment_plan_id}\n - candidate_variant_id: ${plan.candidate_variant_id}\n - scenario_ids: ${plan.scenario_ids.join(', ')}\n - repeat_count: ${plan.repeat_count}\n - success_criteria: ${plan.success_criteria.join(' | ')}\n - failure_criteria: ${plan.failure_criteria.join(' | ')}\n - manual_review_required: ${String(plan.manual_review_required)}`, + ) + + const topRecommendation = + params.approvalCard.current_top_recommendation_proposal_ref ?? 'none' + + return `# V2.5 Beta Feedback Report: ${params.feedbackRunId} + +## Understanding + +- source_experiment_run: ${params.sourceExperimentRunRef} +- source_reports: +${params.sourceReportRefs.map(ref => ` - ${ref}`).join('\n')} +- generated_at: ${params.generatedAt} +- this report is advisory only and does not apply code changes automatically + +## Human Approval Card + +- current_top_recommendation: ${topRecommendation} +- why_now: ${params.approvalCard.why_now} +- why_not_others_yet: +${params.approvalCard.why_not_others_yet.length > 0 ? params.approvalCard.why_not_others_yet.map(item => ` - ${item}`).join('\n') : ' - none'} +- approval_scope: ${params.approvalCard.approval_scope} +- do_not_touch: ${params.approvalCard.do_not_touch.join(' | ') || 'none'} +- next_experiment_plan_ref: ${params.approvalCard.next_experiment_plan_ref ?? 'none'} +- success_criteria: +${params.approvalCard.success_criteria.length > 0 ? 
params.approvalCard.success_criteria.map(item => ` - ${item}`).join('\n') : ' - none'} +- risks: +${params.approvalCard.risks.length > 0 ? params.approvalCard.risks.map(item => ` - ${item}`).join('\n') : ' - none'} +- manual_review_boundary: ${params.approvalCard.manual_review_boundary} + +## Proposal Queue + +- top_recommendation: + - ${params.proposalQueue.top_recommendation_proposal_ref ?? 'none'} +- recommended_now: +${params.proposalQueue.recommended_now_proposal_refs.length > 0 ? params.proposalQueue.recommended_now_proposal_refs.map(ref => ` - ${ref}`).join('\n') : ' - none'} +- recommended_later: +${params.proposalQueue.recommended_later_proposal_refs.length > 0 ? params.proposalQueue.recommended_later_proposal_refs.map(ref => ` - ${ref}`).join('\n') : ' - none'} +- deferred: +${params.proposalQueue.deferred_proposal_refs.length > 0 ? params.proposalQueue.deferred_proposal_refs.map(ref => ` - ${ref}`).join('\n') : ' - none'} +- blocked: +${params.proposalQueue.blocked_proposal_refs.length > 0 ? params.proposalQueue.blocked_proposal_refs.map(ref => ` - ${ref}`).join('\n') : ' - none'} + +## Approval Contract + +- blocking_findings: +${params.blockingFindingRefs.length > 0 ? params.blockingFindingRefs.map(ref => ` - ${ref}`).join('\n') : ' - none'} +- manual_judgement_required_findings: +${params.manualJudgementFindingRefs.length > 0 ? params.manualJudgementFindingRefs.map(ref => ` - ${ref}`).join('\n') : ' - none'} +- auto_resolvable_findings: +${params.autoResolvableFindingRefs.length > 0 ? 
params.autoResolvableFindingRefs.map(ref => ` - ${ref}`).join('\n') : ' - none'} + +## Findings + +${findingLines.join('\n')} + +## Hypotheses + +${hypothesisLines.join('\n')} + +## Improvement Proposals + +${proposalLines.join('\n')} + +## Candidate Variant Proposals + +${candidateLines.join('\n')} + +## Next Experiment Plans + +${nextPlanLines.join('\n')} + +## Human Approval Required + +- yes +- no proposal in this report has been auto-implemented +- findings are facts; hypotheses and proposals are reviewable inferences +` +} + +const args = parseArgs(process.argv.slice(2)) +const experimentRunArg = args['experiment-run'] +if (typeof experimentRunArg !== 'string' || experimentRunArg.trim() === '') { + console.error( + 'Usage: bun run scripts/evals/v2_run_feedback.ts --experiment-run ', + ) + process.exit(1) +} + +const experimentRunAbsolute = path.resolve(repoRoot, experimentRunArg) +const experimentRunRef = toRepoRelative(experimentRunAbsolute) +const artifact = await readJson(experimentRunAbsolute) +const experimentId = assertString(artifact.experiment_id, 'experiment_id') +const generatedAt = new Date().toISOString() +const generatedAtCompact = generatedAt.replace(/[-:.]/g, '') +const feedbackRunId = buildId('feedback_run', experimentId, 'beta', generatedAtCompact) + +await ensureDirectory('tests/evals/v2/feedback/findings') +await ensureDirectory('tests/evals/v2/feedback/hypotheses') +await ensureDirectory('tests/evals/v2/feedback/proposals') +await ensureDirectory('tests/evals/v2/feedback/candidate-proposals') +await ensureDirectory('tests/evals/v2/feedback/experiment-plans') +await ensureDirectory('tests/evals/v2/feedback/runs') +await ensureDirectory('ObservrityTask/10-系统版本/v2/07-反馈报告') + +const findings = extractFindings(experimentRunRef, artifact, generatedAtCompact) +const hypotheses = buildHypotheses(experimentId, artifact, findings, generatedAtCompact) +const proposals = buildImprovementProposals( + experimentId, + findings, + hypotheses, + 
generatedAtCompact, +) +const candidateProposals = buildCandidateVariantProposals( + experimentId, + proposals, + generatedAtCompact, +) +const nextExperimentPlans = buildNextExperimentPlans( + experimentId, + artifact, + proposals, + candidateProposals, + generatedAtCompact, +) +const proposalQueueById = buildProposalQueue(proposals) + +const findingRefs: string[] = [] +for (const finding of findings) { + const relativePath = `tests/evals/v2/feedback/findings/${finding.finding_id}.json` + await writeJson(relativePath, finding) + findingRefs.push(relativePath) +} + +const hypothesisRefs: string[] = [] +for (const hypothesis of hypotheses) { + const relativePath = `tests/evals/v2/feedback/hypotheses/${hypothesis.hypothesis_id}.json` + await writeJson(relativePath, hypothesis) + hypothesisRefs.push(relativePath) +} + +const proposalRefs: string[] = [] +const proposalRefById = new Map() +for (const proposal of proposals) { + const relativePath = `tests/evals/v2/feedback/proposals/${proposal.proposal_id}.json` + await writeJson(relativePath, proposal) + proposalRefs.push(relativePath) + proposalRefById.set(proposal.proposal_id, relativePath) +} + +const candidateProposalRefs: string[] = [] +for (const proposal of candidateProposals) { + const relativePath = `tests/evals/v2/feedback/candidate-proposals/${proposal.candidate_proposal_id}.json` + await writeJson(relativePath, proposal) + candidateProposalRefs.push(relativePath) +} + +const nextExperimentPlanRefs: string[] = [] +const nextPlanRefByProposalId = new Map() +for (const plan of nextExperimentPlans) { + const relativePath = `tests/evals/v2/feedback/experiment-plans/${plan.next_experiment_plan_id}.json` + await writeJson(relativePath, plan) + nextExperimentPlanRefs.push(relativePath) + nextPlanRefByProposalId.set(plan.based_on_proposal_id, relativePath) +} + +const proposalQueue: EvalFeedbackProposalQueue = { + top_recommendation_proposal_ref: + proposalQueueById.top_recommendation_proposal_id + ? 
proposalRefById.get(proposalQueueById.top_recommendation_proposal_id) ?? null + : null, + recommended_now_proposal_refs: uniq( + proposalQueueById.recommended_now_proposal_ids + .map(proposalId => proposalRefById.get(proposalId) ?? '') + .filter(Boolean), + ), + recommended_later_proposal_refs: uniq( + proposalQueueById.recommended_later_proposal_ids + .map(proposalId => proposalRefById.get(proposalId) ?? '') + .filter(Boolean), + ), + deferred_proposal_refs: uniq( + proposalQueueById.deferred_proposal_ids + .map(proposalId => proposalRefById.get(proposalId) ?? '') + .filter(Boolean), + ), + blocked_proposal_refs: uniq( + proposalQueueById.blocked_proposal_ids + .map(proposalId => proposalRefById.get(proposalId) ?? '') + .filter(Boolean), + ), +} + +const blockingFindingRefs = uniq( + findings + .filter(finding => finding.is_blocking) + .map(finding => `tests/evals/v2/feedback/findings/${finding.finding_id}.json`), +) +const manualJudgementFindingRefs = uniq( + findings + .filter(finding => finding.requires_manual_judgement) + .map(finding => `tests/evals/v2/feedback/findings/${finding.finding_id}.json`), +) +const autoResolvableFindingRefs = uniq( + findings + .filter(finding => finding.auto_resolvable) + .map(finding => `tests/evals/v2/feedback/findings/${finding.finding_id}.json`), +) + +const approvalCard = buildApprovalCard( + proposals, + candidateProposals, + nextExperimentPlans, + proposalQueueById, + proposalRefById, + nextPlanRefByProposalId, +) + +const sourceReportRefs = asArray(artifact.report_refs) +const reportRelativePath = `ObservrityTask/10-系统版本/v2/07-反馈报告/${feedbackRunId}.md` +await writeMarkdown( + reportRelativePath, + buildMarkdownReport({ + feedbackRunId, + generatedAt, + sourceExperimentRunRef: experimentRunRef, + sourceReportRefs, + findings, + hypotheses, + proposals, + candidateProposals, + nextExperimentPlans, + proposalQueue, + blockingFindingRefs, + manualJudgementFindingRefs, + autoResolvableFindingRefs, + approvalCard, + 
proposalRefById, + }), +) + +const feedbackRun: EvalFeedbackRun = { + feedback_run_id: feedbackRunId, + taxonomy_version: 'v2_5_beta', + generated_at: generatedAt, + source_experiment_id: experimentId, + source_experiment_run_ref: experimentRunRef, + source_report_refs: sourceReportRefs, + finding_refs: findingRefs, + hypothesis_refs: hypothesisRefs, + proposal_refs: proposalRefs, + candidate_proposal_refs: candidateProposalRefs, + next_experiment_plan_refs: nextExperimentPlanRefs, + proposal_queue: proposalQueue, + blocking_finding_refs: blockingFindingRefs, + manual_judgement_required_finding_refs: manualJudgementFindingRefs, + auto_resolvable_finding_refs: autoResolvableFindingRefs, + approval_card: approvalCard, + report_ref: reportRelativePath, + human_approval_required: true, + status: 'completed', +} + +const feedbackRunRelativePath = `tests/evals/v2/feedback/runs/${feedbackRunId}.json` +await writeJson(feedbackRunRelativePath, feedbackRun) + +console.log( + JSON.stringify( + { + feedback_run_id: feedbackRunId, + taxonomy_version: feedbackRun.taxonomy_version, + source_experiment_id: experimentId, + source_experiment_run_ref: experimentRunRef, + findings: findings.length, + hypotheses: hypotheses.length, + proposals: proposals.length, + candidate_proposals: candidateProposals.length, + next_experiment_plans: nextExperimentPlans.length, + top_recommendation_proposal_ref: proposalQueue.top_recommendation_proposal_ref, + report_ref: reportRelativePath, + feedback_run_ref: feedbackRunRelativePath, + human_approval_required: true, + }, + null, + 2, + ), +) diff --git a/scripts/evals/v2_score_registry.ts b/scripts/evals/v2_score_registry.ts index 26650bc172..20d21515b5 100644 --- a/scripts/evals/v2_score_registry.ts +++ b/scripts/evals/v2_score_registry.ts @@ -12,6 +12,7 @@ export interface V2ScoreInput { subagents: JsonRecord[] recoveries: JsonRecord[] variantEffect?: JsonRecord + longContext?: JsonRecord } type V2ScoreScorer = (input: V2ScoreInput) => EvalScore 
@@ -32,6 +33,45 @@ function scoreLabel(value: number): string { return 'fail' } +function longContextStringArray(evidence: JsonRecord | undefined, key: string): string[] { + const value = evidence?.[key] + if (!Array.isArray(value)) return [] + return value.filter((item): item is string => typeof item === 'string' && item.length > 0) +} + +function longContextNumber(evidence: JsonRecord | undefined, key: string): number | null { + if (!evidence || evidence[key] === undefined || evidence[key] === null) return null + return asNumber(evidence[key]) +} + +function ratio(numerator: number, denominator: number): number | null { + if (denominator <= 0) return null + return Number((numerator / denominator).toFixed(6)) +} + +function contextManualReviewScore( + params: Pick, +): EvalScore { + const { runId, longContext, scenario } = params + const questions = + longContextStringArray(longContext, 'manual_review_questions').length > 0 + ? longContextStringArray(longContext, 'manual_review_questions') + : scenario.manual_review_questions ?? [] + return { + score_id: `${runId}_context_manual_review_required`, + run_id: runId, + dimension: 'context', + subdimension: 'manual_review_required', + score_value: questions.length > 0 ? 1 : 0, + score_label: questions.length > 0 ? 'manual_review_required' : 'not_applicable', + evidence_ref: 'long_context_evidence.manual_review_questions', + reason: + questions.length > 0 + ? `Manual review remains required. 
Questions: ${questions.join(' | ')}` + : 'No manual review questions were configured for this run.', + } +} + export function scoreKey(score: EvalScore): string { return `${score.dimension}.${score.subdimension}` } @@ -224,6 +264,167 @@ export const V2_SCORE_SCORERS: Record = { : `subagent_count=${count}; budget=${limit}.`, } }, + + 'context.retained_constraint_count': ({ runId, longContext }) => { + const retained = longContextStringArray( + longContext, + 'observed_retained_constraints', + ).length + return { + score_id: `${runId}_context_retained_constraint_count`, + run_id: runId, + dimension: 'context', + subdimension: 'retained_constraint_count', + score_value: retained, + score_label: 'observed', + evidence_ref: 'long_context_evidence.observed_retained_constraints', + reason: `Observed ${retained} retained constraints from long-context evidence.`, + } + }, + + 'context.lost_constraint_count': ({ runId, longContext }) => { + const lost = longContextStringArray(longContext, 'observed_lost_constraints').length + return { + score_id: `${runId}_context_lost_constraint_count`, + run_id: runId, + dimension: 'context', + subdimension: 'lost_constraint_count', + score_value: lost, + score_label: 'observed', + evidence_ref: 'long_context_evidence.observed_lost_constraints', + reason: `Observed ${lost} lost constraints from long-context evidence.`, + } + }, + + 'context.constraint_retention_rate': ({ runId, longContext }) => { + const retained = longContextStringArray( + longContext, + 'observed_retained_constraints', + ).length + const lost = longContextStringArray(longContext, 'observed_lost_constraints').length + const value = ratio(retained, retained + lost) + return { + score_id: `${runId}_context_constraint_retention_rate`, + run_id: runId, + dimension: 'context', + subdimension: 'constraint_retention_rate', + score_value: value, + score_label: value === null ? 
'inconclusive' : scoreLabel(value), + evidence_ref: 'long_context_evidence.observed_retained_constraints', + reason: + value === null + ? 'No retained/lost constraint evidence was available.' + : `Constraint retention rate=${value} from retained=${retained}, lost=${lost}.`, + } + }, + + 'context.retrieved_fact_hit_rate': ({ runId, longContext }) => { + const retrieved = longContextStringArray(longContext, 'observed_retrieved_facts').length + const missed = longContextStringArray(longContext, 'observed_missed_facts').length + const value = ratio(retrieved, retrieved + missed) + return { + score_id: `${runId}_context_retrieved_fact_hit_rate`, + run_id: runId, + dimension: 'context', + subdimension: 'retrieved_fact_hit_rate', + score_value: value, + score_label: value === null ? 'inconclusive' : scoreLabel(value), + evidence_ref: 'long_context_evidence.observed_retrieved_facts', + reason: + value === null + ? 'No retrieved/missed fact evidence was available.' + : `Retrieved fact hit rate=${value} from hits=${retrieved}, missed=${missed}.`, + } + }, + + 'context.distractor_confusion_count': ({ runId, longContext }) => { + const confusions = longContextStringArray(longContext, 'observed_confusions').length + return { + score_id: `${runId}_context_distractor_confusion_count`, + run_id: runId, + dimension: 'context', + subdimension: 'distractor_confusion_count', + score_value: confusions, + score_label: 'observed', + evidence_ref: 'long_context_evidence.observed_confusions', + reason: `Observed ${confusions} distractor confusions from long-context evidence.`, + } + }, + + 'context.total_prompt_input_tokens': ({ runId, action }) => ({ + score_id: `${runId}_context_total_prompt_input_tokens`, + run_id: runId, + dimension: 'context', + subdimension: 'total_prompt_input_tokens', + score_value: asNumber(action.total_prompt_input_tokens), + score_label: 'observed', + evidence_ref: 'user_actions.total_prompt_input_tokens', + reason: 'Raw prompt-input cost fact from V1 
user_actions.', + }), + + 'context.compaction_trigger_count': ({ runId, longContext }) => { + const count = longContextNumber(longContext, 'compaction_trigger_count') + return { + score_id: `${runId}_context_compaction_trigger_count`, + run_id: runId, + dimension: 'context', + subdimension: 'compaction_trigger_count', + score_value: count, + score_label: count === null ? 'inconclusive' : 'observed', + evidence_ref: 'long_context_evidence.compaction_trigger_count', + reason: + count === null + ? 'No compaction trigger evidence was available.' + : `Observed compaction_trigger_count=${count}.`, + } + }, + + 'context.compaction_saved_tokens': ({ runId, longContext }) => { + const saved = longContextNumber(longContext, 'compaction_saved_tokens') + return { + score_id: `${runId}_context_compaction_saved_tokens`, + run_id: runId, + dimension: 'context', + subdimension: 'compaction_saved_tokens', + score_value: saved, + score_label: saved === null ? 'inconclusive' : 'observed', + evidence_ref: 'long_context_evidence.compaction_saved_tokens', + reason: + saved === null + ? 'No compaction saved-token evidence was available.' + : `Observed compaction_saved_tokens=${saved}.`, + } + }, + + 'context.success_under_context_pressure': ({ runId, rootQuery, longContext }) => { + const explicit = longContextNumber(longContext, 'success_under_context_pressure') + const value = + explicit !== null ? explicit : rootQuery ? 1 : 0 + return { + score_id: `${runId}_context_success_under_context_pressure`, + run_id: runId, + dimension: 'context', + subdimension: 'success_under_context_pressure', + score_value: value, + score_label: scoreLabel(value), + evidence_ref: + explicit !== null + ? 'long_context_evidence.success_under_context_pressure' + : 'queries', + reason: + explicit !== null + ? `Fixture/runtime evidence marked success_under_context_pressure=${explicit}.` + : rootQuery + ? 'Fallback success signal: root query exists.' 
+ : 'No root query or explicit success-under-pressure evidence was found.', + } + }, + + 'context.manual_review_required': ({ runId, longContext, scenario }) => + contextManualReviewScore({ runId, longContext, scenario }), + + 'context.manual_quality_review_required': ({ runId, longContext, scenario }) => + contextManualReviewScore({ runId, longContext, scenario }), } export function listImplementedScoreSpecIds(): string[] { diff --git a/scripts/evals/v2_validate_experiment_artifacts.ts b/scripts/evals/v2_validate_experiment_artifacts.ts index 68622abe6f..c8d97f05d0 100644 --- a/scripts/evals/v2_validate_experiment_artifacts.ts +++ b/scripts/evals/v2_validate_experiment_artifacts.ts @@ -9,6 +9,12 @@ const gateStatuses = new Set(['pass', 'warning', 'fail', 'inconclusive']) const validityStatuses = new Set(['valid', 'invalid', 'inconclusive']) const reportProfiles = new Set(['smoke', 'real_experiment']) const evaluationIntents = new Set(['regression', 'exploration']) +const longContextReviewVerdicts = new Set([ + 'pass', + 'warning', + 'needs_manual_review', + 'invalid', +]) async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as JsonRecord @@ -118,6 +124,18 @@ function validateArtifact(filePath: string, artifact: JsonRecord): string[] { if (artifact.runtime_difference_summary !== undefined) { requireArray(errors, filePath, 'runtime_difference_summary', artifact.runtime_difference_summary) } + if ( + artifact.long_context_review_verdict !== undefined && + artifact.long_context_review_verdict !== null && + !longContextReviewVerdicts.has(String(artifact.long_context_review_verdict)) + ) { + errors.push( + `${filePath}.long_context_review_verdict has invalid value: ${artifact.long_context_review_verdict}`, + ) + } + if (artifact.long_context_summary !== undefined) { + requireArray(errors, filePath, 'long_context_summary', artifact.long_context_summary) + } if (artifact.experiment_validity !== undefined) { 
requireObject(errors, filePath, 'experiment_validity', artifact.experiment_validity) const validity = artifact.experiment_validity as JsonRecord diff --git a/scripts/evals/v2_validate_feedback_artifacts.ts b/scripts/evals/v2_validate_feedback_artifacts.ts new file mode 100644 index 0000000000..f34dc98802 --- /dev/null +++ b/scripts/evals/v2_validate_feedback_artifacts.ts @@ -0,0 +1,549 @@ +import { access, readFile, readdir } from 'node:fs/promises' +import path from 'node:path' + +type JsonRecord = Record + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const feedbackRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'feedback') +const feedbackRunsRoot = path.join(feedbackRoot, 'runs') + +const betaSeverity = new Set(['info', 'warning', 'blocking']) +const legacySeverity = new Set(['low', 'medium', 'high']) +const factOrInference = new Set(['fact', 'inference']) +const findingKinds = new Set([ + 'missing_score', + 'manual_review_boundary', + 'runtime_observation_gap', + 'stability_gap', + 'execution_failure', +]) +const scopes = new Set(['experiment', 'scenario', 'variant', 'run_group', 'run']) +const proposalTypes = new Set([ + 'evaluator_improvement', + 'score_binding_improvement', + 'scenario_improvement', + 'feedback_contract_improvement', + 'harness_candidate_improvement', +]) +const targetLayers = new Set([ + 'evaluator', + 'scorer', + 'scenario', + 'harness', + 'report', + 'feedback_system', + 'mixed', +]) +const priorities = new Set(['P0', 'P1', 'P2']) +const queueBuckets = new Set([ + 'top_recommendation', + 'recommended_now', + 'recommended_later', + 'deferred', + 'blocked', +]) +const confidenceValues = new Set(['low', 'medium', 'high']) + +async function readJson(filePath: string): Promise { + return JSON.parse(await readFile(filePath, 'utf8')) as JsonRecord +} + +function requireString(errors: string[], objectName: string, fieldName: string, value: unknown) { + if (typeof value !== 'string' || value.trim() === '') { + 
errors.push(`${objectName}.${fieldName} must be a non-empty string`) + } +} + +function requireArray(errors: string[], objectName: string, fieldName: string, value: unknown) { + if (!Array.isArray(value)) { + errors.push(`${objectName}.${fieldName} must be an array`) + } +} + +function requireBoolean( + errors: string[], + objectName: string, + fieldName: string, + value: unknown, +) { + if (typeof value !== 'boolean') { + errors.push(`${objectName}.${fieldName} must be a boolean`) + } +} + +function requireObject(errors: string[], objectName: string, fieldName: string, value: unknown) { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + errors.push(`${objectName}.${fieldName} must be an object`) + } +} + +function requireStringArray( + errors: string[], + objectName: string, + fieldName: string, + value: unknown, +) { + if (!Array.isArray(value) || value.some(item => typeof item !== 'string')) { + errors.push(`${objectName}.${fieldName} must be an array of strings`) + } +} + +function validateLegacyRun(filePath: string, artifact: JsonRecord): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'feedback_run_id', artifact.feedback_run_id) + requireString(errors, filePath, 'generated_at', artifact.generated_at) + requireString(errors, filePath, 'source_experiment_id', artifact.source_experiment_id) + requireString( + errors, + filePath, + 'source_experiment_run_ref', + artifact.source_experiment_run_ref, + ) + requireArray(errors, filePath, 'finding_refs', artifact.finding_refs) + requireArray(errors, filePath, 'hypothesis_refs', artifact.hypothesis_refs) + requireArray(errors, filePath, 'proposal_refs', artifact.proposal_refs) + requireArray( + errors, + filePath, + 'candidate_proposal_refs', + artifact.candidate_proposal_refs, + ) + requireArray( + errors, + filePath, + 'next_experiment_plan_refs', + artifact.next_experiment_plan_refs, + ) + requireString(errors, filePath, 'report_ref', artifact.report_ref) + if 
(artifact.human_approval_required !== true) { + errors.push(`${filePath}.human_approval_required must be true`) + } + if (artifact.status !== 'completed') { + errors.push(`${filePath}.status must be completed`) + } + return errors +} + +async function fileExists(relativePath: string): Promise { + try { + await access(path.join(repoRoot, relativePath)) + return true + } catch { + return false + } +} + +function validateFinding(filePath: string, finding: JsonRecord, strictBeta: boolean): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'finding_id', finding.finding_id) + requireString(errors, filePath, 'source_experiment_id', finding.source_experiment_id) + requireString(errors, filePath, 'source_report_ref', finding.source_report_ref) + requireString(errors, filePath, 'finding_type', finding.finding_type) + requireString(errors, filePath, 'summary', finding.summary) + requireString(errors, filePath, 'evidence_ref', finding.evidence_ref) + if (!factOrInference.has(String(finding.fact_or_inference)) || finding.fact_or_inference !== 'fact') { + errors.push(`${filePath}.fact_or_inference must be fact`) + } + + if (strictBeta) { + if (!betaSeverity.has(String(finding.severity))) { + errors.push(`${filePath}.severity has invalid beta value: ${finding.severity}`) + } + if (!findingKinds.has(String(finding.finding_kind))) { + errors.push(`${filePath}.finding_kind has invalid value: ${finding.finding_kind}`) + } + if (!scopes.has(String(finding.scope))) { + errors.push(`${filePath}.scope has invalid value: ${finding.scope}`) + } + requireString(errors, filePath, 'scope_ref', finding.scope_ref) + requireBoolean(errors, filePath, 'is_blocking', finding.is_blocking) + requireBoolean( + errors, + filePath, + 'requires_manual_judgement', + finding.requires_manual_judgement, + ) + requireBoolean(errors, filePath, 'auto_resolvable', finding.auto_resolvable) + } else if (!legacySeverity.has(String(finding.severity))) { + errors.push(`${filePath}.severity 
has invalid legacy value: ${finding.severity}`) + } + + return errors +} + +function validateHypothesis( + filePath: string, + hypothesis: JsonRecord, + strictBeta: boolean, +): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'hypothesis_id', hypothesis.hypothesis_id) + requireArray(errors, filePath, 'based_on_finding_ids', hypothesis.based_on_finding_ids) + requireString(errors, filePath, 'hypothesis', hypothesis.hypothesis) + requireArray( + errors, + filePath, + 'supporting_evidence_refs', + hypothesis.supporting_evidence_refs, + ) + requireArray(errors, filePath, 'risks', hypothesis.risks) + if (!factOrInference.has(String(hypothesis.fact_or_inference)) || hypothesis.fact_or_inference !== 'inference') { + errors.push(`${filePath}.fact_or_inference must be inference`) + } + if (!confidenceValues.has(String(hypothesis.confidence))) { + errors.push(`${filePath}.confidence has invalid value: ${hypothesis.confidence}`) + } + + if (strictBeta) { + requireArray(errors, filePath, 'depends_on_finding_refs', hypothesis.depends_on_finding_refs) + requireArray(errors, filePath, 'falsifiable_by', hypothesis.falsifiable_by) + } + + return errors +} + +function validateProposal(filePath: string, proposal: JsonRecord, strictBeta: boolean): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'proposal_id', proposal.proposal_id) + requireArray(errors, filePath, 'based_on_hypothesis_ids', proposal.based_on_hypothesis_ids) + requireString(errors, filePath, 'description', proposal.description) + requireString(errors, filePath, 'expected_effect', proposal.expected_effect) + requireArray(errors, filePath, 'risks', proposal.risks) + if (proposal.requires_human_approval !== true) { + errors.push(`${filePath}.requires_human_approval must be true`) + } + if (!proposalTypes.has(String(proposal.proposal_type))) { + errors.push(`${filePath}.proposal_type has invalid value: ${proposal.proposal_type}`) + } + if 
(!targetLayers.has(String(proposal.target_layer))) { + errors.push(`${filePath}.target_layer has invalid value: ${proposal.target_layer}`) + } + + if (strictBeta) { + requireArray(errors, filePath, 'based_on_finding_ids', proposal.based_on_finding_ids) + if (!priorities.has(String(proposal.priority))) { + errors.push(`${filePath}.priority has invalid value: ${proposal.priority}`) + } + if (!queueBuckets.has(String(proposal.queue_bucket))) { + errors.push(`${filePath}.queue_bucket has invalid value: ${proposal.queue_bucket}`) + } + requireString(errors, filePath, 'why_now', proposal.why_now) + if (proposal.why_not_now !== null && proposal.why_not_now !== undefined) { + requireString(errors, filePath, 'why_not_now', proposal.why_not_now) + } + requireArray(errors, filePath, 'blocking_finding_ids', proposal.blocking_finding_ids) + requireArray( + errors, + filePath, + 'manual_judgement_finding_ids', + proposal.manual_judgement_finding_ids, + ) + } + + return errors +} + +function validateCandidateProposal(filePath: string, artifact: JsonRecord): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'candidate_proposal_id', artifact.candidate_proposal_id) + requireString(errors, filePath, 'based_on_proposal_id', artifact.based_on_proposal_id) + requireString(errors, filePath, 'change_layer', artifact.change_layer) + requireString(errors, filePath, 'variant_name', artifact.variant_name) + requireString(errors, filePath, 'implementation_scope', artifact.implementation_scope) + requireStringArray(errors, filePath, 'do_not_touch', artifact.do_not_touch) + requireObject(errors, filePath, 'suggested_manifest_patch', artifact.suggested_manifest_patch) + return errors +} + +function validateExperimentPlan(filePath: string, artifact: JsonRecord): string[] { + const errors: string[] = [] + requireString(errors, filePath, 'next_experiment_plan_id', artifact.next_experiment_plan_id) + requireString(errors, filePath, 'based_on_proposal_id', 
artifact.based_on_proposal_id) + requireStringArray(errors, filePath, 'scenario_ids', artifact.scenario_ids) + requireString(errors, filePath, 'baseline_variant_id', artifact.baseline_variant_id) + requireString(errors, filePath, 'candidate_variant_id', artifact.candidate_variant_id) + if (typeof artifact.repeat_count !== 'number') { + errors.push(`${filePath}.repeat_count must be a number`) + } + requireStringArray(errors, filePath, 'success_criteria', artifact.success_criteria) + requireStringArray(errors, filePath, 'failure_criteria', artifact.failure_criteria) + requireBoolean(errors, filePath, 'manual_review_required', artifact.manual_review_required) + return errors +} + +async function validateBetaRun(filePath: string, artifact: JsonRecord): Promise { + const errors: string[] = [] + requireString(errors, filePath, 'taxonomy_version', artifact.taxonomy_version) + requireString(errors, filePath, 'feedback_run_id', artifact.feedback_run_id) + requireString(errors, filePath, 'generated_at', artifact.generated_at) + requireString(errors, filePath, 'source_experiment_id', artifact.source_experiment_id) + requireString( + errors, + filePath, + 'source_experiment_run_ref', + artifact.source_experiment_run_ref, + ) + requireStringArray(errors, filePath, 'source_report_refs', artifact.source_report_refs) + requireStringArray(errors, filePath, 'finding_refs', artifact.finding_refs) + requireStringArray(errors, filePath, 'hypothesis_refs', artifact.hypothesis_refs) + requireStringArray(errors, filePath, 'proposal_refs', artifact.proposal_refs) + requireStringArray( + errors, + filePath, + 'candidate_proposal_refs', + artifact.candidate_proposal_refs, + ) + requireStringArray( + errors, + filePath, + 'next_experiment_plan_refs', + artifact.next_experiment_plan_refs, + ) + requireString(errors, filePath, 'report_ref', artifact.report_ref) + requireStringArray(errors, filePath, 'blocking_finding_refs', artifact.blocking_finding_refs) + requireStringArray( + errors, + 
filePath, + 'manual_judgement_required_finding_refs', + artifact.manual_judgement_required_finding_refs, + ) + requireStringArray( + errors, + filePath, + 'auto_resolvable_finding_refs', + artifact.auto_resolvable_finding_refs, + ) + if (artifact.human_approval_required !== true) { + errors.push(`${filePath}.human_approval_required must be true`) + } + if (artifact.status !== 'completed') { + errors.push(`${filePath}.status must be completed`) + } + + requireObject(errors, filePath, 'proposal_queue', artifact.proposal_queue) + requireObject(errors, filePath, 'approval_card', artifact.approval_card) + if (errors.length > 0) return errors + + const proposalQueue = artifact.proposal_queue as JsonRecord + if ( + proposalQueue.top_recommendation_proposal_ref !== null && + proposalQueue.top_recommendation_proposal_ref !== undefined + ) { + requireString( + errors, + `${filePath}.proposal_queue`, + 'top_recommendation_proposal_ref', + proposalQueue.top_recommendation_proposal_ref, + ) + } + requireStringArray( + errors, + `${filePath}.proposal_queue`, + 'recommended_now_proposal_refs', + proposalQueue.recommended_now_proposal_refs, + ) + requireStringArray( + errors, + `${filePath}.proposal_queue`, + 'recommended_later_proposal_refs', + proposalQueue.recommended_later_proposal_refs, + ) + requireStringArray( + errors, + `${filePath}.proposal_queue`, + 'deferred_proposal_refs', + proposalQueue.deferred_proposal_refs, + ) + requireStringArray( + errors, + `${filePath}.proposal_queue`, + 'blocked_proposal_refs', + proposalQueue.blocked_proposal_refs, + ) + + const approvalCard = artifact.approval_card as JsonRecord + if ( + approvalCard.current_top_recommendation_proposal_ref !== null && + approvalCard.current_top_recommendation_proposal_ref !== undefined + ) { + requireString( + errors, + `${filePath}.approval_card`, + 'current_top_recommendation_proposal_ref', + approvalCard.current_top_recommendation_proposal_ref, + ) + } + requireString(errors, 
`${filePath}.approval_card`, 'why_now', approvalCard.why_now) + requireStringArray( + errors, + `${filePath}.approval_card`, + 'why_not_others_yet', + approvalCard.why_not_others_yet, + ) + requireString( + errors, + `${filePath}.approval_card`, + 'approval_scope', + approvalCard.approval_scope, + ) + requireStringArray( + errors, + `${filePath}.approval_card`, + 'do_not_touch', + approvalCard.do_not_touch, + ) + if ( + approvalCard.next_experiment_plan_ref !== null && + approvalCard.next_experiment_plan_ref !== undefined + ) { + requireString( + errors, + `${filePath}.approval_card`, + 'next_experiment_plan_ref', + approvalCard.next_experiment_plan_ref, + ) + } + requireStringArray( + errors, + `${filePath}.approval_card`, + 'success_criteria', + approvalCard.success_criteria, + ) + requireStringArray(errors, `${filePath}.approval_card`, 'risks', approvalCard.risks) + requireString( + errors, + `${filePath}.approval_card`, + 'manual_review_boundary', + approvalCard.manual_review_boundary, + ) + + const proposalRefs = artifact.proposal_refs as string[] + const findingRefs = artifact.finding_refs as string[] + const hypothesisRefs = artifact.hypothesis_refs as string[] + const candidateProposalRefs = artifact.candidate_proposal_refs as string[] + const nextPlanRefs = artifact.next_experiment_plan_refs as string[] + + if (proposalRefs.length > 0 && proposalQueue.top_recommendation_proposal_ref == null) { + errors.push(`${filePath}.proposal_queue.top_recommendation_proposal_ref must exist when proposals exist`) + } + if ( + typeof proposalQueue.top_recommendation_proposal_ref === 'string' && + !proposalRefs.includes(proposalQueue.top_recommendation_proposal_ref) + ) { + errors.push(`${filePath}.proposal_queue.top_recommendation_proposal_ref must reference proposal_refs`) + } + if ( + typeof approvalCard.current_top_recommendation_proposal_ref === 'string' && + approvalCard.current_top_recommendation_proposal_ref !== proposalQueue.top_recommendation_proposal_ref + ) { 
+ errors.push(`${filePath}.approval_card.current_top_recommendation_proposal_ref must match proposal_queue.top_recommendation_proposal_ref`) + } + if ( + typeof approvalCard.next_experiment_plan_ref === 'string' && + !nextPlanRefs.includes(approvalCard.next_experiment_plan_ref) + ) { + errors.push(`${filePath}.approval_card.next_experiment_plan_ref must reference next_experiment_plan_refs`) + } + + for (const ref of [ + ...proposalQueue.recommended_now_proposal_refs as string[], + ...proposalQueue.recommended_later_proposal_refs as string[], + ...proposalQueue.deferred_proposal_refs as string[], + ...proposalQueue.blocked_proposal_refs as string[], + ]) { + if (!proposalRefs.includes(ref)) { + errors.push(`${filePath}.proposal_queue contains unknown proposal ref: ${ref}`) + } + } + + for (const ref of [ + ...(artifact.blocking_finding_refs as string[]), + ...(artifact.manual_judgement_required_finding_refs as string[]), + ...(artifact.auto_resolvable_finding_refs as string[]), + ]) { + if (!findingRefs.includes(ref)) { + errors.push(`${filePath} feedback finding bucket contains unknown finding ref: ${ref}`) + } + } + + if (!(await fileExists(String(artifact.report_ref)))) { + errors.push(`${filePath}.report_ref does not exist: ${artifact.report_ref}`) + } + + const proposalArtifacts = new Map() + for (const ref of proposalRefs) { + if (!(await fileExists(ref))) { + errors.push(`${filePath} missing referenced proposal file: ${ref}`) + continue + } + const proposal = await readJson(path.join(repoRoot, ref)) + proposalArtifacts.set(ref, proposal) + errors.push(...validateProposal(ref, proposal, true)) + } + + const topBucketCount = [...proposalArtifacts.values()].filter( + proposal => proposal.queue_bucket === 'top_recommendation', + ).length + if (proposalArtifacts.size > 0 && topBucketCount !== 1) { + errors.push(`${filePath} must have exactly one proposal with queue_bucket=top_recommendation`) + } + + for (const ref of findingRefs) { + if (!(await fileExists(ref))) 
{ + errors.push(`${filePath} missing referenced finding file: ${ref}`) + continue + } + errors.push(...validateFinding(ref, await readJson(path.join(repoRoot, ref)), true)) + } + + for (const ref of hypothesisRefs) { + if (!(await fileExists(ref))) { + errors.push(`${filePath} missing referenced hypothesis file: ${ref}`) + continue + } + errors.push(...validateHypothesis(ref, await readJson(path.join(repoRoot, ref)), true)) + } + + for (const ref of candidateProposalRefs) { + if (!(await fileExists(ref))) { + errors.push(`${filePath} missing referenced candidate proposal file: ${ref}`) + continue + } + errors.push( + ...validateCandidateProposal(ref, await readJson(path.join(repoRoot, ref))), + ) + } + + for (const ref of nextPlanRefs) { + if (!(await fileExists(ref))) { + errors.push(`${filePath} missing referenced next experiment plan file: ${ref}`) + continue + } + errors.push(...validateExperimentPlan(ref, await readJson(path.join(repoRoot, ref)))) + } + + return errors +} + +const entries = await readdir(feedbackRunsRoot, { withFileTypes: true }).catch(() => []) +const runFiles = entries + .filter(entry => entry.isFile() && entry.name.endsWith('.json')) + .map(entry => path.join(feedbackRunsRoot, entry.name)) + +const errors: string[] = [] +for (const filePath of runFiles) { + const artifact = await readJson(filePath) + if (artifact.taxonomy_version === 'v2_5_beta') { + errors.push(...(await validateBetaRun(filePath, artifact))) + } else { + errors.push(...validateLegacyRun(filePath, artifact)) + } +} + +if (errors.length > 0) { + console.error('V2 feedback artifact schema validation failed:') + for (const error of errors) console.error(`- ${error}`) + process.exit(1) +} + +console.log(`V2 feedback artifact schema validation passed: ${runFiles.length} file(s).`) diff --git a/scripts/evals/v2_validate_manifests.ts b/scripts/evals/v2_validate_manifests.ts index 9b58c1d89b..465799a169 100644 --- a/scripts/evals/v2_validate_manifests.ts +++ 
b/scripts/evals/v2_validate_manifests.ts @@ -1,9 +1,11 @@ +import { existsSync } from 'node:fs' import { readFile, readdir } from 'node:fs/promises' import path from 'node:path' import type { EvalChangeLayer, EvalScenario, + EvalScenarioExpectation, EvalVariant, } from '../../src/observability/v2/evalTypes' import type { @@ -32,6 +34,7 @@ const scoreDimensions = new Set([ 'efficiency', 'stability', 'controllability', + 'context', ]) const scoreDirections = new Set([ 'higher_is_better', @@ -57,11 +60,18 @@ async function readJson(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } -async function listJsonFiles(dir: string): Promise { +async function listJsonFiles(dir: string, recursive = false): Promise { const entries = await readdir(dir, { withFileTypes: true }) - return entries + const files = entries .filter(entry => entry.isFile() && entry.name.endsWith('.json')) .map(entry => path.join(dir, entry.name)) + if (!recursive) return files + const nested = await Promise.all( + entries + .filter(entry => entry.isDirectory()) + .map(entry => listJsonFiles(path.join(dir, entry.name), true)), + ) + return [...files, ...nested.flat()] } function requireString( @@ -108,6 +118,17 @@ function requireOptionalString( } } +function requireObject( + errors: string[], + objectName: string, + fieldName: string, + value: unknown, +) { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + errors.push(`${objectName}.${fieldName} must be an object`) + } +} + function isFlatActionBinding( binding: EvalExperimentActionBinding, ): binding is EvalExperimentFlatActionBinding { @@ -138,6 +159,72 @@ function normalizeGateRules(gate: EvalGatePolicy): EvalGatePolicyRule[] { ] } +function validateScenarioExpectations( + errors: string[], + objectName: string, + scenarioId: string, + expectations: EvalScenarioExpectation[], +) { + const expectationTypes = new Set( + expectations.map(expectation => expectation.expectation_type), + ) + for 
(const [index, expectation] of expectations.entries()) { + const itemName = `${objectName}.expectations[${index}]` + requireString(errors, itemName, 'expectation_id', expectation.expectation_id) + requireString(errors, itemName, 'expectation_type', expectation.expectation_type) + requireObject(errors, itemName, 'expectation_body', expectation.expectation_body) + if (!['low', 'medium', 'high'].includes(expectation.severity)) { + errors.push(`${itemName}.severity has invalid value: ${expectation.severity}`) + } + if ( + expectation.expectation_type === 'manual_review' && + !Array.isArray(expectation.expectation_body?.questions) + ) { + errors.push(`${itemName}.expectation_body.questions must be an array for manual_review`) + } + } + + const isLongContextExpectationSet = + expectationTypes.has('retained_constraint') || + expectationTypes.has('retrieved_fact') || + expectationTypes.has('forbidden_confusion') || + expectationTypes.has('context_budget') + + if (isLongContextExpectationSet) { + for (const requiredType of [ + 'retained_constraint', + 'retrieved_fact', + 'forbidden_confusion', + 'manual_review', + ]) { + if (!expectationTypes.has(requiredType)) { + errors.push( + `${objectName}.expectations must include ${requiredType} for long-context scenario ${scenarioId}`, + ) + } + } + } else { + const hasRule = expectationTypes.has('rule') + const hasStructure = expectationTypes.has('structure') + const hasManual = expectationTypes.has('manual_review') + if (!hasRule) { + errors.push( + `${objectName}.expectations must include at least one rule expectation for ${scenarioId}`, + ) + } + if (!hasStructure) { + errors.push( + `${objectName}.expectations must include at least one structure expectation for ${scenarioId}`, + ) + } + if (!hasManual) { + errors.push( + `${objectName}.expectations must include at least one manual_review expectation for ${scenarioId}`, + ) + } + } +} + function validateScenario(filePath: string, scenario: EvalScenario): string[] { const errors: 
string[] = [] requireString(errors, filePath, 'scenario_id', scenario.scenario_id) @@ -161,6 +248,133 @@ function validateScenario(filePath: string, scenario: EvalScenario): string[] { scenario.max_total_billed_tokens, ) requireOptionalNumber(errors, filePath, 'max_subagent_count', scenario.max_subagent_count) + if (scenario.expected_facts !== undefined) { + requireArray(errors, filePath, 'expected_facts', scenario.expected_facts) + } + if (scenario.forbidden_confusions !== undefined) { + requireArray( + errors, + filePath, + 'forbidden_confusions', + scenario.forbidden_confusions, + ) + } + if (scenario.manual_review_questions !== undefined) { + requireArray( + errors, + filePath, + 'manual_review_questions', + scenario.manual_review_questions, + ) + } + requireOptionalString(errors, filePath, 'context_profile_ref', scenario.context_profile_ref) + if (scenario.expectations !== undefined) { + requireArray(errors, filePath, 'expectations', scenario.expectations) + if (Array.isArray(scenario.expectations)) { + validateScenarioExpectations( + errors, + filePath, + scenario.scenario_id, + scenario.expectations, + ) + } + } + if (scenario.long_context_profile !== undefined) { + const profile = scenario.long_context_profile + requireObject(errors, filePath, 'long_context_profile', profile) + requireString(errors, `${filePath}.long_context_profile`, 'context_family', profile.context_family) + if ( + ![ + 'constraint_retention', + 'retrieval', + 'distractor_resistance', + 'compaction_pressure', + ].includes(profile.context_family) + ) { + errors.push( + `${filePath}.long_context_profile.context_family has invalid value: ${profile.context_family}`, + ) + } + requireString( + errors, + `${filePath}.long_context_profile`, + 'context_size_class', + profile.context_size_class, + ) + if (!['small', 'medium', 'large'].includes(profile.context_size_class)) { + errors.push( + `${filePath}.long_context_profile.context_size_class has invalid value: ${profile.context_size_class}`, + ) 
+ } + requireString(errors, `${filePath}.long_context_profile`, 'fixture_ref', profile.fixture_ref) + requireArray( + errors, + `${filePath}.long_context_profile`, + 'expected_retained_constraints', + profile.expected_retained_constraints, + ) + requireArray( + errors, + `${filePath}.long_context_profile`, + 'expected_retrieved_facts', + profile.expected_retrieved_facts, + ) + requireArray( + errors, + `${filePath}.long_context_profile`, + 'distractor_refs', + profile.distractor_refs, + ) + requireArray( + errors, + `${filePath}.long_context_profile`, + 'forbidden_confusions', + profile.forbidden_confusions, + ) + requireArray( + errors, + `${filePath}.long_context_profile`, + 'manual_review_questions', + profile.manual_review_questions, + ) + + const fixtureDir = path.resolve(repoRoot, profile.fixture_ref) + for (const requiredFile of [ + 'context_body.md', + 'critical_facts.json', + 'constraints.json', + 'distractors.json', + 'expected_output.md', + ]) { + if (!existsSync(path.join(fixtureDir, requiredFile))) { + errors.push( + `${filePath}.long_context_profile.fixture_ref is missing required fixture file: ${requiredFile}`, + ) + } + } + + if (!Array.isArray(scenario.expected_facts) || scenario.expected_facts.length === 0) { + errors.push(`${filePath}.expected_facts must exist for long-context scenarios`) + } + if ( + !Array.isArray(scenario.expected_constraints) || + scenario.expected_constraints.length === 0 + ) { + errors.push(`${filePath}.expected_constraints must exist for long-context scenarios`) + } + if ( + !Array.isArray(scenario.forbidden_confusions) || + scenario.forbidden_confusions.length === 0 + ) { + errors.push(`${filePath}.forbidden_confusions must exist for long-context scenarios`) + } + if ( + !Array.isArray(scenario.manual_review_questions) || + scenario.manual_review_questions.length === 0 + ) { + errors.push(`${filePath}.manual_review_questions must exist for long-context scenarios`) + } + } return errors } @@ -464,7 +678,7 @@ async function 
validateAll(): Promise { } const implementedScoreSpecIds = new Set(listImplementedScoreSpecIds()) - const scenarioFiles = await listJsonFiles(path.join(evalRoot, 'scenarios')) + const scenarioFiles = await listJsonFiles(path.join(evalRoot, 'scenarios'), true) const variantFiles = await listJsonFiles(path.join(evalRoot, 'variants')) const experimentFiles = await listJsonFiles(path.join(evalRoot, 'experiments')) const scoreSpecFiles = await listJsonFiles(path.join(evalRoot, 'score-specs')) diff --git a/scripts/evals/v2_verify_long_context.ts b/scripts/evals/v2_verify_long_context.ts new file mode 100644 index 0000000000..103fdaf7d3 --- /dev/null +++ b/scripts/evals/v2_verify_long_context.ts @@ -0,0 +1,106 @@ +import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises' +import path from 'node:path' + +type JsonRecord = Record + +const repoRoot = path.resolve(import.meta.dirname, '..', '..') +const experimentRunsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'experiment-runs') +const reportsRoot = path.join(repoRoot, 'tests', 'evals', 'v2', 'verification-reports') +const stamp = new Date().toISOString().replace(/[:.]/g, '') + +async function findLatestFixtureSmokeSummary(): Promise { + const entries = await readdir(experimentRunsRoot, { withFileTypes: true }) + const matches = entries + .filter( + entry => + entry.isFile() && + entry.name.startsWith('v2_4_long_context_fixture_smoke_') && + entry.name.endsWith('.json'), + ) + .map(entry => entry.name) + .sort() + const latest = matches.at(-1) + if (!latest) { + throw new Error( + 'No V2.4 fixture smoke summary found. 
Run bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json first.', + ) + } + return path.join(experimentRunsRoot, latest) +} + +async function main(): Promise { + const scenarioIds = [ + 'long_context_constraint_retention', + 'long_context_fact_retrieval', + 'long_context_distractor_resistance', + 'long_context_compaction_pressure', + ] + for (const scenarioId of scenarioIds) { + const scenarioPath = path.join( + repoRoot, + 'tests', + 'evals', + 'v2', + 'scenarios', + 'long-context', + `${scenarioId}.json`, + ) + await readFile(scenarioPath, 'utf8') + } + + const summaryPath = await findLatestFixtureSmokeSummary() + const summary = JSON.parse(await readFile(summaryPath, 'utf8')) as JsonRecord + const reportRefs = Array.isArray(summary.report_refs) + ? summary.report_refs.filter((value): value is string => typeof value === 'string') + : [] + const batchRef = + reportRefs.find(ref => path.basename(ref).startsWith('batch_experiment_')) ?? 
null + if (!batchRef) { + throw new Error('Latest V2.4 fixture smoke summary is missing a batch report ref.') + } + const batchMarkdown = await readFile(path.resolve(repoRoot, batchRef), 'utf8') + + if (!Array.isArray(summary.long_context_summary)) { + throw new Error('summary.long_context_summary must be present for V2.4 fixture smoke.') + } + if ((summary.long_context_summary as unknown[]).length < 4) { + throw new Error('summary.long_context_summary must contain at least four scenario rows.') + } + if (typeof summary.long_context_review_verdict !== 'string') { + throw new Error('summary.long_context_review_verdict must be present.') + } + if (!batchMarkdown.includes('## Long Context Summary')) { + throw new Error('Batch report is missing the Long Context Summary section.') + } + + await mkdir(reportsRoot, { recursive: true }) + const verificationPath = path.join( + reportsRoot, + `v2_4_long_context_${stamp}.json`, + ) + await writeFile( + verificationPath, + `${JSON.stringify( + { + verification_id: `v2_4_long_context_${stamp}`, + generated_at: new Date().toISOString(), + passed: true, + inspected_summary_ref: path.relative(repoRoot, summaryPath), + batch_report_ref: batchRef, + long_context_review_verdict: summary.long_context_review_verdict, + scenario_row_count: (summary.long_context_summary as unknown[]).length, + }, + null, + 2, + )}\n`, + ) + + console.log( + `V2.4 long-context verification passed: ${path.relative(repoRoot, verificationPath)}`, + ) +} + +main().catch(error => { + console.error(error instanceof Error ? 
error.message : error) + process.exit(1) +}) diff --git a/src/observability/v2/evalTypes.ts b/src/observability/v2/evalTypes.ts index c9a75537f3..9bbf8eb3e7 100644 --- a/src/observability/v2/evalTypes.ts +++ b/src/observability/v2/evalTypes.ts @@ -6,6 +6,10 @@ export type EvalChangeLayer = | 'mixed' export type EvalExpectationType = 'rule' | 'structure' | 'manual_review' + | 'retained_constraint' + | 'retrieved_fact' + | 'forbidden_confusion' + | 'context_budget' export type EvalRunStatus = | 'pending' @@ -27,6 +31,76 @@ export type EvalScoreDimension = | 'efficiency' | 'stability' | 'controllability' + | 'context' + +export type EvalFeedbackSeverity = 'info' | 'warning' | 'blocking' + +export type EvalFeedbackFactOrInference = 'fact' | 'inference' + +export type EvalFeedbackFindingKind = + | 'missing_score' + | 'manual_review_boundary' + | 'runtime_observation_gap' + | 'stability_gap' + | 'execution_failure' + +export type EvalFeedbackScope = + | 'experiment' + | 'scenario' + | 'variant' + | 'run_group' + | 'run' + +export type EvalFeedbackPriority = 'P0' | 'P1' | 'P2' + +export type EvalFeedbackQueueBucket = + | 'top_recommendation' + | 'recommended_now' + | 'recommended_later' + | 'deferred' + | 'blocked' + +export type EvalFeedbackProposalType = + | 'evaluator_improvement' + | 'score_binding_improvement' + | 'scenario_improvement' + | 'feedback_contract_improvement' + | 'harness_candidate_improvement' + +export type EvalFeedbackTargetLayer = + | 'evaluator' + | 'scorer' + | 'scenario' + | 'harness' + | 'report' + | 'feedback_system' + | 'mixed' + +export type EvalContextSizeClass = 'small' | 'medium' | 'large' + +export interface EvalLongContextProfile { + context_family: + | 'constraint_retention' + | 'retrieval' + | 'distractor_resistance' + | 'compaction_pressure' + context_size_class: EvalContextSizeClass + fixture_ref: string + expected_retained_constraints: string[] + expected_retrieved_facts: string[] + distractor_refs: string[] + forbidden_confusions: 
string[] + manual_review_questions: string[] +} + +export type EvalExpectationBody = Record + +export interface EvalScenarioExpectation { + expectation_id: string + expectation_type: EvalExpectationType + expectation_body: EvalExpectationBody + severity: 'low' | 'medium' | 'high' +} export interface EvalScenario { scenario_id: string @@ -43,6 +117,12 @@ export interface EvalScenario { max_turn_count?: number max_total_billed_tokens?: number max_subagent_count?: number + expected_facts?: string[] + forbidden_confusions?: string[] + manual_review_questions?: string[] + context_profile_ref?: string + long_context_profile?: EvalLongContextProfile + expectations?: EvalScenarioExpectation[] owner: string status: 'draft' | 'ready' | 'archived' } @@ -98,7 +178,7 @@ export interface EvalExpectation { expectation_id: string scenario_id: string expectation_type: EvalExpectationType - expectation_body: string + expectation_body: EvalExpectationBody severity: 'low' | 'medium' | 'high' } @@ -124,3 +204,114 @@ export interface EvalExperiment { evaluation_intent?: 'regression' | 'exploration' status: EvalExperimentStatus } + +export interface EvalFinding { + finding_id: string + source_experiment_id: string + source_report_ref: string + finding_type: string + finding_kind: EvalFeedbackFindingKind + severity: EvalFeedbackSeverity + scope: EvalFeedbackScope + scope_ref: string + summary: string + evidence_ref: string + is_blocking: boolean + requires_manual_judgement: boolean + auto_resolvable: boolean + fact_or_inference: 'fact' +} + +export interface EvalHypothesis { + hypothesis_id: string + based_on_finding_ids: string[] + depends_on_finding_refs: string[] + hypothesis: string + confidence: 'low' | 'medium' | 'high' + falsifiable_by: string[] + supporting_evidence_refs: string[] + risks: string[] + fact_or_inference: 'inference' +} + +export interface EvalImprovementProposal { + proposal_id: string + based_on_hypothesis_ids: string[] + based_on_finding_ids: string[] + 
proposal_type: EvalFeedbackProposalType + target_layer: EvalFeedbackTargetLayer + priority: EvalFeedbackPriority + queue_bucket: EvalFeedbackQueueBucket + description: string + expected_effect: string + why_now: string + why_not_now: string | null + blocking_finding_ids: string[] + manual_judgement_finding_ids: string[] + risks: string[] + requires_human_approval: true +} + +export interface EvalCandidateVariantProposal { + candidate_proposal_id: string + based_on_proposal_id: string + change_layer: EvalFeedbackTargetLayer + variant_name: string + implementation_scope: string + do_not_touch: string[] + suggested_manifest_patch: Record +} + +export interface EvalNextExperimentPlan { + next_experiment_plan_id: string + based_on_proposal_id: string + scenario_ids: string[] + baseline_variant_id: string + candidate_variant_id: string + repeat_count: number + success_criteria: string[] + failure_criteria: string[] + manual_review_required: boolean +} + +export interface EvalFeedbackProposalQueue { + top_recommendation_proposal_ref: string | null + recommended_now_proposal_refs: string[] + recommended_later_proposal_refs: string[] + deferred_proposal_refs: string[] + blocked_proposal_refs: string[] +} + +export interface EvalFeedbackApprovalCard { + current_top_recommendation_proposal_ref: string | null + why_now: string + why_not_others_yet: string[] + approval_scope: string + do_not_touch: string[] + next_experiment_plan_ref: string | null + success_criteria: string[] + risks: string[] + manual_review_boundary: string +} + +export interface EvalFeedbackRun { + feedback_run_id: string + taxonomy_version: string + generated_at: string + source_experiment_id: string + source_experiment_run_ref: string + source_report_refs: string[] + finding_refs: string[] + hypothesis_refs: string[] + proposal_refs: string[] + candidate_proposal_refs: string[] + next_experiment_plan_refs: string[] + proposal_queue: EvalFeedbackProposalQueue + blocking_finding_refs: string[] + 
manual_judgement_required_finding_refs: string[] + auto_resolvable_finding_refs: string[] + approval_card: EvalFeedbackApprovalCard + report_ref: string + human_approval_required: true + status: 'completed' +} diff --git a/tests/evals/v2/README.md b/tests/evals/v2/README.md index a94441f817..8293c8e9bb 100644 --- a/tests/evals/v2/README.md +++ b/tests/evals/v2/README.md @@ -7,7 +7,15 @@ This directory stores the local-first V2 evaluation system. If you want the project-level explanation first, start here: ```text -ObservrityTask/10-系统版本/v2/01-总览/V2.2.5版本项目介绍与阅读指南.md +ObservrityTask/10-系统版本/v2/01-总览/V2.5版本项目介绍与阅读指南.md +``` + +## Current Web Sync Note + +If you need the latest handoff note for the web GPT workflow, use: + +```text +ObservrityTask/10-系统版本/v2/01-总览/V2.3-V2.5当前状态同步稿(网页端).md ``` Use this README after that when you want the concrete execution entrypoints and folder-level technical view. @@ -15,14 +23,16 @@ Use this README after that when you want the concrete execution entrypoints and ## Structure - `scenarios/`: scenario manifests. +- `fixtures/`: reusable evaluation context packets and expected data. - `variants/`: baseline and candidate variant manifests. - `experiments/`: experiment manifests. - `score-specs/`: score definitions and evidence requirements. +- `feedback/`: generated feedback-loop artifacts such as findings, hypotheses, proposals, and next experiment plans. - `gates/`: regression-risk gate policies. - `runs/`: generated run records bound to V1 evidence. - `scores/`: generated score artifacts. +- `run-groups/`: repeat aggregation artifacts. - `experiment-runs/`: experiment-level JSON summaries. -- `run-groups/`: V2.3 repeat aggregation artifacts. - `verification-reports/`: runner verification reports. ## Modes @@ -30,7 +40,12 @@ Use this README after that when you want the concrete execution entrypoints and - `bind_existing`: V2.1 stable mode. You provide existing V1 `user_action_id` values through `action_bindings`. 
- `execute_harness`: V2.2+ mode. The runner executes scenarios through the headless harness, injects eval context into V1 events, captures generated `user_action_id` values by `benchmark_run_id`, then reuses the same score/report/risk-verdict pipeline. -V2.3 adds batch robustness support on top of V2.2.5: multi-scenario, multi-candidate, `repeat_count > 1`, run groups, stability summaries, and flaky status. +Version layering: + +- `V2.2.5`: real-experiment closure +- `V2.3`: batch / repeat / run_group / stability summary / flaky status +- `V2.4`: long-context scenario families, `context.*` score-specs, `long_context` run evidence, and `long_context_summary` +- `V2.5`: feedback loop beta, turning experiment reports into structured findings, hypotheses, proposals, proposal queues, and approval-ready next-step plans ## Basic Commands @@ -58,28 +73,29 @@ Run the V2.2-alpha execute_harness verification suite: bun run scripts/evals/v2_verify_execute_harness_alpha.ts ``` -Run the current V2.1 sample: +Run the V2.4 long-context verifier: ```powershell -bun run scripts/evals/v2_run_experiment.ts --experiment session_memory_sparse_vs_default +bun run scripts/evals/v2_verify_long_context.ts ``` -Run the V2.2 smoke manifest with automatic execution enabled: +Run the V2.5 feedback loop beta on an experiment-run summary: ```powershell -bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json ``` -Disable automatic execution and fall back to `bind_existing`: +Validate generated V2.5 feedback artifact schema: ```powershell -bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json --disable-execute-harness +bun run scripts/evals/v2_validate_feedback_artifacts.ts ``` -Equivalent environment switch: +## Main 
Experiment Entry Points + +Run the V2.2 execute_harness smoke: ```powershell -$env:V2_2_EXECUTE_HARNESS='0' bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json ``` @@ -89,12 +105,6 @@ Run the V2.2-beta real runtime-difference experiment: bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/session_memory_runtime_sparse_vs_default.json ``` -Run the V2.2.5 manual fallback helper for one real trace: - -```powershell -& 'scripts/evals/v2_manual_real_run.ps1' -ScenarioId 'session_memory_trigger_sensitive' -VariantId 'baseline_default' -ExperimentId 'session_memory_runtime_sparse_vs_default_manual' -MaxTurns 12 -``` - Run the V2.2.5 manual `bind_existing` fallback experiment: ```powershell @@ -107,11 +117,44 @@ Run the V2.3 no-cost robustness smoke: bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.robustness.smoke.json ``` -Interpretation: +Run the V2.4 no-cost long-context fixture smoke: -- `smoke`: validates automatic execution, automatic capture, and automatic artifact generation. -- `real_experiment`: asks whether the candidate changed runtime behavior in an observable and interpretable way. 
+```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json +``` + +Run the V2.4 small real-model long-context smoke: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.json +``` + +Run the V2.5 tightened real-smoke expectation-contract follow-up: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json +``` + +Disable automatic execution and fall back to `bind_existing`: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json --disable-execute-harness +``` + +Equivalent environment switch: + +```powershell +$env:V2_2_EXECUTE_HARNESS='0' +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.execute_harness.smoke.json +``` + +## Interpretation + +- `smoke`: validates execution, capture, and artifact generation health. +- `real_experiment`: asks whether a candidate produced an interpretable runtime difference in a real path. - `run_group`: groups repeats for one `scenario_id + variant_id` and reports success rate, token/duration variance, recovery rate, and flaky status. +- `long_context_summary`: aggregates long-context retention, retrieval, distractor resistance, compaction evidence, and manual-review hints by `scenario + candidate`. +- `feedback run`: converts a completed experiment summary into `findings -> hypotheses -> proposals -> proposal queue -> candidate draft -> next experiment plan`, while keeping human approval as a hard gate. ## bind_existing Binding Shape @@ -134,7 +177,7 @@ The runner still accepts the older nested binding shape for compatibility. 
New m ## execute_harness Binding Mechanism -The formal binding key is `benchmark_run_id`, not “latest user_action_id”. +The formal binding key is `benchmark_run_id`, not "latest user_action_id". Flow: @@ -158,8 +201,11 @@ tests/evals/v2/V2.1-bind_existing-usage.md tests/evals/v2/V2.2-execute_harness-alpha-usage.md tests/evals/v2/V2.2.5-real-experiment-closure.md tests/evals/v2/V2.3-batch-robustness-usage.md -tests/evals/v2/run-groups/ +tests/evals/v2/V2.4-long-context-usage.md +tests/evals/v2/V2.5-feedback-loop-usage.md tests/evals/v2/experiment-runs/README.md +ObservrityTask/10-系统版本/v2/01-总览/V2.4版本项目介绍与阅读指南.md +ObservrityTask/10-系统版本/v2/01-总览/V2.5版本项目介绍与阅读指南.md ``` ## Low-Level Debug Commands @@ -182,8 +228,11 @@ List recorded runs: bun run scripts/evals/v2_list_runs.ts --scenario tool_choice_sensitive ``` -## V2.3 Project Overview +## Project Overviews ```text +ObservrityTask/10-系统版本/v2/01-总览/V2.2.5版本项目介绍与阅读指南.md ObservrityTask/10-系统版本/v2/01-总览/V2.3版本项目介绍与阅读指南.md +ObservrityTask/10-系统版本/v2/01-总览/V2.4版本项目介绍与阅读指南.md +ObservrityTask/10-系统版本/v2/01-总览/V2.5版本项目介绍与阅读指南.md ``` diff --git a/tests/evals/v2/V2.4-long-context-usage.md b/tests/evals/v2/V2.4-long-context-usage.md new file mode 100644 index 0000000000..ced57a8a88 --- /dev/null +++ b/tests/evals/v2/V2.4-long-context-usage.md @@ -0,0 +1,146 @@ +# V2.4 Long-Context Usage + +V2.4 extends the V2.3 batch runner with a dedicated long-context evaluation layer. + +## Scope + +V2.4 adds: + +- long-context scenario families +- fixture-backed long-context datasets +- `context.*` score-specs +- `long_context` evidence inside each run artifact +- `long_context_summary` and `long_context_review_verdict` inside experiment summaries +- a dedicated `Long Context Summary` section inside batch reports + +V2.4 does not add tool/skill-specialized scoring, remote scheduling, or a new V1 observability architecture. 
+ +## Scenario Families + +The current V2.4 fixture set covers four pressure types: + +- `long_context_constraint_retention` +- `long_context_fact_retrieval` +- `long_context_distractor_resistance` +- `long_context_compaction_pressure` + +Each family has a fixture directory under: + +```text +tests/evals/v2/fixtures/long-context/ +``` + +Each fixture directory contains: + +- `context_body.md` +- `critical_facts.json` +- `constraints.json` +- `distractors.json` +- `expected_output.md` + +## New Score Specs + +The current V2.4 score-spec bundle is: + +```text +tests/evals/v2/score-specs/long-context.score-specs.json +``` + +Key metrics: + +- `context.retained_constraint_count` +- `context.lost_constraint_count` +- `context.constraint_retention_rate` +- `context.retrieved_fact_hit_rate` +- `context.distractor_confusion_count` +- `context.total_prompt_input_tokens` +- `context.compaction_trigger_count` +- `context.compaction_saved_tokens` +- `context.success_under_context_pressure` +- `context.manual_review_required` + +## Smoke Verification + +Run the no-cost long-context fixture smoke: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json +``` + +This experiment uses `execute_harness` with the `fixture_trace` adapter, so it verifies the V2.4 runner and artifact pipeline without calling the model. 
+ +Then run the dedicated verifier: + +```powershell +bun run scripts/evals/v2_verify_long_context.ts +``` + +The verifier checks: + +- a latest V2.4 fixture smoke summary exists +- `long_context_summary` exists and contains the scenario rows +- `long_context_review_verdict` exists +- the batch report includes `## Long Context Summary` + +## Real Smoke + +Run the small real-model smoke: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.json +``` + +Purpose: + +- confirm the real `execute_harness` path still works for V2.4 +- confirm cost, compaction, and manual-review evidence remain interpretable + +This is not a large benchmark. It is a small real-path health check. + +## Reading Order + +1. Open the latest experiment summary JSON. +2. Check `experiment_validity`. +3. Check `long_context_review_verdict`. +4. Check `long_context_summary`. +5. Open the batch markdown report. +6. Inspect individual run JSON only when one family looks suspicious or requires manual review. + +## How To Read `long_context_summary` + +Each row is one `scenario_id + candidate_variant_id` aggregate across repeats. + +Important fields: + +- `context_family` +- `context_size_class` +- `retained_constraint_mean` +- `lost_constraint_mean` +- `constraint_retention_rate_mean` +- `retrieved_fact_hit_rate_mean` +- `distractor_confusion_mean` +- `compaction_trigger_mean` +- `compaction_saved_tokens_mean` +- `total_prompt_input_tokens_mean` +- `prompt_token_delta_mean` +- `success_under_context_pressure_rate` +- `manual_review_required` + +Interpretation rule of thumb: + +- high retention + high retrieval + low confusion is the desired shape +- lower prompt-token cost is only meaningful when retention/retrieval do not collapse +- `manual_review_required=true` is normal for long-context experiments + +## Current Boundary + +- Automatic long-context evidence is strongest in `fixture_trace` mode. 
+- Real smoke may still depend on human inspection even when the pipeline is healthy. +- V2.4 does not collapse long-context behavior into a single final verdict. + +## Related Docs + +- `tests/evals/v2/README.md` +- `tests/evals/v2/V2.3-batch-robustness-usage.md` +- `tests/evals/v2/experiment-runs/README.md` +- `ObservrityTask/10-系统版本/v2/01-总览/V2.4版本项目介绍与阅读指南.md` diff --git a/tests/evals/v2/V2.5-feedback-loop-usage.md b/tests/evals/v2/V2.5-feedback-loop-usage.md new file mode 100644 index 0000000000..c5c550d3d0 --- /dev/null +++ b/tests/evals/v2/V2.5-feedback-loop-usage.md @@ -0,0 +1,185 @@ +# V2.5 Feedback Loop Beta Usage + +## 理解清单 + +`V2.5 beta` 不自动改代码。 +它的职责是把已有 experiment report 转成: + +- `Finding` +- `Hypothesis` +- `Improvement Proposal` +- `Candidate Variant Proposal` +- `Next Experiment Plan` + +然后明确告诉你: + +- 哪些是事实 +- 哪些是推断 +- 哪些建议需要你拍板 + +## 预期效果 + +如果你运行: + +```powershell +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +``` + +你将得到: + +- `tests/evals/v2/feedback/findings/*.json` +- `tests/evals/v2/feedback/hypotheses/*.json` +- `tests/evals/v2/feedback/proposals/*.json` +- `tests/evals/v2/feedback/candidate-proposals/*.json` +- `tests/evals/v2/feedback/experiment-plans/*.json` +- `tests/evals/v2/feedback/runs/*.json` +- `ObservrityTask/10-系统版本/v2/07-反馈报告/*.md` + +## 设计思路 + +`V2.5 beta` 仍然不调用模型,也不自动实现建议。 +但它比 alpha 多了: + +- feedback taxonomy +- proposal queue +- human approval card +- feedback artifact validator + +当前 extractor 只处理这些明确规则化信号: + +1. `constraint_retention_rate_mean = null` +2. `retrieved_fact_hit_rate_mean = null` +3. `long_context_review_verdict = needs_manual_review` +4. `risk_verdict.status = inconclusive` +5. `missing_score_count > 0` +6. `manual_review_required = true` +7. `flaky_status != stable` +8. 
`run_failures` 非空 + +## 运行命令 + +先做基础校验: + +```powershell +bun run typecheck +bun run scripts/evals/v2_validate_manifests.ts +bun run scripts/evals/v2_validate_experiment_artifacts.ts +``` + +然后运行 feedback: + +```powershell +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json +``` + +再运行 feedback validator: + +```powershell +bun run scripts/evals/v2_validate_feedback_artifacts.ts +``` + +## 当前推荐输入 + +第一条建议直接使用: + +- `tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json` + +因为它最适合作为第一版反馈回路样例: + +- 有真实 runtime difference +- 仍保留 manual review +- 语义分数中有 `null` +- 能自然导出 “补轻量 output parser” 这类 evaluator 改进建议 + +## 输出怎么读 + +### 1. 先看 `findings` + +它们是事实: + +- 某个字段是否为 `null` +- 某个 verdict 是否为 `inconclusive` +- 某个 scenario 是否需要 manual review + +### 2. 再看 `hypotheses` + +它们是推断: + +- 为什么会出现这些 finding +- 当前最可能缺的是哪一层能力 + +### 3. 接着看 `proposal queue` + +它会明确区分: + +- `top_recommendation` +- `recommended_now` +- `recommended_later` +- `deferred` +- `blocked` + +### 4. 再看 `proposals` + +它们是改动建议: + +- 改 evaluator +- 改 scenario +- 暂不直接改 runtime harness + +### 5. 
最后看 `next experiment plans` + +它们告诉你: + +- 如果批准 proposal +- 下一轮应该跑什么 +- 成功标准是什么 + +## 当前边界 + +- `V2.5 beta` 不自动改代码 +- `V2.5 beta` 不自动生成真正的 variant 实现 +- `candidate variant proposal` 只是草案 +- `hypothesis` 永远不能当成事实 +- 任何 proposal 都必须人工拍板后才能进入实现 +## Expectation Contract v0 Follow-up + +After `candidate_long_context_output_parser_v0` is implemented, the next contract-tightening path is: + +```powershell +bun run scripts/evals/v2_run_experiment.ts --experiment tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json +``` + +Then feed the latest summary back into V2.5: + +```powershell +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_<timestamp>.json +``` + +This follow-up keeps runtime policy unchanged and only tightens: + +- answer-shape expectations +- expected fact anchoring +- manual-review question precision + +## Feedback Contract After Contract v0 + +After `expectation_contract_v0` is already the source experiment, the next feedback step is no longer another scenario-tightening recommendation. 
+ +Instead, rerun feedback against the latest expectation-contract summary: + +```powershell +bun run scripts/evals/v2_run_feedback.ts --experiment-run tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json +bun run scripts/evals/v2_validate_feedback_artifacts.ts +``` + +Expected outcome: + +- exactly one `top_recommendation` +- that recommendation should point to a feedback-system proposal, not another copy of `tighten_real_smoke_expectations_v0` +- the deferred bucket may still keep a lower-priority generic feedback-contract stabilization item + +Current validated example: + +```text +tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json +``` diff --git a/tests/evals/v2/experiment-runs/README.md b/tests/evals/v2/experiment-runs/README.md index dd15d546bd..f03cfbdb87 100644 --- a/tests/evals/v2/experiment-runs/README.md +++ b/tests/evals/v2/experiment-runs/README.md @@ -5,7 +5,8 @@ - This directory stores experiment-level JSON summaries. - V2.1 summaries are usually produced by `bind_existing`. - V2.2 summaries may be produced by `execute_harness`, or by `execute_harness` disabled and falling back to `bind_existing`. -- The top-level schema is stable enough for regression checks and documentation. +- V2.3 adds batch-oriented fields such as `run_group_refs`, `stability_summary`, and `flaky_scenarios`. +- V2.4 may additionally include `long_context_review_verdict` and `long_context_summary`. ## Required Top-Level Fields @@ -31,10 +32,32 @@ | `scorecard_summary` | array | Baseline vs candidate score changes. | | `exploration_signals` | string[] | Automatic review hints. | | `recommended_review_mode` | string | Suggested review mode. | -| `final_decision` | null or object | Human final decision; runner keeps it `null`. | | `errors` | string[] | Hard failures or blocking runner errors. 
| | `warnings` | string[] | Soft warnings, missing scores, or inconclusive signals. | +## V2.3 Batch Fields + +Batch-oriented artifacts may include: + +- `run_group_refs` +- `stability_summary` +- `flaky_scenarios` +- `run_failures` + +These fields describe repeat aggregation and robustness status. + +## V2.4 Long-Context Fields + +Long-context artifacts may include: + +- `long_context_review_verdict` +- `long_context_summary` + +Meaning: + +- `long_context_review_verdict`: overall review posture for the long-context experiment, such as `needs_manual_review` +- `long_context_summary`: aggregated retention, retrieval, distractor, compaction, and prompt-cost evidence by `scenario + candidate` + ## Risk Verdict Shape ```json @@ -73,9 +96,14 @@ Newer artifacts include: } ``` -For actual V2.2 automatic runs, `results[*].baseline_execution` and `results[*].candidates[*].candidate_execution` contain the adapter result, capture result, `benchmark_run_id`, and `eval_run_id`. +For actual V2.2+ automatic runs, `results[*].baseline_execution` and `results[*].candidates[*].candidate_execution` contain adapter result, capture result, `benchmark_run_id`, and `eval_run_id`. + +Newer beta and later artifacts may also include: + +- `results[*].candidates[*].experiment_validity` +- `results[*].candidates[*].variant_effect_summary` -Newer beta artifacts also include `results[*].candidates[*].experiment_validity` and `results[*].candidates[*].variant_effect_summary` so smoke and real experiments are not interpreted the same way. +These fields exist so that smoke and real experiments are not interpreted the same way. 
## Boundary diff --git a/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json b/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json new file mode 100644 index 0000000000..75bdb842a4 --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_3_robustness_smoke_2026-05-03T070927523Z.json @@ -0,0 +1,2786 @@ +{ + "experiment_id": "v2_3_robustness_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.robustness.smoke.json", + "generated_at": "2026-05-03T07:09:27.523Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "smoke", + "evaluation_intent": "regression", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.json", + 
"tests\\evals\\v2\\runs\\run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.json" + ], + "run_group_refs": [ + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.scores.json", + 
"tests\\evals\\v2\\scores\\run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.md", + 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" + ], + "risk_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "pass", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 0, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check remains healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "long_context_review_verdict": null, + "long_context_summary": [], + "variant_effect_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline 
session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", 
+ "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": 
"candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "stability_summary": [ + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.458Z", + "ended_at": "2026-05-03T07:09:27.494Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + 
"run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.478Z", + "ended_at": "2026-05-03T07:09:27.501Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.467Z", + "ended_at": "2026-05-03T07:09:27.497Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + 
"e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.495Z", + "ended_at": "2026-05-03T07:09:27.519Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7" + ], + "status": "completed", + "started_at": 
"2026-05-03T07:09:27.503Z", + "ended_at": "2026-05-03T07:09:27.528Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.498Z", + "ended_at": "2026-05-03T07:09:27.522Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + } + ], + "flaky_scenarios": [], + 
"recommended_review_mode": "regression_review", + "final_decision": null, + "errors": [], + "warnings": [], + "experiment": { + "experiment_id": "v2_3_robustness_smoke", + "name": "V2.3 Robustness Smoke", + "goal": "Verify V2.3 batch runner support for multi-scenario, multi-candidate, repeat_count > 1, run_group aggregation, stability summary, and flaky detection without model/API spend.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse", + "candidate_eval_fixture_shadow" + ], + "scenario_set_id": "v2_3_robustness_smoke", + "scenario_ids": [ + "execute_harness_smoke_minimal", + "robustness_smoke_minimal_alt" + ], + "repeat_count": 2, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "smoke", + "evaluation_intent": "regression", + "execution": { + "adapter": "fixture_trace", + "db_path": ".observability/v2-robustness-smoke.duckdb", + "timeout_ms": 30000, + "failure_policy": "continue_on_failure", + "env": { + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb" + } + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": true, + "multi_candidate": true, + "repeat_count": 2, + "failure_policy": "continue_on_failure" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.subagent_count_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "repeat_index": 1, + 
"baseline_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "baseline_run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "baseline_user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": 
"bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_1_147c3893038b" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "candidate_user_action_id": "1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": 
"tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_1_74d214d1e887" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + 
"scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "candidate_user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99", + 
"CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_1_20a3f4041e99" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "baseline_run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "baseline_user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_baseline_default_repeat_2_bd0d45035ee5" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "candidate_user_action_id": "862641d4-2152-41bd-9449-30291b6cd507", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "862641d4-2152-41bd-9449-30291b6cd507", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + 
"CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_session_me_repeat_2_e1b73d3e5af2" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + 
"candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "candidate_user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + 
"CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_execute_harn_8962867b", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "execute_harness_smoke_minimal", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c", + "eval_run_id": "eval_v2_3_robustness_smok_execute_harness_smok_candidate_eval_fixtu_repeat_2_89badae81e3c" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + 
"candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.md", + "gate_results": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "execute_harness_smoke_minimal", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "baseline_run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "baseline_user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + 
"CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_1_2f998148b932" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "candidate_user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + 
"stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_1_1e3611cdfc01" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + 
"session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success 
signal." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "candidate_user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + 
"CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_1_ada6201f9287" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + 
"candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "baseline_run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "baseline_user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "baseline_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f", + "baseline_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + 
"CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_baseline_default_repeat_2_752782a6e13f" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "candidate_user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + 
"stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_session_me_repeat_2_26ad9c80f7d1" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + 
"session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success 
signal." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 100, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": 
"efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 100, + "delta": -10, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + }, + { + "candidate_variant_id": "candidate_eval_fixture_shadow", + "candidate_run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "candidate_run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "candidate_user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "candidate_eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21", + "candidate_benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_3_robustn_d65b3df1", + 
"CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_robustness_s_6a7f68b4", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_ev_2bf59d78", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_3_robustness_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "robustness_smoke_minimal_alt", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_eval_fixture_shadow", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21", + "V2_FIXTURE_DB_PATH": ".observability/v2-robustness-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21", + "eval_run_id": "eval_v2_3_robustness_smok_robustness_smoke_min_candidate_eval_fixtu_repeat_2_52a1672d7b21" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "variant_effect_summary": { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + 
"candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check passed: execute_harness closed the automatic execution and capture loop.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": false, + "runtime_difference_observed": false, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.md", + "gate_results": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 110, + "candidate_value": 105, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 0, + "candidate_value": 0, + "regression_pct": 0, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "decision_quality.subagent_count_observed", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "efficiency.total_billed_tokens", + 
"direction": "lower_is_better", + "baseline_value": 110, + "candidate_value": 105, + "delta": -5, + "interpretation": "improved" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "robustness_smoke_minimal_alt", + "candidate_variant_id": "candidate_eval_fixture_shadow", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." + ], + "recommended_review_mode": "regression_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-03T07:09:27.523Z" +} diff --git a/tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json b/tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json new file mode 100644 index 0000000000..fc78b7894a --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.json @@ -0,0 +1,4690 @@ +{ + "experiment_id": "v2_4_long_context_fixture_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.long_context.fixture_smoke.json", + "generated_at": "2026-05-03T07:09:57.232Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "smoke", + "evaluation_intent": "exploration", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.json", + 
"tests\\evals\\v2\\runs\\run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.json" + ], + "run_group_refs": [ + 
"tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.scores.json", + 
"tests\\evals\\v2\\scores\\run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_vs_run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.md", + 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_vs_run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_vs_run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_vs_run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_vs_run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_vs_run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_vs_run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_vs_run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" + ], + "risk_verdict": { + "status": 
"inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 8, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 8, + "inconclusive_count": 0, + "candidate_count": 8, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Smoke check remains healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "long_context_review_verdict": "needs_manual_review", + "long_context_summary": [ + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "context_family": "compaction_pressure", + "context_size_class": "large", + "retained_constraint_mean": 3, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 3, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 2, + "compaction_saved_tokens_mean": 188, + "tool_result_budget_trigger_mean": 1, + "total_prompt_input_tokens_mean": 1230, + "prompt_token_delta_mean": -400, + "success_under_context_pressure_rate": 1, + 
"manual_review_required": true, + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Compaction/tool-result governance was active with mean compaction trigger count 2.000 and mean saved tokens 188.", + "Relative to baseline, candidate prompt-token delta mean is -400.000.", + "Manual review remains open for 2 question(s)." + ] + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "context_family": "constraint_retention", + "context_size_class": "medium", + "retained_constraint_mean": 3, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 2, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 0, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 0, + "total_prompt_input_tokens_mean": 1080, + "prompt_token_delta_mean": -190, + "success_under_context_pressure_rate": 1, + "manual_review_required": true, + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Relative to baseline, candidate prompt-token delta mean is -190.000.", + "Manual review remains open for 2 question(s)." 
+ ] + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "context_family": "distractor_resistance", + "context_size_class": "medium", + "retained_constraint_mean": 2, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 2, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 0, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 0, + "total_prompt_input_tokens_mean": 1110, + "prompt_token_delta_mean": -200, + "success_under_context_pressure_rate": 1, + "manual_review_required": true, + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Relative to baseline, candidate prompt-token delta mean is -200.000.", + "Manual review remains open for 2 question(s)." 
+ ] + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "context_family": "retrieval", + "context_size_class": "medium", + "retained_constraint_mean": 2, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 3, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 0, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 0, + "total_prompt_input_tokens_mean": 1130, + "prompt_token_delta_mean": -220, + "success_under_context_pressure_rate": 1, + "manual_review_required": true, + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Relative to baseline, candidate prompt-token delta mean is -220.000.", + "Manual review remains open for 2 question(s)." + ] + } + ], + "variant_effect_summary": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable 
runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect.", + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1270, + "candidate_value": 1080, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1280, + "candidate_value": 1090, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + 
"candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": 
"context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1270, + "candidate_value": 1080, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + 
"candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1280, + "candidate_value": 1090, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + 
"candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": 
"lower_is_better", + "baseline_value": 1350, + "candidate_value": 1130, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1360, + "candidate_value": 1140, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": 
"context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1350, + "candidate_value": 1130, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1360, + "candidate_value": 1140, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + 
"candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1310, + "candidate_value": 1110, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1320, + "candidate_value": 1120, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": 
"task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + 
"delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1310, + "candidate_value": 1110, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1320, + "candidate_value": 1120, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 42, + "candidate_value": 188, + "delta": 146, + "interpretation": "observed" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + 
"score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 1, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1630, + "candidate_value": 1230, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + 
"baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1640, + "candidate_value": 1240, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 42, + "candidate_value": 188, + "delta": 146, + "interpretation": "observed" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + 
"scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 1, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": 
"candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1630, + "candidate_value": 1230, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1640, + "candidate_value": 1240, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "5 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "3 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "8 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "stability_summary": [ + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.210Z", + "ended_at": "2026-05-03T07:09:57.231Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1640, + "total_billed_tokens_min": 1640, + "total_billed_tokens_max": 1640, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.215Z", + "ended_at": "2026-05-03T07:09:57.235Z", + 
"aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1240, + "total_billed_tokens_min": 1240, + "total_billed_tokens_max": 1240, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_constraint_retention", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.127Z", + "ended_at": "2026-05-03T07:09:57.162Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1280, + "total_billed_tokens_min": 1280, + "total_billed_tokens_max": 1280, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": 
"group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_constraint_retention", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.137Z", + "ended_at": "2026-05-03T07:09:57.166Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1090, + "total_billed_tokens_min": 1090, + "total_billed_tokens_max": 1090, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.187Z", + "ended_at": "2026-05-03T07:09:57.209Z", + "aggregate_summary_ref": 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1320, + "total_billed_tokens_min": 1320, + "total_billed_tokens_max": 1320, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.192Z", + "ended_at": "2026-05-03T07:09:57.213Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1120, + "total_billed_tokens_min": 1120, + "total_billed_tokens_max": 1120, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": 
"group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.163Z", + "ended_at": "2026-05-03T07:09:57.184Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1360, + "total_billed_tokens_min": 1360, + "total_billed_tokens_max": 1360, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.168Z", + "ended_at": "2026-05-03T07:09:57.190Z", + "aggregate_summary_ref": 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1140, + "total_billed_tokens_min": 1140, + "total_billed_tokens_max": 1140, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] + } + ], + "flaky_scenarios": [], + "recommended_review_mode": "manual_review", + "final_decision": null, + "errors": [], + "warnings": [ + "missing: scenario=long_context_constraint_retention, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_constraint_retention, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_fact_retrieval, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_fact_retrieval, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_distractor_resistance, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_distractor_resistance, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_compaction_pressure, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed", + "missing: scenario=long_context_compaction_pressure, candidate=candidate_long_context_fixture_guarded, score=decision_quality.subagent_count_observed" + ], + "experiment": { + 
"experiment_id": "v2_4_long_context_fixture_smoke", + "name": "V2.4 Long Context Fixture Smoke", + "goal": "Verify the V2.4 long-context scenario, fixture, scorer, and batch-report pipeline without model/API spend.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_long_context_fixture_guarded" + ], + "scenario_set_id": "v2_4_long_context_fixture", + "scenario_ids": [ + "long_context_constraint_retention", + "long_context_fact_retrieval", + "long_context_distractor_resistance", + "long_context_compaction_pressure" + ], + "repeat_count": 2, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "smoke", + "evaluation_intent": "exploration", + "execution": { + "adapter": "fixture_trace", + "db_path": ".observability/v2-long-context-fixture-smoke.duckdb", + "timeout_ms": 30000, + "failure_policy": "continue_on_failure", + "env": { + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb" + } + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": true, + "multi_candidate": false, + "repeat_count": 2, + "failure_policy": "continue_on_failure" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + 
"stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "long_context_constraint_retention", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "baseline_user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_85a962f9", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_constraint_retention", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467", + 
"CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_1_bc032d6c0467" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "candidate_user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_85a962f9", + 
"CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_constraint_retention", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_1_82d1381e066b" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + 
"runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_vs_run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.md", + "gate_results": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." 
+ }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1280, + "candidate_value": 1090, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1280, + "candidate_value": 1090, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": 
"candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + 
"direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1270, + "candidate_value": 1080, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1280, + "candidate_value": 1090, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "5 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_constraint_retention", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "baseline_user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_85a962f9", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_constraint_retention", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_baseline_default_repeat_2_8caa5a179406" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "candidate_user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_85a962f9", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_constraint_retention", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_constra_candidate_long_conte_repeat_2_55b173582983" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven 
harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_vs_run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.md", + "gate_results": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1280, + "candidate_value": 1090, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1280, + "candidate_value": 1090, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + 
"candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1270, + "candidate_value": 1080, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1280, + "candidate_value": 1090, + "delta": -190, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_constraint_retention", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "5 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_fact_retrieval", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "baseline_user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8a2eb6d7", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": 
"tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_1_187e9ae80090" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "candidate_user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8a2eb6d7", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_1_dabe230089e3" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_vs_run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.md", + "gate_results": [ + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1360, + "candidate_value": 1140, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1360, + "candidate_value": 1140, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1350, + "candidate_value": 1130, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": 
"candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1360, + "candidate_value": 1140, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "3 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_fact_retrieval", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "baseline_user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8a2eb6d7", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": 
"tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_baseline_default_repeat_2_6b878480f45a" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "candidate_user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8a2eb6d7", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_fact_re_candidate_long_conte_repeat_2_2b8daafe6d19" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven harness effect." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_vs_run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.md", + "gate_results": [ + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1360, + "candidate_value": 1140, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1360, + "candidate_value": 1140, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + 
"interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1350, + "candidate_value": 1130, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": 
"candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1360, + "candidate_value": 1140, + "delta": -220, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "3 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_distractor_resistance", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "baseline_user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8959f636", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_distractor_resistance", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_1_b6886edc58b4" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "candidate_user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8959f636", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_distractor_resistance", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_1_1a519894191b" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a 
proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_vs_run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.md", + "gate_results": [ + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1320, + "candidate_value": 1120, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1320, + "candidate_value": 1120, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 1, + 
"candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1310, + "candidate_value": 1110, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1320, + "candidate_value": 1120, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "3 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_distractor_resistance", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "baseline_user_action_id": "0f2affa1-25c4-4457-b906-482968d8dfa8", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "0f2affa1-25c4-4457-b906-482968d8dfa8", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8959f636", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_distractor_resistance", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_baseline_default_repeat_2_fc7060f76c1e" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "candidate_user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_8959f636", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_distractor_resistance", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_distrac_candidate_long_conte_repeat_2_e109ef1cd826" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a 
proven harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_vs_run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.md", + "gate_results": [ + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1320, + "candidate_value": 1120, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1320, + "candidate_value": 1120, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 1, + 
"candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1310, + "candidate_value": 1110, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1320, + "candidate_value": 1120, + "delta": -200, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_distractor_resistance", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "3 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_compaction_pressure", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "baseline_user_action_id": "c9cab754-06b4-4256-b62f-f547aa4a8349", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "c9cab754-06b4-4256-b62f-f547aa4a8349", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_1d22a803", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_compaction_pressure", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_1_7fa28b338c8c" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "candidate_user_action_id": "6488e757-f4e2-42fc-9cfc-b99ade383d28", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "6488e757-f4e2-42fc-9cfc-b99ade383d28", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_1d22a803", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_compaction_pressure", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_1_d5f015a79947" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven 
harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_vs_run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.md", + "gate_results": [ + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1640, + "candidate_value": 1240, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1640, + "candidate_value": 1240, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 42, + "candidate_value": 188, + "delta": 146, + "interpretation": "observed" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + 
"candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 1, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1630, + "candidate_value": 1230, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1640, + "candidate_value": 1240, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "8 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + }, + { + "scenario_id": "long_context_compaction_pressure", + "repeat_index": 2, + "baseline_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "baseline_run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "baseline_user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "baseline_eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1", + "baseline_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_1d22a803", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_compaction_pressure", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_baseline_default_repeat_2_5621bb85ccb1" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "candidate_run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "candidate_run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "candidate_user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "candidate_eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8", + "candidate_benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": "fixture_trace://synthetic", + "stderrRef": "fixture_trace://synthetic" + }, + "capture": { + "status": "captured", + "user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_ce1f23b4", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_1d22a803", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_lo_79ee9d20", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_fixture_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_compaction_pressure", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_long_context_fixture_guarded", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8", + "CLAUDE_CODE_EVAL_RUN_ID": 
"eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8", + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb", + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "cliArgs": [], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": null, + "feature_gate_count": 0, + "env_override_count": 1, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8", + "eval_run_id": "eval_v2_4_long_context_fi_long_context_compact_candidate_long_conte_repeat_2_de4fddfcfec8" + }, + "baseline_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "candidate_variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "variant_effect_summary": { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "baseline_variant_effect_observed": false, + "candidate_variant_effect_observed": false, + "runtime_difference_observed": false, + "baseline_policy_mode": "unknown", + "candidate_policy_mode": "unknown", + "summary": [ + "Baseline session_memory policy was not observed in V1 events.", + "Candidate session_memory policy was not observed in V1 events.", + "At least one score dimension changed between baseline and candidate.", + "No stable runtime difference was observed yet; any score delta may still be execution noise rather than a proven 
harness effect." + ] + }, + "experiment_validity": { + "status": "valid", + "profile": "smoke", + "reason": "Long-context fixture smoke passed: the trace-backed scoring and reporting loop is healthy.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_vs_run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.md", + "gate_results": [ + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1640, + "candidate_value": 1240, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 1640, + "candidate_value": 1240, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 42, + "candidate_value": 188, + "delta": 146, + "interpretation": "observed" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + 
"candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 1, + "candidate_value": 0, + "delta": -1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 3, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 0.666667, + "candidate_value": 1, + "delta": 0.333333, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 1, + "delta": 1, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 1630, + "candidate_value": 1230, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": 
"long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 1640, + "candidate_value": 1240, + "delta": -400, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_compaction_pressure", + "candidate_variant_id": "candidate_long_context_fixture_guarded", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "8 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-03T07:09:57.232Z" +} diff --git a/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json b/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json new file mode 100644 index 0000000000..2d80d2b21e --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json @@ -0,0 +1,836 @@ +{ + "experiment_id": "v2_4_long_context_real_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.long_context.real_smoke.json", + "generated_at": "2026-05-03T06:06:17.174Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.json" + ], + "run_group_refs": [ + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.scores.json" + ], + "report_refs": [ + 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" + ], + "risk_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment remains interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "long_context_review_verdict": "needs_manual_review", + "long_context_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "context_family": "retrieval", + "context_size_class": "medium", + "retained_constraint_mean": 0, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": null, + "retrieved_fact_mean": 0, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": null, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 4, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 2, + "total_prompt_input_tokens_mean": 26887, + "prompt_token_delta_mean": 0, + "success_under_context_pressure_rate": null, + "manual_review_required": true, + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "interpretation": [ + "Automatic fact-retrieval quality could not be fully established from trace-backed evidence alone.", + "No distractor confusion was observed in the current evidence window.", + "Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0.", + "Relative to baseline, candidate prompt-token delta mean is 0.000.", + "Manual review remains open for 2 question(s)." 
+ ] + } + ], + "variant_effect_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." + ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": null, + 
"candidate_value": null, + "delta": null, + "interpretation": "missing" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": null, + "candidate_value": null, + "delta": null, + "interpretation": "missing" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 26887, + "candidate_value": 26887, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27189, + "candidate_value": 27189, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "A real runtime difference was observed between baseline and candidate; inspect policy evidence 
before reading score deltas." + ], + "stability_summary": [ + { + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da" + ], + "status": "completed", + "started_at": "2026-05-03T06:05:48.876Z", + "ended_at": "2026-05-03T06:05:56.858Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7982, + "e2e_duration_min": 7982, + "e2e_duration_max": 7982, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8" + ], + "status": "completed", + "started_at": "2026-05-03T06:06:05.082Z", + "ended_at": "2026-05-03T06:06:12.588Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "stability_metrics": { + 
"repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7506, + "e2e_duration_min": 7506, + "e2e_duration_max": 7506, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + } + ], + "flaky_scenarios": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "flaky_status": "inconclusive" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "flaky_status": "inconclusive" + } + ], + "recommended_review_mode": "manual_review", + "final_decision": null, + "errors": [], + "warnings": [ + "missing: scenario=long_context_fact_retrieval_real_smoke, candidate=candidate_session_memory_sparse, score=decision_quality.subagent_count_observed" + ], + "experiment": { + "experiment_id": "v2_4_long_context_real_smoke", + "name": "V2.4 Long Context Real Smoke", + "goal": "Run one small real-model long-context scenario to confirm that execute_harness can produce interpretable cost, compaction, and manual-review evidence.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_4_long_context_real", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + 
"context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "db_path": ".observability/v2-long-context-real-smoke.duckdb", + "timeout_ms": 120000, + "max_turns": 6, + "failure_policy": "fail_fast", + "allow_fallback_to_bind_existing": true + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": false, + "multi_candidate": false, + "repeat_count": 1, + "failure_policy": "fail_fast" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z", + "baseline_run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "baseline_user_action_id": 
"b963e6da-2283-4ec2-888e-beb0f835d4ba", + "baseline_eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "baseline_benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\797aea5908e70f01\\stdout.txt", + "stderrRef": ".observability\\v2h\\797aea5908e70f01\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_fd8c0e6a", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_ac1e93f0", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_real_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1" + }, + "candidates": [ + { + 
"candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z", + "candidate_run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "candidate_user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "candidate_eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "candidate_benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\3c0784524f99789f\\stdout.txt", + "stderrRef": ".observability\\v2h\\3c0784524f99789f\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_fd8c0e6a", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_ac1e93f0", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_real_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T06:05:56.765Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T06:06:12.486Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." + }, + "variant_effect_summary": { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Long-context real smoke captured interpretable trace-backed context-governance evidence.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md", + "gate_results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27189, + "candidate_value": 27189, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27189, + "candidate_value": 27189, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": null, + "candidate_value": null, + "delta": null, + "interpretation": "missing" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 
0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": null, + "candidate_value": null, + "delta": null, + "interpretation": "missing" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 26887, + "candidate_value": 26887, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27189, + "candidate_value": 27189, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-03T06:06:17.174Z" +} diff --git a/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json b/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json new file mode 100644 index 0000000000..b2b6d79d6e --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json @@ -0,0 +1,837 @@ +{ + "experiment_id": "v2_4_long_context_real_smoke", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.long_context.real_smoke.json", + "generated_at": "2026-05-03T14:56:44.824Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.json" + ], + "run_group_refs": [ + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.scores.json", + "tests\\evals\\v2\\scores\\run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.scores.json" + ], + "report_refs": [ + 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_vs_run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" + ], + "risk_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment remains interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "long_context_review_verdict": "needs_manual_review", + "long_context_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "context_family": "retrieval", + "context_size_class": "medium", + "retained_constraint_mean": 2, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 3, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 4, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 2, + "total_prompt_input_tokens_mean": 26887, + "prompt_token_delta_mean": 0, + "success_under_context_pressure_rate": null, + "manual_review_required": true, + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0.", + "Relative to baseline, candidate prompt-token delta mean is 0.000.", + "Manual review remains open for 2 question(s)." 
+ ] + } + ], + "variant_effect_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." + ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + 
"candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": 
"long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 26887, + "candidate_value": 26887, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27189, + "candidate_value": 27189, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "A real runtime difference was observed between baseline and candidate; inspect policy evidence 
before reading score deltas." + ], + "stability_summary": [ + { + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b" + ], + "status": "completed", + "started_at": "2026-05-03T14:56:10.802Z", + "ended_at": "2026-05-03T14:56:17.911Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7109, + "e2e_duration_min": 7109, + "e2e_duration_max": 7109, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + }, + { + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348" + ], + "status": "completed", + "started_at": "2026-05-03T14:56:28.027Z", + "ended_at": "2026-05-03T14:56:40.199Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "stability_metrics": { + 
"repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 12172, + "e2e_duration_min": 12172, + "e2e_duration_max": 12172, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + } + ], + "flaky_scenarios": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "flaky_status": "inconclusive" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "flaky_status": "inconclusive" + } + ], + "recommended_review_mode": "manual_review", + "final_decision": null, + "errors": [], + "warnings": [ + "missing: scenario=long_context_fact_retrieval_real_smoke, candidate=candidate_session_memory_sparse, score=decision_quality.subagent_count_observed" + ], + "experiment": { + "experiment_id": "v2_4_long_context_real_smoke", + "name": "V2.4 Long Context Real Smoke", + "goal": "Run one small real-model long-context scenario to confirm that execute_harness can produce interpretable cost, compaction, and manual-review evidence.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_4_long_context_real", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + 
"context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "db_path": ".observability/v2-long-context-real-smoke.duckdb", + "timeout_ms": 120000, + "max_turns": 6, + "failure_policy": "fail_fast", + "allow_fallback_to_bind_existing": true + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": false, + "multi_candidate": false, + "repeat_count": 1, + "failure_policy": "fail_fast" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "repeat_index": 1, + "baseline_run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z", + "baseline_run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "baseline_user_action_id": 
"4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "baseline_eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "baseline_benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\983fb1f664390557\\stdout.txt", + "stderrRef": ".observability\\v2h\\983fb1f664390557\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_fd8c0e6a", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_ac1e93f0", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_real_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a" + }, + "candidates": [ + { + 
"candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z", + "candidate_run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "candidate_user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "candidate_eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "candidate_benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\688d717cf0f5c81a\\stdout.txt", + "stderrRef": ".observability\\v2h\\688d717cf0f5c81a\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_4_long_co_fd8c0e6a", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_ac1e93f0", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_4_long_context_real_smoke", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + 
"config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T14:56:17.800Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T14:56:40.106Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." + }, + "variant_effect_summary": { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Long-context real smoke captured interpretable trace-backed context-governance evidence.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_vs_run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md", + "gate_results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27189, + "candidate_value": 27189, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27189, + "candidate_value": 27189, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + 
"delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 26887, + "candidate_value": 26887, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + 
"candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27189, + "candidate_value": 27189, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-03T14:56:44.824Z" +} diff --git a/tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json b/tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json new file mode 100644 index 0000000000..bc35c04688 --- /dev/null +++ b/tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json @@ -0,0 +1,842 @@ +{ + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "manifest_ref": "tests\\evals\\v2\\experiments\\_experiment.long_context.real_smoke.expectation_contract_v0.json", + "generated_at": "2026-05-03T15:32:29.794Z", + "mode": "execute_harness", + "requested_mode": "execute_harness", + "automation_disabled": false, + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "run_refs": [ + "tests\\evals\\v2\\runs\\run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.json", + "tests\\evals\\v2\\runs\\run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.json" + ], + "run_group_refs": [ + "tests\\evals\\v2\\run-groups\\group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z.json", + "tests\\evals\\v2\\run-groups\\group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436.json" + ], + "score_refs": [ + "tests\\evals\\v2\\scores\\run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.scores.json", + 
"tests\\evals\\v2\\scores\\run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.scores.json" + ], + "report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" + ], + "risk_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." + }, + "gate_verdict": { + "status": "inconclusive", + "scope": "regression_risk_only", + "is_final_experiment_judgment": false, + "hard_fail_count": 0, + "soft_warning_count": 0, + "missing_score_count": 1, + "inconclusive_count": 0, + "candidate_count": 1, + "notes": "This verdict is only a regression-risk gate result. It is not a final judgment about model intelligence, harness value, or exploratory potential." 
+ }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Real experiment remains interpretable.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "long_context_review_verdict": "needs_manual_review", + "long_context_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "context_family": "retrieval", + "context_size_class": "medium", + "retained_constraint_mean": 2, + "lost_constraint_mean": 0, + "constraint_retention_rate_mean": 1, + "retrieved_fact_mean": 3, + "missed_fact_mean": 0, + "retrieved_fact_hit_rate_mean": 1, + "distractor_confusion_mean": 0, + "compaction_trigger_mean": 4, + "compaction_saved_tokens_mean": 0, + "tool_result_budget_trigger_mean": 2, + "total_prompt_input_tokens_mean": 27007, + "prompt_token_delta_mean": 0, + "success_under_context_pressure_rate": null, + "manual_review_required": true, + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "interpretation": [ + "Observed constraint retention remained at 100.0%.", + "Observed fact retrieval hit rate is 100.0%.", + "No distractor confusion was observed in the current evidence window.", + "Compaction/tool-result governance was active with mean compaction trigger count 4.000 and mean saved tokens 0.", + "Relative to baseline, candidate prompt-token delta mean is 0.000.", + "Manual review remains open for 2 question(s)." 
+ ] + } + ], + "variant_effect_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "At least one score dimension changed between baseline and candidate." + ] + } + ], + "runtime_difference_summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "At least one score dimension changed between baseline and candidate." 
+ ], + "verdict_boundary": "risk_verdict/gate_verdict is regression-risk-only and is not a final experiment judgment.", + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, 
+ "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 27007, + "candidate_value": 27007, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + 
"delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27436, + "candidate_value": 27372, + "delta": -64, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." 
+ ], + "stability_summary": [ + { + "run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z", + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e" + ], + "status": "completed", + "started_at": "2026-05-03T15:31:47.795Z", + "ended_at": "2026-05-03T15:32:03.341Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27436, + "total_billed_tokens_min": 27436, + "total_billed_tokens_max": 27436, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 15546, + "e2e_duration_min": 15546, + "e2e_duration_max": 15546, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + }, + { + "run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436", + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d" + ], + "status": "completed", + "started_at": "2026-05-03T15:32:12.356Z", + "ended_at": "2026-05-03T15:32:25.137Z", + 
"aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27372, + "total_billed_tokens_min": 27372, + "total_billed_tokens_max": 27372, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 12781, + "e2e_duration_min": 12781, + "e2e_duration_max": 12781, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] + } + ], + "flaky_scenarios": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "baseline_default", + "flaky_status": "inconclusive" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "candidate_session_memory_sparse", + "flaky_status": "inconclusive" + } + ], + "recommended_review_mode": "manual_review", + "final_decision": null, + "errors": [], + "warnings": [ + "missing: scenario=long_context_fact_retrieval_real_smoke_contract_v0, candidate=candidate_session_memory_sparse, score=decision_quality.subagent_count_observed" + ], + "experiment": { + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "name": "V2.5 Long Context Real Smoke Expectation Contract v0", + "goal": "Run the tightened real-smoke fact-retrieval contract to verify that clearer answer constraints and review prompts preserve runtime-difference evidence without adding brittle failures.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_5_long_context_expectation_contract", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + 
"efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "db_path": ".observability/v2-long-context-real-smoke.duckdb", + "timeout_ms": 120000, + "max_turns": 6, + "failure_policy": "fail_fast", + "allow_fallback_to_bind_existing": true + }, + "status": "ready" + }, + "runner": { + "requested_mode": "execute_harness", + "mode": "execute_harness", + "automation_disabled": false, + "fallback_reason": null, + "v2_3_batch_capabilities": { + "multi_scenario": false, + "multi_candidate": false, + "repeat_count": 1, + "failure_policy": "fail_fast" + }, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate" + }, + "results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "repeat_index": 1, + 
"baseline_run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z", + "baseline_run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "baseline_user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "baseline_eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "baseline_benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "baseline_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\7cb26e13840948de\\stdout.txt", + "stderrRef": ".observability\\v2h\\7cb26e13840948de\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_5_long_co_f2af0643", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_616fb55e", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_baseline_def_eb4a038e", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_5_long_context_real_smoke_expectation_contract_v0", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke_contract_v0", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "baseline_default", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": "bench_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_default.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": 
"tests/evals/v2/configs/session_memory_default.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379" + }, + "candidates": [ + { + "candidate_variant_id": "candidate_session_memory_sparse", + "candidate_run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436", + "candidate_run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "candidate_user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "candidate_eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "candidate_benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "candidate_execution": { + "execution": { + "status": "completed", + "stdoutRef": ".observability\\v2h\\e6d6e3586fa85bf4\\stdout.txt", + "stderrRef": ".observability\\v2h\\e6d6e3586fa85bf4\\stderr.txt" + }, + "capture": { + "status": "captured", + "user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "match_count": 1 + }, + "variant_apply": { + "env": { + "CLAUDE_CODE_EVAL_EXPERIMENT_ID": "exp_v2_5_long_co_f2af0643", + "CLAUDE_CODE_EVAL_SCENARIO_ID": "scn_long_context_616fb55e", + "CLAUDE_CODE_EVAL_VARIANT_ID": "var_candidate_se_efbc2e82", + "CLAUDE_CODE_EVAL_EXPERIMENT_LABEL": "v2_5_long_context_real_smoke_expectation_contract_v0", + "CLAUDE_CODE_EVAL_SCENARIO_LABEL": "long_context_fact_retrieval_real_smoke_contract_v0", + "CLAUDE_CODE_EVAL_VARIANT_LABEL": "candidate_session_memory_sparse", + "CLAUDE_CODE_EVAL_BENCHMARK_RUN_ID": 
"bench_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "CLAUDE_CODE_EVAL_RUN_ID": "eval_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "CLAUDE_CODE_EVAL_CONFIG_SNAPSHOT_REF": "tests/evals/v2/configs/session_memory_sparse.runtime.json" + }, + "cliArgs": [ + "--max-turns", + "6" + ], + "metadata": { + "supported_variant_fields": [ + "env_overrides", + "config_snapshot_ref", + "model_config", + "feature_gates" + ], + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "feature_gate_count": 0, + "env_override_count": 0, + "model_config": null + } + }, + "benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50" + }, + "baseline_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T15:32:03.273Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "candidate_variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T15:32:25.067Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." + }, + "variant_effect_summary": { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "baseline_variant_effect_observed": true, + "candidate_variant_effect_observed": true, + "runtime_difference_observed": true, + "baseline_policy_mode": "default", + "candidate_policy_mode": "sparse", + "summary": [ + "Baseline session_memory policy was observed with mode=default.", + "Candidate session_memory policy was observed with mode=sparse.", + "Candidate sparse-policy markers were observed in runtime evidence.", + "Observed baseline and candidate session_memory policies differ.", + "At least one score dimension changed between baseline and candidate." 
+ ] + }, + "experiment_validity": { + "status": "valid", + "profile": "real_experiment", + "reason": "Long-context real smoke captured interpretable trace-backed context-governance evidence.", + "blockers": [], + "warnings": [], + "checks": { + "baseline_captured": true, + "candidate_captured": true, + "no_ambiguous_capture": true, + "score_evidence_present": true, + "variant_effect_observed": true, + "runtime_difference_observed": true, + "scenario_intent_matched": true + } + }, + "compare_report": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md", + "gate_results": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "task_success.main_chain_observed", + "verdict": "pass", + "passed": true, + "baseline_value": 1, + "candidate_value": 1, + "regression_pct": 0, + "condition": "candidate < baseline", + "notes": "Candidate cannot lose the main-chain success signal." + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "hard_fail", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27436, + "candidate_value": 27372, + "regression_pct": 0, + "condition": "candidate_regression_pct > 30 and task_success_not_improved", + "notes": "Cost cannot rise sharply without a success improvement." 
+ }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "efficiency.total_billed_tokens", + "verdict": "pass", + "passed": true, + "baseline_value": 27436, + "candidate_value": 27372, + "regression_pct": 0, + "condition": "candidate_regression_pct > 10" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "rule_type": "soft_warning", + "score_spec_id": "decision_quality.subagent_count_observed", + "verdict": "missing", + "passed": false, + "baseline_value": null, + "candidate_value": null, + "regression_pct": null, + "condition": "candidate_regression_pct > 50" + } + ], + "scorecard_summary": [ + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_saved_tokens", + "direction": "observed_only", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.compaction_trigger_count", + "direction": "observed_only", + "baseline_value": 4, + "candidate_value": 4, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.constraint_retention_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.distractor_confusion_count", + 
"direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.lost_constraint_count", + "direction": "lower_is_better", + "baseline_value": 0, + "candidate_value": 0, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.manual_review_required", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retained_constraint_count", + "direction": "higher_is_better", + "baseline_value": 2, + "candidate_value": 2, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.retrieved_fact_hit_rate", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.success_under_context_pressure", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "context.total_prompt_input_tokens", + "direction": "lower_is_better", + "baseline_value": 27007, 
+ "candidate_value": 27007, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "controllability.turn_limit_basic", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "decision_quality.session_memory_policy_observed", + "direction": "observed_only", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "efficiency.total_billed_tokens", + "direction": "lower_is_better", + "baseline_value": 27436, + "candidate_value": 27372, + "delta": -64, + "interpretation": "improved" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "stability.recovery_absence", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + }, + { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "candidate_variant_id": "candidate_session_memory_sparse", + "score_spec_id": "task_success.main_chain_observed", + "direction": "higher_is_better", + "baseline_value": 1, + "candidate_value": 1, + "delta": 0, + "interpretation": "unchanged" + } + ], + "exploration_signals": [ + "1 score dimension(s) changed; inspect the scorecard before treating the risk verdict as the final answer.", + "A real runtime difference was observed between baseline and candidate; inspect policy evidence before reading score deltas." 
+ ], + "recommended_review_mode": "manual_review" + } + ] + } + ], + "run_failures": [], + "created_at": "2026-05-03T15:32:29.794Z" +} diff --git a/tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json b/tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json new file mode 100644 index 0000000000..de970bb9bb --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json @@ -0,0 +1,47 @@ +{ + "experiment_id": "v2_4_long_context_fixture_smoke", + "name": "V2.4 Long Context Fixture Smoke", + "goal": "Verify the V2.4 long-context scenario, fixture, scorer, and batch-report pipeline without model/API spend.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_long_context_fixture_guarded" + ], + "scenario_set_id": "v2_4_long_context_fixture", + "scenario_ids": [ + "long_context_constraint_retention", + "long_context_fact_retrieval", + "long_context_distractor_resistance", + "long_context_compaction_pressure" + ], + "repeat_count": 2, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "smoke", + "evaluation_intent": "exploration", + "execution": { + "adapter": "fixture_trace", + "db_path": ".observability/v2-long-context-fixture-smoke.duckdb", + "timeout_ms": 30000, + "failure_policy": "continue_on_failure", + "env": { + "V2_FIXTURE_DB_PATH": ".observability/v2-long-context-fixture-smoke.duckdb" 
+ } + }, + "status": "ready" +} diff --git a/tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json b/tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json new file mode 100644 index 0000000000..ca3792063d --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.long_context.real_smoke.expectation_contract_v0.json @@ -0,0 +1,44 @@ +{ + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "name": "V2.5 Long Context Real Smoke Expectation Contract v0", + "goal": "Run the tightened real-smoke fact-retrieval contract to verify that clearer answer constraints and review prompts preserve runtime-difference evidence without adding brittle failures.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_5_long_context_expectation_contract", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "db_path": ".observability/v2-long-context-real-smoke.duckdb", + "timeout_ms": 120000, + "max_turns": 6, + "failure_policy": "fail_fast", + 
"allow_fallback_to_bind_existing": true + }, + "status": "ready" +} diff --git a/tests/evals/v2/experiments/_experiment.long_context.real_smoke.json b/tests/evals/v2/experiments/_experiment.long_context.real_smoke.json new file mode 100644 index 0000000000..d425cc8292 --- /dev/null +++ b/tests/evals/v2/experiments/_experiment.long_context.real_smoke.json @@ -0,0 +1,44 @@ +{ + "experiment_id": "v2_4_long_context_real_smoke", + "name": "V2.4 Long Context Real Smoke", + "goal": "Run one small real-model long-context scenario to confirm that execute_harness can produce interpretable cost, compaction, and manual-review evidence.", + "baseline_variant_id": "baseline_default", + "candidate_variant_ids": [ + "candidate_session_memory_sparse" + ], + "scenario_set_id": "v2_4_long_context_real", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "repeat_count": 1, + "score_spec_ids": [ + "task_success.main_chain_observed", + "efficiency.total_billed_tokens", + "decision_quality.session_memory_policy_observed", + "stability.recovery_absence", + "controllability.turn_limit_basic", + "context.retained_constraint_count", + "context.lost_constraint_count", + "context.constraint_retention_rate", + "context.retrieved_fact_hit_rate", + "context.distractor_confusion_count", + "context.total_prompt_input_tokens", + "context.compaction_trigger_count", + "context.compaction_saved_tokens", + "context.success_under_context_pressure", + "context.manual_review_required" + ], + "gate_policy_id": "default_v2_1_gate", + "mode": "execute_harness", + "report_profile": "real_experiment", + "evaluation_intent": "exploration", + "execution": { + "adapter": "cli_print", + "db_path": ".observability/v2-long-context-real-smoke.duckdb", + "timeout_ms": 120000, + "max_turns": 6, + "failure_policy": "fail_fast", + "allow_fallback_to_bind_existing": true + }, + "status": "ready" +} diff --git 
a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb.json new file mode 100644 index 0000000000..20b6f9cfb0 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4", + "change_layer": "scenario", + "variant_name": "candidate_feedback_input_contract_v0", + "implementation_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_v0", + "name": "candidate_feedback_input_contract_v0", + "description": "Stabilize the upstream scenario or runner contract before trusting automated feedback suggestions for this branch of evaluation.", + "change_layer": "mixed", + "notes": "Scenario/evaluator contract draft generated by V2.5 feedback loop alpha." + }, + "implementation_hint": [ + "Tighten expected facts, constraints, and manual review prompts for real smoke.", + "Do not change runtime policy in this candidate." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac.json new file mode 100644 index 0000000000..6583053282 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51", + "change_layer": "feedback_system", + "variant_name": "candidate_feedback_input_contract_v0", + "implementation_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_v0", + "name": "candidate_feedback_input_contract_v0", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Keep feedback taxonomy stable and queue semantics explicit.", + "Do not turn manual review into automatic pass." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a.json new file mode 100644 index 0000000000..1cf2a26734 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d", + "change_layer": "feedback_system", + "variant_name": "candidate_feedback_input_contract_v0", + "implementation_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_v0", + "name": "candidate_feedback_input_contract_v0", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Keep feedback taxonomy stable and queue semantics explicit.", + "Do not turn manual review into automatic pass." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed.json new file mode 100644 index 0000000000..ad84c2e110 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84", + "change_layer": "scenario", + "variant_name": "candidate_long_context_expectation_contract_v0", + "implementation_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_expectation_contract_v0", + "name": "candidate_long_context_expectation_contract_v0", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "change_layer": "mixed", + "notes": "Scenario/evaluator contract draft generated by V2.5 feedback loop alpha." + }, + "implementation_hint": [ + "Tighten expected facts, constraints, and manual review prompts for real smoke.", + "Do not change runtime policy in this candidate." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e.json new file mode 100644 index 0000000000..d8cb26a842 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8", + "change_layer": "scenario", + "variant_name": "candidate_long_context_expectation_contract_v0", + "implementation_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_expectation_contract_v0", + "name": "candidate_long_context_expectation_contract_v0", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Tighten expected facts, constraints, and manual review prompts for real smoke.", + "Do not change runtime policy in this candidate." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652.json new file mode 100644 index 0000000000..59fd6b440a --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91", + "change_layer": "scenario", + "variant_name": "candidate_long_context_expectation_contract_v0", + "implementation_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_expectation_contract_v0", + "name": "candidate_long_context_expectation_contract_v0", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Tighten expected facts, constraints, and manual review prompts for real smoke.", + "Do not change runtime policy in this candidate." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7.json new file mode 100644 index 0000000000..5bf557bf6d --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146", + "change_layer": "scorer", + "variant_name": "candidate_long_context_output_parser_v0", + "implementation_scope": "Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_output_parser_v0", + "name": "candidate_long_context_output_parser_v0", + "description": "Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence.", + "change_layer": "mixed", + "notes": "Evaluator-only candidate draft generated by V2.5 feedback loop alpha." + }, + "implementation_hint": [ + "Extend real-smoke output parsing for expected facts and retained constraints.", + "Keep the human-review boundary explicit." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978.json new file mode 100644 index 0000000000..49a5018101 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36", + "change_layer": "evaluator", + "variant_name": "candidate_long_context_output_parser_v0", + "implementation_scope": "Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_output_parser_v0", + "name": "candidate_long_context_output_parser_v0", + "description": "Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence.", + "change_layer": "mixed", + "notes": "Evaluator-only candidate draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Keep the human-review boundary explicit.", + "Extend real-smoke output parsing for expected facts and retained constraints." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9.json new file mode 100644 index 0000000000..4fbf7be208 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488", + "change_layer": "scorer", + "variant_name": "candidate_long_context_score_binding_v0", + "implementation_scope": "Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_score_binding_v0", + "name": "candidate_long_context_score_binding_v0", + "description": "Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk.", + "change_layer": "mixed", + "notes": "Evaluator-only candidate draft generated by V2.5 feedback loop alpha." + }, + "implementation_hint": [ + "Extend real-smoke output parsing for expected facts and retained constraints.", + "Keep the human-review boundary explicit." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355.json new file mode 100644 index 0000000000..1808d9f397 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2", + "change_layer": "scorer", + "variant_name": "candidate_long_context_score_binding_v0", + "implementation_scope": "Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_score_binding_v0", + "name": "candidate_long_context_score_binding_v0", + "description": "Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk.", + "change_layer": "mixed", + "notes": "Evaluator-only candidate draft generated by V2.5 beta feedback loop." + }, + "implementation_hint": [ + "Keep the human-review boundary explicit.", + "Bind parser output into context score-spec fields without hiding uncertainty." 
+ ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2.json new file mode 100644 index 0000000000..7e1aac9e5f --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4", + "change_layer": "feedback_system", + "variant_name": "candidate_feedback_input_contract_after_contract_v0", + "implementation_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_after_contract_v0", + "name": "candidate_feedback_input_contract_after_contract_v0", + "description": "Stabilize the feedback input contract so an already-realized expectation-contract follow-up is detected and not re-recommended as the next top proposal.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." 
+ }, + "implementation_hint": [ + "Keep feedback taxonomy stable and queue semantics explicit.", + "Do not turn manual review into automatic pass." + ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3.json new file mode 100644 index 0000000000..ea05de7be7 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd", + "change_layer": "feedback_system", + "variant_name": "candidate_feedback_input_contract_v0", + "implementation_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_v0", + "name": "candidate_feedback_input_contract_v0", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." 
+ }, + "implementation_hint": [ + "Keep feedback taxonomy stable and queue semantics explicit.", + "Do not turn manual review into automatic pass." + ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3.json new file mode 100644 index 0000000000..22a64f1343 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6", + "change_layer": "feedback_system", + "variant_name": "candidate_feedback_input_contract_v0", + "implementation_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_feedback_input_contract_v0", + "name": "candidate_feedback_input_contract_v0", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." 
+ }, + "implementation_hint": [ + "Keep feedback taxonomy stable and queue semantics explicit.", + "Do not turn manual review into automatic pass." + ] + } +} diff --git a/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f.json b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f.json new file mode 100644 index 0000000000..a00033ad34 --- /dev/null +++ b/tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f.json @@ -0,0 +1,25 @@ +{ + "candidate_proposal_id": "candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52", + "change_layer": "scenario", + "variant_name": "candidate_long_context_expectation_contract_v0", + "implementation_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "suggested_manifest_patch": { + "proposed_variant_stub": { + "variant_id": "candidate_long_context_expectation_contract_v0", + "name": "candidate_long_context_expectation_contract_v0", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "change_layer": "mixed", + "notes": "Contract-level draft generated by V2.5 beta feedback loop." 
+ }, + "implementation_hint": [ + "Tighten expected facts, constraints, and manual review prompts for real smoke.", + "Do not change runtime policy in this candidate." + ] + } +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f.json new file mode 100644 index 0000000000..da4459df54 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "failure_criteria": [ + "Scenario contract changes erase the current runtime-difference evidence.", + "Long-context intent becomes less specific or more brittle." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b.json new file mode 100644 index 0000000000..e43d342185 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "failure_criteria": [ + "Feedback queue becomes contradictory or unstable across equivalent inputs.", + "Manual review and human approval boundaries become harder to distinguish." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4.json new file mode 100644 index 0000000000..4e3d2c5e2d --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "failure_criteria": [ + "Feedback queue becomes contradictory or unstable across equivalent inputs.", + "Manual review and human approval boundaries become harder to distinguish." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e.json new file mode 100644 index 0000000000..1d6db923ad --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_expectation_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "failure_criteria": [ + "Scenario contract changes erase the current runtime-difference evidence.", + "Long-context intent becomes less specific or more brittle." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6.json new file mode 100644 index 0000000000..8cb61ed219 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_expectation_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "failure_criteria": [ + "Scenario contract changes erase the current runtime-difference evidence.", + "Long-context intent becomes less specific or more brittle." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json new file mode 100644 index 0000000000..0d09dee0e6 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_expectation_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "failure_criteria": [ + "Scenario contract changes erase the current runtime-difference evidence.", + "Long-context intent becomes less specific or more brittle." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400.json new file mode 100644 index 0000000000..6e691b8ddb --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400.json @@ -0,0 +1,21 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_output_parser_v0", + "repeat_count": 2, + "success_criteria": [ + "retrieved_fact_hit_rate is no longer null for real smoke.", + "constraint_retention_rate is no longer null for real smoke.", + "manual_review_required does not increase.", + "distractor_confusion_count remains 0." + ], + "failure_criteria": [ + "Parser introduces false positives against distractor-resistant scenarios.", + "Manual review requirement increases or semantic scores become contradictory." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json new file mode 100644 index 0000000000..f0accdada6 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json @@ -0,0 +1,21 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_output_parser_v0", + "repeat_count": 2, + "success_criteria": [ + "retrieved_fact_hit_rate is no longer null for real smoke.", + "constraint_retention_rate is no longer null for real smoke.", + "manual_review_required does not increase.", + "distractor_confusion_count remains 0." + ], + "failure_criteria": [ + "Parser introduces false positives against distractor-resistant scenarios.", + "Manual review requirement increases or semantic scores become contradictory." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37.json new file mode 100644 index 0000000000..6e0ca67846 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37.json @@ -0,0 +1,21 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_score_binding_v0", + "repeat_count": 2, + "success_criteria": [ + "retrieved_fact_hit_rate is no longer null for real smoke.", + "constraint_retention_rate is no longer null for real smoke.", + "manual_review_required does not increase.", + "distractor_confusion_count remains 0." + ], + "failure_criteria": [ + "Parser introduces false positives against distractor-resistant scenarios.", + "Manual review requirement increases or semantic scores become contradictory." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3.json new file mode 100644 index 0000000000..4c45837a89 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3.json @@ -0,0 +1,21 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3", + "based_on_proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_score_binding_v0", + "repeat_count": 2, + "success_criteria": [ + "retrieved_fact_hit_rate is no longer null for real smoke.", + "constraint_retention_rate is no longer null for real smoke.", + "manual_review_required does not increase.", + "distractor_confusion_count remains 0." + ], + "failure_criteria": [ + "Parser introduces false positives against distractor-resistant scenarios.", + "Manual review requirement increases or semantic scores become contradictory." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json new file mode 100644 index 0000000000..005ab3d444 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_after_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "failure_criteria": [ + "Feedback queue becomes contradictory or unstable across equivalent inputs.", + "Manual review and human approval boundaries become harder to distinguish." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1.json new file mode 100644 index 0000000000..139aaf7aeb --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "failure_criteria": [ + "Feedback queue becomes contradictory or unstable across equivalent inputs.", + "Manual review and human approval boundaries become harder to distinguish." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f.json new file mode 100644 index 0000000000..124c126090 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_feedback_input_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "failure_criteria": [ + "Feedback queue becomes contradictory or unstable across equivalent inputs.", + "Manual review and human approval boundaries become harder to distinguish." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json new file mode 100644 index 0000000000..9ccf5a6458 --- /dev/null +++ b/tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json @@ -0,0 +1,20 @@ +{ + "next_experiment_plan_id": "experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4", + "based_on_proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52", + "scenario_ids": [ + "long_context_fact_retrieval_real_smoke_contract_v0" + ], + "baseline_variant_id": "baseline_default", + "candidate_variant_id": "candidate_long_context_expectation_contract_v0", + "repeat_count": 1, + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "failure_criteria": [ + "Scenario contract changes erase the current runtime-difference evidence.", + "Long-context intent becomes less specific or more brittle." 
+ ], + "manual_review_required": true +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b.json new file mode 100644 index 0000000000..134dc752e1 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "constraint_retention_rate_missing_long_context_fact_retrieval_real_smoke", + "severity": "medium", + "summary": "constraint_retention_rate_mean is null for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json new file mode 100644 index 0000000000..a1f1a7a457 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json @@ -0,0 +1,16 @@ +{ + "finding_id": 
"finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "constraint_retention_rate_missing_long_context_fact_retrieval_real_smoke", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke", + "summary": "constraint_retention_rate_mean is null for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae.json new file mode 100644 index 0000000000..7822a75fcc --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse", + "severity": "high", + "summary": "flaky_status is inconclusive for 
long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723.json new file mode 100644 index 0000000000..22a4c55516 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_baseline_default", + "severity": "high", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee.json new file mode 100644 index 0000000000..85008e8e64 --- /dev/null +++ 
b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke:candidate_session_memory_sparse", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740.json new file mode 100644 index 0000000000..ba9aecca51 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": 
"ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_baseline_default", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke:baseline_default", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008.json new file mode 100644 index 0000000000..de9c169b54 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_baseline_default", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke:baseline_default", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / baseline_default.", + "evidence_ref": 
"tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/0/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97.json new file mode 100644 index 0000000000..e655d75a87 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke:candidate_session_memory_sparse", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke / candidate_session_memory_sparse.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/1/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39.json 
b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39.json new file mode 100644 index 0000000000..b2bf2af311 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "long_context_review_verdict_needs_manual_review", + "severity": "medium", + "summary": "The experiment-level long_context_review_verdict remains needs_manual_review.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json new file mode 100644 index 0000000000..02275e1f77 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": 
"long_context_review_verdict_needs_manual_review", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The experiment-level long_context_review_verdict remains needs_manual_review.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json new file mode 100644 index 0000000000..c76eee22d8 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "long_context_review_verdict_needs_manual_review", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The experiment-level long_context_review_verdict remains needs_manual_review.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_review_verdict", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff 
--git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2.json new file mode 100644 index 0000000000..7c2b3c7b12 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "manual_review_required_long_context_fact_retrieval_real_smoke", + "severity": "medium", + "summary": "manual_review_required is true for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json new file mode 100644 index 0000000000..36541dae50 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8", + "source_experiment_id": 
"v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "manual_review_required_long_context_fact_retrieval_real_smoke", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke", + "summary": "manual_review_required is true for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json new file mode 100644 index 0000000000..e0c0aa83de --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "manual_review_required_long_context_fact_retrieval_real_smoke", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke", + "summary": "manual_review_required is true for long_context_fact_retrieval_real_smoke.", + "evidence_ref": 
"tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_summary/0/manual_review_required", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae.json new file mode 100644 index 0000000000..ef6d23c388 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "missing_score_count_positive", + "severity": "medium", + "summary": "The experiment still has 1 missing score(s).", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json new file mode 100644 index 0000000000..81207cb709 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b", + 
"source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "missing_score_count_positive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The experiment still has 1 missing score(s).", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json new file mode 100644 index 0000000000..70a613fc06 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "missing_score_count_positive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The experiment still has 1 missing score(s).", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/risk_verdict/missing_score_count", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + 
"fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006.json new file mode 100644 index 0000000000..468b7cfeb3 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "retrieved_fact_hit_rate_missing_long_context_fact_retrieval_real_smoke", + "severity": "medium", + "summary": "retrieved_fact_hit_rate_mean is null for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json new file mode 100644 index 0000000000..9a05e2f71e --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json @@ -0,0 +1,16 @@ +{ + "finding_id": 
"finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "retrieved_fact_hit_rate_missing_long_context_fact_retrieval_real_smoke", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke", + "summary": "retrieved_fact_hit_rate_mean is null for long_context_fact_retrieval_real_smoke.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4.json new file mode 100644 index 0000000000..27fcf540a2 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4.json @@ -0,0 +1,10 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "risk_verdict_inconclusive", + "severity": "medium", + "summary": "The regression-risk verdict is inconclusive for this experiment.", + "evidence_ref": 
"tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status", + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json new file mode 100644 index 0000000000..3c723e01ac --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "finding_type": "risk_verdict_inconclusive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The regression-risk verdict is inconclusive for this experiment.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json new file mode 100644 index 0000000000..19a95c3ac4 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json @@ -0,0 +1,16 @@ +{ + "finding_id": 
"finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "finding_type": "risk_verdict_inconclusive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_4_long_context_real_smoke", + "summary": "The regression-risk verdict is inconclusive for this experiment.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/risk_verdict/status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f.json new file mode 100644 index 0000000000..ed1cd1b772 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse", + "finding_kind": "stability_gap", + "severity": "warning", + 
"scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0:candidate_session_memory_sparse", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438.json new file mode 100644 index 0000000000..f63f05dc09 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0:baseline_default", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / baseline_default.", + "evidence_ref": 
"tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052.json new file mode 100644 index 0000000000..fa25062b1d --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0:candidate_session_memory_sparse", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / candidate_session_memory_sparse.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + 
"fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4.json new file mode 100644 index 0000000000..072500f713 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "flaky_status_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default", + "finding_kind": "stability_gap", + "severity": "warning", + "scope": "variant", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0:baseline_default", + "summary": "flaky_status is inconclusive for long_context_fact_retrieval_real_smoke_contract_v0 / baseline_default.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json 
b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json new file mode 100644 index 0000000000..3e63615cd5 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "long_context_review_verdict_needs_manual_review", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The experiment-level long_context_review_verdict remains needs_manual_review.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json new file mode 100644 index 0000000000..44e4cc20c3 --- /dev/null +++ 
b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "long_context_review_verdict_needs_manual_review", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The experiment-level long_context_review_verdict remains needs_manual_review.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json new file mode 100644 index 0000000000..d584201e05 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad", + 
"source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "manual_review_required_long_context_fact_retrieval_real_smoke_contract_v0", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0", + "summary": "manual_review_required is true for long_context_fact_retrieval_real_smoke_contract_v0.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json new file mode 100644 index 0000000000..be2e2b8502 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": 
"manual_review_required_long_context_fact_retrieval_real_smoke_contract_v0", + "finding_kind": "manual_review_boundary", + "severity": "warning", + "scope": "scenario", + "scope_ref": "long_context_fact_retrieval_real_smoke_contract_v0", + "summary": "manual_review_required is true for long_context_fact_retrieval_real_smoke_contract_v0.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required", + "is_blocking": false, + "requires_manual_judgement": true, + "auto_resolvable": false, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json new file mode 100644 index 0000000000..795fcefb72 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "missing_score_count_positive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The experiment still has 1 missing score(s).", + "evidence_ref": 
"tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/missing_score_count", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json new file mode 100644 index 0000000000..5d770f1288 --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "missing_score_count_positive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The experiment still has 1 missing score(s).", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/missing_score_count", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json 
b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json new file mode 100644 index 0000000000..9b0c2002dc --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json @@ -0,0 +1,16 @@ +{ + "finding_id": "finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "risk_verdict_inconclusive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The regression-risk verdict is inconclusive for this experiment.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json new file mode 100644 index 0000000000..cd8e70dfbf --- /dev/null +++ b/tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json @@ -0,0 +1,16 @@ +{ + "finding_id": 
"finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "finding_type": "risk_verdict_inconclusive", + "finding_kind": "missing_score", + "severity": "warning", + "scope": "experiment", + "scope_ref": "v2_5_long_context_real_smoke_expectation_contract_v0", + "summary": "The regression-risk verdict is inconclusive for this experiment.", + "evidence_ref": "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/risk_verdict/status", + "is_blocking": false, + "requires_manual_judgement": false, + "auto_resolvable": true, + "fact_or_inference": "fact" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c.json new file mode 100644 index 0000000000..4b969cccf6 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c.json @@ -0,0 +1,17 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4", + "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae" + ], + "hypothesis": "The regression-risk gate is inconclusive mainly because some semantic long-context scores are still missing, not 
because the runner failed to execute.", + "confidence": "medium", + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count" + ], + "risks": [ + "If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13.json new file mode 100644 index 0000000000..05038f1dcb --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2", + "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count" + ], + "hypothesis": "The regression-risk gate is inconclusive mainly because semantic long-context scores are still missing, not because the runner failed to execute.", + "confidence": "medium", + "falsifiable_by": [ + "After parser output is bound into context scores, rerun the 
same real smoke and confirm whether risk_verdict becomes more decisive without hiding uncertainty." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/risk_verdict/missing_score_count" + ], + "risks": [ + "If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a.json new file mode 100644 index 0000000000..d3b20a544a --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a.json @@ -0,0 +1,17 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2" + ], + "hypothesis": "The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but not fully resolve final semantic correctness in real smoke.", + "confidence": "high", + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required" + 
], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243.json new file mode 100644 index 0000000000..2664bc2551 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required" + ], + "hypothesis": "The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke.", + "confidence": "high", + "falsifiable_by": [ + "Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic." 
+ ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/manual_review_required" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447.json new file mode 100644 index 0000000000..4f7baf4383 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_summary/0/manual_review_required" + ], + "hypothesis": "The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke.", + "confidence": "high", + "falsifiable_by": [ + "Tighten real-smoke expectations and review prompts, then rerun 
and confirm whether manual-review scope shrinks without pretending to be fully automatic." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/long_context_summary/0/manual_review_required" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57.json new file mode 100644 index 0000000000..474a475619 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57.json @@ -0,0 +1,18 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b", + "finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006" + ], + "hypothesis": "The current real-smoke scorer lacks a lightweight semantic output parser, so fact retrieval and constraint retention cannot yet be auto-judged from runtime outputs.", + "confidence": "medium", + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean", + 
"tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean" + ], + "risks": [ + "A parser that is too narrow can miss valid answers.", + "A parser that is too loose can create false positives." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8.json new file mode 100644 index 0000000000..3713cab047 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8.json @@ -0,0 +1,26 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c", + "finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean" + ], + "hypothesis": "The current real-smoke evaluator lacks a lightweight semantic output parser, so fact retrieval and constraint retention cannot yet be auto-judged from runtime outputs.", + "confidence": "medium", + "falsifiable_by": [ + "Implement a lightweight real-smoke output parser and rerun long_context_fact_retrieval_real_smoke.", + "Verify retrieved_fact_hit_rate and constraint_retention_rate become 
non-null without inflating distractor_confusion_count." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/constraint_retention_rate_mean", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/long_context_summary/0/retrieved_fact_hit_rate_mean" + ], + "risks": [ + "A parser that is too narrow can miss valid answers.", + "A parser that is too loose can create false positives." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93.json new file mode 100644 index 0000000000..8806b81826 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93.json @@ -0,0 +1,17 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723", + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae" + ], + "hypothesis": "Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used.", + "confidence": "medium", + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status" + ], + "risks": [ + "Pursuing harness 
changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e.json new file mode 100644 index 0000000000..9488c8456b --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740", + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status" + ], + "hypothesis": "Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used.", + "confidence": "medium", + "falsifiable_by": [ + "Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable." 
+ ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json#/stability_summary/1/flaky_status" + ], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0.json new file mode 100644 index 0000000000..ed51c81e49 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0", + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008", + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/1/flaky_status" + ], + "hypothesis": "Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used.", + "confidence": "medium", + "falsifiable_by": [ + "Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges 
to stable." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json#/stability_summary/1/flaky_status" + ], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661.json new file mode 100644 index 0000000000..638def7525 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661.json @@ -0,0 +1,25 @@ +{ + "hypothesis_id": "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661", + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044", + "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required" + ], + "hypothesis": "The tightened expectation contract is already in place, 
but manual review still remains open. The next bottleneck is feedback-loop deduplication and proposal stability, not another copy of the same scenario-contract recommendation.", + "confidence": "high", + "falsifiable_by": [ + "Re-run feedback on the same expectation-contract artifact and confirm the queue no longer repeats the same expectation-contract recommendation as top priority.", + "Verify the next top recommendation, if any, shifts to feedback-system stabilization rather than a duplicate scenario contract." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b.json new file mode 100644 index 0000000000..274160c45c --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b", + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de", + 
"finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required" + ], + "hypothesis": "The current long-context evaluation boundary is still partially manual because the system can observe structure and governance, but cannot yet fully resolve final semantic correctness in real smoke.", + "confidence": "high", + "falsifiable_by": [ + "Tighten real-smoke expectations and review prompts, then rerun and confirm whether manual-review scope shrinks without pretending to be fully automatic." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_review_verdict", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/long_context_summary/0/manual_review_required" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." 
+ ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e.json new file mode 100644 index 0000000000..f1bd9f338e --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e", + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438", + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status" + ], + "hypothesis": "Observed instability suggests that runner mechanics or scenario contracts still need tightening before higher-trust automated feedback can be used.", + "confidence": "medium", + "falsifiable_by": [ + "Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable." 
+ ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status" + ], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243.json b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243.json new file mode 100644 index 0000000000..db96aa4f17 --- /dev/null +++ b/tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243.json @@ -0,0 +1,24 @@ +{ + "hypothesis_id": "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243", + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4", + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052" + ], + "depends_on_finding_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status" + ], + "hypothesis": "Observed instability suggests that runner mechanics or scenario contracts still need tightening 
before higher-trust automated feedback can be used.", + "confidence": "medium", + "falsifiable_by": [ + "Increase repeat_count for the real smoke input and inspect whether flaky_status remains inconclusive or converges to stable." + ], + "supporting_evidence_refs": [ + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/0/flaky_status", + "tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json#/stability_summary/1/flaky_status" + ], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "fact_or_inference": "inference" +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146.json new file mode 100644 index 0000000000..fb5e34edff --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146.json @@ -0,0 +1,15 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57" + ], + "proposal_type": "evaluator_improvement", + "target_layer": "scorer", + "description": "Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence.", + "expected_effect": "Convert currently-null long-context semantic scores into rule-backed observed values where the output format is narrow enough.", + "risks": [ + "A parser that is too narrow can miss valid answers.", + "A 
parser that is too loose can create false positives." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json new file mode 100644 index 0000000000..3fb70dea21 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json @@ -0,0 +1,25 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c", + "finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de" + ], + "proposal_type": "evaluator_improvement", + "target_layer": "evaluator", + "priority": "P0", + "queue_bucket": "top_recommendation", + "description": "Add a lightweight output parser for long-context real smoke so expected facts and retained constraints can be mapped to explicit score evidence.", + "expected_effect": "Convert currently-null long-context semantic scores into rule-backed observed values where the output format is narrow enough.", + "why_now": "This directly targets the two most important semantic nulls in the current real-smoke sample and does not require runtime harness changes.", + "why_not_now": null, + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + "risks": [ + "A parser that is too narrow can miss valid answers.", + "A parser that is too loose can create false positives." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488.json new file mode 100644 index 0000000000..79095292c4 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488.json @@ -0,0 +1,14 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c" + ], + "proposal_type": "evaluator_improvement", + "target_layer": "scorer", + "description": "Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk.", + "expected_effect": "Reduce inconclusive gate results caused purely by absent semantic score evidence.", + "risks": [ + "If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json new file mode 100644 index 0000000000..ee2cc76c90 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json @@ -0,0 +1,24 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2", + "finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b" + ], + "proposal_type": "score_binding_improvement", + "target_layer": "scorer", + "priority": "P1", + "queue_bucket": "blocked", + "description": "Map parser output into context score-spec fields so long-context risk gating can distinguish missing semantics from genuine regression risk.", + "expected_effect": "Reduce inconclusive gate results caused purely by absent semantic score evidence.", + "why_now": "The gate cannot become more informative until parser output is formally bound into context scores.", + "why_not_now": "This is blocked until a lightweight parser exists; there is nothing stable to bind before that.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + "risks": [ + "If missing semantic scores are ignored, risk gating may appear healthier than the evidence supports." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4.json new file mode 100644 index 0000000000..c931f8071b --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4.json @@ -0,0 +1,14 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93" + ], + "proposal_type": "scenario_improvement", + "target_layer": "scenario", + "description": "Stabilize the upstream scenario or runner contract before trusting automated feedback suggestions for this branch of evaluation.", + "expected_effect": "Reduce flaky or failed inputs before turning feedback artifacts into candidate work items.", + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json new file mode 100644 index 0000000000..a9fd7090de --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json @@ -0,0 +1,24 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740", + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee" + ], + "proposal_type": "feedback_contract_improvement", + "target_layer": "feedback_system", + "priority": "P2", + "queue_bucket": "deferred", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "expected_effect": "Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items.", + "why_now": "This keeps the feedback system honest when stability evidence is weak or under-sampled.", + "why_not_now": "The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json new file mode 100644 index 0000000000..b828e5b279 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json @@ -0,0 +1,24 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008", + "finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97" + ], + "proposal_type": "feedback_contract_improvement", + "target_layer": "feedback_system", + "priority": "P2", + "queue_bucket": "deferred", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "expected_effect": "Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items.", + "why_now": "This keeps the feedback system honest when stability evidence is weak or under-sampled.", + "why_not_now": "The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84.json new file mode 100644 index 0000000000..3eb845de65 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84.json @@ -0,0 +1,14 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a" + ], + "proposal_type": "scenario_improvement", + "target_layer": "scenario", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "expected_effect": "Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs.", + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json new file mode 100644 index 0000000000..83c1eb770c --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json @@ -0,0 +1,27 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8" + ], + "proposal_type": "scenario_improvement", + "target_layer": "scenario", + "priority": "P1", + "queue_bucket": "recommended_later", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "expected_effect": "Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs.", + "why_now": "This is the cleanest way to narrow manual review once semantic evidence collection improves.", + "why_not_now": "By itself it does not convert null semantic scores into formal evidence, so it is best staged after parser work begins.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e", 
+ "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json new file mode 100644 index 0000000000..841801dc97 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json @@ -0,0 +1,27 @@ +{ + "proposal_id": "proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91", + "based_on_hypothesis_ids": [ + "hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447" + ], + "based_on_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a" + ], + "proposal_type": "scenario_improvement", + "target_layer": "scenario", + "priority": "P1", + "queue_bucket": "top_recommendation", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "expected_effect": "Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs.", + "why_now": "Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision.", + "why_not_now": null, + "blocking_finding_ids": [], + 
"manual_judgement_finding_ids": [ + "finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194", + "finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json new file mode 100644 index 0000000000..5fee2dfb68 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json @@ -0,0 +1,27 @@ +{ + "proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4", + "based_on_hypothesis_ids": [ + "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661" + ], + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044", + "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925" + ], + "proposal_type": "feedback_contract_improvement", + "target_layer": "feedback_system", + "priority": "P1", + "queue_bucket": "top_recommendation", + "description": "Stabilize the feedback input contract so an already-realized expectation-contract follow-up is detected and not re-recommended as the next 
top proposal.", + "expected_effect": "Prevent proposal-loop duplication and keep approval cards aligned with the true next unresolved bottleneck.", + "why_now": "The current source experiment already uses expectation_contract_v0, so repeating the same contract proposal would be a feedback-loop error rather than a useful next action.", + "why_not_now": null, + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044", + "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json new file mode 100644 index 0000000000..9dd9e29749 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json @@ -0,0 +1,24 @@ +{ + "proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd", + "based_on_hypothesis_ids": [ + "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e" + ], + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438", + 
"finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f" + ], + "proposal_type": "feedback_contract_improvement", + "target_layer": "feedback_system", + "priority": "P2", + "queue_bucket": "deferred", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "expected_effect": "Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items.", + "why_now": "This keeps the feedback system honest when stability evidence is weak or under-sampled.", + "why_not_now": "The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + "risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." 
+ ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json new file mode 100644 index 0000000000..b70bef1d18 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json @@ -0,0 +1,24 @@ +{ + "proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6", + "based_on_hypothesis_ids": [ + "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243" + ], + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4", + "finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052" + ], + "proposal_type": "feedback_contract_improvement", + "target_layer": "feedback_system", + "priority": "P2", + "queue_bucket": "deferred", + "description": "Stabilize the upstream scenario or feedback input contract before trusting automated feedback suggestions for this branch of evaluation.", + "expected_effect": "Reduce noisy or ambiguous inputs before turning feedback artifacts into concrete candidate work items.", + "why_now": "This keeps the feedback system honest when stability evidence is weak or under-sampled.", + "why_not_now": "The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred.", + "blocking_finding_ids": [], + "manual_judgement_finding_ids": [], + 
"risks": [ + "Pursuing harness changes before stabilizing the evaluator could hide platform issues behind candidate noise." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json new file mode 100644 index 0000000000..fddb063fd3 --- /dev/null +++ b/tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json @@ -0,0 +1,27 @@ +{ + "proposal_id": "proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52", + "based_on_hypothesis_ids": [ + "hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b" + ], + "based_on_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de", + "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad" + ], + "proposal_type": "scenario_improvement", + "target_layer": "scenario", + "priority": "P1", + "queue_bucket": "top_recommendation", + "description": "Tighten long-context real-smoke expected facts, constraints, and review questions so the evaluator has clearer semantic anchors without pretending to be fully automatic.", + "expected_effect": "Reduce avoidable manual-review ambiguity while preserving an explicit human-review boundary for nuanced outputs.", + "why_now": "Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision.", + "why_not_now": null, + 
"blocking_finding_ids": [], + "manual_judgement_finding_ids": [ + "finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de", + "finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad" + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "requires_human_approval": true +} diff --git a/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.json b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.json new file mode 100644 index 0000000000..4561f2add5 --- /dev/null +++ b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.json @@ -0,0 +1,48 @@ +{ + "feedback_run_id": "feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66", + "generated_at": "2026-05-03T10:32:10.763Z", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_experiment_run_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json", + "source_report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" + ], + "finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T103210763Z_aaceea39.json", + 
"tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T103210763Z_28ef91e4.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T103210763Z_5d5767ae.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T103210763Z_bd4fc15b.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T103210763Z_e7b6a006.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T103210763Z_acb6cee2.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_f63fd723.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T103210763Z_2086d4ae.json" + ], + "hypothesis_refs": [ + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T103210763Z_e3ed5d57.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T103210763Z_a207056a.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T103210763Z_ac3b840c.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T103210763Z_21239a93.json" + ], + "proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T103210763Z_19602146.json", + 
"tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T103210763Z_d022ab84.json", + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T103210763Z_a7718488.json", + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T103210763Z_b0a56fb4.json" + ], + "candidate_proposal_refs": [ + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_c72924f7.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_7f0974ed.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_d3a111b9.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_2d4e45cb.json" + ], + "next_experiment_plan_refs": [ + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T103210763Z_4d4bb400.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T103210763Z_6f16a48e.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T103210763Z_f6ca0f37.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T103210763Z_d1610f7f.json" + ], + "report_ref": 
"ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_4_long_context_real_smoke_alpha_20260503T103210763Z_9b46cb66.md", + "human_approval_required": true, + "status": "completed" +} diff --git a/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.json b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.json new file mode 100644 index 0000000000..719a4d70cf --- /dev/null +++ b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.json @@ -0,0 +1,102 @@ +{ + "feedback_run_id": "feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b", + "taxonomy_version": "v2_5_beta", + "generated_at": "2026-05-03T12:45:41.901Z", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_experiment_run_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T060617173Z.json", + "source_report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" + ], + "finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json", + 
"tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_534c0740.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T124541901Z_02dccdee.json" + ], + "hypothesis_refs": [ + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_real_output_semantic_parser_missing_20260503T124541901Z_569976b8.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T124541901Z_54cd7243.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_gate_inconclusive_due_to_missing_semantic_scores_20260503T124541901Z_f3494c13.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T124541901Z_e6e1981e.json" + ], + "proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json", + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json", + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json", + 
"tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json" + ], + "candidate_proposal_refs": [ + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_d4ec8978.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_d326279e.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_b0296355.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_66e07dac.json" + ], + "next_experiment_plan_refs": [ + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T124541901Z_06010de6.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_score_binding_v0_20260503T124541901Z_415a96a3.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T124541901Z_0b77bb8b.json" + ], + "proposal_queue": { + "top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json", + "recommended_now_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json" + ], + "recommended_later_proposal_refs": 
[ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8.json" + ], + "deferred_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51.json" + ], + "blocked_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2.json" + ] + }, + "blocking_finding_refs": [], + "manual_judgement_required_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T124541901Z_4fbdb97e.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T124541901Z_efe417a8.json" + ], + "auto_resolvable_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T124541901Z_72968af2.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T124541901Z_70cd437b.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_constraint_retention_rate_missing_long_context_f_20260503T124541901Z_b497c06c.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_retrieved_fact_hit_rate_missing_long_context_fac_20260503T124541901Z_2f6593de.json" + ], + "approval_card": { + "current_top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_add_long_context_output_parser_v0_20260503T124541901Z_5e4eee36.json", + "why_now": "This directly targets the two most important semantic nulls in the current real-smoke sample and does not require runtime harness changes.", + "why_not_others_yet": [ + 
"proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T124541901Z_013f97a8: recommended_later - By itself it does not convert null semantic scores into formal evidence, so it is best staged after parser work begins.", + "proposal_v2_4_long_context_real_smoke_map_parser_output_to_context_scores_v0_20260503T124541901Z_6af2f3f2: blocked - This is blocked until a lightweight parser exists; there is nothing stable to bind before that.", + "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T124541901Z_30cd7b51: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred." + ], + "approval_scope": "Only scorer/report/evaluator files may change. No runtime harness policy changes are allowed in this proposal.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "next_experiment_plan_ref": "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_output_parser_v0_20260503T124541901Z_346bd758.json", + "success_criteria": [ + "retrieved_fact_hit_rate is no longer null for real smoke.", + "constraint_retention_rate is no longer null for real smoke.", + "manual_review_required does not increase.", + "distractor_confusion_count remains 0." + ], + "risks": [ + "A parser that is too narrow can miss valid answers.", + "A parser that is too loose can create false positives." + ], + "manual_review_boundary": "Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks." 
+ }, + "report_ref": "ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_4_long_context_real_smoke_beta_20260503T124541901Z_355a063b.md", + "human_approval_required": true, + "status": "completed" +} diff --git a/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.json b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.json new file mode 100644 index 0000000000..59dab518d6 --- /dev/null +++ b/tests/evals/v2/feedback/runs/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.json @@ -0,0 +1,82 @@ +{ + "feedback_run_id": "feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90", + "taxonomy_version": "v2_5_beta", + "generated_at": "2026-05-03T14:59:42.988Z", + "source_experiment_id": "v2_4_long_context_real_smoke", + "source_experiment_run_ref": "tests/evals/v2/experiment-runs/v2_4_long_context_real_smoke_2026-05-03T145644822Z.json", + "source_report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_vs_run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" + ], + "finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json", + 
"tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_69707008.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_flaky_status_long_context_fact_retrieval_real_sm_20260503T145942988Z_6ac48f97.json" + ], + "hypothesis_refs": [ + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_manual_review_boundary_still_open_20260503T145942988Z_2aa4b447.json", + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_4_long_context_real_smoke_runner_or_scenario_instability_20260503T145942988Z_01fd35e0.json" + ], + "proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json", + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json" + ], + "candidate_proposal_refs": [ + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_1bdb5652.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_829a2c3a.json" + ], + "next_experiment_plan_refs": [ + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_feedback_input_contract_v0_20260503T145942988Z_1e6a3fb4.json" + ], + "proposal_queue": { + "top_recommendation_proposal_ref": 
"tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json", + "recommended_now_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json" + ], + "recommended_later_proposal_refs": [], + "deferred_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d.json" + ], + "blocked_proposal_refs": [] + }, + "blocking_finding_refs": [], + "manual_judgement_required_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_long_context_review_verdict_needs_manual_review_20260503T145942988Z_3c7be194.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_manual_review_required_long_context_fact_retriev_20260503T145942988Z_7fb1e53a.json" + ], + "auto_resolvable_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_risk_verdict_inconclusive_20260503T145942988Z_e946246a.json", + "tests/evals/v2/feedback/findings/finding_v2_4_long_context_real_smoke_missing_score_count_positive_20260503T145942988Z_f7a7a853.json" + ], + "approval_card": { + "current_top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_4_long_context_real_smoke_tighten_real_smoke_expectations_v0_20260503T145942988Z_3851af91.json", + "why_now": "Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision.", + "why_not_others_yet": [ + "proposal_v2_4_long_context_real_smoke_stabilize_feedback_input_contract_v0_20260503T145942988Z_a0ba210d: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred." 
+ ], + "approval_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "next_experiment_plan_ref": "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_4_long_context_real_smoke_candidate_long_context_expectation_contract_v0_20260503T145942988Z_62748519.json", + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "manual_review_boundary": "Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks." + }, + "report_ref": "ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_4_long_context_real_smoke_beta_20260503T145942988Z_7893da90.md", + "human_approval_required": true, + "status": "completed" +} diff --git a/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.json b/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.json new file mode 100644 index 0000000000..d67f344642 --- /dev/null +++ b/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.json @@ -0,0 +1,82 @@ +{ + "feedback_run_id": "feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65", + "taxonomy_version": "v2_5_beta", + "generated_at": "2026-05-03T15:32:44.784Z", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_experiment_run_ref": 
"tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json", + "source_report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" + ], + "finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_3b395438.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T153244784Z_22ead42f.json" + ], + "hypothesis_refs": [ + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_still_open_20260503T153244784Z_89789b5b.json", + 
"tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T153244784Z_9de1252e.json" + ], + "proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json", + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json" + ], + "candidate_proposal_refs": [ + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_f1ed1c1f.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_0241aad3.json" + ], + "next_experiment_plan_refs": [ + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T153244784Z_c29168a1.json" + ], + "proposal_queue": { + "top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json", + "recommended_now_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json" + ], + "recommended_later_proposal_refs": [], + "deferred_proposal_refs": [ + 
"tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd.json" + ], + "blocked_proposal_refs": [] + }, + "blocking_finding_refs": [], + "manual_judgement_required_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T153244784Z_ba0288de.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T153244784Z_0bf6f7ad.json" + ], + "auto_resolvable_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T153244784Z_5de554f8.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T153244784Z_d24225e3.json" + ], + "approval_card": { + "current_top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_tighten_real_smoke_expectations_v0_20260503T153244784Z_8bc73d52.json", + "why_now": "Semantic parsing is now present, so the next bottleneck is the real-smoke expectation contract and review-prompt precision.", + "why_not_others_yet": [ + "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T153244784Z_d19670cd: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred." 
+ ], + "approval_scope": "Only scenario manifests, expected facts, constraints, and manual review prompts may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "runtime harness policy files" + ], + "next_experiment_plan_ref": "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_long_context_expectation_contract_v0_20260503T153244784Z_ff510cf4.json", + "success_criteria": [ + "Manual review prompts become more specific and lower-ambiguity.", + "Scenario intent remains matched.", + "No new flaky or failed run groups appear." + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "manual_review_boundary": "Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks." + }, + "report_ref": "ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T153244784Z_57470f65.md", + "human_approval_required": true, + "status": "completed" +} diff --git a/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json b/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json new file mode 100644 index 0000000000..fb0727dbbb --- /dev/null +++ b/tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.json @@ -0,0 +1,82 @@ +{ + "feedback_run_id": "feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e", + "taxonomy_version": "v2_5_beta", + "generated_at": "2026-05-03T15:46:26.054Z", + "source_experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "source_experiment_run_ref": 
"tests/evals/v2/experiment-runs/v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.json", + "source_report_refs": [ + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\compare_run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_vs_run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md" + ], + "finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_537428d4.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260503T154626054Z_1e601052.json" + ], + "hypothesis_refs": [ + "tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260503T154626054Z_46855661.json", + 
"tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260503T154626054Z_d615b243.json" + ], + "proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json", + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json" + ], + "candidate_proposal_refs": [ + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_b4723ba2.json", + "tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_9131c8e3.json" + ], + "next_experiment_plan_refs": [ + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json", + "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260503T154626054Z_7c0d5a2f.json" + ], + "proposal_queue": { + "top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json", + "recommended_now_proposal_refs": [ + "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json" + ], + "recommended_later_proposal_refs": [], + "deferred_proposal_refs": [ + 
"tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6.json" + ], + "blocked_proposal_refs": [] + }, + "blocking_finding_refs": [], + "manual_judgement_required_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260503T154626054Z_72a1d044.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260503T154626054Z_5550e925.json" + ], + "auto_resolvable_finding_refs": [ + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260503T154626054Z_7e7d8ae0.json", + "tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260503T154626054Z_797c63b8.json" + ], + "approval_card": { + "current_top_recommendation_proposal_ref": "tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260503T154626054Z_75dd25e4.json", + "why_now": "The current source experiment already uses expectation_contract_v0, so repeating the same contract proposal would be a feedback-loop error rather than a useful next action.", + "why_not_others_yet": [ + "proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260503T154626054Z_0bb87bd6: deferred - The current sample has a stronger semantic-evidence gap than a true contract-breakage gap, so this should remain deferred." 
+ ], + "approval_scope": "Only feedback extraction rules, feedback taxonomy, and report/queue logic may change.", + "do_not_touch": [ + "src/query.ts", + "src/services/SessionMemory/sessionMemory.ts", + "src/services/api/claude.ts" + ], + "next_experiment_plan_ref": "tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260503T154626054Z_2002193a.json", + "success_criteria": [ + "Feedback queue semantics become stable and easier to approve.", + "Top recommendation remains unique.", + "No new schema ambiguity appears in feedback artifacts." + ], + "risks": [ + "Treating manual review signals as auto-pass would overstate evaluator certainty." + ], + "manual_review_boundary": "Do not treat manual_review_required or needs_manual_review as automatic pass. Any approved proposal must preserve explicit human review for nuanced semantic checks." + }, + "report_ref": "ObservrityTask/10-系统版本/v2/07-反馈报告/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260503T154626054Z_5ed1c19e.md", + "human_approval_required": true, + "status": "completed" +} diff --git a/tests/evals/v2/fixtures/long-context/compaction-pressure/constraints.json b/tests/evals/v2/fixtures/long-context/compaction-pressure/constraints.json new file mode 100644 index 0000000000..3a2e53eb5f --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/compaction-pressure/constraints.json @@ -0,0 +1,19 @@ +{ + "constraints": [ + { + "id": "three_exact_sections", + "description": "Use exactly the headings Overview, Evidence, Conclusion.", + "severity": "hard" + }, + { + "id": "no_archived_event_names", + "description": "Do not quote archived event names as current behavior.", + "severity": "hard" + }, + { + "id": "read_only_task", + "description": "Keep the task read-only.", + "severity": "hard" + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md 
b/tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md new file mode 100644 index 0000000000..f58bcac3cf --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md @@ -0,0 +1,25 @@ +## High-Pressure Review Packet + +This packet compresses several governance notes into one place. + +### Hard Constraints + +1. The final answer must contain exactly three section headings: Overview, Evidence, Conclusion. +2. Do not quote archived event names as current behavior. +3. Keep the task read-only. + +### Key Facts + +- Compaction signals are currently observed through `messages.compact_boundary.applied` and `messages.microcompact.applied`. +- Tool-result budget pressure is observed through `messages.tool_result_budget.applied`. +- The dedicated long-context score spec for saved compaction tokens is `context.compaction_saved_tokens`. + +### Pressure Notes + +- Under high context pressure, compact may trigger multiple times. +- Session memory or subagents may appear as a supporting governance mechanism. + +### Archived / False Notes + +- `messages.context_shrink.applied` is not the active compaction event name. +- `context.cache_prune_count` is not the active score spec. diff --git a/tests/evals/v2/fixtures/long-context/compaction-pressure/critical_facts.json b/tests/evals/v2/fixtures/long-context/compaction-pressure/critical_facts.json new file mode 100644 index 0000000000..013038e8b5 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/compaction-pressure/critical_facts.json @@ -0,0 +1,16 @@ +{ + "facts": [ + { + "id": "compact_boundary_event", + "description": "Compaction signals use messages.compact_boundary.applied and messages.microcompact.applied." + }, + { + "id": "tool_result_budget_event", + "description": "Tool-result budget pressure uses messages.tool_result_budget.applied." + }, + { + "id": "compaction_saved_tokens_score", + "description": "The score spec name is context.compaction_saved_tokens." 
+ } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/compaction-pressure/distractors.json b/tests/evals/v2/fixtures/long-context/compaction-pressure/distractors.json new file mode 100644 index 0000000000..0cf721cbff --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/compaction-pressure/distractors.json @@ -0,0 +1,12 @@ +{ + "distractors": [ + { + "id": "fake_event_context_shrink", + "description": "messages.context_shrink.applied is a false archived event name." + }, + { + "id": "fake_score_cache_prune_count", + "description": "context.cache_prune_count is a fake score spec." + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/compaction-pressure/expected_output.md b/tests/evals/v2/fixtures/long-context/compaction-pressure/expected_output.md new file mode 100644 index 0000000000..d0ef3be3bb --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/compaction-pressure/expected_output.md @@ -0,0 +1,14 @@ +## Overview + +Current compaction and tool-result budget governance must be described from active evidence only. + +## Evidence + +- `messages.compact_boundary.applied` +- `messages.microcompact.applied` +- `messages.tool_result_budget.applied` +- `context.compaction_saved_tokens` + +## Conclusion + +The answer should stay read-only and avoid archived event names. 
diff --git a/tests/evals/v2/fixtures/long-context/constraint-retention/constraints.json b/tests/evals/v2/fixtures/long-context/constraint-retention/constraints.json new file mode 100644 index 0000000000..d6dc81abd1 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/constraint-retention/constraints.json @@ -0,0 +1,19 @@ +{ + "constraints": [ + { + "id": "json_output_only", + "description": "The final answer must be valid JSON.", + "severity": "hard" + }, + { + "id": "owner_field_required", + "description": "The JSON must include owner=v2-platform.", + "severity": "hard" + }, + { + "id": "read_only_task", + "description": "Do not modify files or suggest edits.", + "severity": "hard" + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md b/tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md new file mode 100644 index 0000000000..e542e8a7ac --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md @@ -0,0 +1,26 @@ +## Current Task Packet + +You are reviewing a local-first evaluation workspace. The current user request is read-only. + +### Hard Constraints + +1. The final answer must be valid JSON. +2. The JSON must include an `"owner"` field with value `"v2-platform"`. +3. Do not modify files or suggest file edits. + +### Relevant Facts + +- The main batch runner entrypoint is `scripts/evals/v2_run_experiment.ts`. +- The preferred prompt-cost metric is `total_prompt_input_tokens`. +- Long-context fixture smoke lives under `tests/evals/v2/experiments/_experiment.long_context.fixture_smoke.json`. + +### Supporting Notes + +- Reports are written under `ObservrityTask/10-系统版本/v2/06-运行报告/`. +- The system still keeps a manual-review lane for semantic quality. + +### Legacy Notes (Ignore If They Conflict) + +- Old docs once asked for Markdown output instead of JSON. +- Some archived notes emphasize `cache_create_tokens` as the main prompt metric. 
+- Those legacy notes are no longer the active instruction set. diff --git a/tests/evals/v2/fixtures/long-context/constraint-retention/critical_facts.json b/tests/evals/v2/fixtures/long-context/constraint-retention/critical_facts.json new file mode 100644 index 0000000000..ec045439fd --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/constraint-retention/critical_facts.json @@ -0,0 +1,12 @@ +{ + "facts": [ + { + "id": "runner_entrypoint", + "description": "The main batch runner entrypoint is scripts/evals/v2_run_experiment.ts." + }, + { + "id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt-cost metric is total_prompt_input_tokens." + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/constraint-retention/distractors.json b/tests/evals/v2/fixtures/long-context/constraint-retention/distractors.json new file mode 100644 index 0000000000..c53181a611 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/constraint-retention/distractors.json @@ -0,0 +1,12 @@ +{ + "distractors": [ + { + "id": "old_markdown_output_rule", + "description": "Archived notes once asked for Markdown output." + }, + { + "id": "wrong_metric_cache_create_tokens", + "description": "Archived notes emphasize cache_create_tokens as the main prompt metric." 
+ } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/constraint-retention/expected_output.md b/tests/evals/v2/fixtures/long-context/constraint-retention/expected_output.md new file mode 100644 index 0000000000..96df6dc2d7 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/constraint-retention/expected_output.md @@ -0,0 +1,8 @@ +```json +{ + "owner": "v2-platform", + "runner": "scripts/evals/v2_run_experiment.ts", + "prompt_metric": "total_prompt_input_tokens", + "mode": "read_only" +} +``` diff --git a/tests/evals/v2/fixtures/long-context/distractor-resistance/constraints.json b/tests/evals/v2/fixtures/long-context/distractor-resistance/constraints.json new file mode 100644 index 0000000000..3f1b2a1321 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/distractor-resistance/constraints.json @@ -0,0 +1,14 @@ +{ + "constraints": [ + { + "id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived smoke examples.", + "severity": "hard" + }, + { + "id": "read_only_task", + "description": "Keep the answer read-only.", + "severity": "hard" + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md b/tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md new file mode 100644 index 0000000000..1ae4ca4099 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md @@ -0,0 +1,21 @@ +## Change Proposal Packet + +You are reading current local files to summarize the active V2.4 fixture setup. + +### Hard Constraints + +1. Prefer current V2.4 files over archived smoke examples. +2. Do not cite deprecated variants as if they were the active long-context candidate. +3. Output must stay read-only. + +### Relevant Facts + +- The fixture-only long-context candidate is `candidate_long_context_fixture_guarded`. +- The active long-context fixture smoke manifest is `_experiment.long_context.fixture_smoke.json`. 
+- The batch runner still writes run groups under `tests/evals/v2/run-groups/`. + +### Distractor Material + +- `candidate_eval_fixture_shadow` is a V2.3 robustness helper, not the V2.4 long-context candidate. +- `_experiment.execute_harness.smoke.json` is an older smoke manifest focused on execute_harness closure, not long-context specialization. +- Treat those as distractors for this task. diff --git a/tests/evals/v2/fixtures/long-context/distractor-resistance/critical_facts.json b/tests/evals/v2/fixtures/long-context/distractor-resistance/critical_facts.json new file mode 100644 index 0000000000..a16ec774a5 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/distractor-resistance/critical_facts.json @@ -0,0 +1,12 @@ +{ + "facts": [ + { + "id": "fixture_candidate_guarded", + "description": "The fixture-only long-context candidate is candidate_long_context_fixture_guarded." + }, + { + "id": "active_fixture_smoke_manifest", + "description": "The active long-context fixture smoke manifest is _experiment.long_context.fixture_smoke.json." + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/distractor-resistance/distractors.json b/tests/evals/v2/fixtures/long-context/distractor-resistance/distractors.json new file mode 100644 index 0000000000..60af01a149 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/distractor-resistance/distractors.json @@ -0,0 +1,12 @@ +{ + "distractors": [ + { + "id": "old_variant_fixture_shadow", + "description": "candidate_eval_fixture_shadow is not the V2.4 long-context candidate." + }, + { + "id": "old_execute_harness_smoke_manifest", + "description": "_experiment.execute_harness.smoke.json is not the long-context fixture smoke manifest." 
+ } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/distractor-resistance/expected_output.md b/tests/evals/v2/fixtures/long-context/distractor-resistance/expected_output.md new file mode 100644 index 0000000000..0c4b1cf313 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/distractor-resistance/expected_output.md @@ -0,0 +1,3 @@ +- Active candidate: `candidate_long_context_fixture_guarded` +- Active manifest: `_experiment.long_context.fixture_smoke.json` +- Ignore archived V2.3 helper variant and old execute_harness smoke diff --git a/tests/evals/v2/fixtures/long-context/fact-retrieval/constraints.json b/tests/evals/v2/fixtures/long-context/fact-retrieval/constraints.json new file mode 100644 index 0000000000..9e4fc44888 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/fact-retrieval/constraints.json @@ -0,0 +1,14 @@ +{ + "constraints": [ + { + "id": "four_bullets_only", + "description": "Return exactly four bullet points.", + "severity": "hard" + }, + { + "id": "read_only_task", + "description": "Do not modify files.", + "severity": "hard" + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md b/tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md new file mode 100644 index 0000000000..7bfe01a1aa --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md @@ -0,0 +1,25 @@ +## Evaluation Workspace Brief + +This is a read-only retrieval task inside the repository. + +### Hard Constraints + +1. Use exactly four bullet points in the final answer. +2. Do not modify files. + +### Key Facts + +- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`. +- The formal capture key for execute_harness binding is `benchmark_run_id`. +- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`. + +### Supplemental Context + +- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it. 
+- Batch reports are written as Markdown. + +### Legacy / Distractor Material + +- Older notes mention `src/main.tsx` as the CLI entrypoint. +- A stale debugging note says "just grab the latest user_action_id". +- Those two statements are intentionally outdated. diff --git a/tests/evals/v2/fixtures/long-context/fact-retrieval/critical_facts.json b/tests/evals/v2/fixtures/long-context/fact-retrieval/critical_facts.json new file mode 100644 index 0000000000..561ffa8179 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/fact-retrieval/critical_facts.json @@ -0,0 +1,16 @@ +{ + "facts": [ + { + "id": "cli_entrypoint_cli_tsx", + "description": "The current headless CLI entrypoint is src/entrypoints/cli.tsx." + }, + { + "id": "capture_key_benchmark_run_id", + "description": "The formal execute_harness capture key is benchmark_run_id." + }, + { + "id": "experiment_summary_dir", + "description": "Experiment summaries are stored under tests/evals/v2/experiment-runs/." + } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/fact-retrieval/distractors.json b/tests/evals/v2/fixtures/long-context/fact-retrieval/distractors.json new file mode 100644 index 0000000000..443e71b177 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/fact-retrieval/distractors.json @@ -0,0 +1,12 @@ +{ + "distractors": [ + { + "id": "old_entrypoint_main_tsx", + "description": "Older notes mention src/main.tsx as the CLI entrypoint." + }, + { + "id": "fake_capture_key_latest_action", + "description": "A stale note recommends using the latest user_action_id instead of benchmark_run_id." 
+ } + ] +} diff --git a/tests/evals/v2/fixtures/long-context/fact-retrieval/expected_output.md b/tests/evals/v2/fixtures/long-context/fact-retrieval/expected_output.md new file mode 100644 index 0000000000..bed199b026 --- /dev/null +++ b/tests/evals/v2/fixtures/long-context/fact-retrieval/expected_output.md @@ -0,0 +1,4 @@ +- `src/entrypoints/cli.tsx` +- `benchmark_run_id` +- `tests/evals/v2/experiment-runs/` +- Read-only; no file modifications diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..6051e8958a --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.458Z", + "ended_at": "2026-05-03T07:09:27.494Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 
0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..46f6e2827b --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.478Z", + "ended_at": "2026-05-03T07:09:27.501Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git 
a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..aaf094ba6d --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.467Z", + "ended_at": "2026-05-03T07:09:27.497Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z.json 
b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..710b71859c --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.495Z", + "ended_at": "2026-05-03T07:09:27.519Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 110, + "total_billed_tokens_min": 110, + "total_billed_tokens_max": 110, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..574a05771f --- /dev/null +++ 
b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.503Z", + "ended_at": "2026-05-03T07:09:27.528Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 105, + "total_billed_tokens_min": 105, + "total_billed_tokens_max": 105, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z.json b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z.json new file mode 100644 index 0000000000..557d2983f0 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": 
"group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + "experiment_id": "v2_3_robustness_smoke", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:27.498Z", + "ended_at": "2026-05-03T07:09:27.522Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 100, + "total_billed_tokens_min": 100, + "total_billed_tokens_max": 100, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..32ebc05254 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": 
"long_context_compaction_pressure", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.210Z", + "ended_at": "2026-05-03T07:09:57.231Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1640, + "total_billed_tokens_min": 1640, + "total_billed_tokens_max": 1640, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..e8b783384e --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + 
"run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.215Z", + "ended_at": "2026-05-03T07:09:57.235Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1240, + "total_billed_tokens_min": 1240, + "total_billed_tokens_max": 1240, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..b3168dc4d5 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_constraint_retention", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1" 
+ ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.127Z", + "ended_at": "2026-05-03T07:09:57.162Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1280, + "total_billed_tokens_min": 1280, + "total_billed_tokens_max": 1280, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..0ece274d15 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_constraint_retention", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22" + ], + "status": "completed", + "started_at": 
"2026-05-03T07:09:57.137Z", + "ended_at": "2026-05-03T07:09:57.166Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1090, + "total_billed_tokens_min": 1090, + "total_billed_tokens_max": 1090, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..900a55d3d4 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.187Z", + "ended_at": "2026-05-03T07:09:57.209Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + 
"stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1320, + "total_billed_tokens_min": 1320, + "total_billed_tokens_max": 1320, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..ea2ee3e18f --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.192Z", + "ended_at": "2026-05-03T07:09:57.213Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + 
"capture_failure_rate": 0, + "total_billed_tokens_mean": 1120, + "total_billed_tokens_min": 1120, + "total_billed_tokens_max": 1120, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..1b88a0da10 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "baseline_default", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.163Z", + "ended_at": "2026-05-03T07:09:57.184Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1360, + "total_billed_tokens_min": 1360, + "total_billed_tokens_max": 1360, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + 
"e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json new file mode 100644 index 0000000000..b09a09055f --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z.json @@ -0,0 +1,33 @@ +{ + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "experiment_id": "v2_4_long_context_fixture_smoke", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "candidate_long_context_fixture_guarded", + "repeat_count": 2, + "run_ids": [ + "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d" + ], + "status": "completed", + "started_at": "2026-05-03T07:09:57.168Z", + "ended_at": "2026-05-03T07:09:57.190Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 1140, + "total_billed_tokens_min": 1140, + "total_billed_tokens_max": 1140, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 10, + "e2e_duration_min": 10, + "e2e_duration_max": 10, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + 
"turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "stable", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z.json new file mode 100644 index 0000000000..3f7c008f11 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z.json @@ -0,0 +1,32 @@ +{ + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da" + ], + "status": "completed", + "started_at": "2026-05-03T06:05:48.876Z", + "ended_at": "2026-05-03T06:05:56.858Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7982, + "e2e_duration_min": 7982, + "e2e_duration_max": 7982, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z.json 
b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z.json new file mode 100644 index 0000000000..9df3b08421 --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z.json @@ -0,0 +1,32 @@ +{ + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b" + ], + "status": "completed", + "started_at": "2026-05-03T14:56:10.802Z", + "ended_at": "2026-05-03T14:56:17.911Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7109, + "e2e_duration_min": 7109, + "e2e_duration_max": 7109, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z.json new file mode 100644 index 0000000000..10f02f660d --- /dev/null +++ 
b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z.json @@ -0,0 +1,32 @@ +{ + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8" + ], + "status": "completed", + "started_at": "2026-05-03T06:06:05.082Z", + "ended_at": "2026-05-03T06:06:12.588Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 7506, + "e2e_duration_min": 7506, + "e2e_duration_max": 7506, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z.json b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z.json new file mode 100644 index 0000000000..fd9c4f9a4d --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z.json @@ -0,0 +1,32 @@ +{ + 
"run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z", + "experiment_id": "v2_4_long_context_real_smoke", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348" + ], + "status": "completed", + "started_at": "2026-05-03T14:56:28.027Z", + "ended_at": "2026-05-03T14:56:40.199Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27189, + "total_billed_tokens_min": 27189, + "total_billed_tokens_max": 27189, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 12172, + "e2e_duration_min": 12172, + "e2e_duration_max": 12172, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z.json b/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z.json new file mode 100644 index 0000000000..ea39afde5f --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z.json @@ -0,0 +1,32 @@ +{ + "run_group_id": 
"group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z", + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "baseline_default", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e" + ], + "status": "completed", + "started_at": "2026-05-03T15:31:47.795Z", + "ended_at": "2026-05-03T15:32:03.341Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27436, + "total_billed_tokens_min": 27436, + "total_billed_tokens_max": 27436, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 15546, + "e2e_duration_min": 15546, + "e2e_duration_max": 15546, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436.json b/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436.json new file mode 100644 index 0000000000..ecffdb9c7c --- /dev/null +++ b/tests/evals/v2/run-groups/group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436.json @@ -0,0 +1,32 @@ +{ + "run_group_id": 
"group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436", + "experiment_id": "v2_5_long_context_real_smoke_expectation_contract_v0", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "candidate_session_memory_sparse", + "repeat_count": 1, + "run_ids": [ + "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d" + ], + "status": "completed", + "started_at": "2026-05-03T15:32:12.356Z", + "ended_at": "2026-05-03T15:32:25.137Z", + "aggregate_summary_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_5_long_context_real_smoke_expectation_contract_v0_2026-05-03T153229792Z.md", + "stability_metrics": { + "repeat_success_rate": 1, + "capture_failure_rate": 0, + "total_billed_tokens_mean": 27372, + "total_billed_tokens_min": 27372, + "total_billed_tokens_max": 27372, + "total_billed_tokens_stddev": 0, + "e2e_duration_mean": 12781, + "e2e_duration_min": 12781, + "e2e_duration_max": 12781, + "e2e_duration_stddev": 0, + "tool_call_count_variance": 0, + "subagent_count_variance": 0, + "turn_count_variance": 0, + "recovery_rate": 0 + }, + "flaky_status": "inconclusive", + "failures": [] +} diff --git a/tests/evals/v2/runs/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.json b/tests/evals/v2/runs/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.json new file mode 100644 index 0000000000..e7b5820c0a --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.json @@ -0,0 +1,288 @@ +{ + "run": { + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "run_group_id": 
"group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T060545110Z", + "repeat_index": 1, + "started_at": "2026-05-03T06:05:48.876Z", + "ended_at": "2026-05-03T06:05:56.858Z", + "status": "completed", + "entry_user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "root_query_id": "9fdaee2b-0f04-4245-9fe4-4bfbf2a6a57a", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "root_query_id": "9fdaee2b-0f04-4245-9fe4-4bfbf2a6a57a", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "root_query_id": "9fdaee2b-0f04-4245-9fe4-4bfbf2a6a57a", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "name": "Long Context Fact Retrieval Real Smoke", + "description": "A small inline long-context retrieval scenario for real execute_harness smoke. It avoids path-fragile file reads while preserving the same retrieval and distractor requirements.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. 
Do not modify files.\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]\n\nThe four bullets must cover: the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4", + "real-smoke" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection_real_smoke", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "started_at": "2026-05-03T06:05:48.876Z", + "started_at_ms": 1777788348876, + "ended_at": "2026-05-03T06:05:56.858Z", + "ended_at_ms": 1777788356858, + "duration_ms": 7982, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_4_long_co_fd8c0e6a", + "scenario_id": "scn_long_context_ac1e93f0", + "variant_id": "var_baseline_def_eb4a038e", + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_5f2fdcbca6e1", + "raw_input_tokens": "45", + "output_tokens": "302", + "cache_read_tokens": "1479", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26887", + "total_billed_tokens": "27189", + "main_thread_total_prompt_input_tokens": "26887", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "9fdaee2b-0f04-4245-9fe4-4bfbf2a6a57a", + "user_action_id": "b963e6da-2283-4ec2-888e-beb0f835d4ba", + "session_id": "134aeed6-8494-4333-a13a-3b7081a90631", + "conversation_id": "134aeed6-8494-4333-a13a-3b7081a90631", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T06:05:48.876Z", + "started_at_ms": 1777788348876, + "ended_at": "2026-05-03T06:05:56.773Z", + "ended_at_ms": 1777788356773, + "duration_ms": 7897, + "first_event": "submit.attempted", + "last_event": "stop_hooks.completed", + "terminal_reason": null, + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + 
"query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 26, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 0, + "strict_is_complete": "false", + "inferred_is_complete": "false" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T06:05:56.765Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 26887 + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.json b/tests/evals/v2/runs/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.json new file mode 100644 index 0000000000..65d0ddd1ff --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.json @@ -0,0 +1,289 @@ +{ + "run": { + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T060545110Z", + "repeat_index": 1, + "started_at": "2026-05-03T06:06:05.082Z", + "ended_at": "2026-05-03T06:06:12.588Z", + "status": "completed", + "entry_user_action_id": 
"96004ff8-6b91-4663-a8a6-6576f9817519", + "root_query_id": "8c4aba3b-52a5-40d6-86a5-df1a94ce1b7c", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "root_query_id": "8c4aba3b-52a5-40d6-86a5-df1a94ce1b7c", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "root_query_id": "8c4aba3b-52a5-40d6-86a5-df1a94ce1b7c", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "name": "Long Context Fact Retrieval Real Smoke", + "description": "A small inline long-context retrieval scenario for real execute_harness smoke. It avoids path-fragile file reads while preserving the same retrieval and distractor requirements.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. 
Do not modify files.\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]\n\nThe four bullets must cover: the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4", + "real-smoke" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection_real_smoke", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "started_at": "2026-05-03T06:06:05.082Z", + "started_at_ms": 1777788365082, + "ended_at": "2026-05-03T06:06:12.588Z", + "ended_at_ms": 1777788372588, + "duration_ms": 7506, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_4_long_co_fd8c0e6a", + "scenario_id": "scn_long_context_ac1e93f0", + "variant_id": "var_candidate_se_efbc2e82", + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_c91e43d45ade", + "raw_input_tokens": "35", + "output_tokens": "302", + "cache_read_tokens": "1489", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26887", + "total_billed_tokens": "27189", + "main_thread_total_prompt_input_tokens": "26887", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "8c4aba3b-52a5-40d6-86a5-df1a94ce1b7c", + "user_action_id": "96004ff8-6b91-4663-a8a6-6576f9817519", + "session_id": "9149966c-7392-48b1-a9a3-315f2723ce21", + "conversation_id": "9149966c-7392-48b1-a9a3-315f2723ce21", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T06:06:05.082Z", + "started_at_ms": 1777788365082, + "ended_at": "2026-05-03T06:06:12.503Z", + "ended_at_ms": 1777788372503, + "duration_ms": 7421, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + 
"query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 27, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T06:06:12.486Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 26887 + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.json b/tests/evals/v2/runs/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.json new file mode 100644 index 0000000000..b400cb2bf2 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.json @@ -0,0 +1,101 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:27.458Z", + "ended_at": "2026-05-03T07:09:27.468Z", + "status": "completed", + "entry_user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "root_query_id": "cf5fe468-248a-42e2-8a81-fa620c5189b5", + "observability_db_ref": 
"fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "root_query_id": "cf5fe468-248a-42e2-8a81-fa620c5189b5", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "root_query_id": "cf5fe468-248a-42e2-8a81-fa620c5189b5", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "49e858ae-cbd7-4b4b-9210-a2cac28ebfdc", + "started_at": "2026-05-03T07:09:27.458Z", + "ended_at": "2026-05-03T07:09:27.468Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 110, + "total_prompt_input_tokens": 100, + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "cf5fe468-248a-42e2-8a81-fa620c5189b5", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.json b/tests/evals/v2/runs/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.json new file mode 100644 index 0000000000..d79d34ef79 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.json @@ -0,0 +1,102 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:27.467Z", + "ended_at": "2026-05-03T07:09:27.477Z", + "status": "completed", + "entry_user_action_id": 
"1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "root_query_id": "b938ca52-72ac-451c-937f-f3d04cf0d040", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "root_query_id": "b938ca52-72ac-451c-937f-f3d04cf0d040", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "root_query_id": "b938ca52-72ac-451c-937f-f3d04cf0d040", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + 
"notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "1e5948a5-84e8-4aa0-b5d6-d84f28a1252a", + "started_at": "2026-05-03T07:09:27.467Z", + "ended_at": "2026-05-03T07:09:27.477Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 100, + "total_prompt_input_tokens": 90, + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "b938ca52-72ac-451c-937f-f3d04cf0d040", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.json b/tests/evals/v2/runs/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.json new file mode 100644 index 0000000000..fa57eb5f2f --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.json @@ -0,0 +1,104 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": 
"group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:27.478Z", + "ended_at": "2026-05-03T07:09:27.488Z", + "status": "completed", + "entry_user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "root_query_id": "7e741eee-e0fc-43c4-8654-f260a5ca251a", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "root_query_id": "7e741eee-e0fc-43c4-8654-f260a5ca251a", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "root_query_id": "7e741eee-e0fc-43c4-8654-f260a5ca251a", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "09f1deec-a00b-4943-8ba6-ff84062d7dbb", + "started_at": "2026-05-03T07:09:27.478Z", + "ended_at": "2026-05-03T07:09:27.488Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 105, + "total_prompt_input_tokens": 95, + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "7e741eee-e0fc-43c4-8654-f260a5ca251a", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.json b/tests/evals/v2/runs/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.json new file mode 100644 index 0000000000..0d6feb7a5f --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.json @@ -0,0 +1,101 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_baseline_default_2026-05-03T070927456Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:27.484Z", + "ended_at": "2026-05-03T07:09:27.494Z", + "status": "completed", + "entry_user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "root_query_id": "f8663f3c-d96c-4be8-9591-75b7b1de814f", + 
"observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "root_query_id": "f8663f3c-d96c-4be8-9591-75b7b1de814f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "root_query_id": "f8663f3c-d96c-4be8-9591-75b7b1de814f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "8600f149-b0cf-4e8c-b797-cc61cffeca36", + "started_at": "2026-05-03T07:09:27.484Z", + "ended_at": "2026-05-03T07:09:27.494Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 110, + "total_prompt_input_tokens": 100, + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "f8663f3c-d96c-4be8-9591-75b7b1de814f", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.json b/tests/evals/v2/runs/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.json new file mode 100644 index 0000000000..7856c78ca0 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.json @@ -0,0 +1,102 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_session_memory_sparse_2026-05-03T070927456Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:27.487Z", + "ended_at": "2026-05-03T07:09:27.497Z", + "status": "completed", + "entry_user_action_id": 
"862641d4-2152-41bd-9449-30291b6cd507", + "root_query_id": "31006bdb-ec14-4242-a7fd-ed6f860a20d1", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "862641d4-2152-41bd-9449-30291b6cd507", + "root_query_id": "31006bdb-ec14-4242-a7fd-ed6f860a20d1", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "862641d4-2152-41bd-9449-30291b6cd507", + "root_query_id": "31006bdb-ec14-4242-a7fd-ed6f860a20d1", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + 
"notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "862641d4-2152-41bd-9449-30291b6cd507", + "started_at": "2026-05-03T07:09:27.487Z", + "ended_at": "2026-05-03T07:09:27.497Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 100, + "total_prompt_input_tokens": 90, + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "31006bdb-ec14-4242-a7fd-ed6f860a20d1", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.json b/tests/evals/v2/runs/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.json new file mode 100644 index 0000000000..c61dbc6a0d --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.json @@ -0,0 +1,104 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "scenario_id": "execute_harness_smoke_minimal", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": 
"group_v2_3_robustness_smoke_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:27.491Z", + "ended_at": "2026-05-03T07:09:27.501Z", + "status": "completed", + "entry_user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "root_query_id": "1beefa3e-869f-48b0-aefc-93e0f1c59b83", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "root_query_id": "1beefa3e-869f-48b0-aefc-93e0f1c59b83", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "root_query_id": "1beefa3e-869f-48b0-aefc-93e0f1c59b83", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "execute_harness_smoke_minimal", + "name": "Execute Harness Smoke Minimal", + "description": "Minimal real-model smoke for V2.2 execute_harness. 
The goal is to verify automatic execution, V1 event emission, benchmark_run_id capture, and V2 artifact generation with minimal task complexity.", + "input_prompt": "只回复 OK,不要做任何额外解释。", + "tags": [ + "smoke", + "execute_harness", + "v2_2" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Must finish in one turn", + "Must not modify files", + "Must not expand into unnecessary subagents" + ], + "max_turn_count": 1, + "max_total_billed_tokens": 60000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "61d3ed8d-3e51-4a48-84cf-e1b18d4a83d2", + "started_at": "2026-05-03T07:09:27.491Z", + "ended_at": "2026-05-03T07:09:27.501Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 105, + "total_prompt_input_tokens": 95, + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "1beefa3e-869f-48b0-aefc-93e0f1c59b83", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.json b/tests/evals/v2/runs/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.json new file mode 100644 index 0000000000..e7aa3cc63e --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.json @@ -0,0 +1,106 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:27.495Z", + "ended_at": "2026-05-03T07:09:27.505Z", + "status": "completed", + "entry_user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "root_query_id": "88b593e6-9869-4258-a2cb-143ddc3ddef1", + 
"observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "root_query_id": "88b593e6-9869-4258-a2cb-143ddc3ddef1", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "root_query_id": "88b593e6-9869-4258-a2cb-143ddc3ddef1", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. 
For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "231de0ad-a147-4bc1-a6d3-1c997ab7c71d", + "started_at": "2026-05-03T07:09:27.495Z", + "ended_at": "2026-05-03T07:09:27.505Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 110, + "total_prompt_input_tokens": 100, + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "88b593e6-9869-4258-a2cb-143ddc3ddef1", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.json b/tests/evals/v2/runs/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.json new file mode 100644 index 0000000000..9ebb7fe28a --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.json @@ -0,0 +1,107 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + 
"repeat_index": 1, + "started_at": "2026-05-03T07:09:27.498Z", + "ended_at": "2026-05-03T07:09:27.508Z", + "status": "completed", + "entry_user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "root_query_id": "8d60428a-9884-4fef-b98d-10799a58bd29", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "root_query_id": "8d60428a-9884-4fef-b98d-10799a58bd29", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "root_query_id": "8d60428a-9884-4fef-b98d-10799a58bd29", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": 
"Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "c53e147c-51e7-4198-a565-79c92e9efd7f", + "started_at": "2026-05-03T07:09:27.498Z", + "ended_at": "2026-05-03T07:09:27.508Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 100, + "total_prompt_input_tokens": 90, + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "8d60428a-9884-4fef-b98d-10799a58bd29", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.json b/tests/evals/v2/runs/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.json new file mode 100644 index 0000000000..8d51eb1f39 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.json 
@@ -0,0 +1,109 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:27.503Z", + "ended_at": "2026-05-03T07:09:27.513Z", + "status": "completed", + "entry_user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "root_query_id": "4db9d795-c56a-404a-b95c-67b517979b2f", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "root_query_id": "4db9d795-c56a-404a-b95c-67b517979b2f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." 
+ }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "root_query_id": "4db9d795-c56a-404a-b95c-67b517979b2f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "1afeb0f4-cfb6-4643-82be-7e545c0c18a2", + "started_at": "2026-05-03T07:09:27.503Z", + "ended_at": "2026-05-03T07:09:27.513Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 105, + "total_prompt_input_tokens": 95, + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "4db9d795-c56a-404a-b95c-67b517979b2f", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.json b/tests/evals/v2/runs/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.json new file mode 100644 index 0000000000..1cde0a1da2 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.json @@ -0,0 +1,106 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "baseline_default", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_baseline_default_2026-05-03T070927456Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:27.509Z", + "ended_at": "2026-05-03T07:09:27.519Z", + "status": "completed", + "entry_user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "root_query_id": "8c120f58-04a2-45f7-b3c9-cf543bbeb0fc", + 
"observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "root_query_id": "8c120f58-04a2-45f7-b3c9-cf543bbeb0fc", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "root_query_id": "8c120f58-04a2-45f7-b3c9-cf543bbeb0fc", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. 
For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "5ee185bf-0219-4052-84a4-c6f109eda670", + "started_at": "2026-05-03T07:09:27.509Z", + "ended_at": "2026-05-03T07:09:27.519Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 110, + "total_prompt_input_tokens": 100, + "raw_input_tokens": 100, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 100, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "8c120f58-04a2-45f7-b3c9-cf543bbeb0fc", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.json b/tests/evals/v2/runs/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.json new file mode 100644 index 0000000000..b00a6ec08d --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.json @@ -0,0 +1,107 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_session_memory_sparse_2026-05-03T070927456Z", + 
"repeat_index": 2, + "started_at": "2026-05-03T07:09:27.512Z", + "ended_at": "2026-05-03T07:09:27.522Z", + "status": "completed", + "entry_user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "root_query_id": "11754ef6-f28e-44dc-8e35-c072300181db", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "root_query_id": "11754ef6-f28e-44dc-8e35-c072300181db", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "root_query_id": "11754ef6-f28e-44dc-8e35-c072300181db", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": 
"Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "242dc6f0-95c4-4be4-8531-4ea532908b7c", + "started_at": "2026-05-03T07:09:27.512Z", + "ended_at": "2026-05-03T07:09:27.522Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 100, + "total_prompt_input_tokens": 90, + "raw_input_tokens": 90, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 90, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "11754ef6-f28e-44dc-8e35-c072300181db", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": true, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.json b/tests/evals/v2/runs/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.json new file mode 100644 index 0000000000..c2614bc560 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.json 
@@ -0,0 +1,109 @@ +{ + "run": { + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "scenario_id": "robustness_smoke_minimal_alt", + "variant_id": "candidate_eval_fixture_shadow", + "run_group_id": "group_v2_3_robustness_smoke_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2026-05-03T070927456Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:27.518Z", + "ended_at": "2026-05-03T07:09:27.528Z", + "status": "completed", + "entry_user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "root_query_id": "1f94b857-1f51-4353-92f4-df72e750fd65", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "root_query_id": "1f94b857-1f51-4353-92f4-df72e750fd65", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." 
+ }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "root_query_id": "1f94b857-1f51-4353-92f4-df72e750fd65", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "robustness_smoke_minimal_alt", + "name": "Robustness Smoke Minimal Alt", + "description": "A second tiny scenario used by V2.3 robustness smoke to exercise multi-scenario batch execution without model/API spend.", + "input_prompt": "只回复 READY,不要做任何额外解释。", + "tags": [ + "observability-v2", + "robustness-smoke", + "fixture" + ], + "expected_artifacts": [], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Should complete in one turn", + "Should not require tool calls", + "Used only for batch runner verification" + ], + "expected_observations": [ + "Fixture trace should create one main_thread root query", + "Run group aggregation should include this scenario" + ], + "evaluation_note": "This is a runner smoke scenario, not a qualitative harness evaluation.", + "max_turn_count": 1, + "max_total_billed_tokens": 1000, + "max_subagent_count": 0, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_eval_fixture_shadow", + "name": "Candidate Eval Fixture Shadow", + "description": "V2.3 fixture-only candidate used to verify multi-candidate batch runner behavior without making a real harness claim.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "shadow" + }, + "notes": "This variant is for runner robustness verification only. It should not be interpreted as a product harness improvement." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "59258ce7-8f60-4962-98fc-ed2040c75255", + "started_at": "2026-05-03T07:09:27.518Z", + "ended_at": "2026-05-03T07:09:27.528Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 105, + "total_prompt_input_tokens": 95, + "raw_input_tokens": 95, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 95, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "1f94b857-1f51-4353-92f4-df72e750fd65", + "turn_count": 1, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [] + }, + "long_context": null +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.json b/tests/evals/v2/runs/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.json new file mode 100644 index 0000000000..bb848a938a --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.json @@ -0,0 +1,243 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "scenario_id": "long_context_constraint_retention", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.127Z", + "ended_at": "2026-05-03T07:09:57.137Z", + "status": "completed", + "entry_user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "root_query_id": 
"94b96b90-e7cb-473e-8a87-9fdefa85a92c", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "root_query_id": "94b96b90-e7cb-473e-8a87-9fdefa85a92c", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "root_query_id": "94b96b90-e7cb-473e-8a87-9fdefa85a92c", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_constraint_retention", + "name": "Long Context Constraint Retention", + "description": "Verify that early hard constraints survive after the agent reads a longer mixed context packet.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md and answer the task without modifying files. Preserve the active hard constraints even if legacy notes suggest something else. Summarize the runner path, the preferred prompt metric, and the read-only mode.", + "tags": [ + "long-context", + "constraint-retention", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Final answer must remain JSON-shaped", + "The answer must keep owner=v2-platform", + "The task remains read-only" + ], + "expected_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "long_context_profile": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_json_output_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "json_output_only", + "description": "Final output must stay JSON-shaped.", + "severity": "hard" + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_prompt_metric", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt metric is total_prompt_input_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_markdown_rule", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_markdown_output_rule", + "description": "Do not switch back to Markdown output." + }, + "severity": "high" + }, + { + "expectation_id": "respect_context_budget", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track prompt-input growth while preserving constraints." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_output_shape", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "a928b6b2-0639-4125-8384-582e2f9f323c", + "started_at": "2026-05-03T07:09:57.127Z", + "ended_at": "2026-05-03T07:09:57.137Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1280, + "total_prompt_input_tokens": 1270, + "raw_input_tokens": 1270, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1270, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "94b96b90-e7cb-473e-8a87-9fdefa85a92c", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + 
"long_context": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "total_prompt_input_tokens": 1270, + "observed_retained_constraints": [ + "json_output_only", + "owner_field_required" + ], + "observed_lost_constraints": [ + "read_only_task" + ], + "observed_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "```json\n{\n \"owner\": \"v2-platform\",\n \"runner\": \"scripts/evals/v2_run_experiment.ts\",\n \"prompt_metric\": \"total_prompt_input_tokens\",\n \"mode\": \"read_only\"\n}\n```", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.json b/tests/evals/v2/runs/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.json new file mode 100644 index 0000000000..b84be62484 --- /dev/null +++ 
b/tests/evals/v2/runs/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.json @@ -0,0 +1,245 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "scenario_id": "long_context_constraint_retention", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.137Z", + "ended_at": "2026-05-03T07:09:57.147Z", + "status": "completed", + "entry_user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "root_query_id": "3f2dbec5-a348-41c8-9a09-c98e11d6adf3", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "root_query_id": "3f2dbec5-a348-41c8-9a09-c98e11d6adf3", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "root_query_id": "3f2dbec5-a348-41c8-9a09-c98e11d6adf3", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_constraint_retention", + "name": "Long Context Constraint Retention", + "description": "Verify that early hard constraints survive after the agent reads a longer mixed context packet.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md and answer the task without modifying files. Preserve the active hard constraints even if legacy notes suggest something else. 
Summarize the runner path, the preferred prompt metric, and the read-only mode.", + "tags": [ + "long-context", + "constraint-retention", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Final answer must remain JSON-shaped", + "The answer must keep owner=v2-platform", + "The task remains read-only" + ], + "expected_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "long_context_profile": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_json_output_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "json_output_only", + "description": "Final output must stay JSON-shaped.", + "severity": "hard" + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_prompt_metric", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt metric is total_prompt_input_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_markdown_rule", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_markdown_output_rule", + "description": "Do not switch back to Markdown output." + }, + "severity": "high" + }, + { + "expectation_id": "respect_context_budget", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track prompt-input growth while preserving constraints." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_output_shape", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "4be1715e-7ac4-4f85-9180-3a2977c5cb09", + "started_at": "2026-05-03T07:09:57.137Z", + "ended_at": "2026-05-03T07:09:57.147Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1090, + "total_prompt_input_tokens": 1080, + "raw_input_tokens": 1080, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1080, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "3f2dbec5-a348-41c8-9a09-c98e11d6adf3", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "long_context": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + 
"json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "total_prompt_input_tokens": 1080, + "observed_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "```json\n{\n \"owner\": \"v2-platform\",\n \"runner\": \"scripts/evals/v2_run_experiment.ts\",\n \"prompt_metric\": \"total_prompt_input_tokens\",\n \"mode\": \"read_only\"\n}\n```", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.json b/tests/evals/v2/runs/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.json new file mode 100644 index 0000000000..c02d2d0906 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.json @@ -0,0 +1,243 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "scenario_id": "long_context_constraint_retention", + "variant_id": 
"baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_baseline_default_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.152Z", + "ended_at": "2026-05-03T07:09:57.162Z", + "status": "completed", + "entry_user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "root_query_id": "5dcce365-2f87-413f-a867-d560fd0b4e2a", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "root_query_id": "5dcce365-2f87-413f-a867-d560fd0b4e2a", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "root_query_id": "5dcce365-2f87-413f-a867-d560fd0b4e2a", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_constraint_retention", + "name": "Long Context Constraint Retention", + "description": "Verify that early hard constraints survive after the agent reads a longer mixed context packet.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md and answer the task without modifying files. Preserve the active hard constraints even if legacy notes suggest something else. 
Summarize the runner path, the preferred prompt metric, and the read-only mode.", + "tags": [ + "long-context", + "constraint-retention", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Final answer must remain JSON-shaped", + "The answer must keep owner=v2-platform", + "The task remains read-only" + ], + "expected_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "long_context_profile": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_json_output_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "json_output_only", + "description": "Final output must stay JSON-shaped.", + "severity": "hard" + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_prompt_metric", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt metric is total_prompt_input_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_markdown_rule", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_markdown_output_rule", + "description": "Do not switch back to Markdown output." + }, + "severity": "high" + }, + { + "expectation_id": "respect_context_budget", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track prompt-input growth while preserving constraints." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_output_shape", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. 
For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "fa3b48d1-cb82-464f-9010-bad958665eb0", + "started_at": "2026-05-03T07:09:57.152Z", + "ended_at": "2026-05-03T07:09:57.162Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1280, + "total_prompt_input_tokens": 1270, + "raw_input_tokens": 1270, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1270, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "5dcce365-2f87-413f-a867-d560fd0b4e2a", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "long_context": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ], + "total_prompt_input_tokens": 1270, + "observed_retained_constraints": [ + "json_output_only", + "owner_field_required" + ], + "observed_lost_constraints": [ + "read_only_task" + ], + "observed_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "```json\n{\n \"owner\": \"v2-platform\",\n \"runner\": \"scripts/evals/v2_run_experiment.ts\",\n \"prompt_metric\": \"total_prompt_input_tokens\",\n \"mode\": \"read_only\"\n}\n```", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.json b/tests/evals/v2/runs/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.json new file mode 100644 index 0000000000..a728b2cc47 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.json @@ -0,0 +1,245 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "scenario_id": "long_context_constraint_retention", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_constraint_retention_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.156Z", + "ended_at": "2026-05-03T07:09:57.166Z", + "status": "completed", + "entry_user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "root_query_id": "327b70db-dd28-4094-ad58-d5a84c8b7aef", + 
"observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "root_query_id": "327b70db-dd28-4094-ad58-d5a84c8b7aef", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "root_query_id": "327b70db-dd28-4094-ad58-d5a84c8b7aef", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_constraint_retention", + "name": "Long Context Constraint Retention", + "description": "Verify that early hard constraints survive after the agent reads a longer mixed context packet.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md and answer the task without modifying files. Preserve the active hard constraints even if legacy notes suggest something else. Summarize the runner path, the preferred prompt metric, and the read-only mode.", + "tags": [ + "long-context", + "constraint-retention", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Final answer must remain JSON-shaped", + "The answer must keep owner=v2-platform", + "The task remains read-only" + ], + "expected_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "long_context_profile": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_json_output_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "json_output_only", + "description": "Final output must stay JSON-shaped.", + "severity": "hard" + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_prompt_metric", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt metric is total_prompt_input_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_markdown_rule", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_markdown_output_rule", + "description": "Do not switch back to Markdown output." + }, + "severity": "high" + }, + { + "expectation_id": "respect_context_budget", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track prompt-input growth while preserving constraints." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_output_shape", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "6124af22-d716-4a71-b99e-bd268a34d5b1", + "started_at": "2026-05-03T07:09:57.156Z", + "ended_at": "2026-05-03T07:09:57.166Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1090, + "total_prompt_input_tokens": 1080, + "raw_input_tokens": 1080, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1080, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "327b70db-dd28-4094-ad58-d5a84c8b7aef", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_constraint_retention" + ] + }, + "long_context": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ], + "total_prompt_input_tokens": 1080, + "observed_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "```json\n{\n \"owner\": \"v2-platform\",\n \"runner\": \"scripts/evals/v2_run_experiment.ts\",\n \"prompt_metric\": \"total_prompt_input_tokens\",\n \"mode\": \"read_only\"\n}\n```", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.json b/tests/evals/v2/runs/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.json new file mode 100644 index 0000000000..214232af67 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.json @@ -0,0 +1,242 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.163Z", + "ended_at": "2026-05-03T07:09:57.173Z", + "status": "completed", + "entry_user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + "root_query_id": "6861de3b-d2fc-4f58-88c7-785a588f316f", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + 
"root_query_id": "6861de3b-d2fc-4f58-88c7-785a588f316f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + "root_query_id": "6861de3b-d2fc-4f58-88c7-785a588f316f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval", + "name": "Long Context Fact Retrieval", + "description": "Verify that the agent can retrieve key facts from a longer context packet and ignore stale routing notes.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md. Do not modify files. Return exactly four bullet points covering the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "fdcab6c9-1f14-41d4-9778-f00e68d8da59", + "started_at": "2026-05-03T07:09:57.163Z", + "ended_at": "2026-05-03T07:09:57.173Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1360, + "total_prompt_input_tokens": 1350, + "raw_input_tokens": 1350, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1350, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "6861de3b-d2fc-4f58-88c7-785a588f316f", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" 
+ ] + }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "total_prompt_input_tokens": 1350, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id" + ], + "observed_missed_facts": [ + "experiment_summary_dir" + ], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- `src/entrypoints/cli.tsx`\n- `benchmark_run_id`\n- `tests/evals/v2/experiment-runs/`\n- Read-only; no file modifications", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.json b/tests/evals/v2/runs/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.json new file mode 100644 index 0000000000..f28314f8ef --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.json @@ -0,0 +1,244 
@@ +{ + "run": { + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.168Z", + "ended_at": "2026-05-03T07:09:57.178Z", + "status": "completed", + "entry_user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "root_query_id": "233be183-c56e-45b8-893e-0905e66cb8cd", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "root_query_id": "233be183-c56e-45b8-893e-0905e66cb8cd", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "root_query_id": "233be183-c56e-45b8-893e-0905e66cb8cd", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval", + "name": "Long Context Fact Retrieval", + "description": "Verify that the agent can retrieve key facts from a longer context packet and ignore stale routing notes.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md. Do not modify files. 
Return exactly four bullet points covering the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." 
+ }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "1abcd4c9-c7f0-4de5-839b-c71bb539fd60", + "started_at": "2026-05-03T07:09:57.168Z", + "ended_at": "2026-05-03T07:09:57.178Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1140, + "total_prompt_input_tokens": 1130, + "raw_input_tokens": 1130, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1130, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "233be183-c56e-45b8-893e-0905e66cb8cd", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "total_prompt_input_tokens": 1130, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- `src/entrypoints/cli.tsx`\n- `benchmark_run_id`\n- `tests/evals/v2/experiment-runs/`\n- Read-only; no file modifications", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.json b/tests/evals/v2/runs/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.json new file mode 100644 index 0000000000..cec85bce13 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.json @@ -0,0 +1,242 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_baseline_default_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.174Z", + "ended_at": "2026-05-03T07:09:57.184Z", + "status": "completed", + "entry_user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "root_query_id": "e0cd7caf-7ab0-4b39-83de-29ca47ee5e07", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "root_query_id": "e0cd7caf-7ab0-4b39-83de-29ca47ee5e07", + 
"observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "root_query_id": "e0cd7caf-7ab0-4b39-83de-29ca47ee5e07", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval", + "name": "Long Context Fact Retrieval", + "description": "Verify that the agent can retrieve key facts from a longer context packet and ignore stale routing notes.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md. Do not modify files. Return exactly four bullet points covering the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "70401d6d-04b0-4e05-877c-9696a93ce448", + "started_at": "2026-05-03T07:09:57.174Z", + "ended_at": "2026-05-03T07:09:57.184Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1360, + "total_prompt_input_tokens": 1350, + "raw_input_tokens": 1350, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1350, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "e0cd7caf-7ab0-4b39-83de-29ca47ee5e07", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" 
+ ] + }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "total_prompt_input_tokens": 1350, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id" + ], + "observed_missed_facts": [ + "experiment_summary_dir" + ], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- `src/entrypoints/cli.tsx`\n- `benchmark_run_id`\n- `tests/evals/v2/experiment-runs/`\n- Read-only; no file modifications", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.json b/tests/evals/v2/runs/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.json new file mode 100644 index 0000000000..c5e9dbaf2f --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.json @@ -0,0 +1,244 
@@ +{ + "run": { + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "scenario_id": "long_context_fact_retrieval", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_fact_retrieval_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.180Z", + "ended_at": "2026-05-03T07:09:57.190Z", + "status": "completed", + "entry_user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "root_query_id": "f5e73b49-2ab0-4f3b-ac09-f8b1d18c0d9b", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "root_query_id": "f5e73b49-2ab0-4f3b-ac09-f8b1d18c0d9b", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "root_query_id": "f5e73b49-2ab0-4f3b-ac09-f8b1d18c0d9b", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval", + "name": "Long Context Fact Retrieval", + "description": "Verify that the agent can retrieve key facts from a longer context packet and ignore stale routing notes.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md. Do not modify files. 
Return exactly four bullet points covering the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." 
+ }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "6d06184d-bafa-4548-a95a-121aba810f78", + "started_at": "2026-05-03T07:09:57.180Z", + "ended_at": "2026-05-03T07:09:57.190Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1140, + "total_prompt_input_tokens": 1130, + "raw_input_tokens": 1130, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1130, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "f5e73b49-2ab0-4f3b-ac09-f8b1d18c0d9b", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_fact_retrieval" + ] + }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" 
+ ], + "total_prompt_input_tokens": 1130, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- `src/entrypoints/cli.tsx`\n- `benchmark_run_id`\n- `tests/evals/v2/experiment-runs/`\n- Read-only; no file modifications", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.json b/tests/evals/v2/runs/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.json new file mode 100644 index 0000000000..dbfea00ac0 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.json @@ -0,0 +1,239 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.187Z", + "ended_at": "2026-05-03T07:09:57.197Z", + "status": "completed", + "entry_user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "root_query_id": "d0237071-7ddb-4385-b9d3-e3bbc94e7992", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "root_query_id": 
"d0237071-7ddb-4385-b9d3-e3bbc94e7992", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "root_query_id": "d0237071-7ddb-4385-b9d3-e3bbc94e7992", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_distractor_resistance", + "name": "Long Context Distractor Resistance", + "description": "Verify that the agent resists stale but plausible-looking V2.3 artifacts when summarizing the current V2.4 fixture setup.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md. Do not modify files. Summarize the active V2.4 fixture candidate and manifest while explicitly avoiding archived V2.3 helper references.", + "tags": [ + "long-context", + "distractor-resistance", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Prefer current V2.4 files over archived helpers", + "Keep the answer read-only" + ], + "expected_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "long_context_profile": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_prefer_current_files", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived helpers." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_fixture_candidate", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "fixture_candidate_guarded", + "description": "The active V2.4 fixture candidate is candidate_long_context_fixture_guarded." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fixture_shadow_confusion", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_variant_fixture_shadow", + "description": "Do not treat candidate_eval_fixture_shadow as the V2.4 long-context candidate." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_distractors", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "distractor_confusion_count", + "description": "Observe whether distractor pressure alters the answer path." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_archived_references", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "4d94c847-217c-4889-86aa-51e0334165ee", + "started_at": "2026-05-03T07:09:57.187Z", + "ended_at": "2026-05-03T07:09:57.197Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1320, + "total_prompt_input_tokens": 1310, + "raw_input_tokens": 1310, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1310, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "d0237071-7ddb-4385-b9d3-e3bbc94e7992", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "long_context": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "total_prompt_input_tokens": 1310, + "observed_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "observed_missed_facts": [], + "observed_confusions": [ + "old_variant_fixture_shadow" + ], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- Active candidate: `candidate_long_context_fixture_guarded`\n- Active manifest: `_experiment.long_context.fixture_smoke.json`\n- Ignore archived V2.3 helper variant and old execute_harness smoke", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.json b/tests/evals/v2/runs/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.json new file mode 100644 index 0000000000..2a81598767 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.json @@ -0,0 +1,240 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.192Z", + "ended_at": "2026-05-03T07:09:57.202Z", + "status": "completed", + "entry_user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "root_query_id": 
"3013074b-82d2-4360-a7e8-3073b99e9ba5", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "root_query_id": "3013074b-82d2-4360-a7e8-3073b99e9ba5", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "root_query_id": "3013074b-82d2-4360-a7e8-3073b99e9ba5", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_distractor_resistance", + "name": "Long Context Distractor Resistance", + "description": "Verify that the agent resists stale but plausible-looking V2.3 artifacts when summarizing the current V2.4 fixture setup.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md. Do not modify files. Summarize the active V2.4 fixture candidate and manifest while explicitly avoiding archived V2.3 helper references.", + "tags": [ + "long-context", + "distractor-resistance", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Prefer current V2.4 files over archived helpers", + "Keep the answer read-only" + ], + "expected_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "long_context_profile": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_prefer_current_files", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived helpers." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_fixture_candidate", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "fixture_candidate_guarded", + "description": "The active V2.4 fixture candidate is candidate_long_context_fixture_guarded." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fixture_shadow_confusion", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_variant_fixture_shadow", + "description": "Do not treat candidate_eval_fixture_shadow as the V2.4 long-context candidate." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_distractors", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "distractor_confusion_count", + "description": "Observe whether distractor pressure alters the answer path." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_archived_references", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "23354a67-f2c3-497f-8cab-02fa427a1650", + "started_at": "2026-05-03T07:09:57.192Z", + "ended_at": "2026-05-03T07:09:57.202Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1120, + "total_prompt_input_tokens": 1110, + "raw_input_tokens": 1110, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1110, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "3013074b-82d2-4360-a7e8-3073b99e9ba5", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "long_context": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "total_prompt_input_tokens": 1110, + "observed_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- Active candidate: `candidate_long_context_fixture_guarded`\n- Active manifest: `_experiment.long_context.fixture_smoke.json`\n- Ignore archived V2.3 helper variant and old execute_harness smoke", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.json b/tests/evals/v2/runs/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.json new file mode 100644 index 0000000000..6b3a231c02 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.json @@ -0,0 +1,239 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_baseline_default_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.199Z", + "ended_at": "2026-05-03T07:09:57.209Z", + "status": "completed", + "entry_user_action_id": "0f2affa1-25c4-4457-b906-482968d8dfa8", + "root_query_id": "409f6340-bb3a-4d98-a27a-d7ee32f526fd", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": 
"0f2affa1-25c4-4457-b906-482968d8dfa8", + "root_query_id": "409f6340-bb3a-4d98-a27a-d7ee32f526fd", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0f2affa1-25c4-4457-b906-482968d8dfa8", + "root_query_id": "409f6340-bb3a-4d98-a27a-d7ee32f526fd", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_distractor_resistance", + "name": "Long Context Distractor Resistance", + "description": "Verify that the agent resists stale but plausible-looking V2.3 artifacts when summarizing the current V2.4 fixture setup.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md. Do not modify files. Summarize the active V2.4 fixture candidate and manifest while explicitly avoiding archived V2.3 helper references.", + "tags": [ + "long-context", + "distractor-resistance", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Prefer current V2.4 files over archived helpers", + "Keep the answer read-only" + ], + "expected_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "long_context_profile": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_prefer_current_files", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived helpers." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_fixture_candidate", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "fixture_candidate_guarded", + "description": "The active V2.4 fixture candidate is candidate_long_context_fixture_guarded." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fixture_shadow_confusion", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_variant_fixture_shadow", + "description": "Do not treat candidate_eval_fixture_shadow as the V2.4 long-context candidate." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_distractors", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "distractor_confusion_count", + "description": "Observe whether distractor pressure alters the answer path." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_archived_references", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "0f2affa1-25c4-4457-b906-482968d8dfa8", + "started_at": "2026-05-03T07:09:57.199Z", + "ended_at": "2026-05-03T07:09:57.209Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1320, + "total_prompt_input_tokens": 1310, + "raw_input_tokens": 1310, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1310, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "409f6340-bb3a-4d98-a27a-d7ee32f526fd", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "long_context": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "total_prompt_input_tokens": 1310, + "observed_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "observed_missed_facts": [], + "observed_confusions": [ + "old_variant_fixture_shadow" + ], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- Active candidate: `candidate_long_context_fixture_guarded`\n- Active manifest: `_experiment.long_context.fixture_smoke.json`\n- Ignore archived V2.3 helper variant and old execute_harness smoke", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.json b/tests/evals/v2/runs/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.json new file mode 100644 index 0000000000..e2f300db41 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.json @@ -0,0 +1,240 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "scenario_id": "long_context_distractor_resistance", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_distractor_resistance_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.203Z", + "ended_at": "2026-05-03T07:09:57.213Z", + "status": "completed", + "entry_user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "root_query_id": 
"d2b22829-1fbd-42dc-89ab-a2e5f4cf4a3d", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "root_query_id": "d2b22829-1fbd-42dc-89ab-a2e5f4cf4a3d", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "root_query_id": "d2b22829-1fbd-42dc-89ab-a2e5f4cf4a3d", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_distractor_resistance", + "name": "Long Context Distractor Resistance", + "description": "Verify that the agent resists stale but plausible-looking V2.3 artifacts when summarizing the current V2.4 fixture setup.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md. Do not modify files. Summarize the active V2.4 fixture candidate and manifest while explicitly avoiding archived V2.3 helper references.", + "tags": [ + "long-context", + "distractor-resistance", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Prefer current V2.4 files over archived helpers", + "Keep the answer read-only" + ], + "expected_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "long_context_profile": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_prefer_current_files", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived helpers." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_fixture_candidate", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "fixture_candidate_guarded", + "description": "The active V2.4 fixture candidate is candidate_long_context_fixture_guarded." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fixture_shadow_confusion", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_variant_fixture_shadow", + "description": "Do not treat candidate_eval_fixture_shadow as the V2.4 long-context candidate." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_distractors", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "distractor_confusion_count", + "description": "Observe whether distractor pressure alters the answer path." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_archived_references", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "a3fd72c9-cd71-4976-8201-a83c76b1bc87", + "started_at": "2026-05-03T07:09:57.203Z", + "ended_at": "2026-05-03T07:09:57.213Z", + "duration_ms": 10, + "subagent_count": 0, + "tool_call_count": 0, + "total_billed_tokens": 1120, + "total_prompt_input_tokens": 1110, + "raw_input_tokens": 1110, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1110, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "d2b22829-1fbd-42dc-89ab-a2e5f4cf4a3d", + "turn_count": 3, + "terminal_reason": "fixture_completed" + }, + "tools": [], + "subagents": [], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 0, + "session_memory_trigger_details": [ + "long_context_distractor_resistance" + ] + }, + "long_context": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "total_prompt_input_tokens": 1110, + "observed_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 0, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 0, + "memory_or_subagent_count": 0, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "- Active candidate: `candidate_long_context_fixture_guarded`\n- Active manifest: `_experiment.long_context.fixture_smoke.json`\n- Ignore archived V2.3 helper variant and old execute_harness smoke", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.json b/tests/evals/v2/runs/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.json new file mode 100644 index 0000000000..42aeffe339 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.json @@ -0,0 +1,265 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.210Z", + "ended_at": "2026-05-03T07:09:57.220Z", + "status": "completed", + "entry_user_action_id": "c9cab754-06b4-4256-b62f-f547aa4a8349", + "root_query_id": "5a7c056e-936f-4fd0-93fd-aaf7df2be76f", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": 
"c9cab754-06b4-4256-b62f-f547aa4a8349", + "root_query_id": "5a7c056e-936f-4fd0-93fd-aaf7df2be76f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "c9cab754-06b4-4256-b62f-f547aa4a8349", + "root_query_id": "5a7c056e-936f-4fd0-93fd-aaf7df2be76f", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_compaction_pressure", + "name": "Long Context Compaction Pressure", + "description": "Verify that compaction and tool-result budget pressure do not destroy the task structure or key governance facts.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md. Do not modify files. Produce exactly three top-level sections named Overview, Evidence, and Conclusion. Explain the current compaction-related events, the tool-result budget event, and the saved-token score spec while avoiding archived event names.", + "tags": [ + "long-context", + "compaction-pressure", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Use exactly the headings Overview, Evidence, Conclusion", + "Do not quote archived event names as current behavior", + "Keep the task read-only" + ], + "expected_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "long_context_profile": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_three_exact_sections", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "three_exact_sections", + "description": "Use exactly Overview, Evidence, Conclusion." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_compaction_score_spec", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "compaction_saved_tokens_score", + "description": "The saved-token score spec is context.compaction_saved_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fake_context_shrink_event", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "fake_event_context_shrink", + "description": "Do not cite messages.context_shrink.applied as the current event." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_compaction", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "compaction_saved_tokens", + "description": "Observe compaction behavior and saved-token tradeoff." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_governance_semantics", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 10, + "max_total_billed_tokens": 220000, + "max_subagent_count": 4, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "c9cab754-06b4-4256-b62f-f547aa4a8349", + "started_at": "2026-05-03T07:09:57.210Z", + "ended_at": "2026-05-03T07:09:57.220Z", + "duration_ms": 10, + "subagent_count": 1, + "tool_call_count": 2, + "total_billed_tokens": 1640, + "total_prompt_input_tokens": 1630, + "raw_input_tokens": 1630, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1630, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "5a7c056e-936f-4fd0-93fd-aaf7df2be76f", + "turn_count": 5, + "terminal_reason": "fixture_completed" + }, + "tools": [ + { + "tool_name": "Read", + "is_closed": true, + "has_failed": false + }, + { + "tool_name": "Search", + "is_closed": true, + "has_failed": false + } + ], + "subagents": [ + { + "subagent_count": 1, + "subagent_reason": "session_memory", + "subagent_trigger_kind": "context_pressure", + "subagent_trigger_detail": 
"long_context_compaction_pressure" + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "long_context": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ], + "total_prompt_input_tokens": 1630, + "observed_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names" + ], + "observed_lost_constraints": [ + "read_only_task" + ], + "observed_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event" + ], + "observed_missed_facts": [ + "compaction_saved_tokens_score" + ], + "observed_confusions": [], + "compaction_trigger_count": 2, + "compaction_saved_tokens": 42, + "tool_result_budget_trigger_count": 1, + "memory_or_subagent_count": 1, + "success_under_context_pressure": 0, + "manual_review_required": true, + "expected_output_excerpt": "## Overview\n\nCurrent compaction and tool-result budget governance must be described from active evidence only.\n\n## Evidence\n\n- `messages.compact_boundary.applied`\n- `messages.microcompact.applied`\n- `messages.tool_result_budget.applied`\n- `", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.json b/tests/evals/v2/runs/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.json new file mode 100644 index 0000000000..eb6bd965a6 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.json @@ -0,0 +1,266 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 1, + "started_at": "2026-05-03T07:09:57.215Z", + "ended_at": "2026-05-03T07:09:57.225Z", + "status": "completed", + "entry_user_action_id": 
"6488e757-f4e2-42fc-9cfc-b99ade383d28", + "root_query_id": "31854445-b9ee-4c09-ac12-c88701a18600", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6488e757-f4e2-42fc-9cfc-b99ade383d28", + "root_query_id": "31854445-b9ee-4c09-ac12-c88701a18600", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "6488e757-f4e2-42fc-9cfc-b99ade383d28", + "root_query_id": "31854445-b9ee-4c09-ac12-c88701a18600", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_compaction_pressure", + "name": "Long Context Compaction Pressure", + "description": "Verify that compaction and tool-result budget pressure do not destroy the task structure or key governance facts.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md. Do not modify files. Produce exactly three top-level sections named Overview, Evidence, and Conclusion. 
Explain the current compaction-related events, the tool-result budget event, and the saved-token score spec while avoiding archived event names.", + "tags": [ + "long-context", + "compaction-pressure", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Use exactly the headings Overview, Evidence, Conclusion", + "Do not quote archived event names as current behavior", + "Keep the task read-only" + ], + "expected_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "long_context_profile": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_three_exact_sections", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "three_exact_sections", + "description": "Use exactly Overview, Evidence, Conclusion." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_compaction_score_spec", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "compaction_saved_tokens_score", + "description": "The saved-token score spec is context.compaction_saved_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fake_context_shrink_event", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "fake_event_context_shrink", + "description": "Do not cite messages.context_shrink.applied as the current event." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_compaction", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "compaction_saved_tokens", + "description": "Observe compaction behavior and saved-token tradeoff." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_governance_semantics", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 10, + "max_total_billed_tokens": 220000, + "max_subagent_count": 4, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "6488e757-f4e2-42fc-9cfc-b99ade383d28", + "started_at": "2026-05-03T07:09:57.215Z", + "ended_at": "2026-05-03T07:09:57.225Z", + "duration_ms": 10, + "subagent_count": 1, + "tool_call_count": 2, + "total_billed_tokens": 1240, + "total_prompt_input_tokens": 1230, + "raw_input_tokens": 1230, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1230, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "31854445-b9ee-4c09-ac12-c88701a18600", + "turn_count": 5, + "terminal_reason": "fixture_completed" + }, + "tools": [ + { + "tool_name": "Read", + "is_closed": true, + "has_failed": false + }, + { + "tool_name": "Search", + "is_closed": true, + "has_failed": false + } + ], + "subagents": [ + { + "subagent_count": 1, + "subagent_reason": "session_memory", + "subagent_trigger_kind": "context_pressure", + "subagent_trigger_detail": "long_context_compaction_pressure" + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + 
"session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "long_context": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "total_prompt_input_tokens": 1230, + "observed_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 2, + "compaction_saved_tokens": 188, + "tool_result_budget_trigger_count": 1, + "memory_or_subagent_count": 1, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "## Overview\n\nCurrent compaction and tool-result budget governance must be described from active evidence only.\n\n## Evidence\n\n- `messages.compact_boundary.applied`\n- `messages.microcompact.applied`\n- `messages.tool_result_budget.applied`\n- `", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.json 
b/tests/evals/v2/runs/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.json new file mode 100644 index 0000000000..d4328297b0 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.json @@ -0,0 +1,265 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_baseline_default_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.221Z", + "ended_at": "2026-05-03T07:09:57.231Z", + "status": "completed", + "entry_user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "root_query_id": "5936f5b2-7255-42cd-8f2a-8fec01a2ecb9", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "root_query_id": "5936f5b2-7255-42cd-8f2a-8fec01a2ecb9", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "root_query_id": "5936f5b2-7255-42cd-8f2a-8fec01a2ecb9", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_compaction_pressure", + "name": "Long Context Compaction Pressure", + "description": "Verify that compaction and tool-result budget pressure do not destroy the task structure or key governance facts.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md. Do not modify files. 
Produce exactly three top-level sections named Overview, Evidence, and Conclusion. Explain the current compaction-related events, the tool-result budget event, and the saved-token score spec while avoiding archived event names.", + "tags": [ + "long-context", + "compaction-pressure", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Use exactly the headings Overview, Evidence, Conclusion", + "Do not quote archived event names as current behavior", + "Keep the task read-only" + ], + "expected_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "long_context_profile": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_three_exact_sections", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "three_exact_sections", + "description": "Use exactly Overview, Evidence, Conclusion." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_compaction_score_spec", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "compaction_saved_tokens_score", + "description": "The saved-token score spec is context.compaction_saved_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fake_context_shrink_event", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "fake_event_context_shrink", + "description": "Do not cite messages.context_shrink.applied as the current event." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_compaction", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "compaction_saved_tokens", + "description": "Observe compaction behavior and saved-token tradeoff." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_governance_semantics", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 10, + "max_total_billed_tokens": 220000, + "max_subagent_count": 4, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. 
For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "31b412ce-f658-45fc-b7db-9cdfcfd2410e", + "started_at": "2026-05-03T07:09:57.221Z", + "ended_at": "2026-05-03T07:09:57.231Z", + "duration_ms": 10, + "subagent_count": 1, + "tool_call_count": 2, + "total_billed_tokens": 1640, + "total_prompt_input_tokens": 1630, + "raw_input_tokens": 1630, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1630, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "5936f5b2-7255-42cd-8f2a-8fec01a2ecb9", + "turn_count": 5, + "terminal_reason": "fixture_completed" + }, + "tools": [ + { + "tool_name": "Read", + "is_closed": true, + "has_failed": false + }, + { + "tool_name": "Search", + "is_closed": true, + "has_failed": false + } + ], + "subagents": [ + { + "subagent_count": 1, + "subagent_reason": "session_memory", + "subagent_trigger_kind": "context_pressure", + "subagent_trigger_detail": "long_context_compaction_pressure" + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "long_context": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + 
"fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "total_prompt_input_tokens": 1630, + "observed_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names" + ], + "observed_lost_constraints": [ + "read_only_task" + ], + "observed_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event" + ], + "observed_missed_facts": [ + "compaction_saved_tokens_score" + ], + "observed_confusions": [], + "compaction_trigger_count": 2, + "compaction_saved_tokens": 42, + "tool_result_budget_trigger_count": 1, + "memory_or_subagent_count": 1, + "success_under_context_pressure": 0, + "manual_review_required": true, + "expected_output_excerpt": "## Overview\n\nCurrent compaction and tool-result budget governance must be described from active evidence only.\n\n## Evidence\n\n- `messages.compact_boundary.applied`\n- `messages.microcompact.applied`\n- `messages.tool_result_budget.applied`\n- `", + "observed_mode": "baseline" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.json b/tests/evals/v2/runs/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.json new file mode 100644 index 0000000000..90b22435a7 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.json @@ -0,0 +1,266 @@ +{ + "run": { + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "scenario_id": "long_context_compaction_pressure", + "variant_id": "candidate_long_context_fixture_guarded", + "run_group_id": 
"group_v2_4_long_context_fixture_smoke_long_context_compaction_pressure_candidate_long_context_fixture_guarded_2026-05-03T070957125Z", + "repeat_index": 2, + "started_at": "2026-05-03T07:09:57.225Z", + "ended_at": "2026-05-03T07:09:57.235Z", + "status": "completed", + "entry_user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "root_query_id": "9c3c5002-4100-4606-80a0-e0f0a8f5af33", + "observability_db_ref": "fixture_trace://synthetic", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "root_query_id": "9c3c5002-4100-4606-80a0-e0f0a8f5af33", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Synthetic fixture_trace run generated by V2.4 fast path." + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "root_query_id": "9c3c5002-4100-4606-80a0-e0f0a8f5af33", + "observability_db_ref": "fixture_trace://synthetic", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_compaction_pressure", + "name": "Long Context Compaction Pressure", + "description": "Verify that compaction and tool-result budget pressure do not destroy the task structure or key governance facts.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md. Do not modify files. Produce exactly three top-level sections named Overview, Evidence, and Conclusion. 
Explain the current compaction-related events, the tool-result budget event, and the saved-token score spec while avoiding archived event names.", + "tags": [ + "long-context", + "compaction-pressure", + "v2.4" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [ + "Read" + ], + "expected_skills": [], + "expected_constraints": [ + "Use exactly the headings Overview, Evidence, Conclusion", + "Do not quote archived event names as current behavior", + "Keep the task read-only" + ], + "expected_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "long_context_profile": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_three_exact_sections", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "three_exact_sections", + "description": "Use exactly Overview, Evidence, Conclusion." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_compaction_score_spec", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "compaction_saved_tokens_score", + "description": "The saved-token score spec is context.compaction_saved_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fake_context_shrink_event", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "fake_event_context_shrink", + "description": "Do not cite messages.context_shrink.applied as the current event." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_compaction", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "compaction_saved_tokens", + "description": "Observe compaction behavior and saved-token tradeoff." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_governance_semantics", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 10, + "max_total_billed_tokens": 220000, + "max_subagent_count": 4, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "8c630899-4463-461c-a588-285512a1e921", + "started_at": "2026-05-03T07:09:57.225Z", + "ended_at": "2026-05-03T07:09:57.235Z", + "duration_ms": 10, + "subagent_count": 1, + "tool_call_count": 2, + "total_billed_tokens": 1240, + "total_prompt_input_tokens": 1230, + "raw_input_tokens": 1230, + "output_tokens": 10, + "cache_read_tokens": 0, + "cache_create_tokens": 0, + "main_thread_total_prompt_input_tokens": 1230, + "subagent_total_prompt_input_tokens": 0 + }, + "rootQuery": { + "query_id": "9c3c5002-4100-4606-80a0-e0f0a8f5af33", + "turn_count": 5, + "terminal_reason": "fixture_completed" + }, + "tools": [ + { + "tool_name": "Read", + "is_closed": true, + "has_failed": false + }, + { + "tool_name": "Search", + "is_closed": true, + "has_failed": false + } + ], + "subagents": [ + { + "subagent_count": 1, + "subagent_reason": "session_memory", + "subagent_trigger_kind": "context_pressure", + "subagent_trigger_detail": "long_context_compaction_pressure" + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "fixture_variant", + "policy_event_observed": false, + "variant_effect_observed": false, + "observed_policy": null, + 
"session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "long_context_compaction_pressure" + ] + }, + "long_context": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ], + "total_prompt_input_tokens": 1230, + "observed_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "observed_missed_facts": [], + "observed_confusions": [], + "compaction_trigger_count": 2, + "compaction_saved_tokens": 188, + "tool_result_budget_trigger_count": 1, + "memory_or_subagent_count": 1, + "success_under_context_pressure": 1, + "manual_review_required": true, + "expected_output_excerpt": "## Overview\n\nCurrent compaction and tool-result budget governance must be described from active evidence only.\n\n## Evidence\n\n- `messages.compact_boundary.applied`\n- `messages.microcompact.applied`\n- `messages.tool_result_budget.applied`\n- `", + "observed_mode": "long_context_guarded" + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.json 
b/tests/evals/v2/runs/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.json new file mode 100644 index 0000000000..2e06f5e27a --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.json @@ -0,0 +1,319 @@ +{ + "run": { + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "baseline_default", + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_baseline_default_2026-05-03T145605757Z", + "repeat_index": 1, + "started_at": "2026-05-03T14:56:10.802Z", + "ended_at": "2026-05-03T14:56:17.911Z", + "status": "completed", + "entry_user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "root_query_id": "3b4329f1-5396-4c39-bad5-54c00976a14d", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "root_query_id": "3b4329f1-5396-4c39-bad5-54c00976a14d", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "root_query_id": "3b4329f1-5396-4c39-bad5-54c00976a14d", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval_real_smoke", + "name": "Long Context Fact Retrieval Real Smoke", + "description": "A small inline long-context retrieval scenario for real execute_harness smoke. 
It avoids path-fragile file reads while preserving the same retrieval and distractor requirements.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]\n\nThe four bullets must cover: the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4", + "real-smoke" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint 
without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection_real_smoke", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "started_at": "2026-05-03T14:56:10.802Z", + "started_at_ms": 1777820170802, + "ended_at": "2026-05-03T14:56:17.911Z", + "ended_at_ms": 1777820177911, + "duration_ms": 7109, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_4_long_co_fd8c0e6a", + "scenario_id": "scn_long_context_ac1e93f0", + "variant_id": "var_baseline_def_eb4a038e", + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_baseline_default_repeat_1_1b5c5949040a", + "raw_input_tokens": "15", + "output_tokens": "302", + "cache_read_tokens": "1509", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26887", + "total_billed_tokens": "27189", + "main_thread_total_prompt_input_tokens": "26887", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "3b4329f1-5396-4c39-bad5-54c00976a14d", + "user_action_id": "4015c73b-f268-4487-b8b7-d4be1cfba5bf", + "session_id": "d4941c66-6944-4500-a50f-55d9bc50736a", + "conversation_id": "d4941c66-6944-4500-a50f-55d9bc50736a", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T14:56:10.802Z", + "started_at_ms": 1777820170802, + "ended_at": "2026-05-03T14:56:17.818Z", + "ended_at_ms": 1777820177818, + "duration_ms": 7016, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + 
"query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 27, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T14:56:17.800Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 26887, + "parser_version": "candidate_long_context_output_parser_v0", + "parser_mode": "real_smoke_rule_based", + "parser_status": "parsed", + "variant_id": "baseline_default", + "observed_output_excerpt": "- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`\n- The formal capture key for execute_harness binding is `benchmark_run_id`\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`\n- This is a read-only re", + "supported_constraint_ids": [ + "four_bullets_only", + "read_only_task" + ], + "supported_fact_ids": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "supported_confusion_ids": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_required": true, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "observed_missed_facts": [], 
+ "observed_confusions": [] + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.json b/tests/evals/v2/runs/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.json new file mode 100644 index 0000000000..14e46162d9 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.json @@ -0,0 +1,320 @@ +{ + "run": { + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "scenario_id": "long_context_fact_retrieval_real_smoke", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_4_long_context_real_smoke_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_2026-05-03T145605757Z", + "repeat_index": 1, + "started_at": "2026-05-03T14:56:28.027Z", + "ended_at": "2026-05-03T14:56:40.199Z", + "status": "completed", + "entry_user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "root_query_id": "e4e3bfee-5d23-44f7-98ac-0189cde1add9", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "root_query_id": "e4e3bfee-5d23-44f7-98ac-0189cde1add9", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "root_query_id": "e4e3bfee-5d23-44f7-98ac-0189cde1add9", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": 
"long_context_fact_retrieval_real_smoke", + "name": "Long Context Fact Retrieval Real Smoke", + "description": "A small inline long-context retrieval scenario for real execute_harness smoke. It avoids path-fragile file reads while preserving the same retrieval and distractor requirements.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]\n\nThe four bullets must cover: the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4", + "real-smoke" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + 
"fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection_real_smoke", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "started_at": "2026-05-03T14:56:28.027Z", + "started_at_ms": 1777820188027, + "ended_at": "2026-05-03T14:56:40.199Z", + "ended_at_ms": 1777820200199, + "duration_ms": 12172, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_4_long_co_fd8c0e6a", + "scenario_id": "scn_long_context_ac1e93f0", + "variant_id": "var_candidate_se_efbc2e82", + "benchmark_run_id": "bench_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "eval_run_id": "eval_v2_4_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_26f2deede04b", + "raw_input_tokens": "12", + "output_tokens": "302", + "cache_read_tokens": "1512", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "26887", + "total_billed_tokens": "27189", + "main_thread_total_prompt_input_tokens": "26887", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "e4e3bfee-5d23-44f7-98ac-0189cde1add9", + "user_action_id": "54964348-774a-43ae-8c23-d3ba6f961894", + "session_id": "f2d0d29b-502b-4262-b0cb-e9fa0a96b8d9", + "conversation_id": "f2d0d29b-502b-4262-b0cb-e9fa0a96b8d9", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T14:56:28.027Z", + "started_at_ms": 1777820188027, + "ended_at": "2026-05-03T14:56:40.129Z", + "ended_at_ms": 1777820200129, + "duration_ms": 12102, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + 
"query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 27, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T14:56:40.106Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 26887, + "parser_version": "candidate_long_context_output_parser_v0", + "parser_mode": "real_smoke_rule_based", + "parser_status": "parsed", + "variant_id": "candidate_session_memory_sparse", + "observed_output_excerpt": "- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`\n- The formal capture key for execute_harness binding is `benchmark_run_id`\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`\n- This is a read-only re", + "supported_constraint_ids": [ + "four_bullets_only", + "read_only_task" + ], + "supported_fact_ids": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "supported_confusion_ids": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_required": true, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + 
"observed_missed_facts": [], + "observed_confusions": [] + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.json b/tests/evals/v2/runs/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.json new file mode 100644 index 0000000000..638e3cd8a8 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.json @@ -0,0 +1,330 @@ +{ + "run": { + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "baseline_default", + "run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_2026-05-03T153143608Z", + "repeat_index": 1, + "started_at": "2026-05-03T15:31:47.795Z", + "ended_at": "2026-05-03T15:32:03.341Z", + "status": "completed", + "entry_user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "root_query_id": "c301fb28-346a-4ee6-9cca-6104c1c09501", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "root_query_id": "c301fb28-346a-4ee6-9cca-6104c1c09501", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "root_query_id": "c301fb28-346a-4ee6-9cca-6104c1c09501", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + 
"scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "name": "Long Context Fact Retrieval Real Smoke Contract v0", + "description": "A tightened long-context real-smoke scenario that keeps the same factual task but narrows the final-answer contract and manual-review questions.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n3. Do not add any heading, numbering, preface, epilogue, or commentary.\n4. Preserve the bullet order defined below.\n\n### Required Bullet Order\n\n- Bullet 1 must identify the active headless CLI entrypoint and include the exact literal `src/entrypoints/cli.tsx`.\n- Bullet 2 must identify the formal execute_harness capture key and include the exact literal `benchmark_run_id`.\n- Bullet 3 must identify the experiment-summary directory and include the exact literal `tests/evals/v2/experiment-runs/`.\n- Bullet 4 must restate the read-only rule and explicitly include the sentence `Do not modify files.`\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]", + 
"tags": [ + "long-context", + "fact-retrieval", + "v2.5", + "real-smoke", + "expectation-contract" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points in the required order", + "Keep the task read-only and explicitly restate it in bullet 4", + "Do not add extra prose before or after the bullets" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points in the required order." + }, + "severity": "high" + }, + { + "expectation_id": "retain_read_only_constraint_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "read_only_task", + "description": "Explicitly restate the read-only rule in bullet 4." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke_contract_v0", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke_contract_v0", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke_contract_v0", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable under the tightened answer contract." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_contract_precision_real_smoke_contract_v0", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "baseline_default", + "name": "Baseline Default", + "description": "Current default harness baseline used for comparison.", + "change_layer": "mixed", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_default.runtime.json", + "notes": "Default baseline. For V2.2-beta execute_harness experiments, the config snapshot provides a traceable runtime contract without changing the baseline policy away from default mode." + }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "started_at": "2026-05-03T15:31:47.795Z", + "started_at_ms": 1777822307795, + "ended_at": "2026-05-03T15:32:03.341Z", + "ended_at_ms": 1777822323341, + "duration_ms": 15546, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_5_long_co_f2af0643", + "scenario_id": "scn_long_context_616fb55e", + "variant_id": "var_baseline_def_eb4a038e", + "benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_baseline_default_repeat_1_3c57dd68b379", + "raw_input_tokens": "21", + "output_tokens": "429", + "cache_read_tokens": "1623", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "27007", + "total_billed_tokens": "27436", + "main_thread_total_prompt_input_tokens": "27007", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "c301fb28-346a-4ee6-9cca-6104c1c09501", + "user_action_id": "0b6a625e-d7ce-4afc-b42d-fdaf6df5654e", + "session_id": "7ba2c757-8793-425e-8b5f-a91af1f4daca", + "conversation_id": 
"7ba2c757-8793-425e-8b5f-a91af1f4daca", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T15:31:47.795Z", + "started_at_ms": 1777822307795, + "ended_at": "2026-05-03T15:32:03.288Z", + "ended_at_ms": 1777822323288, + "duration_ms": 15493, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + "query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 27, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "default", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": false, + "token_threshold_multiplier": 1, + "tool_threshold_multiplier": 1, + "minimum_message_tokens_to_init": 10000, + "minimum_tokens_between_update": 5000, + "tool_calls_between_updates": 6 + }, + "observed_at": "2026-05-03T15:32:03.273Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory 
runtime policy was observed from V1 events." + }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 27007, + "parser_version": "candidate_long_context_output_parser_v0", + "parser_mode": "real_smoke_rule_based", + "parser_status": "parsed", + "variant_id": "baseline_default", + "observed_output_excerpt": "- The active headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal execute_harness capture key is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n- This is a read-only retrieval ta", + "supported_constraint_ids": [ + "four_bullets_only", + "read_only_task" + ], + "supported_fact_ids": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "supported_confusion_ids": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_required": true, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + 
"observed_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "observed_missed_facts": [], + "observed_confusions": [] + } +} diff --git a/tests/evals/v2/runs/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.json b/tests/evals/v2/runs/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.json new file mode 100644 index 0000000000..b3598176c7 --- /dev/null +++ b/tests/evals/v2/runs/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.json @@ -0,0 +1,331 @@ +{ + "run": { + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "variant_id": "candidate_session_memory_sparse", + "run_group_id": "group_v2_5_long_context_real_smoke_expectation_contract_v0_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_2026-05-03T1531436", + "repeat_index": 1, + "started_at": "2026-05-03T15:32:12.356Z", + "ended_at": "2026-05-03T15:32:25.137Z", + "status": "completed", + "entry_user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "root_query_id": "679f208c-b47b-4fce-a8de-8888ad163c39", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "root_query_id": "679f208c-b47b-4fce-a8de-8888ad163c39", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "notes": "Generated by scripts/evals/v2_record_run.ts" + }, + "binding": { + "binding_mode": "fact_only", + "entry_user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + 
"root_query_id": "679f208c-b47b-4fce-a8de-8888ad163c39", + "observability_db_ref": ".observability\\v2-long-context-real-smoke.duckdb", + "bind_passed": true, + "binding_failure_reason": null + }, + "scenario": { + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "name": "Long Context Fact Retrieval Real Smoke Contract v0", + "description": "A tightened long-context real-smoke scenario that keeps the same factual task but narrows the final-answer contract and manual-review questions.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n3. Do not add any heading, numbering, preface, epilogue, or commentary.\n4. 
Preserve the bullet order defined below.\n\n### Required Bullet Order\n\n- Bullet 1 must identify the active headless CLI entrypoint and include the exact literal `src/entrypoints/cli.tsx`.\n- Bullet 2 must identify the formal execute_harness capture key and include the exact literal `benchmark_run_id`.\n- Bullet 3 must identify the experiment-summary directory and include the exact literal `tests/evals/v2/experiment-runs/`.\n- Bullet 4 must restate the read-only rule and explicitly include the sentence `Do not modify files.`\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]", + "tags": [ + "long-context", + "fact-retrieval", + "v2.5", + "real-smoke", + "expectation-contract" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points in the required order", + "Keep the task read-only and explicitly restate it in bullet 4", + "Do not add extra prose before or after the bullets" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 
explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points in the required order." + }, + "severity": "high" + }, + { + "expectation_id": "retain_read_only_constraint_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "read_only_task", + "description": "Explicitly restate the read-only rule in bullet 4." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke_contract_v0", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." 
+ }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke_contract_v0", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke_contract_v0", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable under the tightened answer contract." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_contract_precision_real_smoke_contract_v0", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" + }, + "variant": { + "variant_id": "candidate_session_memory_sparse", + "name": "Candidate Session Memory Sparse", + "description": "Use a sparser session_memory policy so background memory updates prefer natural breaks and higher thresholds.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "config_snapshot_ref": "tests/evals/v2/configs/session_memory_sparse.runtime.json", + "notes": "V2.2-beta runtime contract: this candidate now carries a sparse session_memory policy through config_snapshot_ref. The sparse policy must be observed in V1/V2 evidence, not inferred from manifest text." 
+ }, + "evidence": { + "action": { + "event_date": "2026-05-03", + "user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "started_at": "2026-05-03T15:32:12.356Z", + "started_at_ms": 1777822332356, + "ended_at": "2026-05-03T15:32:25.137Z", + "ended_at_ms": 1777822345137, + "duration_ms": 12781, + "event_count": 46, + "query_count": 3, + "main_thread_query_count": 2, + "subagent_query_count": 1, + "subagent_count": 1, + "tool_call_count": 0, + "experiment_id": "exp_v2_5_long_co_f2af0643", + "scenario_id": "scn_long_context_616fb55e", + "variant_id": "var_candidate_se_efbc2e82", + "benchmark_run_id": "bench_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "eval_run_id": "eval_v2_5_long_context_re_long_context_fact_re_candidate_session_me_repeat_1_28a85e623a50", + "raw_input_tokens": "69", + "output_tokens": "365", + "cache_read_tokens": "1575", + "cache_create_tokens": "25363", + "total_prompt_input_tokens": "27007", + "total_billed_tokens": "27372", + "main_thread_total_prompt_input_tokens": "27007", + "subagent_total_prompt_input_tokens": "0" + }, + "rootQuery": { + "query_id": "679f208c-b47b-4fce-a8de-8888ad163c39", + "user_action_id": "a3fb1e0d-6260-4f43-a830-70b723a236ae", + "session_id": "a4a76b7e-dea4-4dad-ad69-0306be0bf321", + "conversation_id": "a4a76b7e-dea4-4dad-ad69-0306be0bf321", + "query_source": "sdk", + "subagent_id": null, + "subagent_type": null, + "subagent_reason": "sdk", + "subagent_trigger_kind": null, + "subagent_trigger_detail": null, + "subagent_trigger_payload_json": null, + "agent_name": "main_thread", + "source_group": "main_thread", + "started_at": "2026-05-03T15:32:12.356Z", + "started_at_ms": 1777822332356, + "ended_at": "2026-05-03T15:32:25.081Z", + "ended_at_ms": 1777822345081, + "duration_ms": 12725, + "first_event": "submit.attempted", + "last_event": "query.terminated", + "terminal_reason": "completed", + "stop_reason": "end_turn", + "turn_count": 1, + "query_max_loop_iter": 1, + 
"query_avg_loop_iter": 1, + "tool_call_count": 0, + "event_count": 27, + "raw_query_started_count": 1, + "raw_query_terminated_count": 0, + "inferred_query_started_count": 1, + "inferred_query_terminated_count": 1, + "strict_is_complete": "false", + "inferred_is_complete": "true" + }, + "tools": [], + "subagents": [ + { + "subagent_reason": "session_memory", + "subagent_trigger_kind": "post_sampling_hook", + "subagent_trigger_detail": "token_threshold_and_natural_break", + "subagent_count": 1, + "avg_duration_ms": null + } + ], + "recoveries": [] + }, + "variant_effect": { + "effect_type": "session_memory_policy", + "policy_event_observed": true, + "variant_effect_observed": true, + "observed_policy": { + "mode": "sparse", + "source": "config_snapshot_session_memory_policy", + "gate_enabled": true, + "force_enabled": true, + "query_source_supported": true, + "natural_break_only": true, + "token_threshold_multiplier": 2, + "tool_threshold_multiplier": 2, + "minimum_message_tokens_to_init": 20000, + "minimum_tokens_between_update": 10000, + "tool_calls_between_updates": 12 + }, + "observed_at": "2026-05-03T15:32:25.067Z", + "observed_query_source": "sdk", + "session_memory_subagent_count": 1, + "session_memory_trigger_details": [ + "token_threshold_and_natural_break" + ], + "reason": "Session-memory runtime policy was observed from V1 events." 
+ }, + "long_context": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "compaction_trigger_count": 4, + "compaction_saved_tokens": 0, + "tool_result_budget_trigger_count": 2, + "memory_or_subagent_count": 1, + "total_prompt_input_tokens": 27007, + "parser_version": "candidate_long_context_output_parser_v0", + "parser_mode": "real_smoke_rule_based", + "parser_status": "parsed", + "variant_id": "candidate_session_memory_sparse", + "observed_output_excerpt": "- The active headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal execute_harness capture key is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n- This is a read-only retrieval ta", + "supported_constraint_ids": [ + "four_bullets_only", + "read_only_task" + ], + "supported_fact_ids": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "supported_confusion_ids": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_required": true, + "observed_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "observed_lost_constraints": [], + "observed_retrieved_facts": [ + 
"cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "observed_missed_facts": [], + "observed_confusions": [] + } +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_compaction_pressure.json b/tests/evals/v2/scenarios/long-context/long_context_compaction_pressure.json new file mode 100644 index 0000000000..fc66e6c691 --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_compaction_pressure.json @@ -0,0 +1,110 @@ +{ + "scenario_id": "long_context_compaction_pressure", + "name": "Long Context Compaction Pressure", + "description": "Verify that compaction and tool-result budget pressure do not destroy the task structure or key governance facts.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/compaction-pressure/context_body.md. Do not modify files. Produce exactly three top-level sections named Overview, Evidence, and Conclusion. Explain the current compaction-related events, the tool-result budget event, and the saved-token score spec while avoiding archived event names.", + "tags": ["long-context", "compaction-pressure", "v2.4"], + "expected_artifacts": ["final_answer"], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Use exactly the headings Overview, Evidence, Conclusion", + "Do not quote archived event names as current behavior", + "Keep the task read-only" + ], + "expected_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "long_context_profile": { + "context_family": "compaction_pressure", + "context_size_class": "large", + "fixture_ref": "tests/evals/v2/fixtures/long-context/compaction-pressure", + "expected_retained_constraints": [ + "three_exact_sections", + "no_archived_event_names", + "read_only_task" + ], + "expected_retrieved_facts": [ + "compact_boundary_event", + "tool_result_budget_event", + "compaction_saved_tokens_score" + ], + "distractor_refs": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "forbidden_confusions": [ + "fake_event_context_shrink", + "fake_score_cache_prune_count" + ], + "manual_review_questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_three_exact_sections", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "three_exact_sections", + "description": "Use exactly Overview, Evidence, Conclusion." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_compaction_score_spec", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "compaction_saved_tokens_score", + "description": "The saved-token score spec is context.compaction_saved_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fake_context_shrink_event", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "fake_event_context_shrink", + "description": "Do not cite messages.context_shrink.applied as the current event." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_compaction", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "compaction_saved_tokens", + "description": "Observe compaction behavior and saved-token tradeoff." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_governance_semantics", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer keep the exact three required headings?", + "Did the answer stay on current compaction signals instead of archived names?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 10, + "max_total_billed_tokens": 220000, + "max_subagent_count": 4, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_constraint_retention.json b/tests/evals/v2/scenarios/long-context/long_context_constraint_retention.json new file mode 100644 index 0000000000..267716e4a2 --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_constraint_retention.json @@ -0,0 +1,109 @@ +{ + "scenario_id": "long_context_constraint_retention", + "name": "Long Context Constraint Retention", + "description": "Verify that early hard constraints survive after the agent reads a longer mixed context packet.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/constraint-retention/context_body.md and answer the task without modifying files. Preserve the active hard constraints even if legacy notes suggest something else. 
Summarize the runner path, the preferred prompt metric, and the read-only mode.", + "tags": ["long-context", "constraint-retention", "v2.4"], + "expected_artifacts": ["final_answer"], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Final answer must remain JSON-shaped", + "The answer must keep owner=v2-platform", + "The task remains read-only" + ], + "expected_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "long_context_profile": { + "context_family": "constraint_retention", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/constraint-retention", + "expected_retained_constraints": [ + "json_output_only", + "owner_field_required", + "read_only_task" + ], + "expected_retrieved_facts": [ + "runner_entrypoint", + "prompt_metric_total_prompt_input_tokens" + ], + "distractor_refs": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "forbidden_confusions": [ + "old_markdown_output_rule", + "wrong_metric_cache_create_tokens" + ], + "manual_review_questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ] + }, + "expectations": [ + { + "expectation_id": "retain_json_output_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "json_output_only", + "description": "Final output must stay JSON-shaped.", + "severity": "hard" + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_prompt_metric", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "prompt_metric_total_prompt_input_tokens", + "description": "The preferred prompt metric is total_prompt_input_tokens." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_markdown_rule", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_markdown_output_rule", + "description": "Do not switch back to Markdown output." + }, + "severity": "high" + }, + { + "expectation_id": "respect_context_budget", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track prompt-input growth while preserving constraints." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_output_shape", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer remain valid JSON instead of drifting into prose?", + "Did the answer preserve owner=v2-platform while staying read-only?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_distractor_resistance.json b/tests/evals/v2/scenarios/long-context/long_context_distractor_resistance.json new file mode 100644 index 0000000000..a6ad467978 --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_distractor_resistance.json @@ -0,0 +1,106 @@ +{ + "scenario_id": "long_context_distractor_resistance", + "name": "Long Context Distractor Resistance", + "description": "Verify that the agent resists stale but plausible-looking V2.3 artifacts when summarizing the current V2.4 fixture setup.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/distractor-resistance/context_body.md. Do not modify files. Summarize the active V2.4 fixture candidate and manifest while explicitly avoiding archived V2.3 helper references.", + "tags": ["long-context", "distractor-resistance", "v2.4"], + "expected_artifacts": ["final_answer"], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Prefer current V2.4 files over archived helpers", + "Keep the answer read-only" + ], + "expected_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "long_context_profile": { + "context_family": "distractor_resistance", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/distractor-resistance", + "expected_retained_constraints": [ + "prefer_current_v24_files", + "read_only_task" + ], + "expected_retrieved_facts": [ + "fixture_candidate_guarded", + "active_fixture_smoke_manifest" + ], + "distractor_refs": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "forbidden_confusions": [ + "old_variant_fixture_shadow", + "old_execute_harness_smoke_manifest" + ], + "manual_review_questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_prefer_current_files", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "prefer_current_v24_files", + "description": "Prefer current V2.4 files over archived helpers." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_fixture_candidate", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "fixture_candidate_guarded", + "description": "The active V2.4 fixture candidate is candidate_long_context_fixture_guarded." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_fixture_shadow_confusion", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_variant_fixture_shadow", + "description": "Do not treat candidate_eval_fixture_shadow as the V2.4 long-context candidate." 
+ }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_distractors", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "distractor_confusion_count", + "description": "Observe whether distractor pressure alters the answer path." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_archived_references", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper?", + "Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval.json b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval.json new file mode 100644 index 0000000000..4579e2952d --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval.json @@ -0,0 +1,108 @@ +{ + "scenario_id": "long_context_fact_retrieval", + "name": "Long Context Fact Retrieval", + "description": "Verify that the agent can retrieve key facts from a longer context packet and ignore stale routing notes.", + "input_prompt": "Read tests/evals/v2/fixtures/long-context/fact-retrieval/context_body.md. Do not modify files. 
Return exactly four bullet points covering the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": ["long-context", "fact-retrieval", "v2.4"], + "expected_artifacts": ["final_answer"], + "expected_tools": ["Read"], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." 
+ }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 8, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke.json b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke.json new file mode 100644 index 0000000000..26c50a48a9 --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke.json @@ -0,0 +1,115 @@ +{ + "scenario_id": "long_context_fact_retrieval_real_smoke", + "name": "Long Context Fact Retrieval Real Smoke", + "description": "A small inline long-context retrieval scenario for real execute_harness smoke. 
It avoids path-fragile file reads while preserving the same retrieval and distractor requirements.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]\n\nThe four bullets must cover: the CLI entrypoint, the formal capture key, the experiment-summary directory, and the read-only constraint.", + "tags": [ + "long-context", + "fact-retrieval", + "v2.4", + "real-smoke" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points", + "Keep the task read-only" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint 
without extra prose?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." + }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable." 
+ }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_fact_selection_real_smoke", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint?", + "Did the answer preserve the four-bullet constraint without extra prose?" + ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json new file mode 100644 index 0000000000..85c67bf0cc --- /dev/null +++ b/tests/evals/v2/scenarios/long-context/long_context_fact_retrieval_real_smoke_contract_v0.json @@ -0,0 +1,126 @@ +{ + "scenario_id": "long_context_fact_retrieval_real_smoke_contract_v0", + "name": "Long Context Fact Retrieval Real Smoke Contract v0", + "description": "A tightened long-context real-smoke scenario that keeps the same factual task but narrows the final-answer contract and manual-review questions.", + "input_prompt": "You are inside the repository. This is a read-only long-context retrieval task. Do not modify files. Return exactly four bullet points and nothing else. Use the context packet below.\n\n[Context Packet Start]\n## Evaluation Workspace Brief\n\nThis is a read-only retrieval task inside the repository.\n\n### Hard Constraints\n\n1. Use exactly four bullet points in the final answer.\n2. Do not modify files.\n3. Do not add any heading, numbering, preface, epilogue, or commentary.\n4. 
Preserve the bullet order defined below.\n\n### Required Bullet Order\n\n- Bullet 1 must identify the active headless CLI entrypoint and include the exact literal `src/entrypoints/cli.tsx`.\n- Bullet 2 must identify the formal execute_harness capture key and include the exact literal `benchmark_run_id`.\n- Bullet 3 must identify the experiment-summary directory and include the exact literal `tests/evals/v2/experiment-runs/`.\n- Bullet 4 must restate the read-only rule and explicitly include the sentence `Do not modify files.`\n\n### Key Facts\n\n- The current headless CLI entrypoint is `src/entrypoints/cli.tsx`.\n- The formal capture key for execute_harness binding is `benchmark_run_id`.\n- Experiment summaries are stored under `tests/evals/v2/experiment-runs/`.\n\n### Supplemental Context\n\n- The runner can fall back to `bind_existing` when automation is disabled and the manifest allows it.\n- Batch reports are written as Markdown.\n\n### Legacy / Distractor Material\n\n- Older notes mention `src/main.tsx` as the CLI entrypoint.\n- A stale debugging note says \"just grab the latest user_action_id\".\n- Those two statements are intentionally outdated.\n[Context Packet End]", + "tags": [ + "long-context", + "fact-retrieval", + "v2.5", + "real-smoke", + "expectation-contract" + ], + "expected_artifacts": [ + "final_answer" + ], + "expected_tools": [], + "expected_skills": [], + "expected_constraints": [ + "Return exactly four bullet points in the required order", + "Keep the task read-only and explicitly restate it in bullet 4", + "Do not add extra prose before or after the bullets" + ], + "expected_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 
explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ], + "context_profile_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "long_context_profile": { + "context_family": "retrieval", + "context_size_class": "medium", + "fixture_ref": "tests/evals/v2/fixtures/long-context/fact-retrieval", + "expected_retained_constraints": [ + "four_bullets_only", + "read_only_task" + ], + "expected_retrieved_facts": [ + "cli_entrypoint_cli_tsx", + "capture_key_benchmark_run_id", + "experiment_summary_dir" + ], + "distractor_refs": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "forbidden_confusions": [ + "old_entrypoint_main_tsx", + "fake_capture_key_latest_action" + ], + "manual_review_questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + ] + }, + "expectations": [ + { + "expectation_id": "retain_four_bullets_only_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "four_bullets_only", + "description": "Return exactly four bullet points in the required order." + }, + "severity": "high" + }, + { + "expectation_id": "retain_read_only_constraint_real_smoke_contract_v0", + "expectation_type": "retained_constraint", + "expectation_body": { + "constraint_id": "read_only_task", + "description": "Explicitly restate the read-only rule in bullet 4." + }, + "severity": "high" + }, + { + "expectation_id": "retrieve_capture_key_real_smoke_contract_v0", + "expectation_type": "retrieved_fact", + "expectation_body": { + "fact_id": "capture_key_benchmark_run_id", + "description": "The formal capture key is benchmark_run_id." 
+ }, + "severity": "high" + }, + { + "expectation_id": "avoid_old_entrypoint_real_smoke_contract_v0", + "expectation_type": "forbidden_confusion", + "expectation_body": { + "confusion_id": "old_entrypoint_main_tsx", + "description": "Do not report src/main.tsx as the active CLI entrypoint." + }, + "severity": "high" + }, + { + "expectation_id": "watch_context_budget_retrieval_real_smoke_contract_v0", + "expectation_type": "context_budget", + "expectation_body": { + "metric": "total_prompt_input_tokens", + "description": "Track whether fact retrieval cost stays interpretable under the tightened answer contract." + }, + "severity": "medium" + }, + { + "expectation_id": "manual_check_contract_precision_real_smoke_contract_v0", + "expectation_type": "manual_review", + "expectation_body": { + "questions": [ + "Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint?", + "Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" 
+ ] + }, + "severity": "medium" + } + ], + "max_turn_count": 6, + "max_total_billed_tokens": 180000, + "max_subagent_count": 2, + "owner": "local", + "status": "ready" +} diff --git a/tests/evals/v2/score-specs/long-context.score-specs.json b/tests/evals/v2/score-specs/long-context.score-specs.json new file mode 100644 index 0000000000..482bb3ad6f --- /dev/null +++ b/tests/evals/v2/score-specs/long-context.score-specs.json @@ -0,0 +1,154 @@ +{ + "score_specs": [ + { + "score_spec_id": "context.retained_constraint_count", + "dimension": "context", + "subdimension": "retained_constraint_count", + "direction": "higher_is_better", + "formula": "count(long_context.observed_retained_constraints)", + "data_sources": ["V2 run.long_context", "fixture long-context evidence"], + "evidence_requirements": [ + "run.long_context.observed_retained_constraints" + ], + "automation_level": "automatic", + "version": "v2.4" + }, + { + "score_spec_id": "context.lost_constraint_count", + "dimension": "context", + "subdimension": "lost_constraint_count", + "direction": "lower_is_better", + "formula": "count(long_context.observed_lost_constraints)", + "data_sources": ["V2 run.long_context", "fixture long-context evidence"], + "evidence_requirements": [ + "run.long_context.observed_lost_constraints" + ], + "automation_level": "automatic", + "thresholds": { + "max_allowed_value": 0 + }, + "version": "v2.4" + }, + { + "score_spec_id": "context.constraint_retention_rate", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "direction": "higher_is_better", + "formula": "retained_constraint_count / (retained_constraint_count + lost_constraint_count)", + "data_sources": ["V2 run.long_context"], + "evidence_requirements": [ + "run.long_context.observed_retained_constraints", + "run.long_context.observed_lost_constraints" + ], + "automation_level": "automatic", + "thresholds": { + "min_allowed_value": 0.8 + }, + "version": "v2.4" + }, + { + "score_spec_id": 
"context.retrieved_fact_hit_rate", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "direction": "higher_is_better", + "formula": "retrieved_fact_count / (retrieved_fact_count + missed_fact_count)", + "data_sources": ["V2 run.long_context"], + "evidence_requirements": [ + "run.long_context.observed_retrieved_facts", + "run.long_context.observed_missed_facts" + ], + "automation_level": "automatic", + "thresholds": { + "min_allowed_value": 0.8 + }, + "version": "v2.4" + }, + { + "score_spec_id": "context.distractor_confusion_count", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "direction": "lower_is_better", + "formula": "count(long_context.observed_confusions)", + "data_sources": ["V2 run.long_context"], + "evidence_requirements": [ + "run.long_context.observed_confusions" + ], + "automation_level": "automatic", + "thresholds": { + "max_allowed_value": 0 + }, + "version": "v2.4" + }, + { + "score_spec_id": "context.total_prompt_input_tokens", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "direction": "lower_is_better", + "formula": "user_actions.total_prompt_input_tokens for the run entry action", + "data_sources": ["V1 user_actions", "V2 run.long_context"], + "evidence_requirements": [ + "entry_user_action_id", + "user_actions.total_prompt_input_tokens" + ], + "automation_level": "automatic", + "version": "v2.4" + }, + { + "score_spec_id": "context.compaction_trigger_count", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "direction": "observed_only", + "formula": "count(messages.compact_boundary.applied + messages.microcompact.applied)", + "data_sources": ["V1 events_raw", "V2 run.long_context"], + "evidence_requirements": [ + "events_raw.event_name", + "run.long_context.compaction_trigger_count" + ], + "automation_level": "automatic", + "version": "v2.4" + }, + { + "score_spec_id": "context.compaction_saved_tokens", + "dimension": "context", + 
"subdimension": "compaction_saved_tokens", + "direction": "observed_only", + "formula": "sum(payload.tokens_saved) across compaction-related events", + "data_sources": ["V1 events_raw", "V2 run.long_context"], + "evidence_requirements": [ + "events_raw.payload_json", + "run.long_context.compaction_saved_tokens" + ], + "automation_level": "automatic", + "version": "v2.4" + }, + { + "score_spec_id": "context.success_under_context_pressure", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "direction": "higher_is_better", + "formula": "1 if the long-context fixture/run indicates the task still succeeded under pressure, else 0", + "data_sources": ["V2 run.long_context"], + "evidence_requirements": [ + "run.long_context.success_under_context_pressure" + ], + "automation_level": "automatic", + "version": "v2.4", + "notes": "Real smoke may leave this score inconclusive when final semantic correctness cannot be inferred automatically." + }, + { + "score_spec_id": "context.manual_review_required", + "dimension": "context", + "subdimension": "manual_review_required", + "direction": "observed_only", + "formula": "1 when the scenario still requires human review prompts, else 0", + "data_sources": ["V2 scenario", "V2 run.long_context"], + "evidence_requirements": [ + "scenario.manual_review_questions", + "run.long_context.manual_review_questions" + ], + "automation_level": "mixed", + "version": "v2.4", + "notes": "This is not a quality score. It explicitly preserves the human-review lane for long-context evaluation." 
+ } + ] +} diff --git a/tests/evals/v2/scores/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.scores.json b/tests/evals/v2/scores/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.scores.json new file mode 100644 index 0000000000..1ed5fe2eca --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_task_success_main_chain_observed", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27189, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." 
+ }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_stability_recovery_absence", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_retained_constraint_count", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 0 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_lost_constraint_count", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_constraint_retention_rate", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": null, + "score_label": "inconclusive", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "No retained/lost constraint evidence was available." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": null, + "score_label": "inconclusive", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "No retrieved/missed fact evidence was available." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_distractor_confusion_count", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 26887, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_compaction_trigger_count", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_success_under_context_pressure", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." 
+ }, + { + "score_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_context_manual_review_required", + "run_id": "run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.scores.json b/tests/evals/v2/scores/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.scores.json new file mode 100644 index 0000000000..ee56ac2dd1 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_task_success_main_chain_observed", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27189, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_stability_recovery_absence", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_retained_constraint_count", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 0 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_lost_constraint_count", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_constraint_retention_rate", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": null, + "score_label": "inconclusive", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "No retained/lost constraint evidence was available." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": null, + "score_label": "inconclusive", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "No retrieved/missed fact evidence was available." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_distractor_confusion_count", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 26887, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_compaction_trigger_count", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." 
+ }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_success_under_context_pressure", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." + }, + { + "score_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8_context_manual_review_required", + "run_id": "run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.scores.json new file mode 100644 index 0000000000..164b5660d7 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_stability_recovery_absence", + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.scores.json new file mode 100644 index 0000000000..5b3f210451 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5_stability_recovery_absence", + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.scores.json new file mode 100644 index 0000000000..bfc6f44a03 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec_stability_recovery_absence", + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.scores.json new file mode 100644 index 0000000000..cb04e3a247 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_stability_recovery_absence", + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.scores.json new file mode 100644 index 0000000000..8b4c1dd949 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4_stability_recovery_absence", + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.scores.json new file mode 100644 index 0000000000..af1e33080c --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d_stability_recovery_absence", + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.scores.json new file mode 100644 index 0000000000..5c84b83e30 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_stability_recovery_absence", + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.scores.json new file mode 100644 index 0000000000..6d1c31671d --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c_stability_recovery_absence", + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.scores.json new file mode 100644 index 0000000000..c0b25a2f4c --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4_stability_recovery_absence", + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.scores.json new file mode 100644 index 0000000000..7de1c9e573 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 110, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_stability_recovery_absence", + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.scores.json new file mode 100644 index 0000000000..536ae1215e --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 100, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." + }, + { + "score_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0_stability_recovery_absence", + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.scores.json b/tests/evals/v2/scores/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.scores.json new file mode 100644 index 0000000000..479e2d899c --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.scores.json @@ -0,0 +1,52 @@ +[ + { + "score_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 105, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7_decision_quality_subagent_count_observed", + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "dimension": "decision_quality", + "subdimension": "subagent_count_observed", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "subagents", + "reason": "Observed subagent count is a fact for later baseline vs candidate comparison." 
+ }, + { + "score_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7_stability_recovery_absence", + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 1." + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.scores.json new file mode 100644 index 0000000000..966c8a27ad --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1280, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_stability_recovery_absence", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 1 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=0.666667 from retained=2, lost=1." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1270, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." 
+ }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_context_manual_review_required", + "run_id": "run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer remain valid JSON instead of drifting into prose? | Did the answer preserve owner=v2-platform while staying read-only?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.scores.json new file mode 100644 index 0000000000..ddaaf8cf1e --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1090, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_stability_recovery_absence", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 3, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 3 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=3, lost=0." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1080, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." 
+ }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e_context_manual_review_required", + "run_id": "run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer remain valid JSON instead of drifting into prose? | Did the answer preserve owner=v2-platform while staying read-only?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.scores.json new file mode 100644 index 0000000000..f52929917f --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1280, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_stability_recovery_absence", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 1 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=0.666667 from retained=2, lost=1." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1270, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_context_manual_review_required", + "run_id": "run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer remain valid JSON instead of drifting into prose? | Did the answer preserve owner=v2-platform while staying read-only?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.scores.json new file mode 100644 index 0000000000..033b4d04c2 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1090, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_stability_recovery_absence", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." 
+ }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 3, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 3 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=3, lost=0." 
+ }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1080, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22_context_manual_review_required", + "run_id": "run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer remain valid JSON instead of drifting into prose? | Did the answer preserve owner=v2-platform while staying read-only?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.scores.json new file mode 100644 index 0000000000..b835968a69 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1360, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_stability_recovery_absence", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=0.666667 from hits=2, missed=1." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1350, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_context_manual_review_required", + "run_id": "run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.scores.json new file mode 100644 index 0000000000..5630df26d1 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1140, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_stability_recovery_absence", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." 
+ }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." 
+ }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1130, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9_context_manual_review_required", + "run_id": "run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.scores.json new file mode 100644 index 0000000000..b7c6248bb0 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1360, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_stability_recovery_absence", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=0.666667 from hits=2, missed=1." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1350, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_context_manual_review_required", + "run_id": "run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.scores.json new file mode 100644 index 0000000000..b44ab23339 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1140, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_stability_recovery_absence", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." 
+ }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." 
+ }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1130, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." 
+ }, + { + "score_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d_context_manual_review_required", + "run_id": "run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.scores.json new file mode 100644 index 0000000000..334c218088 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1320, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_stability_recovery_absence", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 1 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1310, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." 
+ }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_context_manual_review_required", + "run_id": "run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper? | Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.scores.json new file mode 100644 index 0000000000..22b5b7c70f --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1120, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_stability_recovery_absence", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1110, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." 
+ }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67_context_manual_review_required", + "run_id": "run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper? | Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.scores.json new file mode 100644 index 0000000000..29f4154136 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1320, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_stability_recovery_absence", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." 
+ }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 1 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1310, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." 
+ }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_context_manual_review_required", + "run_id": "run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper? | Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.scores.json new file mode 100644 index 0000000000..4343d33e32 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1120, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_stability_recovery_absence", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=3; scenario limit is 8." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=2, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1110, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=0." 
+ }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9_context_manual_review_required", + "run_id": "run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer clearly distinguish the V2.4 candidate from the V2.3 fixture helper? | Did the answer avoid treating the old execute_harness smoke as the long-context manifest?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.scores.json new file mode 100644 index 0000000000..207d693508 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1640, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_stability_recovery_absence", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 10." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 1 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=0.666667 from retained=2, lost=1." 
+ }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=0.666667 from hits=2, missed=1." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1630, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=2." 
+ }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 42, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=42." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 0, + "score_label": "fail", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=0." + }, + { + "score_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_context_manual_review_required", + "run_id": "run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer keep the exact three required headings? | Did the answer stay on current compaction signals instead of archived names?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.scores.json new file mode 100644 index 0000000000..39aa567b86 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1240, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_stability_recovery_absence", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 10." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 3, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 3 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=3, lost=0." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1230, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=2." 
+ }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 188, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=188." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757_context_manual_review_required", + "run_id": "run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer keep the exact three required headings? | Did the answer stay on current compaction signals instead of archived names?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.scores.json new file mode 100644 index 0000000000..96802c5946 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1640, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_stability_recovery_absence", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 10." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 1 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=0.666667 from retained=2, lost=1." 
+ }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 0.666667, + "score_label": "partial", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=0.666667 from hits=2, missed=1." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1630, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=2." 
+ }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 42, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=42." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 0, + "score_label": "fail", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=0." + }, + { + "score_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_context_manual_review_required", + "run_id": "run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer keep the exact three required headings? | Did the answer stay on current compaction signals instead of archived names?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.scores.json b/tests/evals/v2/scores/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.scores.json new file mode 100644 index 0000000000..2ae17634f7 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.scores.json @@ -0,0 +1,142 @@ +[ + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_task_success_main_chain_observed", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 1240, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_stability_recovery_absence", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=5; scenario limit is 10." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_retained_constraint_count", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 3, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 3 retained constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_lost_constraint_count", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_constraint_retention_rate", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=3, lost=0." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." 
+ }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_distractor_confusion_count", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 1230, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_compaction_trigger_count", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=2." 
+ }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 188, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=188." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_success_under_context_pressure", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.success_under_context_pressure", + "reason": "Fixture/runtime evidence marked success_under_context_pressure=1." + }, + { + "score_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899_context_manual_review_required", + "run_id": "run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer keep the exact three required headings? | Did the answer stay on current compaction signals instead of archived names?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.scores.json b/tests/evals/v2/scores/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.scores.json new file mode 100644 index 0000000000..42907f2f3f --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_task_success_main_chain_observed", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27189, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." 
+ }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_stability_recovery_absence", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_retained_constraint_count", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_lost_constraint_count", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_constraint_retention_rate", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_distractor_confusion_count", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 26887, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_compaction_trigger_count", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_success_under_context_pressure", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." 
+ }, + { + "score_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_context_manual_review_required", + "run_id": "run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.scores.json b/tests/evals/v2/scores/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.scores.json new file mode 100644 index 0000000000..d69acf94bf --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_task_success_main_chain_observed", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27189, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_stability_recovery_absence", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_retained_constraint_count", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_lost_constraint_count", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_constraint_retention_rate", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_distractor_confusion_count", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 26887, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_compaction_trigger_count", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." 
+ }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_success_under_context_pressure", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." + }, + { + "score_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348_context_manual_review_required", + "run_id": "run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did the answer really name src/entrypoints/cli.tsx rather than an archived entrypoint? | Did the answer preserve the four-bullet constraint without extra prose?" 
+ } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.scores.json b/tests/evals/v2/scores/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.scores.json new file mode 100644 index 0000000000..bc3ba25580 --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_task_success_main_chain_observed", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27436, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_stability_recovery_absence", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." 
+ }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_retained_constraint_count", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_lost_constraint_count", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_constraint_retention_rate", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." 
+ }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_distractor_confusion_count", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 27007, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." 
+ }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_compaction_trigger_count", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." + }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_success_under_context_pressure", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." 
+ }, + { + "score_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e_context_manual_review_required", + "run_id": "run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint? | Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" + } +] diff --git a/tests/evals/v2/scores/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.scores.json b/tests/evals/v2/scores/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.scores.json new file mode 100644 index 0000000000..2645915b1c --- /dev/null +++ b/tests/evals/v2/scores/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.scores.json @@ -0,0 +1,152 @@ +[ + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_task_success_main_chain_observed", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "task_success", + "subdimension": "main_chain_observed", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Main-thread root query is present in V1 evidence." 
+ }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_efficiency_total_billed_tokens", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "efficiency", + "subdimension": "total_billed_tokens", + "score_value": 27372, + "score_label": "observed", + "evidence_ref": "user_actions.total_billed_tokens", + "reason": "Raw efficiency fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_decision_quality_session_memory_policy_observed", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "decision_quality", + "subdimension": "session_memory_policy_observed", + "score_value": 1, + "score_label": "observed", + "evidence_ref": "variant_effect", + "reason": "Session-memory runtime policy was observed in trace-backed evidence." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_stability_recovery_absence", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "stability", + "subdimension": "recovery_absence", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "recoveries", + "reason": "No recovery events were observed for this action." 
+ }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_controllability_turn_limit_basic", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "controllability", + "subdimension": "turn_limit_basic", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries.turn_count", + "reason": "Root query turn_count=1; scenario limit is 6." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_retained_constraint_count", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "retained_constraint_count", + "score_value": 2, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Observed 2 retained constraints from long-context evidence." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_lost_constraint_count", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "lost_constraint_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_lost_constraints", + "reason": "Observed 0 lost constraints from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_constraint_retention_rate", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "constraint_retention_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retained_constraints", + "reason": "Constraint retention rate=1 from retained=2, lost=0." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_retrieved_fact_hit_rate", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "retrieved_fact_hit_rate", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "long_context_evidence.observed_retrieved_facts", + "reason": "Retrieved fact hit rate=1 from hits=3, missed=0." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_distractor_confusion_count", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "distractor_confusion_count", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.observed_confusions", + "reason": "Observed 0 distractor confusions from long-context evidence." 
+ }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_total_prompt_input_tokens", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "total_prompt_input_tokens", + "score_value": 27007, + "score_label": "observed", + "evidence_ref": "user_actions.total_prompt_input_tokens", + "reason": "Raw prompt-input cost fact from V1 user_actions." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_compaction_trigger_count", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "compaction_trigger_count", + "score_value": 4, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_trigger_count", + "reason": "Observed compaction_trigger_count=4." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_compaction_saved_tokens", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "compaction_saved_tokens", + "score_value": 0, + "score_label": "observed", + "evidence_ref": "long_context_evidence.compaction_saved_tokens", + "reason": "Observed compaction_saved_tokens=0." 
+ }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_success_under_context_pressure", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "success_under_context_pressure", + "score_value": 1, + "score_label": "pass", + "evidence_ref": "queries", + "reason": "Fallback success signal: root query exists." + }, + { + "score_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d_context_manual_review_required", + "run_id": "run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d", + "dimension": "context", + "subdimension": "manual_review_required", + "score_value": 1, + "score_label": "manual_review_required", + "evidence_ref": "long_context_evidence.manual_review_questions", + "reason": "Manual review remains required. Questions: Did bullet 1 include the exact literal `src/entrypoints/cli.tsx` and avoid any archived or paraphrased entrypoint? | Did bullet 4 explicitly include the sentence `Do not modify files.` with no extra prose before the first bullet or after the fourth bullet?" 
+ } +] diff --git a/tests/evals/v2/variants/candidate_long_context_fixture_guarded.json b/tests/evals/v2/variants/candidate_long_context_fixture_guarded.json new file mode 100644 index 0000000000..59f9a8b9f5 --- /dev/null +++ b/tests/evals/v2/variants/candidate_long_context_fixture_guarded.json @@ -0,0 +1,12 @@ +{ + "variant_id": "candidate_long_context_fixture_guarded", + "name": "Candidate Long Context Fixture Guarded", + "description": "V2.4 fixture-only candidate used to simulate better long-context governance in fixture_trace without claiming a real runtime product improvement.", + "change_layer": "harness", + "base_variant_id": "baseline_default", + "git_commit": "HEAD", + "env_overrides": { + "V2_FIXTURE_VARIANT_KIND": "long_context_guarded" + }, + "notes": "Use only in fixture_trace long-context smoke. This variant is a deterministic simulation helper for V2.4." +} diff --git a/tests/evals/v2/verification-reports/v2_4_long_context_2026-05-03T055334949Z.json b/tests/evals/v2/verification-reports/v2_4_long_context_2026-05-03T055334949Z.json new file mode 100644 index 0000000000..85cb6bda78 --- /dev/null +++ b/tests/evals/v2/verification-reports/v2_4_long_context_2026-05-03T055334949Z.json @@ -0,0 +1,9 @@ +{ + "verification_id": "v2_4_long_context_2026-05-03T055334949Z", + "generated_at": "2026-05-03T05:53:34.959Z", + "passed": true, + "inspected_summary_ref": "tests\\evals\\v2\\experiment-runs\\v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.json", + "batch_report_ref": "ObservrityTask\\10-系统版本\\v2\\06-运行报告\\batch_experiment_v2_4_long_context_fixture_smoke_2026-05-03T054818236Z.md", + "long_context_review_verdict": "needs_manual_review", + "scenario_row_count": 4 +} From fe5dfa588e9562fa1c80804c1a2e89082b04853a Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sat, 9 May 2026 01:18:44 +0800 Subject: [PATCH 18/26] Add deep action observability report pipeline --- scripts/observability/deep_explain_action.ps1 | 96 ++++ 
scripts/observability/deep_explain_action.ts | 449 ++++++++++++++++++ scripts/observability/lib/artifact_tracker.ts | 75 +++ .../observability/lib/deep_action_types.ts | 211 ++++++++ .../observability/lib/deep_report_writer.ts | 177 +++++++ .../observability/lib/mermaid_rich_graph.ts | 86 ++++ scripts/observability/lib/phase_infer.ts | 279 +++++++++++ scripts/observability/lib/snapshot_reader.ts | 75 +++ .../observability/lib/tool_use_extractor.ts | 292 ++++++++++++ 9 files changed, 1740 insertions(+) create mode 100644 scripts/observability/deep_explain_action.ps1 create mode 100644 scripts/observability/deep_explain_action.ts create mode 100644 scripts/observability/lib/artifact_tracker.ts create mode 100644 scripts/observability/lib/deep_action_types.ts create mode 100644 scripts/observability/lib/deep_report_writer.ts create mode 100644 scripts/observability/lib/mermaid_rich_graph.ts create mode 100644 scripts/observability/lib/phase_infer.ts create mode 100644 scripts/observability/lib/snapshot_reader.ts create mode 100644 scripts/observability/lib/tool_use_extractor.ts diff --git a/scripts/observability/deep_explain_action.ps1 b/scripts/observability/deep_explain_action.ps1 new file mode 100644 index 0000000000..581f391523 --- /dev/null +++ b/scripts/observability/deep_explain_action.ps1 @@ -0,0 +1,96 @@ +param( + [string]$UserActionId, + [switch]$Latest, + [string]$OutputDir +) + +$ErrorActionPreference = "Stop" + +$repoRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot) +$duckdbExe = Join-Path $repoRoot "tools\duckdb\duckdb.exe" +$dbPath = Join-Path $repoRoot ".observability\observability_v1.duckdb" +$bunExe = "bun" + +if (-not (Test-Path -LiteralPath $duckdbExe)) { + throw "DuckDB executable not found at $duckdbExe" +} + +if (-not (Test-Path -LiteralPath $dbPath)) { + throw "DuckDB database not found at $dbPath" +} + +if ([string]::IsNullOrWhiteSpace($UserActionId)) { + $Latest = $true +} + +function Resolve-ShortId { + param([string]$Value) + if 
([string]::IsNullOrWhiteSpace($Value)) { return "latest" } + if ($Value.Length -le 8) { return $Value } + return $Value.Substring(0, 8) +} + +function Resolve-LatestUserActionId { + $snapshotDir = Join-Path $repoRoot ".observability\v1-report-db-snapshots" + [System.IO.Directory]::CreateDirectory($snapshotDir) | Out-Null + $tempDb = Join-Path $snapshotDir ("deep_explain_action_ps1_{0}.duckdb" -f ([DateTimeOffset]::UtcNow.ToUnixTimeMilliseconds())) + try { + Copy-Item -LiteralPath $dbPath -Destination $tempDb -Force + $rows = & $duckdbExe -json $tempDb "select user_action_id from user_actions order by started_at_ms desc limit 1;" + $parsed = $rows | ConvertFrom-Json + if ($parsed -is [System.Array]) { + return $parsed[0].user_action_id + } + return $parsed.user_action_id + } finally { + if (Test-Path -LiteralPath $tempDb) { + Remove-Item -LiteralPath $tempDb -Force + } + } +} + +if ([string]::IsNullOrWhiteSpace($OutputDir)) { + if ($Latest) { + $UserActionId = Resolve-LatestUserActionId + $Latest = $false + } + $targetId = Resolve-ShortId $UserActionId + $OutputDir = Join-Path $repoRoot ("ObservrityTask\action-reports\deep\user_action_{0}" -f $targetId) +} elseif (-not [System.IO.Path]::IsPathRooted($OutputDir)) { + $OutputDir = Join-Path $repoRoot $OutputDir +} + +[System.IO.Directory]::CreateDirectory($OutputDir) | Out-Null + +$baselineReportPath = Join-Path $OutputDir "baseline_action_report.md" +$tsArgs = @( + "run", + (Join-Path $repoRoot "scripts\observability\deep_explain_action.ts") +) +if ($Latest) { + $tsArgs += "--latest" +} else { + $tsArgs += @("--user-action-id", $UserActionId) +} +$tsArgs += @("--output-dir", $OutputDir, "--baseline-report-path", $baselineReportPath) + +& $bunExe @tsArgs +if ($LASTEXITCODE -ne 0) { + throw "deep_explain_action.ts failed." 
+} + +$explainArgs = @( + "-ExecutionPolicy", "Bypass", + "-File", (Join-Path $repoRoot "scripts\observability\explain_action.ps1"), + "-OutputPath", $baselineReportPath +) +if ($Latest) { + $explainArgs += "-Latest" +} else { + $explainArgs += @("-UserActionId", $UserActionId) +} +$explainArgs += "-SnapshotDb" + +powershell @explainArgs | Out-Null + +Write-Output ("Generated deep action report: {0}" -f (Join-Path $OutputDir "deep_report.md")) diff --git a/scripts/observability/deep_explain_action.ts b/scripts/observability/deep_explain_action.ts new file mode 100644 index 0000000000..69ce886da6 --- /dev/null +++ b/scripts/observability/deep_explain_action.ts @@ -0,0 +1,449 @@ +import { spawnSync } from "node:child_process" +import { copyFileSync, existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs" +import { join, resolve } from "node:path" +import { buildArtifactChain, enrichToolPaths } from "./lib/artifact_tracker" +import { writeDeepReport } from "./lib/deep_report_writer" +import type { + ActionRow, + ArtifactRecord, + EventRow, + EvidenceRecord, + IntegrityRow, + JsonValue, + PhaseRecord, + QueryRow, + RichToolCall, + SnapshotIndexRow, + SnapshotRecord, + SubagentRow, + ToolRow, + TurnRow, +} from "./lib/deep_action_types" +import { buildDebugChainFlow, buildRichStageFlow } from "./lib/mermaid_rich_graph" +import { inferPhases } from "./lib/phase_infer" +import { SnapshotReader } from "./lib/snapshot_reader" +import { buildRichToolCalls } from "./lib/tool_use_extractor" + +const repoRoot = resolve(import.meta.dir, "..", "..") +const duckdbExe = join(repoRoot, "tools", "duckdb", "duckdb.exe") +const dbPath = join(repoRoot, ".observability", "observability_v1.duckdb") +const dbSnapshotDir = join(repoRoot, ".observability", "v1-report-db-snapshots") + +function fail(message: string): never { + console.error(message) + process.exit(1) +} + +function parseArgs(argv: string[]): { + userActionId?: string + latest: boolean + outputDir?: string + 
baselineReportPath?: string +} { + const parsed = { latest: false } as { + userActionId?: string + latest: boolean + outputDir?: string + baselineReportPath?: string + } + for (let index = 0; index < argv.length; index += 1) { + const current = argv[index] + if (current === "--user-action-id") parsed.userActionId = argv[++index] + if (current === "--latest") parsed.latest = true + if (current === "--output-dir") parsed.outputDir = argv[++index] + if (current === "--baseline-report-path") parsed.baselineReportPath = argv[++index] + } + if (!parsed.userActionId) parsed.latest = true + return parsed +} + +function sqlLiteral(value: string): string { + return `'${value.replaceAll("'", "''")}'` +} + +function runDuckDbJson(databasePath: string, sql: string): T[] { + const result = spawnSync(duckdbExe, ["-json", databasePath, sql], { + cwd: repoRoot, + encoding: "utf8", + maxBuffer: 1024 * 1024 * 128, + }) + if (result.status !== 0) { + fail(result.stderr?.trim() || result.stdout?.trim() || "duckdb query failed") + } + const raw = result.stdout.trim() + return raw ? 
(JSON.parse(raw) as T[]) : [] +} + +function createDbSnapshot(): string { + mkdirSync(dbSnapshotDir, { recursive: true }) + const tempDbPath = join(dbSnapshotDir, `deep_explain_action.${process.pid}.${Date.now()}.duckdb`) + copyFileSync(dbPath, tempDbPath) + return tempDbPath +} + +function parseJsonValue(value: string | null): JsonValue | null { + if (!value) return null + try { + return JSON.parse(value) as JsonValue + } catch { + return null + } +} + +function toBoolean(value: unknown): boolean | null { + if (value === null || value === undefined) return null + if (typeof value === "boolean") return value + if (typeof value === "number") return value !== 0 + if (typeof value === "string") { + const lowered = value.toLowerCase() + if (lowered === "true") return true + if (lowered === "false") return false + } + return null +} + +function csvEscape(value: string | number | boolean | null | undefined): string { + const text = value === null || value === undefined ? "" : String(value) + if (/[",\n]/u.test(text)) { + return `"${text.replaceAll('"', '""')}"` + } + return text +} + +function toCsv(headers: string[], rows: Array>): string { + return [ + headers.join(","), + ...rows.map(row => row.map(csvEscape).join(",")), + ].join("\n") +} + +function shortId(value: string | null | undefined): string { + if (!value) return "null" + return value.length <= 8 ? 
value : value.slice(0, 8) +} + +function pickLatestUserActionId(databasePath: string): string { + const rows = runDuckDbJson<{ user_action_id: string }>( + databasePath, + "select user_action_id from user_actions order by started_at_ms desc limit 1;", + ) + if (rows.length === 0) { + fail("no user actions found") + } + return rows[0]!.user_action_id +} + +function collectResponseSnapshotsByTurn( + events: EventRow[], + snapshotReader: SnapshotReader, +): Map { + const result = new Map() + for (const event of events) { + if (event.event_name !== "api.stream.completed") continue + const payload = parseJsonValue(event.payload_json) + if (!payload || typeof payload !== "object" || Array.isArray(payload)) continue + const snapshotRef = typeof payload.response_snapshot_ref === "string" ? payload.response_snapshot_ref : null + if (!snapshotRef) continue + const key = `${event.effective_query_id ?? event.query_id ?? "unknown"}|${event.turn_id ?? "unknown"}` + const list = result.get(key) ?? [] + list.push(snapshotReader.read(snapshotRef)) + result.set(key, list) + } + return result +} + +function buildEvidenceIndex(params: { + events: EventRow[] + snapshots: Map +}): EvidenceRecord[] { + const rows: EvidenceRecord[] = [] + let index = 0 + + for (const event of params.events) { + const refs = (parseJsonValue(event.snapshot_refs_json) as string[] | null) ?? [] + for (const ref of refs) { + const snapshot = params.snapshots.get(ref) + if (!snapshot) continue + const data = snapshot.data + const extractedFields = + data && typeof data === "object" && !Array.isArray(data) + ? Object.keys(data).slice(0, 8) + : [] + const summary = + snapshot.category === "response" + ? "response snapshot with assistant text/tool_use blocks" + : snapshot.category === "state_after_turn" + ? "after-turn state snapshot" + : snapshot.category === "state_before_turn" + ? "before-turn state snapshot" + : snapshot.category ?? 
"snapshot" + index += 1 + rows.push({ + evidence_id: `e${String(index).padStart(3, "0")}`, + snapshot_ref: ref, + category: snapshot.category, + query_id: event.effective_query_id ?? event.query_id, + turn_id: event.turn_id, + extracted_fields: extractedFields, + summary, + }) + } + } + + return rows +} + +function main(): void { + if (!existsSync(duckdbExe)) fail(`DuckDB executable not found: ${duckdbExe}`) + if (!existsSync(dbPath)) fail(`DuckDB database not found: ${dbPath}`) + + const args = parseArgs(process.argv.slice(2)) + const tempDbPath = createDbSnapshot() + + try { + const userActionId = args.userActionId ?? pickLatestUserActionId(tempDbPath) + const actionIdSql = sqlLiteral(userActionId) + const action = runDuckDbJson( + tempDbPath, + `select * from user_actions where user_action_id = ${actionIdSql};`, + )[0] + if (!action) fail(`user action not found: ${userActionId}`) + + const integrity = runDuckDbJson( + tempDbPath, + `select * from metrics_integrity_daily where event_date = ${sqlLiteral(action.event_date)};`, + )[0] ?? 
null + const queries = runDuckDbJson( + tempDbPath, + `select query_id, user_action_id, query_source, subagent_id, subagent_reason, subagent_trigger_kind, subagent_trigger_detail, agent_name, source_group, started_at, started_at_ms, ended_at, ended_at_ms, duration_ms, turn_count, query_max_loop_iter, tool_call_count, terminal_reason, strict_is_complete, inferred_is_complete from queries where user_action_id = ${actionIdSql} order by started_at_ms asc;`, + ) + const turns = runDuckDbJson( + tempDbPath, + `select query_id, turn_id, agent_name, query_source, started_at, started_at_ms, ended_at, ended_at_ms, duration_ms, loop_iter_start, loop_iter_end, tool_call_count, stop_reason, transition_out, termination_reason, strict_is_closed, inferred_is_closed from turns where user_action_id = ${actionIdSql} order by started_at_ms asc;`, + ) + const tools = runDuckDbJson( + tempDbPath, + `select tool_call_id, query_id, turn_id, subagent_id, tool_name, detected_at, detected_at_ms, started_at, started_at_ms, completed_at, completed_at_ms, duration_ms, success, failure_reason from tools where user_action_id = ${actionIdSql} order by detected_at_ms asc;`, + ).map(tool => ({ + ...tool, + success: toBoolean(tool.success), + })) + const subagents = runDuckDbJson( + tempDbPath, + `select subagent_id, query_id, subagent_type, subagent_reason, subagent_trigger_kind, subagent_trigger_detail, query_source, agent_name, source_group, spawned_at, spawned_at_ms, completed_at, completed_at_ms, duration_ms from subagents where user_action_id = ${actionIdSql} order by spawned_at_ms asc;`, + ) + const events = runDuckDbJson( + tempDbPath, + `select event_name, ts_wall, ts_wall_ms, query_id, effective_query_id, turn_id, tool_call_id, subagent_id, payload_json, snapshot_refs_json from events_raw where user_action_id = ${actionIdSql} order by ts_wall_ms asc, event_idx asc;`, + ) + + const snapshotRefs = new Set() + for (const event of events) { + const refs = 
(parseJsonValue(event.snapshot_refs_json) as string[] | null) ?? [] + for (const ref of refs) snapshotRefs.add(ref) + } + const snapshotIndex = new Map() + if (snapshotRefs.size > 0) { + for (const row of runDuckDbJson( + tempDbPath, + "select snapshot_ref, file_name, relative_path, absolute_path, exists, size_bytes, sha256, referenced_count, first_event_ts, last_event_ts, category from snapshots_index;", + )) { + if (snapshotRefs.has(row.snapshot_ref)) { + snapshotIndex.set(row.snapshot_ref, row) + } + } + } + + const snapshotReader = new SnapshotReader(repoRoot, snapshotIndex) + const snapshots = new Map() + for (const ref of snapshotRefs) { + snapshots.set(ref, snapshotReader.read(ref)) + } + + const turnsByQueryTurn = new Map() + for (const turn of turns) { + turnsByQueryTurn.set(`${turn.query_id}|${turn.turn_id}`, { agent_name: turn.agent_name }) + } + + const responseSnapshotsByTurn = collectResponseSnapshotsByTurn(events, snapshotReader) + const richTools = enrichToolPaths( + buildRichToolCalls({ tools, events, turnsByQueryTurn, responseSnapshotsByTurn }), + ) + const phases = inferPhases({ action, queries, turns, tools: richTools }) + const phaseByToolId = new Map() + for (const phase of phases) { + for (const toolCallId of phase.tool_call_ids) { + phaseByToolId.set(toolCallId, phase) + } + } + const artifacts = buildArtifactChain(richTools, phaseByToolId) + const evidence = buildEvidenceIndex({ events, snapshots }) + + const outputDir = + args.outputDir ?? 
+ join(repoRoot, "ObservrityTask", "action-reports", "deep", `user_action_${shortId(userActionId)}`) + mkdirSync(outputDir, { recursive: true }) + + const richMermaid = buildRichStageFlow(phases) + const debugMermaid = buildDebugChainFlow(phases) + const richMermaidPath = join(outputDir, "rich_stage_flow.mmd") + const debugMermaidPath = join(outputDir, "debug_chain_flow.mmd") + writeFileSync(richMermaidPath, richMermaid, "utf8") + writeFileSync(debugMermaidPath, debugMermaid, "utf8") + + writeFileSync( + join(outputDir, "phase_timeline_mapping.csv"), + toCsv( + [ + "phase_id", + "phase_name", + "start_local", + "end_local", + "duration_ms", + "query_ids", + "turn_range", + "tool_counts", + "main_outputs", + "problems", + "evidence_refs", + ], + phases.map(phase => [ + phase.phase_id, + phase.phase_name, + phase.start_local, + phase.end_local, + phase.duration_ms, + phase.query_ids.join(";"), + phase.turn_ids.join(";"), + Object.entries(phase.tool_counts) + .map(([name, count]) => `${name}:${count}`) + .join(";"), + phase.main_outputs.join(" | "), + phase.problems.join(" | "), + phase.evidence_refs.join(";"), + ]), + ), + "utf8", + ) + + writeFileSync( + join(outputDir, "tool_calls_rich.csv"), + toCsv( + [ + "query_id", + "agent_name", + "turn_id", + "tool_name", + "detected_at", + "completed_at", + "duration_ms", + "success", + "input_summary", + "output_summary", + "command_or_path", + "intent_inferred", + "produced_files", + "touched_files", + "snapshot_refs", + ], + richTools.map(tool => [ + tool.query_id, + tool.agent_name, + tool.turn_id, + tool.tool_name, + tool.detected_at, + tool.completed_at, + tool.duration_ms, + tool.success, + tool.input_summary, + tool.output_summary, + tool.command_or_path, + tool.intent_inferred, + tool.produced_files.join(";"), + tool.touched_files.join(";"), + tool.snapshot_refs.join(";"), + ]), + ), + "utf8", + ) + + writeFileSync( + join(outputDir, "artifact_chain.csv"), + toCsv( + [ + "artifact_path", + "artifact_type", + 
"first_seen_phase", + "created_by_tool", + "modified_by_tools", + "evidence_refs", + ], + artifacts.map((artifact: ArtifactRecord) => [ + artifact.artifact_path, + artifact.artifact_type, + artifact.first_seen_phase, + artifact.created_by_tool, + artifact.modified_by_tools.join(";"), + artifact.evidence_refs.join(";"), + ]), + ), + "utf8", + ) + + writeFileSync( + join(outputDir, "snapshot_evidence_index.csv"), + toCsv( + ["evidence_id", "snapshot_ref", "category", "query_id", "turn_id", "extracted_fields", "summary"], + evidence.map((item: EvidenceRecord) => [ + item.evidence_id, + item.snapshot_ref, + item.category, + item.query_id, + item.turn_id, + item.extracted_fields.join(";"), + item.summary, + ]), + ), + "utf8", + ) + + const report = writeDeepReport({ + action, + integrity, + queries, + subagents, + phases, + tools: richTools, + artifacts, + evidence, + richMermaidPath: "rich_stage_flow.mmd", + debugMermaidPath: "debug_chain_flow.mmd", + baselineReportPath: args.baselineReportPath ? 
"baseline_action_report.md" : null, + }) + writeFileSync(join(outputDir, "deep_report.md"), report, "utf8") + + console.log( + JSON.stringify( + { + userActionId, + outputDir, + files: [ + "deep_report.md", + "rich_stage_flow.mmd", + "debug_chain_flow.mmd", + "phase_timeline_mapping.csv", + "tool_calls_rich.csv", + "artifact_chain.csv", + "snapshot_evidence_index.csv", + ], + }, + null, + 2, + ), + ) + } finally { + rmSync(tempDbPath, { force: true }) + } +} + +main() diff --git a/scripts/observability/lib/artifact_tracker.ts b/scripts/observability/lib/artifact_tracker.ts new file mode 100644 index 0000000000..05df298425 --- /dev/null +++ b/scripts/observability/lib/artifact_tracker.ts @@ -0,0 +1,75 @@ +import type { ArtifactRecord, PhaseRecord, RichToolCall } from "./deep_action_types" + +const PATH_PATTERN = + /([A-Za-z]:\\[^\s"'`|<>]+|\/[^\s"'`|<>]+|(?:\.{0,2}\/)?[\w.-]+(?:\/[\w.-]+)*\.(?:docx|pptx|txt|json|py|js|ts|ps1|csv|md))/gu + +function unique(values: T[]): T[] { + return [...new Set(values)] +} + +function extractPaths(text: string): string[] { + return unique([...text.matchAll(PATH_PATTERN)].map(match => match[1] ?? "").filter(Boolean)) +} + +function classifyArtifact(path: string): string { + const lowered = path.toLowerCase() + if (/\.(py|js|ts|ps1)$/u.test(lowered)) return "script" + if (/\.(pptx)$/u.test(lowered)) return "final" + if (/\.(docx)$/u.test(lowered)) return "input" + if (/\.(md|csv|json|txt)$/u.test(lowered)) return lowered.includes("report") ? 
"report" : "intermediate" + return "other" +} + +export function enrichToolPaths(tools: RichToolCall[]): RichToolCall[] { + return tools.map(tool => { + const discovered = extractPaths(`${tool.command_or_path}\n${tool.input_summary}\n${tool.output_summary}`) + const touched = unique([...tool.touched_files, ...discovered]) + const produced = unique([ + ...tool.produced_files, + ...discovered.filter(path => /save|write|export|generate|create/iu.test(tool.command_or_path)), + ]) + return { + ...tool, + touched_files: touched, + produced_files: produced, + } + }) +} + +export function buildArtifactChain( + tools: RichToolCall[], + phasesByToolId: Map, +): ArtifactRecord[] { + const artifacts = new Map() + + for (const tool of tools) { + const phase = phasesByToolId.get(tool.tool_call_id) + const phaseId = phase?.phase_id ?? "unknown" + const everyPath = unique([...tool.touched_files, ...tool.produced_files]) + for (const path of everyPath) { + const existing = artifacts.get(path) + if (!existing) { + artifacts.set(path, { + artifact_path: path, + artifact_type: classifyArtifact(path), + first_seen_phase: phaseId, + created_by_tool: tool.produced_files.includes(path) ? tool.tool_name : "", + modified_by_tools: tool.touched_files.includes(path) ? 
[tool.tool_name] : [], + evidence_refs: [...tool.evidence_refs], + }) + continue + } + if (!existing.created_by_tool && tool.produced_files.includes(path)) { + existing.created_by_tool = tool.tool_name + } + if (tool.touched_files.includes(path)) { + existing.modified_by_tools = unique([...existing.modified_by_tools, tool.tool_name]) + } + existing.evidence_refs = unique([...existing.evidence_refs, ...tool.evidence_refs]) + } + } + + return [...artifacts.values()].sort((left, right) => + left.artifact_path.localeCompare(right.artifact_path), + ) +} diff --git a/scripts/observability/lib/deep_action_types.ts b/scripts/observability/lib/deep_action_types.ts new file mode 100644 index 0000000000..352db0c0a5 --- /dev/null +++ b/scripts/observability/lib/deep_action_types.ts @@ -0,0 +1,211 @@ +export type JsonValue = + | null + | boolean + | number + | string + | JsonValue[] + | { [key: string]: JsonValue } + +export type ActionRow = { + user_action_id: string + event_date: string + started_at: string + started_at_ms: number + ended_at: string + ended_at_ms: number + duration_ms: number + query_count: number + subagent_count: number + tool_call_count: number + total_prompt_input_tokens: number + total_billed_tokens: number + main_thread_total_prompt_input_tokens: number + subagent_total_prompt_input_tokens: number +} + +export type IntegrityRow = Record + +export type QueryRow = { + query_id: string + user_action_id: string + query_source: string | null + subagent_id: string | null + subagent_reason: string | null + subagent_trigger_kind: string | null + subagent_trigger_detail: string | null + agent_name: string | null + source_group: string | null + started_at: string + started_at_ms: number + ended_at: string | null + ended_at_ms: number | null + duration_ms: number | null + turn_count: number + query_max_loop_iter: number | null + tool_call_count: number + terminal_reason: string | null + strict_is_complete: boolean | null + inferred_is_complete: boolean | null +} + 
+export type TurnRow = { + query_id: string + turn_id: string + agent_name: string | null + query_source: string | null + started_at: string + started_at_ms: number + ended_at: string | null + ended_at_ms: number | null + duration_ms: number | null + loop_iter_start: number | null + loop_iter_end: number | null + tool_call_count: number + stop_reason: string | null + transition_out: string | null + termination_reason: string | null + strict_is_closed: boolean | null + inferred_is_closed: boolean | null +} + +export type ToolRow = { + tool_call_id: string + query_id: string | null + turn_id: string | null + subagent_id: string | null + tool_name: string | null + detected_at: string | null + detected_at_ms: number | null + started_at: string | null + started_at_ms: number | null + completed_at: string | null + completed_at_ms: number | null + duration_ms: number | null + success: boolean | null + failure_reason: string | null +} + +export type SubagentRow = { + subagent_id: string + query_id: string | null + subagent_type: string | null + subagent_reason: string | null + subagent_trigger_kind: string | null + subagent_trigger_detail: string | null + query_source: string | null + agent_name: string | null + source_group: string | null + spawned_at: string | null + spawned_at_ms: number | null + completed_at: string | null + completed_at_ms: number | null + duration_ms: number | null +} + +export type EventRow = { + event_name: string + ts_wall: string + ts_wall_ms: number | null + query_id: string | null + effective_query_id: string | null + turn_id: string | null + tool_call_id: string | null + subagent_id: string | null + payload_json: string | null + snapshot_refs_json: string | null +} + +export type SnapshotIndexRow = { + snapshot_ref: string + file_name: string + relative_path: string + absolute_path: string + exists: boolean + size_bytes: number | null + sha256: string | null + referenced_count: number + first_event_ts: string | null + last_event_ts: string | 
null + category: string | null +} + +export type SnapshotRecord = { + snapshotRef: string + category: string | null + exists: boolean + absolutePath: string + data: JsonValue | null + warnings: string[] +} + +export type ToolInputSemantics = { + toolUseId: string + toolName: string + inputSummary: string + commandOrPath: string + touchedFiles: string[] + producedFiles: string[] + assistantTextSummary: string + promptSummary: string + rawInput: JsonValue | null +} + +export type RichToolCall = { + tool_call_id: string + query_id: string | null + agent_name: string | null + turn_id: string | null + tool_name: string + detected_at: string | null + completed_at: string | null + duration_ms: number | null + success: boolean | null + input_summary: string + output_summary: string + command_or_path: string + intent_inferred: string + produced_files: string[] + touched_files: string[] + snapshot_refs: string[] + evidence_refs: string[] + warnings: string[] + prompt_summary: string +} + +export type PhaseRecord = { + phase_id: string + phase_name: string + stage_kind: "input" | "main" | "subagent" | "compact" | "script" | "issue" | "fix" | "output" + start_local: string + end_local: string + duration_ms: number + start_ms: number + end_ms: number + query_ids: string[] + turn_ids: string[] + tool_counts: Record + main_outputs: string[] + problems: string[] + fixes: string[] + evidence_refs: string[] + tool_call_ids: string[] +} + +export type ArtifactRecord = { + artifact_path: string + artifact_type: string + first_seen_phase: string + created_by_tool: string + modified_by_tools: string[] + evidence_refs: string[] +} + +export type EvidenceRecord = { + evidence_id: string + snapshot_ref: string + category: string | null + query_id: string | null + turn_id: string | null + extracted_fields: string[] + summary: string +} diff --git a/scripts/observability/lib/deep_report_writer.ts b/scripts/observability/lib/deep_report_writer.ts new file mode 100644 index 
0000000000..982c727d3b --- /dev/null +++ b/scripts/observability/lib/deep_report_writer.ts @@ -0,0 +1,177 @@ +import type { + ActionRow, + ArtifactRecord, + EvidenceRecord, + IntegrityRow, + PhaseRecord, + QueryRow, + RichToolCall, + SubagentRow, +} from "./deep_action_types" + +function unique(values: T[]): T[] { + return [...new Set(values)] +} + +function shortId(value: string | null | undefined): string { + if (!value) return "null" + return value.length <= 8 ? value : value.slice(0, 8) +} + +function table(headers: string[], rows: string[][]): string[] { + return [ + `| ${headers.join(" | ")} |`, + `| ${headers.map(() => "---").join(" | ")} |`, + ...rows.map(row => `| ${row.join(" | ")} |`), + ] +} + +export function writeDeepReport(params: { + action: ActionRow + integrity: IntegrityRow | null + queries: QueryRow[] + subagents: SubagentRow[] + phases: PhaseRecord[] + tools: RichToolCall[] + artifacts: ArtifactRecord[] + evidence: EvidenceRecord[] + richMermaidPath: string + debugMermaidPath: string + baselineReportPath: string | null +}): string { + const missingSnapshotCount = params.tools.filter(tool => + tool.warnings.some(warning => warning.includes("snapshot")), + ).length + const confidence = missingSnapshotCount === 0 ? "high" : missingSnapshotCount < 5 ? "medium" : "low" + const summary = `This action expanded into ${params.action.query_count} queries, ${params.action.subagent_count} subagents, and ${params.phases.length} inferred phases with ${params.action.tool_call_count} tool calls.` + const lines: string[] = [ + "# Deep Action Report", + "", + "## 1. 一句话总结", + "", + summary, + "", + "## 2. 
Basics", + "", + `- user_action_id: ${params.action.user_action_id}`, + `- utc: ${params.action.started_at} -> ${params.action.ended_at}`, + `- duration_ms: ${params.action.duration_ms}`, + `- query_count: ${params.action.query_count}`, + `- subagent_count: ${params.action.subagent_count}`, + `- tool_call_count: ${params.action.tool_call_count}`, + `- total_prompt_input_tokens: ${params.action.total_prompt_input_tokens}`, + `- total_billed_tokens: ${params.action.total_billed_tokens}`, + "", + ] + + if (params.integrity) { + lines.push("## 3. Integrity Snapshot", "") + for (const [key, value] of Object.entries(params.integrity)) { + lines.push(`- ${key}: ${value ?? ""}`) + } + lines.push("") + } + + lines.push("## 4. Query / Agent 分工", "") + for (const query of params.queries) { + lines.push( + `- ${query.agent_name ?? "unknown"} ${shortId(query.query_id)}: turns=${query.turn_count}, tools=${query.tool_call_count}, duration_ms=${query.duration_ms ?? ""}, terminal=${query.terminal_reason ?? ""}`, + ) + } + for (const subagent of params.subagents) { + lines.push( + `- subagent ${shortId(subagent.subagent_id)}: ${subagent.subagent_reason ?? ""}, duration_ms=${subagent.duration_ms ?? ""}, child_query=${shortId(subagent.query_id)}`, + ) + } + lines.push("") + + lines.push("## 5. 阶段级时间线", "") + lines.push( + ...table( + ["phase", "time", "queries", "turns", "tools", "outputs", "problems", "evidence"], + params.phases.map(phase => [ + phase.phase_name, + `${phase.start_local} -> ${phase.end_local}`, + phase.query_ids.map(shortId).join(", "), + unique(phase.turn_ids).join(", "), + Object.entries(phase.tool_counts) + .map(([name, count]) => `${name} x${count}`) + .join("; "), + (phase.main_outputs[0] ?? "").replaceAll("|", "\\|"), + (phase.problems[0] ?? "").replaceAll("|", "\\|"), + phase.evidence_refs.slice(0, 2).join("
"), + ]), + ), + ) + lines.push("") + + lines.push("## 6. 富证据复杂 DAG", "") + lines.push(`- rich stage flow: ${params.richMermaidPath}`) + lines.push(`- debug chain flow: ${params.debugMermaidPath}`) + if (params.baselineReportPath) { + lines.push(`- baseline explain_action report: ${params.baselineReportPath}`) + } + lines.push("") + + lines.push("## 7. 工具调用语义复盘", "") + for (const tool of params.tools.slice(0, 20)) { + lines.push( + `- ${tool.tool_name} ${shortId(tool.tool_call_id)} @ ${tool.turn_id ?? "no-turn"}: ${tool.input_summary}; output=${tool.output_summary}; intent=${tool.intent_inferred}; evidence=${tool.evidence_refs[0] ?? "none"}`, + ) + } + if (params.tools.length > 20) { + lines.push(`- ... ${params.tools.length - 20} more tool calls in tool_calls_rich.csv`) + } + lines.push("") + + lines.push("## 8. 文件产物链", "") + for (const artifact of params.artifacts.slice(0, 20)) { + lines.push( + `- ${artifact.artifact_path}: type=${artifact.artifact_type}, first_seen_phase=${artifact.first_seen_phase}, created_by=${artifact.created_by_tool || "unknown"}, modified_by=${artifact.modified_by_tools.join(", ") || "none"}`, + ) + } + if (params.artifacts.length > 20) { + lines.push(`- ... ${params.artifacts.length - 20} more artifacts in artifact_chain.csv`) + } + lines.push("") + + lines.push("## 9. 问题与修复链", "") + const issueTools = params.tools.filter( + tool => tool.success === false || tool.intent_inferred === "repair" || tool.warnings.length > 0, + ) + if (issueTools.length === 0) { + lines.push("- no dense repair chain detected") + } else { + for (const tool of issueTools.slice(0, 20)) { + lines.push( + `- ${tool.tool_name} ${shortId(tool.tool_call_id)}: ${tool.output_summary}; warnings=${tool.warnings.join("; ") || "none"}`, + ) + } + } + lines.push("") + + lines.push("## 10. 
Snapshot 证据索引", "") + lines.push( + ...table( + ["evidence_id", "category", "query", "turn", "fields", "summary"], + params.evidence.slice(0, 20).map(item => [ + item.evidence_id, + item.category ?? "", + shortId(item.query_id), + item.turn_id ?? "", + item.extracted_fields.join(", "), + item.summary.replaceAll("|", "\\|"), + ]), + ), + ) + if (params.evidence.length > 20) { + lines.push("", `More evidence rows: ${params.evidence.length - 20} omitted from report; see snapshot_evidence_index.csv`) + } + lines.push("", "## 11. 缺失信息与可信度", "") + lines.push(`- confidence: ${confidence}`) + lines.push(`- missing_snapshot_tool_calls: ${missingSnapshotCount}`) + if (missingSnapshotCount > 0) { + lines.push("- some tool parameters or results could not be reconstructed because response/state snapshots were missing in V1 facts") + } + return lines.join("\n") +} diff --git a/scripts/observability/lib/mermaid_rich_graph.ts b/scripts/observability/lib/mermaid_rich_graph.ts new file mode 100644 index 0000000000..764c5e2707 --- /dev/null +++ b/scripts/observability/lib/mermaid_rich_graph.ts @@ -0,0 +1,86 @@ +import type { PhaseRecord } from "./deep_action_types" + +function esc(text: string): string { + return text.replaceAll('"', "'") +} + +function label(phase: PhaseRecord): string { + const toolSummary = Object.entries(phase.tool_counts) + .map(([name, count]) => `${name} x${count}`) + .join(" + ") + return esc( + [ + phase.phase_name, + `${phase.start_local} -> ${phase.end_local}`, + `duration ${phase.duration_ms}ms`, + phase.turn_ids.length > 0 ? `turns ${phase.turn_ids.join(",")}` : "", + toolSummary ? `tools ${toolSummary}` : "", + phase.main_outputs[0] ? `output ${phase.main_outputs[0]}` : "", + phase.problems[0] ? `problem ${phase.problems[0]}` : "", + phase.fixes[0] ? `fix ${phase.fixes[0]}` : "", + ] + .filter(Boolean) + .join("
"), + ) +} + +function className(kind: PhaseRecord["stage_kind"]): string { + return kind +} + +export function buildRichStageFlow(phases: PhaseRecord[]): string { + const lines = [ + "flowchart TD", + " classDef input fill:#eef6ff,stroke:#1d4ed8,color:#0f172a", + " classDef main fill:#ecfdf5,stroke:#15803d,color:#052e16", + " classDef subagent fill:#fff7ed,stroke:#c2410c,color:#431407", + " classDef compact fill:#f5f3ff,stroke:#7c3aed,color:#2e1065", + " classDef script fill:#fef3c7,stroke:#b45309,color:#451a03", + " classDef issue fill:#fff1f2,stroke:#e11d48,color:#4c0519", + " classDef fix fill:#eff6ff,stroke:#0891b2,color:#082f49", + " classDef output fill:#f0fdf4,stroke:#16a34a,color:#14532d", + ] + + phases.forEach((phase, index) => { + const nodeId = `P${index + 1}` + lines.push(` ${nodeId}["${label(phase)}"]`) + lines.push(` class ${nodeId} ${className(phase.stage_kind)}`) + if (index > 0) { + lines.push(` P${index} --> ${nodeId}`) + } + }) + + return lines.join("\n") +} + +export function buildDebugChainFlow(phases: PhaseRecord[]): string { + const debugPhases = phases.filter( + phase => + phase.problems.length > 0 || + phase.fixes.length > 0 || + phase.phase_name === "repair" || + phase.stage_kind === "issue" || + phase.stage_kind === "fix", + ) + const lines = [ + "flowchart TD", + " classDef issue fill:#fff1f2,stroke:#e11d48,color:#4c0519", + " classDef fix fill:#eff6ff,stroke:#0891b2,color:#082f49", + " classDef output fill:#f0fdf4,stroke:#16a34a,color:#14532d", + ] + + debugPhases.forEach((phase, index) => { + const nodeId = `D${index + 1}` + lines.push(` ${nodeId}["${label(phase)}"]`) + lines.push(` class ${nodeId} ${phase.stage_kind === "fix" ? "fix" : phase.problems.length > 0 ? 
"issue" : "output"}`) + if (index > 0) { + lines.push(` D${index} --> ${nodeId}`) + } + }) + + if (debugPhases.length === 0) { + lines.push(' D1["no dense repair chain detected"]') + lines.push(" class D1 output") + } + return lines.join("\n") +} diff --git a/scripts/observability/lib/phase_infer.ts b/scripts/observability/lib/phase_infer.ts new file mode 100644 index 0000000000..377501aa43 --- /dev/null +++ b/scripts/observability/lib/phase_infer.ts @@ -0,0 +1,279 @@ +import type { ActionRow, PhaseRecord, QueryRow, RichToolCall, TurnRow } from "./deep_action_types" + +type Seed = { + name: string + kind: PhaseRecord["stage_kind"] + startMs: number + endMs: number + queryId: string | null + turnId: string | null + toolName: string | null + toolCallId: string | null + output: string + problem: string + fix: string + evidenceRefs: string[] +} + +function localText(value: number): string { + return new Date(value).toLocaleString("sv-SE").replace("T", " ") +} + +function inferPhaseName(tool: RichToolCall): { name: string; kind: PhaseRecord["stage_kind"] } { + const haystack = `${tool.tool_name} ${tool.input_summary} ${tool.command_or_path} ${tool.prompt_summary} ${tool.agent_name ?? ""}`.toLowerCase() + if (haystack.includes("compact")) return { name: "compact", kind: "compact" } + if (haystack.includes("docx") || haystack.includes("python-docx") || haystack.includes("word")) { + return { name: "thesis_parse", kind: tool.agent_name === "main_thread" ? "main" : "subagent" } + } + if (haystack.includes("pptx") || haystack.includes("template") || haystack.includes("python-pptx")) { + return { name: "template_parse", kind: tool.agent_name === "main_thread" ? 
"main" : "subagent" } + } + if (haystack.includes("word/media") || haystack.includes("zipfile")) { + return { name: "media_extract", kind: "subagent" } + } + if (haystack.includes("blip") || haystack.includes("caption") || haystack.includes("image")) { + return { name: "image_caption_map", kind: "subagent" } + } + if (haystack.includes("pptxgenjs") || haystack.includes("generate_ppt") || haystack.includes("create_ppt")) { + return { name: "deck_build", kind: "script" } + } + if (haystack.includes("overlap") || haystack.includes("out-of-bounds") || haystack.includes("check")) { + return { name: "layout_check", kind: "issue" } + } + if (haystack.includes("readonly") || haystack.includes("lock") || haystack.includes("copy2") || haystack.includes("save")) { + return { name: "ppt_save_fix", kind: "fix" } + } + if (tool.tool_name === "Agent") return { name: "spawn_subagents", kind: "main" } + if (tool.tool_name === "Read" || tool.tool_name === "Grep" || tool.tool_name === "Glob") { + return { name: tool.agent_name === "main_thread" ? "initial_read" : "subagent_work", kind: tool.agent_name === "main_thread" ? "input" : "subagent" } + } + if (tool.tool_name === "Write" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) { + return { name: "script_generation", kind: "script" } + } + if (tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) { + return { name: "script_execution", kind: "script" } + } + if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit") { + return { name: "repair", kind: "fix" } + } + if (tool.tool_name === "Task") return { name: "completion", kind: "output" } + if (tool.agent_name && tool.agent_name !== "main_thread") { + return { name: "subagent_work", kind: "subagent" } + } + return { name: "main_preparation", kind: "main" } +} + +function appendCount(target: Record, key: string): void { + target[key] = (target[key] ?? 
0) + 1 +} + +function mergeSeeds(seeds: Seed[]): PhaseRecord[] { + if (seeds.length === 0) { + return [] + } + const sorted = [...seeds].sort((left, right) => left.startMs - right.startMs) + const phases: PhaseRecord[] = [] + let current: PhaseRecord | null = null + + for (const seed of sorted) { + const shouldMerge = + current && + current.phase_name === seed.name && + current.stage_kind === seed.kind && + seed.startMs - current.end_ms < 90_000 + + if (!shouldMerge) { + current = { + phase_id: `phase_${String(phases.length + 1).padStart(2, "0")}`, + phase_name: seed.name, + stage_kind: seed.kind, + start_local: localText(seed.startMs), + end_local: localText(seed.endMs), + duration_ms: Math.max(seed.endMs - seed.startMs, 0), + start_ms: seed.startMs, + end_ms: seed.endMs, + query_ids: seed.queryId ? [seed.queryId] : [], + turn_ids: seed.turnId ? [seed.turnId] : [], + tool_counts: {}, + main_outputs: seed.output ? [seed.output] : [], + problems: seed.problem ? [seed.problem] : [], + fixes: seed.fix ? [seed.fix] : [], + evidence_refs: [...seed.evidenceRefs], + tool_call_ids: seed.toolCallId ? 
[seed.toolCallId] : [], + } + if (seed.toolName) { + appendCount(current.tool_counts, seed.toolName) + } + phases.push(current) + continue + } + + current.end_ms = Math.max(current.end_ms, seed.endMs) + current.end_local = localText(current.end_ms) + current.duration_ms = Math.max(current.end_ms - current.start_ms, 0) + if (seed.queryId && !current.query_ids.includes(seed.queryId)) current.query_ids.push(seed.queryId) + if (seed.turnId && !current.turn_ids.includes(seed.turnId)) current.turn_ids.push(seed.turnId) + if (seed.toolName) appendCount(current.tool_counts, seed.toolName) + if (seed.output && !current.main_outputs.includes(seed.output)) current.main_outputs.push(seed.output) + if (seed.problem && !current.problems.includes(seed.problem)) current.problems.push(seed.problem) + if (seed.fix && !current.fixes.includes(seed.fix)) current.fixes.push(seed.fix) + for (const ref of seed.evidenceRefs) { + if (!current.evidence_refs.includes(ref)) current.evidence_refs.push(ref) + } + if (seed.toolCallId && !current.tool_call_ids.includes(seed.toolCallId)) { + current.tool_call_ids.push(seed.toolCallId) + } + } + + return phases +} + +function coalescePhases(phases: PhaseRecord[]): PhaseRecord[] { + const merged = new Map() + const order: string[] = [] + + for (const phase of phases) { + const key = `${phase.phase_name}|${phase.stage_kind}` + const existing = merged.get(key) + if (!existing) { + merged.set(key, { + ...phase, + query_ids: [...phase.query_ids], + turn_ids: [...phase.turn_ids], + tool_counts: { ...phase.tool_counts }, + main_outputs: [...phase.main_outputs], + problems: [...phase.problems], + fixes: [...phase.fixes], + evidence_refs: [...phase.evidence_refs], + tool_call_ids: [...phase.tool_call_ids], + }) + order.push(key) + continue + } + + existing.start_ms = Math.min(existing.start_ms, phase.start_ms) + existing.end_ms = Math.max(existing.end_ms, phase.end_ms) + existing.start_local = localText(existing.start_ms) + existing.end_local = 
localText(existing.end_ms) + existing.duration_ms = Math.max(existing.end_ms - existing.start_ms, 0) + for (const queryId of phase.query_ids) { + if (!existing.query_ids.includes(queryId)) existing.query_ids.push(queryId) + } + for (const turnId of phase.turn_ids) { + if (!existing.turn_ids.includes(turnId)) existing.turn_ids.push(turnId) + } + for (const [toolName, count] of Object.entries(phase.tool_counts)) { + existing.tool_counts[toolName] = (existing.tool_counts[toolName] ?? 0) + count + } + for (const output of phase.main_outputs) { + if (!existing.main_outputs.includes(output)) existing.main_outputs.push(output) + } + for (const problem of phase.problems) { + if (!existing.problems.includes(problem)) existing.problems.push(problem) + } + for (const fix of phase.fixes) { + if (!existing.fixes.includes(fix)) existing.fixes.push(fix) + } + for (const ref of phase.evidence_refs) { + if (!existing.evidence_refs.includes(ref)) existing.evidence_refs.push(ref) + } + for (const toolCallId of phase.tool_call_ids) { + if (!existing.tool_call_ids.includes(toolCallId)) existing.tool_call_ids.push(toolCallId) + } + } + + return order.map((key, index) => ({ + ...merged.get(key)!, + phase_id: `phase_${String(index + 1).padStart(2, "0")}`, + })) +} + +export function inferPhases(params: { + action: ActionRow + queries: QueryRow[] + turns: TurnRow[] + tools: RichToolCall[] +}): PhaseRecord[] { + const seeds: Seed[] = [] + const firstTool = [...params.tools] + .filter(tool => tool.detected_at) + .sort((left, right) => Date.parse(left.detected_at ?? "") - Date.parse(right.detected_at ?? ""))[0] + + if (firstTool?.detected_at) { + seeds.push({ + name: "action_start", + kind: "input", + startMs: params.action.started_at_ms, + endMs: Date.parse(firstTool.detected_at), + queryId: params.queries[0]?.query_id ?? null, + turnId: params.turns[0]?.turn_id ?? 
null, + toolName: null, + toolCallId: null, + output: "entered action", + problem: "", + fix: "", + evidenceRefs: [], + }) + } + + for (const tool of params.tools) { + const startMs = tool.detected_at ? Date.parse(tool.detected_at) : params.action.started_at_ms + const endMs = tool.completed_at ? Date.parse(tool.completed_at) : startMs + const inferred = inferPhaseName(tool) + const failed = tool.success === false ? tool.output_summary : "" + const fix = inferred.kind === "fix" ? tool.input_summary : "" + seeds.push({ + name: inferred.name, + kind: inferred.kind, + startMs, + endMs, + queryId: tool.query_id, + turnId: tool.turn_id, + toolName: tool.tool_name, + toolCallId: tool.tool_call_id, + output: tool.produced_files[0] ?? tool.output_summary, + problem: failed, + fix, + evidenceRefs: tool.evidence_refs, + }) + } + + if (params.queries.some(query => (query.query_source ?? "").includes("compact"))) { + const compactQueries = params.queries.filter(query => + (query.query_source ?? "").includes("compact"), + ) + for (const query of compactQueries) { + seeds.push({ + name: "compact", + kind: "compact", + startMs: query.started_at_ms, + endMs: query.ended_at_ms ?? query.started_at_ms, + queryId: query.query_id, + turnId: null, + toolName: null, + toolCallId: null, + output: query.terminal_reason ?? "", + problem: "", + fix: "", + evidenceRefs: [], + }) + } + } + + seeds.push({ + name: "completion", + kind: "output", + startMs: params.action.ended_at_ms, + endMs: params.action.ended_at_ms, + queryId: params.queries.at(-1)?.query_id ?? null, + turnId: params.turns.at(-1)?.turn_id ?? 
null, + toolName: null, + toolCallId: null, + output: "action completed", + problem: "", + fix: "", + evidenceRefs: [], + }) + + return coalescePhases(mergeSeeds(seeds)) +} diff --git a/scripts/observability/lib/snapshot_reader.ts b/scripts/observability/lib/snapshot_reader.ts new file mode 100644 index 0000000000..eb38d24674 --- /dev/null +++ b/scripts/observability/lib/snapshot_reader.ts @@ -0,0 +1,75 @@ +import { existsSync, readFileSync } from "node:fs" +import { resolve } from "node:path" +import type { JsonValue, SnapshotIndexRow, SnapshotRecord } from "./deep_action_types" + +function inferCategory(snapshotRef: string): string | null { + const lowered = snapshotRef.toLowerCase() + if (lowered.includes("request")) return "request" + if (lowered.includes("response")) return "response" + if (lowered.includes("state.snapshot.after_turn")) return "state_after_turn" + if (lowered.includes("state.snapshot.before_turn")) return "state_before_turn" + if (lowered.includes("messages.")) return "messages_stage" + return null +} + +export class SnapshotReader { + private readonly cache = new Map() + + constructor( + private readonly repoRoot: string, + private readonly snapshotIndex = new Map(), + ) {} + + read(snapshotRef: string): SnapshotRecord { + const cached = this.cache.get(snapshotRef) + if (cached) { + return cached + } + + const indexed = this.snapshotIndex.get(snapshotRef) + const absolutePath = + indexed?.absolute_path ?? resolve(this.repoRoot, snapshotRef.replaceAll("/", "\\")) + const category = indexed?.category ?? 
inferCategory(snapshotRef) + const warnings: string[] = [] + + if (!existsSync(absolutePath)) { + const record: SnapshotRecord = { + snapshotRef, + category, + exists: false, + absolutePath, + data: null, + warnings: [`missing snapshot: ${snapshotRef}`], + } + this.cache.set(snapshotRef, record) + return record + } + + try { + const data = JSON.parse(readFileSync(absolutePath, "utf8")) as JsonValue + const record: SnapshotRecord = { + snapshotRef, + category, + exists: true, + absolutePath, + data, + warnings, + } + this.cache.set(snapshotRef, record) + return record + } catch (error) { + const record: SnapshotRecord = { + snapshotRef, + category, + exists: true, + absolutePath, + data: null, + warnings: [ + `failed to parse snapshot ${snapshotRef}: ${error instanceof Error ? error.message : String(error)}`, + ], + } + this.cache.set(snapshotRef, record) + return record + } + } +} diff --git a/scripts/observability/lib/tool_use_extractor.ts b/scripts/observability/lib/tool_use_extractor.ts new file mode 100644 index 0000000000..fea52a9ee5 --- /dev/null +++ b/scripts/observability/lib/tool_use_extractor.ts @@ -0,0 +1,292 @@ +import type { + EventRow, + JsonValue, + RichToolCall, + SnapshotRecord, + ToolInputSemantics, + ToolRow, +} from "./deep_action_types" + +function asRecord(value: JsonValue | null): Record | null { + if (!value || typeof value !== "object" || Array.isArray(value)) { + return null + } + return value as Record +} + +function asArray(value: JsonValue | null | undefined): JsonValue[] { + return Array.isArray(value) ? value : [] +} + +function stringifyValue(value: JsonValue | null | undefined, maxLength = 180): string { + if (value === null || value === undefined) { + return "" + } + if (typeof value === "string") { + return value.length > maxLength ? `${value.slice(0, maxLength - 3)}...` : value + } + const serialized = JSON.stringify(value) + return serialized.length > maxLength + ? 
`${serialized.slice(0, maxLength - 3)}...` + : serialized +} + +function summarizeTextBlocks(messages: JsonValue[]): string { + const chunks: string[] = [] + for (const item of messages) { + const record = asRecord(item) + const message = asRecord(record?.message as JsonValue) + for (const content of asArray(message?.content)) { + const contentRecord = asRecord(content) + if (contentRecord?.type === "text" && typeof contentRecord.text === "string") { + chunks.push(contentRecord.text.trim()) + } + } + } + const merged = chunks.join(" ").replace(/\s+/gu, " ").trim() + return merged.length > 240 ? `${merged.slice(0, 237)}...` : merged +} + +function extractPromptSummary(toolName: string, input: Record | null): string { + if (!input) { + return "" + } + if (toolName === "Agent") { + const prompt = typeof input.prompt === "string" ? input.prompt : "" + return prompt.length > 200 ? `${prompt.slice(0, 197)}...` : prompt + } + if (toolName === "Write") { + const content = typeof input.content === "string" ? input.content : "" + return content.length > 200 ? `${content.slice(0, 197)}...` : content + } + if (toolName === "Edit" || toolName === "MultiEdit") { + const newString = typeof input.new_string === "string" ? input.new_string : "" + return newString.length > 200 ? 
`${newString.slice(0, 197)}...` : newString + } + return "" +} + +function extractPathsFromInput(toolName: string, input: Record | null): { + commandOrPath: string + touchedFiles: string[] + producedFiles: string[] + inputSummary: string +} { + if (!input) { + return { commandOrPath: "", touchedFiles: [], producedFiles: [], inputSummary: "" } + } + + const getPath = (...keys: string[]): string => { + for (const key of keys) { + if (typeof input[key] === "string") { + return input[key] as string + } + } + return "" + } + + switch (toolName) { + case "Agent": { + const description = stringifyValue(input.description) + const prompt = stringifyValue(input.prompt, 120) + const background = input.run_in_background === true ? "background" : "foreground" + return { + commandOrPath: description, + touchedFiles: [], + producedFiles: [], + inputSummary: `description=${description}; prompt=${prompt}; mode=${background}`, + } + } + case "Bash": { + const command = getPath("command") + const description = stringifyValue(input.description, 100) + return { + commandOrPath: command, + touchedFiles: [], + producedFiles: [], + inputSummary: `command=${stringifyValue(command, 160)}; description=${description}`, + } + } + case "Read": + case "Grep": + case "Glob": { + const path = getPath("file_path", "path", "pattern") + return { + commandOrPath: path, + touchedFiles: path ? [path] : [], + producedFiles: [], + inputSummary: stringifyValue(input), + } + } + case "Write": { + const filePath = getPath("file_path", "path") + return { + commandOrPath: filePath, + touchedFiles: filePath ? [filePath] : [], + producedFiles: filePath ? [filePath] : [], + inputSummary: `file=${filePath}; content=${stringifyValue(input.content, 120)}`, + } + } + case "Edit": + case "MultiEdit": { + const filePath = getPath("file_path", "path") + return { + commandOrPath: filePath, + touchedFiles: filePath ? 
[filePath] : [], + producedFiles: [], + inputSummary: `file=${filePath}; old=${stringifyValue(input.old_string, 80)}; new=${stringifyValue(input.new_string, 80)}`, + } + } + case "Task": { + return { + commandOrPath: stringifyValue(input.subagent_type), + touchedFiles: [], + producedFiles: [], + inputSummary: stringifyValue(input), + } + } + default: + return { + commandOrPath: stringifyValue(input, 140), + touchedFiles: [], + producedFiles: [], + inputSummary: stringifyValue(input), + } + } +} + +export function extractToolUsesFromResponse(snapshot: SnapshotRecord): Map { + const result = new Map() + const data = asRecord(snapshot.data) + if (!data) { + return result + } + + const assistantMessages = asArray(data.assistantMessages) + const textSummary = summarizeTextBlocks(assistantMessages) + const toolBlocks = asArray(data.toolUseBlocks) + + for (const block of toolBlocks) { + const record = asRecord(block) + const toolUseId = typeof record?.id === "string" ? record.id : "" + const toolName = typeof record?.name === "string" ? record.name : "unknown" + if (!toolUseId) { + continue + } + const input = asRecord((record?.input ?? null) as JsonValue) + const semantics = extractPathsFromInput(toolName, input) + result.set(toolUseId, { + toolUseId, + toolName, + inputSummary: semantics.inputSummary, + commandOrPath: semantics.commandOrPath, + touchedFiles: semantics.touchedFiles, + producedFiles: semantics.producedFiles, + assistantTextSummary: textSummary, + promptSummary: extractPromptSummary(toolName, input), + rawInput: (record?.input ?? null) as JsonValue, + }) + } + + return result +} + +function inferIntent(toolName: string, inputSummary: string, commandOrPath: string, agentName: string | null): string { + const haystack = `${toolName} ${inputSummary} ${commandOrPath} ${agentName ?? 
""}`.toLowerCase() + if (haystack.includes("compact")) return "compact" + if (toolName === "Agent") return "spawn_subagent" + if (toolName === "Write" || toolName === "Edit" || toolName === "MultiEdit") return "modify_files" + if (toolName === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(commandOrPath)) return "run_script" + if (toolName === "Read" || toolName === "Grep" || toolName === "Glob") return "inspect_inputs" + if (haystack.includes("check") || haystack.includes("inspect") || haystack.includes("verify")) return "inspect_outputs" + if (haystack.includes("fix") || haystack.includes("replace") || haystack.includes("patch")) return "repair" + return "other" +} + +function summarizeOutput(tool: ToolRow, eventByToolId: Map): { summary: string; warnings: string[] } { + const warnings: string[] = [] + if (tool.success === false) { + return { + summary: tool.failure_reason ? `failed: ${tool.failure_reason}` : "failed", + warnings, + } + } + if (tool.success === true) { + return { summary: "completed", warnings } + } + const events = eventByToolId.get(tool.tool_call_id) ?? [] + const failedEvent = events.find(event => event.event_name === "tool.execution.failed") + if (failedEvent?.payload_json) { + return { summary: failedEvent.payload_json.slice(0, 160), warnings } + } + warnings.push("missing tool execution result summary in V1 facts") + return { summary: "result summary unavailable", warnings } +} + +export function buildRichToolCalls(params: { + tools: ToolRow[] + events: EventRow[] + turnsByQueryTurn: Map + responseSnapshotsByTurn: Map +}): RichToolCall[] { + const eventByToolId = new Map() + for (const event of params.events) { + if (!event.tool_call_id) { + continue + } + const list = eventByToolId.get(event.tool_call_id) ?? 
[] + list.push(event) + eventByToolId.set(event.tool_call_id, list) + } + + const extractedByTurn = new Map>() + for (const [turnKey, snapshots] of params.responseSnapshotsByTurn) { + const collected = new Map() + for (const snapshot of snapshots) { + for (const [id, semantics] of extractToolUsesFromResponse(snapshot)) { + collected.set(id, semantics) + } + } + extractedByTurn.set(turnKey, collected) + } + + return params.tools.map(tool => { + const turnKey = `${tool.query_id ?? "unknown"}|${tool.turn_id ?? "unknown"}` + const extracted = extractedByTurn.get(turnKey)?.get(tool.tool_call_id) + const output = summarizeOutput(tool, eventByToolId) + const agentName = params.turnsByQueryTurn.get(turnKey)?.agent_name ?? null + const toolName = tool.tool_name ?? extracted?.toolName ?? "unknown" + const evidenceRefs = [ + ...(params.responseSnapshotsByTurn.get(turnKey)?.map(snapshot => snapshot.snapshotRef) ?? []), + ] + if (!extracted) { + output.warnings.push("missing response snapshot tool_use block") + } + return { + tool_call_id: tool.tool_call_id, + query_id: tool.query_id, + agent_name: agentName, + turn_id: tool.turn_id, + tool_name: toolName, + detected_at: tool.detected_at, + completed_at: tool.completed_at, + duration_ms: tool.duration_ms, + success: tool.success, + input_summary: extracted?.inputSummary ?? "input unavailable", + output_summary: output.summary, + command_or_path: extracted?.commandOrPath ?? "", + intent_inferred: inferIntent( + toolName, + extracted?.inputSummary ?? "", + extracted?.commandOrPath ?? "", + agentName, + ), + produced_files: extracted?.producedFiles ?? [], + touched_files: extracted?.touchedFiles ?? [], + snapshot_refs: evidenceRefs, + evidence_refs: evidenceRefs, + warnings: output.warnings, + prompt_summary: extracted?.promptSummary ?? 
"", + } satisfies RichToolCall + }) +} From 30173427d9728c1fd385ae297178054eb102f28d Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sat, 9 May 2026 02:10:39 +0800 Subject: [PATCH 19/26] Enhance observability action Mermaid reports --- ObservrityTask/action-reports/deep/README.md | 117 ++++ scripts/observability/deep_explain_action.ps1 | 21 +- scripts/observability/deep_explain_action.ts | 217 +++++-- scripts/observability/lib/artifact_tracker.ts | 91 ++- .../observability/lib/deep_action_types.ts | 51 ++ .../observability/lib/deep_report_writer.ts | 238 ++++--- .../observability/lib/mermaid_rich_graph.ts | 319 ++++++++-- scripts/observability/lib/phase_infer.ts | 598 +++++++++++------- .../lib/repair_chain_detector.ts | 147 +++++ .../lib/tool_result_extractor.ts | 404 ++++++++++++ .../observability/lib/tool_use_extractor.ts | 7 + 11 files changed, 1766 insertions(+), 444 deletions(-) create mode 100644 ObservrityTask/action-reports/deep/README.md create mode 100644 scripts/observability/lib/repair_chain_detector.ts create mode 100644 scripts/observability/lib/tool_result_extractor.ts diff --git a/ObservrityTask/action-reports/deep/README.md b/ObservrityTask/action-reports/deep/README.md new file mode 100644 index 0000000000..887d22638e --- /dev/null +++ b/ObservrityTask/action-reports/deep/README.md @@ -0,0 +1,117 @@ +# Deep Action Reports + +## What This Folder Is + +This folder contains V1.1 deep reports for a single `user_action_id`. 
+ +Each action output normally includes: + +- `deep_report.md` +- `rich_stage_flow.mmd` +- `debug_chain_flow.mmd` +- `phase_timeline_mapping.csv` +- `tool_calls_rich.csv` +- `artifact_chain.csv` +- `snapshot_evidence_index.csv` + +## Simple Action vs Complex Action + +`simple action` usually means one of these: + +- a very short action with `tool_call_count <= 3` +- an interrupted action +- an observability self-run action such as `explain_action` or `deep_explain_action` +- a task that never entered a real script -> check -> edit -> rerun loop + +`complex action` usually means: + +- many turns and many tools +- multiple scripts or script versions +- file artifacts that are created, checked, modified, and regenerated +- visible repair loops such as `Bash failed -> Edit -> Bash rerun -> verification` + +## Why `-Latest` May Pick The Wrong Action + +`-Latest` simply selects the newest action in the V1 DuckDB tables. + +That is often not the task you want. It can easily be: + +- an observability/debug command action +- a self-run of `explain_action.ps1` +- a `deep_explain_action.ps1` validation run + +For that reason the report adds a warning when the selection mode is `latest`. + +## Prefer Explicit `UserActionId` + +Use explicit selection when validating a real complex task: + +```powershell +powershell -ExecutionPolicy Bypass -File scripts/observability/deep_explain_action.ps1 ` + -UserActionId 0e05fe1b-ece6-4f6b-9f90-b862e0e88308 +``` + +Use `-Latest` only for quick smoke checks: + +```powershell +powershell -ExecutionPolicy Bypass -File scripts/observability/deep_explain_action.ps1 -Latest +``` + +## How To Read The Outputs + +Read in this order: + +1. `deep_report.md` +2. `rich_stage_flow.mmd` +3. `debug_chain_flow.mmd` +4. 
CSV files for drill-down + +`deep_report.md` is the main narrative view: + +- basics and selection mode +- warning if `latest` likely selected a self-run action +- phase-by-phase reason / action / result / artifacts / evidence + +`rich_stage_flow.mmd` is the main DAG: + +- action summary node +- query/subagent overview nodes +- one `subgraph` per phase +- tool nodes inside each phase +- artifact nodes +- evidence nodes +- cross-phase artifact flow and repair hints + +`debug_chain_flow.mmd` is the repair-focused DAG: + +- problem +- root cause guess +- fix actions +- rerun or verification +- resolved vs unresolved status + +`tool_calls_rich.csv` is the detailed tool ledger: + +- Bash command +- Write/Edit input +- after-turn or related snapshot result summaries +- detected problem / fix signal + +`phase_timeline_mapping.csv` is the phase timeline: + +- phase ids +- summaries +- tool ids +- primary artifacts +- evidence refs + +## Recommended Validation Pattern + +Use two samples: + +- one simple/self-run sample to validate warning behavior +- one explicit complex `user_action_id` to validate rich DAG generation + +The complex PPT sample used during this repair pass was: + +- `0e05fe1b-ece6-4f6b-9f90-b862e0e88308` diff --git a/scripts/observability/deep_explain_action.ps1 b/scripts/observability/deep_explain_action.ps1 index 581f391523..34f9fa03b5 100644 --- a/scripts/observability/deep_explain_action.ps1 +++ b/scripts/observability/deep_explain_action.ps1 @@ -23,6 +23,8 @@ if ([string]::IsNullOrWhiteSpace($UserActionId)) { $Latest = $true } +$SelectedBy = if ($Latest) { "latest" } else { "explicit_user_action_id" } + function Resolve-ShortId { param([string]$Value) if ([string]::IsNullOrWhiteSpace($Value)) { return "latest" } @@ -52,7 +54,6 @@ function Resolve-LatestUserActionId { if ([string]::IsNullOrWhiteSpace($OutputDir)) { if ($Latest) { $UserActionId = Resolve-LatestUserActionId - $Latest = $false } $targetId = Resolve-ShortId $UserActionId $OutputDir = 
Join-Path $repoRoot ("ObservrityTask\action-reports\deep\user_action_{0}" -f $targetId) @@ -60,6 +61,10 @@ if ([string]::IsNullOrWhiteSpace($OutputDir)) { $OutputDir = Join-Path $repoRoot $OutputDir } +if ($Latest -and [string]::IsNullOrWhiteSpace($UserActionId)) { + $UserActionId = Resolve-LatestUserActionId +} + [System.IO.Directory]::CreateDirectory($OutputDir) | Out-Null $baselineReportPath = Join-Path $OutputDir "baseline_action_report.md" @@ -67,12 +72,12 @@ $tsArgs = @( "run", (Join-Path $repoRoot "scripts\observability\deep_explain_action.ts") ) -if ($Latest) { - $tsArgs += "--latest" -} else { +if (-not [string]::IsNullOrWhiteSpace($UserActionId)) { $tsArgs += @("--user-action-id", $UserActionId) +} elseif ($Latest) { + $tsArgs += "--latest" } -$tsArgs += @("--output-dir", $OutputDir, "--baseline-report-path", $baselineReportPath) +$tsArgs += @("--selected-by", $SelectedBy, "--output-dir", $OutputDir, "--baseline-report-path", $baselineReportPath) & $bunExe @tsArgs if ($LASTEXITCODE -ne 0) { @@ -84,10 +89,10 @@ $explainArgs = @( "-File", (Join-Path $repoRoot "scripts\observability\explain_action.ps1"), "-OutputPath", $baselineReportPath ) -if ($Latest) { - $explainArgs += "-Latest" -} else { +if (-not [string]::IsNullOrWhiteSpace($UserActionId)) { $explainArgs += @("-UserActionId", $UserActionId) +} elseif ($Latest) { + $explainArgs += "-Latest" } $explainArgs += "-SnapshotDb" diff --git a/scripts/observability/deep_explain_action.ts b/scripts/observability/deep_explain_action.ts index 69ce886da6..55398e19b2 100644 --- a/scripts/observability/deep_explain_action.ts +++ b/scripts/observability/deep_explain_action.ts @@ -10,18 +10,22 @@ import type { EvidenceRecord, IntegrityRow, JsonValue, - PhaseRecord, QueryRow, + RepairChain, RichToolCall, + SelectionMode, SnapshotIndexRow, SnapshotRecord, SubagentRow, ToolRow, TurnRow, + TurnSnapshotBundle, } from "./lib/deep_action_types" import { buildDebugChainFlow, buildRichStageFlow } from 
"./lib/mermaid_rich_graph" import { inferPhases } from "./lib/phase_infer" +import { detectRepairChains } from "./lib/repair_chain_detector" import { SnapshotReader } from "./lib/snapshot_reader" +import { enrichToolCallsWithResults } from "./lib/tool_result_extractor" import { buildRichToolCalls } from "./lib/tool_use_extractor" const repoRoot = resolve(import.meta.dir, "..", "..") @@ -39,12 +43,14 @@ function parseArgs(argv: string[]): { latest: boolean outputDir?: string baselineReportPath?: string + selectedBy?: SelectionMode } { const parsed = { latest: false } as { userActionId?: string latest: boolean outputDir?: string baselineReportPath?: string + selectedBy?: SelectionMode } for (let index = 0; index < argv.length; index += 1) { const current = argv[index] @@ -52,8 +58,12 @@ function parseArgs(argv: string[]): { if (current === "--latest") parsed.latest = true if (current === "--output-dir") parsed.outputDir = argv[++index] if (current === "--baseline-report-path") parsed.baselineReportPath = argv[++index] + if (current === "--selected-by") parsed.selectedBy = argv[++index] as SelectionMode } if (!parsed.userActionId) parsed.latest = true + if (!parsed.selectedBy) { + parsed.selectedBy = parsed.userActionId ? 
"explicit_user_action_id" : "latest" + } return parsed } @@ -111,10 +121,7 @@ function csvEscape(value: string | number | boolean | null | undefined): string } function toCsv(headers: string[], rows: Array>): string { - return [ - headers.join(","), - ...rows.map(row => row.map(csvEscape).join(",")), - ].join("\n") + return [headers.join(","), ...rows.map(row => row.map(csvEscape).join(","))].join("\n") } function shortId(value: string | null | undefined): string { @@ -127,29 +134,66 @@ function pickLatestUserActionId(databasePath: string): string { databasePath, "select user_action_id from user_actions order by started_at_ms desc limit 1;", ) - if (rows.length === 0) { - fail("no user actions found") - } + if (rows.length === 0) fail("no user actions found") return rows[0]!.user_action_id } -function collectResponseSnapshotsByTurn( +function relevantSnapshot(snapshot: SnapshotRecord): boolean { + return Boolean( + snapshot.category === "response" || + snapshot.category === "state_after_turn" || + snapshot.category === "state_before_turn" || + snapshot.category === "messages_stage", + ) +} + +function collectTurnSnapshotsByTurn( events: EventRow[], - snapshotReader: SnapshotReader, -): Map { - const result = new Map() + snapshots: Map, +): Map { + const bundles = new Map() for (const event of events) { - if (event.event_name !== "api.stream.completed") continue + const queryId = event.effective_query_id ?? event.query_id + if (!queryId || !event.turn_id) continue + const key = `${queryId}|${event.turn_id}` + const bundle = + bundles.get(key) ?? { + responseSnapshots: [], + relatedSnapshots: [], + afterTurnSnapshots: [], + } + const refs = (parseJsonValue(event.snapshot_refs_json) as string[] | null) ?? 
[] + for (const ref of refs) { + const snapshot = snapshots.get(ref) + if (!snapshot || !relevantSnapshot(snapshot)) continue + if (!bundle.relatedSnapshots.some(item => item.snapshotRef === snapshot.snapshotRef)) { + bundle.relatedSnapshots.push(snapshot) + } + if (snapshot.category === "response" && !bundle.responseSnapshots.some(item => item.snapshotRef === snapshot.snapshotRef)) { + bundle.responseSnapshots.push(snapshot) + } + if (snapshot.category === "state_after_turn" && !bundle.afterTurnSnapshots.some(item => item.snapshotRef === snapshot.snapshotRef)) { + bundle.afterTurnSnapshots.push(snapshot) + } + } + const payload = parseJsonValue(event.payload_json) - if (!payload || typeof payload !== "object" || Array.isArray(payload)) continue - const snapshotRef = typeof payload.response_snapshot_ref === "string" ? payload.response_snapshot_ref : null - if (!snapshotRef) continue - const key = `${event.effective_query_id ?? event.query_id ?? "unknown"}|${event.turn_id ?? "unknown"}` - const list = result.get(key) ?? [] - list.push(snapshotReader.read(snapshotRef)) - result.set(key, list) + if (payload && typeof payload === "object" && !Array.isArray(payload)) { + const responseRef = typeof payload.response_snapshot_ref === "string" ? 
payload.response_snapshot_ref : null + if (responseRef) { + const snapshot = snapshots.get(responseRef) + if (snapshot && !bundle.responseSnapshots.some(item => item.snapshotRef === snapshot.snapshotRef)) { + bundle.responseSnapshots.push(snapshot) + } + if (snapshot && !bundle.relatedSnapshots.some(item => item.snapshotRef === snapshot.snapshotRef)) { + bundle.relatedSnapshots.push(snapshot) + } + } + } + + bundles.set(key, bundle) } - return result + return bundles } function buildEvidenceIndex(params: { @@ -157,6 +201,7 @@ function buildEvidenceIndex(params: { snapshots: Map }): EvidenceRecord[] { const rows: EvidenceRecord[] = [] + const seen = new Set() let index = 0 for (const event of params.events) { @@ -164,19 +209,22 @@ function buildEvidenceIndex(params: { for (const ref of refs) { const snapshot = params.snapshots.get(ref) if (!snapshot) continue + const key = `${snapshot.snapshotRef}|${event.effective_query_id ?? event.query_id ?? "unknown"}|${event.turn_id ?? "unknown"}` + if (seen.has(key)) continue + seen.add(key) const data = snapshot.data const extractedFields = - data && typeof data === "object" && !Array.isArray(data) - ? Object.keys(data).slice(0, 8) - : [] + data && typeof data === "object" && !Array.isArray(data) ? Object.keys(data).slice(0, 8) : [] const summary = snapshot.category === "response" - ? "response snapshot with assistant text/tool_use blocks" + ? "response snapshot with assistant tool_use blocks" : snapshot.category === "state_after_turn" - ? "after-turn state snapshot" + ? "after-turn snapshot with state counters / tool aftermath" : snapshot.category === "state_before_turn" - ? "before-turn state snapshot" - : snapshot.category ?? "snapshot" + ? "before-turn snapshot" + : snapshot.category === "messages_stage" + ? "messages-stage snapshot with tool_result history" + : snapshot.category ?? 
"snapshot" index += 1 rows.push({ evidence_id: `e${String(index).padStart(3, "0")}`, @@ -193,6 +241,11 @@ function buildEvidenceIndex(params: { return rows } +function terminalReason(queries: QueryRow[]): string { + const reasons = [...new Set(queries.map(query => query.terminal_reason).filter(Boolean))] + return reasons.join(" | ") || "unknown" +} + function main(): void { if (!existsSync(duckdbExe)) fail(`DuckDB executable not found: ${duckdbExe}`) if (!existsSync(dbPath)) fail(`DuckDB database not found: ${dbPath}`) @@ -241,16 +294,20 @@ function main(): void { for (const event of events) { const refs = (parseJsonValue(event.snapshot_refs_json) as string[] | null) ?? [] for (const ref of refs) snapshotRefs.add(ref) + const payload = parseJsonValue(event.payload_json) + if (payload && typeof payload === "object" && !Array.isArray(payload)) { + const responseRef = typeof payload.response_snapshot_ref === "string" ? payload.response_snapshot_ref : null + if (responseRef) snapshotRefs.add(responseRef) + } } + const snapshotIndex = new Map() if (snapshotRefs.size > 0) { for (const row of runDuckDbJson( tempDbPath, "select snapshot_ref, file_name, relative_path, absolute_path, exists, size_bytes, sha256, referenced_count, first_event_ts, last_event_ts, category from snapshots_index;", )) { - if (snapshotRefs.has(row.snapshot_ref)) { - snapshotIndex.set(row.snapshot_ref, row) - } + if (snapshotRefs.has(row.snapshot_ref)) snapshotIndex.set(row.snapshot_ref, row) } } @@ -265,31 +322,56 @@ function main(): void { turnsByQueryTurn.set(`${turn.query_id}|${turn.turn_id}`, { agent_name: turn.agent_name }) } - const responseSnapshotsByTurn = collectResponseSnapshotsByTurn(events, snapshotReader) + const turnSnapshotsByKey = collectTurnSnapshotsByTurn(events, snapshots) + const responseSnapshotsByTurn = new Map( + [...turnSnapshotsByKey.entries()].map(([key, bundle]) => [key, bundle.responseSnapshots]), + ) + const baseRichTools = buildRichToolCalls({ + tools, + events, + 
turnsByQueryTurn, + responseSnapshotsByTurn, + }) const richTools = enrichToolPaths( - buildRichToolCalls({ tools, events, turnsByQueryTurn, responseSnapshotsByTurn }), + enrichToolCallsWithResults({ + tools: baseRichTools, + turnSnapshotsByKey, + }), ) const phases = inferPhases({ action, queries, turns, tools: richTools }) - const phaseByToolId = new Map() + const phaseByToolId = new Map() for (const phase of phases) { - for (const toolCallId of phase.tool_call_ids) { + for (const toolCallId of phase.phase_tool_call_ids) { phaseByToolId.set(toolCallId, phase) } } const artifacts = buildArtifactChain(richTools, phaseByToolId) const evidence = buildEvidenceIndex({ events, snapshots }) + const repairChains = detectRepairChains({ richTools, phases, artifacts }) const outputDir = args.outputDir ?? join(repoRoot, "ObservrityTask", "action-reports", "deep", `user_action_${shortId(userActionId)}`) mkdirSync(outputDir, { recursive: true }) - const richMermaid = buildRichStageFlow(phases) - const debugMermaid = buildDebugChainFlow(phases) - const richMermaidPath = join(outputDir, "rich_stage_flow.mmd") - const debugMermaidPath = join(outputDir, "debug_chain_flow.mmd") - writeFileSync(richMermaidPath, richMermaid, "utf8") - writeFileSync(debugMermaidPath, debugMermaid, "utf8") + const richMermaid = buildRichStageFlow({ + action, + queries, + subagents, + phases, + tools: richTools, + artifacts, + evidence, + repairChains, + }) + const debugMermaid = buildDebugChainFlow({ + repairChains, + tools: richTools, + artifacts, + evidence, + }) + writeFileSync(join(outputDir, "rich_stage_flow.mmd"), richMermaid, "utf8") + writeFileSync(join(outputDir, "debug_chain_flow.mmd"), debugMermaid, "utf8") writeFileSync( join(outputDir, "phase_timeline_mapping.csv"), @@ -297,19 +379,26 @@ function main(): void { [ "phase_id", "phase_name", + "stage_kind", "start_local", "end_local", "duration_ms", "query_ids", - "turn_range", + "turn_ids", "tool_counts", - "main_outputs", + "reason_summary", 
+ "action_summary", + "result_summary", + "primary_artifacts", "problems", + "fixes", + "phase_tool_call_ids", "evidence_refs", ], phases.map(phase => [ phase.phase_id, phase.phase_name, + phase.stage_kind, phase.start_local, phase.end_local, phase.duration_ms, @@ -318,8 +407,13 @@ function main(): void { Object.entries(phase.tool_counts) .map(([name, count]) => `${name}:${count}`) .join(";"), - phase.main_outputs.join(" | "), + phase.reason_summary, + phase.action_summary, + phase.result_summary, + phase.primary_artifacts.join(" | "), phase.problems.join(" | "), + phase.fixes.join(" | "), + phase.phase_tool_call_ids.join(";"), phase.evidence_refs.join(";"), ]), ), @@ -330,6 +424,7 @@ function main(): void { join(outputDir, "tool_calls_rich.csv"), toCsv( [ + "tool_call_id", "query_id", "agent_name", "turn_id", @@ -339,14 +434,23 @@ function main(): void { "duration_ms", "success", "input_summary", - "output_summary", "command_or_path", + "output_summary", + "stdout_summary", + "stderr_summary", + "error_summary", + "result_summary_rich", + "detected_problem", + "detected_fix_signal", "intent_inferred", "produced_files", "touched_files", + "result_files", "snapshot_refs", + "warnings", ], richTools.map(tool => [ + tool.tool_call_id, tool.query_id, tool.agent_name, tool.turn_id, @@ -356,12 +460,20 @@ function main(): void { tool.duration_ms, tool.success, tool.input_summary, - tool.output_summary, tool.command_or_path, + tool.output_summary, + tool.stdout_summary, + tool.stderr_summary, + tool.error_summary, + tool.result_summary_rich, + tool.detected_problem, + tool.detected_fix_signal, tool.intent_inferred, tool.produced_files.join(";"), tool.touched_files.join(";"), + tool.result_files.join(";"), tool.snapshot_refs.join(";"), + tool.warnings.join(";"), ]), ), "utf8", @@ -375,7 +487,11 @@ function main(): void { "artifact_type", "first_seen_phase", "created_by_tool", + "created_by_tool_call_id", + "created_by_phase_id", "modified_by_tools", + 
"modified_by_tool_call_ids", + "phase_ids", "evidence_refs", ], artifacts.map((artifact: ArtifactRecord) => [ @@ -383,7 +499,11 @@ function main(): void { artifact.artifact_type, artifact.first_seen_phase, artifact.created_by_tool, + artifact.created_by_tool_call_id, + artifact.created_by_phase_id, artifact.modified_by_tools.join(";"), + artifact.modified_by_tool_call_ids.join(";"), + artifact.phase_ids.join(";"), artifact.evidence_refs.join(";"), ]), ), @@ -416,6 +536,9 @@ function main(): void { tools: richTools, artifacts, evidence, + repairChains, + selectedBy: args.selectedBy ?? "explicit_user_action_id", + terminalReason: terminalReason(queries), richMermaidPath: "rich_stage_flow.mmd", debugMermaidPath: "debug_chain_flow.mmd", baselineReportPath: args.baselineReportPath ? "baseline_action_report.md" : null, @@ -426,7 +549,9 @@ function main(): void { JSON.stringify( { userActionId, + selectedBy: args.selectedBy ?? "explicit_user_action_id", outputDir, + repairChainCount: repairChains.length, files: [ "deep_report.md", "rich_stage_flow.mmd", diff --git a/scripts/observability/lib/artifact_tracker.ts b/scripts/observability/lib/artifact_tracker.ts index 05df298425..9ff6d73ad1 100644 --- a/scripts/observability/lib/artifact_tracker.ts +++ b/scripts/observability/lib/artifact_tracker.ts @@ -1,37 +1,79 @@ import type { ArtifactRecord, PhaseRecord, RichToolCall } from "./deep_action_types" -const PATH_PATTERN = - /([A-Za-z]:\\[^\s"'`|<>]+|\/[^\s"'`|<>]+|(?:\.{0,2}\/)?[\w.-]+(?:\/[\w.-]+)*\.(?:docx|pptx|txt|json|py|js|ts|ps1|csv|md))/gu +const FILE_PATTERN = + /([A-Za-z]:[\\/][^\s"'`<>|]+|(?:\.{1,2}[\\/])?[\w .-]+(?:[\\/][\w .-]+)*\.(?:docx|pptx|txt|json|py|js|ts|ps1|csv|md|xml|html|png|jpg|jpeg|svg|pdf|xlsx|output))/giu function unique(values: T[]): T[] { return [...new Set(values)] } +function normalizePath(path: string): string { + return path + .trim() + .replace(/^["']|["']$/gu, "") + .replace(/\\/gu, "/") + .replace(/^([A-Za-z]:)\/+/u, "$1/") + 
.replace(/([^:])\/{2,}/gu, "$1/") +} + +function isLikelyPath(path: string): boolean { + const normalized = normalizePath(path) + if (!normalized) return false + if (/[{}<>]/u.test(normalized)) return false + if (!/\.[A-Za-z0-9]{1,8}$/u.test(normalized)) return false + if (/^[A-Za-z]:$/u.test(normalized)) return false + if (normalized.startsWith("/") && normalized.split("/").length < 3) return false + return true +} + function extractPaths(text: string): string[] { - return unique([...text.matchAll(PATH_PATTERN)].map(match => match[1] ?? "").filter(Boolean)) + return unique( + [...text.matchAll(FILE_PATTERN)] + .map(match => normalizePath(match[1] ?? "")) + .filter(isLikelyPath), + ) } function classifyArtifact(path: string): string { - const lowered = path.toLowerCase() + const lowered = normalizePath(path).toLowerCase() if (/\.(py|js|ts|ps1)$/u.test(lowered)) return "script" if (/\.(pptx)$/u.test(lowered)) return "final" - if (/\.(docx)$/u.test(lowered)) return "input" - if (/\.(md|csv|json|txt)$/u.test(lowered)) return lowered.includes("report") ? 
"report" : "intermediate" + if (/\.(docx|pdf|txt)$/u.test(lowered)) return "input" + if (/\.(png|jpg|jpeg|svg)$/u.test(lowered)) return "media" + if (/\.(md|csv|json|xml|html|xlsx|output)$/u.test(lowered)) return "intermediate" return "other" } +function toolTouchesArtifact(tool: RichToolCall, path: string): boolean { + return tool.touched_files.includes(path) || tool.produced_files.includes(path) || tool.result_files.includes(path) +} + export function enrichToolPaths(tools: RichToolCall[]): RichToolCall[] { return tools.map(tool => { - const discovered = extractPaths(`${tool.command_or_path}\n${tool.input_summary}\n${tool.output_summary}`) - const touched = unique([...tool.touched_files, ...discovered]) - const produced = unique([ - ...tool.produced_files, - ...discovered.filter(path => /save|write|export|generate|create/iu.test(tool.command_or_path)), - ]) + const discovered = extractPaths( + [ + tool.command_or_path, + tool.input_summary, + tool.output_summary, + tool.stdout_summary, + tool.stderr_summary, + tool.result_summary_rich, + ] + .filter(Boolean) + .join("\n"), + ) + const touched = unique([...tool.touched_files, ...discovered].map(normalizePath).filter(isLikelyPath)) + const produced = unique( + [...tool.produced_files, ...tool.result_files] + .map(normalizePath) + .filter(isLikelyPath), + ) + const resultFiles = unique([...tool.result_files, ...discovered].map(normalizePath).filter(isLikelyPath)) return { ...tool, touched_files: touched, produced_files: produced, + result_files: resultFiles, } }) } @@ -45,31 +87,38 @@ export function buildArtifactChain( for (const tool of tools) { const phase = phasesByToolId.get(tool.tool_call_id) const phaseId = phase?.phase_id ?? 
"unknown" - const everyPath = unique([...tool.touched_files, ...tool.produced_files]) - for (const path of everyPath) { + const paths = unique([...tool.touched_files, ...tool.produced_files, ...tool.result_files].map(normalizePath).filter(isLikelyPath)) + for (const path of paths) { const existing = artifacts.get(path) + const produced = tool.produced_files.includes(path) || tool.result_files.includes(path) if (!existing) { artifacts.set(path, { artifact_path: path, artifact_type: classifyArtifact(path), first_seen_phase: phaseId, - created_by_tool: tool.produced_files.includes(path) ? tool.tool_name : "", - modified_by_tools: tool.touched_files.includes(path) ? [tool.tool_name] : [], + created_by_tool: produced ? tool.tool_name : "", + created_by_tool_call_id: produced ? tool.tool_call_id : null, + created_by_phase_id: produced ? phaseId : null, + modified_by_tools: toolTouchesArtifact(tool, path) ? [tool.tool_name] : [], + modified_by_tool_call_ids: toolTouchesArtifact(tool, path) ? [tool.tool_call_id] : [], + phase_ids: phaseId ? 
[phaseId] : [], evidence_refs: [...tool.evidence_refs], }) continue } - if (!existing.created_by_tool && tool.produced_files.includes(path)) { + if (!existing.created_by_tool && produced) { existing.created_by_tool = tool.tool_name + existing.created_by_tool_call_id = tool.tool_call_id + existing.created_by_phase_id = phaseId } - if (tool.touched_files.includes(path)) { + if (toolTouchesArtifact(tool, path)) { existing.modified_by_tools = unique([...existing.modified_by_tools, tool.tool_name]) + existing.modified_by_tool_call_ids = unique([...existing.modified_by_tool_call_ids, tool.tool_call_id]) } + existing.phase_ids = unique([...existing.phase_ids, phaseId]) existing.evidence_refs = unique([...existing.evidence_refs, ...tool.evidence_refs]) } } - return [...artifacts.values()].sort((left, right) => - left.artifact_path.localeCompare(right.artifact_path), - ) + return [...artifacts.values()].sort((left, right) => left.artifact_path.localeCompare(right.artifact_path)) } diff --git a/scripts/observability/lib/deep_action_types.ts b/scripts/observability/lib/deep_action_types.ts index 352db0c0a5..9a7bc62c7f 100644 --- a/scripts/observability/lib/deep_action_types.ts +++ b/scripts/observability/lib/deep_action_types.ts @@ -6,6 +6,8 @@ export type JsonValue = | JsonValue[] | { [key: string]: JsonValue } +export type SelectionMode = "latest" | "explicit_user_action_id" + export type ActionRow = { user_action_id: string event_date: string @@ -150,6 +152,20 @@ export type ToolInputSemantics = { rawInput: JsonValue | null } +export type ToolResultCandidate = { + tool_use_id: string | null + snapshot_ref: string + category: string | null + matched_by: "tool_use_id" | "turn_fallback" + text_summary: string + stdout_summary: string + stderr_summary: string + error_summary: string + status: string + result_files: string[] + warnings: string[] +} + export type RichToolCall = { tool_call_id: string query_id: string | null @@ -162,6 +178,13 @@ export type RichToolCall = { 
success: boolean | null input_summary: string output_summary: string + stdout_summary: string + stderr_summary: string + error_summary: string + result_summary_rich: string + detected_problem: string + detected_fix_signal: string + result_files: string[] command_or_path: string intent_inferred: string produced_files: string[] @@ -189,6 +212,11 @@ export type PhaseRecord = { fixes: string[] evidence_refs: string[] tool_call_ids: string[] + phase_tool_call_ids: string[] + primary_artifacts: string[] + reason_summary: string + action_summary: string + result_summary: string } export type ArtifactRecord = { @@ -196,7 +224,11 @@ export type ArtifactRecord = { artifact_type: string first_seen_phase: string created_by_tool: string + created_by_tool_call_id: string | null + created_by_phase_id: string | null modified_by_tools: string[] + modified_by_tool_call_ids: string[] + phase_ids: string[] evidence_refs: string[] } @@ -209,3 +241,22 @@ export type EvidenceRecord = { extracted_fields: string[] summary: string } + +export type RepairChain = { + chain_id: string + problem_summary: string + root_cause_guess: string + fix_actions: string[] + verification_summary: string + tool_call_ids: string[] + phase_ids: string[] + artifact_paths: string[] + evidence_refs: string[] + status: "resolved" | "unresolved" +} + +export type TurnSnapshotBundle = { + responseSnapshots: SnapshotRecord[] + relatedSnapshots: SnapshotRecord[] + afterTurnSnapshots: SnapshotRecord[] +} diff --git a/scripts/observability/lib/deep_report_writer.ts b/scripts/observability/lib/deep_report_writer.ts index 982c727d3b..3a90c1dfd3 100644 --- a/scripts/observability/lib/deep_report_writer.ts +++ b/scripts/observability/lib/deep_report_writer.ts @@ -5,7 +5,9 @@ import type { IntegrityRow, PhaseRecord, QueryRow, + RepairChain, RichToolCall, + SelectionMode, SubagentRow, } from "./deep_action_types" @@ -18,6 +20,10 @@ function shortId(value: string | null | undefined): string { return value.length <= 8 ? 
value : value.slice(0, 8) } +function escapeCell(value: string): string { + return value.replaceAll("|", "\\|").replaceAll("\n", "
") +} + function table(headers: string[], rows: string[][]): string[] { return [ `| ${headers.join(" | ")} |`, @@ -26,6 +32,16 @@ function table(headers: string[], rows: string[][]): string[] { ] } +function describeTool(tool: RichToolCall): string { + return `${tool.tool_name}${tool.success === false ? " fail" : tool.success === true ? " ok" : ""}` +} + +function isSelfRunAction(tools: RichToolCall[], toolCallCount: number): boolean { + if (toolCallCount > 3) return false + const bashCommands = tools.filter(tool => tool.tool_name === "Bash").map(tool => tool.command_or_path.toLowerCase()) + return bashCommands.length === 1 && bashCommands[0]!.includes("explain_action") +} + export function writeDeepReport(params: { action: ActionRow integrity: IntegrityRow | null @@ -35,47 +51,76 @@ export function writeDeepReport(params: { tools: RichToolCall[] artifacts: ArtifactRecord[] evidence: EvidenceRecord[] + repairChains: RepairChain[] + selectedBy: SelectionMode + terminalReason: string richMermaidPath: string debugMermaidPath: string baselineReportPath: string | null }): string { - const missingSnapshotCount = params.tools.filter(tool => - tool.warnings.some(warning => warning.includes("snapshot")), - ).length - const confidence = missingSnapshotCount === 0 ? "high" : missingSnapshotCount < 5 ? "medium" : "low" - const summary = `This action expanded into ${params.action.query_count} queries, ${params.action.subagent_count} subagents, and ${params.phases.length} inferred phases with ${params.action.tool_call_count} tool calls.` + const missingSnapshotCount = params.tools.filter(tool => tool.warnings.length > 0).length + const selfRun = isSelfRunAction(params.tools, params.action.tool_call_count) + const toolsById = new Map(params.tools.map(tool => [tool.tool_call_id, tool])) + const evidenceByRef = new Map(params.evidence.map(item => [item.snapshot_ref, item])) const lines: string[] = [ "# Deep Action Report", "", - "## 1. 一句话总结", - "", - summary, - "", - "## 2. 
Basics", - "", - `- user_action_id: ${params.action.user_action_id}`, - `- utc: ${params.action.started_at} -> ${params.action.ended_at}`, - `- duration_ms: ${params.action.duration_ms}`, - `- query_count: ${params.action.query_count}`, - `- subagent_count: ${params.action.subagent_count}`, - `- tool_call_count: ${params.action.tool_call_count}`, - `- total_prompt_input_tokens: ${params.action.total_prompt_input_tokens}`, - `- total_billed_tokens: ${params.action.total_billed_tokens}`, - "", ] + if (params.selectedBy === "latest") { + lines.push( + "> Warning: Latest action may be an observability/debug command action. For complex DAG validation, prefer explicit `-UserActionId`.", + "", + ) + } + + if (selfRun) { + lines.push( + "> This appears to be an observability self-run action, not a target complex task.", + "", + ) + } + + lines.push("## How To Read", "") + lines.push("- `rich_stage_flow.mmd`: phase structure, tool nodes, artifact nodes, evidence nodes.") + lines.push("- `debug_chain_flow.mmd`: problem -> fix -> verification chains.") + lines.push("- `deep_report.md`: per-phase reason, action, and result.") + lines.push("- CSV files are drill-down detail, not the primary reading path.", "") + + lines.push("## Summary", "") + lines.push( + `This action expanded into ${params.phases.length} phases across ${params.action.query_count} queries, ${params.action.subagent_count} subagents, and ${params.action.tool_call_count} tool calls.`, + "", + ) + + lines.push("## Basics", "") + lines.push(`- user_action_id: ${params.action.user_action_id}`) + lines.push(`- selected_by: ${params.selectedBy}`) + lines.push(`- utc: ${params.action.started_at} -> ${params.action.ended_at}`) + lines.push(`- duration_ms: ${params.action.duration_ms}`) + lines.push(`- query_count: ${params.action.query_count}`) + lines.push(`- subagent_count: ${params.action.subagent_count}`) + lines.push(`- tool_call_count: ${params.action.tool_call_count}`) + lines.push(`- terminal_reason: 
${params.terminalReason}`) + lines.push(`- total_prompt_input_tokens: ${params.action.total_prompt_input_tokens}`) + lines.push(`- total_billed_tokens: ${params.action.total_billed_tokens}`) + if (selfRun) { + lines.push("- note: This appears to be an observability self-run action, not a target complex task.") + } + lines.push("") + if (params.integrity) { - lines.push("## 3. Integrity Snapshot", "") + lines.push("## Integrity Snapshot", "") for (const [key, value] of Object.entries(params.integrity)) { lines.push(`- ${key}: ${value ?? ""}`) } lines.push("") } - lines.push("## 4. Query / Agent 分工", "") + lines.push("## Query And Subagent Overview", "") for (const query of params.queries) { lines.push( - `- ${query.agent_name ?? "unknown"} ${shortId(query.query_id)}: turns=${query.turn_count}, tools=${query.tool_call_count}, duration_ms=${query.duration_ms ?? ""}, terminal=${query.terminal_reason ?? ""}`, + `- ${query.agent_name ?? "unknown"} ${shortId(query.query_id)}: source=${query.query_source ?? "main_thread"}, turns=${query.turn_count}, tools=${query.tool_call_count}, duration_ms=${query.duration_ms ?? ""}, terminal=${query.terminal_reason ?? ""}`, ) } for (const subagent of params.subagents) { @@ -85,27 +130,7 @@ export function writeDeepReport(params: { } lines.push("") - lines.push("## 5. 阶段级时间线", "") - lines.push( - ...table( - ["phase", "time", "queries", "turns", "tools", "outputs", "problems", "evidence"], - params.phases.map(phase => [ - phase.phase_name, - `${phase.start_local} -> ${phase.end_local}`, - phase.query_ids.map(shortId).join(", "), - unique(phase.turn_ids).join(", "), - Object.entries(phase.tool_counts) - .map(([name, count]) => `${name} x${count}`) - .join("; "), - (phase.main_outputs[0] ?? "").replaceAll("|", "\\|"), - (phase.problems[0] ?? "").replaceAll("|", "\\|"), - phase.evidence_refs.slice(0, 2).join("
"), - ]), - ), - ) - lines.push("") - - lines.push("## 6. 富证据复杂 DAG", "") + lines.push("## Graph Outputs", "") lines.push(`- rich stage flow: ${params.richMermaidPath}`) lines.push(`- debug chain flow: ${params.debugMermaidPath}`) if (params.baselineReportPath) { @@ -113,65 +138,110 @@ export function writeDeepReport(params: { } lines.push("") - lines.push("## 7. 工具调用语义复盘", "") - for (const tool of params.tools.slice(0, 20)) { - lines.push( - `- ${tool.tool_name} ${shortId(tool.tool_call_id)} @ ${tool.turn_id ?? "no-turn"}: ${tool.input_summary}; output=${tool.output_summary}; intent=${tool.intent_inferred}; evidence=${tool.evidence_refs[0] ?? "none"}`, - ) - } - if (params.tools.length > 20) { - lines.push(`- ... ${params.tools.length - 20} more tool calls in tool_calls_rich.csv`) + lines.push("## Repair Chains", "") + if (params.repairChains.length === 0) { + lines.push("- no dense repair chain detected", "") + } else { + for (const chain of params.repairChains) { + lines.push( + `- ${chain.chain_id}: ${chain.problem_summary}; root=${chain.root_cause_guess}; fix=${chain.fix_actions.join(" | ") || "n/a"}; verification=${chain.verification_summary}; status=${chain.status}`, + ) + } + lines.push("") } - lines.push("") - lines.push("## 8. 
文件产物链", "") - for (const artifact of params.artifacts.slice(0, 20)) { + for (const phase of params.phases) { + const phaseTools = phase.phase_tool_call_ids + .map(id => toolsById.get(id)) + .filter((tool): tool is RichToolCall => Boolean(tool)) + const phaseArtifacts = params.artifacts.filter(artifact => artifact.phase_ids.includes(phase.phase_id)) + const phaseEvidence = unique(phase.evidence_refs) + .map(ref => evidenceByRef.get(ref)) + .filter((item): item is EvidenceRecord => Boolean(item)) + const phaseProblems = unique([...phase.problems, ...phaseTools.map(tool => tool.detected_problem).filter(Boolean)]) + const phaseFixes = unique([...phase.fixes, ...phaseTools.map(tool => tool.detected_fix_signal).filter(Boolean)]) + + lines.push(`## Phase ${phase.phase_id.replace("phase_", "")}: ${phase.phase_name}`, "") + lines.push(`- time: ${phase.start_local} -> ${phase.end_local} (${phase.duration_ms}ms)`) + lines.push(`- query: ${phase.query_ids.map(shortId).join(", ") || "-"}`) + lines.push(`- turn: ${phase.turn_ids.join(", ") || "-"}`) + lines.push(`- tools: ${phaseTools.map(describeTool).join(", ") || "-"}`) + lines.push(`- reason: ${phase.reason_summary || "-"}`) + lines.push(`- action: ${phase.action_summary || "-"}`) + lines.push(`- result: ${phase.result_summary || "-"}`) + lines.push(`- artifacts: ${phase.primary_artifacts.join(" | ") || "-"}`) + lines.push(`- problems: ${phaseProblems.join(" | ") || "-"}`) + lines.push(`- fixes: ${phaseFixes.join(" | ") || "-"}`) lines.push( - `- ${artifact.artifact_path}: type=${artifact.artifact_type}, first_seen_phase=${artifact.first_seen_phase}, created_by=${artifact.created_by_tool || "unknown"}, modified_by=${artifact.modified_by_tools.join(", ") || "none"}`, + `- evidence: ${phaseEvidence.map(item => `${item.category ?? "snapshot"}:${shortId(item.snapshot_ref)}`).join(" | ") || "-"}`, ) - } - if (params.artifacts.length > 20) { - lines.push(`- ... 
${params.artifacts.length - 20} more artifacts in artifact_chain.csv`) - } - lines.push("") + lines.push("", "### Tool Details", "") + lines.push( + ...table( + ["turn", "tool", "command/path", "input摘要", "output摘要", "problem/fix", "evidence"], + phaseTools.slice(0, 5).map(tool => [ + escapeCell(tool.turn_id ?? ""), + escapeCell(tool.tool_name), + escapeCell(tool.command_or_path || "-"), + escapeCell(tool.input_summary || "-"), + escapeCell(tool.result_summary_rich || tool.output_summary || "-"), + escapeCell(unique([tool.detected_problem, tool.detected_fix_signal].filter(Boolean)).join(" | ") || "-"), + escapeCell(tool.evidence_refs.slice(0, 2).map(shortId).join(", ") || "-"), + ]), + ), + ) + if (phaseTools.length > 5) { + lines.push("", `More tools in phase: ${phaseTools.length - 5} additional rows in tool_calls_rich.csv`) + } - lines.push("## 9. 问题与修复链", "") - const issueTools = params.tools.filter( - tool => tool.success === false || tool.intent_inferred === "repair" || tool.warnings.length > 0, - ) - if (issueTools.length === 0) { - lines.push("- no dense repair chain detected") - } else { - for (const tool of issueTools.slice(0, 20)) { + lines.push("", "### Artifacts", "") + if (phaseArtifacts.length === 0) { + lines.push("- no explicit artifacts") + } else { lines.push( - `- ${tool.tool_name} ${shortId(tool.tool_call_id)}: ${tool.output_summary}; warnings=${tool.warnings.join("; ") || "none"}`, + ...table( + ["artifact", "type", "created/modified by"], + phaseArtifacts.slice(0, 8).map(artifact => [ + escapeCell(artifact.artifact_path), + escapeCell(artifact.artifact_type), + escapeCell( + [ + artifact.created_by_tool ? `create:${artifact.created_by_tool}` : "", + artifact.modified_by_tools.length > 0 ? `modify:${artifact.modified_by_tools.join(",")}` : "", + ] + .filter(Boolean) + .join(" | ") || "-", + ), + ]), + ), ) } + lines.push("") } - lines.push("") - lines.push("## 10. 
Snapshot 证据索引", "") + lines.push("## Snapshot Evidence Index", "") lines.push( ...table( ["evidence_id", "category", "query", "turn", "fields", "summary"], - params.evidence.slice(0, 20).map(item => [ + params.evidence.slice(0, 40).map(item => [ item.evidence_id, - item.category ?? "", - shortId(item.query_id), - item.turn_id ?? "", - item.extracted_fields.join(", "), - item.summary.replaceAll("|", "\\|"), + escapeCell(item.category ?? ""), + escapeCell(shortId(item.query_id)), + escapeCell(item.turn_id ?? ""), + escapeCell(item.extracted_fields.join(", ")), + escapeCell(item.summary), ]), ), ) - if (params.evidence.length > 20) { - lines.push("", `More evidence rows: ${params.evidence.length - 20} omitted from report; see snapshot_evidence_index.csv`) + if (params.evidence.length > 40) { + lines.push("", `More evidence rows: ${params.evidence.length - 40} omitted from report; see snapshot_evidence_index.csv`) } - lines.push("", "## 11. 缺失信息与可信度", "") - lines.push(`- confidence: ${confidence}`) - lines.push(`- missing_snapshot_tool_calls: ${missingSnapshotCount}`) + + lines.push("", "## Confidence", "") + lines.push(`- missing_snapshot_or_fallback_tool_calls: ${missingSnapshotCount}`) if (missingSnapshotCount > 0) { - lines.push("- some tool parameters or results could not be reconstructed because response/state snapshots were missing in V1 facts") + lines.push("- some tool results were reconstructed via related snapshots or turn fallback") } + return lines.join("\n") } diff --git a/scripts/observability/lib/mermaid_rich_graph.ts b/scripts/observability/lib/mermaid_rich_graph.ts index 764c5e2707..d938060c82 100644 --- a/scripts/observability/lib/mermaid_rich_graph.ts +++ b/scripts/observability/lib/mermaid_rich_graph.ts @@ -1,86 +1,299 @@ -import type { PhaseRecord } from "./deep_action_types" +import type { + ActionRow, + ArtifactRecord, + EvidenceRecord, + PhaseRecord, + QueryRow, + RepairChain, + RichToolCall, + SubagentRow, +} from "./deep_action_types" 
function esc(text: string): string { - return text.replaceAll('"', "'") + return text.replaceAll('"', "'").replaceAll("\n", "
") } -function label(phase: PhaseRecord): string { - const toolSummary = Object.entries(phase.tool_counts) - .map(([name, count]) => `${name} x${count}`) - .join(" + ") +function shortText(text: string, maxLength = 120): string { + const normalized = text.replace(/\s+/gu, " ").trim() + if (normalized.length <= maxLength) return normalized + return `${normalized.slice(0, maxLength - 3)}...` +} + +function shortId(value: string | null | undefined): string { + if (!value) return "null" + return value.length <= 8 ? value : value.slice(0, 8) +} + +function nodeId(raw: string): string { + return raw.replace(/[^A-Za-z0-9_]/gu, "_") +} + +function toolSummary(tool: RichToolCall): string { + const status = + tool.success === true ? "success" : tool.success === false ? "fail" : "unknown" + return esc( + [ + `turn ${tool.turn_id ?? "?"} | ${tool.tool_name} | ${status}`, + shortText(tool.command_or_path || tool.input_summary || "input unavailable", 90), + shortText(tool.detected_problem || tool.result_summary_rich || tool.output_summary || "no result", 110), + ].join("
"), + ) +} + +function artifactSummary(artifact: ArtifactRecord): string { return esc( [ - phase.phase_name, - `${phase.start_local} -> ${phase.end_local}`, - `duration ${phase.duration_ms}ms`, - phase.turn_ids.length > 0 ? `turns ${phase.turn_ids.join(",")}` : "", - toolSummary ? `tools ${toolSummary}` : "", - phase.main_outputs[0] ? `output ${phase.main_outputs[0]}` : "", - phase.problems[0] ? `problem ${phase.problems[0]}` : "", - phase.fixes[0] ? `fix ${phase.fixes[0]}` : "", + artifact.artifact_path.split("/").at(-1) ?? artifact.artifact_path, + `type=${artifact.artifact_type}`, + artifact.created_by_phase_id ? `from ${artifact.created_by_phase_id}` : "", ] .filter(Boolean) .join("
"), ) } -function className(kind: PhaseRecord["stage_kind"]): string { - return kind +function evidenceSummary(evidence: EvidenceRecord): string { + return esc( + [ + evidence.category ?? "snapshot", + shortId(evidence.snapshot_ref), + shortText(evidence.summary, 80), + ].join("
"), + ) } -export function buildRichStageFlow(phases: PhaseRecord[]): string { +export function buildRichStageFlow(params: { + action: ActionRow + queries: QueryRow[] + subagents: SubagentRow[] + phases: PhaseRecord[] + tools: RichToolCall[] + artifacts: ArtifactRecord[] + evidence: EvidenceRecord[] + repairChains: RepairChain[] +}): string { const lines = [ "flowchart TD", - " classDef input fill:#eef6ff,stroke:#1d4ed8,color:#0f172a", - " classDef main fill:#ecfdf5,stroke:#15803d,color:#052e16", + " classDef action fill:#111827,stroke:#0f172a,color:#f9fafb", + " classDef query fill:#ecfeff,stroke:#0f766e,color:#042f2e", " classDef subagent fill:#fff7ed,stroke:#c2410c,color:#431407", - " classDef compact fill:#f5f3ff,stroke:#7c3aed,color:#2e1065", - " classDef script fill:#fef3c7,stroke:#b45309,color:#451a03", - " classDef issue fill:#fff1f2,stroke:#e11d48,color:#4c0519", - " classDef fix fill:#eff6ff,stroke:#0891b2,color:#082f49", - " classDef output fill:#f0fdf4,stroke:#16a34a,color:#14532d", + " classDef summary fill:#f8fafc,stroke:#64748b,color:#0f172a", + " classDef tool fill:#eef2ff,stroke:#4338ca,color:#1e1b4b", + " classDef toolFail fill:#fff1f2,stroke:#e11d48,color:#4c0519", + " classDef artifact fill:#fef3c7,stroke:#b45309,color:#451a03", + " classDef artifactFinal fill:#dcfce7,stroke:#16a34a,color:#14532d", + " classDef evidence fill:#ede9fe,stroke:#7c3aed,color:#2e1065", + " classDef more fill:#f1f5f9,stroke:#94a3b8,color:#334155", + " classDef repair fill:#fce7f3,stroke:#a21caf,color:#4a044e", ] - phases.forEach((phase, index) => { - const nodeId = `P${index + 1}` - lines.push(` ${nodeId}["${label(phase)}"]`) - lines.push(` class ${nodeId} ${className(phase.stage_kind)}`) - if (index > 0) { - lines.push(` P${index} --> ${nodeId}`) + lines.push( + ` ACTION["${esc( + [ + `action ${shortId(params.action.user_action_id)}`, + `duration ${params.action.duration_ms}ms`, + `queries ${params.action.query_count} | subagents ${params.action.subagent_count} | 
tools ${params.action.tool_call_count}`, + `billed ${params.action.total_billed_tokens} tokens`, + ].join("
"), + )}"]`, + ) + lines.push(" class ACTION action") + + params.queries.forEach((query, index) => { + const id = `Q${index + 1}` + const kind = (query.query_source ?? "").includes("compact") + ? "compact" + : query.subagent_id + ? "fork subagent" + : "main_thread" + lines.push( + ` ${id}["${esc( + [ + `${kind} ${shortId(query.query_id)}`, + `turns ${query.turn_count} | tools ${query.tool_call_count}`, + `duration ${query.duration_ms ?? 0}ms`, + `terminal ${shortText(query.terminal_reason ?? "unknown", 60)}`, + ].join("
"), + )}"]`, + ) + lines.push(` ACTION --> ${id}`) + lines.push(` class ${id} ${query.subagent_id ? "subagent" : "query"}`) + }) + + params.subagents.forEach((subagent, index) => { + const id = `SA${index + 1}` + lines.push( + ` ${id}["${esc( + [ + `fork ${shortId(subagent.subagent_id)}`, + shortText(subagent.subagent_reason ?? subagent.subagent_type ?? "subagent", 70), + `duration ${subagent.duration_ms ?? 0}ms`, + ].join("
"), + )}"]`, + ) + lines.push(" class " + id + " subagent") + }) + + const toolsById = new Map(params.tools.map(tool => [tool.tool_call_id, tool])) + const evidenceById = new Map(params.evidence.map(item => [item.evidence_id, item])) + const evidenceByRef = new Map(params.evidence.map(item => [item.snapshot_ref, item])) + const phaseSummaryNodes: string[] = [] + + params.phases.forEach((phase, index) => { + const subgraphId = `PH${index + 1}` + const summaryNodeId = `${subgraphId}_SUM` + phaseSummaryNodes.push(summaryNodeId) + const toolNames = Object.entries(phase.tool_counts) + .map(([name, count]) => `${name}x${count}`) + .join(" + ") + lines.push( + ` subgraph ${subgraphId}["${esc( + `${phase.phase_id} ${phase.phase_name} | ${phase.start_local} | turns ${phase.turn_ids.join(",") || "-"} | ${toolNames || "no tools"}`, + )}"]`, + ) + lines.push( + ` ${summaryNodeId}["${esc( + [ + `reason: ${shortText(phase.reason_summary, 90)}`, + `action: ${shortText(phase.action_summary, 90)}`, + `result: ${shortText(phase.result_summary, 90)}`, + ].join("
"), + )}"]`, + ) + lines.push(` class ${summaryNodeId} summary`) + + const phaseTools = phase.phase_tool_call_ids + .map(id => toolsById.get(id)) + .filter((tool): tool is RichToolCall => Boolean(tool)) + phaseTools.slice(0, 5).forEach((tool, toolIndex) => { + const toolId = `${subgraphId}_T${toolIndex + 1}` + lines.push(` ${toolId}["${toolSummary(tool)}"]`) + lines.push(` class ${toolId} ${tool.success === false || tool.detected_problem ? "toolFail" : "tool"}`) + lines.push(` ${summaryNodeId} --> ${toolId}`) + }) + if (phaseTools.length > 5) { + const moreId = `${subgraphId}_TMORE` + lines.push(` ${moreId}["+${phaseTools.length - 5} more tools in CSV"]`) + lines.push(` class ${moreId} more`) + lines.push(` ${summaryNodeId} --> ${moreId}`) + } + + const phaseArtifacts = params.artifacts.filter( + artifact => + artifact.created_by_phase_id === phase.phase_id || + artifact.first_seen_phase === phase.phase_id || + phase.primary_artifacts.includes(artifact.artifact_path), + ) + phaseArtifacts.slice(0, 3).forEach((artifact, artifactIndex) => { + const artifactId = `${subgraphId}_A${artifactIndex + 1}` + lines.push(` ${artifactId}["${artifactSummary(artifact)}"]`) + lines.push(` class ${artifactId} ${artifact.artifact_type === "final" ? 
"artifactFinal" : "artifact"}`) + lines.push(` ${summaryNodeId} --> ${artifactId}`) + }) + + const phaseEvidence = phase.evidence_refs + .map(ref => evidenceByRef.get(ref)) + .filter((item): item is EvidenceRecord => Boolean(item)) + .slice(0, 2) + phaseEvidence.forEach((item, evidenceIndex) => { + const evidenceId = `${subgraphId}_E${evidenceIndex + 1}` + lines.push(` ${evidenceId}["${evidenceSummary(item)}"]`) + lines.push(` class ${evidenceId} evidence`) + lines.push(` ${summaryNodeId} --> ${evidenceId}`) + }) + + lines.push(" end") + if (index === 0) { + lines.push(` ACTION --> ${summaryNodeId}`) + } else { + lines.push(` ${phaseSummaryNodes[index - 1]} --> ${summaryNodeId}`) } }) + params.artifacts.forEach((artifact, index) => { + if (!artifact.created_by_phase_id) return + const sourceSummary = `PH${params.phases.findIndex(phase => phase.phase_id === artifact.created_by_phase_id) + 1}_SUM` + artifact.phase_ids + .filter(phaseId => phaseId !== artifact.created_by_phase_id) + .slice(0, 3) + .forEach(targetPhaseId => { + const targetIndex = params.phases.findIndex(phase => phase.phase_id === targetPhaseId) + if (targetIndex < 0) return + const targetSummary = `PH${targetIndex + 1}_SUM` + const hiddenArtifactNode = `AFLOW_${index + 1}_${targetIndex + 1}` + lines.push(` ${hiddenArtifactNode}["${esc(shortText(artifact.artifact_path.split("/").at(-1) ?? artifact.artifact_path, 60))}"]`) + lines.push(` class ${hiddenArtifactNode} ${artifact.artifact_type === "final" ? 
"artifactFinal" : "artifact"}`) + lines.push(` ${sourceSummary} --> ${hiddenArtifactNode}`) + lines.push(` ${hiddenArtifactNode} --> ${targetSummary}`) + }) + }) + + params.repairChains.forEach((chain, index) => { + const firstPhaseId = chain.phase_ids[0] + const lastPhaseId = chain.phase_ids.at(-1) + const firstPhaseIndex = params.phases.findIndex(phase => phase.phase_id === firstPhaseId) + const lastPhaseIndex = params.phases.findIndex(phase => phase.phase_id === lastPhaseId) + if (firstPhaseIndex < 0 || lastPhaseIndex < 0) return + const chainId = `RC${index + 1}` + lines.push(` ${chainId}["${esc(shortText(chain.problem_summary, 80))}"]`) + lines.push(` class ${chainId} repair`) + lines.push(` PH${firstPhaseIndex + 1}_SUM -. repair .-> ${chainId}`) + lines.push(` ${chainId} -. verify .-> PH${lastPhaseIndex + 1}_SUM`) + }) + return lines.join("\n") } -export function buildDebugChainFlow(phases: PhaseRecord[]): string { - const debugPhases = phases.filter( - phase => - phase.problems.length > 0 || - phase.fixes.length > 0 || - phase.phase_name === "repair" || - phase.stage_kind === "issue" || - phase.stage_kind === "fix", - ) +export function buildDebugChainFlow(params: { + repairChains: RepairChain[] + tools: RichToolCall[] + artifacts: ArtifactRecord[] + evidence: EvidenceRecord[] +}): string { const lines = [ "flowchart TD", - " classDef issue fill:#fff1f2,stroke:#e11d48,color:#4c0519", - " classDef fix fill:#eff6ff,stroke:#0891b2,color:#082f49", - " classDef output fill:#f0fdf4,stroke:#16a34a,color:#14532d", + " classDef problem fill:#fee2e2,stroke:#dc2626,color:#450a0a", + " classDef root fill:#fef3c7,stroke:#d97706,color:#451a03", + " classDef fix fill:#f3e8ff,stroke:#9333ea,color:#3b0764", + " classDef verification fill:#dbeafe,stroke:#2563eb,color:#172554", + " classDef resolved fill:#dcfce7,stroke:#16a34a,color:#14532d", + " classDef unresolved fill:#fed7aa,stroke:#ea580c,color:#431407", ] - debugPhases.forEach((phase, index) => { - const nodeId = 
`D${index + 1}` - lines.push(` ${nodeId}["${label(phase)}"]`) - lines.push(` class ${nodeId} ${phase.stage_kind === "fix" ? "fix" : phase.problems.length > 0 ? "issue" : "output"}`) - if (index > 0) { - lines.push(` D${index} --> ${nodeId}`) - } - }) - - if (debugPhases.length === 0) { + if (params.repairChains.length === 0) { lines.push(' D1["no dense repair chain detected"]') - lines.push(" class D1 output") + lines.push(" class D1 resolved") + return lines.join("\n") } + + params.repairChains.forEach((chain, index) => { + const base = `D${index + 1}` + const problemId = `${base}_P` + const rootId = `${base}_R` + const verificationId = `${base}_V` + const resultId = `${base}_O` + lines.push(` ${problemId}["${esc(shortText(chain.problem_summary, 90))}"]`) + lines.push(` ${rootId}["${esc(chain.root_cause_guess)}"]`) + lines.push(` ${verificationId}["${esc(shortText(chain.verification_summary, 90))}"]`) + lines.push(` ${resultId}["${esc(chain.status)}"]`) + lines.push(` class ${problemId} problem`) + lines.push(` class ${rootId} root`) + lines.push(` class ${verificationId} verification`) + lines.push(` class ${resultId} ${chain.status === "resolved" ? 
"resolved" : "unresolved"}`) + lines.push(` ${problemId} --> ${rootId}`) + + let previous = rootId + chain.fix_actions.slice(0, 4).forEach((fix, fixIndex) => { + const fixId = `${base}_F${fixIndex + 1}` + lines.push(` ${fixId}["${esc(shortText(fix, 90))}"]`) + lines.push(` class ${fixId} fix`) + lines.push(` ${previous} --> ${fixId}`) + previous = fixId + }) + + lines.push(` ${previous} --> ${verificationId}`) + lines.push(` ${verificationId} --> ${resultId}`) + }) + return lines.join("\n") } diff --git a/scripts/observability/lib/phase_infer.ts b/scripts/observability/lib/phase_infer.ts index 377501aa43..af5b566221 100644 --- a/scripts/observability/lib/phase_infer.ts +++ b/scripts/observability/lib/phase_infer.ts @@ -1,191 +1,405 @@ -import type { ActionRow, PhaseRecord, QueryRow, RichToolCall, TurnRow } from "./deep_action_types" +import type { ActionRow, ArtifactRecord, PhaseRecord, QueryRow, RichToolCall, TurnRow } from "./deep_action_types" -type Seed = { - name: string - kind: PhaseRecord["stage_kind"] - startMs: number - endMs: number +type ToolMarker = { + signature: string + phaseName: string + stageKind: PhaseRecord["stage_kind"] + reason: string + action: string + result: string + primaryArtifacts: string[] + problems: string[] + fixes: string[] + forceBoundaryBefore: boolean + forceBoundaryAfter: boolean queryId: string | null turnId: string | null - toolName: string | null - toolCallId: string | null - output: string - problem: string - fix: string - evidenceRefs: string[] +} + +function unique(values: T[]): T[] { + return [...new Set(values)] } function localText(value: number): string { return new Date(value).toLocaleString("sv-SE").replace("T", " ") } -function inferPhaseName(tool: RichToolCall): { name: string; kind: PhaseRecord["stage_kind"] } { - const haystack = `${tool.tool_name} ${tool.input_summary} ${tool.command_or_path} ${tool.prompt_summary} ${tool.agent_name ?? 
""}`.toLowerCase() - if (haystack.includes("compact")) return { name: "compact", kind: "compact" } - if (haystack.includes("docx") || haystack.includes("python-docx") || haystack.includes("word")) { - return { name: "thesis_parse", kind: tool.agent_name === "main_thread" ? "main" : "subagent" } - } - if (haystack.includes("pptx") || haystack.includes("template") || haystack.includes("python-pptx")) { - return { name: "template_parse", kind: tool.agent_name === "main_thread" ? "main" : "subagent" } +function shortText(value: string, maxLength = 140): string { + const normalized = value.replace(/\s+/gu, " ").trim() + if (normalized.length <= maxLength) return normalized + return `${normalized.slice(0, maxLength - 3)}...` +} + +function fileBase(path: string): string { + const normalized = path.replace(/\\/gu, "/") + return normalized.split("/").at(-1) ?? normalized +} + +function scriptNameFromTool(tool: RichToolCall): string { + const haystack = [tool.command_or_path, tool.input_summary, tool.result_summary_rich] + .filter(Boolean) + .join(" ") + const match = haystack.match(/([A-Za-z0-9_.-]+\.(?:py|js|ts|ps1))/iu) + return match?.[1] ?? "" +} + +function haystack(tool: RichToolCall, query: QueryRow | undefined): string { + return [ + tool.tool_name, + tool.input_summary, + tool.command_or_path, + tool.result_summary_rich, + tool.prompt_summary, + query?.query_source ?? "", + query?.subagent_reason ?? "", + ] + .join(" ") + .toLowerCase() +} + +function containsCheckSignal(tool: RichToolCall, query: QueryRow | undefined): boolean { + return /check|inspect|verify|scan|grep|find|search|overlap|bounds|layout|read|compare|diff|look for|remaining/iu.test( + haystack(tool, query), + ) +} + +function inferStageKind(tool: RichToolCall, query: QueryRow | undefined): PhaseRecord["stage_kind"] { + if ((query?.query_source ?? 
"").toLowerCase().includes("compact")) return "compact" + if (tool.tool_name === "Agent") return "subagent" + if (tool.tool_name === "Write" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return "script" + if (tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return "script" + if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit" || tool.detected_fix_signal) return "fix" + if (tool.success === false || tool.detected_problem) return "issue" + if (tool.produced_files.some(path => /\.pptx$/iu.test(path))) return "output" + if (query?.subagent_id || (tool.agent_name && tool.agent_name !== "main_thread")) return "subagent" + if (tool.tool_name === "Read" || tool.tool_name === "Grep" || tool.tool_name === "Glob") return "input" + return "main" +} + +function inferPhaseCluster(tool: RichToolCall, query: QueryRow | undefined): { name: string; signature: string } { + const scriptName = scriptNameFromTool(tool) + const text = haystack(tool, query) + const compactQuery = (query?.query_source ?? 
"").toLowerCase().includes("compact") + const subagentQuery = Boolean(query?.subagent_id || (tool.agent_name && tool.agent_name !== "main_thread")) + + if (compactQuery) return { name: "compact carry-forward", signature: "compact" } + if (tool.tool_name === "Agent") return { name: "fork subagents", signature: "fork-subagents" } + if (tool.tool_name === "Write" && scriptName) return { name: `write script ${scriptName}`, signature: `write-script:${scriptName}` } + if (tool.tool_name === "Bash" && scriptName) return { name: `run script ${scriptName}`, signature: `run-script:${scriptName}` } + if ((tool.tool_name === "Edit" || tool.tool_name === "MultiEdit") && scriptName) return { name: `edit script ${scriptName}`, signature: `edit-script:${scriptName}` } + if (/pip install|pip3 install|where python|python --version|import docx|import pptx/iu.test(text)) { + return { name: "environment setup and dependency checks", signature: `env-setup:${subagentQuery ? "subagent" : "main"}` } } - if (haystack.includes("word/media") || haystack.includes("zipfile")) { - return { name: "media_extract", kind: "subagent" } + if (subagentQuery && /docx|thesis|论文|extract/.test(text)) { + return { name: "subagent thesis extraction", signature: "subagent-thesis-extraction" } } - if (haystack.includes("blip") || haystack.includes("caption") || haystack.includes("image")) { - return { name: "image_caption_map", kind: "subagent" } + if (subagentQuery && /pptx|template|slide|layout|master|footer|xml/.test(text)) { + return { name: "subagent template analysis", signature: "subagent-template-analysis" } } - if (haystack.includes("pptxgenjs") || haystack.includes("generate_ppt") || haystack.includes("create_ppt")) { - return { name: "deck_build", kind: "script" } + if (subagentQuery) { + return { name: "subagent evidence review", signature: "subagent-evidence-review" } } - if (haystack.includes("overlap") || haystack.includes("out-of-bounds") || haystack.includes("check")) { - return { name: 
"layout_check", kind: "issue" } + if (tool.success === false || /readonly|locked|permission|denied|timeout|traceback|exception/.test(text)) { + return { name: "execution or repair issue detection", signature: "issue-detection" } } - if (haystack.includes("readonly") || haystack.includes("lock") || haystack.includes("copy2") || haystack.includes("save")) { - return { name: "ppt_save_fix", kind: "fix" } + if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit" || tool.detected_fix_signal) { + return { name: "repair and adjustment edits", signature: "repair-edits" } } - if (tool.tool_name === "Agent") return { name: "spawn_subagents", kind: "main" } - if (tool.tool_name === "Read" || tool.tool_name === "Grep" || tool.tool_name === "Glob") { - return { name: tool.agent_name === "main_thread" ? "initial_read" : "subagent_work", kind: tool.agent_name === "main_thread" ? "input" : "subagent" } + if (containsCheckSignal(tool, query) && /ppt|output|analysis|check|verify|remaining|residue|ncalnn|footer/.test(text)) { + return { name: "output verification and residue checks", signature: "output-verification" } } - if (tool.tool_name === "Write" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) { - return { name: "script_generation", kind: "script" } + if (containsCheckSignal(tool, query) && /docx|thesis|template|spec|txt/.test(text)) { + return { name: "input collection and source review", signature: "input-review" } } - if (tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) { - return { name: "script_execution", kind: "script" } + if (tool.produced_files.some(path => /\.pptx$/iu.test(path))) { + return { name: `generate ${fileBase(tool.produced_files.find(path => /\.pptx$/iu.test(path)) ?? "deck.pptx")}`, signature: `generate-ppt:${fileBase(tool.produced_files.find(path => /\.pptx$/iu.test(path)) ?? 
"deck.pptx")}` } } - if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit") { - return { name: "repair", kind: "fix" } + if (tool.tool_name === "Write") return { name: `write ${fileBase(tool.command_or_path || tool.produced_files[0] || "file")}`, signature: `write:${fileBase(tool.command_or_path || tool.produced_files[0] || "file")}` } + if (tool.tool_name === "Bash") return { name: "bash execution and checks", signature: `bash-checks:${subagentQuery ? "subagent" : "main"}` } + if (tool.tool_name === "Read" || tool.tool_name === "Grep" || tool.tool_name === "Glob") { + return { name: "input collection and source review", signature: `inspect:${subagentQuery ? "subagent" : "main"}` } } - if (tool.tool_name === "Task") return { name: "completion", kind: "output" } - if (tool.agent_name && tool.agent_name !== "main_thread") { - return { name: "subagent_work", kind: "subagent" } + return { name: `${tool.tool_name.toLowerCase()} flow`, signature: `${tool.tool_name.toLowerCase()}-flow` } +} + +function buildReason(tool: RichToolCall, query: QueryRow | undefined): string { + return shortText( + tool.detected_problem || + query?.subagent_reason || + tool.prompt_summary || + query?.terminal_reason || + tool.input_summary || + tool.command_or_path || + "continue action flow", + 180, + ) +} + +function buildAction(tool: RichToolCall): string { + return shortText( + tool.command_or_path ? `${tool.tool_name}: ${tool.command_or_path}` : `${tool.tool_name}: ${tool.input_summary}`, + 180, + ) +} + +function buildResult(tool: RichToolCall): string { + return shortText( + tool.result_summary_rich || + tool.output_summary || + tool.result_files[0] || + tool.produced_files[0] || + (tool.success === true ? "completed" : tool.success === false ? 
"failed" : "done"), + 220, + ) +} + +function forceBoundaryBefore(tool: RichToolCall, previous: RichToolCall | null, query: QueryRow | undefined): boolean { + if (!previous) return true + if (tool.query_id !== previous.query_id) return true + if ((query?.query_source ?? "").toLowerCase().includes("compact")) return true + if (tool.tool_name === "Agent") return true + if (tool.tool_name === "Write" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return true + if (tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return true + if (tool.success === false) return true + if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit") return true + if (tool.detected_problem || tool.detected_fix_signal) return true + if (containsCheckSignal(tool, query) && previous.produced_files.length > 0) return true + if (tool.produced_files.some(path => /\.pptx$/iu.test(path)) && previous.produced_files.join("|") !== tool.produced_files.join("|")) return true + return false +} + +function forceBoundaryAfter(tool: RichToolCall, query: QueryRow | undefined): boolean { + if ((query?.query_source ?? 
"").toLowerCase().includes("compact")) return true + if (tool.tool_name === "Agent") return true + if (tool.tool_name === "Write" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return true + if (tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path)) return true + if (tool.tool_name === "Edit" || tool.tool_name === "MultiEdit") return true + if (tool.success === false) return true + if (tool.detected_problem || tool.detected_fix_signal) return true + return false +} + +function makeMarker(tool: RichToolCall, previous: RichToolCall | null, query: QueryRow | undefined): ToolMarker { + const cluster = inferPhaseCluster(tool, query) + return { + signature: cluster.signature, + phaseName: cluster.name, + stageKind: inferStageKind(tool, query), + reason: buildReason(tool, query), + action: buildAction(tool), + result: buildResult(tool), + primaryArtifacts: unique([...tool.produced_files, ...tool.result_files].slice(0, 4)), + problems: tool.detected_problem ? [tool.detected_problem] : tool.success === false ? [tool.output_summary] : [], + fixes: tool.detected_fix_signal ? [tool.detected_fix_signal] : [], + forceBoundaryBefore: forceBoundaryBefore(tool, previous, query), + forceBoundaryAfter: forceBoundaryAfter(tool, query), + queryId: tool.query_id, + turnId: tool.turn_id, } - return { name: "main_preparation", kind: "main" } } function appendCount(target: Record, key: string): void { target[key] = (target[key] ?? 
0) + 1 } -function mergeSeeds(seeds: Seed[]): PhaseRecord[] { - if (seeds.length === 0) { - return [] - } - const sorted = [...seeds].sort((left, right) => left.startMs - right.startMs) - const phases: PhaseRecord[] = [] - let current: PhaseRecord | null = null - - for (const seed of sorted) { - const shouldMerge = - current && - current.phase_name === seed.name && - current.stage_kind === seed.kind && - seed.startMs - current.end_ms < 90_000 - - if (!shouldMerge) { - current = { - phase_id: `phase_${String(phases.length + 1).padStart(2, "0")}`, - phase_name: seed.name, - stage_kind: seed.kind, - start_local: localText(seed.startMs), - end_local: localText(seed.endMs), - duration_ms: Math.max(seed.endMs - seed.startMs, 0), - start_ms: seed.startMs, - end_ms: seed.endMs, - query_ids: seed.queryId ? [seed.queryId] : [], - turn_ids: seed.turnId ? [seed.turnId] : [], - tool_counts: {}, - main_outputs: seed.output ? [seed.output] : [], - problems: seed.problem ? [seed.problem] : [], - fixes: seed.fix ? [seed.fix] : [], - evidence_refs: [...seed.evidenceRefs], - tool_call_ids: seed.toolCallId ? [seed.toolCallId] : [], - } - if (seed.toolName) { - appendCount(current.tool_counts, seed.toolName) - } - phases.push(current) - continue - } +function canMergePhase(current: PhaseRecord, marker: ToolMarker, tool: RichToolCall, startMs: number): boolean { + if (marker.forceBoundaryBefore) return false + if (current.phase_name !== marker.phaseName) return false + if (current.stage_kind !== marker.stageKind) return false + if (marker.queryId && current.query_ids.at(-1) !== marker.queryId) return false + if (tool.detected_problem || tool.detected_fix_signal) return false + if (startMs - current.end_ms > 5 * 60 * 1000) return false + const maxTools = + current.stage_kind === "input" || current.stage_kind === "main" || current.stage_kind === "subagent" ? 
10 : 6 + return current.phase_tool_call_ids.length < maxTools +} - current.end_ms = Math.max(current.end_ms, seed.endMs) - current.end_local = localText(current.end_ms) - current.duration_ms = Math.max(current.end_ms - current.start_ms, 0) - if (seed.queryId && !current.query_ids.includes(seed.queryId)) current.query_ids.push(seed.queryId) - if (seed.turnId && !current.turn_ids.includes(seed.turnId)) current.turn_ids.push(seed.turnId) - if (seed.toolName) appendCount(current.tool_counts, seed.toolName) - if (seed.output && !current.main_outputs.includes(seed.output)) current.main_outputs.push(seed.output) - if (seed.problem && !current.problems.includes(seed.problem)) current.problems.push(seed.problem) - if (seed.fix && !current.fixes.includes(seed.fix)) current.fixes.push(seed.fix) - for (const ref of seed.evidenceRefs) { - if (!current.evidence_refs.includes(ref)) current.evidence_refs.push(ref) - } - if (seed.toolCallId && !current.tool_call_ids.includes(seed.toolCallId)) { - current.tool_call_ids.push(seed.toolCallId) - } +function createPhase(index: number, tool: RichToolCall, marker: ToolMarker, startMs: number, endMs: number): PhaseRecord { + return { + phase_id: `phase_${String(index).padStart(2, "0")}`, + phase_name: marker.phaseName, + stage_kind: marker.stageKind, + start_local: localText(startMs), + end_local: localText(endMs), + duration_ms: Math.max(endMs - startMs, 0), + start_ms: startMs, + end_ms: endMs, + query_ids: marker.queryId ? [marker.queryId] : [], + turn_ids: marker.turnId ? [marker.turnId] : [], + tool_counts: { [tool.tool_name]: 1 }, + main_outputs: marker.result ? 
[marker.result] : [], + problems: [...marker.problems], + fixes: [...marker.fixes], + evidence_refs: [...tool.evidence_refs], + tool_call_ids: [tool.tool_call_id], + phase_tool_call_ids: [tool.tool_call_id], + primary_artifacts: [...marker.primaryArtifacts], + reason_summary: marker.reason, + action_summary: marker.action, + result_summary: marker.result, } +} - return phases +function mergeIntoPhase(phase: PhaseRecord, tool: RichToolCall, marker: ToolMarker, endMs: number): void { + phase.end_ms = Math.max(phase.end_ms, endMs) + phase.end_local = localText(phase.end_ms) + phase.duration_ms = Math.max(phase.end_ms - phase.start_ms, 0) + if (marker.queryId && !phase.query_ids.includes(marker.queryId)) phase.query_ids.push(marker.queryId) + if (marker.turnId && !phase.turn_ids.includes(marker.turnId)) phase.turn_ids.push(marker.turnId) + appendCount(phase.tool_counts, tool.tool_name) + phase.tool_call_ids = unique([...phase.tool_call_ids, tool.tool_call_id]) + phase.phase_tool_call_ids = unique([...phase.phase_tool_call_ids, tool.tool_call_id]) + phase.main_outputs = unique([...phase.main_outputs, marker.result].filter(Boolean)) + phase.problems = unique([...phase.problems, ...marker.problems]) + phase.fixes = unique([...phase.fixes, ...marker.fixes]) + phase.evidence_refs = unique([...phase.evidence_refs, ...tool.evidence_refs]) + phase.primary_artifacts = unique([...phase.primary_artifacts, ...marker.primaryArtifacts]) + phase.reason_summary = shortText(unique([phase.reason_summary, marker.reason]).filter(Boolean).join(" | "), 220) + phase.action_summary = shortText(unique([phase.action_summary, marker.action]).filter(Boolean).join(" | "), 220) + phase.result_summary = shortText(unique([phase.result_summary, marker.result]).filter(Boolean).join(" | "), 240) } -function coalescePhases(phases: PhaseRecord[]): PhaseRecord[] { - const merged = new Map() - const order: string[] = [] +function mergePhaseRecords(target: PhaseRecord, source: PhaseRecord): void { + 
target.end_ms = Math.max(target.end_ms, source.end_ms) + target.end_local = localText(target.end_ms) + target.duration_ms = Math.max(target.end_ms - target.start_ms, 0) + target.query_ids = unique([...target.query_ids, ...source.query_ids]) + target.turn_ids = unique([...target.turn_ids, ...source.turn_ids]) + for (const [toolName, count] of Object.entries(source.tool_counts)) { + target.tool_counts[toolName] = (target.tool_counts[toolName] ?? 0) + count + } + target.main_outputs = unique([...target.main_outputs, ...source.main_outputs]) + target.problems = unique([...target.problems, ...source.problems]) + target.fixes = unique([...target.fixes, ...source.fixes]) + target.evidence_refs = unique([...target.evidence_refs, ...source.evidence_refs]) + target.tool_call_ids = unique([...target.tool_call_ids, ...source.tool_call_ids]) + target.phase_tool_call_ids = unique([...target.phase_tool_call_ids, ...source.phase_tool_call_ids]) + target.primary_artifacts = unique([...target.primary_artifacts, ...source.primary_artifacts]) + target.reason_summary = shortText(unique([target.reason_summary, source.reason_summary]).join(" | "), 220) + target.action_summary = shortText(unique([target.action_summary, source.action_summary]).join(" | "), 220) + target.result_summary = shortText(unique([target.result_summary, source.result_summary]).join(" | "), 240) +} +function coalesceWithinQueryWindows(phases: PhaseRecord[]): PhaseRecord[] { + const grouped = new Map() for (const phase of phases) { - const key = `${phase.phase_name}|${phase.stage_kind}` - const existing = merged.get(key) - if (!existing) { - merged.set(key, { - ...phase, - query_ids: [...phase.query_ids], - turn_ids: [...phase.turn_ids], - tool_counts: { ...phase.tool_counts }, - main_outputs: [...phase.main_outputs], - problems: [...phase.problems], - fixes: [...phase.fixes], - evidence_refs: [...phase.evidence_refs], - tool_call_ids: [...phase.tool_call_ids], - }) - order.push(key) - continue - } + const key = 
phase.query_ids[0] ?? "__unknown__" + const list = grouped.get(key) ?? [] + list.push(phase) + grouped.set(key, list) + } - existing.start_ms = Math.min(existing.start_ms, phase.start_ms) - existing.end_ms = Math.max(existing.end_ms, phase.end_ms) - existing.start_local = localText(existing.start_ms) - existing.end_local = localText(existing.end_ms) - existing.duration_ms = Math.max(existing.end_ms - existing.start_ms, 0) - for (const queryId of phase.query_ids) { - if (!existing.query_ids.includes(queryId)) existing.query_ids.push(queryId) - } - for (const turnId of phase.turn_ids) { - if (!existing.turn_ids.includes(turnId)) existing.turn_ids.push(turnId) - } - for (const [toolName, count] of Object.entries(phase.tool_counts)) { - existing.tool_counts[toolName] = (existing.tool_counts[toolName] ?? 0) + count - } - for (const output of phase.main_outputs) { - if (!existing.main_outputs.includes(output)) existing.main_outputs.push(output) - } - for (const problem of phase.problems) { - if (!existing.problems.includes(problem)) existing.problems.push(problem) - } - for (const fix of phase.fixes) { - if (!existing.fixes.includes(fix)) existing.fixes.push(fix) - } - for (const ref of phase.evidence_refs) { - if (!existing.evidence_refs.includes(ref)) existing.evidence_refs.push(ref) + const merged: PhaseRecord[] = [] + for (const queryPhases of grouped.values()) { + const sorted = [...queryPhases].sort((left, right) => left.start_ms - right.start_ms) + let current: PhaseRecord | null = null + for (const phase of sorted) { + const mergeableName = + !/^write script |^run script /u.test(phase.phase_name) + const canMerge = + current && + mergeableName && + current.phase_name === phase.phase_name && + current.stage_kind === phase.stage_kind && + phase.start_ms - current.end_ms <= 10 * 60 * 1000 && + current.phase_tool_call_ids.length + phase.phase_tool_call_ids.length <= (phase.stage_kind === "fix" || phase.stage_kind === "issue" ? 
8 : 18) + + if (!current || !canMerge) { + current = { + ...phase, + query_ids: [...phase.query_ids], + turn_ids: [...phase.turn_ids], + tool_counts: { ...phase.tool_counts }, + main_outputs: [...phase.main_outputs], + problems: [...phase.problems], + fixes: [...phase.fixes], + evidence_refs: [...phase.evidence_refs], + tool_call_ids: [...phase.tool_call_ids], + phase_tool_call_ids: [...phase.phase_tool_call_ids], + primary_artifacts: [...phase.primary_artifacts], + } + merged.push(current) + } else { + mergePhaseRecords(current, phase) + } } - for (const toolCallId of phase.tool_call_ids) { - if (!existing.tool_call_ids.includes(toolCallId)) existing.tool_call_ids.push(toolCallId) + } + return merged +} + +function buildSummaryPhases(action: ActionRow, queries: QueryRow[], turns: TurnRow[], tools: RichToolCall[]): PhaseRecord[] { + const queryById = new Map(queries.map(query => [query.query_id, query])) + const toolsByQuery = new Map() + for (const tool of tools) { + const key = tool.query_id ?? "__unknown__" + const list = toolsByQuery.get(key) ?? [] + list.push(tool) + toolsByQuery.set(key, list) + } + + const phases: PhaseRecord[] = [] + for (const queryTools of toolsByQuery.values()) { + const sortedTools = [...queryTools].sort((left, right) => { + const leftMs = Date.parse(left.detected_at ?? action.started_at) + const rightMs = Date.parse(right.detected_at ?? action.started_at) + return leftMs - rightMs + }) + let current: PhaseRecord | null = null + let previousTool: RichToolCall | null = null + + for (const tool of sortedTools) { + const query = tool.query_id ? queryById.get(tool.query_id) : undefined + const marker = makeMarker(tool, previousTool, query) + const startMs = tool.detected_at ? Date.parse(tool.detected_at) : action.started_at_ms + const endMs = tool.completed_at ? Date.parse(tool.completed_at) : startMs + const merge = current ? 
canMergePhase(current, marker, tool, startMs) : false + + if (!current || !merge) { + current = createPhase(phases.length + 1, tool, marker, startMs, endMs) + phases.push(current) + } else { + mergeIntoPhase(current, tool, marker, endMs) + } + + if (marker.forceBoundaryAfter) current = null + previousTool = tool } } - return order.map((key, index) => ({ - ...merged.get(key)!, - phase_id: `phase_${String(index + 1).padStart(2, "0")}`, - })) + if (phases.length === 0) { + return [ + { + phase_id: "phase_01", + phase_name: "action only", + stage_kind: "main", + start_local: localText(action.started_at_ms), + end_local: localText(action.ended_at_ms), + duration_ms: Math.max(action.ended_at_ms - action.started_at_ms, 0), + start_ms: action.started_at_ms, + end_ms: action.ended_at_ms, + query_ids: queries.map(query => query.query_id), + turn_ids: turns.map(turn => turn.turn_id), + tool_counts: {}, + main_outputs: ["no tool calls captured"], + problems: [], + fixes: [], + evidence_refs: [], + tool_call_ids: [], + phase_tool_call_ids: [], + primary_artifacts: [], + reason_summary: "no tool calls captured", + action_summary: "action did not emit tools", + result_summary: queries.at(-1)?.terminal_reason ?? "completed", + }, + ] + } + + return coalesceWithinQueryWindows(phases) + .sort((left, right) => left.start_ms - right.start_ms) + .map((phase, index) => ({ + ...phase, + phase_id: `phase_${String(index + 1).padStart(2, "0")}`, + })) } export function inferPhases(params: { @@ -193,87 +407,7 @@ export function inferPhases(params: { queries: QueryRow[] turns: TurnRow[] tools: RichToolCall[] + artifacts?: ArtifactRecord[] }): PhaseRecord[] { - const seeds: Seed[] = [] - const firstTool = [...params.tools] - .filter(tool => tool.detected_at) - .sort((left, right) => Date.parse(left.detected_at ?? "") - Date.parse(right.detected_at ?? 
""))[0] - - if (firstTool?.detected_at) { - seeds.push({ - name: "action_start", - kind: "input", - startMs: params.action.started_at_ms, - endMs: Date.parse(firstTool.detected_at), - queryId: params.queries[0]?.query_id ?? null, - turnId: params.turns[0]?.turn_id ?? null, - toolName: null, - toolCallId: null, - output: "entered action", - problem: "", - fix: "", - evidenceRefs: [], - }) - } - - for (const tool of params.tools) { - const startMs = tool.detected_at ? Date.parse(tool.detected_at) : params.action.started_at_ms - const endMs = tool.completed_at ? Date.parse(tool.completed_at) : startMs - const inferred = inferPhaseName(tool) - const failed = tool.success === false ? tool.output_summary : "" - const fix = inferred.kind === "fix" ? tool.input_summary : "" - seeds.push({ - name: inferred.name, - kind: inferred.kind, - startMs, - endMs, - queryId: tool.query_id, - turnId: tool.turn_id, - toolName: tool.tool_name, - toolCallId: tool.tool_call_id, - output: tool.produced_files[0] ?? tool.output_summary, - problem: failed, - fix, - evidenceRefs: tool.evidence_refs, - }) - } - - if (params.queries.some(query => (query.query_source ?? "").includes("compact"))) { - const compactQueries = params.queries.filter(query => - (query.query_source ?? "").includes("compact"), - ) - for (const query of compactQueries) { - seeds.push({ - name: "compact", - kind: "compact", - startMs: query.started_at_ms, - endMs: query.ended_at_ms ?? query.started_at_ms, - queryId: query.query_id, - turnId: null, - toolName: null, - toolCallId: null, - output: query.terminal_reason ?? "", - problem: "", - fix: "", - evidenceRefs: [], - }) - } - } - - seeds.push({ - name: "completion", - kind: "output", - startMs: params.action.ended_at_ms, - endMs: params.action.ended_at_ms, - queryId: params.queries.at(-1)?.query_id ?? null, - turnId: params.turns.at(-1)?.turn_id ?? 
null, - toolName: null, - toolCallId: null, - output: "action completed", - problem: "", - fix: "", - evidenceRefs: [], - }) - - return coalescePhases(mergeSeeds(seeds)) + return buildSummaryPhases(params.action, params.queries, params.turns, params.tools) } diff --git a/scripts/observability/lib/repair_chain_detector.ts b/scripts/observability/lib/repair_chain_detector.ts new file mode 100644 index 0000000000..455daa88a3 --- /dev/null +++ b/scripts/observability/lib/repair_chain_detector.ts @@ -0,0 +1,147 @@ +import type { ArtifactRecord, PhaseRecord, RepairChain, RichToolCall } from "./deep_action_types" + +function unique(values: T[]): T[] { + return [...new Set(values)] +} + +function shortText(value: string, maxLength = 180): string { + const normalized = value.replace(/\s+/gu, " ").trim() + if (normalized.length <= maxLength) return normalized + return `${normalized.slice(0, maxLength - 3)}...` +} + +function toolMs(tool: RichToolCall): number { + return Date.parse(tool.detected_at ?? tool.completed_at ?? 
new Date(0).toISOString()) +} + +function isProblemTool(tool: RichToolCall): boolean { + return Boolean(tool.success === false || tool.detected_problem || /found|remaining|residue|error|failed|timeout|permission|readonly|locked/iu.test(tool.result_summary_rich)) +} + +function isFixTool(tool: RichToolCall): boolean { + return Boolean( + tool.tool_name === "Edit" || + tool.tool_name === "MultiEdit" || + tool.detected_fix_signal || + /fix|patch|replace|rewrite|remove|delete|rename|chmod|save|regenerate|rerun|修改|修复|替换|删除|重新生成/iu.test( + `${tool.input_summary} ${tool.result_summary_rich}`, + ), + ) +} + +function isRunTool(tool: RichToolCall): boolean { + return tool.tool_name === "Bash" && /\.(py|js|ts|ps1)\b/iu.test(tool.command_or_path) +} + +function isVerificationTool(tool: RichToolCall): boolean { + return /check|verify|scan|grep|read|inspect|find|layout|bounds/iu.test( + `${tool.tool_name} ${tool.input_summary} ${tool.command_or_path} ${tool.result_summary_rich}`, + ) +} + +function rootCauseGuess(text: string): string { + const lowered = text.toLowerCase() + if (/readonly|locked|save|copy2|permission/iu.test(lowered)) return "save_or_permission_repair" + if (/ncalnn|ncalnnn|repeated replace/iu.test(lowered)) return "replacement_pollution_repair" + if (/master|footer|run|xml|a:t/iu.test(lowered)) return "ppt_xml_or_footer_repair" + if (/residue|remaining|found/iu.test(lowered)) return "residue_scan_repair" + return "generic_execution_repair" +} + +function buildChain( + chainIndex: number, + tools: RichToolCall[], + phaseByToolId: Map, +): RepairChain { + const problemTool = tools[0]! + const fixTools = tools.filter(isFixTool) + const verificationTools = tools.filter(isVerificationTool) + const phaseIds = unique(tools.map(tool => phaseByToolId.get(tool.tool_call_id)?.phase_id ?? 
"unknown")) + const artifactPaths = unique(tools.flatMap(tool => [...tool.produced_files, ...tool.result_files, ...tool.touched_files])) + const evidenceRefs = unique(tools.flatMap(tool => tool.evidence_refs)) + const verificationSummary = + verificationTools.at(-1)?.result_summary_rich ?? + tools.at(-1)?.result_summary_rich ?? + "verification unavailable" + const resolved = !verificationTools.some(tool => isProblemTool(tool)) && !isProblemTool(tools.at(-1)!) + + return { + chain_id: `repair_${String(chainIndex).padStart(2, "0")}`, + problem_summary: shortText(problemTool.detected_problem || problemTool.result_summary_rich || problemTool.output_summary), + root_cause_guess: rootCauseGuess( + tools + .map(tool => [tool.detected_problem, tool.detected_fix_signal, tool.result_summary_rich].filter(Boolean).join(" ")) + .join(" "), + ), + fix_actions: unique(fixTools.map(tool => shortText(`${tool.tool_name}: ${tool.command_or_path || tool.input_summary || tool.detected_fix_signal}`))), + verification_summary: shortText(verificationSummary), + tool_call_ids: tools.map(tool => tool.tool_call_id), + phase_ids: phaseIds, + artifact_paths: artifactPaths, + evidence_refs: evidenceRefs, + status: resolved ? "resolved" : "unresolved", + } +} + +export function detectRepairChains(params: { + richTools: RichToolCall[] + phases: PhaseRecord[] + artifacts: ArtifactRecord[] +}): RepairChain[] { + const sortedTools = [...params.richTools].sort((left, right) => toolMs(left) - toolMs(right)) + const phaseByToolId = new Map() + for (const phase of params.phases) { + for (const toolCallId of phase.phase_tool_call_ids) { + phaseByToolId.set(toolCallId, phase) + } + } + + const chains: RepairChain[] = [] + const used = new Set() + + for (let index = 0; index < sortedTools.length; index += 1) { + const start = sortedTools[index]! 
+ if (used.has(start.tool_call_id) || !isProblemTool(start)) continue + + const windowTools = [start] + let sawFix = false + let sawRerun = false + let sawVerification = false + const startMs = toolMs(start) + + for (let cursor = index + 1; cursor < sortedTools.length; cursor += 1) { + const current = sortedTools[cursor]! + if (toolMs(current) - startMs > 10 * 60 * 1000) break + if (current.query_id !== start.query_id && current.agent_name === start.agent_name) break + + const relatedArtifact = current.touched_files.some(path => start.produced_files.includes(path) || start.result_files.includes(path)) + const sameLoop = + isFixTool(current) || + isRunTool(current) || + isVerificationTool(current) || + relatedArtifact || + /readonly|locked|save|copy2|permission|ncalnn|ncalnnn|master|footer|xml|a:t|residue|remaining/iu.test( + `${current.input_summary} ${current.result_summary_rich}`, + ) + + if (!sameLoop) continue + + windowTools.push(current) + if (isFixTool(current)) sawFix = true + if (isRunTool(current) && sawFix) sawRerun = true + if (isVerificationTool(current) && (sawFix || sawRerun)) sawVerification = true + } + + const denseLoop = + windowTools.length >= 4 && + windowTools.filter(tool => isFixTool(tool) || isVerificationTool(tool) || isRunTool(tool)).length >= 3 + + if ((sawFix && sawRerun) || (sawFix && sawVerification) || denseLoop) { + const chain = buildChain(chains.length + 1, windowTools, phaseByToolId) + chains.push(chain) + for (const tool of windowTools) used.add(tool.tool_call_id) + } + } + + return chains +} diff --git a/scripts/observability/lib/tool_result_extractor.ts b/scripts/observability/lib/tool_result_extractor.ts new file mode 100644 index 0000000000..5e2534f742 --- /dev/null +++ b/scripts/observability/lib/tool_result_extractor.ts @@ -0,0 +1,404 @@ +import type { + JsonValue, + RichToolCall, + SnapshotRecord, + ToolResultCandidate, + TurnSnapshotBundle, +} from "./deep_action_types" + +const PROBLEM_KEYWORDS = [ + "error", + "failed", 
+ "failure", + "denied", + "permission", + "readonly", + "locked", + "timeout", + "interrupted", + "traceback", + "exception", + "residue", + "remaining", + "found", + "bfz", + "gdc", + "\u53ef\u9006SOFC", + "\u53f6\u5148\u5706", + "2024", + "ncalnn", + "ncalnnn", +] + +const FIX_KEYWORDS = [ + "fix", + "patch", + "replace", + "rewrite", + "remove", + "delete", + "rename", + "chmod", + "save", + "regenerate", + "rerun", + "\u4fee\u6539", + "\u4fee\u590d", + "\u66ff\u6362", + "\u5220\u9664", + "\u91cd\u65b0\u751f\u6210", +] + +const FILE_HINT_KEYWORDS = [ + "saved", + "generated", + "written", + "output", + "created", + "exported", + "\u6587\u4ef6\u4f4d\u4e8e", + "\u5df2\u751f\u6210", +] + +const FILE_PATTERN = + /([A-Za-z]:[\\/][^\s"'`<>|]+|(?:\.{1,2}[\\/])?[\w .-]+(?:[\\/][\w .-]+)*\.(?:docx|pptx|txt|json|py|js|ts|ps1|csv|md|xml|html|png|jpg|jpeg|svg|pdf|xlsx|output))/giu + +function unique(values: T[]): T[] { + return [...new Set(values)] +} + +function asRecord(value: JsonValue | null | undefined): Record | null { + if (!value || typeof value !== "object" || Array.isArray(value)) return null + return value as Record +} + +function asArray(value: JsonValue | null | undefined): JsonValue[] { + return Array.isArray(value) ? value : [] +} + +function squash(text: string, maxLength = 220): string { + const normalized = text + .replace(//giu, "") + .replace(/<\/local-command-(stdout|stderr)>/giu, "") + .replace(/\s+/gu, " ") + .trim() + if (normalized.length <= maxLength) return normalized + return `${normalized.slice(0, maxLength - 3)}...` +} + +function stringify(value: JsonValue | null | undefined): string { + if (value === null || value === undefined) return "" + if (typeof value === "string") return value + return JSON.stringify(value) +} + +function extractFiles(text: string): string[] { + return unique([...text.matchAll(FILE_PATTERN)].map(match => (match[1] ?? 
"").trim()).filter(Boolean)) +} + +function findKeywordSummary(texts: string[], keywords: string[]): string { + const text = texts.filter(Boolean).join(" \n ") + const lowered = text.toLowerCase() + for (const keyword of keywords) { + const index = lowered.indexOf(keyword.toLowerCase()) + if (index < 0) continue + return squash(text.slice(Math.max(0, index - 40), index + 180)) + } + return "" +} + +function summarizeStructuredResult(record: Record): { + textSummary: string + stdoutSummary: string + stderrSummary: string + errorSummary: string + status: string + resultFiles: string[] +} { + const message = asRecord(record.message) + const toolUseResult = asRecord(record.toolUseResult) + const content = [...asArray(record.content), ...asArray(message?.content)] + + const textParts = content.flatMap(item => { + const block = asRecord(item) + if (!block) return [] + if (block.type === "text" && typeof block.text === "string") return [block.text] + if (block.type === "tool_result") { + return asArray(block.content).map(piece => { + const pieceRecord = asRecord(piece) + if (pieceRecord?.type === "text" && typeof pieceRecord.text === "string") return pieceRecord.text + return stringify(piece) + }) + } + return [] + }) + + const stdoutSummary = squash( + [ + typeof record.stdout === "string" ? record.stdout : "", + typeof toolUseResult?.stdout === "string" ? (toolUseResult.stdout as string) : "", + ] + .filter(Boolean) + .join("\n"), + ) + const stderrSummary = squash( + [ + typeof record.stderr === "string" ? record.stderr : "", + typeof toolUseResult?.stderr === "string" ? (toolUseResult.stderr as string) : "", + ] + .filter(Boolean) + .join("\n"), + ) + const errorSummary = squash( + [ + typeof record.error === "string" ? record.error : "", + typeof toolUseResult?.error === "string" ? (toolUseResult.error as string) : "", + typeof record.failure_reason === "string" ? 
record.failure_reason : "", + ] + .filter(Boolean) + .join("\n"), + ) + const status = squash( + [ + typeof record.status === "string" ? record.status : "", + typeof toolUseResult?.status === "string" ? (toolUseResult.status as string) : "", + typeof record.result === "string" ? record.result : "", + ] + .filter(Boolean) + .join(" "), + 80, + ) + const textSummary = squash( + [...textParts, stringify(toolUseResult?.content), stringify(record.result), status] + .filter(Boolean) + .join("\n"), + ) + const resultFiles = unique(extractFiles([textSummary, stdoutSummary, stderrSummary, errorSummary].join("\n"))) + return { textSummary, stdoutSummary, stderrSummary, errorSummary, status, resultFiles } +} + +function collectToolUseIds(record: Record): string[] { + const ids: string[] = [] + if (typeof record.tool_use_id === "string") ids.push(record.tool_use_id) + const message = asRecord(record.message) + for (const content of asArray(message?.content)) { + const contentRecord = asRecord(content) + if (typeof contentRecord?.tool_use_id === "string") ids.push(contentRecord.tool_use_id) + } + return unique(ids) +} + +function walkSnapshot(snapshot: SnapshotRecord, node: JsonValue, collector: ToolResultCandidate[]): void { + if (Array.isArray(node)) { + for (const item of node) walkSnapshot(snapshot, item, collector) + return + } + const record = asRecord(node) + if (!record) return + + const toolUseIds = collectToolUseIds(record) + const structured = + record.type === "tool_result" || + typeof record.stdout === "string" || + typeof record.stderr === "string" || + typeof record.error === "string" || + record.toolUseResult !== undefined + + if (structured && toolUseIds.length > 0) { + const summary = summarizeStructuredResult(record) + for (const toolUseId of toolUseIds) { + collector.push({ + tool_use_id: toolUseId, + snapshot_ref: snapshot.snapshotRef, + category: snapshot.category, + matched_by: "tool_use_id", + text_summary: summary.textSummary, + stdout_summary: 
summary.stdoutSummary, + stderr_summary: summary.stderrSummary, + error_summary: summary.errorSummary, + status: summary.status, + result_files: summary.resultFiles, + warnings: [], + }) + } + } + + for (const value of Object.values(record)) { + walkSnapshot(snapshot, value, collector) + } +} + +function extractCandidatesFromSnapshot(snapshot: SnapshotRecord): ToolResultCandidate[] { + const candidates: ToolResultCandidate[] = [] + walkSnapshot(snapshot, snapshot.data, candidates) + const seen = new Set() + return candidates.filter(candidate => { + const key = [ + candidate.tool_use_id ?? "null", + candidate.snapshot_ref, + candidate.text_summary, + candidate.stdout_summary, + candidate.stderr_summary, + candidate.error_summary, + ].join("|") + if (seen.has(key)) return false + seen.add(key) + return true + }) +} + +function buildFallbackCandidate( + turnSnapshots: TurnSnapshotBundle, + exactCandidates: ToolResultCandidate[], +): ToolResultCandidate | null { + if (exactCandidates.length === 0) return null + return { + tool_use_id: null, + snapshot_ref: + turnSnapshots.afterTurnSnapshots[0]?.snapshotRef ?? + turnSnapshots.relatedSnapshots[0]?.snapshotRef ?? + "unknown", + category: + turnSnapshots.afterTurnSnapshots[0]?.category ?? + turnSnapshots.relatedSnapshots[0]?.category ?? 
+ null, + matched_by: "turn_fallback", + text_summary: squash(exactCandidates.map(item => item.text_summary).filter(Boolean).join("\n")), + stdout_summary: squash(exactCandidates.map(item => item.stdout_summary).filter(Boolean).join("\n")), + stderr_summary: squash(exactCandidates.map(item => item.stderr_summary).filter(Boolean).join("\n")), + error_summary: squash(exactCandidates.map(item => item.error_summary).filter(Boolean).join("\n")), + status: "turn_fallback", + result_files: unique(exactCandidates.flatMap(item => item.result_files)), + warnings: ["after_turn result matched by turn fallback"], + } +} + +function chooseBestCandidate(candidates: ToolResultCandidate[]): ToolResultCandidate | null { + if (candidates.length === 0) return null + return [...candidates].sort((left, right) => { + const leftScore = + (left.stdout_summary ? 4 : 0) + + (left.stderr_summary ? 3 : 0) + + (left.error_summary ? 5 : 0) + + (left.text_summary ? 2 : 0) + + (left.result_files.length > 0 ? 2 : 0) + const rightScore = + (right.stdout_summary ? 4 : 0) + + (right.stderr_summary ? 3 : 0) + + (right.error_summary ? 5 : 0) + + (right.text_summary ? 2 : 0) + + (right.result_files.length > 0 ? 2 : 0) + return rightScore - leftScore + })[0] ?? 
null +} + +export function buildTurnToolResultIndex( + turnSnapshotsByKey: Map, +): { + exactByTurnAndTool: Map + fallbackByTurn: Map +} { + const exactByTurnAndTool = new Map() + const fallbackByTurn = new Map() + const snapshotCache = new Map() + + const cachedCandidates = (snapshot: SnapshotRecord): ToolResultCandidate[] => { + const cached = snapshotCache.get(snapshot.snapshotRef) + if (cached) return cached + const extracted = extractCandidatesFromSnapshot(snapshot) + snapshotCache.set(snapshot.snapshotRef, extracted) + return extracted + } + + for (const [turnKey, bundle] of turnSnapshotsByKey) { + const perTool = new Map() + const limitedSnapshots = bundle.relatedSnapshots.slice(0, 8) + for (const snapshot of limitedSnapshots) { + for (const candidate of cachedCandidates(snapshot)) { + if (!candidate.tool_use_id) continue + const list = perTool.get(candidate.tool_use_id) ?? [] + list.push(candidate) + perTool.set(candidate.tool_use_id, list) + } + } + for (const [toolUseId, candidates] of perTool) { + const chosen = chooseBestCandidate(candidates) + if (chosen) exactByTurnAndTool.set(`${turnKey}|${toolUseId}`, chosen) + } + const fallback = buildFallbackCandidate( + { + ...bundle, + relatedSnapshots: limitedSnapshots, + }, + limitedSnapshots.flatMap(snapshot => cachedCandidates(snapshot)), + ) + if (fallback) fallbackByTurn.set(turnKey, fallback) + } + + return { exactByTurnAndTool, fallbackByTurn } +} + +export function enrichToolCallsWithResults(params: { + tools: RichToolCall[] + turnSnapshotsByKey: Map +}): RichToolCall[] { + const resultIndex = buildTurnToolResultIndex(params.turnSnapshotsByKey) + + return params.tools.map(tool => { + const turnKey = `${tool.query_id ?? "unknown"}|${tool.turn_id ?? "unknown"}` + const exact = resultIndex.exactByTurnAndTool.get(`${turnKey}|${tool.tool_call_id}`) + const fallback = resultIndex.fallbackByTurn.get(turnKey) + const selected = exact ?? fallback + const warnings = [...tool.warnings, ...(selected?.warnings ?? 
[])] + const texts = [ + selected?.error_summary ?? "", + selected?.stderr_summary ?? "", + selected?.stdout_summary ?? "", + selected?.text_summary ?? "", + tool.output_summary, + tool.input_summary, + tool.prompt_summary, + ] + const detectedProblem = findKeywordSummary(texts, PROBLEM_KEYWORDS) + const detectedFixSignal = findKeywordSummary(texts, FIX_KEYWORDS) + const outputHints = findKeywordSummary(texts, FILE_HINT_KEYWORDS) + const resultFiles = unique([ + ...tool.produced_files, + ...(selected?.result_files ?? []), + ...extractFiles( + [selected?.text_summary, selected?.stdout_summary, selected?.stderr_summary, outputHints] + .filter(Boolean) + .join("\n"), + ), + ]) + + const richSummary = squash( + [ + selected?.error_summary ? `error: ${selected.error_summary}` : "", + selected?.stderr_summary ? `stderr: ${selected.stderr_summary}` : "", + selected?.stdout_summary ? `stdout: ${selected.stdout_summary}` : "", + selected?.text_summary ? `result: ${selected.text_summary}` : "", + !selected && tool.output_summary ? tool.output_summary : "", + ] + .filter(Boolean) + .join(" | "), + 320, + ) + + return { + ...tool, + output_summary: richSummary || tool.output_summary, + stdout_summary: selected?.stdout_summary ?? "", + stderr_summary: selected?.stderr_summary ?? "", + error_summary: selected?.error_summary ?? "", + result_summary_rich: richSummary || tool.output_summary, + detected_problem: detectedProblem, + detected_fix_signal: detectedFixSignal, + result_files: resultFiles, + produced_files: unique([...tool.produced_files, ...resultFiles]), + evidence_refs: unique([...tool.evidence_refs, ...(selected?.snapshot_ref ? [selected.snapshot_ref] : [])]), + snapshot_refs: unique([...tool.snapshot_refs, ...(selected?.snapshot_ref ? 
[selected.snapshot_ref] : [])]), + warnings, + } + }) +} diff --git a/scripts/observability/lib/tool_use_extractor.ts b/scripts/observability/lib/tool_use_extractor.ts index fea52a9ee5..ead0a6c4e2 100644 --- a/scripts/observability/lib/tool_use_extractor.ts +++ b/scripts/observability/lib/tool_use_extractor.ts @@ -274,6 +274,13 @@ export function buildRichToolCalls(params: { success: tool.success, input_summary: extracted?.inputSummary ?? "input unavailable", output_summary: output.summary, + stdout_summary: "", + stderr_summary: "", + error_summary: tool.success === false ? output.summary : "", + result_summary_rich: output.summary, + detected_problem: tool.success === false ? output.summary : "", + detected_fix_signal: "", + result_files: [], command_or_path: extracted?.commandOrPath ?? "", intent_inferred: inferIntent( toolName, From a1e290ad4ab969837e42512d87fd06df1f17ff97 Mon Sep 17 00:00:00 2001 From: Bob10492 <70126353+Bob10492@users.noreply.github.com> Date: Sat, 9 May 2026 16:02:09 +0800 Subject: [PATCH 20/26] Defer Ink raw-mode release across input remounts --- packages/@ant/ink/src/hooks/use-input.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/@ant/ink/src/hooks/use-input.ts b/packages/@ant/ink/src/hooks/use-input.ts index 0d5cd55b7b..edd38c8d3f 100644 --- a/packages/@ant/ink/src/hooks/use-input.ts +++ b/packages/@ant/ink/src/hooks/use-input.ts @@ -55,7 +55,15 @@ const useInput = (inputHandler: Handler, options: Options = {}) => { setRawMode(true) return () => { - setRawMode(false) + // Defer raw-mode release until after the current React commit settles. + // During query -> permission prompt/select transitions, React can run + // layout-effect cleanup for an old input owner while the replacement + // input owner is still mounting. Synchronously disabling raw mode in + // that window can leave stdin in cooked mode: Enter still reaches the + // prompt, but arrow-key escape sequences no longer parse as up/down. 
+ setTimeout(() => { + setRawMode(false) + }, 0) } }, [options.isActive, setRawMode]) From 99f534ae1799c08ec698bee2e7b2d606c841281b Mon Sep 17 00:00:00 2001 From: Bob10492 <70126353+Bob10492@users.noreply.github.com> Date: Sat, 9 May 2026 16:07:06 +0800 Subject: [PATCH 21/26] Handle Select arrows directly as fallback --- .../CustomSelect/use-select-input.ts | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/src/components/CustomSelect/use-select-input.ts b/src/components/CustomSelect/use-select-input.ts index b289056ee2..935b325aa4 100644 --- a/src/components/CustomSelect/use-select-input.ts +++ b/src/components/CustomSelect/use-select-input.ts @@ -105,6 +105,28 @@ export const useSelectInput = ({ return focusedOption?.type === 'input' }, [options, state.focusedValue]) + const focusNext = () => { + if (onDownFromLastItem) { + const lastOption = options[options.length - 1] + if (lastOption && state.focusedValue === lastOption.value) { + onDownFromLastItem() + return + } + } + state.focusNextOption() + } + + const focusPrevious = () => { + if (onUpFromFirstItem && state.visibleFromIndex === 0) { + const firstOption = options[0] + if (firstOption && state.focusedValue === firstOption.value) { + onUpFromFirstItem() + return + } + } + state.focusPreviousOption() + } + // Core navigation via keybindings (up/down/enter/escape) // When in input mode, exclude navigation/accept keybindings so that // j/k/enter pass through to the TextInput instead of being intercepted. 
@@ -112,26 +134,8 @@ export const useSelectInput = ({ const handlers: Record void> = {} if (!isInInput) { - handlers['select:next'] = () => { - if (onDownFromLastItem) { - const lastOption = options[options.length - 1] - if (lastOption && state.focusedValue === lastOption.value) { - onDownFromLastItem() - return - } - } - state.focusNextOption() - } - handlers['select:previous'] = () => { - if (onUpFromFirstItem && state.visibleFromIndex === 0) { - const firstOption = options[0] - if (firstOption && state.focusedValue === firstOption.value) { - onUpFromFirstItem() - return - } - } - state.focusPreviousOption() - } + handlers['select:next'] = focusNext + handlers['select:previous'] = focusPrevious handlers['select:accept'] = () => { if (disableSelection === true) return if (state.focusedValue === undefined) return @@ -156,10 +160,10 @@ export const useSelectInput = ({ }, [ options, state, - onDownFromLastItem, - onUpFromFirstItem, isInInput, disableSelection, + focusNext, + focusPrevious, ]) useKeybindings(keybindingHandlers, { @@ -168,7 +172,10 @@ export const useSelectInput = ({ }) // Remaining keys that stay as useInput: number keys, pageUp/pageDown, tab, space, - // and arrow key navigation when in input mode + // and arrow key navigation when in input mode. We also keep direct up/down + // handling here as a defensive fallback for permission prompts after a + // query/tool cycle: if the keybinding context is temporarily stale during + // a modal transition, Select still owns arrow navigation and consumes it. 
useInput( (input, key, event: InputEvent) => { const normalizedInput = normalizeFullWidthDigits(input) @@ -196,28 +203,12 @@ export const useSelectInput = ({ // Arrow keys still navigate the select even while in input mode if (key.downArrow || (key.ctrl && input === 'n')) { - if (onDownFromLastItem) { - const lastOption = options[options.length - 1] - if (lastOption && state.focusedValue === lastOption.value) { - onDownFromLastItem() - event.stopImmediatePropagation() - return - } - } - state.focusNextOption() + focusNext() event.stopImmediatePropagation() return } if (key.upArrow || (key.ctrl && input === 'p')) { - if (onUpFromFirstItem && state.visibleFromIndex === 0) { - const firstOption = options[0] - if (firstOption && state.focusedValue === firstOption.value) { - onUpFromFirstItem() - event.stopImmediatePropagation() - return - } - } - state.focusPreviousOption() + focusPrevious() event.stopImmediatePropagation() return } @@ -229,6 +220,17 @@ export const useSelectInput = ({ return } + if (key.downArrow || (key.ctrl && input === 'n')) { + focusNext() + event.stopImmediatePropagation() + return + } + if (key.upArrow || (key.ctrl && input === 'p')) { + focusPrevious() + event.stopImmediatePropagation() + return + } + if (key.pageDown) { state.focusNextPage() } From 4c60c9c4bc6588b3eb33cf02552eb6daa72bc30a Mon Sep 17 00:00:00 2001 From: Bob10492 <70126353+Bob10492@users.noreply.github.com> Date: Sat, 9 May 2026 17:59:41 +0800 Subject: [PATCH 22/26] Add priority input handlers for modal overlays --- packages/@ant/ink/src/hooks/use-input.ts | 27 ++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/packages/@ant/ink/src/hooks/use-input.ts b/packages/@ant/ink/src/hooks/use-input.ts index edd38c8d3f..ae522aed6b 100644 --- a/packages/@ant/ink/src/hooks/use-input.ts +++ b/packages/@ant/ink/src/hooks/use-input.ts @@ -13,6 +13,15 @@ type Options = { * @default true */ isActive?: boolean + + /** + * Register this input handler before 
existing handlers. + * Useful for modal overlays that must consume navigation keys before + * background inputs, such as Select prompts over the main REPL input. + * + * @default false + */ + priority?: boolean } /** @@ -55,15 +64,7 @@ const useInput = (inputHandler: Handler, options: Options = {}) => { setRawMode(true) return () => { - // Defer raw-mode release until after the current React commit settles. - // During query -> permission prompt/select transitions, React can run - // layout-effect cleanup for an old input owner while the replacement - // input owner is still mounting. Synchronously disabling raw mode in - // that window can leave stdin in cooked mode: Enter still reaches the - // prompt, but arrow-key escape sequences no longer parse as up/down. - setTimeout(() => { - setRawMode(false) - }, 0) + setRawMode(false) } }, [options.isActive, setRawMode]) @@ -89,12 +90,16 @@ const useInput = (inputHandler: Handler, options: Options = {}) => { }) useEffect(() => { - internal_eventEmitter?.on('input', handleData) + if (options.priority) { + internal_eventEmitter?.prependListener('input', handleData) + } else { + internal_eventEmitter?.on('input', handleData) + } return () => { internal_eventEmitter?.removeListener('input', handleData) } - }, [internal_eventEmitter, handleData]) + }, [internal_eventEmitter, handleData, options.priority]) } export default useInput From 3d422c08755382decde3c7794e0ad4c5bcbd0f44 Mon Sep 17 00:00:00 2001 From: ZSN <1067700646@qq.com> Date: Sat, 9 May 2026 18:49:21 +0800 Subject: [PATCH 23/26] feat: V1.1 deep_explain feedback loop with layered output and noise reduction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add GraphProfile system: overview / part chunks / full / debug / artifact - Add chunked Mermaid output files with graph_manifest.json + graph_index.md - Add artifact_flow.mmd showing input→intermediate→script→final chain - Add size guard warning (>80KB / >300 nodes) to 
deep_report - Fix noise: detected_problem only from errors/stderr/scan, not Agent prompts - Fix artifact classification: template→input, versioned→intermediate, final→final - Restrict turn fallback to single-tool turns only - Move action-reports to ObservrityTask/v1/06-运行报告 - Add V1.1 development summary doc Co-Authored-By: Claude Opus 4.6 --- .duckdb-py/bin/python.exe | Bin 0 -> 139799 bytes .duckdb-py/bin/python3.12.exe | Bin 0 -> 139799 bytes .duckdb-py/bin/python3.exe | Bin 0 -> 139799 bytes .duckdb-py/bin/python3w.exe | Bin 0 -> 111635 bytes .duckdb-py/bin/pythonw.exe | Bin 0 -> 111635 bytes .duckdb-py/pyvenv.cfg | 5 + .githooks/pre-commit | 2 +- .tmp_action_0e05fe1b.json | 1 + .tmp_action_0e05fe1b_export_files.txt | 213 ++ .tmp_action_0e05fe1b_export_summary.json | 8 + CLAUDE.md | 4 + .../screenshot-20260423-221043.png" | Bin 0 -> 136745 bytes .../user_action_0e05fe1b_auto_report.md" | 660 +++++ ...20\347\240\201\347\211\210\357\274\211.md" | 735 +++++- .../deep_explain_V1.1_feedback_loop.md" | 93 + .../deep_action_feedback_dag_task_spec.md" | 706 ++++++ .../deep/README.md" | 0 .../user_action_0e05fe1b/artifact_chain.csv" | 30 + .../user_action_0e05fe1b/artifact_flow.mmd" | 237 ++ .../baseline_action_report.md" | 660 +++++ .../debug_chain_flow.mmd" | 53 + .../deep/user_action_0e05fe1b/deep_report.md" | 1871 ++++++++++++++ .../deep/user_action_0e05fe1b/graph_index.md" | 33 + .../user_action_0e05fe1b/graph_manifest.json" | 142 ++ .../phase_timeline_mapping.csv" | 61 + .../rich_stage_flow.full.mmd" | 1600 ++++++++++++ .../user_action_0e05fe1b/rich_stage_flow.mmd" | 1600 ++++++++++++ .../rich_stage_flow.overview.mmd" | 200 ++ .../rich_stage_flow.part_01_phase_01_10.mmd" | 178 ++ .../rich_stage_flow.part_02_phase_11_20.mmd" | 244 ++ .../rich_stage_flow.part_03_phase_21_30.mmd" | 292 +++ .../rich_stage_flow.part_04_phase_31_40.mmd" | 241 ++ .../rich_stage_flow.part_05_phase_41_50.mmd" | 244 ++ .../rich_stage_flow.part_06_phase_51_60.mmd" | 262 ++ 
.../snapshot_evidence_index.csv" | 2225 +++++++++++++++++ .../user_action_0e05fe1b/tool_calls_rich.csv" | 1758 +++++++++++++ ...45\344\270\273\345\257\274\357\274\211.md" | 68 + ...75\344\273\273\345\212\241\344\271\246.md" | 0 ...47\350\241\214\346\270\205\345\215\225.md" | 0 ...\212\250\345\256\236\347\216\260runner.md" | 0 ...75\344\273\273\345\212\241\344\271\246.md" | 924 +++++++ ...ha\344\273\273\345\212\241\344\271\246.md" | 0 ...65\344\273\273\345\212\241\344\271\246.md" | 0 ...45\344\270\273\345\257\274\357\274\211.md" | 87 + ...65\344\273\273\345\212\241\344\271\246.md" | 898 +++++++ ...65\344\273\273\345\212\241\344\271\246.md" | 1792 +++++++++++++ ...ta\344\273\273\345\212\241\344\271\246.md" | 509 ++++ ...ha\344\273\273\345\212\241\344\271\246.md" | 970 +++++++ .../v2_1_experiment_loop_patch_pack.md" | 0 ...41\344\273\273\345\212\241\344\271\246.md" | 898 +++++++ .../README.md" | 46 +- ...05\350\257\273\345\205\245\345\217\243.md" | 33 + .../README.md" | 164 +- ...ndidate_session_memory_sparse_e55a0f28.md" | 33 + ...ndidate_session_memory_sparse_1b6e0b9d.md" | 58 + ...ndidate_session_memory_sparse_8b3d4e6e.md" | 58 + ...ndidate_session_memory_sparse_aa955a44.md" | 58 + ...ndidate_session_memory_sparse_9a16434b.md" | 58 + ...candidate_eval_fixture_shadow_3b12231a.md" | 58 + ...ndidate_session_memory_sparse_15460460.md" | 58 + ...candidate_eval_fixture_shadow_106533c5.md" | 58 + ...ndidate_session_memory_sparse_d8c6f5f8.md" | 58 + ...candidate_eval_fixture_shadow_84a38e91.md" | 58 + ...ndidate_session_memory_sparse_fbf5e09d.md" | 58 + ...candidate_eval_fixture_shadow_ae2c9563.md" | 58 + ...ndidate_session_memory_sparse_f0bf222d.md" | 58 + ...candidate_eval_fixture_shadow_44f81026.md" | 58 + ...ndidate_session_memory_sparse_de72c558.md" | 58 + ...candidate_eval_fixture_shadow_3d7af2d8.md" | 58 + ...ndidate_session_memory_sparse_9a23ca8f.md" | 58 + ...candidate_eval_fixture_shadow_ed72e583.md" | 58 + ...ndidate_session_memory_sparse_7bb29ac2.md" | 58 + 
...candidate_eval_fixture_shadow_2614401b.md" | 58 + ..._long_context_fixture_guarded_b1c79e38.md" | 67 + ..._long_context_fixture_guarded_dae80196.md" | 34 + ..._long_context_fixture_guarded_cef43fc7.md" | 34 + ..._long_context_fixture_guarded_719b0b16.md" | 34 + ..._long_context_fixture_guarded_d511b6bb.md" | 34 + ..._long_context_fixture_guarded_a669b877.md" | 34 + ..._long_context_fixture_guarded_4fc6ada1.md" | 34 + ..._long_context_fixture_guarded_9a66e2de.md" | 34 + ..._long_context_fixture_guarded_1ce68f72.md" | 34 + ...ndidate_session_memory_sparse_e165a301.md" | 26 + ...candidate_eval_fixture_shadow_a14307d2.md" | 25 + ...ndidate_session_memory_sparse_76a538e5.md" | 26 + ...candidate_eval_fixture_shadow_2f764a55.md" | 25 + ...ndidate_session_memory_sparse_07052af2.md" | 26 + ...candidate_eval_fixture_shadow_6c85b5a2.md" | 25 + ...ndidate_session_memory_sparse_4a936d1b.md" | 26 + ...candidate_eval_fixture_shadow_828b0684.md" | 25 + ...ndidate_session_memory_sparse_96004ff8.md" | 68 + ...ndidate_session_memory_sparse_1e5948a5.md" | 26 + ...candidate_eval_fixture_shadow_09f1deec.md" | 25 + ...ndidate_session_memory_sparse_862641d4.md" | 26 + ...candidate_eval_fixture_shadow_61d3ed8d.md" | 25 + ...ndidate_session_memory_sparse_c53e147c.md" | 26 + ...candidate_eval_fixture_shadow_1afeb0f4.md" | 25 + ...ndidate_session_memory_sparse_242dc6f0.md" | 26 + ...candidate_eval_fixture_shadow_59258ce7.md" | 25 + ..._long_context_fixture_guarded_4be1715e.md" | 34 + ..._long_context_fixture_guarded_6124af22.md" | 34 + ..._long_context_fixture_guarded_1abcd4c9.md" | 34 + ..._long_context_fixture_guarded_6d06184d.md" | 34 + ..._long_context_fixture_guarded_23354a67.md" | 34 + ..._long_context_fixture_guarded_a3fd72c9.md" | 34 + ..._long_context_fixture_guarded_6488e757.md" | 34 + ..._long_context_fixture_guarded_8c630899.md" | 34 + ...ndidate_session_memory_sparse_54964348.md" | 68 + ...te_harness_smoke_2026-05-02T051002379Z.md" | 61 + 
...te_harness_smoke_2026-05-02T151233517Z.md" | 98 + ...te_harness_smoke_2026-05-02T152948409Z.md" | 100 + ...te_harness_smoke_2026-05-02T154129980Z.md" | 100 + ...robustness_smoke_2026-05-03T070927523Z.md" | 228 ++ ...xt_fixture_smoke_2026-05-03T070957231Z.md" | 351 +++ ...ntext_real_smoke_2026-05-03T060617173Z.md" | 151 ++ ...ntext_real_smoke_2026-05-03T145644822Z.md" | 152 ++ ...moke_minimal_baseline_default_04e0bac9.md" | 49 + ...ndidate_session_memory_sparse_e55a0f28.md" | 49 + ...moke_minimal_baseline_default_9d0393b9.md" | 76 + ...ndidate_session_memory_sparse_1b6e0b9d.md" | 76 + ...moke_minimal_baseline_default_4c910090.md" | 76 + ...ndidate_session_memory_sparse_8b3d4e6e.md" | 76 + ...moke_minimal_baseline_default_c0d23f4f.md" | 76 + ...ndidate_session_memory_sparse_aa955a44.md" | 76 + ...moke_minimal_baseline_default_44ac96e8.md" | 70 + ...ndidate_session_memory_sparse_9a16434b.md" | 70 + ...candidate_eval_fixture_shadow_3b12231a.md" | 70 + ...moke_minimal_baseline_default_cb8962ff.md" | 70 + ...ndidate_session_memory_sparse_15460460.md" | 70 + ...candidate_eval_fixture_shadow_106533c5.md" | 70 + ..._minimal_alt_baseline_default_3f9bbfe6.md" | 70 + ...ndidate_session_memory_sparse_d8c6f5f8.md" | 70 + ...candidate_eval_fixture_shadow_84a38e91.md" | 70 + ..._minimal_alt_baseline_default_1f65e9f5.md" | 70 + ...ndidate_session_memory_sparse_fbf5e09d.md" | 70 + ...candidate_eval_fixture_shadow_ae2c9563.md" | 70 + ...moke_minimal_baseline_default_290cc011.md" | 70 + ...ndidate_session_memory_sparse_f0bf222d.md" | 70 + ...candidate_eval_fixture_shadow_44f81026.md" | 70 + ...moke_minimal_baseline_default_2296c3b6.md" | 70 + ...ndidate_session_memory_sparse_de72c558.md" | 70 + ...candidate_eval_fixture_shadow_3d7af2d8.md" | 70 + ..._minimal_alt_baseline_default_74a94fd2.md" | 70 + ...ndidate_session_memory_sparse_9a23ca8f.md" | 70 + ...candidate_eval_fixture_shadow_ed72e583.md" | 70 + ..._minimal_alt_baseline_default_5b189848.md" | 70 + 
...ndidate_session_memory_sparse_7bb29ac2.md" | 70 + ...candidate_eval_fixture_shadow_2614401b.md" | 70 + ...ct_retrieval_baseline_default_d02d9ca2.md" | 106 + ...l_real_smoke_baseline_default_b963e6da.md" | 105 + ...ndidate_session_memory_sparse_96004ff8.md" | 105 + ...l_real_smoke_baseline_default_4015c73b.md" | 105 + ...ndidate_session_memory_sparse_54964348.md" | 105 + ..._contract_v0_baseline_default_0b6a625e.md" | 105 + ...ndidate_session_memory_sparse_a3fb1e0d.md" | 105 + .../README.md" | 65 + ...trac_beta_20260504T080713428Z_b26ab9b5.md" | 223 ++ ...23\350\256\272\347\264\242\345\274\225.md" | 13 + .../README.md" | 43 + .../_manual_conclusion.template.md" | 61 + ...tation_contract_v0_20260504T080713320Z.md" | 71 + .../v2/README.md" | 134 +- ObservrityTask/README.md | 41 +- ...0\346\234\211\345\206\205\345\256\271.txt" | 1 + scripts/evals/v2_create_manual_conclusion.ts | 378 +++ scripts/evals/v2_run_feedback.ts | 18 +- scripts/observability/deep_explain_action.ts | 125 +- scripts/observability/lib/artifact_tracker.ts | 103 +- .../observability/lib/deep_action_types.ts | 30 + .../observability/lib/deep_report_writer.ts | 45 +- .../observability/lib/mermaid_rich_graph.ts | 265 ++ .../lib/repair_chain_detector.ts | 20 +- .../lib/tool_result_extractor.ts | 61 +- .../rebuild_observability_db.ps1 | 8 +- .../CustomSelect/use-select-input.ts | 10 +- src/components/LogoV2/AnimatedClawd.tsx | 2 +- src/components/LogoV2/Clawd.tsx | 168 +- src/components/LogoV2/WelcomeV2.tsx | 181 +- src/query.ts | 199 +- src/services/api/claude.ts | 50 + tests/evals/v2/README.md | 15 +- tests/evals/v2/V2.5-feedback-loop-usage.md | 23 + ...e_harness_smoke_2026-05-02T051002379Z.json | 372 +++ ...e_harness_smoke_2026-05-02T151233517Z.json | 500 ++++ ...e_harness_smoke_2026-05-02T152948409Z.json | 501 ++++ ...e_harness_smoke_2026-05-02T154129980Z.json | 501 ++++ ...contract_20260504T080713428Z_49d7f7a4.json | 25 + ...tract_v0_20260504T080713428Z_9800acad.json | 25 + 
...contract_20260504T080713428Z_61e2eafe.json | 20 + ...tract_v0_20260504T080713428Z_c0000d1b.json | 20 + ..._real_sm_20260504T080713428Z_bb73752c.json | 16 + ..._real_sm_20260504T080713428Z_cab49a4f.json | 16 + ...l_review_20260504T080713428Z_a8bd7226.json | 16 + ..._retriev_20260504T080713428Z_d58b1348.json | 16 + ...positive_20260504T080713428Z_1db87f20.json | 16 + ...nclusive_20260504T080713428Z_c78c9500.json | 16 + ...ontract__20260504T080713428Z_8e1909f3.json | 25 + ...tability_20260504T080713428Z_a143639b.json | 24 + ...contract_20260504T080713428Z_4857af82.json | 27 + ...tract_v0_20260504T080713428Z_66f265df.json | 24 + ...rac_beta_20260504T080713428Z_b26ab9b5.json | 82 + ...oke_minimal_baseline_default_04e0bac9.json | 131 + ...didate_session_memory_sparse_e55a0f28.json | 132 + ...oke_minimal_baseline_default_9d0393b9.json | 164 ++ ...didate_session_memory_sparse_1b6e0b9d.json | 171 ++ ...oke_minimal_baseline_default_4c910090.json | 164 ++ ...didate_session_memory_sparse_8b3d4e6e.json | 165 ++ ...oke_minimal_baseline_default_c0d23f4f.json | 164 ++ ...didate_session_memory_sparse_aa955a44.json | 165 ++ .../v2/scores/_manual-review.template.json | 10 - ...imal_baseline_default_04e0bac9.scores.json | 52 + ...session_memory_sparse_e55a0f28.scores.json | 52 + ...imal_baseline_default_9d0393b9.scores.json | 52 + ...session_memory_sparse_1b6e0b9d.scores.json | 52 + ...imal_baseline_default_4c910090.scores.json | 52 + ...session_memory_sparse_8b3d4e6e.scores.json | 52 + ...imal_baseline_default_c0d23f4f.scores.json | 52 + ...session_memory_sparse_aa955a44.scores.json | 52 + ...2_1_bind_runner_2026-05-01T152538693Z.json | 94 + ...2_1_bind_runner_2026-05-02T015153520Z.json | 94 + ...2_1_bind_runner_2026-05-02T184101202Z.json | 94 + ...2_1_bind_runner_2026-05-03T051916661Z.json | 94 + ...e_harness_alpha_2026-05-01T152603692Z.json | 89 + ...e_harness_alpha_2026-05-02T015220905Z.json | 89 + ...e_harness_alpha_2026-05-02T034708205Z.json | 87 + 
...e_harness_alpha_2026-05-02T034906732Z.json | 87 + ...e_harness_alpha_2026-05-02T034956692Z.json | 87 + ...e_harness_alpha_2026-05-02T035227154Z.json | 89 + ...e_harness_alpha_2026-05-02T044801603Z.json | 89 + ...e_harness_alpha_2026-05-02T050005830Z.json | 89 + ...e_harness_alpha_2026-05-02T132242657Z.json | 89 + ...e_harness_alpha_2026-05-02T150900925Z.json | 85 + ...e_harness_alpha_2026-05-02T150946774Z.json | 85 + ...e_harness_alpha_2026-05-02T151140507Z.json | 89 + ...e_harness_alpha_2026-05-02T152641622Z.json | 89 + ...e_harness_alpha_2026-05-02T152846325Z.json | 89 + ...e_harness_alpha_2026-05-02T162534789Z.json | 89 + ...e_harness_alpha_2026-05-02T184125532Z.json | 89 + ...e_harness_alpha_2026-05-03T051916703Z.json | 89 + tools/duckdb/duckdb_cli-windows-amd64.zip | Bin 0 -> 13079721 bytes 240 files changed, 34927 insertions(+), 473 deletions(-) create mode 100644 .duckdb-py/bin/python.exe create mode 100644 .duckdb-py/bin/python3.12.exe create mode 100644 .duckdb-py/bin/python3.exe create mode 100644 .duckdb-py/bin/python3w.exe create mode 100644 .duckdb-py/bin/pythonw.exe create mode 100644 .duckdb-py/pyvenv.cfg create mode 100644 .tmp_action_0e05fe1b.json create mode 100644 .tmp_action_0e05fe1b_export_files.txt create mode 100644 .tmp_action_0e05fe1b_export_summary.json create mode 100644 "ObservrityTask/00-\350\265\204\346\226\231\350\276\223\345\205\245/screenshot-20260423-221043.png" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/03-\346\240\267\344\276\213/user_action_0e05fe1b_auto_report.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/04-\344\270\223\351\242\230\347\240\224\347\251\266/deep_explain_V1.1_feedback_loop.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/05-\344\273\273\345\212\241\344\271\246/deep_action_feedback_dag_task_spec.md" rename ObservrityTask/action-reports/deep/README.md => 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/README.md" (100%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/artifact_chain.csv" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/artifact_flow.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/baseline_action_report.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/debug_chain_flow.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/deep_report.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/graph_index.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/graph_manifest.json" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/phase_timeline_mapping.csv" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.full.mmd" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.overview.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_01_phase_01_10.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_02_phase_11_20.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_03_phase_21_30.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_04_phase_31_40.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_05_phase_41_50.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/rich_stage_flow.part_06_phase_51_60.mmd" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/snapshot_evidence_index.csv" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v1/06-\350\277\220\350\241\214\346\212\245\345\221\212/deep/user_action_0e05fe1b/tool_calls_rich.csv" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/01-\346\200\273\350\247\210/V2.5\345\275\223\345\211\215\344\275\277\347\224\250\346\226\271\345\274\217\357\274\210\344\272\272\345\267\245\344\270\273\345\257\274\357\274\211.md" rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2\347\254\254\344\270\200\351\230\266\346\256\265\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/00-\351\230\266\346\256\265\346\200\273\350\267\257\347\272\277/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2\347\254\254\344\270\200\351\230\266\346\256\265\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246.md" (100%) rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2\347\254\254\344\270\200\351\230\266\346\256\265\346\211\247\350\241\214\346\270\205\345\215\225.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/00-\351\230\266\346\256\265\346\200\273\350\267\257\347\272\277/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2\347\254\254\344\270\200\351\230\266\346\256\265\346\211\247\350\241\214\346\270\205\345\215\225.md" (100%) rename 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/01-V2.1-V2.2/V2.1\344\273\216\346\211\213\345\212\250\347\273\221\345\256\232\345\210\260\350\207\252\345\212\250\345\256\236\347\216\260runner.md" (100%) create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/01-V2.1-V2.2/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2Beta\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246.md" rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/01-V2.1-V2.2/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2alpha\344\273\273\345\212\241\344\271\246.md" (100%) rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2_2.1\351\230\266\346\256\265\344\273\273\345\212\241\344\271\246" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/01-V2.1-V2.2/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2_2.1\351\230\266\346\256\265\344\273\273\345\212\241\344\271\246.md" (100%) create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/02-V2.3-V2.5/V2.5\346\224\266\346\225\233\346\226\271\346\241\210\357\274\210\344\272\272\345\267\245\344\270\273\345\257\274\357\274\211.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/02-V2.3-V2.5/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.3\351\230\266\346\256\265\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/02-V2.3-V2.5/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.4\351\230\266\346\256\265\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/02-V2.3-V2.5/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.5Beta\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/02-V2.3-V2.5/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.5alpha\344\273\273\345\212\241\344\271\246.md" rename "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/v2_1_experiment_loop_patch_pack.md" => "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/90-\345\216\206\345\217\262\350\241\245\344\270\201\344\270\216\350\277\207\346\270\241\347\250\277/v2_1_experiment_loop_patch_pack.md" (100%) create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/02-\345\256\236\346\226\275\344\273\273\345\212\241\344\271\246/90-\345\216\206\345\217\262\350\241\245\344\270\201\344\270\216\350\277\207\346\270\241\347\250\277/\345\217\257\350\247\202\346\265\213\347\263\273\347\273\237V2.2.5\345\210\260V2.5\350\277\207\346\270\241\344\273\273\345\212\241\344\271\246.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/00-\351\230\205\350\257\273\345\205\245\345\217\243.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T050952070Z_execute_harness_smoke_minimal_baseline_default_04e0bac9_vs_run_2026-05-02T051002218Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_e55a0f28.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T151221799Z_execute_harness_smoke_minimal_baseline_default_9d0393b9_vs_run_2026-05-02T151233323Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1b6e0b9d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T152932165Z_execute_harness_smoke_minimal_baseline_default_4c910090_vs_run_2026-05-02T152948229Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_8b3d4e6e.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-02T154112175Z_execute_harness_smoke_minimal_baseline_default_c0d23f4f_vs_run_2026-05-02T154129799Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_aa955a44.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052005449Z_execute_harness_smoke_minimal_baseline_default_44ac96e8_vs_run_2026-05-03T052006941Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9a16434b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052005449Z_execute_harness_smoke_minimal_baseline_default_44ac96e8_vs_run_2026-05-03T052008567Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_3b12231a.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052010168Z_execute_harness_smoke_minimal_baseline_default_cb8962ff_vs_run_2026-05-03T052011674Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_15460460.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052010168Z_execute_harness_smoke_minimal_baseline_default_cb8962ff_vs_run_2026-05-03T052013327Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_106533c5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052014995Z_robustness_smoke_minimal_alt_baseline_default_3f9bbfe6_vs_run_2026-05-03T052016480Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_d8c6f5f8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052014995Z_robustness_smoke_minimal_alt_baseline_default_3f9bbfe6_vs_run_2026-05-03T052018150Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_84a38e91.md" create mode 
100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052019806Z_robustness_smoke_minimal_alt_baseline_default_1f65e9f5_vs_run_2026-05-03T052021298Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_fbf5e09d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052019806Z_robustness_smoke_minimal_alt_baseline_default_1f65e9f5_vs_run_2026-05-03T052022980Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ae2c9563.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052831406Z_execute_harness_smoke_minimal_baseline_default_290cc011_vs_run_2026-05-03T052832886Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_f0bf222d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052831406Z_execute_harness_smoke_minimal_baseline_default_290cc011_vs_run_2026-05-03T052834543Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_44f81026.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052836209Z_execute_harness_smoke_minimal_baseline_default_2296c3b6_vs_run_2026-05-03T052837654Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_de72c558.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052836209Z_execute_harness_smoke_minimal_baseline_default_2296c3b6_vs_run_2026-05-03T052839283Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_3d7af2d8.md" 
create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052840959Z_robustness_smoke_minimal_alt_baseline_default_74a94fd2_vs_run_2026-05-03T052842454Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_9a23ca8f.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052840959Z_robustness_smoke_minimal_alt_baseline_default_74a94fd2_vs_run_2026-05-03T052844080Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ed72e583.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052845684Z_robustness_smoke_minimal_alt_baseline_default_5b189848_vs_run_2026-05-03T052847130Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_7bb29ac2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T052845684Z_robustness_smoke_minimal_alt_baseline_default_5b189848_vs_run_2026-05-03T052848781Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2614401b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054515896Z_long_context_constraint_retention_baseline_default_75ffb5f8_vs_run_2026-05-03T054515906Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_b1c79e38.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818111Z_long_context_constraint_retention_baseline_default_a803d034_vs_run_2026-05-03T054818121Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_dae80196.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818137Z_long_context_constraint_retention_baseline_default_a2aa0e4d_vs_run_2026-05-03T054818142Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_cef43fc7.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818149Z_long_context_fact_retrieval_baseline_default_18de0c79_vs_run_2026-05-03T054818154Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_719b0b16.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818162Z_long_context_fact_retrieval_baseline_default_e89ede34_vs_run_2026-05-03T054818179Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_d511b6bb.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818186Z_long_context_distractor_resistance_baseline_default_cfc81fcc_vs_run_2026-05-03T054818190Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a669b877.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818198Z_long_context_distractor_resistance_baseline_default_28ac78af_vs_run_2026-05-03T054818204Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_4fc6ada1.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818214Z_long_context_compaction_pressure_baseline_default_5482a952_vs_run_2026-05-03T054818219Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_9a66e2de.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T054818227Z_long_context_compaction_pressure_baseline_default_99e7f903_vs_run_2026-05-03T054818232Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_1ce68f72.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352313Z_execute_harness_smoke_minimal_baseline_default_3a0649af_vs_run_2026-05-03T055352318Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_e165a301.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352313Z_execute_harness_smoke_minimal_baseline_default_3a0649af_vs_run_2026-05-03T055352332Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_a14307d2.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352341Z_execute_harness_smoke_minimal_baseline_default_b25ed043_vs_run_2026-05-03T055352344Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_76a538e5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352341Z_execute_harness_smoke_minimal_baseline_default_b25ed043_vs_run_2026-05-03T055352350Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_2f764a55.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352355Z_robustness_smoke_minimal_alt_baseline_default_a1cc13ee_vs_run_2026-05-03T055352359Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_07052af2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352355Z_robustness_smoke_minimal_alt_baseline_default_a1cc13ee_vs_run_2026-05-03T055352366Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_6c85b5a2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352373Z_robustness_smoke_minimal_alt_baseline_default_5ab05e26_vs_run_2026-05-03T055352377Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_4a936d1b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T055352373Z_robustness_smoke_minimal_alt_baseline_default_5ab05e26_vs_run_2026-05-03T055352384Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_828b0684.md" create mode 
100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da_vs_run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927467Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1e5948a5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927462Z_execute_harness_smoke_minimal_baseline_default_49e858ae_vs_run_2026-05-03T070927478Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_09f1deec.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927487Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_862641d4.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927484Z_execute_harness_smoke_minimal_baseline_default_8600f149_vs_run_2026-05-03T070927491Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_61d3ed8d.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927499Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_c53e147c.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927496Z_robustness_smoke_minimal_alt_baseline_default_231de0ad_vs_run_2026-05-03T070927505Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_1afeb0f4.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927513Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_242dc6f0.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070927510Z_robustness_smoke_minimal_alt_baseline_default_5ee185bf_vs_run_2026-05-03T070927518Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_59258ce7.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957132Z_long_context_constraint_retention_baseline_default_a928b6b2_vs_run_2026-05-03T070957141Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_4be1715e.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957154Z_long_context_constraint_retention_baseline_default_fa3b48d1_vs_run_2026-05-03T070957158Z_long_context_constraint_retention_candidate_long_context_fixture_guarded_6124af22.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957165Z_long_context_fact_retrieval_baseline_default_fdcab6c9_vs_run_2026-05-03T070957170Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_1abcd4c9.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957176Z_long_context_fact_retrieval_baseline_default_70401d6d_vs_run_2026-05-03T070957183Z_long_context_fact_retrieval_candidate_long_context_fixture_guarded_6d06184d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957189Z_long_context_distractor_resistance_baseline_default_4d94c847_vs_run_2026-05-03T070957194Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_23354a67.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957200Z_long_context_distractor_resistance_baseline_default_0f2affa1_vs_run_2026-05-03T070957205Z_long_context_distractor_resistance_candidate_long_context_fixture_guarded_a3fd72c9.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957212Z_long_context_compaction_pressure_baseline_default_c9cab754_vs_run_2026-05-03T070957216Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_6488e757.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T070957222Z_long_context_compaction_pressure_baseline_default_31b412ce_vs_run_2026-05-03T070957227Z_long_context_compaction_pressure_candidate_long_context_fixture_guarded_8c630899.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/compare_run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b_vs_run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T051002379Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T151233517Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T152948409Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_execute_harness_smoke_2026-05-02T154129980Z.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_3_robustness_smoke_2026-05-03T070927523Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_4_long_context_fixture_smoke_2026-05-03T070957231Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_4_long_context_real_smoke_2026-05-03T060617173Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/experiment_v2_4_long_context_real_smoke_2026-05-03T145644822Z.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T050952070Z_execute_harness_smoke_minimal_baseline_default_04e0bac9.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T051002218Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_e55a0f28.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T151221799Z_execute_harness_smoke_minimal_baseline_default_9d0393b9.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T151233323Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1b6e0b9d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T152932165Z_execute_harness_smoke_minimal_baseline_default_4c910090.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T152948229Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_8b3d4e6e.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T154112175Z_execute_harness_smoke_minimal_baseline_default_c0d23f4f.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-02T154129799Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_aa955a44.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052005449Z_execute_harness_smoke_minimal_baseline_default_44ac96e8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052006941Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_9a16434b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052008567Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_3b12231a.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052010168Z_execute_harness_smoke_minimal_baseline_default_cb8962ff.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052011674Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_15460460.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052013327Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_106533c5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052014995Z_robustness_smoke_minimal_alt_baseline_default_3f9bbfe6.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052016480Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_d8c6f5f8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052018150Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_84a38e91.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052019806Z_robustness_smoke_minimal_alt_baseline_default_1f65e9f5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052021298Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_fbf5e09d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052022980Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ae2c9563.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052831406Z_execute_harness_smoke_minimal_baseline_default_290cc011.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052832886Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_f0bf222d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052834543Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_44f81026.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052836209Z_execute_harness_smoke_minimal_baseline_default_2296c3b6.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052837654Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_de72c558.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052839283Z_execute_harness_smoke_minimal_candidate_eval_fixture_shadow_3d7af2d8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052840959Z_robustness_smoke_minimal_alt_baseline_default_74a94fd2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052842454Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_9a23ca8f.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052844080Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_ed72e583.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052845684Z_robustness_smoke_minimal_alt_baseline_default_5b189848.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052847130Z_robustness_smoke_minimal_alt_candidate_session_memory_sparse_7bb29ac2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T052848781Z_robustness_smoke_minimal_alt_candidate_eval_fixture_shadow_2614401b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T055736011Z_long_context_fact_retrieval_baseline_default_d02d9ca2.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T060601212Z_long_context_fact_retrieval_real_smoke_baseline_default_b963e6da.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T060616987Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_96004ff8.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T145624015Z_long_context_fact_retrieval_real_smoke_baseline_default_4015c73b.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T145644621Z_long_context_fact_retrieval_real_smoke_candidate_session_memory_sparse_54964348.md" create mode 100644 
"ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T153208617Z_long_context_fact_retrieval_real_smoke_contract_v0_baseline_default_0b6a625e.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/06-\350\277\220\350\241\214\346\212\245\345\221\212/run_2026-05-03T153229620Z_long_context_fact_retrieval_real_smoke_contract_v0_candidate_session_memory_sparse_a3fb1e0d.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/README.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/07-\345\217\215\351\246\210\346\212\245\345\221\212/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260504T080713428Z_b26ab9b5.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/08-\344\272\272\345\267\245\347\273\223\350\256\272/00-\344\272\272\345\267\245\347\273\223\350\256\272\347\264\242\345\274\225.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/08-\344\272\272\345\267\245\347\273\223\350\256\272/README.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/08-\344\272\272\345\267\245\347\273\223\350\256\272/_manual_conclusion.template.md" create mode 100644 "ObservrityTask/10-\347\263\273\347\273\237\347\211\210\346\234\254/v2/08-\344\272\272\345\267\245\347\273\223\350\256\272/manual_conclusion_v2_5_long_context_real_smoke_expectation_contract_v0_20260504T080713320Z.md" create mode 100644 "docs/\345\215\225\346\254\241\345\217\221\351\200\201\346\211\200\346\234\211\345\206\205\345\256\271.txt" create mode 100644 scripts/evals/v2_create_manual_conclusion.ts create mode 100644 tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T051002379Z.json create mode 100644 
tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T151233517Z.json create mode 100644 tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T152948409Z.json create mode 100644 tests/evals/v2/experiment-runs/execute_harness_smoke_2026-05-02T154129980Z.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260504T080713428Z_49d7f7a4.json create mode 100644 tests/evals/v2/feedback/candidate-proposals/candidate_proposal_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260504T080713428Z_9800acad.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_after_contract_20260504T080713428Z_61e2eafe.json create mode 100644 tests/evals/v2/feedback/experiment-plans/experiment_plan_v2_5_long_context_real_smoke_expectation_contrac_candidate_feedback_input_contract_v0_20260504T080713428Z_c0000d1b.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260504T080713428Z_bb73752c.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_flaky_status_long_context_fact_retrieval_real_sm_20260504T080713428Z_cab49a4f.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_long_context_review_verdict_needs_manual_review_20260504T080713428Z_a8bd7226.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_manual_review_required_long_context_fact_retriev_20260504T080713428Z_d58b1348.json create mode 100644 
tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_missing_score_count_positive_20260504T080713428Z_1db87f20.json create mode 100644 tests/evals/v2/feedback/findings/finding_v2_5_long_context_real_smoke_expectation_contrac_risk_verdict_inconclusive_20260504T080713428Z_c78c9500.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_manual_review_boundary_persisted_after_contract__20260504T080713428Z_8e1909f3.json create mode 100644 tests/evals/v2/feedback/hypotheses/hypothesis_v2_5_long_context_real_smoke_expectation_contrac_runner_or_scenario_instability_20260504T080713428Z_a143639b.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_after_contract_20260504T080713428Z_4857af82.json create mode 100644 tests/evals/v2/feedback/proposals/proposal_v2_5_long_context_real_smoke_expectation_contrac_stabilize_feedback_input_contract_v0_20260504T080713428Z_66f265df.json create mode 100644 tests/evals/v2/feedback/runs/feedback_run_v2_5_long_context_real_smoke_expectation_contrac_beta_20260504T080713428Z_b26ab9b5.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T050952070Z_execute_harness_smoke_minimal_baseline_default_04e0bac9.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T051002218Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_e55a0f28.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T151221799Z_execute_harness_smoke_minimal_baseline_default_9d0393b9.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T151233323Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1b6e0b9d.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T152932165Z_execute_harness_smoke_minimal_baseline_default_4c910090.json create mode 100644 
tests/evals/v2/runs/run_2026-05-02T152948229Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_8b3d4e6e.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T154112175Z_execute_harness_smoke_minimal_baseline_default_c0d23f4f.json create mode 100644 tests/evals/v2/runs/run_2026-05-02T154129799Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_aa955a44.json delete mode 100644 tests/evals/v2/scores/_manual-review.template.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T050952070Z_execute_harness_smoke_minimal_baseline_default_04e0bac9.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T051002218Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_e55a0f28.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T151221799Z_execute_harness_smoke_minimal_baseline_default_9d0393b9.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T151233323Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_1b6e0b9d.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T152932165Z_execute_harness_smoke_minimal_baseline_default_4c910090.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T152948229Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_8b3d4e6e.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T154112175Z_execute_harness_smoke_minimal_baseline_default_c0d23f4f.scores.json create mode 100644 tests/evals/v2/scores/run_2026-05-02T154129799Z_execute_harness_smoke_minimal_candidate_session_memory_sparse_aa955a44.scores.json create mode 100644 tests/evals/v2/verification-reports/v2_1_bind_runner_2026-05-01T152538693Z.json create mode 100644 tests/evals/v2/verification-reports/v2_1_bind_runner_2026-05-02T015153520Z.json create mode 100644 tests/evals/v2/verification-reports/v2_1_bind_runner_2026-05-02T184101202Z.json create mode 100644 tests/evals/v2/verification-reports/v2_1_bind_runner_2026-05-03T051916661Z.json 
create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-01T152603692Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T015220905Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T034708205Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T034906732Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T034956692Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T035227154Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T044801603Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T050005830Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T132242657Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T150900925Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T150946774Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T151140507Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T152641622Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T152846325Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T162534789Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-02T184125532Z.json create mode 100644 tests/evals/v2/verification-reports/v2_2_execute_harness_alpha_2026-05-03T051916703Z.json create mode 100644 tools/duckdb/duckdb_cli-windows-amd64.zip diff --git a/.duckdb-py/bin/python.exe b/.duckdb-py/bin/python.exe new file mode 
100644 index 0000000000000000000000000000000000000000..ed47dbab19a8d8618a5c13cbb271c8f054d60fec GIT binary patch literal 139799 zcmeFa3wTu3+4#FBBoH)lM@bZ`lu?6C6!{XZZK6;!Fu^@C!6=}hs9+#M<&q{d2o@nY zGs^BbkXqZ?e${F(-)F6@y(yP^NdhE*w*d8ms6|xV+jt46+{DiR_pZHXGKtuJ|8t)6 zKhJYcC(q1UdtKi3u6Mn+b=fO5*R*H@G)>Fqn@nojT0;3$so#J8(@gdW!`Gjn{blfT zXRP%EpF3mnoViWjhQ@_AHO`#xt(&=E!NQRD#@XJ+@B;7L1z!KzJIPL6m{~YavENv0#J=)n>$7f}Y)S^7x;h_;tD<~vhpej|7;>5EYyyY=R z<3T_5fe$_Da4P zpDMCPH97^mzCy+y_@2)9dwerKRSff}2B)0rBYo8d^8K%jLPV>i++wF(q2RQeuatX) zZ^oxe%H7!1BuRhB=ch+vU;H!+esdjoiq_CvJBf0C>RYx-;DzQXctR4*OF8N9MtSJ{ zRH^E`>E2jvAs|K-;QmluOFok z7iQBmeR!2-MBXnk;yF|PsA=U}I%Qxq9i(( zOotXxElH-C|KQ&>Ew05{LzC7CovkyL^AlYrfQz03R{SKhz?$bg-Q#dm z6;CH^U9Pmgb}4E{AKpw{=x5OL5%eRN$oc&nq$G+SRYCqA2;!RQFCI;a4fMz#&ovVA zNSfDr5HPXU#W{4)*x*3EULa?TP<@MU@|3HunnWw!wPiFn`z9j}9UF3u_^>x06_Sp* z>|KZ#@0KTx_=QhVB|b26$fHNMQzkx;u0?&s zI&P z(!4=)#0Y>{59$1Td@Caq&&eYpo_{jGK~r$emI8NTaN_y)YjAJQyZpws=+WQO!+6m) zdD-&$Dpa#}IsgM|GJ2$6WA_yI~A=8k~*Y|!ion$H>L(&8%X zn@fZ#UJ01{g619rp&_-*==2vWOKX_(ke*5O5g|B=O)}z>iw)!4A>rfcRjoTr*8X9F z_>{GC3N#H7vVOs{QMwUk-b|gK87+Q*9K(Fkd%?>>i~jnd2FiXC>xi}9Xgn3Iba!&t5GxX_`QL6F|Bl);E2Wd35& zyYV7jxDxgs6GUq*mTFZ&G9i_}7$OBB%3ZfmH!hD1LREG77RmEU3TiRe`ZY8|Ca$6- zgS4r}xt_2nyg3`JD^Hj4QM%251q}1~WYK{SDCY=oz}yLap+uQ*d%%PaZ9*5}FG4Q? 
zRQNR&%nfLz!R9M~HLksWL$aug#K@t6p#rlVcD?;4Fm(>y?|zM^LZ1GrgmXV+>E7b< z=Zg0qAQ9ep zqQ#;@5y|x;BOshCic$vj^c72s=Sd-YAKMi^5kB}f5ZBEn$(oA9^#$S9-?`$h;sQao zO!|6-0;TFMNY_1`x{fFb^mSk1N2F!y$G`E=AH^3aB>x#SP`#;U}z((ie(-%D2FSC5HK)BvqBS zc1DYz1)f87#gp)haEG*#EV`vzfiDc07KqGDx9~2F3%@oP=FVhMC8dN|i<3p)AR5_9 zUa+`DM9HF?9C*^JS!rND1sJgA(WxJkX#EuQsXxz}#@wdGclFwPha$;W z`KCCmW_{&UKq@^{^^gER&%?MXJ%R*M)KM5O(skcjq#AV~+TxzD6dLpTd%#+&`kN?f zc!{(`{&lpPeA(<6BNj%#^DoGYkr+0KOh)#&`pCta9=l5==KmK(pJHaBS)T|A68W11 zV@r`^%!9xV#z)*u(5t`AeTHX?bpl<1tug<4bRjbVa6^}w?a7xV^cT-DJHP@>Zf%twn4Jnnv+7sCY*S!%T3@9wkYb}j6S)EdaPy$d%y#QC z!OlEv-A?`P%b?djn?f%diN6#pFk6H`!QwKMrBaQKsuWe>sLW8YH(8XRj;QNIex3CA zz!8DD)9PTZVI&&PKu>@nD0_G;wp&M~H z*22-taRR>v-sIQ&I`NC~Njkv!G2t=&QfRh6`65Ifq3}8EOa3j=iZixJhQ~YtdZVdq zT@8B4qBoI5eGRIptv+((Og$>o$@N}9THmB))$Aye^fg8neaNbhZ3&GC#?L#Grn-w# z{-C~QVwMro@ zgxsS)CSx5?gt^vGGg)@;AW!aGUiiMpGtoG0sz2AulIJjGhk5PHjPLIMMLG$LY5))6QT+A^w?0Cte1kZr|5Q#@-B7>W8OsTmI*A!?tCAzy-w{7KIpR-l_zM-L^t(HM8^MBztPEgl z&*`&%knBQ=2T5+CmPLcAT*E9nbQ_Pb!dq$*m{A;kqx^9szi7w(0x`>MiyTUZDzPnI zb92N-xK3|3N;flOYCj})AUVW{bmtnCABR7MB^E#(0ebEA_J)0Gou$0BFMNNZ3N{X! zt(Qecw`<{^?qB!KkLRp^L3l1SaD%cNojL!-Gkl7X0C-E}$biuCOi2d0^KkpG=(G$F zDg1Ljz7NK%$O;YYR7-x5BUzy#kt3eaL$R1`g;a6#v?vJ zh?)f*S!IxZe&7N~&97B&F9ZvPZ+UCq8L`4=v3_ExxJo~9X>qMzKha-2BT_j~zwzrk6s|{7tnrrD-gkePAq1!)I$G%157eS0%ot|`apCYA(M6)I zN-RG*o=(*OkG@zhV&aWB^6PZ|Y{@lWl?gSOC1hG}k%as>i)FP&-GzIhFYREGSDm8t zT+S<`;8q)v)}aiZV=seo^!kRzAx6A5H-2Sqz|+N}e*5Y)5^3w#O{&=!zJ*$z3XcMD_;}cSjS(R3Pz&=BU z4!NUh{Av~`^M_J|8T``qme5UxWcIGw?;SD1(ued6H;cS4# z4uqH~&CX>(#y8z}o$q?z4cBex%o$IX$g|Zp7=-RM64}jsH|Zo9k=;Gk5eUajr%Alg z_^_oCly2t3bd<7mXhv|4~S35^~b-PUE}G|AKz9J&#mzsj2!Xm(evnqvx*%#Ql-ai9s}_~ zU%+9oaFO|Ua%n(dOT+tcaeQmVU343^ZlYm2yE;R|#dDt8s%-qKD`c$WV}`tr+1-_v z4;bS?3c%a`xW-C&C?83ImyO&S(@wsOQKz|*i!WSEp{3jHJjGUi>C^Gmu`S_zKVu*Z z=+Uo$oa(U+J)nLBvl~pzKa_!s=of#yE*q7Lt{pcZ`|JscystxJ>ulAPY}J+bB+)wE z?aK@HMGRc;eAO2p#8~AA{z_R!i>3y@@&$5zWu4?0m7Dd|zoi(1rfJRkT|Xyq2kkK? 
zozC>zrQIX-D7Nvccg+-oD;^&db(lgL>CY4=z!a2PemQK}yfolKy+J{R5UCNlN}%>Y z5XPTH=h#(5W%%q0i6rvRz8ma1s}D(wR-2l%;v3e8OWr zjM=A~smN%RVl|@0%~Y?~3wNMny)xTF|KNP;Cr&Od!%E;BCc9)oc(2gGcH7(zA z!5E~fIsD98HJVa%L(74&*+{u;`*d5vPplrPjFi3~gFLi`o*OfaApCu>5vH;ppOwPq zXFMvE+z1Gjne6JmzqkDq{?Sy8SQQ1;l+o$f&1Bf$Ce85=D33nVtjTWq&^Zcvp+J|6 zWS5$^q~%QkA+aP4RaGw%s8Y#kDO*5#Bu#icqxBE!$Pi^?zbm@BrKJ~h2OoDgsRX7e z$5JJWZhl3mJ}jFDRwhFiuvn41Qx-cRHw|9KRc`Gm;~gQPpV=eyoV+{GM!|xrOSw8juQ1S(H9d2~ITBrr!FYJo;hL zDr+^pfJ~u)njblf$>J5z&{81z&D@|_m5cp+Sr9EUq;brr8+$@-# zhTjqQR6+w^LAbkyl(Pe7Re|;TjZ{hGzeLSs(PF^3Eu_a6ynaGv_lqh23i{xi>QPZx zx0K3adpu`3B}8Z9`LZ)NpC{27YHI}|3-K^d1HvD&Z>vg%-nNPv?(Wy?TjDumoyxJ> zsS?lmI!}mMxE;yz5*Osp&d|B^gINLO5~pjpV6O-Xao(XHCrAzU1~&qWPiM}XZY)`J z)=MziT)58}XKNh!y%?tZYsJxIb|_sApzDB^5&(QK@#DbD=@Apd|TAhO+N_Pb*1P=o=r!HnS);P!`0-`1Lhag=*fCXI_@@YeQKg z+Fa5r75;ctUXTe%WqV`p00ReqYa_czttG+A&EdCjUeoeozF;-n05u|qa&I{SVs1dW zsg3YZVfijY?LIK=!NYO+zHSb9Y+7-fCisS5t6y;&Scd;rAGvLy7J5!!Q(ahoz(`Eb z8JaN|jH_d?W!(dZa}W(OUJp`wplfL>I~rf>Ue${~>|Y5cLWYYVQ$et9>}Z@=pSvrc zrc=Fd#bQC=qiRJOY;)*sJe#UQqjZ}uaRo-C@a}+1pRO;^r#)EN27Q=8R&Hy2B$GZ+ zLN-6jw;c0Oz+iK;OOJZDd1>8AbmX5c*(FlH6$I6(f&ixqf}Zvj7xf+I`W3ed`aO*2 z*n#!ph`d9r5N|MRs4r2g@Mj?s)O%?o{BWS^Yu*s<(U#C-hOhxcYgD#}Er!-vmyG8e z#*CG3PvdVeZf!1`4K~a!r<}uOhVJi@q~125|2nDve~|up(uqK2o8r%SRq?^DHfwlF z9)j^LC{xSf(~NpO<7o1(Q{w?cBn5*_AZpYOeni0^qLU4@Uq0@ ztfAEjU-EK1^ofbd?In(Xb{Wp)X2hd$uz$eV++bxZlcz{K2yGAVp;;Gx(DO?7aiXUb z%?lATaRbAH;a>iy0!6KdB=R3Kv4gGGvbcgR1X$6pNMMJ+8p-PYx>HEAUcFBBdyp#e zTxqgs$u`!H24`!n=)`2vXFLJV-Jf)pl%Fxomt?luAhXrQqBAS3&j2Vovw~SS-DkG6 zPMbjznO^Jn!zH%U?66*=7_(Jpi?X-7nf5uo=ikd+y|J{tWPK=Lb}>~w&`}jW&d2!s zDyvGC6*6gP-s=qyM}CUyeP-Sj@F{x>KC`M=40DP|8Vr2oBM)|H$qDr=b}O4vFJ>RqWUlGyuuh>)EE%emE&mF@K*N33 z{%JB{2+GDYn=*EQ!TdQL08seh__#cDfH^5I*1GI#8Q6ql^(zkwiuYI-{FDa{z2BqwI{CIkQTa_#}dVS_$ z0m=?Z(X2bc;pwY|<(3Wbj4Mu2rWXkcO^`xArBL@yxXGv1N2?|64oTaj>>K{V+Vkj8 zH47HL&Z+{TFEV@SN`LW4AK6`KN%nHL|4e^UzuPn!wPYxgEnB&RI8#D#Gt}01qJT56D(sNqStZ1cNG~{tY#< 
zzO_g`(o|q=lEKF9DuG;u!ZWOTo0kpsnPUsP?-M#;yDl5r{gES|IDR=_yF+*Z4tmL` zyRyL0tJ`E3$!v>lUwoR?0sUAkVjoG(DP(=}EU=C<^eEFYjtc1}s|uPgad-REocR&A zO@Dej(iU9trq(Qde$m%kt)sgcSI!=m>sUPzo1XtpDFG8)6zbS9{MXgmM(VrIk6xq3vP=@E)&*I*T5=WnlZw07Dn2M>=VV+SVy4HssQ9JwyMftphWv;3#ttky2XyVp z^t@Nk8sO|XCTvwdI-o?w|#b=fujuzTHzA1~O!Zd)B?^U@RtMb=kEA9UQtzR*(9 z2m3@HR3hrl5ybk@NypF!70QcD=^v=M`YJGt+??m>{vH)4NySA}?A{b$Cr2GAs*$uC zCGC%l0}`F=e%z&d%7696eR*=c>LtI~88n}fC7D${jFPXiRbTy0iNvoc@Rzn$cIm52 z;0a%+PpNK_bJgUjd2fLp{g8t7k?(pnJ^B{W#MN28wZDD%;fKw(fc|I~l5mDBvPC<| zi@fJCHS=Sd!}gV@sM& z2YiDe&=_GRUu20;sGK;iX_lgpv*MINq}VRTNgdr&erSAL0i-yU#&g3%gh=BG5P4@t zq4d72Kf}V^XE}QJOZ%~ly(54rb0{ER$^hdRdh4s^Koc2mnyGBp6F2c#+J)*`^(ZkC zeWovM|5SM}m2s&h8$us!ZzZC~X7by-cc30yOuj-nFT>tWQh21z^76evh@MT!jM0D} zWLU+VS!Yq+9wqsXUrxG=lXbk)AMjW`l?H`CdTb&wIi0%W3-COLpj<{10$;(|z;daOHAuvf)Yw&&w@33VeB|_tqN~r9pT-3=t5j zJ_&ZPko^ZT&8~y5NVk0OH!I_xl0^~+0O;v%9+ZDP)gK>Qu&b0)eCU-W`JH16kVK}k z@+)@EScgU-#xlw-PI4+s=Qzj-n)Ks>%JB<>btd16L$ZJ#YC!R_hhdFj#WX%CuiKl_ zheq9mVxLm7_0?t`RW{?en&XObc2PLW0lb98)B~Huc#JEq99OdJ4U9+gJKoSnjEAG1 zzgY_-k1H0yZ7vA1@*av=dk53P5inN~OJGfCMF?wtm>M8!+yvk1VwxfDYC)*&A zR$Xm>e74_Rca$??2{9ULb%<9ZEbw&*T8)cr{zLq`OuTv-?n_cr^z)mXNH`!C*nh@m ze`0Z2r1hKPA2uf|efs#$zz%!m_#};mUotBHp+^%48a>cgZKP9%oKmoj(nFUWxbUH>0(hLA7)jHbFjt*weqWSFv+*# zZ8@jaQROMqV}7Tcr?)#IpNxfO4Pbv95T2$FjLL(0ghgV}MZPED(WLJwev;k8M4!}6 zQho;hlT3f$42m~ZP}HEZ%$ZVb@g|cG;f%fB&{j1oYPW_@RD6-H*SHykm(8GZ_ql05 zW>SWK;7+n=1tNvMDF{VA=-UbofHcvW_6^q8WHdsx@Rl!ZlF<*>%D#e#sa0tE5~xi& zL0>fnQnJwMTVKY9@$$9Va;9iOd!X&j?0UVSRjWy4 zm-`}b7KDn7#P|aJ#DUD`~k_oIZdQ9Avf&zua{; z39e3bILXU!vIUfz<{VuT;FQGpEN7HE#(T=_<6Ws-iqIYsq3u(CJmno7m#VaO zkSnyY3Dk5)4Z&rsQe(IJQu-!?7!`%@@>j<|=*Om729*-8Dx!psD#DcF!R=AP1zzE) zQhy~2KEfCxM|$)L+N;K~6*0Me5d34$<0mEm4w>gV0}-z|%2==smUxl`pZUCKsy?C{ z`guxhR~MkN+>wwmFqq><#sXQ|q8`fI{s~RH{qUQ+M~FW*M!93L>yib;1nOd3e!F%#ci5Grt)R2ZyK>{kYVlE!+V*#m?6B761F*UfD#3VC{%KIM8M zZ+gP4OF8T*EPk@XFB$x)_#6Io_zF99GF&a{Og6;R3@Do}{*^;TKS_}R_5c!k^^;SB zwIZb^V`J1%O*vYPS1^y6S2fQrsPI^4P%EpCF1t~bWG}JpZ}b)m`Q*qHY!EaL1R28E 
zGY}QVvG+i0k^_lFSt3laKa*~YsaA5A@+>xxifAppKb7!2PUErJZ!@-4k4~gJ_8ow6 z=!2Alracre!dfxm9Y1`+y5O=$=un7fbDI?TAvhCCDYJ}5{N|g?aT)Rzhn+D$jYNA8 zr7}!XfBVVIol{sGN{N5Fz#qR%befn-Jc&r_DZYbkVl6|**GF#kVl=yi1LN1``6@T~ z^qLJBMzfzE77ip2QOk>d0UOSyZgZwA>XP?!82&?_H(WsnF zE=xuZqtv5f70vBWiB6}OukAggWI;!uZI2@@mnX8%#33jMox}=$ae+@iF}qk;ehBzy z$cv-TyF{PwQTlu{`aI~_%|h=N^?BE^^m&))^DQ-rS$Upd<-6=pi6F;zExQb?z}#25 z!EauhCn*(-oy-zK7{qQC)FkzTB}4q?<_xV6{Y6R3pQrb}j_l&| z0=$`n?Ks$YX7_x>=RsLV7;EN@WgwutW8Xtp`0@F~zzei0_FGa${xu&w=|wF*#h%TO zP#cEcsvNHjaCfVkSK%-8&i8xmd4{XM>|E@6rsp8@eX;4i^ZmW-@!(K)%XzAP2VPu# zD2NF?{zJNTuIiRujOdx|&I~#=fA)0Qr#iwIg$BYFdm%}c9+d-@Qr9cy!M1r)^e_(Y zw9lbJO25mv#O9C-y~F1-6#6Q8Nliw;9TGkEwG6Vy;^eBug$ortX`(C;+32y)A_IHZ zyRxdUx`$EAN8*%sc#)5l-?2Qvv@*To>tya>v{?h_gNJNEUKO$A z<&DIZ9{UmoQv6cs-<^9=WW5l}KVU;74buVTNhb&dS-f9I&%%G{rNN;Cn{q?Pb#TgS zl*}F3#mDeAc)BM#{0==%qeS>*w~P9!Z8Vx5kB)((lz_pXVhpLyJ%xXHRLG?V} z6GYHp=-WGpKr>l(*Pm_;lpY?RIQ_WTc5`dBe)%&i_J}TnxOBhE7uk8ln7jR^`uh5| zL&K|C9d0{vW@&qf?HfO|s%vjMGTay0;4$X5N*SZ=(3z!OW_$RZYV)<~(r0~(jMQJj<~3NSW;hXb2f|ME-2Z&vBC9YoTL83*1$*x(xT z)ySLP$d;+rJCdN5!b(^}$O@hLh4qUKPKneu3Hu*V(%T($^^?A4oVT9E=TRJW(_=yn zoDBN()I<&>edviC9u=AuIXp7-GHqJ7083UNq`#^4l{y8#cim}k27a|xHMxFIT^_~R zyIC?$bv1g>~DW(y> zTQNDON|i|4<>&_0o(PIP7^UITSIVy*Lau!hiM={B-QHP(*J)|^j6!rYC6QVEPJ?=g zzA!ekb9Fc@#m{QpP+$F=KyKy7$H_c}lF}%)yswv`?NaAsy|PRmFn9Q5EAW{>9rMIP z!;QHwIShN`On)h6i4oF5BQt6{_OjjF=0j<^?A>;FxZ?25aQIvCaU(e`9?t6=H&O&k z?KSu-TUm0BPsn3AN3S{LfHgO>bX?k1xlN6d-`u88WMA=BRhX+om}wteB(G-MzmTK*&VEpz^{Bejv3Zp3%=4Vt;<<)xG zoXqtyePs_-otHdB;Y}3;tJJRaE?$O~#j@$nV%ZE4BCwn;FNREI!sV`(>Cp}#uqiP@ zR$F|LL!U35iZ1h2b}g?JJ`(thC{{pqXFv3|-bTi{^^En}U%uIo-v#~O=R5tKBg7at~N>?IBB{KB-hSHR-JSOSCR0fw;U3l zlL@zlmgG&D(biiRiw<{*F(z)6?;YoBuLaKKZ#`7r~E0T9hElW$KKtfv-+ zQ^%RrzR8OB9|Aig{)l=Dm(3IN1;4qW#(d5iGg@rUN3Hnb&&W$t73Hlf-qmJE;*D+f zyi>hWI?cHq6k4dbfz6-AysH`|5t&XBy^E-P>Uq()g^(n1`9*2f>n}x%Ic}}V*zG3_ zB(BRUKM+ieh;x=da%e>8Y<&%HB^=OWKO)yiOv;jy7V)6@VZeM{W>1GkgkO~-d*%J* z*L>A|KW?Aw!Y-8tjne(bwj(G)>mSe@*n)047lRO*6Od}5;-%H1TXrRCcLx%US^ApI 
zyqS27e&-sR*4Jzg)D_+n^z2FQ5j`H+09K&_K{qbws%Pd8r5}H>{cWEmCMB&EG#+<) zZmifSly|o0oulI}|DhBD=BMv8_ zU$cRNyR+^agp8l>Nqm~j8D2}oDx*F+_7>_ena|dv^NES$94kRx_|(hdyx12i@~P`G zR(f)onPoR8{61E}2PzC`p?i_TcPnP`9M16&TU_wb zA(E$f#m7*o{Usi=)LuF-V!UEM3y$a}P7>5037c{sf~|fbp8ho6!5YjNQl<%h{b?>e zc$F1M@1=>;Px7tUP20;kp0~?4c(dPZJGj$7xUJ0|?jO9<_t$@Ln%r0VjIXVmw-PF7 zzCN~V$-um3hH7!W97WKh&q}H~957gBQ_<%bi24=pB`;BZ4w_r6T9(jlneU-&WVl53 z;YV?;W@Hcse8m%-VfFl-^4C({>B^LJSMH`OyUvk%Rf=oGPFc6Jm80f+{Y%G=ReB?~ z%uc4s)C+s@u}XIc>}{!mUx`x_#0XTV;}L3znNP_`&Y)MCRq;)pK<;|6oikF0N@{bh zg=F~S9I23_B^B%&=j&@G7hj*!!KiIf%WBkxVIH6z^-Aw2wCp>$fnOHP;0bp3q&%8 zFfE{W{D%}g=74fKXogkxDpZWqn8Ht_CcC!-e+Dxn&N$S$Di}6U+9~>ws6nN(e#6)h z%L*OGFRE=jIcAo^pWcPPQiswzGH{9@#XvicB7Swc2bOyp{DMMtynPd&uEdt`^XGmisEV z=_}=2foM+e>;7Ud{O^?)vPxvizw(b{vCW1+OX(3b!Q%WORg-(925;uam&^Kqe=swx zr|6p;AN0CXe1OVGu{K&?y+`2aPq(l0;B5F=+TLM#wj$9-FOdnkq9*5?tXtW8upgsm zLP&*w7EmtB@Bo+mrMQ{UJ~LHW?2~=1daBm!7f}ULIoq{{ zk#^V2+xy5_nPorhVnP8lB5&2F3v^MyAzPy88#9=?ZLxm$3ivFhhNDVtKN7^9RrYLY z4ion4@;)Z)4MD$xyog<TPNs$R_OCtX6sMvMYu5;@q4cTGZFExqx=kj*UG^4og6! zKM>Z_jr(apBy)*UMhMWs>BUR!S#Y$WKkZn4+d~I)d@QG+_t2GIwupjvB8> zdn#ucIc$70YiZzU(%$P?{8z`^tSK+kJui^^(jZf7sj;_513}eyjI#Mg+8|?~69BZn zy%SI`QjDoVs->m9Ep@*{_Tyw{9IE{%QT0Ak`L4*})0TPFvR5B>q)PL&vUHQEJ*9H% z(oY!w8ZSz0D2s&IYMQyheZSNzFR+>~Gu1k6Sy+K7Kj8R()bEh`GWq(LSy7$NOudL0EZVv@HN91AoI-}*@^#J|x6J<$K&T6hKh;9#mWN;96fMLqHZFi>2F~BNL z*L6QAA<5mZWk7PtxH%p;x?>NslQtt^93|YUCMNnzRoS))N+27BbKp_;_080|zw*|2 zBlqPz%i=)gcMA2pWUZL>A0u(iP$MySX!-VF-AimdWL=gRJtW|HFFvR`p2f`_=(~ga zQ1m0Y8N$ce@auhMC-XyIgNO5ZIUqeB^u;+TQxJ%>aXUt9Hmly9K~E32G|09Jo75jd z*)Dm%T&{+xI%>4pUUu?=l`k}PRL75F6N-MW&1-u`DXwa}>7%LlypcWZHT=D4mr>Ud55~9#F8noS9@%~zRj`J-m!0{)Fy#zsKt7R(uOK}BZ zjrnYKd@w{|>xg|d&QG&cjHM`v?0&&#W*4i4<3Vg@=}`@R?l+%_yp_$dQ*s9QuTRUr?Sm1&eFZ zdME35ULc*1=ihZ5Cbq6iF#C{qO|!QFd6RzU4}l7aRvl*l!0+KWbB2ZY!;B3nXFcT! 
z+5(St15OPe$_~%;skxN8J*HWleYv^mIQqcCfUrR3!hk(jjAM!)#b`ft8tOnTUkU)B zo&(^yX_%9XDNTpixjqK04K2?5SA82^Nj=^jRQ%aFt`K;d-Jx8^DyHz#V*M$Mx5Fw9 z^Q3s+)8Q-a6vnYPs|vEt(@cFqijuV{GQdf zPl*O8aqt!BI46>1?c$s8WF(oRuYN-u^c5RECn(1O{p=vqQ~gK6#Vyo`B$Yk>mh9_AUt0VA%i~6`7tzTZ<7v0^tfJ6Rcdxl9deekMbxk zl7Am2-ye4}gz;wysR2|GH}ZHDNFasIQtQHcpp9clxd5DHw(HT*K-P=JpJcCuB&x8% z&W_S_(&Kk2^hNpC$Rj|G=d8RQyM!n8g7Zo$(rPmST@vi%gLXcN>=C(B#o+zNZvD=m z!+jt8*!bgS1@cgVRry|pjXlzIfU-S{BmTsY={(}#+uki+r?cV{){x23B4Af z5YS5iO`rcpV_l?dUG#|lcrCjquDns|BUb3;RdAnvPP?d-E)LiOZF8h;0JlO)^VTNL zMG65}S*Ij}DoYG2Hv&UlL}ApyCl`+`)K|;ohj-$_hzyK6i@FjTShJ+i3i!N_u7+!A zF^T_F`KkQJbSHc>ikD`;&^TvoR71bWdXiS_!Zwp-aXlk;!D9PxE< zCVf};ct=jDuu;A50?#^qaK`<5Kz$Lc0dfL&)CPem=ilKu>G;Jkb@UJp{6V<1iR(Dg;BXrkNyV*^v8LrQdZTBWb={A(7TMFFY#TERQFPS z<%!ZoTc)q%Oq(4E<+{k~(j&5|%gcIf+qw+L@vv2^cnLjYILo9a z&x>VP3Dxkz7;wk*V*EIv&ClDIc(7@#|GeE*7}0H^!T!ouc-;UpBE&POF~@_x^K7Om z!P>op6a_HDx##&?r-8r{($jdz?);(WQ8cKLd&N`Ry zauN;HdN_2j?08w6YoY*6qf448gEsc*Kj&goJ$Ad&J-j8awh-lZ$Li=V?k6VVs3a*g zf+p?U2NZk^^o4IVavx6S4T}>ekIRerlwKR2yf0fDTB5Z z*QQOs^BFj{UcWkjv?MD%IF3Q2V@f6U$`fr*rOB01sFo6gV5L#Gl>5x{ADv91s5Vhu zZ4@n|s9A}`5XU>zSD#0m1N+JjO)Gl(3e}jyb9!VO$u6C_bx6@IAH_}GV4A|MvOHFy zNU6@-_u7)!3N`TXvs%vG!ISX0Siw$-i+hm4_q{w1{5j%02-6ES9|R3o4rFqTF+}XM zoHay|IuQhXkuv2mDPu33tW5g>{qhboElUWz^#t1j%CM)bJhp%a7t;&rVVQjbV~8FU zx#qdSJ`Kwuc8pGg?i|O5sE}UxM9n(1@_UXT2d63qbL3?flJ5!%C<}e2(&GkcqgGZ- zrpp$OxgYMFl_gtjbCmZ4pO(orQCw&jl0`=@ysD8`b`dza?v3|$e^uN}cl94aR{KF> z9K^F0qmNW|nK-O`1U~T}51{aI2k;&-{v^~SmU(K-&qP?8mln^^^w^JOmuTfe#R-*; z&!I=(AQ6@qF+>+@)+!*U9@vqTHxlJk{SA=ZY9x;<=X$d{tQ%BPr|U`9nEy!Brp&%t zUKCezg4&nim8G3NbE7{or_A~dq!P-ZQ;sn0whqg)+0F}mGWw^XqLmb)o~>nW(Wyd4 zs~Ix|QOh81?cjDjR*ha@^@x+*$0H|t>;qP%s=3@w{MQj zkd(#2xI?rhMj3YxFj7eMrkflbjrQ2DVQU4!kSOikk2qX|qH3+?o z*+xVI^H?24-PSppynzwoJvLvVn4Ej7^(QB<80w4cA8y~ERFG(NhSKS@=s3sNf5uVa z{tC>V#?Bb6JGPin z8jGXZ?{TLi!dq6p<>1TKSX$s5PD)n5qC({5IEZhy+kZqBsNYOA`6%^TDn6BL+|8h{ zV;w$c^FsN*DLh_t#BYvNO4Gw}meM2{4{l}`#1!wbK@BT_<;wPz)FQ3cWo_#p;3&d2 
zqpuIi-hRCpAE`wD!BZ;A2gGLBav~&g_2-d2pXKeg{|sirg|-unEp&Nh0fKZKcOu}-Brtht2_xk+D)PW;|kPy%xC0k z-WSAmP;Ldz#EWv_e^vK{ls@yVyiF`T#$D-vRd9-QSj)&! z%2zQ$GkZY1fyaFo+z)5%f%AD^_xEz`0|?YI9yM~&J!q(#FAeh}w6U1pvS1t>98+5& zBw~Q4`XIWnjhCT!jjR1AvgpvSDx2&|tcSkpVsu>uj_^1$8KSjPu|ZU%T}-CKUj}z< z$;4#>%2p{It|{hnC2rigq9BmCydap!j^$Ftx*MOu<@>|d@9^9mKLMO-?Q7Js0rDry z2DKtK1&}?aHv=(qb|OQP*M3ODS(hNCW{K*jyPqT|EEeHi)uIG@TFy`XYK10WLf>CS2cbjM>f2&c{@Jw7 z{{H*Qq3YxE*n^OH)JLnMK9+qbL>|>H<0a_p-}^TCdTbWI&3o|PuO(tnr4)#%nOFBZHf4eiZ}sasE`hQ#mmF?SM1uM7fkzY#fC%pU` z=7+x0qZvjoBia(r;ypJGVzYnPp1otYd6-ii=6GJs9DCG()cAU!bQd?x-N*YbOoCcN z=VA`CyGNw`@{fX*FX@p9&|Uo7U-Q&>#I2$kr4}df*wpf=4W>~Pa(j}pH`S(ZsEAke zIP%z1>wYi;JMmZRu0Tcm*sRTK;ktgdVo&YPyb)ZGiL`skXd2?)==#1q4E$JsLi4n9 zw^=`e_j^?5m$}jLZT4Kv=hbXr3pAsjVikgX9FhG}_gtr%!`zvZpcE5|XHkQe%` z!Szw956(a+yYIr|+WS8>XS zT;JRLHg417QrWjzHMN(L*N0uJAkX_1OzuOMMwa9bUdGvnjTQF2ctR<9iRCSUSW(v? zLDr(7OBskUGx^Oj>hM#9Xi$lN=oaE7K<9o;Xmtez9O{qwFp?U7IsZ}Sk5A)_!1nk&toIsq9O02-`@NjJNbOgu{wj7Gi}`?C-@27LfE~p(TahuR ze^-cT7ClF5sE&=ByS{~7PaUeRP>pOkfBGE)I$g?!*%3snJ;xpU zMdQ|$P}F)8cSKrv8gLf^E;8nEpwmiiZuq_wBmae@WYJZ(Q&cf>G$n6|TINYbj}1cx zpGKecIqx96Qn#cqSodwH_*{U?j4D4`s{2h|C3`+t_Yp^}_@`@}P)JN4!^K9|=jlId z^9`7oYuzbbS8to?(d*$-Bhi3S;8a549Y*C#O*2s%oh*`)aj=U^_}K`|;(9sM=;=mf zSJPOz>mYP?rxc*759Qg_bl$N!rNu23Y0d4dk~uvSfv(2)f{B`;fyBslfl6DCY#_BJ zQBT77Q)T2i%swG8`VsY?#CWS757VdrU{kQPr<(uS7_m-u(CO&T7keS52PjRqR>x13 zQ#8zELfzsFGU!y;er&XQ8Jy!Maj)1e_xQ~QUWtU2;bJ*&BV24$9%}IoB78%+;gF%>YKSiAXR93W*Aa)=;%zmAOj7}NxQQJNOQvF)?acbGO&hZ^DMl`XO^lB0bpC3ItC%xQ~Z&G4V$>&u8G zi#9HY@qX1yFIihk6rFvMTE{i+jThc-tpl?AZvbLqH!`rRZz=x(qfGbRyI@-scKfUg z6?Ov+j^KdctA@iSN+CK}{R)jqzlSYQ{SFO}j9GCDm^uG&1neH+DOq$j*x3VV)2Z~R zI;GelAiwIY0)9t)dh^1U=1VZX&~Vtg`;T1W`xI&NDgVH|?5$azdjByQiqXrlHuI%; zfJ2Ng#cwWb`$rCMzY`)pi(z$&Np)c?Qw3{Xa5@ldWynU)wo z(6~2{m>dHIW=`9T(!-HAvqK{SiR#mPg(7rrCf$1XHw7uUJX3zc?{Q;IV)PpBX2Z8> z;tJoqJGk*-xE{TqvcC0RetD6fe-~G}KND#0DE7wU1O4Vc;eBE09lJSGI2b)%bu@m} z(H-vvE1z$CBPg4;6HZnCSODRR?B)>CmUT2C=Z?5e(5F|wZ%!VR?K9`2g?1`_3aT-e z*U;3JptW`bG13L{puwM@x 
z;{82q2rfPgw(zEN_rJNgK*leasH5qLD8IQwnO2o=NAm;{qoYRMc=ATB$iO8tnr?4w z+&^HbKhC(w@x51^<9p*wzlXaaH>h!S&hZ7&RJQT@4*nNDbzvGLG2L65hhk z2FN=s-pJ|7%6~LX$!xQ?lNqUMOaJszX+Dm-juvY-2heo-_~7Bj*9;Dv8Knn}$h${* zN4-LHbeO|~%|fMkEvnLD%lJJHUc$a=^NK=YI9FlF(Lu*Az~$M-s-r`}&}+DAg+luN zWzjFM80N>UE!lVC0yyt?;wA(WS3u&^Z|AymsWCYU)`}#!{n#E!9v_|tymYP1GA-&7d~)H1(!)Ef zbRj`#5dS>l5VNYpx@k z1a>mmPsjkE`pcWfAN%zuxR{F7f}c|x;@S@}2n)GFhZ^wHFcQSySPypHvYcMA*j(E^o^p@MA zjJhs+BH+Z|b3v@FlD@5b9+KGa11+Ubkw5+M<5uSK-|$A@#$;aQpxPr-mY*d{3WY!F zH=lL>O-&hw4lW3?Tlx&`KveawgkE+3ov+t=U1>o4U--Ds4A`k(C2vh9+ktgFf6g&@Twu!c39XDf>z z{@eK=%lSVZfE#GZeWCJi$;<&FV-5U6;73SdEj;BQ@qf+Q@w;01cj}J4OpdrwJ^$Yg zd*S^8rW{+|td+k1GC)87AE;r_7G{{J-mlPcef|K0tbuW{jhS$L+Kp9-`3)!!59 zJHGAeng3_!hQDybyWQ}1H*9pn8{P0qHyrDR=eyyVZg`R#X1U>h)-Pn-_qgG9H*9x9 zS>I56;f8m+;q7kN=!Q4C;gxPU)(y{h!!zCRBsa`*!~KnJ`);^hg(81dMr6!cHzH)A zfe1p?wHufyKLOR`oyF0zBUzH%uGaT4PD6~fA3gQEH{{E|4*75<-q$mT5^F}PY|d+> zNa>s%M3@GSb`G!Nm*lib;rmmHzzyw-Z|}3F3KswV{QrmobGB+#o{avY@6{)jB<4Hw zBbk}cRXl8XI+<*}#5p)7d8|UEKf6Ef3%@v><3xHAee<bqk>3zy8cCZ?s#mMl zuH^4>zBgzAZK8IS=F@`2{5%`PClU)%>i;VI5wzme75o}$djXK^v^n57k38wgOzlQ( zAvq!9A!353l=Es|b9?tS7sh4e-bngfa_fmVYV%b;=MtCl-=W@YQkSR_H<40LzO*F0 z)wF+Aze&+2^d+@5=p6!%wC*MBOY0O4eOr`Xr|PBp_&=}dv?o1k{NHM^PSGPot%da6 z3$=T>Kp_3Aeomo>GAa(&IAc-=wAqTFaA`AbV{!&Uf z(iv+Rcd6@qE+rOX^c)Ggil;_3&eC>(ugKsea&IN4udMZIDCyb%1kSPQPgOme?qZ{% zodkm`z*fQ~Jj>|F)0fwl%E#hA;kE`~com(FO@WD&5l&mecsrv$im|BI9L?fTMdKfK zAFG7Zu2UvclR2EVKsyhJ!ucZO4hQw>$tfAqlxo?L525*7+7}%z8opQSe6gioc;ghE z94+Y8ZlP8Pj&vZ-r~i(2@Ip(`ZL^hDcI=D5NO7|U*Gb8gWH|ajWF=GL=PaBLL#p3? 
zrwu9t{XnG-zoo4KfnA~uqDmccvE=(>;)n3tcCMu&RyW0-PpLW(Ocrbyv7%p zSlXN2G=KrFt>hg-OL4bv%Mv?!OJI2QjjJu41H;-$>~y>HhIo$ zZ{zHy@Vt*vvhrfl-rpM1yzE%qH)Gh2FXI zXWlgXyr$W8p}7n3-n>`MZ48BH&buN!yK#xC4DR)e!H8Mj8<&J;H+g4De&3M$yqi$szvr8xOaJhkbI$QLg=W^>EPT>5N1B;6 zTL=b~$-BrKn%y{m?t+;-r@Gz;#0c#ibo4p+79z9f^?6FAmn;a+n+F%C$~tqQR9WXq z%1W#Ry&@jYJWbes1mB4>Cu)HO^~7%d2|V^==ler9{%1FA`p@3Hw}}`0%y}L~__AB( zlZ^T)IRCrhSAXijUFU{2;VHCN?Y4iri=V(<=i+dzHa@wx7vEHV;W5%ud8u%o3v(&q z-IRaqK4gygba^H&X*auYrtrL&?>6!Tb{_E*oV{+j%r;YTsSml;2g=V~fKpS#P?kzP zbji$d3+w*-&h3}{dGm~T$@e-g`E1G0YlrxET#|j{w);*#@1D`;Uvudcd3I{_xZ&wl z&U3LF=DJ~_8#=lCHm74MvfHiSH&vc;_@2u5HhK8cPpbV?yVI&v(W=!>xN?INe*2IL zo1>$AlHYYwoeHBb|5$~ypLaQ;vr){2i)BLvIj?>0`f5zKW-yBzU*JF!^JzC_i z{b<6y;JYl}_<3DZx+2{-Y`=J-lIuNKF^WUHNKi7YK{(V2b z>xJC)*Nzys@R3<112-R2HWdK3{Zkg?JoebyAB_L>@Fn(u%PVTmA69nQxaIb1MnB!O z{PoL=!X64|IZbL3PkmBa+p&8izMurW5A?OaW&NJ3zG!EwGU4~n@BchwU0?Wr=m)>m zwdW~(AL{q~tA5W9^n2F&t)DgY%U|5@`49cdH$UAMzlMJOZSMEHvfuN){ht5Ouf67e z&prK~%g*W7{#kvWTl$s1blkX$yd_u8UUcC|@7VBM+=LsKcxxt2n{>XIyKkfTOTFb6 zjw&DJ%KE>{|C>Jl=7)dF%fI!5V`2WS5C5$X|4-^ewOPnd^Y>l5es^E{p2|=8KdG?x z#J=TH=>-MHPEW0)wEnC0loX(UhyODQ@WzKn-ul?hi8jIjFC%9YXMD6@^35au2g3J; zXj&oh9>V-WO)Hl2gy-?~6E7pYknb$wRfIvlbBNaxUcq-B@fn2md>e>25PrlrM7)Ra zXc1TH5ZAayVKv`-h_?_HoT6zzBQ9YB6RC%YOL+eon)U?o2M8bL`#0iUgeRS;X|ECY z65h@CZ7D^Ff=IERhuT!|Al@%0jKCVYc$ zG4b7mW6!4F#A^w6vmEfh*n9JMsJl0Q_%mbQB3X)TZBiIXS;B~At)Be(%@s^}L=xp6B&?zMt29?)P=BbA8To zo$H)A=X|acS(p!y_Xs%W0yIBBVGxV}4c5WX^ifa_pdn!hx(b3Vpq&AdK=6k7BH*qC z2sl7T0DJ%r8gBtQ0pJS|A^`msU^fVl0NoEz3# zhOyHG=mD0;KoA;)T0koRYz3ha(ER{uOo4s@%>ppf4DbM54lo;B9wNXr!0%u)&oTtk z0Pe$ryaBBMP$&S{2B5_NdV?SaXdJ*m90W-LIs)Jt2xkBd1%fdJf;yn_05gJs%>eo> zK(An6Gl0ecqzeV@2Q&*n>`gF6U_QWp5MluhWAAOy?=TH;_72Dc(0G9F?}EMobUDCz z5NZJp-2=J>;R~P@00x850q6*TEg%d7x*y;;2zWpP{|smyghfCD-wa3;1OlLeUj`Ho z0yGTj6A3|cAbc4B@dx0{0|XYp|KIcfeGMQ$6%mvOh*FV^!2S^Q)DXQmwHTEtr89*V zvYd{b-X9Lqfc)YvFiyan4bN#X{+SN{+g^g?bQrhFfXn$6;QSx_oj+nHHMA4KgrpOr zHKlPzXl>P}0?1QSK-57N0MX^(i#gy53hZa*gLnB>3 
z6913(!0m$bQ6%T{s}6tiIOrgbThtJXHx%+ln~7W3NfT9!WqF% zugp%t62vIRU`p>yS5DJN)lY#(Y}Y>tlna*yZq-wXBg7E2R1mEqr5J@N60{KTVV#OZ(FS5VgPdi#|I>pbUg?fq3AoE`+q`duF|Z;)KR`EPap zvp(=UNo2jS)1LskF#iu-u+tv|e8vCZGtxkeNwi`#rqs?*fc}5szm@%`9(T|}JH+Y5=uByyX|$-zDI1afTebu1NbwIHQBXkJ z8Y0(0k*pWEo^abQk@W}DuK$JpWjDX{^=Dl``v0E)|7c)00=Bg6hscD0w19o{Bmd^H z>_9*UR0v4yH=Y#W*#e%zZ#-MT^9DSF-*^dt7Xf&<-*}$@?>*o}{KlICyjs9Z_>Fgf z83A?b3UjJxT! z%Ml~vSu&cF@j4kFkuj5uU&z=(#&I$(laZEwJKx=8{G}7IKjJcf#8t_0gFpDT9G$ErS z8Sj(v2^mw#_?e8~$=FH8VKV-z2c8^9GH%z0m5lsklp&)Y8QsbFfQ-+{m`cVRGJYWA zCo=vj?;APZLB?S+&XAEnMkE;P@Uw%ATx2{z#$Wk~{t=fV$1jo5_K)=cjhk)F{cqfC z+t>fb&9*)KpT>-U&f5kP>jevN~txXlZW+jX_GT zR>0roF9CgUszBzE-v%5(#2G7l;3%U6JidUpmc2H}3HDb~Mx60*wg(*dU-@Yul!3pM z>z{&9HR7z3EAX5Bl9Pi2sEE3qBVb7(VUCBZD{$E&mkk-iJY`2u zJ69)1*fUKJIGvf9+XHFStrTxNkcXw!FLCG^%sp#oZ*|(;-PO+A1Gr3qvTng#bu%}2 zW!Rw&g5uz~hLfd-y%j8b!OQ`!5eepM19v2M4`<*B;sjU}TO3QWvY=>+^T0g>c(4I~ zr$3umL~-7YTn#7Jvp}c7?MrLh{|s8BP`5Jk{Et_l!;~7pcg;=_cy{SH>DyUaDcPF2 zLLrn}?q6^fV3h$y|J6GX3zaskHAgomdn+Y(SNrXnxk9I4{(sez9-Q)D^Lh-Y=sMbN zS%Rgqw}q86To}*@=t1Z;mA1W=l`~XGrSr=v>Mt#SrP8r?1FF%3{j;DADzeuaNQa%N zz@4#QoUMj{Jw3I)nVmbZ0BzuMWpA}@ipt)=L)pRH)!oU<0_<)v7-tN|%WAu;A$|x0 zJkzdWtU>pH^DYSE;rLse6T-Mzxnmryyf6-+_o3YoFiMaPd&gM2T7fsXAlMNQ=Inw1 znAuxdI5~QPnt*b^s~&EkP1`wBwTl%we1P*p~mKbL3X420a)f%SHb0>s?h zE$y6u0)T)c#?#u_)y~n~8lw5n6fbhpcAx$8Epi&hZM*mWnWvlM-^n-#<7MsSY~=_< z?*JN1V4zkHA!`eG5`>^c$lBS%9T*_=0=678OQ;Y6_Tg#nVGZQ%+}xbaEUchNKzUhg zDIH3I+i!2>2)-1+l%s`%GnBDK!OTJkwhA+MxMn3=w6hOPm%;SbTU9XS2$Ose@PM?0 zQ)wVDnBbX6Z`=109MNDKk#ql|9e4A`O*bt_2c5A zmiE9(z?nR#isuo4k?2wDNOKPGZaM2swW_&?=-v*Lfs$^uz9{E;s_ zNx1pAxm!8>Qyd)SgFWbP;g9!ht<0SN@rLDp=Dj_6{6h+y$N#4sJe&MO>MoG^Z{d$R z03-JVBmN(+LJ8pA|0m&(GRSUi|12X9l<|Kz{E_e0u=_pNT9D`eDf~wsU;?oDJriI% z-5{SoAb{VV z?Rv!iM?JRr@EFazwpQZg-`ewJ}s>Le>~ge?*11({P(H; z`E1Ke{R{uxf8oRZ!SY}F74AQSf00-C7kS%%k$3(Vd2b*edmI7jz@1Ni@RA59M($S( z*bwjlKg{uW{B0NlN`;M+{1`j|ao9M1JukrT8f-k|r}i~`X8kJ$zFlS^pf$1I;=pJQ zEB+lf*oTBt5B?p8&Ve{Q2*^*W0TQ|e8zK1-Ge<(;v*BM)tThtK{3{0h7i_@$zv%^k 
z6$uIcRv#*GdsrJf4eqxpZ@s1E<8BLX^r)MGB|5n0v~{CM-_Fs}$;*wOyyOHop6Ka4 z;JvvFJ-7mO#hxrH0`?|u#{5^Dx3c&ZbF*-@b9U#yvW?qZV|x0(3IH!{T_kgpp%)S7 zhaVYo>09@D?7@o1&7GfIEpburUzd*PMUMZUq>ukDZ6Ip+XWry}9JtX#&K@i|;rmSV z^zdylu&eR30+)WQ&EQL8;GUS7BflBgyMb+i1-xVD2bX{BfCPETB+L(v5cs|9z@0KT zZ~_8uR+%~a(3^RJTV3!egeX6_(Zz4Zzfa`2_&zu{_|np^w!?8JM|&UsZB_Cg7Z-sq zMUiuM6Q&2(y1-Q~@bXECt(Tn~K=L0_{D<9b!PPdPFt9!2ciP&Z!*|^*;k#|{xr?Qc zC}Vj_Ank=8< zu)j6yZe|0&ucWD=b^3xK3eM->-?w&ka@Z>W@Atv(&lxBZDASVuuNwR!{-r#65ix## z5fOVkU^v?kC=!lr>HJrZ@Ne6x8<2J;ON%&zrMVkiL0Eb#_bm=R{dsHn3f{I-!Oc3* z`!=BO9k=@2jGt`dK-qs)99&j|^{{RHAOQ>}i)+Br;2U~o^lr9R=HT8Vzdbn366Mzf zqOM+cZdSruHUqdBE2(QDBn=k7rR^cL)3Z+o>Xd4PsD1co()c-~mk! z><%0pIDrj4xE{ED6tQ*C1ltm5p|uk@I|1@G{I*v1&fr#@_tpi=t)AQ-RofRDZ);sp zVWQ`Pe>R!UpF4Bu*Zz|ho-2)9Q>_8eIiE8GTihL$a6r^JrCLG|KXm@gX&vup({VnJ zjP(7-%5<|#boD)o52j}@)$6H6hjemBBYtr3@3C>Vr;Kg2_xy6cI#B(Pn1yTgvptn! zlshUTLM-`r#Yyjv)$jJ9SaS}jwSCW+x!kL_P?O%5zCKgD zQmWso+tWJHH}*_^R<|VIej|E)ePd>J)N`hrF4T-eaCH@n2sImJ)`J*xZ<$e*i-qm} ze@w(M_gjQbcxScS zvPPkIrGsx?O<%Eht=JuL$04?_KV{_+dJzd2UTZ&AEn& zd!OdOeeRR}OHZTwi#64kyD3aTlPNxNJ9h1&ll4I!ASNd&6}&~7#v+t-kkHy8B2)t^ ze6f-qXGYEF@@2C}@Q_7ipvhL8_9CIrxr_J1z&kmAUF2R0(^!rylK?K#)RPmPL(F?q z2m7$TS#!K7pNq=e(*bp1N4yCNy1vs)Pir0_(8mrrSB@U*UhZ#tgn7UNRNlNFXaQP8 zlJJ37C-W_xWuJB1 zG2GMpe!bp)4cZ$*rlzq~<)4!2TCSdWR6o-3PU~>_f-EHYZGtofgpuVQ#%%N<=jgnj z-*hdMom|fWZKc9FbVW9XJukP14S)gqL6I>17s+oWb_Fg5iL-!BWMDT`DQqr9-AHpb zRAqFXefObVdnv{#nvXXxY|t}4q`Qw;+_Ov_dpgttk~_=c{e#;yq@0lzh#a${sqrex@Uj zAemc=dYeA20MpKQKDr>1cS%u${A00I5WSptL8NU1on(#+sy#wwP{x;)$A z)FI+t3MpmN0h^riV^FB!Cuhcfwkpg(!^%;h{Y;QFAOJn2$^;gr%vb|mxadt=ZW(9+ zG29v$bRKPxr8LcXYdZ;@EjuYQ=}S232XZ!a8d-n+d}oQ|;B4ITV>5MUo^oOSkI$w# zXs;1Q8s?=Lo38;C@Zp2Y09r2@SN?H!*9;6>C%+bCol`2)igD4rW7qDn5(M?cSS~i4D-LM zcSqr%j^a~Er~5ozCBBl_<=psz()F3jAx&9%O{dhGayZ0U4nmcUFCNnFS+A(OGhG%Iq zw(sTrpw=P=2p@FF`WaMNB}>1$7cxy1zvt&HbL1WS{n;BwBC-}o;(C|g6%0JNmif^} zeqm4dMrHB(Jman@1fx`n`B~FSX>NRExF02`>Cd{2e7(F!jfVQ!%$yQsIG=Xs+BcD~^fGX43z`R(;j 
z6?d9dvey>PgfZVs8t>k{+uwG@sVZ9J3jRj}*7L$9pBp3YvGzNYQg*&gLQQcg>E~F= z+gSsqa5*nzgKPk+FDABHeLtIH--@YWh%M$*4t^58jLMUPK99&I{3ar~{zlAZUDqGN9`KYp}KrDqn=kHvZMIu?#ZONTMN zrFp9$#U6^(FI`jZzCM}3W9(+192h_2gJ72PWeb;+yuJTn>$zHG^$KPv|FIq0%}|KH z^Fd>M1z%6I2NbMYz9Or0ha)XXu!*sMU0%4pqAoOC?&hJnVYeD*PN+WT?Ysnc)_Z3j zPkZ^Jx#9-7)=cu7s|`b=Po1itrz~3*S)X)fG!1;)coP|8U;1+YK;J51fe@pzV=%Hr z<>HY}Td9Fqp1!hnjqbgUvr zm{7{xx8l+t9PUIMckvri2_O1?FCBj+OP-nYd5Q2+K{(L8=H{ZNOT9?JVu>eC4oaTA zE5co=f?jJ}fgwc)2sHBez!v(_9vZ>9Qvq&#rGytY78UW<8FW#23J0&g*X@YlAm#%-0j#9j z!GdqP@@0F1+xl`UHrQm_Pur#5JzEmWhnBm&LR>BD|3>rtLW!fjJ!AY@OUk8&r%Xo% zP;AytWrm2wz|%a`v24u;pQ1h<#jL6hE%o^lq`T+yZ@Lg(PFXw^NNlGoqZLA;oF-=l z&ybW*3V7KwryEB0Gvmx5@#mA}Il|tVH-e1Myx_gu3W^+RKH)bwiT=pfhjp6p%`(iT z=KHXE3$+h??Rj4zK$R~=Vl-ww2lM@8Q%@1h_(`lXWqQ!*IR*>$6q53 zaC&|8_^R{_%mSv9BUkQ?ec|ZZ!EVu6Y#G9?z-M?^DUfN*zFe8pkks?9~KH(S74Qp~-W}Km5{z=t~H;%PUE9 zWn(hOR{A`+BJs_>{#NZn-mBHE-aprlLwUxokzS<>afG&lGArUnoT8#r*Ye;cNIhWB zDTglx6TV}WW>@Qvu=P`YwZ#agwj1Tqqm0Y;PU=4oz1mZiSSRBthowCgz^)njIv6O^ zSr&a{IGaAiGq3Dm_bb0$tw!>zeyrFA-O)9xGl?k;`;zb!xY^>jZ?rl#$Cu-Wx;B4u z4JUjPhJV07ce>2_|NL0_CW59>ZGgrAA$(o5+dFZ&W9G&~sgt`%H}(fM(`^#&T=W~( z;53bjmBd`CUW@m02f)#ouaAsF+G&lia+lcLo>P?vfAQ{2}58^T7G z!T(5)ByI$c&?IQAM}=0b-0j95ommd+GJn07gF8|&$DXjduat13rcvTn!>bsR&SoiQ zcH|P59u)R$Zq8*oJHWq`Yx-$7=}J}!e{*g$N@$2V>j&Gb_1bB{K+gD9TG#1X>~V!x zsfO1z&MnnNV2fe zuk=IE;%^`h=@)8?4>8YAncNEsNIn)IsBMpu_1z;X=j8B`GVOl&V5647s2WF~ht_Aw zu9*N)zWDC_3QgFZQ6n^iK^IoB(zBXU_qgu7TAodsLqn8-%-r&&oBg@aqV%f9)00AF zv!N!*q%)xZT)*u`EBQ|K&!uHo_pQhoVQ22#k1zDs_T8Km&Chhxd5nnrG~` zg&PCKv2Gg*hZkL6@|`Q%m|ho*=b;=ZF*fiXf2~w)hN~-{cw`d3L8k|Cg@#q;uE}ED z*Pg$S&Zp^rkG*=Y%!>W(=-qEMg*KvgD(!LCd7_foxo4Y)d57oEx0jvhWsc`7+As{l zLJd(qGk`lAX|6 zVdJk|iiyGr6BS}3I|3|OZA6Y9%|)oih@Oy7-*r81Q?P@=;N=(Vpa%v^UbZ<4wTlcj zDf8T&;@Fqym4VsNT-JzlzPt|WlCsAD--S0(6Fey&;^;}fJQL0K!<>>s>$@wge) z4ltX6>2$cRH)V8`6$<6%iKb$@Q64T7iLF$E ztW6^X*QSWAr~TH7%Op&O-pi8~+W9{1*W|esZ2Sh_?6DXb(xWp>VY!Pm;qJFD`ty9c zZAu{LvKU(#w|%3S(Jjmy{3T>QjsPnSl} 
zzS7NFtlv;?3M&HrWJR+|fo(V2ZDk@=J4RWr9!Uk)9v%SRadQD2X@e19QUX| zjjjDWH^gNyVp1r*R0&_*of8Sq>F$&v?|8eAeJLD{!r zeoxkFfb)u9%8#4%JzOxF|MX4SX9gAldy zRI+s=oXtK-`I3fMev|%(9C(Vsdva&Y^$hnD4-k^X8#Q(wLY0d~-<_`QpfKgwYiJHB zjWqV?7H&YS$Dq#drET*R*(M}&|FZ6Wr86_b>DR^u=*uQ09$8>7?yMaOnCn!{j-Wks z7M-}n4KlcJKFFG-aPvq3Mz?%FeJQKm&hRs8S&Y2U^ccGv!m=K9l=dRNY7SjUa1Xad zSH2v>(qBcFK%$hF`@tT^6l}G;=fW^|^RKm=ug;Gcl=?|WiyF(M|4da=QGHK}ntajV z7;&R#qKnhgi1tw}C%rQdjh;L+nl`UXg10YVFS^J7aO7I^(tOO(jnu}Ql$g7uhha_( z4y) zM^zi@cG9%#okK^C6~jr#?v5B6VQ$fZQ2n2$s2jjfk1YyYw63*ixPx>ta0d$~_98y; z-M$2e%vP_2OTy*sGrAsJ(1$Z86)cO|eki`S13RHJcdnRZ4U9LvAk=AMgTP5!md$4z zdpXj~(wkD>*n{0RR>ay2mJ0&B88*>nu6_VXN4S9aVI4&$+RJ)T>iTA*$K9 ztCOeN?JVb^(V*EoA02>%;FtRenZ^8oo3dF z`xwfmNcBOQ5v=?JPPF+OltMeT^8&Qqk5`1*o^Bt_G%o()|Bdl2a<^$OFr<@BOoZ{SB%2S+s1Ik&lVEHC@fThsbi z3SNwSfM@!qOP(u;2dB}BZ-yTVuH5=rCM=;-pTt*sqB)=L&B1{-`J>%C9H{7REH72` z96{(o6xD6@1%pa_Yg&iW!Q@T)nYcmt(Ofe({PDHz+y#Z6X5L#3li_{B?jbgpyM~vN z6uhPM81yIN(oXgl??|JKnSOp|XX)hGl`vas{Koazp_R#0ANfY9^G>a$ZW7g5A357* zPnm?JY~n2Q<7KP#+5C>}Q0Vi?2R!G-aarx?`Q{dd5PDUP7*C zz#w4vlwle_l0UUzKl*KHP7ZFrFqw#=6{UZl}vx{UCuL+?ia+ir|L$cE+0$R zm_^YS+i%<`zd06~jWZJEqeCmKlD<`M(lO)SoS+NH+^bJHZ`0$oFMqwVY`#9Q$!m9C ziuOSR?O<|J$Y!?Gc!E!F_m4L)v&Z_TPY50_jpCw;N z69>JRstokmu;83vMb=-lOwWdoN1|BMq)?v&3)!|j_$uYm7QZYst`b;&w&YHb1A_^! 
zRW}dw6LG6HDB}{Np)8e?j1F>1CD2NeyCm?!BS;^)11ej|=~H7TDLp*lH+-e+Qy^;s#k-+}35dVC-PR`D%RN?!4 z7!^7=b~dIDLq!+x!sMP1#XMLNwtJ*$?xCnvOsTw~cAbv0vk%yIv$rCH z9<=Z!|7m8`tkL|bYBx{5XP@}|eqv>pyDjtd`0`thM2GIvOc;27?5JkNb7cbJjNF>L zM39mBz=a&*{660KdE+K7Y^VU_pv#M692v7x<}W6iPiO{z4~WZKHd!O+H*2y|cuH5v zP^8U%$0v7PxkOpBxflQCnz-df2D*Mu zkRf;n=gqmfqi9Fjv5zG+b3X&H$cIaiULu6DJ*%i0h~`>Ir@UzNQHAR3#Ea+tdCdIB zMg<3G6w~`?g{SxN-e&g`SD%R#Dfv7Pe#rf@rZXSXMAVO1;P|N_82D}_dt7V-N{1hUcHx+|}`Ip%inWMto@ZE1-ye^%eDqRks;e6ipZI#oWyUJ+YYbj~CK!@v&Ce@)BS=yph zigSi%e!wV3}s*F&D^NX%zUB*&WfL4zeC6GGTH{Ta&MP;rIqju{kf!rj z%I_@Xhc~rSzoltZIUD+LQ>kx0x?p!!;q-I4i;dy6m*QeL=ECESzFfBr+#x-o+1BU9 z^XN_6vDWwKm~drc9fg*FC!Rg(bnm5tr3aJqBR>lzdi6!d6NBzxi%!QNzp&R&2;k& zMv~SW#rZ^6`Up0~=AX{bm=C-lx?b1puI5jltqzHgC2|f~=w^d|px3RIUEjz|mh^SK zi>rG@_R$?row;^~0UDMC;YT$gY4SeYFZP@{jXp9L6$J^zte%`W(NU%KQKtug+4v)K z1RE)@pYJqByec*~;&lg&Lx|R+$%8rS@ohekYGO=d{so4sxCO1XB~~!8L|n@(toA0B z_Lw~tyhv!KJSozyxuPvnmEpFo@u~#cH_%?Val|aSPJVvha|Pdv%+lk6Dx~v-%D%1( zD|NgO-KkeZ8J7*+DtVqnZlrC`hQz>1B|5H^Msc7arqWG@kvaz%yZ_Khjom4P-r{w! 
zLzAZ!3C+R*0?yhZWm3JIk#3u}6Zj(BHx3TOh?LH*-}Wm#yf{zE=Q7o2PhC5fk=jpl z3)K;sZ-TL!TmLa4sNHVzW12?%?oyRPJT)t}Gs_d66=L^`mJwZt=4e)UXmAD@0Z;Jj zr8kq~Oza0orc*mq^aqd5XlIsPxR~s zYg-#utnRD$w9@$Hz?ZEwEzx`m;yr@Y#9&3frnRsLe0?-Vf0W@|y@TClKP$6$r_0_v z(-S%*hf^4M?VbNlZ0tBuG_IcSFdMah42t4oV9R;MeV|`xwby4Xjx+6j>hj`4ncWoX z8(zDRG?*sm>K`y@q8O3IBGEZgnzj()j>!v+ub2<35D8!Ud8+flI_f%Nu{d{m_!?TD zhh|(7W#V?0$gpX8xc|V1r2bt;sCM$G?IeDv7^;s!NVdzTtl7Scp*f0^HZyMyo+IpP z&_HTkv48wV7b}q)PN}Lpny$Je)Pt>Cdn#Nu-utjPD~B{ps<-g(JH9fI^6I@iviJO_ zGR|j(Iys~F#wr~e$JOydjVVf=OH~e96!sr;on#}|dYc@M$Je=kt>VPk%aZy_hj?DC z<|MD52}wG-@KyYy%7vBJ5pElKt|))%2M_(vbqGyhPL47Vqq);2%DxXTBG#K=(R-lOEe0j$RB@a*~gayj^`X zE-B!(RL8p1eqyQd)w;VXzsMk;oc4aAood9R8)HSsl)*Of_T^3CTBHazI{dZwq}9+I z>X|XLb6Lh z?w&JXO68q;suqTt&pi6AW-aQw#M0|xCOu4s6yiWe2_eq=zKD=}6sK06l2yNMV6}i{ zAiF}mVS!`Z93fZbfkkzm5t^l8MKb7_c#nz>x8vx^UZ^wU!wE;UGI68`+xo~PLGD}1 z07NBJC5~a&&}cjTd<-4)=1L*uZ1LrZF&3S`wVpe<2V*c+tLMk92HL4M&eRTuea3}S zd_*kdi%^#e)GCg>$!!ql?emp*4arQ8Q&^@izVLTYLmdz)`#ha;D}a#6fyDIXhraQZ zcSnD9vtVMq@HP&^ z(H@_?{JGZH&-PHfw|dCuc~1LNJ1N<&fn8+T_07o#nPs`mv82ScAmKpyorOu^5jE@g zyLeZ8Tk$BAKMk~GbFYlXl&aB7{Wui<^;1ZB%NK3p^Xkn5-9`6HTPiW`>l#tmG+eys zAgYMuT8N5Z!?xQcm~x(?Cmf9L`nKM%B-ZaFkC(Ck@m_{xrA2jtn2%SkeqD?!dMTNT zxULHXm-7c?7Pkh>fXA~t!`c!L4 z>*wSAX-7K!758h#RS6Cop>by>9r^PH)7&wa6T=<5Q4E_#g``P6(U0J}#wnBQnS+}n zk!4PWq&>+qpDla~8&$eFkFK9dblg#Zepn@Vke?yI)P|li2RXh!T|eb8?%};J^z|GVRLt+!Vfi|CugqzEd0GQ} zYKeTM7Q+>;+17Gx6*0$%)6P<%$LUS57q8vgAg1Q&fAZq7YSgqJ-6P+8M-@xwq8Lsv zj4(3b0?l=``u1%ugFFPE2E)^hDpJ>+ShUx z(pigfL!pZHYI-J&(38tqG`x*ccEmWy2C=641W6krVmO2f;LygJ^zPUkd&eSDc9>vp z5uV2t5Y@ly03mQz#qN~$rhE=jp*Vz72%&wP*|;C&_q=yww#wPG=kr1Dy3pe;tDm?& zjG8MA=xH(aU}w$1;qmd?H-o{qO`VqgV{=t5*hJR*3nR2;;}hv`ia+q@J&J z86v*yud(6!Y*aSudn}##0D~Qv3X1ivP6+5NSIHm!?AN@u`ZQs&x9#@v?B%=7Z?Q|4 zg!x2X$m*z`{I(ae@2a*_de{58UTPipBH+-YHS1?ZM`_lBxU?R~1_synsbBV`xt$NmRPa zKUDKY+$F@!Yv}Nx(}>!10k14yOb{m6{mz7;C=tzv2(=de#Ri|4x+DiPV&cP^?)*4q z5J9PHy0(cpjl;3<2e>`VuKsvV z>uVc2Aaj-~*dgBrOW(q4>mQkzq&YVKlr8Y{=ZF0ia}x1)%9W(PwJUr*cL@x66)q;X 
zmSRgzt*4LA`gk8mraXraQ&KYPavFa08ZABHM88&Lv;5d*rOu6D=ezdcqThT^%s|rI z@ki|k@%IsdW>sCI=lQ)Zzw>={dPUqJkbuplBd9=|KII!Hv;xlh&kD!+RAoG681hQH z11w9Yym>FR?CTGbZV8%u^=Ogtowrk&Kwn)-e{B3VC@MS{vq<@=KYo0s<`On2Ve-2l z6dY|L_HGgR;Q0EdkW(n5Q=#D{lkB4jk8_z*`?%>J3bG{*f1a)q_gLT9ciCIXAfhrI z5#svUeDC<1`bB-on4IjzKj1i_bd@S-^iXS*a+t3E_3U-??m=Phg$T_n`{%F$iISW4-8qQ?BqRmhp| z@slUwyqNq0Y|EWrB!tshH0Bc>UfEUwyXnfxBl#q=`{^)vArom7c>IukH6hn(@VC9} z^dw|R%;mRHkeIjhk<%BHba;!_$5}$2zl)u3lX7nX z@o{4EzS>ObVPdaDgeEG=8rakhK6!iDQumj-l4yxXsYJI45ARh1{pkaXFL$$+5HEwz zC)lbr(596)l|j!i!SSbDdaTsqmyC85MS1QTd)1AaWHnOg z{vOtI-RQNL+TDqoZvSF`qjmMyse)@J(^5h~;$r;px{oP))qiB`UNh3u8 z=lx+Q&P}Z>v`cdQtWRWdmg~CRWfZO@fBkR~>i#PC5?4@pO@M!mRUHnaMRh-52Yz`X zK!@WfbE=4c+cjm&nD{q*INJyJqj!KGmgxA>c8$TE8D;h($CX!4)GG2x4C>z26Cify z;RfH6l$7U!Ss_kUkoHubL9`a)NMp2UFq%p-EWb#Ux6&FX~+ob0gSbeU-(O z_$ax~oPzdlc3HY(diCkv(HY~!j><6ezau24(#jazy}@#AeZ(|AQzg*!aHIEi%u8Wsk>FfT6UC80AWHJg23 z$&ybWq8xL4Ccb9#6|YY{i0(V^A@2Qpj{-95&U{MYx5CqwRp{Jht8*uMQFnts#f zyXLxKw<@a37qGK1p3lEMwLPLh_`~YRwTEE!ffBww&GAsb@wF>Hxi!Aza;QRlJnwk( z*Zzk!D+(V{Rq{x7UuDc2^1h0erNs)`9!5UP*xaGEhadmm>xIKUPcm8^8vc!D%2 z0J?6}db0zEpRbchWL5ser{y*O%D-nH?`%i1LG9vjdxg*JA+u@yU<`-1&lkQ^1@+zX zV8OB)d!$j2{@t@K&Km)pHrE@eS8Xx*)d}Ov6u7iUvXYNFS=daO6J}ou;qud?!On>P zE?w>G(|e3RVcwC_^4D?R6Hb$#9-p7Y1h6?@7D?6!EXJN81X^Cz>~470t7hnWgjx={u@TsTo)?z?Ij1G-xB^+29|5$kB^V%Bf zaBfqD=FTg)%JPTC&Rm0v+$Z|hKx2<*Zg?k`n{wzx`DYLD6cLZK zn{=mH&(>X=ZL+yDs$IaFL%94=(!gK3!0E}C{WNQjR@`vk4{f-Bqjxv>6U8TQ;z66x z&DeAJdyWCu6vJa@z!sOTX}(@Ve%^TJGp)&i>d`b5r=?87u3HrN^$4kEh6L1BVCp}On_*{Y)?etX~ znqxKHKR+0!I2V4`G0G&n9tGJOWQ?1%XsL&{YK-3Q)ex96oFr8FJBXBBz3u{L=U_+b zcwhvV18VLuU=zQah0Cw{ew?VUnANx$BHL8xxbF4@KNj}5W|LgdSNi^Ex-J9SOEWYq zn8Fy2dH!}xlc?&u#Wa7{&Gt81hoM>NA>V?&qsl^-r#ZY`&|I^c`%8*6104$;TKDmo zCuru8loO8$7H-(>B;|FNzIRF+r}z@YoEL(2#@#U=F=qnj0l1rd*bZ;M)d%9$`f-_T zgS0N5@dt=9!+O2*@8j`u%tv(>h#MBukZB6C$4&%2ogU1mrL(AvvimkY*(g$-m_D^D z#MmfQ&1aSoxFR7(@5m3$IzL!fBi&o&+Ul4l3l*2e)IwpxhZ8A+J>J=YHd zlHu5`x$EAPIUW0zwG@_fZ6aaC=%<&l5AjFq`scw>066Sv@gTZ7q8*Oi7uG9#dZ+Z0 
z+q329H}c2)=(G%yOEanNZNeo~Hw2K}hUWAg9$7omRiMriG@>zImcr`2ehxyO*)K7# z+I>%NBpKER2Hkm{Od9yaSwOrxXVSJ(gvQNpc>i#$D zqEXWNtft)93C_4ZGqfM&RaYKg_Su<)*G>%owzmHKCO`g#V`YAkkJa1iDT^DBe&z@uxEvv;_z2tD-5k@YCiW(Q8S^WYQEn}KDLeYd~G z9;eVn-U#`&2Jzb}I~KtayLozMUUWCk=MD3Lg;j|*&bmuYHe9pycT02w)G5Hgux!sX z23x69HqzDl;Rmm-SIwIk-$V1H%$^zG%m`p1Wb0nHtD6Z1H~vWK^Tj5VYvQj~WSmz$ z{MUwEO|&a_9IHX`>dO~Y{_r(V2*2cSc;`Bf;N5qC_{zf!daGmr9I;P%&8W+biNxG5 zNMC;`REjHV(dS%z7=p1!oee}^qVC)cHO3Y5l{`z+jp3jAyl7I*>z~v=sW;s8wU;6I zyK$l6P0^~9YaKgz^zSZsT*UI1q~3gY=%e6;0RIW09N6(LR(l7p%qHG3qOI6)2UBX>*~@_G$<>)4h>N;cxt&h%+Hnefwa3-l?fgvp+H; z0Id6t`|C^IR$(J`YjsbzlS&z9O3%6M5%V;D=XMaZ<5(YYR8qTPbwxiu{-;T@&xW|= zPoW7EJvspo;k+67kLEHhB7Ov89BVSthXn7?hoPv6c@1Ejj+)WFio)DcSylEzf1RFv z^vY>+?Gmv6FC{BIPdg7y<#1+Ir;Cb7xM;^c1aE|SNR(&$Ey;?5u=oy%0y)+g@Z%|RYrxI*@6M#NdG)tjX?pO+!b77QTh`P zckLR!Q-Vs$e9EiM<7dZjXBSxAAx^$Qm(!c}5+?M^o?QPHVuPv_?DIENr;EVTT+-5F zyV@C3@8?m$?7oLWYgd{;M9%P^Yje$0az3o#vTgd8{go1!Z_zB}=q*~^o~P(B z+?+pa+Dao+d=_yO=Q9|o!h7C<4H-O#(=_hWLmnBR=@cz%POXw}mLFO7u*{@@z(H-T zK@GFNa{Q5`tn{InE)RsmM?KNf$0yq~?V6Pxv1SLPmm4lOrEvFo9Rc=TPOv#1cQuCu zW__QLdd!-5a!RxDv|=c^oV(9%2|8$e(+-5AqW!{%1doAQEH`DY0> zsA4n`Q|d#5T|dEjW^$!Y;>vMalKP|m9hS8fXi4$s^0%6cYTx@ox_R|Q)ILYQB}zCv$71Q3(F?N6d!+)$?mz#e(;CRhwCn8446eSF zV@4O|Sc9PK<^I=0{sS^RMRcAUT9VjOX@}9(3A!q zysp6u3<>Z0ZK%V(WzAZCy-uu_cY3s4bNhV z8$aXLAe6Spkxsm2ZZf({FQe$_g`N{5(~oz@k+{vN#7r5Vv9kEKTV15I6D>=n)X_La zYwUL6teWJeC%9LEYuv1z?kez-Juog*bpIqBBlu|iRVHUp7g3DFl|VeX$GF}=)Un3J zo+G2WbhiR?DCHrBY?I~eYg8*HM0@(hrbXlI&@yR8?~I@Wo9*S1Wz_VQ3b{MaPBLT+ z>3!_hM&WUO0)?9Kfdnrs)7#H;4i{!Rr;ut{h|rT0@m)67lgDz9yPZc{xy9(wixtcV zs4!>P$8(>84@P;2yx@CC+c^uJMk0+`R={T`YSa z1=bBxT=&9^E!PLiXCF%$3Kf{hoB5FZ8m!%phmB|+9f|we(jz+Vr4eaQ}_r#vIIc6~1cG~8KY5#7yNal!KP@fe-bdJ6_TW4>{pIN^6~4x& zRZ<*MZ=l$z+KYb1Peprz_9nG*w)gJ}f8{xOF~Xy@=M9rgu4uIjeK|d@?!x6q7^@}a zK#muyuDKU@42zahOSP%iaPRS)XK@wt!yTnvuZtO2b|A%n92;34Ib8OEWYX&paAC5I zB0=sZW<~U#H?HAP=PseUkQ#Qj z!f0UAq)1m#5D*J09TAX@ASlvAqzVXvARrx)4hqr{3q`6TRiroR9qGMDk=~IaAf5eX 
zZ^YyAl>a&Rz4!aw`@I_-vR5)QnM|^>lF6)<5DiE3aoP?351uumeXDZI{0L%^x=s7I zp#FRjEkhcW<<`6*@`~Wc&WaBZCA(5}1d$gPNnAK$W*P`i)NJVZumXj<$I{Ql_k?qC z=cbBMY)*Y*;Sn}?EW>Re>%z3{K3@9CIr)%NVCy;8Od)-tNeVYg5F^={jF1ml8`qAc znVL1|ymd+a{!(Vhn=Jg{$+%ceH>2?dguCA6T_0b~1HShgZRRnbpon;^yR+tOlwFHo zFqLTZBP!eNv+x60%e7Pv`rdCJ)T+vOU}dw5=?+m(36llP*Xug>K1=ht^)M4>)1@h9 zJ8nG3+SizaPMO2uO3qiU`hHXM7Wg(=0>cAUrPkA_m58i!8#koQ6F=&#dNFgYgX~pe zn&`CSJB~Lfg5sle)r45tS1;5*Tb(4Q6{+5o0lmCq#Mf(douPOzTT$aN(XpGJa zzGEMPjj{5A&0xVtELGh$^Dw2OMRdTMt%z$Qfi?47!UwSP$>VI|q7*;mW!fL4N4YIm zZ_LE>mD5KXv?`6-Cw*;)bE0dOZm%QD&*g9FyrBD-Gn1mxWxwJaHppNTaMo;0dU}$j zsMprE#U2B|K6w6Ysv-Bpi|h8P2uF|=q{aQYTVmrbzxCQ;aD~MnGtH)t)mn+I{$yem z#$!sLJF(L3er~yL!{K=+?g}e3#@3HIMlXwKomPAh>30xWGk?CvH*>G*rUwGMtA)SKX{W>JOFwMI8oD{}3S2OGxaL9Fog_?&U&+pj90$8x16yzMVXm!Tz&(?bPlG-)y7Ks~W@e2?ntIo}EH%R=P6;6#B#u8H zSKC-{(d&M)rF!+DlF5gv!s4EwzQ<>ac$8o8V9xFewd0Er)NUf6V((KG9ORu10JDUv z=1Q7c*ESM2m#1ge4f(Yi`I>DHr|lAaiYH@Zhfb*+oa$+>{a$M`JJRY935a^-UV?k-fk%`Kxm^a%=9}foo|#}C z%)5p0qsOD%nYLl?s}`x5J!n_sGMbAx?c~BP&-7qvT4a4vZBecd$G3<(YnwJh-W^r( zw=Jt9F^X$|YYov}@a#*$wT9RsVlA?+R zWsXI?d-=fUE@nlTYn{IK*m&l_ITkC_`w8sS)1NrUK66BGlE~Y?SzR+&cT9B`(jx40 z32w%%BNrKWi=0@L!$zahwsA?&kIW%vnla}YN(J%qnla`HT!azPhxCcWu>;vW(y?aG zmma~ve+TTDjdzqTl2i4z+vh~9|9%qnorfzDvf?RX+HUY%Q2<0BxIlnkFNS|% z`00BSFmdlyZ5WA(i2+ebQ6MfZ4lZB246a0?8FPYg2$A>CgK$$`5M{;72RV-=K%U2Cknva=@3z^5<~Q1(fTy`hBzBgA8!Ym6CZ-w1ZU8Y=mX}#DkIg_h7s^15C6RfXR-02u0vAl-CE!=nLWT3=(+H zB7q+Y=8)joBm^k)O9(;pNDvC+=mjKr`vnOSVE&s$_!~mv5)$M<_yB*`)zyK<#zxT9 z)dhNbdO%-)9~kc_1ykJ>V5YYk%nj6n+Ob+NJJbpmM%uu__$RP9)(MuT20$6)QNDr% zO{+-Ix`qV(5Zc#~pc_Kp1`-TH7#kY{Gcz+_VPOG$ogD^W=f=V6`~+BCm;&Dxr@;Es zG+1Ar2OBH%U|PnrMuC!`KiR8)|cha(yc@vRXq0(tj5L9=e<^BmZv%KxS%cpgExGpb&sWhRewOOZx8&&?5K; zKrK}N3k4vvZfk04LJ81H*lUM>SpHsj$-0eVfaU;spjA=5m-F{+zlVcFX5G1s;-I0S zxu5b|{G9{-pM+RsqXQQe6@_({!5#yY7GOKvRJ;z2IZz3T0Y5s@pOBj(E4Yswp`xII z^aciK2_OSBhrP}OyQ12DKfuSw_%#$DuO5N)R1~%-`X3y=t5E^nnPlKG*XSDjk)9A) z%*2Fd@SXm<2q-n8JJVio{7ZUMqz4ldJ&J*K9=d#k?^;0hC3H)o7|3V-Rrzm_c90$= 
z0dMVh`X6=pNd$O4IjRDGl)nynfeBUs#h^IyuNmC_3wmrMl8G5IKvhAkHs}WfR3(1y zjW_>-9v4}`ief;IVqk|%4|=g50`KhiMzk9Lf*uc9%+8KxfYRalCu#7>h6;6!jmWGQ zC<(r+@h|AHk&W!EY^VyLTM^X{C@4r^AR;2Su8Tt74=6Q$??Hdm9}>yV&d!Qb!STIr zut!e>8IUm41)-Gy8Ql4c@-bqOqL3V|!rwC>f()$k_SG2lSM=scCzJr}tf&e=38+yu zAVXJxh=>gNy&W{8|Em8OA$|GK6<|lH04ni!42oZTXOQ<-^iY8UkO6E2G(BWM^<4s3 z0U}6maOch)O~XC$zxO}JpTi@x0I&x8-RL(4BxrgRgY>=P|BRmT9nxC>-3Dj|dyTNC zgT3;h1h2jq|0n$)0|VOf9!uh zhBI7b0vj7M`exj31PYRWL6410WMdaebVK!m9}ND29upJe5C1WdiR|oYNaWSMZnRg0 z(m$2|=jQ+IH!_(IO^X>~}v* zkzTMPqjZSw$*>W48jIhw*Fqzsvjw7ynED55}**(c}L;`lJ09;n>B~w5Kj! zJVkr*B<;Tb{slQLivHrYYZs3_ethh|LXRrz|0(^i^6$&>|Hl|LFz;gQ4bO`}82zIZ z5Tj!kfcv``9)^Ox{txjG)FT~4bb&GMBK!jb;+GiX`}MCe>f6oe@L!C7<+p$RYx(}O*4Ea*$;k->8?gPsW{k2I|7t6KYa@uZ7suHO{lR8j{}2OhH%_t> z0jZ86;H{k$NOF=w+l@IxIKjQFd(dV~44NO2f)=>9*5W}0+Wk*~_byj|*pM@z4f$s~ za*l@-`0&Gyd=(V>?AeerJ+Ff-9~DsaR2gIh+yOLj2*NW!&h(D>hi5U%&{LR;}JQ19CXnh;b!?*a`7_aJBv?LvF;F3=m<1tw@)F^tD|fhDvd zn@{cn2bgX#wF~XUyTBO&%0}$t;{)J_89;DwFo=kV0D-Rq(e~oEZ{MQr#i^;OAU!=D z6ciMI`V?=_k?94xvVA~HhA-&K4FEMMPr=93r=Tu95Hw^2g3i2`ptB?fbd{rQ!YSZ$ zLk<}Fm;~nAlfh7ZIv8%u0wc|NU;^5Oy`f$B(abLJfi_{k>0J;wy9-`GcsaKVBB!A( z_{%PcTG$1zAr8tOoCvSqLr9z71-Z~h9FMXKLr8`;-QuNPP+MCI8k_g*!UF>XVER)9 znCkrqzI<)~(}Qhbez+CPjP`)3i9xVDGYBfbB0=@aE~s0EHf3lJZd~03EfD(GcERWM zT`(~*0cK~W!Pl8#urfCS?Z1;?Zt)BF_H`bttt_DJzhg^FU}p6j7=rj?8@phBYZokj z{R&oBSHZ@&C9nx?zZ>h{z#>fl3T?mNpzU{S3uW`&{cmscg}3>?4siJRdm3oGM}@yf z+RufEX`wFygHx*i5k+0~3k?MYSr`=b)BbZ2b=7y2m7t{);$K%*NcxWib-~KY%1V$z zR=K)b-@Ad(F95gx9-Twmi!goUOLo`zZRN6S?T(9 zSP7`3zZ3~(!Qii}g`!9*PESuOPTx!0WA=NqBtb^G4eL>A*FZg8NP#jzp(+k7i3%Di zdqv>m;rtXV>Mqk#Q&YP78$f(rh_9>x^>iJjS%qRW0X`1mr;rf?XJ$tzU7s7=(bLmH z74cn9kih$o`1o}XPchOSIpPv%V4!F4onW8hdNqpRcl__wi9vkYBUj!Tz^zCFlu)Qj z>@|f#>L2jU>bSWXApv9jbI9I67ZT|15ujQ-_z(D2`4IoqDLA=|O$mDy2E(ft{;=A6 zMZEX}zD4 zN}&7c@9KN7PW(I!D0~zF1sb1NC}pp;)Bhk3Ufn|;h!5)kwMt3-y#Vq8RJFCVQ0Twc zCn7qAj}P+yUIOL!5^~Y4?f(NG^YngTsJXKDv;;G*3;ERi-YD5Ein7R 
zwBqq4SOTPg%|MH;GYK)oH}Ki-Z@=q52n2z3Zh3imP8QtC%+3}-6A*#A8~g6MT!ew>$wheu3eUB*F_*8Y}1%*Qu2gTcbggb*8r{&)C$2xzR|{`}1U zC;j|}@cq|5@z3}_>E{PN8gCyNrv8lo18MIEHUjkzfdcdoG}YhW|3?1L`0wNIW20l- zzwU2)@<1%~e-t)M`A72XmjvU##YankVvjBZ{r4~N_f@hl%${^V_%r;cz$|)(MU>?n zGxOg4^=Ihx3=Dh!zh{avh%&pM`D^@Nc|iWmXTI~_qxx_17tVp!iQzN|#-)h8*ac%R zhKcC(zlgWtY8aJ2ggWF8z`t-c{5^&~IOyyB7-pfb|82Za|7*No&VIT71LD1P|DsMLkmyFjH6s#8qI`@X zpnQxJASgq)4teH5$PT7Igg!bb z{~~X1Z}95XE3_{WYF&u(C8|q)0&0^z!KcC?&|UZnbQgt#-l}(?uR0kFe0&c&nhL>a zV>%dZ{Q$OR5w7tdq{H=GMOi6mZm$BZ z-Ed9U*$H}kKY`JXa=5Om0yBNJV5Yww%)vF?+(;K_n4AFJ!=s>Lei_0FC|}wHrEpzX z{|yNmR(C)P^!XWtzCOdl!(ei963l!V0`qedU}-t6#>z`0_HC-b8}0ON(&r zw*1|1X9X<6HQNeYyKQc)|Bvw1`O(k+_s3sq06k#*G1ngz{+Rqd#s7{ZHz&8`oxi~F z*W%_r&&4S$qV?wp?m~k65W(Jo)8|i%a2FF26y)aSzxZeQP%d|IF(DzXyKqN+UrOx# zfa9TXh}ckCTnu9HqYB~DM#+G^zb7H9Ohy9prKP0y5jbss&NpTwBO_LoMk7EexM4-E zKn6eP{Tb=W$k_Gv5g^8S{-5#@p74w%GBO2ATiXY=5A>vkP&NH2AHyGR$x_22h=|}y zlj@!nL=gHp-;o6(93mrwJMQ>+cvN>G0!oQL%9m3_ZR$b}L_`D-!CFe{lHQ*DaGw>1 zqt2{Mr%)oG5%6KYG)l)m=rOPiOi+##%F5?Z93DS>2v0ozIUl8{M<-AvK)(k954el? 
zC;3dKO0Wo639sMs_sU2Ab!NJM16|8Aj=$yaw*ZXg)tFD9Dsk50+MnfPV#*jlf(MR2 zGQx$(zn1^YCL`v4{`c}B1sdZA+J63z^1rXO_O3C1<&WAAM};4g>fh6&vrwcE0gdtp z`X2@F{lG*&Z};_&V*LBCmk(V{{v^=Q(9s=3@A&*lCOT<9AJsKy; z6Kc@r!vH#;vxDwnK4=%00-3HTzv|!nR_8sr0&>0NK%v(aP~?9dBbS>ieI=p+1ffe|B?oPvpx?OC#RUxuNgKMFLHC; zGcx!!-CFSCMV^Zn?))y@Uj2rZu%Ljxfxf}Ldni^1a8W`0204kAsIVZ4go{%Q7LJRu zAt0pBk&|m)5)~B`@be(k)J$AZIhyzkU0*umB{16OFxevy&&t$>~|4v*2Z2 z$m>V?RrZr7j*ycOA0#Ba0*goS!9hK6j;b(0PB4Rz@Y*F|)aih@2=r4#LRI;Og1jsh z@j;pU_u&g7-}$3t(l~h^C_p{ONFdNd*nf_}AHRh^dya{T0@Qx6s;Vl0&dxx@ zOawqnBCxTs0gjH2;FS?4`n{nrlMCp*-#D9#Ant(xdapO>q2TxTgM`uhys7ZHmF6T0 znp}u}tgSO$#nA5n<#@`14EV0khiAH==+%Anv#9bl+=GpS`>x5(;4)kF!B zPeD*n5Qqp51MlCz1aDqPf$Vp$;Q4OBASdkwXny|)bmhMUEx93}r6?A>t*Zj>+ZsUh zzzBE*c|D)m1~JoX@Oc9FH{p6X8SZ;#!?p2;#SM@L*R$<#J=+1F8Ov}zidvIy?fyDG z{HFH*{q?^Z`0t{D-#y2!OUo+$B$zzYDORbUuBF*oxVe>J0uB}y7TzTiNlx~2OedKX z0TvuSFVd1-=Hg_Be#WxsYeAaB$3#Q~B-oi`(AO6jXbzE}o+Rwo0Tx`U;0lQ{9;Z7< zD1Q~8^C1f{MzVtk`QZWul@EjX37SLn_^9i>;TeaIhvxudKYxG7{w;qmGC%KhgtCd!uh#YG*v|lB`5uCq%-bZ31TUf z;tbM#blA5{I|!>m=XBYJ0|$&S`=O^Xc$Jy(TZ8-Ytt=>>lRtnhM`ZA-n zib;Yc`fvU#7^j6Bt(TuVCL)aI_R%9XJo33mz1s|~D9LkTPTdZqWx?J@TXs;IOmRplVA{>_TPc8%6wWZqAD{_kolT@~iy=URfdwcS09>Z@f=f;PELZcm^MqLfcb_P8tY5 zH|()PY)UH?b->_MrY>p2|!0e5-Bbv=ZNFJ=Bw zaW64ORB!tT@ZLOTfGt7h0bA`V6}k)lmKZmciZEE$yY#w%}Ne45a9>NRL_J z|CMsCwKijtoIVD<-n4yR$Hn1{kB|F1H1T;nd17MdC61}5_iYq<%2A#@d$ujAzOGJQ zU7Y}UUXGDELFQRAN<~X6!F;Hqr$+@(o$CXXOYC;FFLj8pvk+_>4x=vShHyUe_Bfe} zjzq;WmD_PPHPzK-D^m(I0=6XZTe}sJg7#_Fv*=S^TMe@49nYS>1H_xyt6SRJA65Ar z)~$VZQYFiPbY^yzV1NY|%3)w?Drs-eAH=NsOm;GSw2^19a=BSJ;BJ)TA@$t*EZS_# z1k@mqGEfRDKFaB0li-?0vc?B8#duj3tKf`NdfI}5g6oSU7OFo&(-NlR(0)o;Bdm@Go!-F zF~5<;A*O?LbXCJ;3N+M5Dn!nEl(m`@22`;e26DS2U3CH*)B>x-<(Ry@GiT4qYmtlU zno!1dD!+MnmAm)IJ@)s-N4Uaf&Dydqo|j8(%11&!vi0)A#V$(kZVvnN&zFAuDAi?6 zNdT+$43z{=md$R=w0F1kzxCwF(uAw1v9K)-Kk>oaa!7U@$(rYOIRxFTw&tpbgtJC) zaXo6PInH17-;sRcp3$G9p-5VGrG2J_uLy(S;;XdeOv;G%odFA{-HpkZ%uI5v?UBQ+ zeSNcQvuDo2zK(lX)0UwtRcpbhc=zKXZ)D%+Co-nAWrJ(Ss#>IIfSeI;X1HvN_h8)( 
zufh9R{ueow&Q%c~^!2StGeL3`epq1IU2J0oB1V_ESv3TX21j1NjWGG#p#5#`Qyx5} zY4Jm|<3TvYEN01ah~bFzsASwZfI2(Yo(yR#;97Ejwt`M`AfRMH2>+maLZC*Mmm-5x zQx&%&ff#o9-L4ncs+_!sjHOfP!be+5@#gJki28HQ&)p2~_n-Z|dXPD?ZzO-m&(AM! zxryI=xsgW~o@tdb?nBB6j>}kDa@{ZZa-myW%fU^zVnS13efvIKW=ZPHvmUI!O%gvc zl|m*Q{McwKsxRq~cc+W$Te?(?wps5P87+0D zI#Kd`wq-m>!>i4#?)#$J;oz_rJOa6P6~CD08(j@@efre)P;(#G$YjPk_sEM%Hp(=0 zcP6Ll!l*v!-YQcrT6^ZQL&_w5&wDX>BOOfGQg(z(Di$(?EDh@v7n>KNtdq@=oR z<8(NtgEWs_c*t*LdV?v?LIn$Pp|9Xh;!*3ogwQ+Qj@$1Kl`Kl1lzp_`-qW;2IWnm+ z6ReVY&uW;aNW5lb{VP2FmbRq-v3U6f8#DI1O*H+sqkh$#Re02J0hc$QKkK`J+|gm% z9n|e3iIS3%)MfII?#j^I#xb|^U9TSrezG-k##3Rty49z%IyI`#!@0s@Y(rZ`9MzL@D4zb5UZs|GhYj<)%7v37QKTh;Bg9L> z%kb!NTdOg_E~sJED$!fm+JiN}y>rtnmiJa=)D;D9 zVI}1_8XChS&ywKj;s!(4BG&e)s>ad_yN+zdnZqYTP-8`JwvnHnx7V8}Y^uDG%KUHn zJqcTuv^U?Y#Wo@veeBKe*lIAyUg@aKN{-*UL5g&V-+B>$$loc9XzsIL>S)&dv`Tgw z;$lkv%vpi29~t4<+$}+84#830(=7C+Fp=d%gTll~-WsU>WeRo#@9}ZHJhN89aK2P* zTA6GqGyeg2Z>}80T^gIkedd-UO)MYr{9Rv0o%h2~>oHPylW0~yP5Sqf-KE7#EtYwy zTSW=)uM@ATtE;Eq>pW}RpQjM2b?qs}3*Pu9VWqI#*friwT*ZLa5o*!5RZ*%!ABr$$ zyCb~6RUc8OwN#56>c?(3uTqW3+j$+FwqVd9w9+(b73JgVW%5amW$u)@0ll|^)*+ zJn|s?xwPc=ih|l{C*gKS97Ja4IWMiV;NZ}%U#jl;hc$1!xtayYxw$tNVpQl6Oy&~} zH;IuL+I?lBJymSPQLa+dX-)%)2}x!`{fcqvIb^eg!2o-E%uMSHgb1*+mE?Y&tK?E}PLjvPklACBZ~w;c;s?ctz|e~{hNR4c(Kl@m=_ z5;ZMUGj)7umU4F`<1;j@$KUk9=5!&8PgFpvq*l~Oavi_A8x%ACZX$Q{{_K}i!@;}9xBS@<23Yw=p)ebpl=r2u0ZfY?| zZMEJykN+@_u4NU!tu3;6bn2e6Fe&_Ihia(iHHspPG?m+$#}#ILtH``9zmDC#-BXC{ z!Bd;r5!5$wzIRq$XhAkxpIl4-ldbGQg#=l}R_c(Q4mWWgs;f(+JAEG4P0yUw zpQ4CKNpTSblFMaEAT-`M#^ z>a%Pgk_<h%V2buz< z%&zp>439ZHl1piK)EV^U!q?HdfpdWmPQ8+j#jXH=(ERL^oLQG?K%5LEWn7jE)igNj)Kw_X;7>(a1IoatRVKM*`E z+kz99>iiWxd8cZ*)gsSOZ$i2tnQ&LCklQPqwWwkZsti4Yq|ZGO7&vWHbH*tVe%i?L zD0KE{u?Ek>H#hB8R#u|ly(56%05UK#x+E=45GNbq4QF02Ug}LQH7cGpZ{!>JCSDV$ z>d0iJc`{hx{b+*8Ru%0#Prklyv=!~@cCni>q{unKhUQiL`vTv3bW`-i-?TD8pH6lG z0blsxZe24ok^-AA<=?*awRU$;zGG#3ig8VBqA~0VoQu-c2eZ{|Z!BN>@FgoT1)omn zf#%TaL^*tB9+flwG^?=vFkh2F8$O&B)YX0B;!cHIwOz?6EG`x@?#(*TEU*pf}xNaYh6dJpm&-UCjU-LByvt}Wc&tbOe&DB|F 
z!l7kf4y2qu2DM0QIaP&T@GXzo)Dq3Dp{BMI+tWgc7KWAdl?UG@;a3s?*70zyTB_xp zv&T7Ij=wD`qRs8MAdnSQLiVzT4yW_xtQ^$dDifWXr>lPLN=hiM>nU@BgjQNGUGhQG z9O1Kvf_HQ1Bx3oIHzphMr-drGcN4QhzUYNDnLLe+?4G4;I|n~js&#!luhb>uo{JL3 zxQxidqw=4yZ!~4LH8g-l-pOq)-}Hqo!!m=}Z2cV3$zgrM|w!ivz*+L_+!q@?eviB>|08`YsHiB&oI1O3zh(Cr~;ovU0^m z33N)k_CCN2oQ}(`$8ZR+{9H;^P+{~OsP4lmV9N|ZT1xdX8E2i&aJrwr-1gC4$r0C)zM zF>~f~H7|`*#GjbC^U5eoBY~x5b~H9;tZk-MVjd^%XpxB8!OBuG#xqu|oDW0(mN!N)PZPdw~`mR(2d1#)BPw6?4C2~R~8Y265@Agf|XW+Z5HvLbFN zJwIX<684@=^@vJwWwX+VvV$xkLUI?uicw=7zPuKO{}hA5Gr#Vl!?YV4!}U1gio%RZ zs^V{pMxLj~BIHS|)wYkAUzN;WVT;1ZjD9ojZtJLw(%hSts%H84*q~8d7uP5KXOTFQ zh1Y|x;K@|tV`;<9C$ihDtzX58iqA9v!x z3$0&osKpVBQiVA7Et_bUReTxrk=cGKbhJTxXfn+-;ZgIbv-eDGagD2yUx({n(*U+D#8&SiF%*u)mv3O1M%b+<_tBqb*f6jR=A3BsXiIs zTRZ`j6F9whFS=CY9w8WT7g0^u!9FCXmE0)I&hGu>!j~MOb^oAvVg1S&5@)7%M=_JC zjm;FD(D~7vFAs7~hhS*5Ra)cl0B&g+qCvu;c#GqH;A?}D@-4jvWNQ2ae75FmBV;U3 z@{Ie^E28F~KhO>{b~qZ+qi(2Pae-ZCr!0^xU92NcMC%@HE>&5Ndg;2XB5sgH)8SgL zJK*74xSW#?Vs4#S;J@e=KbRJ|$;o%@$vD|kaBnwt$n2DaF`qV~zS$~Dl5`f+@vDN? 
zOB`@X?t}FhqXj0l$gvM&&!kc7fVR(L7@~*bnVhzcg`1Q%XqLP=_+fYDUg$+*I+Im9 zZuXbF%XhTfaz3u&KB1R?vd-XmA=GswGyK9=fznZSGos13R$u0q0i?&ELoN1y#=-*rNd6!ffda1NQTl@LAoqK;P zUdICIEkhEToCTE&F9U-IEo*Bb%&@e% z5gzIg*=-&5+teXy+xUp(D~|8a7d6!oVZLmm5j$K(oX}W$hYGx@o)!Trj862hUhomb zGf}emjns&t+a>gJ_?vYG7F;rRMXz>~mE+$8%n;A7)pxNqUbrLb=x1f>q(f-unty=S zn^R29?h%nbiSU%o^OjHH%lRF+oGt<+UgFRk6VtfBFXUbED3MmOKQw~h_+n>{E-opH zoU$I<>99ve&C;b!V@^cX-Eljyh%S$w zdfy|XU>vknC_v_BUu#dtym=!QW;!r&&a>D2hP0FH&hW`4Y3#Y1uL(j#6Bt)(PYI>R zX~%6<=D6{&EA5Qj!$H(PtCQlJ$dSoDT}^z-nu&lmwE5{*TCpc-kzv$Qvo463zW#hC zn|rUPQn3U`Rr1VUb0Ak85l`1f)8e<<-abwuFOe>t6VKCRP(SoQMp8ojqDQq&1*^Qh z@SLG{$#Yg2&C@JSQnO~-b40l*@kB0{!Nbx5z=*^a$w+R?gcWr$b!6Aj%?tr_E&Wz>`K^r47yYF}v!uJd^6< zE`;<}0@Q#)^4*OjlbUa^SNOmcME=>2CvUiA)m}a$TN`q{{=<=w`YDN|{sLF*(#{0| zO3S|3DdeKov&n%Bk&}UY}J_#Q;NHBgRVzII-KCR4kEtDtgq;`DMdI?P}xyXm(4sTdhILy zO8K?V7f(H%EpROJGxJ-0QV|Z9mg%{<^AjZl!YAfpKS|bqacgFdAHKdb-5Gn4!bH_4 zgI8i{^1k+lqocA-hm!KhQ952uB&`iOlkePk~D%@dD}9l)gqSnQQuSHx`3Pa66yvFF7 zjk5Z$Dtp1l33H5pf85rb;dGY;CwzuKVo%dZe3w$0D{OrxC`zCrGSCxK(g}-xwM$2} zC(&eh!Xec#j-e#!+ICuBob@w|$DW3p#A)*wW^+AHZ8oiLUO%FW)sW#IbeDYT0muHF zfF_hDlN!0}<-<`RV|U?X07BB4@EXm{taAi6q(-k7U+PJ;)@UoNB<&{}I>G5eV$SVS zrm$x4@GRolx*&D~%}Z&w=QAzdTBJ+LURrY>s!Oe64olU$N6x=XW~vd^EF|R7$e?%` zXmKVOQR9T~TZ|K*rbxR?-=cwlx6gH_vyB7b;zuC%Ha4~@gpQWn!#{^s{UjcTm-MXO zRV+%D2i}Tg)=Atvh<4BR+xK&dv9z^SIb-9ZD<4tvdJDzc0WB_s5lXwAtp3(`U)Y2q?2*c8~%Zu6CymVcB(bEUGNpJq~2Az zJCe{h(SAlGB;-B@j!SFLW3lZ|xDCeEQpKfinacGJq`tP35{ zIZ5SYH@YP6njdwG$>((IO~QLMYb38ZahucFq2T1M7vtgydA>oTSA!x^tnl{G={I>7 z_Z+riA0GW7xjRC~B?dP@tUDB6PKLwCkRE|YnoC-0D{%X8$|`|B?KyZdv&Q|!(Cf5i zPnxF7C?{96ulju(&wL&%4(~1+DyM!m8$mUrw(^53p>1Rex$jRP@SfLQ&16qHdtI_# zKxXYg_BiLjS|sJI^@iS9gQs+!e4mC#yLPr)CtX{EwFPwteOd(Z&-;o*#W->Zuf{kM z>tR!ic#6m9X>b^Om&n}h!{s?GBuwhuBjZkhj3`J$AZ27t*&$5M4}QSEd~r&WZVep z#c^;N$_)}z-7bamU0WNsc@&@_Y&H{anD=ESGh$?tLwoV{Sf`o$2~Wm^Zo6$yg_jej z?g@Xf*N9KXVWnY&bto2Ejtyy_d44cr)k^EsAd*Pv&H8u*d5D^_S6`~C`@M%fQB;`z zJ0IY<*-8xxEP10<@^FjM? 
z^2^f;RJD4nDW|Di6}CyfC0={TF@mfjelITl$O`Lv@MrPNznI*={O0U9N ztW6BtEW8+RTzf*WbGk=Qy{al*x5n311Sk4^E14=sQSoefRc3mIWd2%2jjOIOYvDTI zVXRUv!@6en;B>F96HdW7#)f8R5EAMnb$P3*opcHHH$oOJW4B1-kZ|G>AJOXY{P2F& z+eO2I+GWyHcrG2wT$Op8X1*_e>ug=UdAofOAOFQH49_By{-8y6ybhTxO`-}F9DG;I zs|u6-LHGEhRU}`Ya!h%8t-n^2)8*->Y@=WfXIbePCyXoGHUkw*PU@9f3{EvuZqj;O z4;R!weWBS5SO9P2sB;+w+(?2{h{FvN$LS@E$(bnhZWwy!XUE_wZH?G~gn64SiwE$5 z*U%Kf@3&#uH}jg@`2x>6RaLXzQlFoP>+sC-XNJ|Mh({$~bJAU7deW`mG|n)+BYE%8 zVSV)TZ0`fggv(jQTgg|>y~81$)=ae}q&!H81$|P^zU^>S?$1zyQop!2yHPue2pA~_V=1xd|=d;}` z3uF^?Cc*c`ZRSWRr06o=!096mC+jCJ@>4mswcFr4(W$|4@a~d{_OKQsPe_WS-(rqc zYKG0|n?7KLPr_-xHqYIJ^&*&~A*?llVrg8H0s3i0qt4HD#u2TY4)_*vuKUSDPMmOa zi-ID{rAH6L7v#S5dq?bu6Rvz3A+=Ednsaofwe!~LqWp$hA5cVf!V138n`RQWB=cUy z>N+`4*gljKtu^D^N|P57tm?%dguQeZzxVRtTPBLFD$05bJv~O%PPmdJzwH{i|RvWjyH!T+B&;#9o0vOr0dR#&nib>Y(0XFAFw*L88ed^64%?L(I_=z zIl$?1MSp(G(B(c(9f5j|F~>y7=-apJ&-kC*+}7}2KUc{{q_rc^Y-(dvl&QPqn9>&{ z_|-pbI>4^xLb9$pn^J&uvC*rZAcn!#pi?ZqB3k*rFD~ev)S$v%R7TB8%S^CP+jL+2 zvY%F&Di;h+3#A`X*zRX=0tS)CC6F6=)&tH=wtPuvgp30Gq}&087OXBFk}I4wvC?%CaY z7h9&yG{S?|@kJ_lO@z+ESXzk~)sfrovQ31{$PMU3m#?0R9*jHm5qzyYms+UPpv57# z9>Z5@Y6%`MZKl*&7%C5?1ibj>Mb|o2=eb%M8uU4uzW4e83UAJjZ&;KBN9;yyN;R7r z!;i3i`&uKV^+^IQtS*>v4)Qx!7hafUc>P95g3*k!WCMq;Zwk{!k#vXtl33^~T5nqQ z(`qG{onBo`^0vv6-E>M(Rp&cxHaYN=sFk8_n?@-Cq6Moh-NKJl(ppinkg2BaFt)Cf zk_6w))vUYX%=kflu@Pbv&KIi~?p0KYr{PjQs-c&!QD{ulovl2TI;po3O&JwJd`Zwr zJh|uK7b)Oj*WY{fdI>(0so7S{w%9gm1-QAjbt@HD#j=;iliwp-4u9nE3!3WL0=KFf z|G6tFKh%$p!H-hEg=u~JltiHZtnRZ3;=FAc0M@*rG5hPs(E#E54Z`m;DKph)xJI%L zs4?!oNNCslU{XTTy!F8ZH-i$d1(OhSRq#ghWfdJk3+EFG=VVSEB|VpO>9cI}BvNyW zQK8eY5gGO{h$Bb+8e9)D$_XrCT{BJ7Bh336{?KiG)veJ}jY*%pL}(-`Q3z{ki!33@ z7d1FdAG>=_UWq!Puemn-#;WJ6S09EwU$@XX1E-RU15I*cSa8K=G1bD*tvysFt*_ga zbw+CEwPjHmw8Ab=9k-|8&yme?3ds27R*)6Oc}XInVP5#qOD977x&W`t&o5 zu~5`uY9U&h)r+;#vKZ0i!Y@W9#p0Np@MhsIjj&%2GUv|C4!jQY8(d#{j-=PlKBN{j zs5(7SAs;}8sxLp8%j;vgIfPjbR7{pdD{Jw->7smrg!Isi+Q8M5+w9==B?+6$Y`#1+ z6zkZKRa8jSa^T?Kw?>RdovO&Pa7ew%_LH+5EA|-q1-Iy=Z&++E1vpo(c59h3FSE@C z-L8*h753 
zwEpq_@k^s!6}dNv$d1Qs7yc4E7Dc5iH^o^C@GIY?7JD~_@e~B{+m{hKtoACwK4eh2 zpqDSF(KKgfC~Y>dK!_PD5+x`zuO;s)6qv-S{f$=h^KKYe+=EQ^}$H>6zdn9TWbrFdU_|J zt$$K2r_lGdq4LqLh);g`Nmpr(P*pD_Soft&dOF*Q_H6LhRK;8C%&$~m>#d|G**O=^ zZ=ZDNzR{(y2~Vz&8Pa+wLw`je@han3u(ie(bIRnixVcP2egXN!9euNI@);WZD(3!p*cQ=n6j*5Kt^+h zi?!p0&Y0K9O?J_eoix!l2juO&gDo9Hop1XJj>blx=4U8>VySsxNE9rvSZM)}4nKqI z&+I-woRCD8z~P6;u6NC8#>w?NeQHqOcPF#EoC3i_Mx{FnSaBC;#nw*Bv|?|5O>2A} zMu(aAsTm&{*dJ?SYOz^qG2dCb!szOTwTVYG2o?`ihonmK1!7Q8`=%eL?qLC480;nL z0#^Ov%t5FJ_gpQZKoG+n51>&+wWVAgvcTr410)D13n@#LF*E{zzOe&oAHj+vt$$4R z>_*9P8E(5pN=d0W$=+mM5j)!3jvwnS1X=ZHMa7kFwQ=IMm>ra#-8>YTwl3&$lvFC2 zwz$D)9NWZWS3;MTp-WlM8CsNuh(dN`=s6n;KlvR-XqSHOYZ#X$nYykm7T`dgvh+;- zj!+-g$5^{O`2cll-oD(IuKA8i>g~d#th*F`hcQ+s*Vo^iV!H!k{f2d3Ta2;l=E+P>}UXON0>r?YFJmTKyG`^?SKwTOjmX}H-% z&vB3Su9KK)D=wJTx} zBkSAY3D^1vJe9NfI;90Yspc=OI$&*P#g)6ho^=fU3w@!+lRC4y4 zpyLE$NKCK%_V?0yf4~uIzw`*(oiMucwl6Kl;^Q2VQTzAQ7y}C34DxxtSQhMrHB$_oLB|`rB!Ltt zFvai}xWn?!lMdU)8H4v5?w9UB{ms+E$D{nJcG*c>juFU$p>QqSS=ZWYZoR?l>|NQ_SY-Jxkry`pWxkqr|$XeCt809TLxL z+%IQv*?l;I#1lRa#^laJU4%F8T3uf@L$S3Pc6NJL!6`Obl zDv7~hIN=B`>@_Eb!~aiv-vSp^wf%q4O0$>U&%OWr{qDWRrQ7fJ`E^kcH1ztwXPTNA zUNw7+0~}y-m>FjVL9@iNw9>-Vv`hsppP8Co)2u{Ge2b41OA|B`tWUVH85+B-kz{m*91Z#Eg1Ks9J`+3|en`3V{O<!F1cv5qDsi`N*mVaz^A%%^S91UG#%ey^K_-@+tX`PYFtSwBEM-hL72zM@~xn z*DQyz^WVnb{b{4KuXf&6Z}7g8O<$7A=57jk34@N&w-jA2lG+syv?bhOD|tHla^~Sb zN^eYv{@=VUeF`$)&70kBbH=+1GgiDE^zphajqYildtlA#keBbwShBX&%tnPzNo`NO zx*_B0rHuGBLFs>aywB5hMx5OJ{hZsR`)7VMd0_Z$cdn7{wA}LA)kZx#AL?CvA?oKT zL8)6ZUP}wQtEm3Hix-zH+u5M-$=m@m4(|O+8|;O=cl(7A1O6wh+3H5Qqi~Vy%;Gn5 zil2UXK-S_}f1PS?9kcq*srIfbX7yfrYfkIj0Xqv$4+d*&uR0DkEPPCAm+6pC&3^aZ zabE(`zBkkLly*aY+H?QAablJT9!s1p%ufP0wive#r zwy(j(JH@x(HN9TrQ8}&9hQ2L*(qPbtCll{I^Oua7(#k2}Ee8~DKT;oC9%G`4WU1fU z_ruE?NV!Fc_cgh@aIy5#t%dht+Thj`y$@oyM@R6c{-D&;Mb9>L-uAago_nRV;qvxt zhnw>!tQ$Ia2?}`FD~{*x9@Hvhvh+lctN$Eb8nP~I-#vx0)a;>=?bkPnUDNQR&Pb+F zqtEW|{ArV%deXB}>!9Udzwuz?|BU?kj|;mRpB>X)`sH}n27}t}HUA?cMS4yi_t!gD 
z)5Pg?&-fn8x{o{fqOh-XT~Q?rDA>*{8a@L)ND9VZxQ@!>X8#NOmj~bz;j`iQ!k58o z?lgYvfX6KtGzh-D@yN|ipdb7|OMCb);j7}VAn*$KR(_Jb>A}FA4?xDP7mCBZP=vdy z2=`!-@du)w`-pHi7`5M{{7wW6cVQ9Etf7lnwOHag34g_D3^zT|9tZ;GIgEV?=*InC z)O$Y>?mZ*eT2kkHBHV{2Xvf(_f)C6K%|C|k?-moCV<}MFY>%XG`kz3#H=}*zB9V+E zCUC!-pugw$xLg7}z}W|cvzn+Qd^Gsb7BZmG?)iKU@o=Y@=K9?9gsTVw;V)Q((|;oB ze~<|0P7&;CDF!mo3%&>V&`p(vPO2=l0}pzGFL|XzxRZ@=2cF;w?zhw2m%kzU&*EM< z!u@^pbOF%-@BlW&6bsL?@C10kvcUC$Wr6EMN65iPziOlseT^1@Wr6uprqqMd-va%h z;0~rD?;P~pA)@Etp8+49Do|vhuPO_y4{$!2s1Ga)UBHL&(0`i4-G8n+arB}B{f*Gy zn6b@+3@1Q#;|>!Uzyp?rfdz^zaD9m7`k=}J*9T2E;;ux(J&A<7B1zMYzXke1awB-A z{Q-E|0s51VK>vXU@!-KAmIbvwuq^aLonTq0jBeEEN3xtg`#W#6EC2^?-v`03frr-8 zyIr7vFv^m7kO&^IEErW;;QGL_z`D_ZC(h%ef#5}d_id1_Ftvj+L`r4L`i`*lpe6>@9_51xllQe}bb1Iq%}2N};Aa-Sm|^=Q@d zWq*`#rz+u2QNo#tWQ9&k0UwwL znk;aAsH$${c9Q!F+#axAcmi_4eMy`Hp}G0xDOVshMLVZ6XU-gvAMQUT+?`7}i-T}y zFU=j=ns6p2;XYqlKlf!pKhA0*_Cvq}iz*8^W0G)}?KSB}U;V}e)ESTd@?*YupwW-J zi>W#ucODb&g%$MUj$c|dqCG7f-VQuyL%6$`a7GE?tgUkNYdr9lg=^A{D0lCEBe#{R zTUnd*<34AR&DrBe2geJD>jJ?8)`#$?e>qIJx7siL znmp8$Zp7W-nmbS%^y8j)kv{ILrq5pOO8#b{ldfVgD#MR&a{l>K;^+F zf(Pz4VfpJYR?A&PCB+Ac`%4$joeu2JpCwk}xR0HfAHMpHF5QTJv*(yvZ3A#pj^5RT z{>K@cg8qx8CA9kW?kW$u(u&b44_F_HHo>c1RNI6(sjcZl^bKY^9wnT~Nm;fQG{X`} zIH!}Q4Q@{FJs(auZ(R5pL;gkU=8PuXAx|HktX#j*P$1CbKPU9mOMIre=>Jxy@56C% zZxP`@xctw?3mN$0 zjqWZUaGRjXM+NQtdw3f8#$R@#qpYlGXb(wu! 
zJgA9w5qE63+C{BR@I^oKf$KvS+DEnQcLAx}szU#@wujso#yQi12W6*@QQ?NUv}?&L zv~6x0ZJjfeHh<)#%^x~x<17blm}RHjtReKxO!ygAS~uN7Yd^41&a@QzVsaw+YZpD+ z1g3vJ^xM1WLu!5&#unSpyJ+X$Z-_728Rk#^ua$l+-t*5tPq==IjvP56`UZj_>YvLO z%jjH5vEre>aCf5KpD3bV3U`X}V&HbMIWhg-?#h)bgu}TAcaZR>yZzB#oAhgZ@Oza`;DZ(rrO0bzft)OYS41e{|*-_vFOZ`7Z_>FKRsS zY!^#UDw8`kO@CF}{>p~Gh%x`k!@KA!?jL~%r;q(0c*1|S%pIxqjS$e7(ZvTP(qK~*XGe(RPHcU^Zqq|qrzAvWIN^P7_$?@Wr#p4C=;v_)x6SN+{b3X3 z%^xPlOPW6PHCEE*l6emC)6qTX+tnWs_aWG2W@eV7oy+QGZ2vSF!l|jWWy=>v=Pmzb||lh8whq z`-r|gk1PDe^98Ri{ny&|`@-c$z^Y4sRogW!JoD{F(CeLN9sSC@_RTc@@h+p*SwhpsxN0=73gDu1B4`o_T0=1*X3C=0C3|t-kWAqrXo7)#<;A_5ar4 z68)C^_LmycchjP2-+OHY-gz%K7Viw>-&W^fTRum)nHr z-`hvtPRDZJ5uShl-stx(um9`xS6BWkl8?IYbsVVU!0(3xJjGlljTAG@h0+7!nISdM zJ#lrrc=vi^LGao+5g8BRsSzZ{+y$1cIM!wz|3!-{v{AA_%d1JNv(HSoC? zJ{$fj@5|ngJx9y2=PDcfO{Zg7ej%{miv4G${84eRV5IKl61o8Z;BgoHZpLjk_D>E3 zW}3aVx7|J4)uqE8ai4pM%6J1%)@2FY-#!D}ZLk+)`+2sRX1j6CUYc#j`TPjB6Yq-O zkLN4)fnO-oenI}n=3>v_0U7aV`t(Tn)An<+@#Tkz>?TdXwc>YtNKhyNf9=! z7tf!eO$$cT$0Hwe+41gOJ6G&C)oj!>J3W>swq<_~Gz=^VT(`L_Sf&P|j5e~xy5jnK z)4+H?_0&`QdiLzOu__)m2}jr_C1;vh*y6JNp|CN)9=iN>3jW%jX|^#lV=J9zKcJn> zAnJC3@4Bto9CP~+4PS_|frAw4+iAuhHbNVH&C~jWb?VfKjvqfx$M$a$c9e=ev0_8X zG~}Q+iTT%Ub!yz2v-&VxVSixRNm6ph3ymD zf-V}N*nZ7JIqY6NojyQc!4rS}t?l9GvjM7Fx4VIs3rgLXj#|r${PKk5d z?ZAFbvt?a7-7akRyz$rG*_PQ~-F~V-(X+nV?H#2e+uha0pKbB4Ub!S}@q`^7_7t)m zBHQ3;wx~r1cb1bO4S#-KOWj_KHXZJ4b@5-md^ue>d(yQ}nr+C~c1WQ?v8P>%y7!Wh!&t|v2vQJ+<-@wymiS2dX_pn{e9@>$P9z94e zLALzW?bS;Ew6MDPj~X>fl;g$^62$&W-f!-*M}$o(+hZy^iO*?p>s8)c?4m)~-s$a) z7Qbnr^{Au%>h^x64fwRW_zxU7P~gA!KW`}ZbZko|Y={*5YPPF&)8N*te9nZ+h7k5L zY%9XHbDlO+N6wb|tlMlmsOrxV0rBtBu~K`(5BzXI?D1qV^VZ*6H^Dci0gD~BMttF&j?YwBKI zWzV`d4c>Z{Y0%C)(QH|T{T}RC*=ElZe@)NwnGXjPJH7RuxbtuC_N(21cEmJP#!p&0 ziLPRgc;WgFDR;^Vgic}|49B(ik&5v5 z*XQvEH=E70cI{f?GZrpgD8>Hqv%;VK^)Q_(K0v36*zczj_DA>AN%*C&_V%CwHiWRD z(`*P&!Di?j`U&jM!((6R^~uopIuWyBZmwTb_>xkH70>lbf5Nx3sE!hXfAliV~YI!V*3Er@Mas)c_g^^d(N6Ot@zGsfzybr=xc7-c85y*9g6u zlNn1JXAUmcHhA{+#930%t2}OF8=8Lo`du^rTD_^xbJ(z9g6BtfenuOoCAjsfu=Osl 
zSC@_HO$RnD5pgC?oan-xak$>~SEGBxh!OPTk7)Ga%1-}6#|!gdi}M8?+OmWSHZ7vV zd7n|yzV*tBRnop5Ve9a}vGP>M|C^uxeR}uWay{zewdB#eSC3tR#xLG`-fqPEt~c(df{zE$ z&M{4B@5C_io1eR0|10en{YT>O{>EQ70^dM+G9CP#pN#vtqdUa&w}O9F`R|XeI(Qui z{BZz7qQd&9(RgMADNnqJckKG|d}2G8c-NkV*bTH(u>Cerhu~x}@S9s3{ z?_1$DVZMVU4*OM-4-$<4UT~XgI)9{jTQPPm@OkIF&dz7v^O`8{0pfj3yeD8U=9PKR z4EAYVsmc72_pR^`{CH7Wi{iZ(yk~*eW_j-j@2BbyTzLNu?^96EQg6&SR|gNoZw)Ud zoAIQdKtIp@^O`-czwllhE{_E$_j%YK!uNj6#=3^OF2hmS@qUsPo_KKosUn}Ao=)dV zv6o~1D4LHoFJ7bAmcLc3rSMuWuem2;ZIRbUp9L>?PZQro!Fy=<4hpf~0kZir!a#0- z9sN(-5~T)H6UNqOyw z*OK{s8$KtU=csva4xa(bYp;CvE3Y;2nXj|bJ5%0wJID?kJY~}WJm5}MHh-4QqM{<& z@ctmN9<1TOdkT0Rm~r5JDO@Ks9C+Q7*XwxQddaD)LNx?YrTMB zedhI{lP3<-<{7fUfzRLKHF3s)*KPRBeyv{eIykRg@;StO)}od_X1Jb<$!iIHHNrXy<2T^Y_$IybgB;=Wp@e0A7>kGt0Gl$?FZg&x+SS`3zP* zn_7#@d3omToxi_zCvW)UF^|*Pp9@5tIEu4>_NG-l>)RSugQg^wPk1fE^Xwmg zVJphtS2=62nwZb8D1U!=*A%=i|G@gErughHotu&Wf#$oa^KJL|ztgTagIzY(`R}%* z{C8WF)Bn}UqcU;X&X>Kv{JqKUFh%2{tJ+-$gg7 z55^HZ)@FO5wisvIG44BliO{>h_Hi!a-m>i+UFPu{k0E(X&+`hWPL|MIjAuXK*$kMH zRFwwA34#xUe|i(<1$b=4V^toL@R+?X^68CyCZdlDHv+yB{2f|8&|BT$GktlJ$@I>T zhYJt?u=!Kk_4x<%^)DyI9!VYta)19F@QV300k}_r&&HU14rnPZy@0=TALoH|GvIxW z!?8c%*q+tzi!mRMb-p-M=o;_wn3wnT^4OopyKH~Mwl_QvlLi>v1kdz+>E$(R*3g#| zo)TkQ9?PuVm*+~K`|P|Qo#z;MEXi{%e8(csaqzev<$b84^!M%EE%xW|enIXde=#YZ zmYgWl-C6S{IKksq9yg1!2Q6IoyHHjYrJuiNm*@xcn27Jn5cg-j)R|Vi^$aZ=_Y5t0 z<0)G78v2NH$Iz5tFYC&FFRmS^D1Gi{Zg|fi`j|ZC;r+8b?&I+e_py1*$Kwp%C%b0d z5?$Hje&#*i=|{d1MV>x&dMX{-I9K$gd0fTg2_E0_etRBE@VJu4Q@jr}=aUJpvU!K4 zhv*gW^m&hix1Ts;0v+7CoDQs+M)|8I(#}ucpuELnY3rgfwD5&!kq3{LzFC|}OHLE5 z}wuRQb6(jE7 z(ByVi=Cf4z{chrZf1va=Uif>)#>Nug`L=!2I@-H+4eiCCtt-|20LaU6O=&uT&*& zWx{SBdB->{{n~s|$vqO4G?1teJ|jrVKxTnGhpC*0ev`N##>+#&@LE4C`WC!ggTs7x z4CcBsu~rbsh92cC=Ax;rL$lG>6+OZr3Nyd;2iQXU}ap z_pzP>J>0*79c>09;VVAd3GtjRx7#z`mXA()*STZ-w2W^ZMOV)7x&rsp&=>R(x zhJo9|VY`3$*nZ&D435jr2mUU8d^N7;H<%+Y;r;>JD{|jJj&1uf$5CRt<9p_>{dPXz zbvW*SD&cu7p2M1IYyXYb7I??^Dgze|{d;%bnEX_i9f|!r?a;zhl)o0v-_uuoe_?&W 
zJ>yqKulNe1(eLY9WnfzT>O2WE=4_qIJ_8BqJl4;ln*4gD7aJrg7mNo<4W%hPca!tNb<(_s-Opf`iq!hLUuP)C5CC{Pw~d-+DxQwijO+cmAq(4Z*`8r010E zl1Z{iNl48knI#!1TckwEir`1;ahNdIzep)gjFtOIe+Il0!lXzx$%yd7fQ=DvvdAO` z?@lQh-!0%tES`yYPeTYhLIfixtCWgw34l0}D~Ds(61)p?;n5jk9a6l&&!!9KnLfu0 z1DD_E!1B(#HJO z%65mzYH1r1-ZU&EShmDl6HJz*wjuxS z+ch*QB-r6JS`v(At3_@bGE8=av~Sm_jnUzdQ{v3Sf)U^1Xd9Ahx3qS|C(9{DM`(&E z-fneR6P=;))|A#pM@rM7;UU2(MvEy?b~yWaX4bAzaBv%^J=NjtZb`IOGO>sdj)sU1 zIX=~Hat>2o0kF$MQjvw65MwtDHJRlk*`a;&{MscQ5uKp8r#w_P2b=kC+YqCpyJe_# zuxt+rPBnFikCz>wE753n$RW+NRN6H2n?;*up6Ik`=Aw$@wgD$>MyDyxWCp$!)2R#2 z>{AeEW3<`Krg$V|wYX4nq}ps&JBaQb8{9piZO999^RV#n$QF-=hPQ}p85$W8-y+o5 zB2f+v3yVrXgBp<#kr?)(2TkP?h-eWW5f#_U7}_GC6%t5{O9+jUnfG=84UtE`q%_CZcwrUX<7aA3j5SG|7AuPf8Xy62*$N3860qF%W~vJI+K zQjF1=+#$_qm$gKbolaZpX3f;f)YMg(ngXR}vAw!CL+Q3`)?c4wNIQs|+@_f>MO9{+ z`N+OJ#R;-awj`kLtF@_(25@#ztGY|zw7TCN!JRD5&S5sF24oY_JS5m+OuTt(nv!C*gm$u8oOY|(5hAKES7UGI(nc+thVj3UV4K~_wb5$tV2@8WIpug~svUX! zFTSM4p+5AGhxJ9pg#v5Y0%FoKGCVReK0Yx#Au%M_Y_ue$8k3aZYCUe#%qMYfFf#!x zauLody3W;cppFA|9H`^K@0kNxk|Z{lW(~^9$jZdEc}udSB?@fN{EYcy=V#8JIv)(k zkuZvqq{6I{EGfHjc5rrBc64@3_MmKAc1Ct)c2@S1?40Z!*@f99+0v}Wvw~-Z&5E8C zGi%T++pLUP-4?_wNMDe-VBUfy3y?^iyQw(9a}GQ3rvaXrMu6jGRvV^D+>i$Wxs25< zH-u*_2I<1|0VGS84$n+%I3O{)bne9b)DB&&zW|b>gR}*tn=Y@X0O8dHcgo`cDb}St7m$X!SnB~9 zpbL`^$lozT=T4^qzNhO_jsRqz4xjdbMC%|u0Qvhpp6QGLWFx#g<#+Hl7O!r|3_#L# z7%m2+PzU*fW9jnR0!U+BI^P3QrVGQIIH05AKBStdOZgGJ#puHH2ZWs)!y$m!ba}lA zNC~_<4A-P&T`6w`Bm)?@!xRDXur8fahQPbS+zAR2bh)<%WUMZo-hedL#YzDr1Kyp^ zaJ;b=a6`CebDeiX)&jyUmm5+7$XHzo{tX|S0O}4C!4P;iBpN?6br|*oWS$PgctG~) z!n^`Vx-R9n09mfXXEh)Lbm?pYq^C|+F9K4g3)28I*!ueQCqROAd4&M7QU~b(NE2N; z@qkFWFhc+tq04<7Ai=s=QvunigDeJww-5nQRRbIXBup3P3?RumNCV80XX$ePZ$QFy zkbeRarIR`%AcJ&#P6wn+2bl^;jt;Vv)6wOg3rJ&K%I5)z)`e-*2&JRL=f43d)It6M z$Vy$TC=R2Ou_QpUba_n%BpB>+b8I#sQ*|k?0;E`%axNenbzzDDiPoi43djH*q#kx% zb<@SV4-l>i?%W>)q@gZWI3QWNG93iSR2^5-0ZG?E-U6gpm+~|~zH!mM1#bp}DaB@Z zOm@aP6S`p*)hw$ZV;rXJ@Y`&{bOry#yrV@nM>IDW4JMvKQN!WI7eNEN0oFzG21*^DPdmS?=Vtn-v7?k|9|(Cb)Ap+DwKN 
zhXKh&CC`rr|QsN{|h*n_xAe&azo{%AGL# zi|J5vET)7oC8~Y{Ze20?+rjCyo8nTPvO{Bz!67@5_E4mpV6YkOU^7^2NEQVHzG#KT zvDIm6o|cXU9$FIJ9V)q2dso@W^H4E%*@1%42$b!1i&cH;WJeHkg zfM0_RiO5rp)+^QQG<6*2l>1uygZfU%Mh!QEp_9R3ljBW^rg$&34F-oZ!DKaHmJriy zhM|clM~gGjO;k^#!`a1dw`yr6+GQ<6M6x8Bl0b~YZj{eZM0ClfmxQEX%F-nsI%&mJ zp()9ta;UHMacf-fxaUzeDi(&cL@Q<)-L!k-oTeb{3eewZa(1=aV=-%LmX)Nsq*oxn z9Qq8}oY#i!GW>8ZC)T&Nqt3!6xUw?dYK;A4#u1qz~1gvd3UaDxOL8@IvBcqlD zC=2h}s%5H)a;(#6cc$7{j9CD?cj$}ClwdHZSsA=#FGWs4HCMlaAc(U|VoZ*-c%JIj zzRJ#_ajB57fCOTlg_k^BT+uS-s%VkZ42p6Hm;^rBJmTxay7i`-vReB+>aSMwKo0&R zn7;z#I_MaN^`jJ5b~=7`u|Qq;);UPnL+|#5!iE-i)%^;1ghDWy%V}H)KBS`;CcZI! ztnOM1L) zXOWw26y0zCmO zVm}iUrqPT>j2D`KzQ4lvt^hi$)Kml(8Vj_pocf*^&#j93qNo5)GC+=cDRt%N4?&&Y zbcI%Hm?b{NEeb|13X?U?keF)GdSFqn@nojT0;3$so#J8(@gdW!`Gjn{blfT zXRP%EpF3mnoViWjhQ@_AHO`#xt(&=E!NQRD#@XJ+@B;7L1z!KzJIPL6m{~YavENv0#J=)n>$7f}Y)S^7x;h_;tD<~vhpej|7;>5EYyyY=R z<3T_5fe$_Da4P zpDMCPH97^mzCy+y_@2)9dwerKRSff}2B)0rBYo8d^8K%jLPV>i++wF(q2RQeuatX) zZ^oxe%H7!1BuRhB=ch+vU;H!+esdjoiq_CvJBf0C>RYx-;DzQXctR4*OF8N9MtSJ{ zRH^E`>E2jvAs|K-;QmluOFok z7iQBmeR!2-MBXnk;yF|PsA=U}I%Qxq9i(( zOotXxElH-C|KQ&>Ew05{LzC7CovkyL^AlYrfQz03R{SKhz?$bg-Q#dm z6;CH^U9Pmgb}4E{AKpw{=x5OL5%eRN$oc&nq$G+SRYCqA2;!RQFCI;a4fMz#&ovVA zNSfDr5HPXU#W{4)*x*3EULa?TP<@MU@|3HunnWw!wPiFn`z9j}9UF3u_^>x06_Sp* z>|KZ#@0KTx_=QhVB|b26$fHNMQzkx;u0?&s zI&P z(!4=)#0Y>{59$1Td@Caq&&eYpo_{jGK~r$emI8NTaN_y)YjAJQyZpws=+WQO!+6m) zdD-&$Dpa#}IsgM|GJ2$6WA_yI~A=8k~*Y|!ion$H>L(&8%X zn@fZ#UJ01{g619rp&_-*==2vWOKX_(ke*5O5g|B=O)}z>iw)!4A>rfcRjoTr*8X9F z_>{GC3N#H7vVOs{QMwUk-b|gK87+Q*9K(Fkd%?>>i~jnd2FiXC>xi}9Xgn3Iba!&t5GxX_`QL6F|Bl);E2Wd35& zyYV7jxDxgs6GUq*mTFZ&G9i_}7$OBB%3ZfmH!hD1LREG77RmEU3TiRe`ZY8|Ca$6- zgS4r}xt_2nyg3`JD^Hj4QM%251q}1~WYK{SDCY=oz}yLap+uQ*d%%PaZ9*5}FG4Q? 
zRQNR&%nfLz!R9M~HLksWL$aug#K@t6p#rlVcD?;4Fm(>y?|zM^LZ1GrgmXV+>E7b< z=Zg0qAQ9ep zqQ#;@5y|x;BOshCic$vj^c72s=Sd-YAKMi^5kB}f5ZBEn$(oA9^#$S9-?`$h;sQao zO!|6-0;TFMNY_1`x{fFb^mSk1N2F!y$G`E=AH^3aB>x#SP`#;U}z((ie(-%D2FSC5HK)BvqBS zc1DYz1)f87#gp)haEG*#EV`vzfiDc07KqGDx9~2F3%@oP=FVhMC8dN|i<3p)AR5_9 zUa+`DM9HF?9C*^JS!rND1sJgA(WxJkX#EuQsXxz}#@wdGclFwPha$;W z`KCCmW_{&UKq@^{^^gER&%?MXJ%R*M)KM5O(skcjq#AV~+TxzD6dLpTd%#+&`kN?f zc!{(`{&lpPeA(<6BNj%#^DoGYkr+0KOh)#&`pCta9=l5==KmK(pJHaBS)T|A68W11 zV@r`^%!9xV#z)*u(5t`AeTHX?bpl<1tug<4bRjbVa6^}w?a7xV^cT-DJHP@>Zf%twn4Jnnv+7sCY*S!%T3@9wkYb}j6S)EdaPy$d%y#QC z!OlEv-A?`P%b?djn?f%diN6#pFk6H`!QwKMrBaQKsuWe>sLW8YH(8XRj;QNIex3CA zz!8DD)9PTZVI&&PKu>@nD0_G;wp&M~H z*22-taRR>v-sIQ&I`NC~Njkv!G2t=&QfRh6`65Ifq3}8EOa3j=iZixJhQ~YtdZVdq zT@8B4qBoI5eGRIptv+((Og$>o$@N}9THmB))$Aye^fg8neaNbhZ3&GC#?L#Grn-w# z{-C~QVwMro@ zgxsS)CSx5?gt^vGGg)@;AW!aGUiiMpGtoG0sz2AulIJjGhk5PHjPLIMMLG$LY5))6QT+A^w?0Cte1kZr|5Q#@-B7>W8OsTmI*A!?tCAzy-w{7KIpR-l_zM-L^t(HM8^MBztPEgl z&*`&%knBQ=2T5+CmPLcAT*E9nbQ_Pb!dq$*m{A;kqx^9szi7w(0x`>MiyTUZDzPnI zb92N-xK3|3N;flOYCj})AUVW{bmtnCABR7MB^E#(0ebEA_J)0Gou$0BFMNNZ3N{X! zt(Qecw`<{^?qB!KkLRp^L3l1SaD%cNojL!-Gkl7X0C-E}$biuCOi2d0^KkpG=(G$F zDg1Ljz7NK%$O;YYR7-x5BUzy#kt3eaL$R1`g;a6#v?vJ zh?)f*S!IxZe&7N~&97B&F9ZvPZ+UCq8L`4=v3_ExxJo~9X>qMzKha-2BT_j~zwzrk6s|{7tnrrD-gkePAq1!)I$G%157eS0%ot|`apCYA(M6)I zN-RG*o=(*OkG@zhV&aWB^6PZ|Y{@lWl?gSOC1hG}k%as>i)FP&-GzIhFYREGSDm8t zT+S<`;8q)v)}aiZV=seo^!kRzAx6A5H-2Sqz|+N}e*5Y)5^3w#O{&=!zJ*$z3XcMD_;}cSjS(R3Pz&=BU z4!NUh{Av~`^M_J|8T``qme5UxWcIGw?;SD1(ued6H;cS4# z4uqH~&CX>(#y8z}o$q?z4cBex%o$IX$g|Zp7=-RM64}jsH|Zo9k=;Gk5eUajr%Alg z_^_oCly2t3bd<7mXhv|4~S35^~b-PUE}G|AKz9J&#mzsj2!Xm(evnqvx*%#Ql-ai9s}_~ zU%+9oaFO|Ua%n(dOT+tcaeQmVU343^ZlYm2yE;R|#dDt8s%-qKD`c$WV}`tr+1-_v z4;bS?3c%a`xW-C&C?83ImyO&S(@wsOQKz|*i!WSEp{3jHJjGUi>C^Gmu`S_zKVu*Z z=+Uo$oa(U+J)nLBvl~pzKa_!s=of#yE*q7Lt{pcZ`|JscystxJ>ulAPY}J+bB+)wE z?aK@HMGRc;eAO2p#8~AA{z_R!i>3y@@&$5zWu4?0m7Dd|zoi(1rfJRkT|Xyq2kkK? 
zozC>zrQIX-D7Nvccg+-oD;^&db(lgL>CY4=z!a2PemQK}yfolKy+J{R5UCNlN}%>Y z5XPTH=h#(5W%%q0i6rvRz8ma1s}D(wR-2l%;v3e8OWr zjM=A~smN%RVl|@0%~Y?~3wNMny)xTF|KNP;Cr&Od!%E;BCc9)oc(2gGcH7(zA z!5E~fIsD98HJVa%L(74&*+{u;`*d5vPplrPjFi3~gFLi`o*OfaApCu>5vH;ppOwPq zXFMvE+z1Gjne6JmzqkDq{?Sy8SQQ1;l+o$f&1Bf$Ce85=D33nVtjTWq&^Zcvp+J|6 zWS5$^q~%QkA+aP4RaGw%s8Y#kDO*5#Bu#icqxBE!$Pi^?zbm@BrKJ~h2OoDgsRX7e z$5JJWZhl3mJ}jFDRwhFiuvn41Qx-cRHw|9KRc`Gm;~gQPpV=eyoV+{GM!|xrOSw8juQ1S(H9d2~ITBrr!FYJo;hL zDr+^pfJ~u)njblf$>J5z&{81z&D@|_m5cp+Sr9EUq;brr8+$@-# zhTjqQR6+w^LAbkyl(Pe7Re|;TjZ{hGzeLSs(PF^3Eu_a6ynaGv_lqh23i{xi>QPZx zx0K3adpu`3B}8Z9`LZ)NpC{27YHI}|3-K^d1HvD&Z>vg%-nNPv?(Wy?TjDumoyxJ> zsS?lmI!}mMxE;yz5*Osp&d|B^gINLO5~pjpV6O-Xao(XHCrAzU1~&qWPiM}XZY)`J z)=MziT)58}XKNh!y%?tZYsJxIb|_sApzDB^5&(QK@#DbD=@Apd|TAhO+N_Pb*1P=o=r!HnS);P!`0-`1Lhag=*fCXI_@@YeQKg z+Fa5r75;ctUXTe%WqV`p00ReqYa_czttG+A&EdCjUeoeozF;-n05u|qa&I{SVs1dW zsg3YZVfijY?LIK=!NYO+zHSb9Y+7-fCisS5t6y;&Scd;rAGvLy7J5!!Q(ahoz(`Eb z8JaN|jH_d?W!(dZa}W(OUJp`wplfL>I~rf>Ue${~>|Y5cLWYYVQ$et9>}Z@=pSvrc zrc=Fd#bQC=qiRJOY;)*sJe#UQqjZ}uaRo-C@a}+1pRO;^r#)EN27Q=8R&Hy2B$GZ+ zLN-6jw;c0Oz+iK;OOJZDd1>8AbmX5c*(FlH6$I6(f&ixqf}Zvj7xf+I`W3ed`aO*2 z*n#!ph`d9r5N|MRs4r2g@Mj?s)O%?o{BWS^Yu*s<(U#C-hOhxcYgD#}Er!-vmyG8e z#*CG3PvdVeZf!1`4K~a!r<}uOhVJi@q~125|2nDve~|up(uqK2o8r%SRq?^DHfwlF z9)j^LC{xSf(~NpO<7o1(Q{w?cBn5*_AZpYOeni0^qLU4@Uq0@ ztfAEjU-EK1^ofbd?In(Xb{Wp)X2hd$uz$eV++bxZlcz{K2yGAVp;;Gx(DO?7aiXUb z%?lATaRbAH;a>iy0!6KdB=R3Kv4gGGvbcgR1X$6pNMMJ+8p-PYx>HEAUcFBBdyp#e zTxqgs$u`!H24`!n=)`2vXFLJV-Jf)pl%Fxomt?luAhXrQqBAS3&j2Vovw~SS-DkG6 zPMbjznO^Jn!zH%U?66*=7_(Jpi?X-7nf5uo=ikd+y|J{tWPK=Lb}>~w&`}jW&d2!s zDyvGC6*6gP-s=qyM}CUyeP-Sj@F{x>KC`M=40DP|8Vr2oBM)|H$qDr=b}O4vFJ>RqWUlGyuuh>)EE%emE&mF@K*N33 z{%JB{2+GDYn=*EQ!TdQL08seh__#cDfH^5I*1GI#8Q6ql^(zkwiuYI-{FDa{z2BqwI{CIkQTa_#}dVS_$ z0m=?Z(X2bc;pwY|<(3Wbj4Mu2rWXkcO^`xArBL@yxXGv1N2?|64oTaj>>K{V+Vkj8 zH47HL&Z+{TFEV@SN`LW4AK6`KN%nHL|4e^UzuPn!wPYxgEnB&RI8#D#Gt}01qJT56D(sNqStZ1cNG~{tY#< 
zzO_g`(o|q=lEKF9DuG;u!ZWOTo0kpsnPUsP?-M#;yDl5r{gES|IDR=_yF+*Z4tmL` zyRyL0tJ`E3$!v>lUwoR?0sUAkVjoG(DP(=}EU=C<^eEFYjtc1}s|uPgad-REocR&A zO@Dej(iU9trq(Qde$m%kt)sgcSI!=m>sUPzo1XtpDFG8)6zbS9{MXgmM(VrIk6xq3vP=@E)&*I*T5=WnlZw07Dn2M>=VV+SVy4HssQ9JwyMftphWv;3#ttky2XyVp z^t@Nk8sO|XCTvwdI-o?w|#b=fujuzTHzA1~O!Zd)B?^U@RtMb=kEA9UQtzR*(9 z2m3@HR3hrl5ybk@NypF!70QcD=^v=M`YJGt+??m>{vH)4NySA}?A{b$Cr2GAs*$uC zCGC%l0}`F=e%z&d%7696eR*=c>LtI~88n}fC7D${jFPXiRbTy0iNvoc@Rzn$cIm52 z;0a%+PpNK_bJgUjd2fLp{g8t7k?(pnJ^B{W#MN28wZDD%;fKw(fc|I~l5mDBvPC<| zi@fJCHS=Sd!}gV@sM& z2YiDe&=_GRUu20;sGK;iX_lgpv*MINq}VRTNgdr&erSAL0i-yU#&g3%gh=BG5P4@t zq4d72Kf}V^XE}QJOZ%~ly(54rb0{ER$^hdRdh4s^Koc2mnyGBp6F2c#+J)*`^(ZkC zeWovM|5SM}m2s&h8$us!ZzZC~X7by-cc30yOuj-nFT>tWQh21z^76evh@MT!jM0D} zWLU+VS!Yq+9wqsXUrxG=lXbk)AMjW`l?H`CdTb&wIi0%W3-COLpj<{10$;(|z;daOHAuvf)Yw&&w@33VeB|_tqN~r9pT-3=t5j zJ_&ZPko^ZT&8~y5NVk0OH!I_xl0^~+0O;v%9+ZDP)gK>Qu&b0)eCU-W`JH16kVK}k z@+)@EScgU-#xlw-PI4+s=Qzj-n)Ks>%JB<>btd16L$ZJ#YC!R_hhdFj#WX%CuiKl_ zheq9mVxLm7_0?t`RW{?en&XObc2PLW0lb98)B~Huc#JEq99OdJ4U9+gJKoSnjEAG1 zzgY_-k1H0yZ7vA1@*av=dk53P5inN~OJGfCMF?wtm>M8!+yvk1VwxfDYC)*&A zR$Xm>e74_Rca$??2{9ULb%<9ZEbw&*T8)cr{zLq`OuTv-?n_cr^z)mXNH`!C*nh@m ze`0Z2r1hKPA2uf|efs#$zz%!m_#};mUotBHp+^%48a>cgZKP9%oKmoj(nFUWxbUH>0(hLA7)jHbFjt*weqWSFv+*# zZ8@jaQROMqV}7Tcr?)#IpNxfO4Pbv95T2$FjLL(0ghgV}MZPED(WLJwev;k8M4!}6 zQho;hlT3f$42m~ZP}HEZ%$ZVb@g|cG;f%fB&{j1oYPW_@RD6-H*SHykm(8GZ_ql05 zW>SWK;7+n=1tNvMDF{VA=-UbofHcvW_6^q8WHdsx@Rl!ZlF<*>%D#e#sa0tE5~xi& zL0>fnQnJwMTVKY9@$$9Va;9iOd!X&j?0UVSRjWy4 zm-`}b7KDn7#P|aJ#DUD`~k_oIZdQ9Avf&zua{; z39e3bILXU!vIUfz<{VuT;FQGpEN7HE#(T=_<6Ws-iqIYsq3u(CJmno7m#VaO zkSnyY3Dk5)4Z&rsQe(IJQu-!?7!`%@@>j<|=*Om729*-8Dx!psD#DcF!R=AP1zzE) zQhy~2KEfCxM|$)L+N;K~6*0Me5d34$<0mEm4w>gV0}-z|%2==smUxl`pZUCKsy?C{ z`guxhR~MkN+>wwmFqq><#sXQ|q8`fI{s~RH{qUQ+M~FW*M!93L>yib;1nOd3e!F%#ci5Grt)R2ZyK>{kYVlE!+V*#m?6B761F*UfD#3VC{%KIM8M zZ+gP4OF8T*EPk@XFB$x)_#6Io_zF99GF&a{Og6;R3@Do}{*^;TKS_}R_5c!k^^;SB zwIZb^V`J1%O*vYPS1^y6S2fQrsPI^4P%EpCF1t~bWG}JpZ}b)m`Q*qHY!EaL1R28E 
zGY}QVvG+i0k^_lFSt3laKa*~YsaA5A@+>xxifAppKb7!2PUErJZ!@-4k4~gJ_8ow6 z=!2Alracre!dfxm9Y1`+y5O=$=un7fbDI?TAvhCCDYJ}5{N|g?aT)Rzhn+D$jYNA8 zr7}!XfBVVIol{sGN{N5Fz#qR%befn-Jc&r_DZYbkVl6|**GF#kVl=yi1LN1``6@T~ z^qLJBMzfzE77ip2QOk>d0UOSyZgZwA>XP?!82&?_H(WsnF zE=xuZqtv5f70vBWiB6}OukAggWI;!uZI2@@mnX8%#33jMox}=$ae+@iF}qk;ehBzy z$cv-TyF{PwQTlu{`aI~_%|h=N^?BE^^m&))^DQ-rS$Upd<-6=pi6F;zExQb?z}#25 z!EauhCn*(-oy-zK7{qQC)FkzTB}4q?<_xV6{Y6R3pQrb}j_l&| z0=$`n?Ks$YX7_x>=RsLV7;EN@WgwutW8Xtp`0@F~zzei0_FGa${xu&w=|wF*#h%TO zP#cEcsvNHjaCfVkSK%-8&i8xmd4{XM>|E@6rsp8@eX;4i^ZmW-@!(K)%XzAP2VPu# zD2NF?{zJNTuIiRujOdx|&I~#=fA)0Qr#iwIg$BYFdm%}c9+d-@Qr9cy!M1r)^e_(Y zw9lbJO25mv#O9C-y~F1-6#6Q8Nliw;9TGkEwG6Vy;^eBug$ortX`(C;+32y)A_IHZ zyRxdUx`$EAN8*%sc#)5l-?2Qvv@*To>tya>v{?h_gNJNEUKO$A z<&DIZ9{UmoQv6cs-<^9=WW5l}KVU;74buVTNhb&dS-f9I&%%G{rNN;Cn{q?Pb#TgS zl*}F3#mDeAc)BM#{0==%qeS>*w~P9!Z8Vx5kB)((lz_pXVhpLyJ%xXHRLG?V} z6GYHp=-WGpKr>l(*Pm_;lpY?RIQ_WTc5`dBe)%&i_J}TnxOBhE7uk8ln7jR^`uh5| zL&K|C9d0{vW@&qf?HfO|s%vjMGTay0;4$X5N*SZ=(3z!OW_$RZYV)<~(r0~(jMQJj<~3NSW;hXb2f|ME-2Z&vBC9YoTL83*1$*x(xT z)ySLP$d;+rJCdN5!b(^}$O@hLh4qUKPKneu3Hu*V(%T($^^?A4oVT9E=TRJW(_=yn zoDBN()I<&>edviC9u=AuIXp7-GHqJ7083UNq`#^4l{y8#cim}k27a|xHMxFIT^_~R zyIC?$bv1g>~DW(y> zTQNDON|i|4<>&_0o(PIP7^UITSIVy*Lau!hiM={B-QHP(*J)|^j6!rYC6QVEPJ?=g zzA!ekb9Fc@#m{QpP+$F=KyKy7$H_c}lF}%)yswv`?NaAsy|PRmFn9Q5EAW{>9rMIP z!;QHwIShN`On)h6i4oF5BQt6{_OjjF=0j<^?A>;FxZ?25aQIvCaU(e`9?t6=H&O&k z?KSu-TUm0BPsn3AN3S{LfHgO>bX?k1xlN6d-`u88WMA=BRhX+om}wteB(G-MzmTK*&VEpz^{Bejv3Zp3%=4Vt;<<)xG zoXqtyePs_-otHdB;Y}3;tJJRaE?$O~#j@$nV%ZE4BCwn;FNREI!sV`(>Cp}#uqiP@ zR$F|LL!U35iZ1h2b}g?JJ`(thC{{pqXFv3|-bTi{^^En}U%uIo-v#~O=R5tKBg7at~N>?IBB{KB-hSHR-JSOSCR0fw;U3l zlL@zlmgG&D(biiRiw<{*F(z)6?;YoBuLaKKZ#`7r~E0T9hElW$KKtfv-+ zQ^%RrzR8OB9|Aig{)l=Dm(3IN1;4qW#(d5iGg@rUN3Hnb&&W$t73Hlf-qmJE;*D+f zyi>hWI?cHq6k4dbfz6-AysH`|5t&XBy^E-P>Uq()g^(n1`9*2f>n}x%Ic}}V*zG3_ zB(BRUKM+ieh;x=da%e>8Y<&%HB^=OWKO)yiOv;jy7V)6@VZeM{W>1GkgkO~-d*%J* z*L>A|KW?Aw!Y-8tjne(bwj(G)>mSe@*n)047lRO*6Od}5;-%H1TXrRCcLx%US^ApI 
zyqS27e&-sR*4Jzg)D_+n^z2FQ5j`H+09K&_K{qbws%Pd8r5}H>{cWEmCMB&EG#+<) zZmifSly|o0oulI}|DhBD=BMv8_ zU$cRNyR+^agp8l>Nqm~j8D2}oDx*F+_7>_ena|dv^NES$94kRx_|(hdyx12i@~P`G zR(f)onPoR8{61E}2PzC`p?i_TcPnP`9M16&TU_wb zA(E$f#m7*o{Usi=)LuF-V!UEM3y$a}P7>5037c{sf~|fbp8ho6!5YjNQl<%h{b?>e zc$F1M@1=>;Px7tUP20;kp0~?4c(dPZJGj$7xUJ0|?jO9<_t$@Ln%r0VjIXVmw-PF7 zzCN~V$-um3hH7!W97WKh&q}H~957gBQ_<%bi24=pB`;BZ4w_r6T9(jlneU-&WVl53 z;YV?;W@Hcse8m%-VfFl-^4C({>B^LJSMH`OyUvk%Rf=oGPFc6Jm80f+{Y%G=ReB?~ z%uc4s)C+s@u}XIc>}{!mUx`x_#0XTV;}L3znNP_`&Y)MCRq;)pK<;|6oikF0N@{bh zg=F~S9I23_B^B%&=j&@G7hj*!!KiIf%WBkxVIH6z^-Aw2wCp>$fnOHP;0bp3q&%8 zFfE{W{D%}g=74fKXogkxDpZWqn8Ht_CcC!-e+Dxn&N$S$Di}6U+9~>ws6nN(e#6)h z%L*OGFRE=jIcAo^pWcPPQiswzGH{9@#XvicB7Swc2bOyp{DMMtynPd&uEdt`^XGmisEV z=_}=2foM+e>;7Ud{O^?)vPxvizw(b{vCW1+OX(3b!Q%WORg-(925;uam&^Kqe=swx zr|6p;AN0CXe1OVGu{K&?y+`2aPq(l0;B5F=+TLM#wj$9-FOdnkq9*5?tXtW8upgsm zLP&*w7EmtB@Bo+mrMQ{UJ~LHW?2~=1daBm!7f}ULIoq{{ zk#^V2+xy5_nPorhVnP8lB5&2F3v^MyAzPy88#9=?ZLxm$3ivFhhNDVtKN7^9RrYLY z4ion4@;)Z)4MD$xyog<TPNs$R_OCtX6sMvMYu5;@q4cTGZFExqx=kj*UG^4og6! zKM>Z_jr(apBy)*UMhMWs>BUR!S#Y$WKkZn4+d~I)d@QG+_t2GIwupjvB8> zdn#ucIc$70YiZzU(%$P?{8z`^tSK+kJui^^(jZf7sj;_513}eyjI#Mg+8|?~69BZn zy%SI`QjDoVs->m9Ep@*{_Tyw{9IE{%QT0Ak`L4*})0TPFvR5B>q)PL&vUHQEJ*9H% z(oY!w8ZSz0D2s&IYMQyheZSNzFR+>~Gu1k6Sy+K7Kj8R()bEh`GWq(LSy7$NOudL0EZVv@HN91AoI-}*@^#J|x6J<$K&T6hKh;9#mWN;96fMLqHZFi>2F~BNL z*L6QAA<5mZWk7PtxH%p;x?>NslQtt^93|YUCMNnzRoS))N+27BbKp_;_080|zw*|2 zBlqPz%i=)gcMA2pWUZL>A0u(iP$MySX!-VF-AimdWL=gRJtW|HFFvR`p2f`_=(~ga zQ1m0Y8N$ce@auhMC-XyIgNO5ZIUqeB^u;+TQxJ%>aXUt9Hmly9K~E32G|09Jo75jd z*)Dm%T&{+xI%>4pUUu?=l`k}PRL75F6N-MW&1-u`DXwa}>7%LlypcWZHT=D4mr>Ud55~9#F8noS9@%~zRj`J-m!0{)Fy#zsKt7R(uOK}BZ zjrnYKd@w{|>xg|d&QG&cjHM`v?0&&#W*4i4<3Vg@=}`@R?l+%_yp_$dQ*s9QuTRUr?Sm1&eFZ zdME35ULc*1=ihZ5Cbq6iF#C{qO|!QFd6RzU4}l7aRvl*l!0+KWbB2ZY!;B3nXFcT! 
z+5(St15OPe$_~%;skxN8J*HWleYv^mIQqcCfUrR3!hk(jjAM!)#b`ft8tOnTUkU)B zo&(^yX_%9XDNTpixjqK04K2?5SA82^Nj=^jRQ%aFt`K;d-Jx8^DyHz#V*M$Mx5Fw9 z^Q3s+)8Q-a6vnYPs|vEt(@cFqijuV{GQdf zPl*O8aqt!BI46>1?c$s8WF(oRuYN-u^c5RECn(1O{p=vqQ~gK6#Vyo`B$Yk>mh9_AUt0VA%i~6`7tzTZ<7v0^tfJ6Rcdxl9deekMbxk zl7Am2-ye4}gz;wysR2|GH}ZHDNFasIQtQHcpp9clxd5DHw(HT*K-P=JpJcCuB&x8% z&W_S_(&Kk2^hNpC$Rj|G=d8RQyM!n8g7Zo$(rPmST@vi%gLXcN>=C(B#o+zNZvD=m z!+jt8*!bgS1@cgVRry|pjXlzIfU-S{BmTsY={(}#+uki+r?cV{){x23B4Af z5YS5iO`rcpV_l?dUG#|lcrCjquDns|BUb3;RdAnvPP?d-E)LiOZF8h;0JlO)^VTNL zMG65}S*Ij}DoYG2Hv&UlL}ApyCl`+`)K|;ohj-$_hzyK6i@FjTShJ+i3i!N_u7+!A zF^T_F`KkQJbSHc>ikD`;&^TvoR71bWdXiS_!Zwp-aXlk;!D9PxE< zCVf};ct=jDuu;A50?#^qaK`<5Kz$Lc0dfL&)CPem=ilKu>G;Jkb@UJp{6V<1iR(Dg;BXrkNyV*^v8LrQdZTBWb={A(7TMFFY#TERQFPS z<%!ZoTc)q%Oq(4E<+{k~(j&5|%gcIf+qw+L@vv2^cnLjYILo9a z&x>VP3Dxkz7;wk*V*EIv&ClDIc(7@#|GeE*7}0H^!T!ouc-;UpBE&POF~@_x^K7Om z!P>op6a_HDx##&?r-8r{($jdz?);(WQ8cKLd&N`Ry zauN;HdN_2j?08w6YoY*6qf448gEsc*Kj&goJ$Ad&J-j8awh-lZ$Li=V?k6VVs3a*g zf+p?U2NZk^^o4IVavx6S4T}>ekIRerlwKR2yf0fDTB5Z z*QQOs^BFj{UcWkjv?MD%IF3Q2V@f6U$`fr*rOB01sFo6gV5L#Gl>5x{ADv91s5Vhu zZ4@n|s9A}`5XU>zSD#0m1N+JjO)Gl(3e}jyb9!VO$u6C_bx6@IAH_}GV4A|MvOHFy zNU6@-_u7)!3N`TXvs%vG!ISX0Siw$-i+hm4_q{w1{5j%02-6ES9|R3o4rFqTF+}XM zoHay|IuQhXkuv2mDPu33tW5g>{qhboElUWz^#t1j%CM)bJhp%a7t;&rVVQjbV~8FU zx#qdSJ`Kwuc8pGg?i|O5sE}UxM9n(1@_UXT2d63qbL3?flJ5!%C<}e2(&GkcqgGZ- zrpp$OxgYMFl_gtjbCmZ4pO(orQCw&jl0`=@ysD8`b`dza?v3|$e^uN}cl94aR{KF> z9K^F0qmNW|nK-O`1U~T}51{aI2k;&-{v^~SmU(K-&qP?8mln^^^w^JOmuTfe#R-*; z&!I=(AQ6@qF+>+@)+!*U9@vqTHxlJk{SA=ZY9x;<=X$d{tQ%BPr|U`9nEy!Brp&%t zUKCezg4&nim8G3NbE7{or_A~dq!P-ZQ;sn0whqg)+0F}mGWw^XqLmb)o~>nW(Wyd4 zs~Ix|QOh81?cjDjR*ha@^@x+*$0H|t>;qP%s=3@w{MQj zkd(#2xI?rhMj3YxFj7eMrkflbjrQ2DVQU4!kSOikk2qX|qH3+?o z*+xVI^H?24-PSppynzwoJvLvVn4Ej7^(QB<80w4cA8y~ERFG(NhSKS@=s3sNf5uVa z{tC>V#?Bb6JGPin z8jGXZ?{TLi!dq6p<>1TKSX$s5PD)n5qC({5IEZhy+kZqBsNYOA`6%^TDn6BL+|8h{ zV;w$c^FsN*DLh_t#BYvNO4Gw}meM2{4{l}`#1!wbK@BT_<;wPz)FQ3cWo_#p;3&d2 
zqpuIi-hRCpAE`wD!BZ;A2gGLBav~&g_2-d2pXKeg{|sirg|-unEp&Nh0fKZKcOu}-Brtht2_xk+D)PW;|kPy%xC0k z-WSAmP;Ldz#EWv_e^vK{ls@yVyiF`T#$D-vRd9-QSj)&! z%2zQ$GkZY1fyaFo+z)5%f%AD^_xEz`0|?YI9yM~&J!q(#FAeh}w6U1pvS1t>98+5& zBw~Q4`XIWnjhCT!jjR1AvgpvSDx2&|tcSkpVsu>uj_^1$8KSjPu|ZU%T}-CKUj}z< z$;4#>%2p{It|{hnC2rigq9BmCydap!j^$Ftx*MOu<@>|d@9^9mKLMO-?Q7Js0rDry z2DKtK1&}?aHv=(qb|OQP*M3ODS(hNCW{K*jyPqT|EEeHi)uIG@TFy`XYK10WLf>CS2cbjM>f2&c{@Jw7 z{{H*Qq3YxE*n^OH)JLnMK9+qbL>|>H<0a_p-}^TCdTbWI&3o|PuO(tnr4)#%nOFBZHf4eiZ}sasE`hQ#mmF?SM1uM7fkzY#fC%pU` z=7+x0qZvjoBia(r;ypJGVzYnPp1otYd6-ii=6GJs9DCG()cAU!bQd?x-N*YbOoCcN z=VA`CyGNw`@{fX*FX@p9&|Uo7U-Q&>#I2$kr4}df*wpf=4W>~Pa(j}pH`S(ZsEAke zIP%z1>wYi;JMmZRu0Tcm*sRTK;ktgdVo&YPyb)ZGiL`skXd2?)==#1q4E$JsLi4n9 zw^=`e_j^?5m$}jLZT4Kv=hbXr3pAsjVikgX9FhG}_gtr%!`zvZpcE5|XHkQe%` z!Szw956(a+yYIr|+WS8>XS zT;JRLHg417QrWjzHMN(L*N0uJAkX_1OzuOMMwa9bUdGvnjTQF2ctR<9iRCSUSW(v? zLDr(7OBskUGx^Oj>hM#9Xi$lN=oaE7K<9o;Xmtez9O{qwFp?U7IsZ}Sk5A)_!1nk&toIsq9O02-`@NjJNbOgu{wj7Gi}`?C-@27LfE~p(TahuR ze^-cT7ClF5sE&=ByS{~7PaUeRP>pOkfBGE)I$g?!*%3snJ;xpU zMdQ|$P}F)8cSKrv8gLf^E;8nEpwmiiZuq_wBmae@WYJZ(Q&cf>G$n6|TINYbj}1cx zpGKecIqx96Qn#cqSodwH_*{U?j4D4`s{2h|C3`+t_Yp^}_@`@}P)JN4!^K9|=jlId z^9`7oYuzbbS8to?(d*$-Bhi3S;8a549Y*C#O*2s%oh*`)aj=U^_}K`|;(9sM=;=mf zSJPOz>mYP?rxc*759Qg_bl$N!rNu23Y0d4dk~uvSfv(2)f{B`;fyBslfl6DCY#_BJ zQBT77Q)T2i%swG8`VsY?#CWS757VdrU{kQPr<(uS7_m-u(CO&T7keS52PjRqR>x13 zQ#8zELfzsFGU!y;er&XQ8Jy!Maj)1e_xQ~QUWtU2;bJ*&BV24$9%}IoB78%+;gF%>YKSiAXR93W*Aa)=;%zmAOj7}NxQQJNOQvF)?acbGO&hZ^DMl`XO^lB0bpC3ItC%xQ~Z&G4V$>&u8G zi#9HY@qX1yFIihk6rFvMTE{i+jThc-tpl?AZvbLqH!`rRZz=x(qfGbRyI@-scKfUg z6?Ov+j^KdctA@iSN+CK}{R)jqzlSYQ{SFO}j9GCDm^uG&1neH+DOq$j*x3VV)2Z~R zI;GelAiwIY0)9t)dh^1U=1VZX&~Vtg`;T1W`xI&NDgVH|?5$azdjByQiqXrlHuI%; zfJ2Ng#cwWb`$rCMzY`)pi(z$&Np)c?Qw3{Xa5@ldWynU)wo z(6~2{m>dHIW=`9T(!-HAvqK{SiR#mPg(7rrCf$1XHw7uUJX3zc?{Q;IV)PpBX2Z8> z;tJoqJGk*-xE{TqvcC0RetD6fe-~G}KND#0DE7wU1O4Vc;eBE09lJSGI2b)%bu@m} z(H-vvE1z$CBPg4;6HZnCSODRR?B)>CmUT2C=Z?5e(5F|wZ%!VR?K9`2g?1`_3aT-e z*U;3JptW`bG13L{puwM@x 
z;{82q2rfPgw(zEN_rJNgK*leasH5qLD8IQwnO2o=NAm;{qoYRMc=ATB$iO8tnr?4w z+&^HbKhC(w@x51^<9p*wzlXaaH>h!S&hZ7&RJQT@4*nNDbzvGLG2L65hhk z2FN=s-pJ|7%6~LX$!xQ?lNqUMOaJszX+Dm-juvY-2heo-_~7Bj*9;Dv8Knn}$h${* zN4-LHbeO|~%|fMkEvnLD%lJJHUc$a=^NK=YI9FlF(Lu*Az~$M-s-r`}&}+DAg+luN zWzjFM80N>UE!lVC0yyt?;wA(WS3u&^Z|AymsWCYU)`}#!{n#E!9v_|tymYP1GA-&7d~)H1(!)Ef zbRj`#5dS>l5VNYpx@k z1a>mmPsjkE`pcWfAN%zuxR{F7f}c|x;@S@}2n)GFhZ^wHFcQSySPypHvYcMA*j(E^o^p@MA zjJhs+BH+Z|b3v@FlD@5b9+KGa11+Ubkw5+M<5uSK-|$A@#$;aQpxPr-mY*d{3WY!F zH=lL>O-&hw4lW3?Tlx&`KveawgkE+3ov+t=U1>o4U--Ds4A`k(C2vh9+ktgFf6g&@Twu!c39XDf>z z{@eK=%lSVZfE#GZeWCJi$;<&FV-5U6;73SdEj;BQ@qf+Q@w;01cj}J4OpdrwJ^$Yg zd*S^8rW{+|td+k1GC)87AE;r_7G{{J-mlPcef|K0tbuW{jhS$L+Kp9-`3)!!59 zJHGAeng3_!hQDybyWQ}1H*9pn8{P0qHyrDR=eyyVZg`R#X1U>h)-Pn-_qgG9H*9x9 zS>I56;f8m+;q7kN=!Q4C;gxPU)(y{h!!zCRBsa`*!~KnJ`);^hg(81dMr6!cHzH)A zfe1p?wHufyKLOR`oyF0zBUzH%uGaT4PD6~fA3gQEH{{E|4*75<-q$mT5^F}PY|d+> zNa>s%M3@GSb`G!Nm*lib;rmmHzzyw-Z|}3F3KswV{QrmobGB+#o{avY@6{)jB<4Hw zBbk}cRXl8XI+<*}#5p)7d8|UEKf6Ef3%@v><3xHAee<bqk>3zy8cCZ?s#mMl zuH^4>zBgzAZK8IS=F@`2{5%`PClU)%>i;VI5wzme75o}$djXK^v^n57k38wgOzlQ( zAvq!9A!353l=Es|b9?tS7sh4e-bngfa_fmVYV%b;=MtCl-=W@YQkSR_H<40LzO*F0 z)wF+Aze&+2^d+@5=p6!%wC*MBOY0O4eOr`Xr|PBp_&=}dv?o1k{NHM^PSGPot%da6 z3$=T>Kp_3Aeomo>GAa(&IAc-=wAqTFaA`AbV{!&Uf z(iv+Rcd6@qE+rOX^c)Ggil;_3&eC>(ugKsea&IN4udMZIDCyb%1kSPQPgOme?qZ{% zodkm`z*fQ~Jj>|F)0fwl%E#hA;kE`~com(FO@WD&5l&mecsrv$im|BI9L?fTMdKfK zAFG7Zu2UvclR2EVKsyhJ!ucZO4hQw>$tfAqlxo?L525*7+7}%z8opQSe6gioc;ghE z94+Y8ZlP8Pj&vZ-r~i(2@Ip(`ZL^hDcI=D5NO7|U*Gb8gWH|ajWF=GL=PaBLL#p3? 
zrwu9t{XnG-zoo4KfnA~uqDmccvE=(>;)n3tcCMu&RyW0-PpLW(Ocrbyv7%p zSlXN2G=KrFt>hg-OL4bv%Mv?!OJI2QjjJu41H;-$>~y>HhIo$ zZ{zHy@Vt*vvhrfl-rpM1yzE%qH)Gh2FXI zXWlgXyr$W8p}7n3-n>`MZ48BH&buN!yK#xC4DR)e!H8Mj8<&J;H+g4De&3M$yqi$szvr8xOaJhkbI$QLg=W^>EPT>5N1B;6 zTL=b~$-BrKn%y{m?t+;-r@Gz;#0c#ibo4p+79z9f^?6FAmn;a+n+F%C$~tqQR9WXq z%1W#Ry&@jYJWbes1mB4>Cu)HO^~7%d2|V^==ler9{%1FA`p@3Hw}}`0%y}L~__AB( zlZ^T)IRCrhSAXijUFU{2;VHCN?Y4iri=V(<=i+dzHa@wx7vEHV;W5%ud8u%o3v(&q z-IRaqK4gygba^H&X*auYrtrL&?>6!Tb{_E*oV{+j%r;YTsSml;2g=V~fKpS#P?kzP zbji$d3+w*-&h3}{dGm~T$@e-g`E1G0YlrxET#|j{w);*#@1D`;Uvudcd3I{_xZ&wl z&U3LF=DJ~_8#=lCHm74MvfHiSH&vc;_@2u5HhK8cPpbV?yVI&v(W=!>xN?INe*2IL zo1>$AlHYYwoeHBb|5$~ypLaQ;vr){2i)BLvIj?>0`f5zKW-yBzU*JF!^JzC_i z{b<6y;JYl}_<3DZx+2{-Y`=J-lIuNKF^WUHNKi7YK{(V2b z>xJC)*Nzys@R3<112-R2HWdK3{Zkg?JoebyAB_L>@Fn(u%PVTmA69nQxaIb1MnB!O z{PoL=!X64|IZbL3PkmBa+p&8izMurW5A?OaW&NJ3zG!EwGU4~n@BchwU0?Wr=m)>m zwdW~(AL{q~tA5W9^n2F&t)DgY%U|5@`49cdH$UAMzlMJOZSMEHvfuN){ht5Ouf67e z&prK~%g*W7{#kvWTl$s1blkX$yd_u8UUcC|@7VBM+=LsKcxxt2n{>XIyKkfTOTFb6 zjw&DJ%KE>{|C>Jl=7)dF%fI!5V`2WS5C5$X|4-^ewOPnd^Y>l5es^E{p2|=8KdG?x z#J=TH=>-MHPEW0)wEnC0loX(UhyODQ@WzKn-ul?hi8jIjFC%9YXMD6@^35au2g3J; zXj&oh9>V-WO)Hl2gy-?~6E7pYknb$wRfIvlbBNaxUcq-B@fn2md>e>25PrlrM7)Ra zXc1TH5ZAayVKv`-h_?_HoT6zzBQ9YB6RC%YOL+eon)U?o2M8bL`#0iUgeRS;X|ECY z65h@CZ7D^Ff=IERhuT!|Al@%0jKCVYc$ zG4b7mW6!4F#A^w6vmEfh*n9JMsJl0Q_%mbQB3X)TZBiIXS;B~At)Be(%@s^}L=xp6B&?zMt29?)P=BbA8To zo$H)A=X|acS(p!y_Xs%W0yIBBVGxV}4c5WX^ifa_pdn!hx(b3Vpq&AdK=6k7BH*qC z2sl7T0DJ%r8gBtQ0pJS|A^`msU^fVl0NoEz3# zhOyHG=mD0;KoA;)T0koRYz3ha(ER{uOo4s@%>ppf4DbM54lo;B9wNXr!0%u)&oTtk z0Pe$ryaBBMP$&S{2B5_NdV?SaXdJ*m90W-LIs)Jt2xkBd1%fdJf;yn_05gJs%>eo> zK(An6Gl0ecqzeV@2Q&*n>`gF6U_QWp5MluhWAAOy?=TH;_72Dc(0G9F?}EMobUDCz z5NZJp-2=J>;R~P@00x850q6*TEg%d7x*y;;2zWpP{|smyghfCD-wa3;1OlLeUj`Ho z0yGTj6A3|cAbc4B@dx0{0|XYp|KIcfeGMQ$6%mvOh*FV^!2S^Q)DXQmwHTEtr89*V zvYd{b-X9Lqfc)YvFiyan4bN#X{+SN{+g^g?bQrhFfXn$6;QSx_oj+nHHMA4KgrpOr zHKlPzXl>P}0?1QSK-57N0MX^(i#gy53hZa*gLnB>3 
z6913(!0m$bQ6%T{s}6tiIOrgbThtJXHx%+ln~7W3NfT9!WqF% zugp%t62vIRU`p>yS5DJN)lY#(Y}Y>tlna*yZq-wXBg7E2R1mEqr5J@N60{KTVV#OZ(FS5VgPdi#|I>pbUg?fq3AoE`+q`duF|Z;)KR`EPap zvp(=UNo2jS)1LskF#iu-u+tv|e8vCZGtxkeNwi`#rqs?*fc}5szm@%`9(T|}JH+Y5=uByyX|$-zDI1afTebu1NbwIHQBXkJ z8Y0(0k*pWEo^abQk@W}DuK$JpWjDX{^=Dl``v0E)|7c)00=Bg6hscD0w19o{Bmd^H z>_9*UR0v4yH=Y#W*#e%zZ#-MT^9DSF-*^dt7Xf&<-*}$@?>*o}{KlICyjs9Z_>Fgf z83A?b3UjJxT! z%Ml~vSu&cF@j4kFkuj5uU&z=(#&I$(laZEwJKx=8{G}7IKjJcf#8t_0gFpDT9G$ErS z8Sj(v2^mw#_?e8~$=FH8VKV-z2c8^9GH%z0m5lsklp&)Y8QsbFfQ-+{m`cVRGJYWA zCo=vj?;APZLB?S+&XAEnMkE;P@Uw%ATx2{z#$Wk~{t=fV$1jo5_K)=cjhk)F{cqfC z+t>fb&9*)KpT>-U&f5kP>jevN~txXlZW+jX_GT zR>0roF9CgUszBzE-v%5(#2G7l;3%U6JidUpmc2H}3HDb~Mx60*wg(*dU-@Yul!3pM z>z{&9HR7z3EAX5Bl9Pi2sEE3qBVb7(VUCBZD{$E&mkk-iJY`2u zJ69)1*fUKJIGvf9+XHFStrTxNkcXw!FLCG^%sp#oZ*|(;-PO+A1Gr3qvTng#bu%}2 zW!Rw&g5uz~hLfd-y%j8b!OQ`!5eepM19v2M4`<*B;sjU}TO3QWvY=>+^T0g>c(4I~ zr$3umL~-7YTn#7Jvp}c7?MrLh{|s8BP`5Jk{Et_l!;~7pcg;=_cy{SH>DyUaDcPF2 zLLrn}?q6^fV3h$y|J6GX3zaskHAgomdn+Y(SNrXnxk9I4{(sez9-Q)D^Lh-Y=sMbN zS%Rgqw}q86To}*@=t1Z;mA1W=l`~XGrSr=v>Mt#SrP8r?1FF%3{j;DADzeuaNQa%N zz@4#QoUMj{Jw3I)nVmbZ0BzuMWpA}@ipt)=L)pRH)!oU<0_<)v7-tN|%WAu;A$|x0 zJkzdWtU>pH^DYSE;rLse6T-Mzxnmryyf6-+_o3YoFiMaPd&gM2T7fsXAlMNQ=Inw1 znAuxdI5~QPnt*b^s~&EkP1`wBwTl%we1P*p~mKbL3X420a)f%SHb0>s?h zE$y6u0)T)c#?#u_)y~n~8lw5n6fbhpcAx$8Epi&hZM*mWnWvlM-^n-#<7MsSY~=_< z?*JN1V4zkHA!`eG5`>^c$lBS%9T*_=0=678OQ;Y6_Tg#nVGZQ%+}xbaEUchNKzUhg zDIH3I+i!2>2)-1+l%s`%GnBDK!OTJkwhA+MxMn3=w6hOPm%;SbTU9XS2$Ose@PM?0 zQ)wVDnBbX6Z`=109MNDKk#ql|9e4A`O*bt_2c5A zmiE9(z?nR#isuo4k?2wDNOKPGZaM2swW_&?=-v*Lfs$^uz9{E;s_ zNx1pAxm!8>Qyd)SgFWbP;g9!ht<0SN@rLDp=Dj_6{6h+y$N#4sJe&MO>MoG^Z{d$R z03-JVBmN(+LJ8pA|0m&(GRSUi|12X9l<|Kz{E_e0u=_pNT9D`eDf~wsU;?oDJriI% z-5{SoAb{VV z?Rv!iM?JRr@EFazwpQZg-`ewJ}s>Le>~ge?*11({P(H; z`E1Ke{R{uxf8oRZ!SY}F74AQSf00-C7kS%%k$3(Vd2b*edmI7jz@1Ni@RA59M($S( z*bwjlKg{uW{B0NlN`;M+{1`j|ao9M1JukrT8f-k|r}i~`X8kJ$zFlS^pf$1I;=pJQ zEB+lf*oTBt5B?p8&Ve{Q2*^*W0TQ|e8zK1-Ge<(;v*BM)tThtK{3{0h7i_@$zv%^k 
z6$uIcRv#*GdsrJf4eqxpZ@s1E<8BLX^r)MGB|5n0v~{CM-_Fs}$;*wOyyOHop6Ka4 z;JvvFJ-7mO#hxrH0`?|u#{5^Dx3c&ZbF*-@b9U#yvW?qZV|x0(3IH!{T_kgpp%)S7 zhaVYo>09@D?7@o1&7GfIEpburUzd*PMUMZUq>ukDZ6Ip+XWry}9JtX#&K@i|;rmSV z^zdylu&eR30+)WQ&EQL8;GUS7BflBgyMb+i1-xVD2bX{BfCPETB+L(v5cs|9z@0KT zZ~_8uR+%~a(3^RJTV3!egeX6_(Zz4Zzfa`2_&zu{_|np^w!?8JM|&UsZB_Cg7Z-sq zMUiuM6Q&2(y1-Q~@bXECt(Tn~K=L0_{D<9b!PPdPFt9!2ciP&Z!*|^*;k#|{xr?Qc zC}Vj_Ank=8< zu)j6yZe|0&ucWD=b^3xK3eM->-?w&ka@Z>W@Atv(&lxBZDASVuuNwR!{-r#65ix## z5fOVkU^v?kC=!lr>HJrZ@Ne6x8<2J;ON%&zrMVkiL0Eb#_bm=R{dsHn3f{I-!Oc3* z`!=BO9k=@2jGt`dK-qs)99&j|^{{RHAOQ>}i)+Br;2U~o^lr9R=HT8Vzdbn366Mzf zqOM+cZdSruHUqdBE2(QDBn=k7rR^cL)3Z+o>Xd4PsD1co()c-~mk! z><%0pIDrj4xE{ED6tQ*C1ltm5p|uk@I|1@G{I*v1&fr#@_tpi=t)AQ-RofRDZ);sp zVWQ`Pe>R!UpF4Bu*Zz|ho-2)9Q>_8eIiE8GTihL$a6r^JrCLG|KXm@gX&vup({VnJ zjP(7-%5<|#boD)o52j}@)$6H6hjemBBYtr3@3C>Vr;Kg2_xy6cI#B(Pn1yTgvptn! zlshUTLM-`r#Yyjv)$jJ9SaS}jwSCW+x!kL_P?O%5zCKgD zQmWso+tWJHH}*_^R<|VIej|E)ePd>J)N`hrF4T-eaCH@n2sImJ)`J*xZ<$e*i-qm} ze@w(M_gjQbcxScS zvPPkIrGsx?O<%Eht=JuL$04?_KV{_+dJzd2UTZ&AEn& zd!OdOeeRR}OHZTwi#64kyD3aTlPNxNJ9h1&ll4I!ASNd&6}&~7#v+t-kkHy8B2)t^ ze6f-qXGYEF@@2C}@Q_7ipvhL8_9CIrxr_J1z&kmAUF2R0(^!rylK?K#)RPmPL(F?q z2m7$TS#!K7pNq=e(*bp1N4yCNy1vs)Pir0_(8mrrSB@U*UhZ#tgn7UNRNlNFXaQP8 zlJJ37C-W_xWuJB1 zG2GMpe!bp)4cZ$*rlzq~<)4!2TCSdWR6o-3PU~>_f-EHYZGtofgpuVQ#%%N<=jgnj z-*hdMom|fWZKc9FbVW9XJukP14S)gqL6I>17s+oWb_Fg5iL-!BWMDT`DQqr9-AHpb zRAqFXefObVdnv{#nvXXxY|t}4q`Qw;+_Ov_dpgttk~_=c{e#;yq@0lzh#a${sqrex@Uj zAemc=dYeA20MpKQKDr>1cS%u${A00I5WSptL8NU1on(#+sy#wwP{x;)$A z)FI+t3MpmN0h^riV^FB!Cuhcfwkpg(!^%;h{Y;QFAOJn2$^;gr%vb|mxadt=ZW(9+ zG29v$bRKPxr8LcXYdZ;@EjuYQ=}S232XZ!a8d-n+d}oQ|;B4ITV>5MUo^oOSkI$w# zXs;1Q8s?=Lo38;C@Zp2Y09r2@SN?H!*9;6>C%+bCol`2)igD4rW7qDn5(M?cSS~i4D-LM zcSqr%j^a~Er~5ozCBBl_<=psz()F3jAx&9%O{dhGayZ0U4nmcUFCNnFS+A(OGhG%Iq zw(sTrpw=P=2p@FF`WaMNB}>1$7cxy1zvt&HbL1WS{n;BwBC-}o;(C|g6%0JNmif^} zeqm4dMrHB(Jman@1fx`n`B~FSX>NRExF02`>Cd{2e7(F!jfVQ!%$yQsIG=Xs+BcD~^fGX43z`R(;j 
z6?d9dvey>PgfZVs8t>k{+uwG@sVZ9J3jRj}*7L$9pBp3YvGzNYQg*&gLQQcg>E~F= z+gSsqa5*nzgKPk+FDABHeLtIH--@YWh%M$*4t^58jLMUPK99&I{3ar~{zlAZUDqGN9`KYp}KrDqn=kHvZMIu?#ZONTMN zrFp9$#U6^(FI`jZzCM}3W9(+192h_2gJ72PWeb;+yuJTn>$zHG^$KPv|FIq0%}|KH z^Fd>M1z%6I2NbMYz9Or0ha)XXu!*sMU0%4pqAoOC?&hJnVYeD*PN+WT?Ysnc)_Z3j zPkZ^Jx#9-7)=cu7s|`b=Po1itrz~3*S)X)fG!1;)coP|8U;1+YK;J51fe@pzV=%Hr z<>HY}Td9Fqp1!hnjqbgUvr zm{7{xx8l+t9PUIMckvri2_O1?FCBj+OP-nYd5Q2+K{(L8=H{ZNOT9?JVu>eC4oaTA zE5co=f?jJ}fgwc)2sHBez!v(_9vZ>9Qvq&#rGytY78UW<8FW#23J0&g*X@YlAm#%-0j#9j z!GdqP@@0F1+xl`UHrQm_Pur#5JzEmWhnBm&LR>BD|3>rtLW!fjJ!AY@OUk8&r%Xo% zP;AytWrm2wz|%a`v24u;pQ1h<#jL6hE%o^lq`T+yZ@Lg(PFXw^NNlGoqZLA;oF-=l z&ybW*3V7KwryEB0Gvmx5@#mA}Il|tVH-e1Myx_gu3W^+RKH)bwiT=pfhjp6p%`(iT z=KHXE3$+h??Rj4zK$R~=Vl-ww2lM@8Q%@1h_(`lXWqQ!*IR*>$6q53 zaC&|8_^R{_%mSv9BUkQ?ec|ZZ!EVu6Y#G9?z-M?^DUfN*zFe8pkks?9~KH(S74Qp~-W}Km5{z=t~H;%PUE9 zWn(hOR{A`+BJs_>{#NZn-mBHE-aprlLwUxokzS<>afG&lGArUnoT8#r*Ye;cNIhWB zDTglx6TV}WW>@Qvu=P`YwZ#agwj1Tqqm0Y;PU=4oz1mZiSSRBthowCgz^)njIv6O^ zSr&a{IGaAiGq3Dm_bb0$tw!>zeyrFA-O)9xGl?k;`;zb!xY^>jZ?rl#$Cu-Wx;B4u z4JUjPhJV07ce>2_|NL0_CW59>ZGgrAA$(o5+dFZ&W9G&~sgt`%H}(fM(`^#&T=W~( z;53bjmBd`CUW@m02f)#ouaAsF+G&lia+lcLo>P?vfAQ{2}58^T7G z!T(5)ByI$c&?IQAM}=0b-0j95ommd+GJn07gF8|&$DXjduat13rcvTn!>bsR&SoiQ zcH|P59u)R$Zq8*oJHWq`Yx-$7=}J}!e{*g$N@$2V>j&Gb_1bB{K+gD9TG#1X>~V!x zsfO1z&MnnNV2fe zuk=IE;%^`h=@)8?4>8YAncNEsNIn)IsBMpu_1z;X=j8B`GVOl&V5647s2WF~ht_Aw zu9*N)zWDC_3QgFZQ6n^iK^IoB(zBXU_qgu7TAodsLqn8-%-r&&oBg@aqV%f9)00AF zv!N!*q%)xZT)*u`EBQ|K&!uHo_pQhoVQ22#k1zDs_T8Km&Chhxd5nnrG~` zg&PCKv2Gg*hZkL6@|`Q%m|ho*=b;=ZF*fiXf2~w)hN~-{cw`d3L8k|Cg@#q;uE}ED z*Pg$S&Zp^rkG*=Y%!>W(=-qEMg*KvgD(!LCd7_foxo4Y)d57oEx0jvhWsc`7+As{l zLJd(qGk`lAX|6 zVdJk|iiyGr6BS}3I|3|OZA6Y9%|)oih@Oy7-*r81Q?P@=;N=(Vpa%v^UbZ<4wTlcj zDf8T&;@Fqym4VsNT-JzlzPt|WlCsAD--S0(6Fey&;^;}fJQL0K!<>>s>$@wge) z4ltX6>2$cRH)V8`6$<6%iKb$@Q64T7iLF$E ztW6^X*QSWAr~TH7%Op&O-pi8~+W9{1*W|esZ2Sh_?6DXb(xWp>VY!Pm;qJFD`ty9c zZAu{LvKU(#w|%3S(Jjmy{3T>QjsPnSl} 
zzS7NFtlv;?3M&HrWJR+|fo(V2ZDk@=J4RWr9!Uk)9v%SRadQD2X@e19QUX| zjjjDWH^gNyVp1r*R0&_*of8Sq>F$&v?|8eAeJLD{!r zeoxkFfb)u9%8#4%JzOxF|MX4SX9gAldy zRI+s=oXtK-`I3fMev|%(9C(Vsdva&Y^$hnD4-k^X8#Q(wLY0d~-<_`QpfKgwYiJHB zjWqV?7H&YS$Dq#drET*R*(M}&|FZ6Wr86_b>DR^u=*uQ09$8>7?yMaOnCn!{j-Wks z7M-}n4KlcJKFFG-aPvq3Mz?%FeJQKm&hRs8S&Y2U^ccGv!m=K9l=dRNY7SjUa1Xad zSH2v>(qBcFK%$hF`@tT^6l}G;=fW^|^RKm=ug;Gcl=?|WiyF(M|4da=QGHK}ntajV z7;&R#qKnhgi1tw}C%rQdjh;L+nl`UXg10YVFS^J7aO7I^(tOO(jnu}Ql$g7uhha_( z4y) zM^zi@cG9%#okK^C6~jr#?v5B6VQ$fZQ2n2$s2jjfk1YyYw63*ixPx>ta0d$~_98y; z-M$2e%vP_2OTy*sGrAsJ(1$Z86)cO|eki`S13RHJcdnRZ4U9LvAk=AMgTP5!md$4z zdpXj~(wkD>*n{0RR>ay2mJ0&B88*>nu6_VXN4S9aVI4&$+RJ)T>iTA*$K9 ztCOeN?JVb^(V*EoA02>%;FtRenZ^8oo3dF z`xwfmNcBOQ5v=?JPPF+OltMeT^8&Qqk5`1*o^Bt_G%o()|Bdl2a<^$OFr<@BOoZ{SB%2S+s1Ik&lVEHC@fThsbi z3SNwSfM@!qOP(u;2dB}BZ-yTVuH5=rCM=;-pTt*sqB)=L&B1{-`J>%C9H{7REH72` z96{(o6xD6@1%pa_Yg&iW!Q@T)nYcmt(Ofe({PDHz+y#Z6X5L#3li_{B?jbgpyM~vN z6uhPM81yIN(oXgl??|JKnSOp|XX)hGl`vas{Koazp_R#0ANfY9^G>a$ZW7g5A357* zPnm?JY~n2Q<7KP#+5C>}Q0Vi?2R!G-aarx?`Q{dd5PDUP7*C zz#w4vlwle_l0UUzKl*KHP7ZFrFqw#=6{UZl}vx{UCuL+?ia+ir|L$cE+0$R zm_^YS+i%<`zd06~jWZJEqeCmKlD<`M(lO)SoS+NH+^bJHZ`0$oFMqwVY`#9Q$!m9C ziuOSR?O<|J$Y!?Gc!E!F_m4L)v&Z_TPY50_jpCw;N z69>JRstokmu;83vMb=-lOwWdoN1|BMq)?v&3)!|j_$uYm7QZYst`b;&w&YHb1A_^! 
zRW}dw6LG6HDB}{Np)8e?j1F>1CD2NeyCm?!BS;^)11ej|=~H7TDLp*lH+-e+Qy^;s#k-+}35dVC-PR`D%RN?!4 z7!^7=b~dIDLq!+x!sMP1#XMLNwtJ*$?xCnvOsTw~cAbv0vk%yIv$rCH z9<=Z!|7m8`tkL|bYBx{5XP@}|eqv>pyDjtd`0`thM2GIvOc;27?5JkNb7cbJjNF>L zM39mBz=a&*{660KdE+K7Y^VU_pv#M692v7x<}W6iPiO{z4~WZKHd!O+H*2y|cuH5v zP^8U%$0v7PxkOpBxflQCnz-df2D*Mu zkRf;n=gqmfqi9Fjv5zG+b3X&H$cIaiULu6DJ*%i0h~`>Ir@UzNQHAR3#Ea+tdCdIB zMg<3G6w~`?g{SxN-e&g`SD%R#Dfv7Pe#rf@rZXSXMAVO1;P|N_82D}_dt7V-N{1hUcHx+|}`Ip%inWMto@ZE1-ye^%eDqRks;e6ipZI#oWyUJ+YYbj~CK!@v&Ce@)BS=yph zigSi%e!wV3}s*F&D^NX%zUB*&WfL4zeC6GGTH{Ta&MP;rIqju{kf!rj z%I_@Xhc~rSzoltZIUD+LQ>kx0x?p!!;q-I4i;dy6m*QeL=ECESzFfBr+#x-o+1BU9 z^XN_6vDWwKm~drc9fg*FC!Rg(bnm5tr3aJqBR>lzdi6!d6NBzxi%!QNzp&R&2;k& zMv~SW#rZ^6`Up0~=AX{bm=C-lx?b1puI5jltqzHgC2|f~=w^d|px3RIUEjz|mh^SK zi>rG@_R$?row;^~0UDMC;YT$gY4SeYFZP@{jXp9L6$J^zte%`W(NU%KQKtug+4v)K z1RE)@pYJqByec*~;&lg&Lx|R+$%8rS@ohekYGO=d{so4sxCO1XB~~!8L|n@(toA0B z_Lw~tyhv!KJSozyxuPvnmEpFo@u~#cH_%?Val|aSPJVvha|Pdv%+lk6Dx~v-%D%1( zD|NgO-KkeZ8J7*+DtVqnZlrC`hQz>1B|5H^Msc7arqWG@kvaz%yZ_Khjom4P-r{w! 
zLzAZ!3C+R*0?yhZWm3JIk#3u}6Zj(BHx3TOh?LH*-}Wm#yf{zE=Q7o2PhC5fk=jpl z3)K;sZ-TL!TmLa4sNHVzW12?%?oyRPJT)t}Gs_d66=L^`mJwZt=4e)UXmAD@0Z;Jj zr8kq~Oza0orc*mq^aqd5XlIsPxR~s zYg-#utnRD$w9@$Hz?ZEwEzx`m;yr@Y#9&3frnRsLe0?-Vf0W@|y@TClKP$6$r_0_v z(-S%*hf^4M?VbNlZ0tBuG_IcSFdMah42t4oV9R;MeV|`xwby4Xjx+6j>hj`4ncWoX z8(zDRG?*sm>K`y@q8O3IBGEZgnzj()j>!v+ub2<35D8!Ud8+flI_f%Nu{d{m_!?TD zhh|(7W#V?0$gpX8xc|V1r2bt;sCM$G?IeDv7^;s!NVdzTtl7Scp*f0^HZyMyo+IpP z&_HTkv48wV7b}q)PN}Lpny$Je)Pt>Cdn#Nu-utjPD~B{ps<-g(JH9fI^6I@iviJO_ zGR|j(Iys~F#wr~e$JOydjVVf=OH~e96!sr;on#}|dYc@M$Je=kt>VPk%aZy_hj?DC z<|MD52}wG-@KyYy%7vBJ5pElKt|))%2M_(vbqGyhPL47Vqq);2%DxXTBG#K=(R-lOEe0j$RB@a*~gayj^`X zE-B!(RL8p1eqyQd)w;VXzsMk;oc4aAood9R8)HSsl)*Of_T^3CTBHazI{dZwq}9+I z>X|XLb6Lh z?w&JXO68q;suqTt&pi6AW-aQw#M0|xCOu4s6yiWe2_eq=zKD=}6sK06l2yNMV6}i{ zAiF}mVS!`Z93fZbfkkzm5t^l8MKb7_c#nz>x8vx^UZ^wU!wE;UGI68`+xo~PLGD}1 z07NBJC5~a&&}cjTd<-4)=1L*uZ1LrZF&3S`wVpe<2V*c+tLMk92HL4M&eRTuea3}S zd_*kdi%^#e)GCg>$!!ql?emp*4arQ8Q&^@izVLTYLmdz)`#ha;D}a#6fyDIXhraQZ zcSnD9vtVMq@HP&^ z(H@_?{JGZH&-PHfw|dCuc~1LNJ1N<&fn8+T_07o#nPs`mv82ScAmKpyorOu^5jE@g zyLeZ8Tk$BAKMk~GbFYlXl&aB7{Wui<^;1ZB%NK3p^Xkn5-9`6HTPiW`>l#tmG+eys zAgYMuT8N5Z!?xQcm~x(?Cmf9L`nKM%B-ZaFkC(Ck@m_{xrA2jtn2%SkeqD?!dMTNT zxULHXm-7c?7Pkh>fXA~t!`c!L4 z>*wSAX-7K!758h#RS6Cop>by>9r^PH)7&wa6T=<5Q4E_#g``P6(U0J}#wnBQnS+}n zk!4PWq&>+qpDla~8&$eFkFK9dblg#Zepn@Vke?yI)P|li2RXh!T|eb8?%};J^z|GVRLt+!Vfi|CugqzEd0GQ} zYKeTM7Q+>;+17Gx6*0$%)6P<%$LUS57q8vgAg1Q&fAZq7YSgqJ-6P+8M-@xwq8Lsv zj4(3b0?l=``u1%ugFFPE2E)^hDpJ>+ShUx z(pigfL!pZHYI-J&(38tqG`x*ccEmWy2C=641W6krVmO2f;LygJ^zPUkd&eSDc9>vp z5uV2t5Y@ly03mQz#qN~$rhE=jp*Vz72%&wP*|;C&_q=yww#wPG=kr1Dy3pe;tDm?& zjG8MA=xH(aU}w$1;qmd?H-o{qO`VqgV{=t5*hJR*3nR2;;}hv`ia+q@J&J z86v*yud(6!Y*aSudn}##0D~Qv3X1ivP6+5NSIHm!?AN@u`ZQs&x9#@v?B%=7Z?Q|4 zg!x2X$m*z`{I(ae@2a*_de{58UTPipBH+-YHS1?ZM`_lBxU?R~1_synsbBV`xt$NmRPa zKUDKY+$F@!Yv}Nx(}>!10k14yOb{m6{mz7;C=tzv2(=de#Ri|4x+DiPV&cP^?)*4q z5J9PHy0(cpjl;3<2e>`VuKsvV z>uVc2Aaj-~*dgBrOW(q4>mQkzq&YVKlr8Y{=ZF0ia}x1)%9W(PwJUr*cL@x66)q;X 
zmSRgzt*4LA`gk8mraXraQ&KYPavFa08ZABHM88&Lv;5d*rOu6D=ezdcqThT^%s|rI z@ki|k@%IsdW>sCI=lQ)Zzw>={dPUqJkbuplBd9=|KII!Hv;xlh&kD!+RAoG681hQH z11w9Yym>FR?CTGbZV8%u^=Ogtowrk&Kwn)-e{B3VC@MS{vq<@=KYo0s<`On2Ve-2l z6dY|L_HGgR;Q0EdkW(n5Q=#D{lkB4jk8_z*`?%>J3bG{*f1a)q_gLT9ciCIXAfhrI z5#svUeDC<1`bB-on4IjzKj1i_bd@S-^iXS*a+t3E_3U-??m=Phg$T_n`{%F$iISW4-8qQ?BqRmhp| z@slUwyqNq0Y|EWrB!tshH0Bc>UfEUwyXnfxBl#q=`{^)vArom7c>IukH6hn(@VC9} z^dw|R%;mRHkeIjhk<%BHba;!_$5}$2zl)u3lX7nX z@o{4EzS>ObVPdaDgeEG=8rakhK6!iDQumj-l4yxXsYJI45ARh1{pkaXFL$$+5HEwz zC)lbr(596)l|j!i!SSbDdaTsqmyC85MS1QTd)1AaWHnOg z{vOtI-RQNL+TDqoZvSF`qjmMyse)@J(^5h~;$r;px{oP))qiB`UNh3u8 z=lx+Q&P}Z>v`cdQtWRWdmg~CRWfZO@fBkR~>i#PC5?4@pO@M!mRUHnaMRh-52Yz`X zK!@WfbE=4c+cjm&nD{q*INJyJqj!KGmgxA>c8$TE8D;h($CX!4)GG2x4C>z26Cify z;RfH6l$7U!Ss_kUkoHubL9`a)NMp2UFq%p-EWb#Ux6&FX~+ob0gSbeU-(O z_$ax~oPzdlc3HY(diCkv(HY~!j><6ezau24(#jazy}@#AeZ(|AQzg*!aHIEi%u8Wsk>FfT6UC80AWHJg23 z$&ybWq8xL4Ccb9#6|YY{i0(V^A@2Qpj{-95&U{MYx5CqwRp{Jht8*uMQFnts#f zyXLxKw<@a37qGK1p3lEMwLPLh_`~YRwTEE!ffBww&GAsb@wF>Hxi!Aza;QRlJnwk( z*Zzk!D+(V{Rq{x7UuDc2^1h0erNs)`9!5UP*xaGEhadmm>xIKUPcm8^8vc!D%2 z0J?6}db0zEpRbchWL5ser{y*O%D-nH?`%i1LG9vjdxg*JA+u@yU<`-1&lkQ^1@+zX zV8OB)d!$j2{@t@K&Km)pHrE@eS8Xx*)d}Ov6u7iUvXYNFS=daO6J}ou;qud?!On>P zE?w>G(|e3RVcwC_^4D?R6Hb$#9-p7Y1h6?@7D?6!EXJN81X^Cz>~470t7hnWgjx={u@TsTo)?z?Ij1G-xB^+29|5$kB^V%Bf zaBfqD=FTg)%JPTC&Rm0v+$Z|hKx2<*Zg?k`n{wzx`DYLD6cLZK zn{=mH&(>X=ZL+yDs$IaFL%94=(!gK3!0E}C{WNQjR@`vk4{f-Bqjxv>6U8TQ;z66x z&DeAJdyWCu6vJa@z!sOTX}(@Ve%^TJGp)&i>d`b5r=?87u3HrN^$4kEh6L1BVCp}On_*{Y)?etX~ znqxKHKR+0!I2V4`G0G&n9tGJOWQ?1%XsL&{YK-3Q)ex96oFr8FJBXBBz3u{L=U_+b zcwhvV18VLuU=zQah0Cw{ew?VUnANx$BHL8xxbF4@KNj}5W|LgdSNi^Ex-J9SOEWYq zn8Fy2dH!}xlc?&u#Wa7{&Gt81hoM>NA>V?&qsl^-r#ZY`&|I^c`%8*6104$;TKDmo zCuru8loO8$7H-(>B;|FNzIRF+r}z@YoEL(2#@#U=F=qnj0l1rd*bZ;M)d%9$`f-_T zgS0N5@dt=9!+O2*@8j`u%tv(>h#MBukZB6C$4&%2ogU1mrL(AvvimkY*(g$-m_D^D z#MmfQ&1aSoxFR7(@5m3$IzL!fBi&o&+Ul4l3l*2e)IwpxhZ8A+J>J=YHd zlHu5`x$EAPIUW0zwG@_fZ6aaC=%<&l5AjFq`scw>066Sv@gTZ7q8*Oi7uG9#dZ+Z0 
z+q329H}c2)=(G%yOEanNZNeo~Hw2K}hUWAg9$7omRiMriG@>zImcr`2ehxyO*)K7# z+I>%NBpKER2Hkm{Od9yaSwOrxXVSJ(gvQNpc>i#$D zqEXWNtft)93C_4ZGqfM&RaYKg_Su<)*G>%owzmHKCO`g#V`YAkkJa1iDT^DBe&z@uxEvv;_z2tD-5k@YCiW(Q8S^WYQEn}KDLeYd~G z9;eVn-U#`&2Jzb}I~KtayLozMUUWCk=MD3Lg;j|*&bmuYHe9pycT02w)G5Hgux!sX z23x69HqzDl;Rmm-SIwIk-$V1H%$^zG%m`p1Wb0nHtD6Z1H~vWK^Tj5VYvQj~WSmz$ z{MUwEO|&a_9IHX`>dO~Y{_r(V2*2cSc;`Bf;N5qC_{zf!daGmr9I;P%&8W+biNxG5 zNMC;`REjHV(dS%z7=p1!oee}^qVC)cHO3Y5l{`z+jp3jAyl7I*>z~v=sW;s8wU;6I zyK$l6P0^~9YaKgz^zSZsT*UI1q~3gY=%e6;0RIW09N6(LR(l7p%qHG3qOI6)2UBX>*~@_G$<>)4h>N;cxt&h%+Hnefwa3-l?fgvp+H; z0Id6t`|C^IR$(J`YjsbzlS&z9O3%6M5%V;D=XMaZ<5(YYR8qTPbwxiu{-;T@&xW|= zPoW7EJvspo;k+67kLEHhB7Ov89BVSthXn7?hoPv6c@1Ejj+)WFio)DcSylEzf1RFv z^vY>+?Gmv6FC{BIPdg7y<#1+Ir;Cb7xM;^c1aE|SNR(&$Ey;?5u=oy%0y)+g@Z%|RYrxI*@6M#NdG)tjX?pO+!b77QTh`P zckLR!Q-Vs$e9EiM<7dZjXBSxAAx^$Qm(!c}5+?M^o?QPHVuPv_?DIENr;EVTT+-5F zyV@C3@8?m$?7oLWYgd{;M9%P^Yje$0az3o#vTgd8{go1!Z_zB}=q*~^o~P(B z+?+pa+Dao+d=_yO=Q9|o!h7C<4H-O#(=_hWLmnBR=@cz%POXw}mLFO7u*{@@z(H-T zK@GFNa{Q5`tn{InE)RsmM?KNf$0yq~?V6Pxv1SLPmm4lOrEvFo9Rc=TPOv#1cQuCu zW__QLdd!-5a!RxDv|=c^oV(9%2|8$e(+-5AqW!{%1doAQEH`DY0> zsA4n`Q|d#5T|dEjW^$!Y;>vMalKP|m9hS8fXi4$s^0%6cYTx@ox_R|Q)ILYQB}zCv$71Q3(F?N6d!+)$?mz#e(;CRhwCn8446eSF zV@4O|Sc9PK<^I=0{sS^RMRcAUT9VjOX@}9(3A!q zysp6u3<>Z0ZK%V(WzAZCy-uu_cY3s4bNhV z8$aXLAe6Spkxsm2ZZf({FQe$_g`N{5(~oz@k+{vN#7r5Vv9kEKTV15I6D>=n)X_La zYwUL6teWJeC%9LEYuv1z?kez-Juog*bpIqBBlu|iRVHUp7g3DFl|VeX$GF}=)Un3J zo+G2WbhiR?DCHrBY?I~eYg8*HM0@(hrbXlI&@yR8?~I@Wo9*S1Wz_VQ3b{MaPBLT+ z>3!_hM&WUO0)?9Kfdnrs)7#H;4i{!Rr;ut{h|rT0@m)67lgDz9yPZc{xy9(wixtcV zs4!>P$8(>84@P;2yx@CC+c^uJMk0+`R={T`YSa z1=bBxT=&9^E!PLiXCF%$3Kf{hoB5FZ8m!%phmB|+9f|we(jz+Vr4eaQ}_r#vIIc6~1cG~8KY5#7yNal!KP@fe-bdJ6_TW4>{pIN^6~4x& zRZ<*MZ=l$z+KYb1Peprz_9nG*w)gJ}f8{xOF~Xy@=M9rgu4uIjeK|d@?!x6q7^@}a zK#muyuDKU@42zahOSP%iaPRS)XK@wt!yTnvuZtO2b|A%n92;34Ib8OEWYX&paAC5I zB0=sZW<~U#H?HAP=PseUkQ#Qj z!f0UAq)1m#5D*J09TAX@ASlvAqzVXvARrx)4hqr{3q`6TRiroR9qGMDk=~IaAf5eX 
zZ^YyAl>a&Rz4!aw`@I_-vR5)QnM|^>lF6)<5DiE3aoP?351uumeXDZI{0L%^x=s7I zp#FRjEkhcW<<`6*@`~Wc&WaBZCA(5}1d$gPNnAK$W*P`i)NJVZumXj<$I{Ql_k?qC z=cbBMY)*Y*;Sn}?EW>Re>%z3{K3@9CIr)%NVCy;8Od)-tNeVYg5F^={jF1ml8`qAc znVL1|ymd+a{!(Vhn=Jg{$+%ceH>2?dguCA6T_0b~1HShgZRRnbpon;^yR+tOlwFHo zFqLTZBP!eNv+x60%e7Pv`rdCJ)T+vOU}dw5=?+m(36llP*Xug>K1=ht^)M4>)1@h9 zJ8nG3+SizaPMO2uO3qiU`hHXM7Wg(=0>cAUrPkA_m58i!8#koQ6F=&#dNFgYgX~pe zn&`CSJB~Lfg5sle)r45tS1;5*Tb(4Q6{+5o0lmCq#Mf(douPOzTT$aN(XpGJa zzGEMPjj{5A&0xVtELGh$^Dw2OMRdTMt%z$Qfi?47!UwSP$>VI|q7*;mW!fL4N4YIm zZ_LE>mD5KXv?`6-Cw*;)bE0dOZm%QD&*g9FyrBD-Gn1mxWxwJaHppNTaMo;0dU}$j zsMprE#U2B|K6w6Ysv-Bpi|h8P2uF|=q{aQYTVmrbzxCQ;aD~MnGtH)t)mn+I{$yem z#$!sLJF(L3er~yL!{K=+?g}e3#@3HIMlXwKomPAh>30xWGk?CvH*>G*rUwGMtA)SKX{W>JOFwMI8oD{}3S2OGxaL9Fog_?&U&+pj90$8x16yzMVXm!Tz&(?bPlG-)y7Ks~W@e2?ntIo}EH%R=P6;6#B#u8H zSKC-{(d&M)rF!+DlF5gv!s4EwzQ<>ac$8o8V9xFewd0Er)NUf6V((KG9ORu10JDUv z=1Q7c*ESM2m#1ge4f(Yi`I>DHr|lAaiYH@Zhfb*+oa$+>{a$M`JJRY935a^-UV?k-fk%`Kxm^a%=9}foo|#}C z%)5p0qsOD%nYLl?s}`x5J!n_sGMbAx?c~BP&-7qvT4a4vZBecd$G3<(YnwJh-W^r( zw=Jt9F^X$|YYov}@a#*$wT9RsVlA?+R zWsXI?d-=fUE@nlTYn{IK*m&l_ITkC_`w8sS)1NrUK66BGlE~Y?SzR+&cT9B`(jx40 z32w%%BNrKWi=0@L!$zahwsA?&kIW%vnla}YN(J%qnla`HT!azPhxCcWu>;vW(y?aG zmma~ve+TTDjdzqTl2i4z+vh~9|9%qnorfzDvf?RX+HUY%Q2<0BxIlnkFNS|% z`00BSFmdlyZ5WA(i2+ebQ6MfZ4lZB246a0?8FPYg2$A>CgK$$`5M{;72RV-=K%U2Cknva=@3z^5<~Q1(fTy`hBzBgA8!Ym6CZ-w1ZU8Y=mX}#DkIg_h7s^15C6RfXR-02u0vAl-CE!=nLWT3=(+H zB7q+Y=8)joBm^k)O9(;pNDvC+=mjKr`vnOSVE&s$_!~mv5)$M<_yB*`)zyK<#zxT9 z)dhNbdO%-)9~kc_1ykJ>V5YYk%nj6n+Ob+NJJbpmM%uu__$RP9)(MuT20$6)QNDr% zO{+-Ix`qV(5Zc#~pc_Kp1`-TH7#kY{Gcz+_VPOG$ogD^W=f=V6`~+BCm;&Dxr@;Es zG+1Ar2OBH%U|PnrMuC!`KiR8)|cha(yc@vRXq0(tj5L9=e<^BmZv%KxS%cpgExGpb&sWhRewOOZx8&&?5K; zKrK}N3k4vvZfk04LJ81H*lUM>SpHsj$-0eVfaU;spjA=5m-F{+zlVcFX5G1s;-I0S zxu5b|{G9{-pM+RsqXQQe6@_({!5#yY7GOKvRJ;z2IZz3T0Y5s@pOBj(E4Yswp`xII z^aciK2_OSBhrP}OyQ12DKfuSw_%#$DuO5N)R1~%-`X3y=t5E^nnPlKG*XSDjk)9A) z%*2Fd@SXm<2q-n8JJVio{7ZUMqz4ldJ&J*K9=d#k?^;0hC3H)o7|3V-Rrzm_c90$= 
z0dMVh`X6=pNd$O4IjRDGl)nynfeBUs#h^IyuNmC_3wmrMl8G5IKvhAkHs}WfR3(1y zjW_>-9v4}`ief;IVqk|%4|=g50`KhiMzk9Lf*uc9%+8KxfYRalCu#7>h6;6!jmWGQ zC<(r+@h|AHk&W!EY^VyLTM^X{C@4r^AR;2Su8Tt74=6Q$??Hdm9}>yV&d!Qb!STIr zut!e>8IUm41)-Gy8Ql4c@-bqOqL3V|!rwC>f()$k_SG2lSM=scCzJr}tf&e=38+yu zAVXJxh=>gNy&W{8|Em8OA$|GK6<|lH04ni!42oZTXOQ<-^iY8UkO6E2G(BWM^<4s3 z0U}6maOch)O~XC$zxO}JpTi@x0I&x8-RL(4BxrgRgY>=P|BRmT9nxC>-3Dj|dyTNC zgT3;h1h2jq|0n$)0|VOf9!uh zhBI7b0vj7M`exj31PYRWL6410WMdaebVK!m9}ND29upJe5C1WdiR|oYNaWSMZnRg0 z(m$2|=jQ+IH!_(IO^X>~}v* zkzTMPqjZSw$*>W48jIhw*Fqzsvjw7ynED55}**(c}L;`lJ09;n>B~w5Kj! zJVkr*B<;Tb{slQLivHrYYZs3_ethh|LXRrz|0(^i^6$&>|Hl|LFz;gQ4bO`}82zIZ z5Tj!kfcv``9)^Ox{txjG)FT~4bb&GMBK!jb;+GiX`}MCe>f6oe@L!C7<+p$RYx(}O*4Ea*$;k->8?gPsW{k2I|7t6KYa@uZ7suHO{lR8j{}2OhH%_t> z0jZ86;H{k$NOF=w+l@IxIKjQFd(dV~44NO2f)=>9*5W}0+Wk*~_byj|*pM@z4f$s~ za*l@-`0&Gyd=(V>?AeerJ+Ff-9~DsaR2gIh+yOLj2*NW!&h(D>hi5U%&{LR;}JQ19CXnh;b!?*a`7_aJBv?LvF;F3=m<1tw@)F^tD|fhDvd zn@{cn2bgX#wF~XUyTBO&%0}$t;{)J_89;DwFo=kV0D-Rq(e~oEZ{MQr#i^;OAU!=D z6ciMI`V?=_k?94xvVA~HhA-&K4FEMMPr=93r=Tu95Hw^2g3i2`ptB?fbd{rQ!YSZ$ zLk<}Fm;~nAlfh7ZIv8%u0wc|NU;^5Oy`f$B(abLJfi_{k>0J;wy9-`GcsaKVBB!A( z_{%PcTG$1zAr8tOoCvSqLr9z71-Z~h9FMXKLr8`;-QuNPP+MCI8k_g*!UF>XVER)9 znCkrqzI<)~(}Qhbez+CPjP`)3i9xVDGYBfbB0=@aE~s0EHf3lJZd~03EfD(GcERWM zT`(~*0cK~W!Pl8#urfCS?Z1;?Zt)BF_H`bttt_DJzhg^FU}p6j7=rj?8@phBYZokj z{R&oBSHZ@&C9nx?zZ>h{z#>fl3T?mNpzU{S3uW`&{cmscg}3>?4siJRdm3oGM}@yf z+RufEX`wFygHx*i5k+0~3k?MYSr`=b)BbZ2b=7y2m7t{);$K%*NcxWib-~KY%1V$z zR=K)b-@Ad(F95gx9-Twmi!goUOLo`zZRN6S?T(9 zSP7`3zZ3~(!Qii}g`!9*PESuOPTx!0WA=NqBtb^G4eL>A*FZg8NP#jzp(+k7i3%Di zdqv>m;rtXV>Mqk#Q&YP78$f(rh_9>x^>iJjS%qRW0X`1mr;rf?XJ$tzU7s7=(bLmH z74cn9kih$o`1o}XPchOSIpPv%V4!F4onW8hdNqpRcl__wi9vkYBUj!Tz^zCFlu)Qj z>@|f#>L2jU>bSWXApv9jbI9I67ZT|15ujQ-_z(D2`4IoqDLA=|O$mDy2E(ft{;=A6 zMZEX}zD4 zN}&7c@9KN7PW(I!D0~zF1sb1NC}pp;)Bhk3Ufn|;h!5)kwMt3-y#Vq8RJFCVQ0Twc zCn7qAj}P+yUIOL!5^~Y4?f(NG^YngTsJXKDv;;G*3;ERi-YD5Ein7R 
zwBqq4SOTPg%|MH;GYK)oH}Ki-Z@=q52n2z3Zh3imP8QtC%+3}-6A*#A8~g6MT!ew>$wheu3eUB*F_*8Y}1%*Qu2gTcbggb*8r{&)C$2xzR|{`}1U zC;j|}@cq|5@z3}_>E{PN8gCyNrv8lo18MIEHUjkzfdcdoG}YhW|3?1L`0wNIW20l- zzwU2)@<1%~e-t)M`A72XmjvU##YankVvjBZ{r4~N_f@hl%${^V_%r;cz$|)(MU>?n zGxOg4^=Ihx3=Dh!zh{avh%&pM`D^@Nc|iWmXTI~_qxx_17tVp!iQzN|#-)h8*ac%R zhKcC(zlgWtY8aJ2ggWF8z`t-c{5^&~IOyyB7-pfb|82Za|7*No&VIT71LD1P|DsMLkmyFjH6s#8qI`@X zpnQxJASgq)4teH5$PT7Igg!bb z{~~X1Z}95XE3_{WYF&u(C8|q)0&0^z!KcC?&|UZnbQgt#-l}(?uR0kFe0&c&nhL>a zV>%dZ{Q$OR5w7tdq{H=GMOi6mZm$BZ z-Ed9U*$H}kKY`JXa=5Om0yBNJV5Yww%)vF?+(;K_n4AFJ!=s>Lei_0FC|}wHrEpzX z{|yNmR(C)P^!XWtzCOdl!(ei963l!V0`qedU}-t6#>z`0_HC-b8}0ON(&r zw*1|1X9X<6HQNeYyKQc)|Bvw1`O(k+_s3sq06k#*G1ngz{+Rqd#s7{ZHz&8`oxi~F z*W%_r&&4S$qV?wp?m~k65W(Jo)8|i%a2FF26y)aSzxZeQP%d|IF(DzXyKqN+UrOx# zfa9TXh}ckCTnu9HqYB~DM#+G^zb7H9Ohy9prKP0y5jbss&NpTwBO_LoMk7EexM4-E zKn6eP{Tb=W$k_Gv5g^8S{-5#@p74w%GBO2ATiXY=5A>vkP&NH2AHyGR$x_22h=|}y zlj@!nL=gHp-;o6(93mrwJMQ>+cvN>G0!oQL%9m3_ZR$b}L_`D-!CFe{lHQ*DaGw>1 zqt2{Mr%)oG5%6KYG)l)m=rOPiOi+##%F5?Z93DS>2v0ozIUl8{M<-AvK)(k954el? 
zC;3dKO0Wo639sMs_sU2Ab!NJM16|8Aj=$yaw*ZXg)tFD9Dsk50+MnfPV#*jlf(MR2 zGQx$(zn1^YCL`v4{`c}B1sdZA+J63z^1rXO_O3C1<&WAAM};4g>fh6&vrwcE0gdtp z`X2@F{lG*&Z};_&V*LBCmk(V{{v^=Q(9s=3@A&*lCOT<9AJsKy; z6Kc@r!vH#;vxDwnK4=%00-3HTzv|!nR_8sr0&>0NK%v(aP~?9dBbS>ieI=p+1ffe|B?oPvpx?OC#RUxuNgKMFLHC; zGcx!!-CFSCMV^Zn?))y@Uj2rZu%Ljxfxf}Ldni^1a8W`0204kAsIVZ4go{%Q7LJRu zAt0pBk&|m)5)~B`@be(k)J$AZIhyzkU0*umB{16OFxevy&&t$>~|4v*2Z2 z$m>V?RrZr7j*ycOA0#Ba0*goS!9hK6j;b(0PB4Rz@Y*F|)aih@2=r4#LRI;Og1jsh z@j;pU_u&g7-}$3t(l~h^C_p{ONFdNd*nf_}AHRh^dya{T0@Qx6s;Vl0&dxx@ zOawqnBCxTs0gjH2;FS?4`n{nrlMCp*-#D9#Ant(xdapO>q2TxTgM`uhys7ZHmF6T0 znp}u}tgSO$#nA5n<#@`14EV0khiAH==+%Anv#9bl+=GpS`>x5(;4)kF!B zPeD*n5Qqp51MlCz1aDqPf$Vp$;Q4OBASdkwXny|)bmhMUEx93}r6?A>t*Zj>+ZsUh zzzBE*c|D)m1~JoX@Oc9FH{p6X8SZ;#!?p2;#SM@L*R$<#J=+1F8Ov}zidvIy?fyDG z{HFH*{q?^Z`0t{D-#y2!OUo+$B$zzYDORbUuBF*oxVe>J0uB}y7TzTiNlx~2OedKX z0TvuSFVd1-=Hg_Be#WxsYeAaB$3#Q~B-oi`(AO6jXbzE}o+Rwo0Tx`U;0lQ{9;Z7< zD1Q~8^C1f{MzVtk`QZWul@EjX37SLn_^9i>;TeaIhvxudKYxG7{w;qmGC%KhgtCd!uh#YG*v|lB`5uCq%-bZ31TUf z;tbM#blA5{I|!>m=XBYJ0|$&S`=O^Xc$Jy(TZ8-Ytt=>>lRtnhM`ZA-n zib;Yc`fvU#7^j6Bt(TuVCL)aI_R%9XJo33mz1s|~D9LkTPTdZqWx?J@TXs;IOmRplVA{>_TPc8%6wWZqAD{_kolT@~iy=URfdwcS09>Z@f=f;PELZcm^MqLfcb_P8tY5 zH|()PY)UH?b->_MrY>p2|!0e5-Bbv=ZNFJ=Bw zaW64ORB!tT@ZLOTfGt7h0bA`V6}k)lmKZmciZEE$yY#w%}Ne45a9>NRL_J z|CMsCwKijtoIVD<-n4yR$Hn1{kB|F1H1T;nd17MdC61}5_iYq<%2A#@d$ujAzOGJQ zU7Y}UUXGDELFQRAN<~X6!F;Hqr$+@(o$CXXOYC;FFLj8pvk+_>4x=vShHyUe_Bfe} zjzq;WmD_PPHPzK-D^m(I0=6XZTe}sJg7#_Fv*=S^TMe@49nYS>1H_xyt6SRJA65Ar z)~$VZQYFiPbY^yzV1NY|%3)w?Drs-eAH=NsOm;GSw2^19a=BSJ;BJ)TA@$t*EZS_# z1k@mqGEfRDKFaB0li-?0vc?B8#duj3tKf`NdfI}5g6oSU7OFo&(-NlR(0)o;Bdm@Go!-F zF~5<;A*O?LbXCJ;3N+M5Dn!nEl(m`@22`;e26DS2U3CH*)B>x-<(Ry@GiT4qYmtlU zno!1dD!+MnmAm)IJ@)s-N4Uaf&Dydqo|j8(%11&!vi0)A#V$(kZVvnN&zFAuDAi?6 zNdT+$43z{=md$R=w0F1kzxCwF(uAw1v9K)-Kk>oaa!7U@$(rYOIRxFTw&tpbgtJC) zaXo6PInH17-;sRcp3$G9p-5VGrG2J_uLy(S;;XdeOv;G%odFA{-HpkZ%uI5v?UBQ+ zeSNcQvuDo2zK(lX)0UwtRcpbhc=zKXZ)D%+Co-nAWrJ(Ss#>IIfSeI;X1HvN_h8)( 
zufh9R{ueow&Q%c~^!2StGeL3`epq1IU2J0oB1V_ESv3TX21j1NjWGG#p#5#`Qyx5} zY4Jm|<3TvYEN01ah~bFzsASwZfI2(Yo(yR#;97Ejwt`M`AfRMH2>+maLZC*Mmm-5x zQx&%&ff#o9-L4ncs+_!sjHOfP!be+5@#gJki28HQ&)p2~_n-Z|dXPD?ZzO-m&(AM! zxryI=xsgW~o@tdb?nBB6j>}kDa@{ZZa-myW%fU^zVnS13efvIKW=ZPHvmUI!O%gvc zl|m*Q{McwKsxRq~cc+W$Te?(?wps5P87+0D zI#Kd`wq-m>!>i4#?)#$J;oz_rJOa6P6~CD08(j@@efre)P;(#G$YjPk_sEM%Hp(=0 zcP6Ll!l*v!-YQcrT6^ZQL&_w5&wDX>BOOfGQg(z(Di$(?EDh@v7n>KNtdq@=oR z<8(NtgEWs_c*t*LdV?v?LIn$Pp|9Xh;!*3ogwQ+Qj@$1Kl`Kl1lzp_`-qW;2IWnm+ z6ReVY&uW;aNW5lb{VP2FmbRq-v3U6f8#DI1O*H+sqkh$#Re02J0hc$QKkK`J+|gm% z9n|e3iIS3%)MfII?#j^I#xb|^U9TSrezG-k##3Rty49z%IyI`#!@0s@Y(rZ`9MzL@D4zb5UZs|GhYj<)%7v37QKTh;Bg9L> z%kb!NTdOg_E~sJED$!fm+JiN}y>rtnmiJa=)D;D9 zVI}1_8XChS&ywKj;s!(4BG&e)s>ad_yN+zdnZqYTP-8`JwvnHnx7V8}Y^uDG%KUHn zJqcTuv^U?Y#Wo@veeBKe*lIAyUg@aKN{-*UL5g&V-+B>$$loc9XzsIL>S)&dv`Tgw z;$lkv%vpi29~t4<+$}+84#830(=7C+Fp=d%gTll~-WsU>WeRo#@9}ZHJhN89aK2P* zTA6GqGyeg2Z>}80T^gIkedd-UO)MYr{9Rv0o%h2~>oHPylW0~yP5Sqf-KE7#EtYwy zTSW=)uM@ATtE;Eq>pW}RpQjM2b?qs}3*Pu9VWqI#*friwT*ZLa5o*!5RZ*%!ABr$$ zyCb~6RUc8OwN#56>c?(3uTqW3+j$+FwqVd9w9+(b73JgVW%5amW$u)@0ll|^)*+ zJn|s?xwPc=ih|l{C*gKS97Ja4IWMiV;NZ}%U#jl;hc$1!xtayYxw$tNVpQl6Oy&~} zH;IuL+I?lBJymSPQLa+dX-)%)2}x!`{fcqvIb^eg!2o-E%uMSHgb1*+mE?Y&tK?E}PLjvPklACBZ~w;c;s?ctz|e~{hNR4c(Kl@m=_ z5;ZMUGj)7umU4F`<1;j@$KUk9=5!&8PgFpvq*l~Oavi_A8x%ACZX$Q{{_K}i!@;}9xBS@<23Yw=p)ebpl=r2u0ZfY?| zZMEJykN+@_u4NU!tu3;6bn2e6Fe&_Ihia(iHHspPG?m+$#}#ILtH``9zmDC#-BXC{ z!Bd;r5!5$wzIRq$XhAkxpIl4-ldbGQg#=l}R_c(Q4mWWgs;f(+JAEG4P0yUw zpQ4CKNpTSblFMaEAT-`M#^ z>a%Pgk_<h%V2buz< z%&zp>439ZHl1piK)EV^U!q?HdfpdWmPQ8+j#jXH=(ERL^oLQG?K%5LEWn7jE)igNj)Kw_X;7>(a1IoatRVKM*`E z+kz99>iiWxd8cZ*)gsSOZ$i2tnQ&LCklQPqwWwkZsti4Yq|ZGO7&vWHbH*tVe%i?L zD0KE{u?Ek>H#hB8R#u|ly(56%05UK#x+E=45GNbq4QF02Ug}LQH7cGpZ{!>JCSDV$ z>d0iJc`{hx{b+*8Ru%0#Prklyv=!~@cCni>q{unKhUQiL`vTv3bW`-i-?TD8pH6lG z0blsxZe24ok^-AA<=?*awRU$;zGG#3ig8VBqA~0VoQu-c2eZ{|Z!BN>@FgoT1)omn zf#%TaL^*tB9+flwG^?=vFkh2F8$O&B)YX0B;!cHIwOz?6EG`x@?#(*TEU*pf}xNaYh6dJpm&-UCjU-LByvt}Wc&tbOe&DB|F 
z!l7kf4y2qu2DM0QIaP&T@GXzo)Dq3Dp{BMI+tWgc7KWAdl?UG@;a3s?*70zyTB_xp zv&T7Ij=wD`qRs8MAdnSQLiVzT4yW_xtQ^$dDifWXr>lPLN=hiM>nU@BgjQNGUGhQG z9O1Kvf_HQ1Bx3oIHzphMr-drGcN4QhzUYNDnLLe+?4G4;I|n~js&#!luhb>uo{JL3 zxQxidqw=4yZ!~4LH8g-l-pOq)-}Hqo!!m=}Z2cV3$zgrM|w!ivz*+L_+!q@?eviB>|08`YsHiB&oI1O3zh(Cr~;ovU0^m z33N)k_CCN2oQ}(`$8ZR+{9H;^P+{~OsP4lmV9N|ZT1xdX8E2i&aJrwr-1gC4$r0C)zM zF>~f~H7|`*#GjbC^U5eoBY~x5b~H9;tZk-MVjd^%XpxB8!OBuG#xqu|oDW0(mN!N)PZPdw~`mR(2d1#)BPw6?4C2~R~8Y265@Agf|XW+Z5HvLbFN zJwIX<684@=^@vJwWwX+VvV$xkLUI?uicw=7zPuKO{}hA5Gr#Vl!?YV4!}U1gio%RZ zs^V{pMxLj~BIHS|)wYkAUzN;WVT;1ZjD9ojZtJLw(%hSts%H84*q~8d7uP5KXOTFQ zh1Y|x;K@|tV`;<9C$ihDtzX58iqA9v!x z3$0&osKpVBQiVA7Et_bUReTxrk=cGKbhJTxXfn+-;ZgIbv-eDGagD2yUx({n(*U+D#8&SiF%*u)mv3O1M%b+<_tBqb*f6jR=A3BsXiIs zTRZ`j6F9whFS=CY9w8WT7g0^u!9FCXmE0)I&hGu>!j~MOb^oAvVg1S&5@)7%M=_JC zjm;FD(D~7vFAs7~hhS*5Ra)cl0B&g+qCvu;c#GqH;A?}D@-4jvWNQ2ae75FmBV;U3 z@{Ie^E28F~KhO>{b~qZ+qi(2Pae-ZCr!0^xU92NcMC%@HE>&5Ndg;2XB5sgH)8SgL zJK*74xSW#?Vs4#S;J@e=KbRJ|$;o%@$vD|kaBnwt$n2DaF`qV~zS$~Dl5`f+@vDN? 
zOB`@X?t}FhqXj0l$gvM&&!kc7fVR(L7@~*bnVhzcg`1Q%XqLP=_+fYDUg$+*I+Im9 zZuXbF%XhTfaz3u&KB1R?vd-XmA=GswGyK9=fznZSGos13R$u0q0i?&ELoN1y#=-*rNd6!ffda1NQTl@LAoqK;P zUdICIEkhEToCTE&F9U-IEo*Bb%&@e% z5gzIg*=-&5+teXy+xUp(D~|8a7d6!oVZLmm5j$K(oX}W$hYGx@o)!Trj862hUhomb zGf}emjns&t+a>gJ_?vYG7F;rRMXz>~mE+$8%n;A7)pxNqUbrLb=x1f>q(f-unty=S zn^R29?h%nbiSU%o^OjHH%lRF+oGt<+UgFRk6VtfBFXUbED3MmOKQw~h_+n>{E-opH zoU$I<>99ve&C;b!V@^cX-Eljyh%S$w zdfy|XU>vknC_v_BUu#dtym=!QW;!r&&a>D2hP0FH&hW`4Y3#Y1uL(j#6Bt)(PYI>R zX~%6<=D6{&EA5Qj!$H(PtCQlJ$dSoDT}^z-nu&lmwE5{*TCpc-kzv$Qvo463zW#hC zn|rUPQn3U`Rr1VUb0Ak85l`1f)8e<<-abwuFOe>t6VKCRP(SoQMp8ojqDQq&1*^Qh z@SLG{$#Yg2&C@JSQnO~-b40l*@kB0{!Nbx5z=*^a$w+R?gcWr$b!6Aj%?tr_E&Wz>`K^r47yYF}v!uJd^6< zE`;<}0@Q#)^4*OjlbUa^SNOmcME=>2CvUiA)m}a$TN`q{{=<=w`YDN|{sLF*(#{0| zO3S|3DdeKov&n%Bk&}UY}J_#Q;NHBgRVzII-KCR4kEtDtgq;`DMdI?P}xyXm(4sTdhILy zO8K?V7f(H%EpROJGxJ-0QV|Z9mg%{<^AjZl!YAfpKS|bqacgFdAHKdb-5Gn4!bH_4 zgI8i{^1k+lqocA-hm!KhQ952uB&`iOlkePk~D%@dD}9l)gqSnQQuSHx`3Pa66yvFF7 zjk5Z$Dtp1l33H5pf85rb;dGY;CwzuKVo%dZe3w$0D{OrxC`zCrGSCxK(g}-xwM$2} zC(&eh!Xec#j-e#!+ICuBob@w|$DW3p#A)*wW^+AHZ8oiLUO%FW)sW#IbeDYT0muHF zfF_hDlN!0}<-<`RV|U?X07BB4@EXm{taAi6q(-k7U+PJ;)@UoNB<&{}I>G5eV$SVS zrm$x4@GRolx*&D~%}Z&w=QAzdTBJ+LURrY>s!Oe64olU$N6x=XW~vd^EF|R7$e?%` zXmKVOQR9T~TZ|K*rbxR?-=cwlx6gH_vyB7b;zuC%Ha4~@gpQWn!#{^s{UjcTm-MXO zRV+%D2i}Tg)=Atvh<4BR+xK&dv9z^SIb-9ZD<4tvdJDzc0WB_s5lXwAtp3(`U)Y2q?2*c8~%Zu6CymVcB(bEUGNpJq~2Az zJCe{h(SAlGB;-B@j!SFLW3lZ|xDCeEQpKfinacGJq`tP35{ zIZ5SYH@YP6njdwG$>((IO~QLMYb38ZahucFq2T1M7vtgydA>oTSA!x^tnl{G={I>7 z_Z+riA0GW7xjRC~B?dP@tUDB6PKLwCkRE|YnoC-0D{%X8$|`|B?KyZdv&Q|!(Cf5i zPnxF7C?{96ulju(&wL&%4(~1+DyM!m8$mUrw(^53p>1Rex$jRP@SfLQ&16qHdtI_# zKxXYg_BiLjS|sJI^@iS9gQs+!e4mC#yLPr)CtX{EwFPwteOd(Z&-;o*#W->Zuf{kM z>tR!ic#6m9X>b^Om&n}h!{s?GBuwhuBjZkhj3`J$AZ27t*&$5M4}QSEd~r&WZVep z#c^;N$_)}z-7bamU0WNsc@&@_Y&H{anD=ESGh$?tLwoV{Sf`o$2~Wm^Zo6$yg_jej z?g@Xf*N9KXVWnY&bto2Ejtyy_d44cr)k^EsAd*Pv&H8u*d5D^_S6`~C`@M%fQB;`z zJ0IY<*-8xxEP10<@^FjM? 
z^2^f;RJD4nDW|Di6}CyfC0={TF@mfjelITl$O`Lv@MrPNznI*={O0U9N ztW6BtEW8+RTzf*WbGk=Qy{al*x5n311Sk4^E14=sQSoefRc3mIWd2%2jjOIOYvDTI zVXRUv!@6en;B>F96HdW7#)f8R5EAMnb$P3*opcHHH$oOJW4B1-kZ|G>AJOXY{P2F& z+eO2I+GWyHcrG2wT$Op8X1*_e>ug=UdAofOAOFQH49_By{-8y6ybhTxO`-}F9DG;I zs|u6-LHGEhRU}`Ya!h%8t-n^2)8*->Y@=WfXIbePCyXoGHUkw*PU@9f3{EvuZqj;O z4;R!weWBS5SO9P2sB;+w+(?2{h{FvN$LS@E$(bnhZWwy!XUE_wZH?G~gn64SiwE$5 z*U%Kf@3&#uH}jg@`2x>6RaLXzQlFoP>+sC-XNJ|Mh({$~bJAU7deW`mG|n)+BYE%8 zVSV)TZ0`fggv(jQTgg|>y~81$)=ae}q&!H81$|P^zU^>S?$1zyQop!2yHPue2pA~_V=1xd|=d;}` z3uF^?Cc*c`ZRSWRr06o=!096mC+jCJ@>4mswcFr4(W$|4@a~d{_OKQsPe_WS-(rqc zYKG0|n?7KLPr_-xHqYIJ^&*&~A*?llVrg8H0s3i0qt4HD#u2TY4)_*vuKUSDPMmOa zi-ID{rAH6L7v#S5dq?bu6Rvz3A+=Ednsaofwe!~LqWp$hA5cVf!V138n`RQWB=cUy z>N+`4*gljKtu^D^N|P57tm?%dguQeZzxVRtTPBLFD$05bJv~O%PPmdJzwH{i|RvWjyH!T+B&;#9o0vOr0dR#&nib>Y(0XFAFw*L88ed^64%?L(I_=z zIl$?1MSp(G(B(c(9f5j|F~>y7=-apJ&-kC*+}7}2KUc{{q_rc^Y-(dvl&QPqn9>&{ z_|-pbI>4^xLb9$pn^J&uvC*rZAcn!#pi?ZqB3k*rFD~ev)S$v%R7TB8%S^CP+jL+2 zvY%F&Di;h+3#A`X*zRX=0tS)CC6F6=)&tH=wtPuvgp30Gq}&087OXBFk}I4wvC?%CaY z7h9&yG{S?|@kJ_lO@z+ESXzk~)sfrovQ31{$PMU3m#?0R9*jHm5qzyYms+UPpv57# z9>Z5@Y6%`MZKl*&7%C5?1ibj>Mb|o2=eb%M8uU4uzW4e83UAJjZ&;KBN9;yyN;R7r z!;i3i`&uKV^+^IQtS*>v4)Qx!7hafUc>P95g3*k!WCMq;Zwk{!k#vXtl33^~T5nqQ z(`qG{onBo`^0vv6-E>M(Rp&cxHaYN=sFk8_n?@-Cq6Moh-NKJl(ppinkg2BaFt)Cf zk_6w))vUYX%=kflu@Pbv&KIi~?p0KYr{PjQs-c&!QD{ulovl2TI;po3O&JwJd`Zwr zJh|uK7b)Oj*WY{fdI>(0so7S{w%9gm1-QAjbt@HD#j=;iliwp-4u9nE3!3WL0=KFf z|G6tFKh%$p!H-hEg=u~JltiHZtnRZ3;=FAc0M@*rG5hPs(E#E54Z`m;DKph)xJI%L zs4?!oNNCslU{XTTy!F8ZH-i$d1(OhSRq#ghWfdJk3+EFG=VVSEB|VpO>9cI}BvNyW zQK8eY5gGO{h$Bb+8e9)D$_XrCT{BJ7Bh336{?KiG)veJ}jY*%pL}(-`Q3z{ki!33@ z7d1FdAG>=_UWq!Puemn-#;WJ6S09EwU$@XX1E-RU15I*cSa8K=G1bD*tvysFt*_ga zbw+CEwPjHmw8Ab=9k-|8&yme?3ds27R*)6Oc}XInVP5#qOD977x&W`t&o5 zu~5`uY9U&h)r+;#vKZ0i!Y@W9#p0Np@MhsIjj&%2GUv|C4!jQY8(d#{j-=PlKBN{j zs5(7SAs;}8sxLp8%j;vgIfPjbR7{pdD{Jw->7smrg!Isi+Q8M5+w9==B?+6$Y`#1+ z6zkZKRa8jSa^T?Kw?>RdovO&Pa7ew%_LH+5EA|-q1-Iy=Z&++E1vpo(c59h3FSE@C z-L8*h753 
zwEpq_@k^s!6}dNv$d1Qs7yc4E7Dc5iH^o^C@GIY?7JD~_@e~B{+m{hKtoACwK4eh2 zpqDSF(KKgfC~Y>dK!_PD5+x`zuO;s)6qv-S{f$=h^KKYe+=EQ^}$H>6zdn9TWbrFdU_|J zt$$K2r_lGdq4LqLh);g`Nmpr(P*pD_Soft&dOF*Q_H6LhRK;8C%&$~m>#d|G**O=^ zZ=ZDNzR{(y2~Vz&8Pa+wLw`je@han3u(ie(bIRnixVcP2egXN!9euNI@);WZD(3!p*cQ=n6j*5Kt^+h zi?!p0&Y0K9O?J_eoix!l2juO&gDo9Hop1XJj>blx=4U8>VySsxNE9rvSZM)}4nKqI z&+I-woRCD8z~P6;u6NC8#>w?NeQHqOcPF#EoC3i_Mx{FnSaBC;#nw*Bv|?|5O>2A} zMu(aAsTm&{*dJ?SYOz^qG2dCb!szOTwTVYG2o?`ihonmK1!7Q8`=%eL?qLC480;nL z0#^Ov%t5FJ_gpQZKoG+n51>&+wWVAgvcTr410)D13n@#LF*E{zzOe&oAHj+vt$$4R z>_*9P8E(5pN=d0W$=+mM5j)!3jvwnS1X=ZHMa7kFwQ=IMm>ra#-8>YTwl3&$lvFC2 zwz$D)9NWZWS3;MTp-WlM8CsNuh(dN`=s6n;KlvR-XqSHOYZ#X$nYykm7T`dgvh+;- zj!+-g$5^{O`2cll-oD(IuKA8i>g~d#th*F`hcQ+s*Vo^iV!H!k{f2d3Ta2;l=E+P>}UXON0>r?YFJmTKyG`^?SKwTOjmX}H-% z&vB3Su9KK)D=wJTx} zBkSAY3D^1vJe9NfI;90Yspc=OI$&*P#g)6ho^=fU3w@!+lRC4y4 zpyLE$NKCK%_V?0yf4~uIzw`*(oiMucwl6Kl;^Q2VQTzAQ7y}C34DxxtSQhMrHB$_oLB|`rB!Ltt zFvai}xWn?!lMdU)8H4v5?w9UB{ms+E$D{nJcG*c>juFU$p>QqSS=ZWYZoR?l>|NQ_SY-Jxkry`pWxkqr|$XeCt809TLxL z+%IQv*?l;I#1lRa#^laJU4%F8T3uf@L$S3Pc6NJL!6`Obl zDv7~hIN=B`>@_Eb!~aiv-vSp^wf%q4O0$>U&%OWr{qDWRrQ7fJ`E^kcH1ztwXPTNA zUNw7+0~}y-m>FjVL9@iNw9>-Vv`hsppP8Co)2u{Ge2b41OA|B`tWUVH85+B-kz{m*91Z#Eg1Ks9J`+3|en`3V{O<!F1cv5qDsi`N*mVaz^A%%^S91UG#%ey^K_-@+tX`PYFtSwBEM-hL72zM@~xn z*DQyz^WVnb{b{4KuXf&6Z}7g8O<$7A=57jk34@N&w-jA2lG+syv?bhOD|tHla^~Sb zN^eYv{@=VUeF`$)&70kBbH=+1GgiDE^zphajqYildtlA#keBbwShBX&%tnPzNo`NO zx*_B0rHuGBLFs>aywB5hMx5OJ{hZsR`)7VMd0_Z$cdn7{wA}LA)kZx#AL?CvA?oKT zL8)6ZUP}wQtEm3Hix-zH+u5M-$=m@m4(|O+8|;O=cl(7A1O6wh+3H5Qqi~Vy%;Gn5 zil2UXK-S_}f1PS?9kcq*srIfbX7yfrYfkIj0Xqv$4+d*&uR0DkEPPCAm+6pC&3^aZ zabE(`zBkkLly*aY+H?QAablJT9!s1p%ufP0wive#r zwy(j(JH@x(HN9TrQ8}&9hQ2L*(qPbtCll{I^Oua7(#k2}Ee8~DKT;oC9%G`4WU1fU z_ruE?NV!Fc_cgh@aIy5#t%dht+Thj`y$@oyM@R6c{-D&;Mb9>L-uAago_nRV;qvxt zhnw>!tQ$Ia2?}`FD~{*x9@Hvhvh+lctN$Eb8nP~I-#vx0)a;>=?bkPnUDNQR&Pb+F zqtEW|{ArV%deXB}>!9Udzwuz?|BU?kj|;mRpB>X)`sH}n27}t}HUA?cMS4yi_t!gD 
z)5Pg?&-fn8x{o{fqOh-XT~Q?rDA>*{8a@L)ND9VZxQ@!>X8#NOmj~bz;j`iQ!k58o z?lgYvfX6KtGzh-D@yN|ipdb7|OMCb);j7}VAn*$KR(_Jb>A}FA4?xDP7mCBZP=vdy z2=`!-@du)w`-pHi7`5M{{7wW6cVQ9Etf7lnwOHag34g_D3^zT|9tZ;GIgEV?=*InC z)O$Y>?mZ*eT2kkHBHV{2Xvf(_f)C6K%|C|k?-moCV<}MFY>%XG`kz3#H=}*zB9V+E zCUC!-pugw$xLg7}z}W|cvzn+Qd^Gsb7BZmG?)iKU@o=Y@=K9?9gsTVw;V)Q((|;oB ze~<|0P7&;CDF!mo3%&>V&`p(vPO2=l0}pzGFL|XzxRZ@=2cF;w?zhw2m%kzU&*EM< z!u@^pbOF%-@BlW&6bsL?@C10kvcUC$Wr6EMN65iPziOlseT^1@Wr6uprqqMd-va%h z;0~rD?;P~pA)@Etp8+49Do|vhuPO_y4{$!2s1Ga)UBHL&(0`i4-G8n+arB}B{f*Gy zn6b@+3@1Q#;|>!Uzyp?rfdz^zaD9m7`k=}J*9T2E;;ux(J&A<7B1zMYzXke1awB-A z{Q-E|0s51VK>vXU@!-KAmIbvwuq^aLonTq0jBeEEN3xtg`#W#6EC2^?-v`03frr-8 zyIr7vFv^m7kO&^IEErW;;QGL_z`D_ZC(h%ef#5}d_id1_Ftvj+L`r4L`i`*lpe6>@9_51xllQe}bb1Iq%}2N};Aa-Sm|^=Q@d zWq*`#rz+u2QNo#tWQ9&k0UwwL znk;aAsH$${c9Q!F+#axAcmi_4eMy`Hp}G0xDOVshMLVZ6XU-gvAMQUT+?`7}i-T}y zFU=j=ns6p2;XYqlKlf!pKhA0*_Cvq}iz*8^W0G)}?KSB}U;V}e)ESTd@?*YupwW-J zi>W#ucODb&g%$MUj$c|dqCG7f-VQuyL%6$`a7GE?tgUkNYdr9lg=^A{D0lCEBe#{R zTUnd*<34AR&DrBe2geJD>jJ?8)`#$?e>qIJx7siL znmp8$Zp7W-nmbS%^y8j)kv{ILrq5pOO8#b{ldfVgD#MR&a{l>K;^+F zf(Pz4VfpJYR?A&PCB+Ac`%4$joeu2JpCwk}xR0HfAHMpHF5QTJv*(yvZ3A#pj^5RT z{>K@cg8qx8CA9kW?kW$u(u&b44_F_HHo>c1RNI6(sjcZl^bKY^9wnT~Nm;fQG{X`} zIH!}Q4Q@{FJs(auZ(R5pL;gkU=8PuXAx|HktX#j*P$1CbKPU9mOMIre=>Jxy@56C% zZxP`@xctw?3mN$0 zjqWZUaGRjXM+NQtdw3f8#$R@#qpYlGXb(wu! 
zJgA9w5qE63+C{BR@I^oKf$KvS+DEnQcLAx}szU#@wujso#yQi12W6*@QQ?NUv}?&L zv~6x0ZJjfeHh<)#%^x~x<17blm}RHjtReKxO!ygAS~uN7Yd^41&a@QzVsaw+YZpD+ z1g3vJ^xM1WLu!5&#unSpyJ+X$Z-_728Rk#^ua$l+-t*5tPq==IjvP56`UZj_>YvLO z%jjH5vEre>aCf5KpD3bV3U`X}V&HbMIWhg-?#h)bgu}TAcaZR>yZzB#oAhgZ@Oza`;DZ(rrO0bzft)OYS41e{|*-_vFOZ`7Z_>FKRsS zY!^#UDw8`kO@CF}{>p~Gh%x`k!@KA!?jL~%r;q(0c*1|S%pIxqjS$e7(ZvTP(qK~*XGe(RPHcU^Zqq|qrzAvWIN^P7_$?@Wr#p4C=;v_)x6SN+{b3X3 z%^xPlOPW6PHCEE*l6emC)6qTX+tnWs_aWG2W@eV7oy+QGZ2vSF!l|jWWy=>v=Pmzb||lh8whq z`-r|gk1PDe^98Ri{ny&|`@-c$z^Y4sRogW!JoD{F(CeLN9sSC@_RTc@@h+p*SwhpsxN0=73gDu1B4`o_T0=1*X3C=0C3|t-kWAqrXo7)#<;A_5ar4 z68)C^_LmycchjP2-+OHY-gz%K7Viw>-&W^fTRum)nHr z-`hvtPRDZJ5uShl-stx(um9`xS6BWkl8?IYbsVVU!0(3xJjGlljTAG@h0+7!nISdM zJ#lrrc=vi^LGao+5g8BRsSzZ{+y$1cIM!wz|3!-{v{AA_%d1JNv(HSoC? zJ{$fj@5|ngJx9y2=PDcfO{Zg7ej%{miv4G${84eRV5IKl61o8Z;BgoHZpLjk_D>E3 zW}3aVx7|J4)uqE8ai4pM%6J1%)@2FY-#!D}ZLk+)`+2sRX1j6CUYc#j`TPjB6Yq-O zkLN4)fnO-oenI}n=3>v_0U7aV`t(Tn)An<+@#Tkz>?TdXwc>YtNKhyNf9=! z7tf!eO$$cT$0Hwe+41gOJ6G&C)oj!>J3W>swq<_~Gz=^VT(`L_Sf&P|j5e~xy5jnK z)4+H?_0&`QdiLzOu__)m2}jr_C1;vh*y6JNp|CN)9=iN>3jW%jX|^#lV=J9zKcJn> zAnJC3@4Bto9CP~+4PS_|frAw4+iAuhHbNVH&C~jWb?VfKjvqfx$M$a$c9e=ev0_8X zG~}Q+iTT%Ub!yz2v-&VxVSixRNm6ph3ymD zf-V}N*nZ7JIqY6NojyQc!4rS}t?l9GvjM7Fx4VIs3rgLXj#|r${PKk5d z?ZAFbvt?a7-7akRyz$rG*_PQ~-F~V-(X+nV?H#2e+uha0pKbB4Ub!S}@q`^7_7t)m zBHQ3;wx~r1cb1bO4S#-KOWj_KHXZJ4b@5-md^ue>d(yQ}nr+C~c1WQ?v8P>%y7!Wh!&t|v2vQJ+<-@wymiS2dX_pn{e9@>$P9z94e zLALzW?bS;Ew6MDPj~X>fl;g$^62$&W-f!-*M}$o(+hZy^iO*?p>s8)c?4m)~-s$a) z7Qbnr^{Au%>h^x64fwRW_zxU7P~gA!KW`}ZbZko|Y={*5YPPF&)8N*te9nZ+h7k5L zY%9XHbDlO+N6wb|tlMlmsOrxV0rBtBu~K`(5BzXI?D1qV^VZ*6H^Dci0gD~BMttF&j?YwBKI zWzV`d4c>Z{Y0%C)(QH|T{T}RC*=ElZe@)NwnGXjPJH7RuxbtuC_N(21cEmJP#!p&0 ziLPRgc;WgFDR;^Vgic}|49B(ik&5v5 z*XQvEH=E70cI{f?GZrpgD8>Hqv%;VK^)Q_(K0v36*zczj_DA>AN%*C&_V%CwHiWRD z(`*P&!Di?j`U&jM!((6R^~uopIuWyBZmwTb_>xkH70>lbf5Nx3sE!hXfAliV~YI!V*3Er@Mas)c_g^^d(N6Ot@zGsfzybr=xc7-c85y*9g6u zlNn1JXAUmcHhA{+#930%t2}OF8=8Lo`du^rTD_^xbJ(z9g6BtfenuOoCAjsfu=Osl 
zSC@_HO$RnD5pgC?oan-xak$>~SEGBxh!OPTk7)Ga%1-}6#|!gdi}M8?+OmWSHZ7vV zd7n|yzV*tBRnop5Ve9a}vGP>M|C^uxeR}uWay{zewdB#eSC3tR#xLG`-fqPEt~c(df{zE$ z&M{4B@5C_io1eR0|10en{YT>O{>EQ70^dM+G9CP#pN#vtqdUa&w}O9F`R|XeI(Qui z{BZz7qQd&9(RgMADNnqJckKG|d}2G8c-NkV*bTH(u>Cerhu~x}@S9s3{ z?_1$DVZMVU4*OM-4-$<4UT~XgI)9{jTQPPm@OkIF&dz7v^O`8{0pfj3yeD8U=9PKR z4EAYVsmc72_pR^`{CH7Wi{iZ(yk~*eW_j-j@2BbyTzLNu?^96EQg6&SR|gNoZw)Ud zoAIQdKtIp@^O`-czwllhE{_E$_j%YK!uNj6#=3^OF2hmS@qUsPo_KKosUn}Ao=)dV zv6o~1D4LHoFJ7bAmcLc3rSMuWuem2;ZIRbUp9L>?PZQro!Fy=<4hpf~0kZir!a#0- z9sN(-5~T)H6UNqOyw z*OK{s8$KtU=csva4xa(bYp;CvE3Y;2nXj|bJ5%0wJID?kJY~}WJm5}MHh-4QqM{<& z@ctmN9<1TOdkT0Rm~r5JDO@Ks9C+Q7*XwxQddaD)LNx?YrTMB zedhI{lP3<-<{7fUfzRLKHF3s)*KPRBeyv{eIykRg@;StO)}od_X1Jb<$!iIHHNrXy<2T^Y_$IybgB;=Wp@e0A7>kGt0Gl$?FZg&x+SS`3zP* zn_7#@d3omToxi_zCvW)UF^|*Pp9@5tIEu4>_NG-l>)RSugQg^wPk1fE^Xwmg zVJphtS2=62nwZb8D1U!=*A%=i|G@gErughHotu&Wf#$oa^KJL|ztgTagIzY(`R}%* z{C8WF)Bn}UqcU;X&X>Kv{JqKUFh%2{tJ+-$gg7 z55^HZ)@FO5wisvIG44BliO{>h_Hi!a-m>i+UFPu{k0E(X&+`hWPL|MIjAuXK*$kMH zRFwwA34#xUe|i(<1$b=4V^toL@R+?X^68CyCZdlDHv+yB{2f|8&|BT$GktlJ$@I>T zhYJt?u=!Kk_4x<%^)DyI9!VYta)19F@QV300k}_r&&HU14rnPZy@0=TALoH|GvIxW z!?8c%*q+tzi!mRMb-p-M=o;_wn3wnT^4OopyKH~Mwl_QvlLi>v1kdz+>E$(R*3g#| zo)TkQ9?PuVm*+~K`|P|Qo#z;MEXi{%e8(csaqzev<$b84^!M%EE%xW|enIXde=#YZ zmYgWl-C6S{IKksq9yg1!2Q6IoyHHjYrJuiNm*@xcn27Jn5cg-j)R|Vi^$aZ=_Y5t0 z<0)G78v2NH$Iz5tFYC&FFRmS^D1Gi{Zg|fi`j|ZC;r+8b?&I+e_py1*$Kwp%C%b0d z5?$Hje&#*i=|{d1MV>x&dMX{-I9K$gd0fTg2_E0_etRBE@VJu4Q@jr}=aUJpvU!K4 zhv*gW^m&hix1Ts;0v+7CoDQs+M)|8I(#}ucpuELnY3rgfwD5&!kq3{LzFC|}OHLE5 z}wuRQb6(jE7 z(ByVi=Cf4z{chrZf1va=Uif>)#>Nug`L=!2I@-H+4eiCCtt-|20LaU6O=&uT&*& zWx{SBdB->{{n~s|$vqO4G?1teJ|jrVKxTnGhpC*0ev`N##>+#&@LE4C`WC!ggTs7x z4CcBsu~rbsh92cC=Ax;rL$lG>6+OZr3Nyd;2iQXU}ap z_pzP>J>0*79c>09;VVAd3GtjRx7#z`mXA()*STZ-w2W^ZMOV)7x&rsp&=>R(x zhJo9|VY`3$*nZ&D435jr2mUU8d^N7;H<%+Y;r;>JD{|jJj&1uf$5CRt<9p_>{dPXz zbvW*SD&cu7p2M1IYyXYb7I??^Dgze|{d;%bnEX_i9f|!r?a;zhl)o0v-_uuoe_?&W 
zJ>yqKulNe1(eLY9WnfzT>O2WE=4_qIJ_8BqJl4;ln*4gD7aJrg7mNo<4W%hPca!tNb<(_s-Opf`iq!hLUuP)C5CC{Pw~d-+DxQwijO+cmAq(4Z*`8r010E zl1Z{iNl48knI#!1TckwEir`1;ahNdIzep)gjFtOIe+Il0!lXzx$%yd7fQ=DvvdAO` z?@lQh-!0%tES`yYPeTYhLIfixtCWgw34l0}D~Ds(61)p?;n5jk9a6l&&!!9KnLfu0 z1DD_E!1B(#HJO z%65mzYH1r1-ZU&EShmDl6HJz*wjuxS z+ch*QB-r6JS`v(At3_@bGE8=av~Sm_jnUzdQ{v3Sf)U^1Xd9Ahx3qS|C(9{DM`(&E z-fneR6P=;))|A#pM@rM7;UU2(MvEy?b~yWaX4bAzaBv%^J=NjtZb`IOGO>sdj)sU1 zIX=~Hat>2o0kF$MQjvw65MwtDHJRlk*`a;&{MscQ5uKp8r#w_P2b=kC+YqCpyJe_# zuxt+rPBnFikCz>wE753n$RW+NRN6H2n?;*up6Ik`=Aw$@wgD$>MyDyxWCp$!)2R#2 z>{AeEW3<`Krg$V|wYX4nq}ps&JBaQb8{9piZO999^RV#n$QF-=hPQ}p85$W8-y+o5 zB2f+v3yVrXgBp<#kr?)(2TkP?h-eWW5f#_U7}_GC6%t5{O9+jUnfG=84UtE`q%_CZcwrUX<7aA3j5SG|7AuPf8Xy62*$N3860qF%W~vJI+K zQjF1=+#$_qm$gKbolaZpX3f;f)YMg(ngXR}vAw!CL+Q3`)?c4wNIQs|+@_f>MO9{+ z`N+OJ#R;-awj`kLtF@_(25@#ztGY|zw7TCN!JRD5&S5sF24oY_JS5m+OuTt(nv!C*gm$u8oOY|(5hAKES7UGI(nc+thVj3UV4K~_wb5$tV2@8WIpug~svUX! zFTSM4p+5AGhxJ9pg#v5Y0%FoKGCVReK0Yx#Au%M_Y_ue$8k3aZYCUe#%qMYfFf#!x zauLody3W;cppFA|9H`^K@0kNxk|Z{lW(~^9$jZdEc}udSB?@fN{EYcy=V#8JIv)(k zkuZvqq{6I{EGfHjc5rrBc64@3_MmKAc1Ct)c2@S1?40Z!*@f99+0v}Wvw~-Z&5E8C zGi%T++pLUP-4?_wNMDe-VBUfy3y?^iyQw(9a}GQ3rvaXrMu6jGRvV^D+>i$Wxs25< zH-u*_2I<1|0VGS84$n+%I3O{)bne9b)DB&&zW|b>gR}*tn=Y@X0O8dHcgo`cDb}St7m$X!SnB~9 zpbL`^$lozT=T4^qzNhO_jsRqz4xjdbMC%|u0Qvhpp6QGLWFx#g<#+Hl7O!r|3_#L# z7%m2+PzU*fW9jnR0!U+BI^P3QrVGQIIH05AKBStdOZgGJ#puHH2ZWs)!y$m!ba}lA zNC~_<4A-P&T`6w`Bm)?@!xRDXur8fahQPbS+zAR2bh)<%WUMZo-hedL#YzDr1Kyp^ zaJ;b=a6`CebDeiX)&jyUmm5+7$XHzo{tX|S0O}4C!4P;iBpN?6br|*oWS$PgctG~) z!n^`Vx-R9n09mfXXEh)Lbm?pYq^C|+F9K4g3)28I*!ueQCqROAd4&M7QU~b(NE2N; z@qkFWFhc+tq04<7Ai=s=QvunigDeJww-5nQRRbIXBup3P3?RumNCV80XX$ePZ$QFy zkbeRarIR`%AcJ&#P6wn+2bl^;jt;Vv)6wOg3rJ&K%I5)z)`e-*2&JRL=f43d)It6M z$Vy$TC=R2Ou_QpUba_n%BpB>+b8I#sQ*|k?0;E`%axNenbzzDDiPoi43djH*q#kx% zb<@SV4-l>i?%W>)q@gZWI3QWNG93iSR2^5-0ZG?E-U6gpm+~|~zH!mM1#bp}DaB@Z zOm@aP6S`p*)hw$ZV;rXJ@Y`&{bOry#yrV@nM>IDW4JMvKQN!WI7eNEN0oFzG21*^DPdmS?=Vtn-v7?k|9|(Cb)Ap+DwKN 
zhXKh&CC`rr|QsN{|h*n_xAe&azo{%AGL# zi|J5vET)7oC8~Y{Ze20?+rjCyo8nTPvO{Bz!67@5_E4mpV6YkOU^7^2NEQVHzG#KT zvDIm6o|cXU9$FIJ9V)q2dso@W^H4E%*@1%42$b!1i&cH;WJeHkg zfM0_RiO5rp)+^QQG<6*2l>1uygZfU%Mh!QEp_9R3ljBW^rg$&34F-oZ!DKaHmJriy zhM|clM~gGjO;k^#!`a1dw`yr6+GQ<6M6x8Bl0b~YZj{eZM0ClfmxQEX%F-nsI%&mJ zp()9ta;UHMacf-fxaUzeDi(&cL@Q<)-L!k-oTeb{3eewZa(1=aV=-%LmX)Nsq*oxn z9Qq8}oY#i!GW>8ZC)T&Nqt3!6xUw?dYK;A4#u1qz~1gvd3UaDxOL8@IvBcqlD zC=2h}s%5H)a;(#6cc$7{j9CD?cj$}ClwdHZSsA=#FGWs4HCMlaAc(U|VoZ*-c%JIj zzRJ#_ajB57fCOTlg_k^BT+uS-s%VkZ42p6Hm;^rBJmTxay7i`-vReB+>aSMwKo0&R zn7;z#I_MaN^`jJ5b~=7`u|Qq;);UPnL+|#5!iE-i)%^;1ghDWy%V}H)KBS`;CcZI! ztnOM1L) zXOWw26y0zCmO zVm}iUrqPT>j2D`KzQ4lvt^hi$)Kml(8Vj_pocf*^&#j93qNo5)GC+=cDRt%N4?&&Y zbcI%Hm?b{NEeb|13X?U?keF)GdSFqn@nojT0;3$so#J8(@gdW!`Gjn{blfT zXRP%EpF3mnoViWjhQ@_AHO`#xt(&=E!NQRD#@XJ+@B;7L1z!KzJIPL6m{~YavENv0#J=)n>$7f}Y)S^7x;h_;tD<~vhpej|7;>5EYyyY=R z<3T_5fe$_Da4P zpDMCPH97^mzCy+y_@2)9dwerKRSff}2B)0rBYo8d^8K%jLPV>i++wF(q2RQeuatX) zZ^oxe%H7!1BuRhB=ch+vU;H!+esdjoiq_CvJBf0C>RYx-;DzQXctR4*OF8N9MtSJ{ zRH^E`>E2jvAs|K-;QmluOFok z7iQBmeR!2-MBXnk;yF|PsA=U}I%Qxq9i(( zOotXxElH-C|KQ&>Ew05{LzC7CovkyL^AlYrfQz03R{SKhz?$bg-Q#dm z6;CH^U9Pmgb}4E{AKpw{=x5OL5%eRN$oc&nq$G+SRYCqA2;!RQFCI;a4fMz#&ovVA zNSfDr5HPXU#W{4)*x*3EULa?TP<@MU@|3HunnWw!wPiFn`z9j}9UF3u_^>x06_Sp* z>|KZ#@0KTx_=QhVB|b26$fHNMQzkx;u0?&s zI&P z(!4=)#0Y>{59$1Td@Caq&&eYpo_{jGK~r$emI8NTaN_y)YjAJQyZpws=+WQO!+6m) zdD-&$Dpa#}IsgM|GJ2$6WA_yI~A=8k~*Y|!ion$H>L(&8%X zn@fZ#UJ01{g619rp&_-*==2vWOKX_(ke*5O5g|B=O)}z>iw)!4A>rfcRjoTr*8X9F z_>{GC3N#H7vVOs{QMwUk-b|gK87+Q*9K(Fkd%?>>i~jnd2FiXC>xi}9Xgn3Iba!&t5GxX_`QL6F|Bl);E2Wd35& zyYV7jxDxgs6GUq*mTFZ&G9i_}7$OBB%3ZfmH!hD1LREG77RmEU3TiRe`ZY8|Ca$6- zgS4r}xt_2nyg3`JD^Hj4QM%251q}1~WYK{SDCY=oz}yLap+uQ*d%%PaZ9*5}FG4Q? 
zRQNR&%nfLz!R9M~HLksWL$aug#K@t6p#rlVcD?;4Fm(>y?|zM^LZ1GrgmXV+>E7b< z=Zg0qAQ9ep zqQ#;@5y|x;BOshCic$vj^c72s=Sd-YAKMi^5kB}f5ZBEn$(oA9^#$S9-?`$h;sQao zO!|6-0;TFMNY_1`x{fFb^mSk1N2F!y$G`E=AH^3aB>x#SP`#;U}z((ie(-%D2FSC5HK)BvqBS zc1DYz1)f87#gp)haEG*#EV`vzfiDc07KqGDx9~2F3%@oP=FVhMC8dN|i<3p)AR5_9 zUa+`DM9HF?9C*^JS!rND1sJgA(WxJkX#EuQsXxz}#@wdGclFwPha$;W z`KCCmW_{&UKq@^{^^gER&%?MXJ%R*M)KM5O(skcjq#AV~+TxzD6dLpTd%#+&`kN?f zc!{(`{&lpPeA(<6BNj%#^DoGYkr+0KOh)#&`pCta9=l5==KmK(pJHaBS)T|A68W11 zV@r`^%!9xV#z)*u(5t`AeTHX?bpl<1tug<4bRjbVa6^}w?a7xV^cT-DJHP@>Zf%twn4Jnnv+7sCY*S!%T3@9wkYb}j6S)EdaPy$d%y#QC z!OlEv-A?`P%b?djn?f%diN6#pFk6H`!QwKMrBaQKsuWe>sLW8YH(8XRj;QNIex3CA zz!8DD)9PTZVI&&PKu>@nD0_G;wp&M~H z*22-taRR>v-sIQ&I`NC~Njkv!G2t=&QfRh6`65Ifq3}8EOa3j=iZixJhQ~YtdZVdq zT@8B4qBoI5eGRIptv+((Og$>o$@N}9THmB))$Aye^fg8neaNbhZ3&GC#?L#Grn-w# z{-C~QVwMro@ zgxsS)CSx5?gt^vGGg)@;AW!aGUiiMpGtoG0sz2AulIJjGhk5PHjPLIMMLG$LY5))6QT+A^w?0Cte1kZr|5Q#@-B7>W8OsTmI*A!?tCAzy-w{7KIpR-l_zM-L^t(HM8^MBztPEgl z&*`&%knBQ=2T5+CmPLcAT*E9nbQ_Pb!dq$*m{A;kqx^9szi7w(0x`>MiyTUZDzPnI zb92N-xK3|3N;flOYCj})AUVW{bmtnCABR7MB^E#(0ebEA_J)0Gou$0BFMNNZ3N{X! zt(Qecw`<{^?qB!KkLRp^L3l1SaD%cNojL!-Gkl7X0C-E}$biuCOi2d0^KkpG=(G$F zDg1Ljz7NK%$O;YYR7-x5BUzy#kt3eaL$R1`g;a6#v?vJ zh?)f*S!IxZe&7N~&97B&F9ZvPZ+UCq8L`4=v3_ExxJo~9X>qMzKha-2BT_j~zwzrk6s|{7tnrrD-gkePAq1!)I$G%157eS0%ot|`apCYA(M6)I zN-RG*o=(*OkG@zhV&aWB^6PZ|Y{@lWl?gSOC1hG}k%as>i)FP&-GzIhFYREGSDm8t zT+S<`;8q)v)}aiZV=seo^!kRzAx6A5H-2Sqz|+N}e*5Y)5^3w#O{&=!zJ*$z3XcMD_;}cSjS(R3Pz&=BU z4!NUh{Av~`^M_J|8T``qme5UxWcIGw?;SD1(ued6H;cS4# z4uqH~&CX>(#y8z}o$q?z4cBex%o$IX$g|Zp7=-RM64}jsH|Zo9k=;Gk5eUajr%Alg z_^_oCly2t3bd<7mXhv|4~S35^~b-PUE}G|AKz9J&#mzsj2!Xm(evnqvx*%#Ql-ai9s}_~ zU%+9oaFO|Ua%n(dOT+tcaeQmVU343^ZlYm2yE;R|#dDt8s%-qKD`c$WV}`tr+1-_v z4;bS?3c%a`xW-C&C?83ImyO&S(@wsOQKz|*i!WSEp{3jHJjGUi>C^Gmu`S_zKVu*Z z=+Uo$oa(U+J)nLBvl~pzKa_!s=of#yE*q7Lt{pcZ`|JscystxJ>ulAPY}J+bB+)wE z?aK@HMGRc;eAO2p#8~AA{z_R!i>3y@@&$5zWu4?0m7Dd|zoi(1rfJRkT|Xyq2kkK? 
zozC>zrQIX-D7Nvccg+-oD;^&db(lgL>CY4=z!a2PemQK}yfolKy+J{R5UCNlN}%>Y z5XPTH=h#(5W%%q0i6rvRz8ma1s}D(wR-2l%;v3e8OWr zjM=A~smN%RVl|@0%~Y?~3wNMny)xTF|KNP;Cr&Od!%E;BCc9)oc(2gGcH7(zA z!5E~fIsD98HJVa%L(74&*+{u;`*d5vPplrPjFi3~gFLi`o*OfaApCu>5vH;ppOwPq zXFMvE+z1Gjne6JmzqkDq{?Sy8SQQ1;l+o$f&1Bf$Ce85=D33nVtjTWq&^Zcvp+J|6 zWS5$^q~%QkA+aP4RaGw%s8Y#kDO*5#Bu#icqxBE!$Pi^?zbm@BrKJ~h2OoDgsRX7e z$5JJWZhl3mJ}jFDRwhFiuvn41Qx-cRHw|9KRc`Gm;~gQPpV=eyoV+{GM!|xrOSw8juQ1S(H9d2~ITBrr!FYJo;hL zDr+^pfJ~u)njblf$>J5z&{81z&D@|_m5cp+Sr9EUq;brr8+$@-# zhTjqQR6+w^LAbkyl(Pe7Re|;TjZ{hGzeLSs(PF^3Eu_a6ynaGv_lqh23i{xi>QPZx zx0K3adpu`3B}8Z9`LZ)NpC{27YHI}|3-K^d1HvD&Z>vg%-nNPv?(Wy?TjDumoyxJ> zsS?lmI!}mMxE;yz5*Osp&d|B^gINLO5~pjpV6O-Xao(XHCrAzU1~&qWPiM}XZY)`J z)=MziT)58}XKNh!y%?tZYsJxIb|_sApzDB^5&(QK@#DbD=@Apd|TAhO+N_Pb*1P=o=r!HnS);P!`0-`1Lhag=*fCXI_@@YeQKg z+Fa5r75;ctUXTe%WqV`p00ReqYa_czttG+A&EdCjUeoeozF;-n05u|qa&I{SVs1dW zsg3YZVfijY?LIK=!NYO+zHSb9Y+7-fCisS5t6y;&Scd;rAGvLy7J5!!Q(ahoz(`Eb z8JaN|jH_d?W!(dZa}W(OUJp`wplfL>I~rf>Ue${~>|Y5cLWYYVQ$et9>}Z@=pSvrc zrc=Fd#bQC=qiRJOY;)*sJe#UQqjZ}uaRo-C@a}+1pRO;^r#)EN27Q=8R&Hy2B$GZ+ zLN-6jw;c0Oz+iK;OOJZDd1>8AbmX5c*(FlH6$I6(f&ixqf}Zvj7xf+I`W3ed`aO*2 z*n#!ph`d9r5N|MRs4r2g@Mj?s)O%?o{BWS^Yu*s<(U#C-hOhxcYgD#}Er!-vmyG8e z#*CG3PvdVeZf!1`4K~a!r<}uOhVJi@q~125|2nDve~|up(uqK2o8r%SRq?^DHfwlF z9)j^LC{xSf(~NpO<7o1(Q{w?cBn5*_AZpYOeni0^qLU4@Uq0@ ztfAEjU-EK1^ofbd?In(Xb{Wp)X2hd$uz$eV++bxZlcz{K2yGAVp;;Gx(DO?7aiXUb z%?lATaRbAH;a>iy0!6KdB=R3Kv4gGGvbcgR1X$6pNMMJ+8p-PYx>HEAUcFBBdyp#e zTxqgs$u`!H24`!n=)`2vXFLJV-Jf)pl%Fxomt?luAhXrQqBAS3&j2Vovw~SS-DkG6 zPMbjznO^Jn!zH%U?66*=7_(Jpi?X-7nf5uo=ikd+y|J{tWPK=Lb}>~w&`}jW&d2!s zDyvGC6*6gP-s=qyM}CUyeP-Sj@F{x>KC`M=40DP|8Vr2oBM)|H$qDr=b}O4vFJ>RqWUlGyuuh>)EE%emE&mF@K*N33 z{%JB{2+GDYn=*EQ!TdQL08seh__#cDfH^5I*1GI#8Q6ql^(zkwiuYI-{FDa{z2BqwI{CIkQTa_#}dVS_$ z0m=?Z(X2bc;pwY|<(3Wbj4Mu2rWXkcO^`xArBL@yxXGv1N2?|64oTaj>>K{V+Vkj8 zH47HL&Z+{TFEV@SN`LW4AK6`KN%nHL|4e^UzuPn!wPYxgEnB&RI8#D#Gt}01qJT56D(sNqStZ1cNG~{tY#< 
zzO_g`(o|q=lEKF9DuG;u!ZWOTo0kpsnPUsP?-M#;yDl5r{gES|IDR=_yF+*Z4tmL` zyRyL0tJ`E3$!v>lUwoR?0sUAkVjoG(DP(=}EU=C<^eEFYjtc1}s|uPgad-REocR&A zO@Dej(iU9trq(Qde$m%kt)sgcSI!=m>sUPzo1XtpDFG8)6zbS9{MXgmM(VrIk6xq3vP=@E)&*I*T5=WnlZw07Dn2M>=VV+SVy4HssQ9JwyMftphWv;3#ttky2XyVp z^t@Nk8sO|XCTvwdI-o?w|#b=fujuzTHzA1~O!Zd)B?^U@RtMb=kEA9UQtzR*(9 z2m3@HR3hrl5ybk@NypF!70QcD=^v=M`YJGt+??m>{vH)4NySA}?A{b$Cr2GAs*$uC zCGC%l0}`F=e%z&d%7696eR*=c>LtI~88n}fC7D${jFPXiRbTy0iNvoc@Rzn$cIm52 z;0a%+PpNK_bJgUjd2fLp{g8t7k?(pnJ^B{W#MN28wZDD%;fKw(fc|I~l5mDBvPC<| zi@fJCHS=Sd!}gV@sM& z2YiDe&=_GRUu20;sGK;iX_lgpv*MINq}VRTNgdr&erSAL0i-yU#&g3%gh=BG5P4@t zq4d72Kf}V^XE}QJOZ%~ly(54rb0{ER$^hdRdh4s^Koc2mnyGBp6F2c#+J)*`^(ZkC zeWovM|5SM}m2s&h8$us!ZzZC~X7by-cc30yOuj-nFT>tWQh21z^76evh@MT!jM0D} zWLU+VS!Yq+9wqsXUrxG=lXbk)AMjW`l?H`CdTb&wIi0%W3-COLpj<{10$;(|z;daOHAuvf)Yw&&w@33VeB|_tqN~r9pT-3=t5j zJ_&ZPko^ZT&8~y5NVk0OH!I_xl0^~+0O;v%9+ZDP)gK>Qu&b0)eCU-W`JH16kVK}k z@+)@EScgU-#xlw-PI4+s=Qzj-n)Ks>%JB<>btd16L$ZJ#YC!R_hhdFj#WX%CuiKl_ zheq9mVxLm7_0?t`RW{?en&XObc2PLW0lb98)B~Huc#JEq99OdJ4U9+gJKoSnjEAG1 zzgY_-k1H0yZ7vA1@*av=dk53P5inN~OJGfCMF?wtm>M8!+yvk1VwxfDYC)*&A zR$Xm>e74_Rca$??2{9ULb%<9ZEbw&*T8)cr{zLq`OuTv-?n_cr^z)mXNH`!C*nh@m ze`0Z2r1hKPA2uf|efs#$zz%!m_#};mUotBHp+^%48a>cgZKP9%oKmoj(nFUWxbUH>0(hLA7)jHbFjt*weqWSFv+*# zZ8@jaQROMqV}7Tcr?)#IpNxfO4Pbv95T2$FjLL(0ghgV}MZPED(WLJwev;k8M4!}6 zQho;hlT3f$42m~ZP}HEZ%$ZVb@g|cG;f%fB&{j1oYPW_@RD6-H*SHykm(8GZ_ql05 zW>SWK;7+n=1tNvMDF{VA=-UbofHcvW_6^q8WHdsx@Rl!ZlF<*>%D#e#sa0tE5~xi& zL0>fnQnJwMTVKY9@$$9Va;9iOd!X&j?0UVSRjWy4 zm-`}b7KDn7#P|aJ#DUD`~k_oIZdQ9Avf&zua{; z39e3bILXU!vIUfz<{VuT;FQGpEN7HE#(T=_<6Ws-iqIYsq3u(CJmno7m#VaO zkSnyY3Dk5)4Z&rsQe(IJQu-!?7!`%@@>j<|=*Om729*-8Dx!psD#DcF!R=AP1zzE) zQhy~2KEfCxM|$)L+N;K~6*0Me5d34$<0mEm4w>gV0}-z|%2==smUxl`pZUCKsy?C{ z`guxhR~MkN+>wwmFqq><#sXQ|q8`fI{s~RH{qUQ+M~FW*M!93L>yib;1nOd3e!F%#ci5Grt)R2ZyK>{kYVlE!+V*#m?6B761F*UfD#3VC{%KIM8M zZ+gP4OF8T*EPk@XFB$x)_#6Io_zF99GF&a{Og6;R3@Do}{*^;TKS_}R_5c!k^^;SB zwIZb^V`J1%O*vYPS1^y6S2fQrsPI^4P%EpCF1t~bWG}JpZ}b)m`Q*qHY!EaL1R28E 
zGY}QVvG+i0k^_lFSt3laKa*~YsaA5A@+>xxifAppKb7!2PUErJZ!@-4k4~gJ_8ow6 z=!2Alracre!dfxm9Y1`+y5O=$=un7fbDI?TAvhCCDYJ}5{N|g?aT)Rzhn+D$jYNA8 zr7}!XfBVVIol{sGN{N5Fz#qR%befn-Jc&r_DZYbkVl6|**GF#kVl=yi1LN1``6@T~ z^qLJBMzfzE77ip2QOk>d0UOSyZgZwA>XP?!82&?_H(WsnF zE=xuZqtv5f70vBWiB6}OukAggWI;!uZI2@@mnX8%#33jMox}=$ae+@iF}qk;ehBzy z$cv-TyF{PwQTlu{`aI~_%|h=N^?BE^^m&))^DQ-rS$Upd<-6=pi6F;zExQb?z}#25 z!EauhCn*(-oy-zK7{qQC)FkzTB}4q?<_xV6{Y6R3pQrb}j_l&| z0=$`n?Ks$YX7_x>=RsLV7;EN@WgwutW8Xtp`0@F~zzei0_FGa${xu&w=|wF*#h%TO zP#cEcsvNHjaCfVkSK%-8&i8xmd4{XM>|E@6rsp8@eX;4i^ZmW-@!(K)%XzAP2VPu# zD2NF?{zJNTuIiRujOdx|&I~#=fA)0Qr#iwIg$BYFdm%}c9+d-@Qr9cy!M1r)^e_(Y zw9lbJO25mv#O9C-y~F1-6#6Q8Nliw;9TGkEwG6Vy;^eBug$ortX`(C;+32y)A_IHZ zyRxdUx`$EAN8*%sc#)5l-?2Qvv@*To>tya>v{?h_gNJNEUKO$A z<&DIZ9{UmoQv6cs-<^9=WW5l}KVU;74buVTNhb&dS-f9I&%%G{rNN;Cn{q?Pb#TgS zl*}F3#mDeAc)BM#{0==%qeS>*w~P9!Z8Vx5kB)((lz_pXVhpLyJ%xXHRLG?V} z6GYHp=-WGpKr>l(*Pm_;lpY?RIQ_WTc5`dBe)%&i_J}TnxOBhE7uk8ln7jR^`uh5| zL&K|C9d0{vW@&qf?HfO|s%vjMGTay0;4$X5N*SZ=(3z!OW_$RZYV)<~(r0~(jMQJj<~3NSW;hXb2f|ME-2Z&vBC9YoTL83*1$*x(xT z)ySLP$d;+rJCdN5!b(^}$O@hLh4qUKPKneu3Hu*V(%T($^^?A4oVT9E=TRJW(_=yn zoDBN()I<&>edviC9u=AuIXp7-GHqJ7083UNq`#^4l{y8#cim}k27a|xHMxFIT^_~R zyIC?$bv1g>~DW(y> zTQNDON|i|4<>&_0o(PIP7^UITSIVy*Lau!hiM={B-QHP(*J)|^j6!rYC6QVEPJ?=g zzA!ekb9Fc@#m{QpP+$F=KyKy7$H_c}lF}%)yswv`?NaAsy|PRmFn9Q5EAW{>9rMIP z!;QHwIShN`On)h6i4oF5BQt6{_OjjF=0j<^?A>;FxZ?25aQIvCaU(e`9?t6=H&O&k z?KSu-TUm0BPsn3AN3S{LfHgO>bX?k1xlN6d-`u88WMA=BRhX+om}wteB(G-MzmTK*&VEpz^{Bejv3Zp3%=4Vt;<<)xG zoXqtyePs_-otHdB;Y}3;tJJRaE?$O~#j@$nV%ZE4BCwn;FNREI!sV`(>Cp}#uqiP@ zR$F|LL!U35iZ1h2b}g?JJ`(thC{{pqXFv3|-bTi{^^En}U%uIo-v#~O=R5tKBg7at~N>?IBB{KB-hSHR-JSOSCR0fw;U3l zlL@zlmgG&D(biiRiw<{*F(z)6?;YoBuLaKKZ#`7r~E0T9hElW$KKtfv-+ zQ^%RrzR8OB9|Aig{)l=Dm(3IN1;4qW#(d5iGg@rUN3Hnb&&W$t73Hlf-qmJE;*D+f zyi>hWI?cHq6k4dbfz6-AysH`|5t&XBy^E-P>Uq()g^(n1`9*2f>n}x%Ic}}V*zG3_ zB(BRUKM+ieh;x=da%e>8Y<&%HB^=OWKO)yiOv;jy7V)6@VZeM{W>1GkgkO~-d*%J* z*L>A|KW?Aw!Y-8tjne(bwj(G)>mSe@*n)047lRO*6Od}5;-%H1TXrRCcLx%US^ApI 
zyqS27e&-sR*4Jzg)D_+n^z2FQ5j`H+09K&_K{qbws%Pd8r5}H>{cWEmCMB&EG#+<) zZmifSly|o0oulI}|DhBD=BMv8_ zU$cRNyR+^agp8l>Nqm~j8D2}oDx*F+_7>_ena|dv^NES$94kRx_|(hdyx12i@~P`G zR(f)onPoR8{61E}2PzC`p?i_TcPnP`9M16&TU_wb zA(E$f#m7*o{Usi=)LuF-V!UEM3y$a}P7>5037c{sf~|fbp8ho6!5YjNQl<%h{b?>e zc$F1M@1=>;Px7tUP20;kp0~?4c(dPZJGj$7xUJ0|?jO9<_t$@Ln%r0VjIXVmw-PF7 zzCN~V$-um3hH7!W97WKh&q}H~957gBQ_<%bi24=pB`;BZ4w_r6T9(jlneU-&WVl53 z;YV?;W@Hcse8m%-VfFl-^4C({>B^LJSMH`OyUvk%Rf=oGPFc6Jm80f+{Y%G=ReB?~ z%uc4s)C+s@u}XIc>}{!mUx`x_#0XTV;}L3znNP_`&Y)MCRq;)pK<;|6oikF0N@{bh zg=F~S9I23_B^B%&=j&@G7hj*!!KiIf%WBkxVIH6z^-Aw2wCp>$fnOHP;0bp3q&%8 zFfE{W{D%}g=74fKXogkxDpZWqn8Ht_CcC!-e+Dxn&N$S$Di}6U+9~>ws6nN(e#6)h z%L*OGFRE=jIcAo^pWcPPQiswzGH{9@#XvicB7Swc2bOyp{DMMtynPd&uEdt`^XGmisEV z=_}=2foM+e>;7Ud{O^?)vPxvizw(b{vCW1+OX(3b!Q%WORg-(925;uam&^Kqe=swx zr|6p;AN0CXe1OVGu{K&?y+`2aPq(l0;B5F=+TLM#wj$9-FOdnkq9*5?tXtW8upgsm zLP&*w7EmtB@Bo+mrMQ{UJ~LHW?2~=1daBm!7f}ULIoq{{ zk#^V2+xy5_nPorhVnP8lB5&2F3v^MyAzPy88#9=?ZLxm$3ivFhhNDVtKN7^9RrYLY z4ion4@;)Z)4MD$xyog<TPNs$R_OCtX6sMvMYu5;@q4cTGZFExqx=kj*UG^4og6! zKM>Z_jr(apBy)*UMhMWs>BUR!S#Y$WKkZn4+d~I)d@QG+_t2GIwupjvB8> zdn#ucIc$70YiZzU(%$P?{8z`^tSK+kJui^^(jZf7sj;_513}eyjI#Mg+8|?~69BZn zy%SI`QjDoVs->m9Ep@*{_Tyw{9IE{%QT0Ak`L4*})0TPFvR5B>q)PL&vUHQEJ*9H% z(oY!w8ZSz0D2s&IYMQyheZSNzFR+>~Gu1k6Sy+K7Kj8R()bEh`GWq(LSy7$NOudL0EZVv@HN91AoI-}*@^#J|x6J<$K&T6hKh;9#mWN;96fMLqHZFi>2F~BNL z*L6QAA<5mZWk7PtxH%p;x?>NslQtt^93|YUCMNnzRoS))N+27BbKp_;_080|zw*|2 zBlqPz%i=)gcMA2pWUZL>A0u(iP$MySX!-VF-AimdWL=gRJtW|HFFvR`p2f`_=(~ga zQ1m0Y8N$ce@auhMC-XyIgNO5ZIUqeB^u;+TQxJ%>aXUt9Hmly9K~E32G|09Jo75jd z*)Dm%T&{+xI%>4pUUu?=l`k}PRL75F6N-MW&1-u`DXwa}>7%LlypcWZHT=D4mr>Ud55~9#F8noS9@%~zRj`J-m!0{)Fy#zsKt7R(uOK}BZ zjrnYKd@w{|>xg|d&QG&cjHM`v?0&&#W*4i4<3Vg@=}`@R?l+%_yp_$dQ*s9QuTRUr?Sm1&eFZ zdME35ULc*1=ihZ5Cbq6iF#C{qO|!QFd6RzU4}l7aRvl*l!0+KWbB2ZY!;B3nXFcT! 
z+5(St15OPe$_~%;skxN8J*HWleYv^mIQqcCfUrR3!hk(jjAM!)#b`ft8tOnTUkU)B zo&(^yX_%9XDNTpixjqK04K2?5SA82^Nj=^jRQ%aFt`K;d-Jx8^DyHz#V*M$Mx5Fw9 z^Q3s+)8Q-a6vnYPs|vEt(@cFqijuV{GQdf zPl*O8aqt!BI46>1?c$s8WF(oRuYN-u^c5RECn(1O{p=vqQ~gK6#Vyo`B$Yk>mh9_AUt0VA%i~6`7tzTZ<7v0^tfJ6Rcdxl9deekMbxk zl7Am2-ye4}gz;wysR2|GH}ZHDNFasIQtQHcpp9clxd5DHw(HT*K-P=JpJcCuB&x8% z&W_S_(&Kk2^hNpC$Rj|G=d8RQyM!n8g7Zo$(rPmST@vi%gLXcN>=C(B#o+zNZvD=m z!+jt8*!bgS1@cgVRry|pjXlzIfU-S{BmTsY={(}#+uki+r?cV{){x23B4Af z5YS5iO`rcpV_l?dUG#|lcrCjquDns|BUb3;RdAnvPP?d-E)LiOZF8h;0JlO)^VTNL zMG65}S*Ij}DoYG2Hv&UlL}ApyCl`+`)K|;ohj-$_hzyK6i@FjTShJ+i3i!N_u7+!A zF^T_F`KkQJbSHc>ikD`;&^TvoR71bWdXiS_!Zwp-aXlk;!D9PxE< zCVf};ct=jDuu;A50?#^qaK`<5Kz$Lc0dfL&)CPem=ilKu>G;Jkb@UJp{6V<1iR(Dg;BXrkNyV*^v8LrQdZTBWb={A(7TMFFY#TERQFPS z<%!ZoTc)q%Oq(4E<+{k~(j&5|%gcIf+qw+L@vv2^cnLjYILo9a z&x>VP3Dxkz7;wk*V*EIv&ClDIc(7@#|GeE*7}0H^!T!ouc-;UpBE&POF~@_x^K7Om z!P>op6a_HDx##&?r-8r{($jdz?);(WQ8cKLd&N`Ry zauN;HdN_2j?08w6YoY*6qf448gEsc*Kj&goJ$Ad&J-j8awh-lZ$Li=V?k6VVs3a*g zf+p?U2NZk^^o4IVavx6S4T}>ekIRerlwKR2yf0fDTB5Z z*QQOs^BFj{UcWkjv?MD%IF3Q2V@f6U$`fr*rOB01sFo6gV5L#Gl>5x{ADv91s5Vhu zZ4@n|s9A}`5XU>zSD#0m1N+JjO)Gl(3e}jyb9!VO$u6C_bx6@IAH_}GV4A|MvOHFy zNU6@-_u7)!3N`TXvs%vG!ISX0Siw$-i+hm4_q{w1{5j%02-6ES9|R3o4rFqTF+}XM zoHay|IuQhXkuv2mDPu33tW5g>{qhboElUWz^#t1j%CM)bJhp%a7t;&rVVQjbV~8FU zx#qdSJ`Kwuc8pGg?i|O5sE}UxM9n(1@_UXT2d63qbL3?flJ5!%C<}e2(&GkcqgGZ- zrpp$OxgYMFl_gtjbCmZ4pO(orQCw&jl0`=@ysD8`b`dza?v3|$e^uN}cl94aR{KF> z9K^F0qmNW|nK-O`1U~T}51{aI2k;&-{v^~SmU(K-&qP?8mln^^^w^JOmuTfe#R-*; z&!I=(AQ6@qF+>+@)+!*U9@vqTHxlJk{SA=ZY9x;<=X$d{tQ%BPr|U`9nEy!Brp&%t zUKCezg4&nim8G3NbE7{or_A~dq!P-ZQ;sn0whqg)+0F}mGWw^XqLmb)o~>nW(Wyd4 zs~Ix|QOh81?cjDjR*ha@^@x+*$0H|t>;qP%s=3@w{MQj zkd(#2xI?rhMj3YxFj7eMrkflbjrQ2DVQU4!kSOikk2qX|qH3+?o z*+xVI^H?24-PSppynzwoJvLvVn4Ej7^(QB<80w4cA8y~ERFG(NhSKS@=s3sNf5uVa z{tC>V#?Bb6JGPin z8jGXZ?{TLi!dq6p<>1TKSX$s5PD)n5qC({5IEZhy+kZqBsNYOA`6%^TDn6BL+|8h{ zV;w$c^FsN*DLh_t#BYvNO4Gw}meM2{4{l}`#1!wbK@BT_<;wPz)FQ3cWo_#p;3&d2 
zqpuIi-hRCpAE`wD!BZ;A2gGLBav~&g_2-d2pXKeg{|sirg|-unEp&Nh0fKZKcOu}-Brtht2_xk+D)PW;|kPy%xC0k z-WSAmP;Ldz#EWv_e^vK{ls@yVyiF`T#$D-vRd9-QSj)&! z%2zQ$GkZY1fyaFo+z)5%f%AD^_xEz`0|?YI9yM~&J!q(#FAeh}w6U1pvS1t>98+5& zBw~Q4`XIWnjhCT!jjR1AvgpvSDx2&|tcSkpVsu>uj_^1$8KSjPu|ZU%T}-CKUj}z< z$;4#>%2p{It|{hnC2rigq9BmCydap!j^$Ftx*MOu<@>|d@9^9mKLMO-?Q7Js0rDry z2DKtK1&}?aHv=(qb|OQP*M3ODS(hNCW{K*jyPqT|EEeHi)uIG@TFy`XYK10WLf>CS2cbjM>f2&c{@Jw7 z{{H*Qq3YxE*n^OH)JLnMK9+qbL>|>H<0a_p-}^TCdTbWI&3o|PuO(tnr4)#%nOFBZHf4eiZ}sasE`hQ#mmF?SM1uM7fkzY#fC%pU` z=7+x0qZvjoBia(r;ypJGVzYnPp1otYd6-ii=6GJs9DCG()cAU!bQd?x-N*YbOoCcN z=VA`CyGNw`@{fX*FX@p9&|Uo7U-Q&>#I2$kr4}df*wpf=4W>~Pa(j}pH`S(ZsEAke zIP%z1>wYi;JMmZRu0Tcm*sRTK;ktgdVo&YPyb)ZGiL`skXd2?)==#1q4E$JsLi4n9 zw^=`e_j^?5m$}jLZT4Kv=hbXr3pAsjVikgX9FhG}_gtr%!`zvZpcE5|XHkQe%` z!Szw956(a+yYIr|+WS8>XS zT;JRLHg417QrWjzHMN(L*N0uJAkX_1OzuOMMwa9bUdGvnjTQF2ctR<9iRCSUSW(v? zLDr(7OBskUGx^Oj>hM#9Xi$lN=oaE7K<9o;Xmtez9O{qwFp?U7IsZ}Sk5A)_!1nk&toIsq9O02-`@NjJNbOgu{wj7Gi}`?C-@27LfE~p(TahuR ze^-cT7ClF5sE&=ByS{~7PaUeRP>pOkfBGE)I$g?!*%3snJ;xpU zMdQ|$P}F)8cSKrv8gLf^E;8nEpwmiiZuq_wBmae@WYJZ(Q&cf>G$n6|TINYbj}1cx zpGKecIqx96Qn#cqSodwH_*{U?j4D4`s{2h|C3`+t_Yp^}_@`@}P)JN4!^K9|=jlId z^9`7oYuzbbS8to?(d*$-Bhi3S;8a549Y*C#O*2s%oh*`)aj=U^_}K`|;(9sM=;=mf zSJPOz>mYP?rxc*759Qg_bl$N!rNu23Y0d4dk~uvSfv(2)f{B`;fyBslfl6DCY#_BJ zQBT77Q)T2i%swG8`VsY?#CWS757VdrU{kQPr<(uS7_m-u(CO&T7keS52PjRqR>x13 zQ#8zELfzsFGU!y;er&XQ8Jy!Maj)1e_xQ~QUWtU2;bJ*&BV24$9%}IoB78%+;gF%>YKSiAXR93W*Aa)=;%zmAOj7}NxQQJNOQvF)?acbGO&hZ^DMl`XO^lB0bpC3ItC%xQ~Z&G4V$>&u8G zi#9HY@qX1yFIihk6rFvMTE{i+jThc-tpl?AZvbLqH!`rRZz=x(qfGbRyI@-scKfUg z6?Ov+j^KdctA@iSN+CK}{R)jqzlSYQ{SFO}j9GCDm^uG&1neH+DOq$j*x3VV)2Z~R zI;GelAiwIY0)9t)dh^1U=1VZX&~Vtg`;T1W`xI&NDgVH|?5$azdjByQiqXrlHuI%; zfJ2Ng#cwWb`$rCMzY`)pi(z$&Np)c?Qw3{Xa5@ldWynU)wo z(6~2{m>dHIW=`9T(!-HAvqK{SiR#mPg(7rrCf$1XHw7uUJX3zc?{Q;IV)PpBX2Z8> z;tJoqJGk*-xE{TqvcC0RetD6fe-~G}KND#0DE7wU1O4Vc;eBE09lJSGI2b)%bu@m} z(H-vvE1z$CBPg4;6HZnCSODRR?B)>CmUT2C=Z?5e(5F|wZ%!VR?K9`2g?1`_3aT-e z*U;3JptW`bG13L{puwM@x 
z;{82q2rfPgw(zEN_rJNgK*leasH5qLD8IQwnO2o=NAm;{qoYRMc=ATB$iO8tnr?4w z+&^HbKhC(w@x51^<9p*wzlXaaH>h!S&hZ7&RJQT@4*nNDbzvGLG2L65hhk z2FN=s-pJ|7%6~LX$!xQ?lNqUMOaJszX+Dm-juvY-2heo-_~7Bj*9;Dv8Knn}$h${* zN4-LHbeO|~%|fMkEvnLD%lJJHUc$a=^NK=YI9FlF(Lu*Az~$M-s-r`}&}+DAg+luN zWzjFM80N>UE!lVC0yyt?;wA(WS3u&^Z|AymsWCYU)`}#!{n#E!9v_|tymYP1GA-&7d~)H1(!)Ef zbRj`#5dS>l5VNYpx@k z1a>mmPsjkE`pcWfAN%zuxR{F7f}c|x;@S@}2n)GFhZ^wHFcQSySPypHvYcMA*j(E^o^p@MA zjJhs+BH+Z|b3v@FlD@5b9+KGa11+Ubkw5+M<5uSK-|$A@#$;aQpxPr-mY*d{3WY!F zH=lL>O-&hw4lW3?Tlx&`KveawgkE+3ov+t=U1>o4U--Ds4A`k(C2vh9+ktgFf6g&@Twu!c39XDf>z z{@eK=%lSVZfE#GZeWCJi$;<&FV-5U6;73SdEj;BQ@qf+Q@w;01cj}J4OpdrwJ^$Yg zd*S^8rW{+|td+k1GC)87AE;r_7G{{J-mlPcef|K0tbuW{jhS$L+Kp9-`3)!!59 zJHGAeng3_!hQDybyWQ}1H*9pn8{P0qHyrDR=eyyVZg`R#X1U>h)-Pn-_qgG9H*9x9 zS>I56;f8m+;q7kN=!Q4C;gxPU)(y{h!!zCRBsa`*!~KnJ`);^hg(81dMr6!cHzH)A zfe1p?wHufyKLOR`oyF0zBUzH%uGaT4PD6~fA3gQEH{{E|4*75<-q$mT5^F}PY|d+> zNa>s%M3@GSb`G!Nm*lib;rmmHzzyw-Z|}3F3KswV{QrmobGB+#o{avY@6{)jB<4Hw zBbk}cRXl8XI+<*}#5p)7d8|UEKf6Ef3%@v><3xHAee<bqk>3zy8cCZ?s#mMl zuH^4>zBgzAZK8IS=F@`2{5%`PClU)%>i;VI5wzme75o}$djXK^v^n57k38wgOzlQ( zAvq!9A!353l=Es|b9?tS7sh4e-bngfa_fmVYV%b;=MtCl-=W@YQkSR_H<40LzO*F0 z)wF+Aze&+2^d+@5=p6!%wC*MBOY0O4eOr`Xr|PBp_&=}dv?o1k{NHM^PSGPot%da6 z3$=T>Kp_3Aeomo>GAa(&IAc-=wAqTFaA`AbV{!&Uf z(iv+Rcd6@qE+rOX^c)Ggil;_3&eC>(ugKsea&IN4udMZIDCyb%1kSPQPgOme?qZ{% zodkm`z*fQ~Jj>|F)0fwl%E#hA;kE`~com(FO@WD&5l&mecsrv$im|BI9L?fTMdKfK zAFG7Zu2UvclR2EVKsyhJ!ucZO4hQw>$tfAqlxo?L525*7+7}%z8opQSe6gioc;ghE z94+Y8ZlP8Pj&vZ-r~i(2@Ip(`ZL^hDcI=D5NO7|U*Gb8gWH|ajWF=GL=PaBLL#p3? 
zrwu9t{XnG-zoo4KfnA~uqDmccvE=(>;)n3tcCMu&RyW0-PpLW(Ocrbyv7%p zSlXN2G=KrFt>hg-OL4bv%Mv?!OJI2QjjJu41H;-$>~y>HhIo$ zZ{zHy@Vt*vvhrfl-rpM1yzE%qH)Gh2FXI zXWlgXyr$W8p}7n3-n>`MZ48BH&buN!yK#xC4DR)e!H8Mj8<&J;H+g4De&3M$yqi$szvr8xOaJhkbI$QLg=W^>EPT>5N1B;6 zTL=b~$-BrKn%y{m?t+;-r@Gz;#0c#ibo4p+79z9f^?6FAmn;a+n+F%C$~tqQR9WXq z%1W#Ry&@jYJWbes1mB4>Cu)HO^~7%d2|V^==ler9{%1FA`p@3Hw}}`0%y}L~__AB( zlZ^T)IRCrhSAXijUFU{2;VHCN?Y4iri=V(<=i+dzHa@wx7vEHV;W5%ud8u%o3v(&q z-IRaqK4gygba^H&X*auYrtrL&?>6!Tb{_E*oV{+j%r;YTsSml;2g=V~fKpS#P?kzP zbji$d3+w*-&h3}{dGm~T$@e-g`E1G0YlrxET#|j{w);*#@1D`;Uvudcd3I{_xZ&wl z&U3LF=DJ~_8#=lCHm74MvfHiSH&vc;_@2u5HhK8cPpbV?yVI&v(W=!>xN?INe*2IL zo1>$AlHYYwoeHBb|5$~ypLaQ;vr){2i)BLvIj?>0`f5zKW-yBzU*JF!^JzC_i z{b<6y;JYl}_<3DZx+2{-Y`=J-lIuNKF^WUHNKi7YK{(V2b z>xJC)*Nzys@R3<112-R2HWdK3{Zkg?JoebyAB_L>@Fn(u%PVTmA69nQxaIb1MnB!O z{PoL=!X64|IZbL3PkmBa+p&8izMurW5A?OaW&NJ3zG!EwGU4~n@BchwU0?Wr=m)>m zwdW~(AL{q~tA5W9^n2F&t)DgY%U|5@`49cdH$UAMzlMJOZSMEHvfuN){ht5Ouf67e z&prK~%g*W7{#kvWTl$s1blkX$yd_u8UUcC|@7VBM+=LsKcxxt2n{>XIyKkfTOTFb6 zjw&DJ%KE>{|C>Jl=7)dF%fI!5V`2WS5C5$X|4-^ewOPnd^Y>l5es^E{p2|=8KdG?x z#J=TH=>-MHPEW0)wEnC0loX(UhyODQ@WzKn-ul?hi8jIjFC%9YXMD6@^35au2g3J; zXj&oh9>V-WO)Hl2gy-?~6E7pYknb$wRfIvlbBNaxUcq-B@fn2md>e>25PrlrM7)Ra zXc1TH5ZAayVKv`-h_?_HoT6zzBQ9YB6RC%YOL+eon)U?o2M8bL`#0iUgeRS;X|ECY z65h@CZ7D^Ff=IERhuT!|Al@%0jKCVYc$ zG4b7mW6!4F#A^w6vmEfh*n9JMsJl0Q_%mbQB3X)TZBiIXS;B~At)Be(%@s^}L=xp6B&?zMt29?)P=BbA8To zo$H)A=X|acS(p!y_Xs%W0yIBBVGxV}4c5WX^ifa_pdn!hx(b3Vpq&AdK=6k7BH*qC z2sl7T0DJ%r8gBtQ0pJS|A^`msU^fVl0NoEz3# zhOyHG=mD0;KoA;)T0koRYz3ha(ER{uOo4s@%>ppf4DbM54lo;B9wNXr!0%u)&oTtk z0Pe$ryaBBMP$&S{2B5_NdV?SaXdJ*m90W-LIs)Jt2xkBd1%fdJf;yn_05gJs%>eo> zK(An6Gl0ecqzeV@2Q&*n>`gF6U_QWp5MluhWAAOy?=TH;_72Dc(0G9F?}EMobUDCz z5NZJp-2=J>;R~P@00x850q6*TEg%d7x*y;;2zWpP{|smyghfCD-wa3;1OlLeUj`Ho z0yGTj6A3|cAbc4B@dx0{0|XYp|KIcfeGMQ$6%mvOh*FV^!2S^Q)DXQmwHTEtr89*V zvYd{b-X9Lqfc)YvFiyan4bN#X{+SN{+g^g?bQrhFfXn$6;QSx_oj+nHHMA4KgrpOr zHKlPzXl>P}0?1QSK-57N0MX^(i#gy53hZa*gLnB>3 
z6913(!0m$bQ6%T{s}6tiIOrgbThtJXHx%+ln~7W3NfT9!WqF% zugp%t62vIRU`p>yS5DJN)lY#(Y}Y>tlna*yZq-wXBg7E2R1mEqr5J@N60{KTVV#OZ(FS5VgPdi#|I>pbUg?fq3AoE`+q`duF|Z;)KR`EPap zvp(=UNo2jS)1LskF#iu-u+tv|e8vCZGtxkeNwi`#rqs?*fc}5szm@%`9(T|}JH+Y5=uByyX|$-zDI1afTebu1NbwIHQBXkJ z8Y0(0k*pWEo^abQk@W}DuK$JpWjDX{^=Dl``v0E)|7c)00=Bg6hscD0w19o{Bmd^H z>_9*UR0v4yH=Y#W*#e%zZ#-MT^9DSF-*^dt7Xf&<-*}$@?>*o}{KlICyjs9Z_>Fgf z83A?b3UjJxT! z%Ml~vSu&cF@j4kFkuj5uU&z=(#&I$(laZEwJKx=8{G}7IKjJcf#8t_0gFpDT9G$ErS z8Sj(v2^mw#_?e8~$=FH8VKV-z2c8^9GH%z0m5lsklp&)Y8QsbFfQ-+{m`cVRGJYWA zCo=vj?;APZLB?S+&XAEnMkE;P@Uw%ATx2{z#$Wk~{t=fV$1jo5_K)=cjhk)F{cqfC z+t>fb&9*)KpT>-U&f5kP>jevN~txXlZW+jX_GT zR>0roF9CgUszBzE-v%5(#2G7l;3%U6JidUpmc2H}3HDb~Mx60*wg(*dU-@Yul!3pM z>z{&9HR7z3EAX5Bl9Pi2sEE3qBVb7(VUCBZD{$E&mkk-iJY`2u zJ69)1*fUKJIGvf9+XHFStrTxNkcXw!FLCG^%sp#oZ*|(;-PO+A1Gr3qvTng#bu%}2 zW!Rw&g5uz~hLfd-y%j8b!OQ`!5eepM19v2M4`<*B;sjU}TO3QWvY=>+^T0g>c(4I~ zr$3umL~-7YTn#7Jvp}c7?MrLh{|s8BP`5Jk{Et_l!;~7pcg;=_cy{SH>DyUaDcPF2 zLLrn}?q6^fV3h$y|J6GX3zaskHAgomdn+Y(SNrXnxk9I4{(sez9-Q)D^Lh-Y=sMbN zS%Rgqw}q86To}*@=t1Z;mA1W=l`~XGrSr=v>Mt#SrP8r?1FF%3{j;DADzeuaNQa%N zz@4#QoUMj{Jw3I)nVmbZ0BzuMWpA}@ipt)=L)pRH)!oU<0_<)v7-tN|%WAu;A$|x0 zJkzdWtU>pH^DYSE;rLse6T-Mzxnmryyf6-+_o3YoFiMaPd&gM2T7fsXAlMNQ=Inw1 znAuxdI5~QPnt*b^s~&EkP1`wBwTl%we1P*p~mKbL3X420a)f%SHb0>s?h zE$y6u0)T)c#?#u_)y~n~8lw5n6fbhpcAx$8Epi&hZM*mWnWvlM-^n-#<7MsSY~=_< z?*JN1V4zkHA!`eG5`>^c$lBS%9T*_=0=678OQ;Y6_Tg#nVGZQ%+}xbaEUchNKzUhg zDIH3I+i!2>2)-1+l%s`%GnBDK!OTJkwhA+MxMn3=w6hOPm%;SbTU9XS2$Ose@PM?0 zQ)wVDnBbX6Z`=109MNDKk#ql|9e4A`O*bt_2c5A zmiE9(z?nR#isuo4k?2wDNOKPGZaM2swW_&?=-v*Lfs$^uz9{E;s_ zNx1pAxm!8>Qyd)SgFWbP;g9!ht<0SN@rLDp=Dj_6{6h+y$N#4sJe&MO>MoG^Z{d$R z03-JVBmN(+LJ8pA|0m&(GRSUi|12X9l<|Kz{E_e0u=_pNT9D`eDf~wsU;?oDJriI% z-5{SoAb{VV z?Rv!iM?JRr@EFazwpQZg-`ewJ}s>Le>~ge?*11({P(H; z`E1Ke{R{uxf8oRZ!SY}F74AQSf00-C7kS%%k$3(Vd2b*edmI7jz@1Ni@RA59M($S( z*bwjlKg{uW{B0NlN`;M+{1`j|ao9M1JukrT8f-k|r}i~`X8kJ$zFlS^pf$1I;=pJQ zEB+lf*oTBt5B?p8&Ve{Q2*^*W0TQ|e8zK1-Ge<(;v*BM)tThtK{3{0h7i_@$zv%^k 
z6$uIcRv#*GdsrJf4eqxpZ@s1E<8BLX^r)MGB|5n0v~{CM-_Fs}$;*wOyyOHop6Ka4 z;JvvFJ-7mO#hxrH0`?|u#{5^Dx3c&ZbF*-@b9U#yvW?qZV|x0(3IH!{T_kgpp%)S7 zhaVYo>09@D?7@o1&7GfIEpburUzd*PMUMZUq>ukDZ6Ip+XWry}9JtX#&K@i|;rmSV z^zdylu&eR30+)WQ&EQL8;GUS7BflBgyMb+i1-xVD2bX{BfCPETB+L(v5cs|9z@0KT zZ~_8uR+%~a(3^RJTV3!egeX6_(Zz4Zzfa`2_&zu{_|np^w!?8JM|&UsZB_Cg7Z-sq zMUiuM6Q&2(y1-Q~@bXECt(Tn~K=L0_{D<9b!PPdPFt9!2ciP&Z!*|^*;k#|{xr?Qc zC}Vj_Ank=8< zu)j6yZe|0&ucWD=b^3xK3eM->-?w&ka@Z>W@Atv(&lxBZDASVuuNwR!{-r#65ix## z5fOVkU^v?kC=!lr>HJrZ@Ne6x8<2J;ON%&zrMVkiL0Eb#_bm=R{dsHn3f{I-!Oc3* z`!=BO9k=@2jGt`dK-qs)99&j|^{{RHAOQ>}i)+Br;2U~o^lr9R=HT8Vzdbn366Mzf zqOM+cZdSruHUqdBE2(QDBn=k7rR^cL)3Z+o>Xd4PsD1co()c-~mk! z><%0pIDrj4xE{ED6tQ*C1ltm5p|uk@I|1@G{I*v1&fr#@_tpi=t)AQ-RofRDZ);sp zVWQ`Pe>R!UpF4Bu*Zz|ho-2)9Q>_8eIiE8GTihL$a6r^JrCLG|KXm@gX&vup({VnJ zjP(7-%5<|#boD)o52j}@)$6H6hjemBBYtr3@3C>Vr;Kg2_xy6cI#B(Pn1yTgvptn! zlshUTLM-`r#Yyjv)$jJ9SaS}jwSCW+x!kL_P?O%5zCKgD zQmWso+tWJHH}*_^R<|VIej|E)ePd>J)N`hrF4T-eaCH@n2sImJ)`J*xZ<$e*i-qm} ze@w(M_gjQbcxScS zvPPkIrGsx?O<%Eht=JuL$04?_KV{_+dJzd2UTZ&AEn& zd!OdOeeRR}OHZTwi#64kyD3aTlPNxNJ9h1&ll4I!ASNd&6}&~7#v+t-kkHy8B2)t^ ze6f-qXGYEF@@2C}@Q_7ipvhL8_9CIrxr_J1z&kmAUF2R0(^!rylK?K#)RPmPL(F?q z2m7$TS#!K7pNq=e(*bp1N4yCNy1vs)Pir0_(8mrrSB@U*UhZ#tgn7UNRNlNFXaQP8 zlJJ37C-W_xWuJB1 zG2GMpe!bp)4cZ$*rlzq~<)4!2TCSdWR6o-3PU~>_f-EHYZGtofgpuVQ#%%N<=jgnj z-*hdMom|fWZKc9FbVW9XJukP14S)gqL6I>17s+oWb_Fg5iL-!BWMDT`DQqr9-AHpb zRAqFXefObVdnv{#nvXXxY|t}4q`Qw;+_Ov_dpgttk~_=c{e#;yq@0lzh#a${sqrex@Uj zAemc=dYeA20MpKQKDr>1cS%u${A00I5WSptL8NU1on(#+sy#wwP{x;)$A z)FI+t3MpmN0h^riV^FB!Cuhcfwkpg(!^%;h{Y;QFAOJn2$^;gr%vb|mxadt=ZW(9+ zG29v$bRKPxr8LcXYdZ;@EjuYQ=}S232XZ!a8d-n+d}oQ|;B4ITV>5MUo^oOSkI$w# zXs;1Q8s?=Lo38;C@Zp2Y09r2@SN?H!*9;6>C%+bCol`2)igD4rW7qDn5(M?cSS~i4D-LM zcSqr%j^a~Er~5ozCBBl_<=psz()F3jAx&9%O{dhGayZ0U4nmcUFCNnFS+A(OGhG%Iq zw(sTrpw=P=2p@FF`WaMNB}>1$7cxy1zvt&HbL1WS{n;BwBC-}o;(C|g6%0JNmif^} zeqm4dMrHB(Jman@1fx`n`B~FSX>NRExF02`>Cd{2e7(F!jfVQ!%$yQsIG=Xs+BcD~^fGX43z`R(;j 
z6?d9dvey>PgfZVs8t>k{+uwG@sVZ9J3jRj}*7L$9pBp3YvGzNYQg*&gLQQcg>E~F= z+gSsqa5*nzgKPk+FDABHeLtIH--@YWh%M$*4t^58jLMUPK99&I{3ar~{zlAZUDqGN9`KYp}KrDqn=kHvZMIu?#ZONTMN zrFp9$#U6^(FI`jZzCM}3W9(+192h_2gJ72PWeb;+yuJTn>$zHG^$KPv|FIq0%}|KH z^Fd>M1z%6I2NbMYz9Or0ha)XXu!*sMU0%4pqAoOC?&hJnVYeD*PN+WT?Ysnc)_Z3j zPkZ^Jx#9-7)=cu7s|`b=Po1itrz~3*S)X)fG!1;)coP|8U;1+YK;J51fe@pzV=%Hr z<>HY}Td9Fqp1!hnjqbgUvr zm{7{xx8l+t9PUIMckvri2_O1?FCBj+OP-nYd5Q2+K{(L8=H{ZNOT9?JVu>eC4oaTA zE5co=f?jJ}fgwc)2sHBez!v(_9vZ>9Qvq&#rGytY78UW<8FW#23J0&g*X@YlAm#%-0j#9j z!GdqP@@0F1+xl`UHrQm_Pur#5JzEmWhnBm&LR>BD|3>rtLW!fjJ!AY@OUk8&r%Xo% zP;AytWrm2wz|%a`v24u;pQ1h<#jL6hE%o^lq`T+yZ@Lg(PFXw^NNlGoqZLA;oF-=l z&ybW*3V7KwryEB0Gvmx5@#mA}Il|tVH-e1Myx_gu3W^+RKH)bwiT=pfhjp6p%`(iT z=KHXE3$+h??Rj4zK$R~=Vl-ww2lM@8Q%@1h_(`lXWqQ!*IR*>$6q53 zaC&|8_^R{_%mSv9BUkQ?ec|ZZ!EVu6Y#G9?z-M?^DUfN*zFe8pkks?9~KH(S74Qp~-W}Km5{z=t~H;%PUE9 zWn(hOR{A`+BJs_>{#NZn-mBHE-aprlLwUxokzS<>afG&lGArUnoT8#r*Ye;cNIhWB zDTglx6TV}WW>@Qvu=P`YwZ#agwj1Tqqm0Y;PU=4oz1mZiSSRBthowCgz^)njIv6O^ zSr&a{IGaAiGq3Dm_bb0$tw!>zeyrFA-O)9xGl?k;`;zb!xY^>jZ?rl#$Cu-Wx;B4u z4JUjPhJV07ce>2_|NL0_CW59>ZGgrAA$(o5+dFZ&W9G&~sgt`%H}(fM(`^#&T=W~( z;53bjmBd`CUW@m02f)#ouaAsF+G&lia+lcLo>P?vfAQ{2}58^T7G z!T(5)ByI$c&?IQAM}=0b-0j95ommd+GJn07gF8|&$DXjduat13rcvTn!>bsR&SoiQ zcH|P59u)R$Zq8*oJHWq`Yx-$7=}J}!e{*g$N@$2V>j&Gb_1bB{K+gD9TG#1X>~V!x zsfO1z&MnnNV2fe zuk=IE;%^`h=@)8?4>8YAncNEsNIn)IsBMpu_1z;X=j8B`GVOl&V5647s2WF~ht_Aw zu9*N)zWDC_3QgFZQ6n^iK^IoB(zBXU_qgu7TAodsLqn8-%-r&&oBg@aqV%f9)00AF zv!N!*q%)xZT)*u`EBQ|K&!uHo_pQhoVQ22#k1zDs_T8Km&Chhxd5nnrG~` zg&PCKv2Gg*hZkL6@|`Q%m|ho*=b;=ZF*fiXf2~w)hN~-{cw`d3L8k|Cg@#q;uE}ED z*Pg$S&Zp^rkG*=Y%!>W(=-qEMg*KvgD(!LCd7_foxo4Y)d57oEx0jvhWsc`7+As{l zLJd(qGk`lAX|6 zVdJk|iiyGr6BS}3I|3|OZA6Y9%|)oih@Oy7-*r81Q?P@=;N=(Vpa%v^UbZ<4wTlcj zDf8T&;@Fqym4VsNT-JzlzPt|WlCsAD--S0(6Fey&;^;}fJQL0K!<>>s>$@wge) z4ltX6>2$cRH)V8`6$<6%iKb$@Q64T7iLF$E ztW6^X*QSWAr~TH7%Op&O-pi8~+W9{1*W|esZ2Sh_?6DXb(xWp>VY!Pm;qJFD`ty9c zZAu{LvKU(#w|%3S(Jjmy{3T>QjsPnSl} 
zzS7NFtlv;?3M&HrWJR+|fo(V2ZDk@=J4RWr9!Uk)9v%SRadQD2X@e19QUX| zjjjDWH^gNyVp1r*R0&_*of8Sq>F$&v?|8eAeJLD{!r zeoxkFfb)u9%8#4%JzOxF|MX4SX9gAldy zRI+s=oXtK-`I3fMev|%(9C(Vsdva&Y^$hnD4-k^X8#Q(wLY0d~-<_`QpfKgwYiJHB zjWqV?7H&YS$Dq#drET*R*(M}&|FZ6Wr86_b>DR^u=*uQ09$8>7?yMaOnCn!{j-Wks z7M-}n4KlcJKFFG-aPvq3Mz?%FeJQKm&hRs8S&Y2U^ccGv!m=K9l=dRNY7SjUa1Xad zSH2v>(qBcFK%$hF`@tT^6l}G;=fW^|^RKm=ug;Gcl=?|WiyF(M|4da=QGHK}ntajV z7;&R#qKnhgi1tw}C%rQdjh;L+nl`UXg10YVFS^J7aO7I^(tOO(jnu}Ql$g7uhha_( z4y) zM^zi@cG9%#okK^C6~jr#?v5B6VQ$fZQ2n2$s2jjfk1YyYw63*ixPx>ta0d$~_98y; z-M$2e%vP_2OTy*sGrAsJ(1$Z86)cO|eki`S13RHJcdnRZ4U9LvAk=AMgTP5!md$4z zdpXj~(wkD>*n{0RR>ay2mJ0&B88*>nu6_VXN4S9aVI4&$+RJ)T>iTA*$K9 ztCOeN?JVb^(V*EoA02>%;FtRenZ^8oo3dF z`xwfmNcBOQ5v=?JPPF+OltMeT^8&Qqk5`1*o^Bt_G%o()|Bdl2a<^$OFr<@BOoZ{SB%2S+s1Ik&lVEHC@fThsbi z3SNwSfM@!qOP(u;2dB}BZ-yTVuH5=rCM=;-pTt*sqB)=L&B1{-`J>%C9H{7REH72` z96{(o6xD6@1%pa_Yg&iW!Q@T)nYcmt(Ofe({PDHz+y#Z6X5L#3li_{B?jbgpyM~vN z6uhPM81yIN(oXgl??|JKnSOp|XX)hGl`vas{Koazp_R#0ANfY9^G>a$ZW7g5A357* zPnm?JY~n2Q<7KP#+5C>}Q0Vi?2R!G-aarx?`Q{dd5PDUP7*C zz#w4vlwle_l0UUzKl*KHP7ZFrFqw#=6{UZl}vx{UCuL+?ia+ir|L$cE+0$R zm_^YS+i%<`zd06~jWZJEqeCmKlD<`M(lO)SoS+NH+^bJHZ`0$oFMqwVY`#9Q$!m9C ziuOSR?O<|J$Y!?Gc!E!F_m4L)v&Z_TPY50_jpCw;N z69>JRstokmu;83vMb=-lOwWdoN1|BMq)?v&3)!|j_$uYm7QZYst`b;&w&YHb1A_^! 
zRW}dw6LG6HDB}{Np)8e?j1F>1CD2NeyCm?!BS;^)11ej|=~H7TDLp*lH+-e+Qy^;s#k-+}35dVC-PR`D%RN?!4 z7!^7=b~dIDLq!+x!sMP1#XMLNwtJ*$?xCnvOsTw~cAbv0vk%yIv$rCH z9<=Z!|7m8`tkL|bYBx{5XP@}|eqv>pyDjtd`0`thM2GIvOc;27?5JkNb7cbJjNF>L zM39mBz=a&*{660KdE+K7Y^VU_pv#M692v7x<}W6iPiO{z4~WZKHd!O+H*2y|cuH5v zP^8U%$0v7PxkOpBxflQCnz-df2D*Mu zkRf;n=gqmfqi9Fjv5zG+b3X&H$cIaiULu6DJ*%i0h~`>Ir@UzNQHAR3#Ea+tdCdIB zMg<3G6w~`?g{SxN-e&g`SD%R#Dfv7Pe#rf@rZXSXMAVO1;P|N_82D}_dt7V-N{1hUcHx+|}`Ip%inWMto@ZE1-ye^%eDqRks;e6ipZI#oWyUJ+YYbj~CK!@v&Ce@)BS=yph zigSi%e!wV3}s*F&D^NX%zUB*&WfL4zeC6GGTH{Ta&MP;rIqju{kf!rj z%I_@Xhc~rSzoltZIUD+LQ>kx0x?p!!;q-I4i;dy6m*QeL=ECESzFfBr+#x-o+1BU9 z^XN_6vDWwKm~drc9fg*FC!Rg(bnm5tr3aJqBR>lzdi6!d6NBzxi%!QNzp&R&2;k& zMv~SW#rZ^6`Up0~=AX{bm=C-lx?b1puI5jltqzHgC2|f~=w^d|px3RIUEjz|mh^SK zi>rG@_R$?row;^~0UDMC;YT$gY4SeYFZP@{jXp9L6$J^zte%`W(NU%KQKtug+4v)K z1RE)@pYJqByec*~;&lg&Lx|R+$%8rS@ohekYGO=d{so4sxCO1XB~~!8L|n@(toA0B z_Lw~tyhv!KJSozyxuPvnmEpFo@u~#cH_%?Val|aSPJVvha|Pdv%+lk6Dx~v-%D%1( zD|NgO-KkeZ8J7*+DtVqnZlrC`hQz>1B|5H^Msc7arqWG@kvaz%yZ_Khjom4P-r{w! 
zLzAZ!3C+R*0?yhZWm3JIk#3u}6Zj(BHx3TOh?LH*-}Wm#yf{zE=Q7o2PhC5fk=jpl z3)K;sZ-TL!TmLa4sNHVzW12?%?oyRPJT)t}Gs_d66=L^`mJwZt=4e)UXmAD@0Z;Jj zr8kq~Oza0orc*mq^aqd5XlIsPxR~s zYg-#utnRD$w9@$Hz?ZEwEzx`m;yr@Y#9&3frnRsLe0?-Vf0W@|y@TClKP$6$r_0_v z(-S%*hf^4M?VbNlZ0tBuG_IcSFdMah42t4oV9R;MeV|`xwby4Xjx+6j>hj`4ncWoX z8(zDRG?*sm>K`y@q8O3IBGEZgnzj()j>!v+ub2<35D8!Ud8+flI_f%Nu{d{m_!?TD zhh|(7W#V?0$gpX8xc|V1r2bt;sCM$G?IeDv7^;s!NVdzTtl7Scp*f0^HZyMyo+IpP z&_HTkv48wV7b}q)PN}Lpny$Je)Pt>Cdn#Nu-utjPD~B{ps<-g(JH9fI^6I@iviJO_ zGR|j(Iys~F#wr~e$JOydjVVf=OH~e96!sr;on#}|dYc@M$Je=kt>VPk%aZy_hj?DC z<|MD52}wG-@KyYy%7vBJ5pElKt|))%2M_(vbqGyhPL47Vqq);2%DxXTBG#K=(R-lOEe0j$RB@a*~gayj^`X zE-B!(RL8p1eqyQd)w;VXzsMk;oc4aAood9R8)HSsl)*Of_T^3CTBHazI{dZwq}9+I z>X|XLb6Lh z?w&JXO68q;suqTt&pi6AW-aQw#M0|xCOu4s6yiWe2_eq=zKD=}6sK06l2yNMV6}i{ zAiF}mVS!`Z93fZbfkkzm5t^l8MKb7_c#nz>x8vx^UZ^wU!wE;UGI68`+xo~PLGD}1 z07NBJC5~a&&}cjTd<-4)=1L*uZ1LrZF&3S`wVpe<2V*c+tLMk92HL4M&eRTuea3}S zd_*kdi%^#e)GCg>$!!ql?emp*4arQ8Q&^@izVLTYLmdz)`#ha;D}a#6fyDIXhraQZ zcSnD9vtVMq@HP&^ z(H@_?{JGZH&-PHfw|dCuc~1LNJ1N<&fn8+T_07o#nPs`mv82ScAmKpyorOu^5jE@g zyLeZ8Tk$BAKMk~GbFYlXl&aB7{Wui<^;1ZB%NK3p^Xkn5-9`6HTPiW`>l#tmG+eys zAgYMuT8N5Z!?xQcm~x(?Cmf9L`nKM%B-ZaFkC(Ck@m_{xrA2jtn2%SkeqD?!dMTNT zxULHXm-7c?7Pkh>fXA~t!`c!L4 z>*wSAX-7K!758h#RS6Cop>by>9r^PH)7&wa6T=<5Q4E_#g``P6(U0J}#wnBQnS+}n zk!4PWq&>+qpDla~8&$eFkFK9dblg#Zepn@Vke?yI)P|li2RXh!T|eb8?%};J^z|GVRLt+!Vfi|CugqzEd0GQ} zYKeTM7Q+>;+17Gx6*0$%)6P<%$LUS57q8vgAg1Q&fAZq7YSgqJ-6P+8M-@xwq8Lsv zj4(3b0?l=``u1%ugFFPE2E)^hDpJ>+ShUx z(pigfL!pZHYI-J&(38tqG`x*ccEmWy2C=641W6krVmO2f;LygJ^zPUkd&eSDc9>vp z5uV2t5Y@ly03mQz#qN~$rhE=jp*Vz72%&wP*|;C&_q=yww#wPG=kr1Dy3pe;tDm?& zjG8MA=xH(aU}w$1;qmd?H-o{qO`VqgV{=t5*hJR*3nR2;;}hv`ia+q@J&J z86v*yud(6!Y*aSudn}##0D~Qv3X1ivP6+5NSIHm!?AN@u`ZQs&x9#@v?B%=7Z?Q|4 zg!x2X$m*z`{I(ae@2a*_de{58UTPipBH+-YHS1?ZM`_lBxU?R~1_synsbBV`xt$NmRPa zKUDKY+$F@!Yv}Nx(}>!10k14yOb{m6{mz7;C=tzv2(=de#Ri|4x+DiPV&cP^?)*4q z5J9PHy0(cpjl;3<2e>`VuKsvV z>uVc2Aaj-~*dgBrOW(q4>mQkzq&YVKlr8Y{=ZF0ia}x1)%9W(PwJUr*cL@x66)q;X 
zmSRgzt*4LA`gk8mraXraQ&KYPavFa08ZABHM88&Lv;5d*rOu6D=ezdcqThT^%s|rI z@ki|k@%IsdW>sCI=lQ)Zzw>={dPUqJkbuplBd9=|KII!Hv;xlh&kD!+RAoG681hQH z11w9Yym>FR?CTGbZV8%u^=Ogtowrk&Kwn)-e{B3VC@MS{vq<@=KYo0s<`On2Ve-2l z6dY|L_HGgR;Q0EdkW(n5Q=#D{lkB4jk8_z*`?%>J3bG{*f1a)q_gLT9ciCIXAfhrI z5#svUeDC<1`bB-on4IjzKj1i_bd@S-^iXS*a+t3E_3U-??m=Phg$T_n`{%F$iISW4-8qQ?BqRmhp| z@slUwyqNq0Y|EWrB!tshH0Bc>UfEUwyXnfxBl#q=`{^)vArom7c>IukH6hn(@VC9} z^dw|R%;mRHkeIjhk<%BHba;!_$5}$2zl)u3lX7nX z@o{4EzS>ObVPdaDgeEG=8rakhK6!iDQumj-l4yxXsYJI45ARh1{pkaXFL$$+5HEwz zC)lbr(596)l|j!i!SSbDdaTsqmyC85MS1QTd)1AaWHnOg z{vOtI-RQNL+TDqoZvSF`qjmMyse)@J(^5h~;$r;px{oP))qiB`UNh3u8 z=lx+Q&P}Z>v`cdQtWRWdmg~CRWfZO@fBkR~>i#PC5?4@pO@M!mRUHnaMRh-52Yz`X zK!@WfbE=4c+cjm&nD{q*INJyJqj!KGmgxA>c8$TE8D;h($CX!4)GG2x4C>z26Cify z;RfH6l$7U!Ss_kUkoHubL9`a)NMp2UFq%p-EWb#Ux6&FX~+ob0gSbeU-(O z_$ax~oPzdlc3HY(diCkv(HY~!j><6ezau24(#jazy}@#AeZ(|AQzg*!aHIEi%u8Wsk>FfT6UC80AWHJg23 z$&ybWq8xL4Ccb9#6|YY{i0(V^A@2Qpj{-95&U{MYx5CqwRp{Jht8*uMQFnts#f zyXLxKw<@a37qGK1p3lEMwLPLh_`~YRwTEE!ffBww&GAsb@wF>Hxi!Aza;QRlJnwk( z*Zzk!D+(V{Rq{x7UuDc2^1h0erNs)`9!5UP*xaGEhadmm>xIKUPcm8^8vc!D%2 z0J?6}db0zEpRbchWL5ser{y*O%D-nH?`%i1LG9vjdxg*JA+u@yU<`-1&lkQ^1@+zX zV8OB)d!$j2{@t@K&Km)pHrE@eS8Xx*)d}Ov6u7iUvXYNFS=daO6J}ou;qud?!On>P zE?w>G(|e3RVcwC_^4D?R6Hb$#9-p7Y1h6?@7D?6!EXJN81X^Cz>~470t7hnWgjx={u@TsTo)?z?Ij1G-xB^+29|5$kB^V%Bf zaBfqD=FTg)%JPTC&Rm0v+$Z|hKx2<*Zg?k`n{wzx`DYLD6cLZK zn{=mH&(>X=ZL+yDs$IaFL%94=(!gK3!0E}C{WNQjR@`vk4{f-Bqjxv>6U8TQ;z66x z&DeAJdyWCu6vJa@z!sOTX}(@Ve%^TJGp)&i>d`b5r=?87u3HrN^$4kEh6L1BVCp}On_*{Y)?etX~ znqxKHKR+0!I2V4`G0G&n9tGJOWQ?1%XsL&{YK-3Q)ex96oFr8FJBXBBz3u{L=U_+b zcwhvV18VLuU=zQah0Cw{ew?VUnANx$BHL8xxbF4@KNj}5W|LgdSNi^Ex-J9SOEWYq zn8Fy2dH!}xlc?&u#Wa7{&Gt81hoM>NA>V?&qsl^-r#ZY`&|I^c`%8*6104$;TKDmo zCuru8loO8$7H-(>B;|FNzIRF+r}z@YoEL(2#@#U=F=qnj0l1rd*bZ;M)d%9$`f-_T zgS0N5@dt=9!+O2*@8j`u%tv(>h#MBukZB6C$4&%2ogU1mrL(AvvimkY*(g$-m_D^D z#MmfQ&1aSoxFR7(@5m3$IzL!fBi&o&+Ul4l3l*2e)IwpxhZ8A+J>J=YHd zlHu5`x$EAPIUW0zwG@_fZ6aaC=%<&l5AjFq`scw>066Sv@gTZ7q8*Oi7uG9#dZ+Z0 
z+q329H}c2)=(G%yOEanNZNeo~Hw2K}hUWAg9$7omRiMriG@>zImcr`2ehxyO*)K7# z+I>%NBpKER2Hkm{Od9yaSwOrxXVSJ(gvQNpc>i#$D zqEXWNtft)93C_4ZGqfM&RaYKg_Su<)*G>%owzmHKCO`g#V`YAkkJa1iDT^DBe&z@uxEvv;_z2tD-5k@YCiW(Q8S^WYQEn}KDLeYd~G z9;eVn-U#`&2Jzb}I~KtayLozMUUWCk=MD3Lg;j|*&bmuYHe9pycT02w)G5Hgux!sX z23x69HqzDl;Rmm-SIwIk-$V1H%$^zG%m`p1Wb0nHtD6Z1H~vWK^Tj5VYvQj~WSmz$ z{MUwEO|&a_9IHX`>dO~Y{_r(V2*2cSc;`Bf;N5qC_{zf!daGmr9I;P%&8W+biNxG5 zNMC;`REjHV(dS%z7=p1!oee}^qVC)cHO3Y5l{`z+jp3jAyl7I*>z~v=sW;s8wU;6I zyK$l6P0^~9YaKgz^zSZsT*UI1q~3gY=%e6;0RIW09N6(LR(l7p%qHG3qOI6)2UBX>*~@_G$<>)4h>N;cxt&h%+Hnefwa3-l?fgvp+H; z0Id6t`|C^IR$(J`YjsbzlS&z9O3%6M5%V;D=XMaZ<5(YYR8qTPbwxiu{-;T@&xW|= zPoW7EJvspo;k+67kLEHhB7Ov89BVSthXn7?hoPv6c@1Ejj+)WFio)DcSylEzf1RFv z^vY>+?Gmv6FC{BIPdg7y<#1+Ir;Cb7xM;^c1aE|SNR(&$Ey;?5u=oy%0y)+g@Z%|RYrxI*@6M#NdG)tjX?pO+!b77QTh`P zckLR!Q-Vs$e9EiM<7dZjXBSxAAx^$Qm(!c}5+?M^o?QPHVuPv_?DIENr;EVTT+-5F zyV@C3@8?m$?7oLWYgd{;M9%P^Yje$0az3o#vTgd8{go1!Z_zB}=q*~^o~P(B z+?+pa+Dao+d=_yO=Q9|o!h7C<4H-O#(=_hWLmnBR=@cz%POXw}mLFO7u*{@@z(H-T zK@GFNa{Q5`tn{InE)RsmM?KNf$0yq~?V6Pxv1SLPmm4lOrEvFo9Rc=TPOv#1cQuCu zW__QLdd!-5a!RxDv|=c^oV(9%2|8$e(+-5AqW!{%1doAQEH`DY0> zsA4n`Q|d#5T|dEjW^$!Y;>vMalKP|m9hS8fXi4$s^0%6cYTx@ox_R|Q)ILYQB}zCv$71Q3(F?N6d!+)$?mz#e(;CRhwCn8446eSF zV@4O|Sc9PK<^I=0{sS^RMRcAUT9VjOX@}9(3A!q zysp6u3<>Z0ZK%V(WzAZCy-uu_cY3s4bNhV z8$aXLAe6Spkxsm2ZZf({FQe$_g`N{5(~oz@k+{vN#7r5Vv9kEKTV15I6D>=n)X_La zYwUL6teWJeC%9LEYuv1z?kez-Juog*bpIqBBlu|iRVHUp7g3DFl|VeX$GF}=)Un3J zo+G2WbhiR?DCHrBY?I~eYg8*HM0@(hrbXlI&@yR8?~I@Wo9*S1Wz_VQ3b{MaPBLT+ z>3!_hM&WUO0)?9Kfdnrs)7#H;4i{!Rr;ut{h|rT0@m)67lgDz9yPZc{xy9(wixtcV zs4!>P$8(>84@P;2yx@CC+c^uJMk0+`R={T`YSa z1=bBxT=&9^E!PLiXCF%$3Kf{hoB5FZ8m!%phmB|+9f|we(jz+Vr4eaQ}_r#vIIc6~1cG~8KY5#7yNal!KP@fe-bdJ6_TW4>{pIN^6~4x& zRZ<*MZ=l$z+KYb1Peprz_9nG*w)gJ}f8{xOF~Xy@=M9rgu4uIjeK|d@?!x6q7^@}a zK#muyuDKU@42zahOSP%iaPRS)XK@wt!yTnvuZtO2b|A%n92;34Ib8OEWYX&paAC5I zB0=sZW<~U#H?HAP=PseUkQ#Qj z!f0UAq)1m#5D*J09TAX@ASlvAqzVXvARrx)4hqr{3q`6TRiroR9qGMDk=~IaAf5eX 
zZ^YyAl>a&Rz4!aw`@I_-vR5)QnM|^>lF6)<5DiE3aoP?351uumeXDZI{0L%^x=s7I zp#FRjEkhcW<<`6*@`~Wc&WaBZCA(5}1d$gPNnAK$W*P`i)NJVZumXj<$I{Ql_k?qC z=cbBMY)*Y*;Sn}?EW>Re>%z3{K3@9CIr)%NVCy;8Od)-tNeVYg5F^={jF1ml8`qAc znVL1|ymd+a{!(Vhn=Jg{$+%ceH>2?dguCA6T_0b~1HShgZRRnbpon;^yR+tOlwFHo zFqLTZBP!eNv+x60%e7Pv`rdCJ)T+vOU}dw5=?+m(36llP*Xug>K1=ht^)M4>)1@h9 zJ8nG3+SizaPMO2uO3qiU`hHXM7Wg(=0>cAUrPkA_m58i!8#koQ6F=&#dNFgYgX~pe zn&`CSJB~Lfg5sle)r45tS1;5*Tb(4Q6{+5o0lmCq#Mf(douPOzTT$aN(XpGJa zzGEMPjj{5A&0xVtELGh$^Dw2OMRdTMt%z$Qfi?47!UwSP$>VI|q7*;mW!fL4N4YIm zZ_LE>mD5KXv?`6-Cw*;)bE0dOZm%QD&*g9FyrBD-Gn1mxWxwJaHppNTaMo;0dU}$j zsMprE#U2B|K6w6Ysv-Bpi|h8P2uF|=q{aQYTVmrbzxCQ;aD~MnGtH)t)mn+I{$yem z#$!sLJF(L3er~yL!{K=+?g}e3#@3HIMlXwKomPAh>30xWGk?CvH*>G*rUwGMtA)SKX{W>JOFwMI8oD{}3S2OGxaL9Fog_?&U&+pj90$8x16yzMVXm!Tz&(?bPlG-)y7Ks~W@e2?ntIo}EH%R=P6;6#B#u8H zSKC-{(d&M)rF!+DlF5gv!s4EwzQ<>ac$8o8V9xFewd0Er)NUf6V((KG9ORu10JDUv z=1Q7c*ESM2m#1ge4f(Yi`I>DHr|lAaiYH@Zhfb*+oa$+>{a$M`JJRY935a^-UV?k-fk%`Kxm^a%=9}foo|#}C z%)5p0qsOD%nYLl?s}`x5J!n_sGMbAx?c~BP&-7qvT4a4vZBecd$G3<(YnwJh-W^r( zw=Jt9F^X$|YYov}@a#*$wT9RsVlA?+R zWsXI?d-=fUE@nlTYn{IK*m&l_ITkC_`w8sS)1NrUK66BGlE~Y?SzR+&cT9B`(jx40 z32w%%BNrKWi=0@L!$zahwsA?&kIW%vnla}YN(J%qnla`HT!azPhxCcWu>;vW(y?aG zmma~ve+TTDjdzqTl2i4z+vh~9|9%qnorfzDvf?RX+HUY%Q2<0BxIlnkFNS|% z`00BSFmdlyZ5WA(i2+ebQ6MfZ4lZB246a0?8FPYg2$A>CgK$$`5M{;72RV-=K%U2Cknva=@3z^5<~Q1(fTy`hBzBgA8!Ym6CZ-w1ZU8Y=mX}#DkIg_h7s^15C6RfXR-02u0vAl-CE!=nLWT3=(+H zB7q+Y=8)joBm^k)O9(;pNDvC+=mjKr`vnOSVE&s$_!~mv5)$M<_yB*`)zyK<#zxT9 z)dhNbdO%-)9~kc_1ykJ>V5YYk%nj6n+Ob+NJJbpmM%uu__$RP9)(MuT20$6)QNDr% zO{+-Ix`qV(5Zc#~pc_Kp1`-TH7#kY{Gcz+_VPOG$ogD^W=f=V6`~+BCm;&Dxr@;Es zG+1Ar2OBH%U|PnrMuC!`KiR8)|cha(yc@vRXq0(tj5L9=e<^BmZv%KxS%cpgExGpb&sWhRewOOZx8&&?5K; zKrK}N3k4vvZfk04LJ81H*lUM>SpHsj$-0eVfaU;spjA=5m-F{+zlVcFX5G1s;-I0S zxu5b|{G9{-pM+RsqXQQe6@_({!5#yY7GOKvRJ;z2IZz3T0Y5s@pOBj(E4Yswp`xII z^aciK2_OSBhrP}OyQ12DKfuSw_%#$DuO5N)R1~%-`X3y=t5E^nnPlKG*XSDjk)9A) z%*2Fd@SXm<2q-n8JJVio{7ZUMqz4ldJ&J*K9=d#k?^;0hC3H)o7|3V-Rrzm_c90$= 
z0dMVh`X6=pNd$O4IjRDGl)nynfeBUs#h^IyuNmC_3wmrMl8G5IKvhAkHs}WfR3(1y zjW_>-9v4}`ief;IVqk|%4|=g50`KhiMzk9Lf*uc9%+8KxfYRalCu#7>h6;6!jmWGQ zC<(r+@h|AHk&W!EY^VyLTM^X{C@4r^AR;2Su8Tt74=6Q$??Hdm9}>yV&d!Qb!STIr zut!e>8IUm41)-Gy8Ql4c@-bqOqL3V|!rwC>f()$k_SG2lSM=scCzJr}tf&e=38+yu zAVXJxh=>gNy&W{8|Em8OA$|GK6<|lH04ni!42oZTXOQ<-^iY8UkO6E2G(BWM^<4s3 z0U}6maOch)O~XC$zxO}JpTi@x0I&x8-RL(4BxrgRgY>=P|BRmT9nxC>-3Dj|dyTNC zgT3;h1h2jq|0n$)0|VOf9!uh zhBI7b0vj7M`exj31PYRWL6410WMdaebVK!m9}ND29upJe5C1WdiR|oYNaWSMZnRg0 z(m$2|=jQ+IH!_(IO^X>~}v* zkzTMPqjZSw$*>W48jIhw*Fqzsvjw7ynED55}**(c}L;`lJ09;n>B~w5Kj! zJVkr*B<;Tb{slQLivHrYYZs3_ethh|LXRrz|0(^i^6$&>|Hl|LFz;gQ4bO`}82zIZ z5Tj!kfcv``9)^Ox{txjG)FT~4bb&GMBK!jb;+GiX`}MCe>f6oe@L!C7<+p$RYx(}O*4Ea*$;k->8?gPsW{k2I|7t6KYa@uZ7suHO{lR8j{}2OhH%_t> z0jZ86;H{k$NOF=w+l@IxIKjQFd(dV~44NO2f)=>9*5W}0+Wk*~_byj|*pM@z4f$s~ za*l@-`0&Gyd=(V>?AeerJ+Ff-9~DsaR2gIh+yOLj2*NW!&h(D>hi5U%&{LR;}JQ19CXnh;b!?*a`7_aJBv?LvF;F3=m<1tw@)F^tD|fhDvd zn@{cn2bgX#wF~XUyTBO&%0}$t;{)J_89;DwFo=kV0D-Rq(e~oEZ{MQr#i^;OAU!=D z6ciMI`V?=_k?94xvVA~HhA-&K4FEMMPr=93r=Tu95Hw^2g3i2`ptB?fbd{rQ!YSZ$ zLk<}Fm;~nAlfh7ZIv8%u0wc|NU;^5Oy`f$B(abLJfi_{k>0J;wy9-`GcsaKVBB!A( z_{%PcTG$1zAr8tOoCvSqLr9z71-Z~h9FMXKLr8`;-QuNPP+MCI8k_g*!UF>XVER)9 znCkrqzI<)~(}Qhbez+CPjP`)3i9xVDGYBfbB0=@aE~s0EHf3lJZd~03EfD(GcERWM zT`(~*0cK~W!Pl8#urfCS?Z1;?Zt)BF_H`bttt_DJzhg^FU}p6j7=rj?8@phBYZokj z{R&oBSHZ@&C9nx?zZ>h{z#>fl3T?mNpzU{S3uW`&{cmscg}3>?4siJRdm3oGM}@yf z+RufEX`wFygHx*i5k+0~3k?MYSr`=b)BbZ2b=7y2m7t{);$K%*NcxWib-~KY%1V$z zR=K)b-@Ad(F95gx9-Twmi!goUOLo`zZRN6S?T(9 zSP7`3zZ3~(!Qii}g`!9*PESuOPTx!0WA=NqBtb^G4eL>A*FZg8NP#jzp(+k7i3%Di zdqv>m;rtXV>Mqk#Q&YP78$f(rh_9>x^>iJjS%qRW0X`1mr;rf?XJ$tzU7s7=(bLmH z74cn9kih$o`1o}XPchOSIpPv%V4!F4onW8hdNqpRcl__wi9vkYBUj!Tz^zCFlu)Qj z>@|f#>L2jU>bSWXApv9jbI9I67ZT|15ujQ-_z(D2`4IoqDLA=|O$mDy2E(ft{;=A6 zMZEX}zD4 zN}&7c@9KN7PW(I!D0~zF1sb1NC}pp;)Bhk3Ufn|;h!5)kwMt3-y#Vq8RJFCVQ0Twc zCn7qAj}P+yUIOL!5^~Y4?f(NG^YngTsJXKDv;;G*3;ERi-YD5Ein7R 
zwBqq4SOTPg%|MH;GYK)oH}Ki-Z@=q52n2z3Zh3imP8QtC%+3}-6A*#A8~g6MT!ew>$wheu3eUB*F_*8Y}1%*Qu2gTcbggb*8r{&)C$2xzR|{`}1U zC;j|}@cq|5@z3}_>E{PN8gCyNrv8lo18MIEHUjkzfdcdoG}YhW|3?1L`0wNIW20l- zzwU2)@<1%~e-t)M`A72XmjvU##YankVvjBZ{r4~N_f@hl%${^V_%r;cz$|)(MU>?n zGxOg4^=Ihx3=Dh!zh{avh%&pM`D^@Nc|iWmXTI~_qxx_17tVp!iQzN|#-)h8*ac%R zhKcC(zlgWtY8aJ2ggWF8z`t-c{5^&~IOyyB7-pfb|82Za|7*No&VIT71LD1P|DsMLkmyFjH6s#8qI`@X zpnQxJASgq)4teH5$PT7Igg!bb z{~~X1Z}95XE3_{WYF&u(C8|q)0&0^z!KcC?&|UZnbQgt#-l}(?uR0kFe0&c&nhL>a zV>%dZ{Q$OR5w7tdq{H=GMOi6mZm$BZ z-Ed9U*$H}kKY`JXa=5Om0yBNJV5Yww%)vF?+(;K_n4AFJ!=s>Lei_0FC|}wHrEpzX z{|yNmR(C)P^!XWtzCOdl!(ei963l!V0`qedU}-t6#>z`0_HC-b8}0ON(&r zw*1|1X9X<6HQNeYyKQc)|Bvw1`O(k+_s3sq06k#*G1ngz{+Rqd#s7{ZHz&8`oxi~F z*W%_r&&4S$qV?wp?m~k65W(Jo)8|i%a2FF26y)aSzxZeQP%d|IF(DzXyKqN+UrOx# zfa9TXh}ckCTnu9HqYB~DM#+G^zb7H9Ohy9prKP0y5jbss&NpTwBO_LoMk7EexM4-E zKn6eP{Tb=W$k_Gv5g^8S{-5#@p74w%GBO2ATiXY=5A>vkP&NH2AHyGR$x_22h=|}y zlj@!nL=gHp-;o6(93mrwJMQ>+cvN>G0!oQL%9m3_ZR$b}L_`D-!CFe{lHQ*DaGw>1 zqt2{Mr%)oG5%6KYG)l)m=rOPiOi+##%F5?Z93DS>2v0ozIUl8{M<-AvK)(k954el? 
zC;3dKO0Wo639sMs_sU2Ab!NJM16|8Aj=$yaw*ZXg)tFD9Dsk50+MnfPV#*jlf(MR2 zGQx$(zn1^YCL`v4{`c}B1sdZA+J63z^1rXO_O3C1<&WAAM};4g>fh6&vrwcE0gdtp z`X2@F{lG*&Z};_&V*LBCmk(V{{v^=Q(9s=3@A&*lCOT<9AJsKy; z6Kc@r!vH#;vxDwnK4=%00-3HTzv|!nR_8sr0&>0NK%v(aP~?9dBbS>ieI=p+1ffe|B?oPvpx?OC#RUxuNgKMFLHC; zGcx!!-CFSCMV^Zn?))y@Uj2rZu%Ljxfxf}Ldni^1a8W`0204kAsIVZ4go{%Q7LJRu zAt0pBk&|m)5)~B`@be(k)J$AZIhyzkU0*umB{16OFxevy&&t$>~|4v*2Z2 z$m>V?RrZr7j*ycOA0#Ba0*goS!9hK6j;b(0PB4Rz@Y*F|)aih@2=r4#LRI;Og1jsh z@j;pU_u&g7-}$3t(l~h^C_p{ONFdNd*nf_}AHRh^dya{T0@Qx6s;Vl0&dxx@ zOawqnBCxTs0gjH2;FS?4`n{nrlMCp*-#D9#Ant(xdapO>q2TxTgM`uhys7ZHmF6T0 znp}u}tgSO$#nA5n<#@`14EV0khiAH==+%Anv#9bl+=GpS`>x5(;4)kF!B zPeD*n5Qqp51MlCz1aDqPf$Vp$;Q4OBASdkwXny|)bmhMUEx93}r6?A>t*Zj>+ZsUh zzzBE*c|D)m1~JoX@Oc9FH{p6X8SZ;#!?p2;#SM@L*R$<#J=+1F8Ov}zidvIy?fyDG z{HFH*{q?^Z`0t{D-#y2!OUo+$B$zzYDORbUuBF*oxVe>J0uB}y7TzTiNlx~2OedKX z0TvuSFVd1-=Hg_Be#WxsYeAaB$3#Q~B-oi`(AO6jXbzE}o+Rwo0Tx`U;0lQ{9;Z7< zD1Q~8^C1f{MzVtk`QZWul@EjX37SLn_^9i>;TeaIhvxudKYxG7{w;qmGC%KhgtCd!uh#YG*v|lB`5uCq%-bZ31TUf z;tbM#blA5{I|!>m=XBYJ0|$&S`=O^Xc$Jy(TZ8-Ytt=>>lRtnhM`ZA-n zib;Yc`fvU#7^j6Bt(TuVCL)aI_R%9XJo33mz1s|~D9LkTPTdZqWx?J@TXs;IOmRplVA{>_TPc8%6wWZqAD{_kolT@~iy=URfdwcS09>Z@f=f;PELZcm^MqLfcb_P8tY5 zH|()PY)UH?b->_MrY>p2|!0e5-Bbv=ZNFJ=Bw zaW64ORB!tT@ZLOTfGt7h0bA`V6}k)lmKZmciZEE$yY#w%}Ne45a9>NRL_J z|CMsCwKijtoIVD<-n4yR$Hn1{kB|F1H1T;nd17MdC61}5_iYq<%2A#@d$ujAzOGJQ zU7Y}UUXGDELFQRAN<~X6!F;Hqr$+@(o$CXXOYC;FFLj8pvk+_>4x=vShHyUe_Bfe} zjzq;WmD_PPHPzK-D^m(I0=6XZTe}sJg7#_Fv*=S^TMe@49nYS>1H_xyt6SRJA65Ar z)~$VZQYFiPbY^yzV1NY|%3)w?Drs-eAH=NsOm;GSw2^19a=BSJ;BJ)TA@$t*EZS_# z1k@mqGEfRDKFaB0li-?0vc?B8#duj3tKf`NdfI}5g6oSU7OFo&(-NlR(0)o;Bdm@Go!-F zF~5<;A*O?LbXCJ;3N+M5Dn!nEl(m`@22`;e26DS2U3CH*)B>x-<(Ry@GiT4qYmtlU zno!1dD!+MnmAm)IJ@)s-N4Uaf&Dydqo|j8(%11&!vi0)A#V$(kZVvnN&zFAuDAi?6 zNdT+$43z{=md$R=w0F1kzxCwF(uAw1v9K)-Kk>oaa!7U@$(rYOIRxFTw&tpbgtJC) zaXo6PInH17-;sRcp3$G9p-5VGrG2J_uLy(S;;XdeOv;G%odFA{-HpkZ%uI5v?UBQ+ zeSNcQvuDo2zK(lX)0UwtRcpbhc=zKXZ)D%+Co-nAWrJ(Ss#>IIfSeI;X1HvN_h8)( 
zufh9R{ueow&Q%c~^!2StGeL3`epq1IU2J0oB1V_ESv3TX21j1NjWGG#p#5#`Qyx5} zY4Jm|<3TvYEN01ah~bFzsASwZfI2(Yo(yR#;97Ejwt`M`AfRMH2>+maLZC*Mmm-5x zQx&%&ff#o9-L4ncs+_!sjHOfP!be+5@#gJki28HQ&)p2~_n-Z|dXPD?ZzO-m&(AM! zxryI=xsgW~o@tdb?nBB6j>}kDa@{ZZa-myW%fU^zVnS13efvIKW=ZPHvmUI!O%gvc zl|m*Q{McwKsxRq~cc+W$Te?(?wps5P87+0D zI#Kd`wq-m>!>i4#?)#$J;oz_rJOa6P6~CD08(j@@efre)P;(#G$YjPk_sEM%Hp(=0 zcP6Ll!l*v!-YQcrT6^ZQL&_w5&wDX>BOOfGQg(z(Di$(?EDh@v7n>KNtdq@=oR z<8(NtgEWs_c*t*LdV?v?LIn$Pp|9Xh;!*3ogwQ+Qj@$1Kl`Kl1lzp_`-qW;2IWnm+ z6ReVY&uW;aNW5lb{VP2FmbRq-v3U6f8#DI1O*H+sqkh$#Re02J0hc$QKkK`J+|gm% z9n|e3iIS3%)MfII?#j^I#xb|^U9TSrezG-k##3Rty49z%IyI`#!@0s@Y(rZ`9MzL@D4zb5UZs|GhYj<)%7v37QKTh;Bg9L> z%kb!NTdOg_E~sJED$!fm+JiN}y>rtnmiJa=)D;D9 zVI}1_8XChS&ywKj;s!(4BG&e)s>ad_yN+zdnZqYTP-8`JwvnHnx7V8}Y^uDG%KUHn zJqcTuv^U?Y#Wo@veeBKe*lIAyUg@aKN{-*UL5g&V-+B>$$loc9XzsIL>S)&dv`Tgw z;$lkv%vpi29~t4<+$}+84#830(=7C+Fp=d%gTll~-WsU>WeRo#@9}ZHJhN89aK2P* zTA6GqGyeg2Z>}80T^gIkedd-UO)MYr{9Rv0o%h2~>oHPylW0~yP5Sqf-KE7#EtYwy zTSW=)uM@ATtE;Eq>pW}RpQjM2b?qs}3*Pu9VWqI#*friwT*ZLa5o*!5RZ*%!ABr$$ zyCb~6RUc8OwN#56>c?(3uTqW3+j$+FwqVd9w9+(b73JgVW%5amW$u)@0ll|^)*+ zJn|s?xwPc=ih|l{C*gKS97Ja4IWMiV;NZ}%U#jl;hc$1!xtayYxw$tNVpQl6Oy&~} zH;IuL+I?lBJymSPQLa+dX-)%)2}x!`{fcqvIb^eg!2o-E%uMSHgb1*+mE?Y&tK?E}PLjvPklACBZ~w;c;s?ctz|e~{hNR4c(Kl@m=_ z5;ZMUGj)7umU4F`<1;j@$KUk9=5!&8PgFpvq*l~Oavi_A8x%ACZX$Q{{_K}i!@;}9xBS@<23Yw=p)ebpl=r2u0ZfY?| zZMEJykN+@_u4NU!tu3;6bn2e6Fe&_Ihia(iHHspPG?m+$#}#ILtH``9zmDC#-BXC{ z!Bd;r5!5$wzIRq$XhAkxpIl4-ldbGQg#=l}R_c(Q4mWWgs;f(+JAEG4P0yUw zpQ4CKNpTSblFMaEAT-`M#^ z>a%Pgk_<h%V2buz< z%&zp>439ZHl1piK)EV^U!q?HdfpdWmPQ8+j#jXH=(ERL^oLQG?K%5LEWn7jE)igNj)Kw_X;7>(a1IoatRVKM*`E z+kz99>iiWxd8cZ*)gsSOZ$i2tnQ&LCklQPqwWwkZsti4Yq|ZGO7&vWHbH*tVe%i?L zD0KE{u?Ek>H#hB8R#u|ly(56%05UK#x+E=45GNbq4QF02Ug}LQH7cGpZ{!>JCSDV$ z>d0iJc`{hx{b+*8Ru%0#Prklyv=!~@cCni>q{unKhUQiL`vTv3bW`-i-?TD8pH6lG z0blsxZe24ok^-AA<=?*awRU$;zGG#3ig8VBqA~0VoQu-c2eZ{|Z!BN>@FgoT1)omn zf#%TaL^*tB9+flwG^?=vFkh2F8$O&B)YX0B;!cHIwOz?6EG`x@?#(*TEU*pf}xNaYh6dJpm&-UCjU-LByvt}Wc&tbOe&DB|F 
z!l7kf4y2qu2DM0QIaP&T@GXzo)Dq3Dp{BMI+tWgc7KWAdl?UG@;a3s?*70zyTB_xp zv&T7Ij=wD`qRs8MAdnSQLiVzT4yW_xtQ^$dDifWXr>lPLN=hiM>nU@BgjQNGUGhQG z9O1Kvf_HQ1Bx3oIHzphMr-drGcN4QhzUYNDnLLe+?4G4;I|n~js&#!luhb>uo{JL3 zxQxidqw=4yZ!~4LH8g-l-pOq)-}Hqo!!m=}Z2cV3$zgrM|w!ivz*+L_+!q@?eviB>|08`YsHiB&oI1O3zh(Cr~;ovU0^m z33N)k_CCN2oQ}(`$8ZR+{9H;^P+{~OsP4lmV9N|ZT1xdX8E2i&aJrwr-1gC4$r0C)zM zF>~f~H7|`*#GjbC^U5eoBY~x5b~H9;tZk-MVjd^%XpxB8!OBuG#xqu|oDW0(mN!N)PZPdw~`mR(2d1#)BPw6?4C2~R~8Y265@Agf|XW+Z5HvLbFN zJwIX<684@=^@vJwWwX+VvV$xkLUI?uicw=7zPuKO{}hA5Gr#Vl!?YV4!}U1gio%RZ zs^V{pMxLj~BIHS|)wYkAUzN;WVT;1ZjD9ojZtJLw(%hSts%H84*q~8d7uP5KXOTFQ zh1Y|x;K@|tV`;<9C$ihDtzX58iqA9v!x z3$0&osKpVBQiVA7Et_bUReTxrk=cGKbhJTxXfn+-;ZgIbv-eDGagD2yUx({n(*U+D#8&SiF%*u)mv3O1M%b+<_tBqb*f6jR=A3BsXiIs zTRZ`j6F9whFS=CY9w8WT7g0^u!9FCXmE0)I&hGu>!j~MOb^oAvVg1S&5@)7%M=_JC zjm;FD(D~7vFAs7~hhS*5Ra)cl0B&g+qCvu;c#GqH;A?}D@-4jvWNQ2ae75FmBV;U3 z@{Ie^E28F~KhO>{b~qZ+qi(2Pae-ZCr!0^xU92NcMC%@HE>&5Ndg;2XB5sgH)8SgL zJK*74xSW#?Vs4#S;J@e=KbRJ|$;o%@$vD|kaBnwt$n2DaF`qV~zS$~Dl5`f+@vDN? 
zOB`@X?t}FhqXj0l$gvM&&!kc7fVR(L7@~*bnVhzcg`1Q%XqLP=_+fYDUg$+*I+Im9 zZuXbF%XhTfaz3u&KB1R?vd-XmA=GswGyK9=fznZSGos13R$u0q0i?&ELoN1y#=-*rNd6!ffda1NQTl@LAoqK;P zUdICIEkhEToCTE&F9U-IEo*Bb%&@e% z5gzIg*=-&5+teXy+xUp(D~|8a7d6!oVZLmm5j$K(oX}W$hYGx@o)!Trj862hUhomb zGf}emjns&t+a>gJ_?vYG7F;rRMXz>~mE+$8%n;A7)pxNqUbrLb=x1f>q(f-unty=S zn^R29?h%nbiSU%o^OjHH%lRF+oGt<+UgFRk6VtfBFXUbED3MmOKQw~h_+n>{E-opH zoU$I<>99ve&C;b!V@^cX-Eljyh%S$w zdfy|XU>vknC_v_BUu#dtym=!QW;!r&&a>D2hP0FH&hW`4Y3#Y1uL(j#6Bt)(PYI>R zX~%6<=D6{&EA5Qj!$H(PtCQlJ$dSoDT}^z-nu&lmwE5{*TCpc-kzv$Qvo463zW#hC zn|rUPQn3U`Rr1VUb0Ak85l`1f)8e<<-abwuFOe>t6VKCRP(SoQMp8ojqDQq&1*^Qh z@SLG{$#Yg2&C@JSQnO~-b40l*@kB0{!Nbx5z=*^a$w+R?gcWr$b!6Aj%?tr_E&Wz>`K^r47yYF}v!uJd^6< zE`;<}0@Q#)^4*OjlbUa^SNOmcME=>2CvUiA)m}a$TN`q{{=<=w`YDN|{sLF*(#{0| zO3S|3DdeKov&n%Bk&}UY}J_#Q;NHBgRVzII-KCR4kEtDtgq;`DMdI?P}xyXm(4sTdhILy zO8K?V7f(H%EpROJGxJ-0QV|Z9mg%{<^AjZl!YAfpKS|bqacgFdAHKdb-5Gn4!bH_4 zgI8i{^1k+lqocA-hm!KhQ952uB&`iOlkePk~D%@dD}9l)gqSnQQuSHx`3Pa66yvFF7 zjk5Z$Dtp1l33H5pf85rb;dGY;CwzuKVo%dZe3w$0D{OrxC`zCrGSCxK(g}-xwM$2} zC(&eh!Xec#j-e#!+ICuBob@w|$DW3p#A)*wW^+AHZ8oiLUO%FW)sW#IbeDYT0muHF zfF_hDlN!0}<-<`RV|U?X07BB4@EXm{taAi6q(-k7U+PJ;)@UoNB<&{}I>G5eV$SVS zrm$x4@GRolx*&D~%}Z&w=QAzdTBJ+LURrY>s!Oe64olU$N6x=XW~vd^EF|R7$e?%` zXmKVOQR9T~TZ|K*rbxR?-=cwlx6gH_vyB7b;zuC%Ha4~@gpQWn!#{^s{UjcTm-MXO zRV+%D2i}Tg)=Atvh<4BR+xK&dv9z^SIb-9ZD<4tvdJDzc0WB_s5lXwAtp3(`U)Y2q?2*c8~%Zu6CymVcB(bEUGNpJq~2Az zJCe{h(SAlGB;-B@j!SFLW3lZ|xDCeEQpKfinacGJq`tP35{ zIZ5SYH@YP6njdwG$>((IO~QLMYb38ZahucFq2T1M7vtgydA>oTSA!x^tnl{G={I>7 z_Z+riA0GW7xjRC~B?dP@tUDB6PKLwCkRE|YnoC-0D{%X8$|`|B?KyZdv&Q|!(Cf5i zPnxF7C?{96ulju(&wL&%4(~1+DyM!m8$mUrw(^53p>1Rex$jRP@SfLQ&16qHdtI_# zKxXYg_BiLjS|sJI^@iS9gQs+!e4mC#yLPr)CtX{EwFPwteOd(Z&-;o*#W->Zuf{kM z>tR!ic#6m9X>b^Om&n}h!{s?GBuwhuBjZkhj3`J$AZ27t*&$5M4}QSEd~r&WZVep z#c^;N$_)}z-7bamU0WNsc@&@_Y&H{anD=ESGh$?tLwoV{Sf`o$2~Wm^Zo6$yg_jej z?g@Xf*N9KXVWnY&bto2Ejtyy_d44cr)k^EsAd*Pv&H8u*d5D^_S6`~C`@M%fQB;`z zJ0IY<*-8xxEP10<@^FjM? 
z^2^f;RJD4nDW|Di6}CyfC0={TF@mfjelITl$O`Lv@MrPNznI*={O0U9N ztW6BtEW8+RTzf*WbGk=Qy{al*x5n311Sk4^E14=sQSoefRc3mIWd2%2jjOIOYvDTI zVXRUv!@6en;B>F96HdW7#)f8R5EAMnb$P3*opcHHH$oOJW4B1-kZ|G>AJOXY{P2F& z+eO2I+GWyHcrG2wT$Op8X1*_e>ug=UdAofOAOFQH49_By{-8y6ybhTxO`-}F9DG;I zs|u6-LHGEhRU}`Ya!h%8t-n^2)8*->Y@=WfXIbePCyXoGHUkw*PU@9f3{EvuZqj;O z4;R!weWBS5SO9P2sB;+w+(?2{h{FvN$LS@E$(bnhZWwy!XUE_wZH?G~gn64SiwE$5 z*U%Kf@3&#uH}jg@`2x>6RaLXzQlFoP>+sC-XNJ|Mh({$~bJAU7deW`mG|n)+BYE%8 zVSV)TZ0`fggv(jQTgg|>y~81$)=ae}q&!H81$|P^zU^>S?$1zyQop!2yHPue2pA~_V=1xd|=d;}` z3uF^?Cc*c`ZRSWRr06o=!096mC+jCJ@>4mswcFr4(W$|4@a~d{_OKQsPe_WS-(rqc zYKG0|n?7KLPr_-xHqYIJ^&*&~A*?llVrg8H0s3i0qt4HD#u2TY4)_*vuKUSDPMmOa zi-ID{rAH6L7v#S5dq?bu6Rvz3A+=Ednsaofwe!~LqWp$hA5cVf!V138n`RQWB=cUy z>N+`4*gljKtu^D^N|P57tm?%dguQeZzxVRtTPBLFD$05bJv~O%PPmdJzwH{i|RvWjyH!T+B&;#9o0vOr0dR#&nib>Y(0XFAFw*L88ed^64%?L(I_=z zIl$?1MSp(G(B(c(9f5j|F~>y7=-apJ&-kC*+}7}2KUc{{q_rc^Y-(dvl&QPqn9>&{ z_|-pbI>4^xLb9$pn^J&uvC*rZAcn!#pi?ZqB3k*rFD~ev)S$v%R7TB8%S^CP+jL+2 zvY%F&Di;h+3#A`X*zRX=0tS)CC6F6=)&tH=wtPuvgp30Gq}&087OXBFk}I4wvC?%CaY z7h9&yG{S?|@kJ_lO@z+ESXzk~)sfrovQ31{$PMU3m#?0R9*jHm5qzyYms+UPpv57# z9>Z5@Y6%`MZKl*&7%C5?1ibj>Mb|o2=eb%M8uU4uzW4e83UAJjZ&;KBN9;yyN;R7r z!;i3i`&uKV^+^IQtS*>v4)Qx!7hafUc>P95g3*k!WCMq;Zwk{!k#vXtl33^~T5nqQ z(`qG{onBo`^0vv6-E>M(Rp&cxHaYN=sFk8_n?@-Cq6Moh-NKJl(ppinkg2BaFt)Cf zk_6w))vUYX%=kflu@Pbv&KIi~?p0KYr{PjQs-c&!QD{ulovl2TI;po3O&JwJd`Zwr zJh|uK7b)Oj*WY{fdI>(0so7S{w%9gm1-QAjbt@HD#j=;iliwp-4u9nE3!3WL0=KFf z|G6tFKh%$p!H-hEg=u~JltiHZtnRZ3;=FAc0M@*rG5hPs(E#E54Z`m;DKph)xJI%L zs4?!oNNCslU{XTTy!F8ZH-i$d1(OhSRq#ghWfdJk3+EFG=VVSEB|VpO>9cI}BvNyW zQK8eY5gGO{h$Bb+8e9)D$_XrCT{BJ7Bh336{?KiG)veJ}jY*%pL}(-`Q3z{ki!33@ z7d1FdAG>=_UWq!Puemn-#;WJ6S09EwU$@XX1E-RU15I*cSa8K=G1bD*tvysFt*_ga zbw+CEwPjHmw8Ab=9k-|8&yme?3ds27R*)6Oc}XInVP5#qOD977x&W`t&o5 zu~5`uY9U&h)r+;#vKZ0i!Y@W9#p0Np@MhsIjj&%2GUv|C4!jQY8(d#{j-=PlKBN{j zs5(7SAs;}8sxLp8%j;vgIfPjbR7{pdD{Jw->7smrg!Isi+Q8M5+w9==B?+6$Y`#1+ z6zkZKRa8jSa^T?Kw?>RdovO&Pa7ew%_LH+5EA|-q1-Iy=Z&++E1vpo(c59h3FSE@C z-L8*h753 
zwEpq_@k^s!6}dNv$d1Qs7yc4E7Dc5iH^o^C@GIY?7JD~_@e~B{+m{hKtoACwK4eh2 zpqDSF(KKgfC~Y>dK!_PD5+x`zuO;s)6qv-S{f$=h^KKYe+=EQ^}$H>6zdn9TWbrFdU_|J zt$$K2r_lGdq4LqLh);g`Nmpr(P*pD_Soft&dOF*Q_H6LhRK;8C%&$~m>#d|G**O=^ zZ=ZDNzR{(y2~Vz&8Pa+wLw`je@han3u(ie(bIRnixVcP2egXN!9euNI@);WZD(3!p*cQ=n6j*5Kt^+h zi?!p0&Y0K9O?J_eoix!l2juO&gDo9Hop1XJj>blx=4U8>VySsxNE9rvSZM)}4nKqI z&+I-woRCD8z~P6;u6NC8#>w?NeQHqOcPF#EoC3i_Mx{FnSaBC;#nw*Bv|?|5O>2A} zMu(aAsTm&{*dJ?SYOz^qG2dCb!szOTwTVYG2o?`ihonmK1!7Q8`=%eL?qLC480;nL z0#^Ov%t5FJ_gpQZKoG+n51>&+wWVAgvcTr410)D13n@#LF*E{zzOe&oAHj+vt$$4R z>_*9P8E(5pN=d0W$=+mM5j)!3jvwnS1X=ZHMa7kFwQ=IMm>ra#-8>YTwl3&$lvFC2 zwz$D)9NWZWS3;MTp-WlM8CsNuh(dN`=s6n;KlvR-XqSHOYZ#X$nYykm7T`dgvh+;- zj!+-g$5^{O`2cll-oD(IuKA8i>g~d#th*F`hcQ+s*Vo^iV!H!k{f2d3Ta2;l=E+P>}UXON0>r?YFJmTKyG`^?SKwTOjmX}H-% z&vB3Su9KK)D=wJTx} zBkSAY3D^1vJe9NfI;90Yspc=OI$&*P#g)6ho^=fU3w@!+lRC4y4 zpyLE$NKCK%_V?0yf4~uIzw`*(oiMucwl6Kl;^Q2VQTzAQ7y}C34DxxtSQhMrHB$_oLB|`rB!Ltt zFvai}xWn?!lMdU)8H4v5?w9UB{ms+E$D{nJcG*c>juFU$p>QqSS=ZWYZoR?l>|NQ_SY-Jxkry`pWxkqr|$XeCt809TLxL z+%IQv*?l;I#1lRa#^laJU4%F8T3uf@L$S3Pc6NJL!6`Obl zDv7~hIN=B`>@_Eb!~aiv-vSp^wf%q4O0$>U&%OWr{qDWRrQ7fJ`E^kcH1ztwXPTNA zUNw7+0~}y-m>FjVL9@iNw9>-Vv`hsppP8Co)2u{Ge2b41OA|B`tWUVH85+B-kz{m*91Z#Eg1Ks9J`+3|en`3V{O<!F1cv5qDsi`N*mVaz^A%%^S91UG#%ey^K_-@+tX`PYFtSwBEM-hL72zM@~xn z*DQyz^WVnb{b{4KuXf&6Z}7g8O<$7A=57jk34@N&w-jA2lG+syv?bhOD|tHla^~Sb zN^eYv{@=VUeF`$)&70kBbH=+1GgiDE^zphajqYildtlA#keBbwShBX&%tnPzNo`NO zx*_B0rHuGBLFs>aywB5hMx5OJ{hZsR`)7VMd0_Z$cdn7{wA}LA)kZx#AL?CvA?oKT zL8)6ZUP}wQtEm3Hix-zH+u5M-$=m@m4(|O+8|;O=cl(7A1O6wh+3H5Qqi~Vy%;Gn5 zil2UXK-S_}f1PS?9kcq*srIfbX7yfrYfkIj0Xqv$4+d*&uR0DkEPPCAm+6pC&3^aZ zabE(`zBkkLly*aY+H?QAablJT9!s1p%ufP0wive#r zwy(j(JH@x(HN9TrQ8}&9hQ2L*(qPbtCll{I^Oua7(#k2}Ee8~DKT;oC9%G`4WU1fU z_ruE?NV!Fc_cgh@aIy5#t%dht+Thj`y$@oyM@R6c{-D&;Mb9>L-uAago_nRV;qvxt zhnw>!tQ$Ia2?}`FD~{*x9@Hvhvh+lctN$Eb8nP~I-#vx0)a;>=?bkPnUDNQR&Pb+F zqtEW|{ArV%deXB}>!9Udzwuz?|BU?kj|;mRpB>X)`sH}n27}t}HUA?cMS4yi_t!gD 
z)5Pg?&-fn8x{o{fqOh-XT~Q?rDA>*{8a@L)ND9VZxQ@!>X8#NOmj~bz;j`iQ!k58o z?lgYvfX6KtGzh-D@yN|ipdb7|OMCb);j7}VAn*$KR(_Jb>A}FA4?xDP7mCBZP=vdy z2=`!-@du)w`-pHi7`5M{{7wW6cVQ9Etf7lnwOHag34g_D3^zT|9tZ;GIgEV?=*InC z)O$Y>?mZ*eT2kkHBHV{2Xvf(_f)C6K%|C|k?-moCV<}MFY>%XG`kz3#H=}*zB9V+E zCUC!-pugw$xLg7}z}W|cvzn+Qd^Gsb7BZmG?)iKU@o=Y@=K9?9gsTVw;V)Q((|;oB ze~<|0P7&;CDF!mo3%&>V&`p(vPO2=l0}pzGFL|XzxRZ@=2cF;w?zhw2m%kzU&*EM< z!u@^pbOF%-@BlW&6bsL?@C10kvcUC$Wr6EMN65iPziOlseT^1@Wr6uprqqMd-va%h z;0~rD?;P~pA)@Etp8+49Do|vhuPO_y4{$!2s1Ga)UBHL&(0`i4-G8n+arB}B{f*Gy zn6b@+3@1Q#;|>!Uzyp?rfdz^zaD9m7`k=}J*9T2E;;ux(J&A<7B1zMYzXke1awB-A z{Q-E|0s51VK>vXU@!-KAmIbvwuq^aLonTq0jBeEEN3xtg`#W#6EC2^?-v`03frr-8 zyIr7vFv^m7kO&^IEErW;;QGL_z`D_ZC(h%ef#5}d_id1_Ftvj+L`r4L`i`*lpe6>@9_51xllQe}bb1Iq%}2N};Aa-Sm|^=Q@d zWq*`#rz+u2QNo#tWQ9&k0UwwL znk;aAsH$${c9Q!F+#axAcmi_4eMy`Hp}G0xDOVshMLVZ6XU-gvAMQUT+?`7}i-T}y zFU=j=ns6p2;XYqlKlf!pKhA0*_Cvq}iz*8^W0G)}?KSB}U;V}e)ESTd@?*YupwW-J zi>W#ucODb&g%$MUj$c|dqCG7f-VQuyL%6$`a7GE?tgUkNYdr9lg=^A{D0lCEBe#{R zTUnd*<34AR&DrBe2geJD>jJ?8)`#$?e>qIJx7siL znmp8$Zp7W-nmbS%^y8j)kv{ILrq5pOO8#b{ldfVgD#MR&a{l>K;^+F zf(Pz4VfpJYR?A&PCB+Ac`%4$joeu2JpCwk}xR0HfAHMpHF5QTJv*(yvZ3A#pj^5RT z{>K@cg8qx8CA9kW?kW$u(u&b44_F_HHo>c1RNI6(sjcZl^bKY^9wnT~Nm;fQG{X`} zIH!}Q4Q@{FJs(auZ(R5pL;gkU=8PuXAx|HktX#j*P$1CbKPU9mOMIre=>Jxy@56C% zZxP`@xctw?3mN$0 zjqWZUaGRjXM+NQtdw3f8#$R@#qpYlGXb(wu! 
zJgA9w5qE63+C{BR@I^oKf$KvS+DEnQcLAx}szU#@wujso#yQi12W6*@QQ?NUv}?&L zv~6x0ZJjfeHh<)#%^x~x<17blm}RHjtReKxO!ygAS~uN7Yd^41&a@QzVsaw+YZpD+ z1g3vJ^xM1WLu!5&#unSpyJ+X$Z-_728Rk#^ua$l+-t*5tPq==IjvP56`UZj_>YvLO z%jjH5vEre>aCf5KpD3bV3U`X}V&HbMIWhg-?#h)bgu}TAcaZR>yZzB#oAhgZ@Oza`;DZ(rrO0bzft)OYS41e{|*-_vFOZ`7Z_>FKRsS zY!^#UDw8`kO@CF}{>p~Gh%x`k!@KA!?jL~%r;q(0c*1|S%pIxqjS$e7(ZvTP(qK~*XGe(RPHcU^Zqq|qrzAvWIN^P7_$?@Wr#p4C=;v_)x6SN+{b3X3 z%^xPlOPW6PHCEE*l6emC)6qTX+tnWs_aWG2W@eV7oy+QGZ2vSF!l|jWWy=>v=Pmzb||lh8whq z`-r|gk1PDe^98Ri{ny&|`@-c$z^Y4sRogW!JoD{F(CeLN9sSC@_RTc@@h+p*SwhpsxN0=73gDu1B4`o_T0=1*X3C=0C3|t-kWAqrXo7)#<;A_5ar4 z68)C^_LmycchjP2-+OHY-gz%K7Viw>-&W^fTRum)nHr z-`hvtPRDZJ5uShl-stx(um9`xS6BWkl8?IYbsVVU!0(3xJjGlljTAG@h0+7!nISdM zJ#lrrc=vi^LGao+5g8BRsSzZ{+y$1cIM!wz|3!-{v{AA_%d1JNv(HSoC? zJ{$fj@5|ngJx9y2=PDcfO{Zg7ej%{miv4G${84eRV5IKl61o8Z;BgoHZpLjk_D>E3 zW}3aVx7|J4)uqE8ai4pM%6J1%)@2FY-#!D}ZLk+)`+2sRX1j6CUYc#j`TPjB6Yq-O zkLN4)fnO-oenI}n=3>v_0U7aV`t(Tn)An<+@#Tkz>?TdXwc>YtNKhyNf9=! z7tf!eO$$cT$0Hwe+41gOJ6G&C)oj!>J3W>swq<_~Gz=^VT(`L_Sf&P|j5e~xy5jnK z)4+H?_0&`QdiLzOu__)m2}jr_C1;vh*y6JNp|CN)9=iN>3jW%jX|^#lV=J9zKcJn> zAnJC3@4Bto9CP~+4PS_|frAw4+iAuhHbNVH&C~jWb?VfKjvqfx$M$a$c9e=ev0_8X zG~}Q+iTT%Ub!yz2v-&VxVSixRNm6ph3ymD zf-V}N*nZ7JIqY6NojyQc!4rS}t?l9GvjM7Fx4VIs3rgLXj#|r${PKk5d z?ZAFbvt?a7-7akRyz$rG*_PQ~-F~V-(X+nV?H#2e+uha0pKbB4Ub!S}@q`^7_7t)m zBHQ3;wx~r1cb1bO4S#-KOWj_KHXZJ4b@5-md^ue>d(yQ}nr+C~c1WQ?v8P>%y7!Wh!&t|v2vQJ+<-@wymiS2dX_pn{e9@>$P9z94e zLALzW?bS;Ew6MDPj~X>fl;g$^62$&W-f!-*M}$o(+hZy^iO*?p>s8)c?4m)~-s$a) z7Qbnr^{Au%>h^x64fwRW_zxU7P~gA!KW`}ZbZko|Y={*5YPPF&)8N*te9nZ+h7k5L zY%9XHbDlO+N6wb|tlMlmsOrxV0rBtBu~K`(5BzXI?D1qV^VZ*6H^Dci0gD~BMttF&j?YwBKI zWzV`d4c>Z{Y0%C)(QH|T{T}RC*=ElZe@)NwnGXjPJH7RuxbtuC_N(21cEmJP#!p&0 ziLPRgc;WgFDR;^Vgic}|49B(ik&5v5 z*XQvEH=E70cI{f?GZrpgD8>Hqv%;VK^)Q_(K0v36*zczj_DA>AN%*C&_V%CwHiWRD z(`*P&!Di?j`U&jM!((6R^~uopIuWyBZmwTb_>xkH70>lbf5Nx3sE!hXfAliV~YI!V*3Er@Mas)c_g^^d(N6Ot@zGsfzybr=xc7-c85y*9g6u zlNn1JXAUmcHhA{+#930%t2}OF8=8Lo`du^rTD_^xbJ(z9g6BtfenuOoCAjsfu=Osl 
zSC@_HO$RnD5pgC?oan-xak$>~SEGBxh!OPTk7)Ga%1-}6#|!gdi}M8?+OmWSHZ7vV zd7n|yzV*tBRnop5Ve9a}vGP>M|C^uxeR}uWay{zewdB#eSC3tR#xLG`-fqPEt~c(df{zE$ z&M{4B@5C_io1eR0|10en{YT>O{>EQ70^dM+G9CP#pN#vtqdUa&w}O9F`R|XeI(Qui z{BZz7qQd&9(RgMADNnqJckKG|d}2G8c-NkV*bTH(u>Cerhu~x}@S9s3{ z?_1$DVZMVU4*OM-4-$<4UT~XgI)9{jTQPPm@OkIF&dz7v^O`8{0pfj3yeD8U=9PKR z4EAYVsmc72_pR^`{CH7Wi{iZ(yk~*eW_j-j@2BbyTzLNu?^96EQg6&SR|gNoZw)Ud zoAIQdKtIp@^O`-czwllhE{_E$_j%YK!uNj6#=3^OF2hmS@qUsPo_KKosUn}Ao=)dV zv6o~1D4LHoFJ7bAmcLc3rSMuWuem2;ZIRbUp9L>?PZQro!Fy=<4hpf~0kZir!a#0- z9sN(-5~T)H6UNqOyw z*OK{s8$KtU=csva4xa(bYp;CvE3Y;2nXj|bJ5%0wJID?kJY~}WJm5}MHh-4QqM{<& z@ctmN9<1TOdkT0Rm~r5JDO@Ks9C+Q7*XwxQddaD)LNx?YrTMB zedhI{lP3<-<{7fUfzRLKHF3s)*KPRBeyv{eIykRg@;StO)}od_X1Jb<$!iIHHNrXy<2T^Y_$IybgB;=Wp@e0A7>kGt0Gl$?FZg&x+SS`3zP* zn_7#@d3omToxi_zCvW)UF^|*Pp9@5tIEu4>_NG-l>)RSugQg^wPk1fE^Xwmg zVJphtS2=62nwZb8D1U!=*A%=i|G@gErughHotu&Wf#$oa^KJL|ztgTagIzY(`R}%* z{C8WF)Bn}UqcU;X&X>Kv{JqKUFh%2{tJ+-$gg7 z55^HZ)@FO5wisvIG44BliO{>h_Hi!a-m>i+UFPu{k0E(X&+`hWPL|MIjAuXK*$kMH zRFwwA34#xUe|i(<1$b=4V^toL@R+?X^68CyCZdlDHv+yB{2f|8&|BT$GktlJ$@I>T zhYJt?u=!Kk_4x<%^)DyI9!VYta)19F@QV300k}_r&&HU14rnPZy@0=TALoH|GvIxW z!?8c%*q+tzi!mRMb-p-M=o;_wn3wnT^4OopyKH~Mwl_QvlLi>v1kdz+>E$(R*3g#| zo)TkQ9?PuVm*+~K`|P|Qo#z;MEXi{%e8(csaqzev<$b84^!M%EE%xW|enIXde=#YZ zmYgWl-C6S{IKksq9yg1!2Q6IoyHHjYrJuiNm*@xcn27Jn5cg-j)R|Vi^$aZ=_Y5t0 z<0)G78v2NH$Iz5tFYC&FFRmS^D1Gi{Zg|fi`j|ZC;r+8b?&I+e_py1*$Kwp%C%b0d z5?$Hje&#*i=|{d1MV>x&dMX{-I9K$gd0fTg2_E0_etRBE@VJu4Q@jr}=aUJpvU!K4 zhv*gW^m&hix1Ts;0v+7CoDQs+M)|8I(#}ucpuELnY3rgfwD5&!kq3{LzFC|}OHLE5 z}wuRQb6(jE7 z(ByVi=Cf4z{chrZf1va=Uif>)#>Nug`L=!2I@-H+4eiCCtt-|20LaU6O=&uT&*& zWx{SBdB->{{n~s|$vqO4G?1teJ|jrVKxTnGhpC*0ev`N##>+#&@LE4C`WC!ggTs7x z4CcBsu~rbsh92cC=Ax;rL$lG>6+OZr3Nyd;2iQXU}ap z_pzP>J>0*79c>09;VVAd3GtjRx7#z`mXA()*STZ-w2W^ZMOV)7x&rsp&=>R(x zhJo9|VY`3$*nZ&D435jr2mUU8d^N7;H<%+Y;r;>JD{|jJj&1uf$5CRt<9p_>{dPXz zbvW*SD&cu7p2M1IYyXYb7I??^Dgze|{d;%bnEX_i9f|!r?a;zhl)o0v-_uuoe_?&W 
zJ>yqKulNe1(eLY9WnfzT>O2WE=4_qIJ_8BqJl4;ln*4gD7aJrg7mNo<4W%hPca!tNb<(_s-Opf`iq!hLUuP)C5CC{Pw~d-+DxQwijO+cmAq(4Z*`8r010E zl1Z{iNl48knI#!1TckwEir`1;ahNdIzep)gjFtOIe+Il0!lXzx$%yd7fQ=DvvdAO` z?@lQh-!0%tES`yYPeTYhLIfixtCWgw34l0}D~Ds(61)p?;n5jk9a6l&&!!9KnLfu0 z1DD_E!1B(#HJO z%65mzYH1r1-ZU&EShmDl6HJz*wjuxS z+ch*QB-r6JS`v(At3_@bGE8=av~Sm_jnUzdQ{v3Sf)U^1Xd9Ahx3qS|C(9{DM`(&E z-fneR6P=;))|A#pM@rM7;UU2(MvEy?b~yWaX4bAzaBv%^J=NjtZb`IOGO>sdj)sU1 zIX=~Hat>2o0kF$MQjvw65MwtDHJRlk*`a;&{MscQ5uKp8r#w_P2b=kC+YqCpyJe_# zuxt+rPBnFikCz>wE753n$RW+NRN6H2n?;*up6Ik`=Aw$@wgD$>MyDyxWCp$!)2R#2 z>{AeEW3<`Krg$V|wYX4nq}ps&JBaQb8{9piZO999^RV#n$QF-=hPQ}p85$W8-y+o5 zB2f+v3yVrXgBp<#kr?)(2TkP?h-eWW5f#_U7}_GC6%t5{O9+jUnfG=84UtE`q%_CZcwrUX<7aA3j5SG|7AuPf8Xy62*$N3860qF%W~vJI+K zQjF1=+#$_qm$gKbolaZpX3f;f)YMg(ngXR}vAw!CL+Q3`)?c4wNIQs|+@_f>MO9{+ z`N+OJ#R;-awj`kLtF@_(25@#ztGY|zw7TCN!JRD5&S5sF24oY_JS5m+OuTt(nv!C*gm$u8oOY|(5hAKES7UGI(nc+thVj3UV4K~_wb5$tV2@8WIpug~svUX! zFTSM4p+5AGhxJ9pg#v5Y0%FoKGCVReK0Yx#Au%M_Y_ue$8k3aZYCUe#%qMYfFf#!x zauLody3W;cppFA|9H`^K@0kNxk|Z{lW(~^9$jZdEc}udSB?@fN{EYcy=V#8JIv)(k zkuZvqq{6I{EGfHjc5rrBc64@3_MmKAc1Ct)c2@S1?40Z!*@f99+0v}Wvw~-Z&5E8C zGi%T++pLUP-4?_wNMDe-VBUfy3y?^iyQw(9a}GQ3rvaXrMu6jGRvV^D+>i$Wxs25< zH-u*_2I<1|0VGS84$n+%I3O{)bne9b)DB&&zW|b>gR}*tn=Y@X0O8dHcgo`cDb}St7m$X!SnB~9 zpbL`^$lozT=T4^qzNhO_jsRqz4xjdbMC%|u0Qvhpp6QGLWFx#g<#+Hl7O!r|3_#L# z7%m2+PzU*fW9jnR0!U+BI^P3QrVGQIIH05AKBStdOZgGJ#puHH2ZWs)!y$m!ba}lA zNC~_<4A-P&T`6w`Bm)?@!xRDXur8fahQPbS+zAR2bh)<%WUMZo-hedL#YzDr1Kyp^ zaJ;b=a6`CebDeiX)&jyUmm5+7$XHzo{tX|S0O}4C!4P;iBpN?6br|*oWS$PgctG~) z!n^`Vx-R9n09mfXXEh)Lbm?pYq^C|+F9K4g3)28I*!ueQCqROAd4&M7QU~b(NE2N; z@qkFWFhc+tq04<7Ai=s=QvunigDeJww-5nQRRbIXBup3P3?RumNCV80XX$ePZ$QFy zkbeRarIR`%AcJ&#P6wn+2bl^;jt;Vv)6wOg3rJ&K%I5)z)`e-*2&JRL=f43d)It6M z$Vy$TC=R2Ou_QpUba_n%BpB>+b8I#sQ*|k?0;E`%axNenbzzDDiPoi43djH*q#kx% zb<@SV4-l>i?%W>)q@gZWI3QWNG93iSR2^5-0ZG?E-U6gpm+~|~zH!mM1#bp}DaB@Z zOm@aP6S`p*)hw$ZV;rXJ@Y`&{bOry#yrV@nM>IDW4JMvKQN!WI7eNEN0oFzG21*^DPdmS?=Vtn-v7?k|9|(Cb)Ap+DwKN 
zhXKh&CC`rr|QsN{|h*n_xAe&azo{%AGL# zi|J5vET)7oC8~Y{Ze20?+rjCyo8nTPvO{Bz!67@5_E4mpV6YkOU^7^2NEQVHzG#KT zvDIm6o|cXU9$FIJ9V)q2dso@W^H4E%*@1%42$b!1i&cH;WJeHkg zfM0_RiO5rp)+^QQG<6*2l>1uygZfU%Mh!QEp_9R3ljBW^rg$&34F-oZ!DKaHmJriy zhM|clM~gGjO;k^#!`a1dw`yr6+GQ<6M6x8Bl0b~YZj{eZM0ClfmxQEX%F-nsI%&mJ zp()9ta;UHMacf-fxaUzeDi(&cL@Q<)-L!k-oTeb{3eewZa(1=aV=-%LmX)Nsq*oxn z9Qq8}oY#i!GW>8ZC)T&Nqt3!6xUw?dYK;A4#u1qz~1gvd3UaDxOL8@IvBcqlD zC=2h}s%5H)a;(#6cc$7{j9CD?cj$}ClwdHZSsA=#FGWs4HCMlaAc(U|VoZ*-c%JIj zzRJ#_ajB57fCOTlg_k^BT+uS-s%VkZ42p6Hm;^rBJmTxay7i`-vReB+>aSMwKo0&R zn7;z#I_MaN^`jJ5b~=7`u|Qq;);UPnL+|#5!iE-i)%^;1ghDWy%V}H)KBS`;CcZI! ztnOM1L) zXOWw26y0zCmO zVm}iUrqPT>j2D`KzQ4lvt^hi$)Kml(8Vj_pocf*^&#j93qNo5)GC+=cDRt%N4?&&Y zbcI%Hm?b{NEeb|13X?U?keF)GdS ziy2)5f&s*U=qeZxGbp+YVgMD%%=hmzX9U^x-uu1p-TVE1@9l99)m>d(U0qdO9Zthe zo0mc{Z~Bg3ar zlr9^|T#72QMD<{b+QP7w5hxJU51-t*6lL@erILLVRsS8I?}smUH~72!89Js2){JY zwtK88YErxG|Ix`SK0c;wy#%_CY&~m=8~@ITD^@-;jiSs8@+qPiWX~e|EtAzgGZY!0 zxUtjb3E5mP=LuX^QQTNErUlDU%>#uZ&EH}uN=zl@3S6>?vL!DK@WT~9xMH1tFuAy! 
zNJ}z7mVb{LbpC-hg(~c|zsdwS;7C#NV_AYZ=DFPXt43FRl4@5i6yWjK!KeokV>WT=SUB`uvhycBTlJD#=Ic$9{ZQem9v0h>rvb!O;3avN$1`u zlC{Csvqhuf0#q{tk*zpZLf5l{T&UO=xGXi&`u0I^b+%qC(nNn;^jwI3I3_6{j|L`5 zCxZr>Cji8h$X0&>D7u5fR;+sy^7 zaL$>bIi4?93_S*`amD>?pd+q$;;T*2g|1vb+;P|02_8^VmP8Q|zQjDYn#(>pLReug z<%&o3M-{P3d_9Yua1S(M6?84Z2C01I@+gY7K2w|kWTKRv;0Ormc~c3XxUgzxJM-LZ zVp#J6X_2HEc7ra9b`u)&czddLqX(@~A^`3kA zjUrc6De*?3gXn7FQ}zaPx~YjFpy9^9S2KS~`=_|Go}_LdVJM&!xS|@#aJojdgGi|_ zSFEka6As)TnOS>@b(^c@Bf85n?>M1uG(xjz61v2tf1#F{ zE7n{EAUOc)d=&-K_CmrVg)6#JsR_>%U6vd|S(YAYw~j;Hh?bKX-`5=E16pzzW=Tn* zM->?dvhja_E=OHTvhKU`%I@Hr()Pfv>Y%E8O%Ee$>PlM?ojON4ex)tOeDhytxM~Jk z;fhs>`*Wd)qzn3KpyrD1JBa>t6kT@|-R6oS^{pj6Im9U*If!Z;MX$IRG{Bm1^KJF% znU*UG!072Sfbi-!pJwT*&*cv3F6@L}IWuOGeCkgGpLNa;QwfHUWH0humMW-@mQ|tg8m$mBofj@k@{Ro9CYiDe;FN@+MFZvAy@S2 z2f{_816;5A#HWJ3<^^`<1qcnBE$9kToq-$SW{)LJAd%`^+(246s_%@zVt$YAAsE|; za-km5O`9xPL$>;6SyVSt!yPperQrwZ;%rjtn4~!v%pKxyNR1o0EGgIP9yh78r6djx zQWe`SnlSO+k1KN34~JuMxH3J&QUtEoUkk_;W02YElejC^VH2-}`%iR4)CwontQ}DZ zRoUu8NsdrT0JQ2TCRs5`Nmo(EAsB>3HAx_J2H!nvPXO3*hp>bs;Q3vV%jUz;HfJ>YUdmxVbG?MCX)`Fw;ccGRCE)v-gZtglcKCJ7s) zr4PADTlBSPIWJ3Qqcae1XjUz8e!57PDVjhR2?s({((f#9u$HVrb3kyvA(jJ?_2pY- z4pUi*E+yvDvabI|I|a1aq&raM@g~&KDH9Xz&}6&6b!iaYhOK_E`90VN#-O7L?PTT! 
z@KhsW4EUn8d2W7!elz%y>5X9Ex$r9DL!=d{PUSk9y|#l$0-+gM#EB_H4u=_AzVpy9?SR<>qUQKCIB<=3P7h$0x1zdeAEw+ zeC|tP^|yF$GyB|j%%wJNHnU`|DS6uq%;~rlY)hIkF@}BUxPuW3QhJdkErt?o&LJ8g zjKHNN`e2KQYX?~ntVqw3bgY*Xpc#nvr6r>46Q!VLuQ`qJ#ZBrr2!*h*hEM!>ik-Ng zMrxh~=_Mu9mRu#uz&uZ6KSwKOqB?jxiVYe?SP_1cUsS_o6-YLa)3%tgF4Jrt zOH4rwPKgsBrc@^!5(gush=LsT%@Abi zkSYtYB(%svtw5h6)u}}+B;h?tGnDlBdL!ZaSQ3Ph!%YevfcQFEa;OiXkqL;MESa%L z6*8W?&>qW9j0ZY)G;azNJCmz`J}CGk76=R+#U>VLs!B(`eq?7(SK!89r%P3Bq{SF_?_`P?PFlLy zK->VM0CIcKayLqKZa~3=y98}Hi0a{K(@~VQlr$Y#PnO%(h=6a1@bxPN2HYgZdkX)TIT|#x zdNgtK%Ap`7+W*`ObV)h^V9dB^ z(`+6?JJ&^Yr*aPp97!m^$kKw2VPuf3*N;_0?R^-MbexcEB&j4FA2`sr(ov8CI==%` zIf$-LOZ>`C_#M@4Vl6d(nlu#gGvoi~_ArvJuWI32m6!fiBrAZTjC*IJGLeQmsbzyn7 z1}Gdc7MG(dqC8vUPh<%|d(s~$A=9KS^k6+rRE?oa_K^mwq29_C^ca*JGSaXRkQUj9 zt`UtEq7pQ+9a=Uoka%LgkvXvwlq*>bdoW|*+mPu&R7hNlO;nzw+!=)Q7;Es!2QQ3j zk$g-fz9SO?Jtio^t~=PFt<&&=tRe{@xFT(Ww1X&LVu!5bxO(;~ z5O`3@&tQDYW zGN;kn27f4=KnIm|P_j&)o4)DO8FcVmh04LOHgVy~ zezJsOr8Y^>p{36^@&SY`f^sif^*WnI(2aGX8`JSaPBzXPKNGuG`ovpO0w>8)vR}m{ zCAuQ{4M}<<%Vk~RCN0Ifw^DKq;>lvVAb=|pkX6KG$E3+DEG8VudK_zQ3t~zGFeOpt zB3wYOLwW@awHTwHP#V!5J5d)-)~3P4OX&7AdY$nZR9U{nG3SDpZ0!+4`bIDI>%sm`8wqWu*7l$?j(x7(9R4W7UzDJfEen+(jRp*-F* z`@c6>j5GWDpW5vhd{U%E7}fctG|3r<4%TX1k`4o5*(Lh(Gjt9+5!jGnpO4 z3Zl#L^-{q&%rzX*jrd9ow!V2>;~Us<<<*mULE4=gU!}$!_gPpA7YTy42vhfkmGL$7 zIgNSl_xx>>tl`#$$M>1PtLGd~kgN&W&sJ{rygs-;x^I7LeaK+x}3FZe| zw(SiF6L*z}=3Z$?&w{8y)ZXtgoS^C)W7GIXML{QQ$yAwhh|2R|)2@BF{*_(8Hh!#v zKqa3(@{Dg(5Oj}kWC^Cr^b76C`M)Sl{`x60Lo^WE5uMCCr(EI14 ze*Pry)s{CWc^BfjN&VtU+MYNX^ZcVeumi!F0|GzY6|!SqEajw9!3u zDhd417s;6}fMUcWS?9l^Q;53`WKn%tG6#i{eW-wVAUiB{=6a3#D}d14!V+UcPdaWZ zKLUpAW#e;oVLXHZJdR?_Q&&Q|bH&bT;+bj=tRiIDtJa~&CjK%Dv_>}LYJ^zXK9;1w z$npj6Um!qaNFFGTRHE=v&o8;Ua|Efu#mgJlp|0+k=EnZmt~eyvjw zjf(WH2O}t#tJtp=i%za+sv3QI;G!kfc?Q-9RcEC-Gm%1wIR$HDmmr{rm^EeowQ>M4 zome>=RmHP$K&v?iBsjs))s;*Km27^~2To#zZ2RL`$TQ1kwrsB^Q#_ZO6rgV{9>oK* z#IFME0eNaTkg}O?v%qGdjmLs3`N}_|h{PkwWpxO&otvZ-i}xBVio>|FYDr@!B*YW% 
z6U&SHJtM&QYQA!JR3i`Z8#AnfJ0Z`-jA)E=k9yVUt@zoAoty~H;_f)S&zn$hr~tGMC6Ava1*Ba!>L?Xu$|fwvVail?gCiX)V!v8vf;?@kk|O=H!?H*(ksCg_Fi6hFSv znw?mQtbl1_JoDt(d}>0D&e(3VfNXHceD1Ex{rM%^8g!6%L}vdzGDOx3VRD z>SUK#Ak?&l4af(Cgx(O2?r|mb0qvJ#f#R4HSl><-5?^e^UP=hLh_#ayl?G2u>O2xU zmJFu5qC|J)EkR12b(Zy|u+jpCYukU@h*e=LO^5TKg|*O>`lG+8t9KSD;f}k(PCf!M z7>%N0+3QmQ7^9uRCR%o!!00Xj?N$Cv_bwJ>@QSml2{NZ3jdXO1E^rD^#W=w&V&^^%&Nn*aru_TeC>4W7CH{Y(Fv?#euZ^E(GokkXdnk!Kz zIYw|vnkppy8`5MwGzT`T#mb7zTgI3*67*wH-G{AxA2F6g&&MRt8nL^&>YHJfz+zWg zL=ME~Tp{P_nq*a0xgVpDgmC)P3s>Bg9E;u(U59G z46xi&8S{iye7$&HSG1OOF!xLpRvVlaJ2CeG^$EAy%Hph;E0F3!ymf zTxnUGeJ5hi3~xGsmcvF&#HflD-XaGHWZA@VzmoEGk`hKa`Rr9CA^wDOF}gq=?<1v+2x*8@NtTnyPD`^3Svn+#fCXJBEvnkn*1lYRTX~H{YY@_FC0WOYprCRI zaI*CQ=+RS3qRr3?exkW+6YiuyT14-|zKu7U36MtLxYE^&5i2W6C0R?f&?J6V5+85Z z`Z3@nB-m6|kXW74?qp3S5A!w62u@?}gXVPwl_j9}s02^s(jPUmb&<7&i&{&p&391!yP|oNPrnnzmNM z{J9)-VjV7N9MdN$pHI>vL@1H6Ig(NxF=(pR(JoSzDBCZHmHVRz!f4RK7?F9EoQR=% zKf2CGAz`Rl?G1pHyD8>cGaPhQY5Jr%h;9(c`e5B5W{ramY_x<`(|{T5AhOnyB*DGF z<{@fIb+phl)516VLarSstSXkTub>~SP1y+~V1*fzj}2nw7*LRySqNOdav*XfX3(FO zpk$&vOizl!4`St6G^OCRL?7m^dhU}VR-PoQoVW^A#L6R)gJTQxFtRvE6X6H9i@E3r zb~R8JS#%Yul716KLOE~Hk1nJJzKwc;7oL3OmrP2kb6x-^^N07z>@1m#@-_*-^;aTB zB&!sBqXW|{@Q~`1llpqVGYdrqJT3oz{fzxj+&J0S_8Iuvem8C${k2^UI*27Ak@6L~ z`0l+W7bt7Q8yk?_z#*RsIpCX!J%_b6z3=KQnv!Ho>Eg)ihEdyiwiR1>(#=#K$(L~;Ns*WfI!>)DVP5GLVG{6 zFX4LmL5^#2Q52C^_{=9R3Q`Im_{2x4P=ed+OgnA!uiT{h%DU~00lUg3Y;ul+1!sj4 z@dLqcpjKgK+Hu!Io>gsVlOOKCW2j6G7EqIxqt~PfSj5^ew-UbT$;T!xoCxsIj#drk zX2EN$Y^=4p#&>O!W?&N0-sHgOr|56`$#Wcc8T!Evc--BPqwVzrP4pt|7yRR;2zAj7 zMjt**^Q3hdGJ%*6>rCaxLumOSGO@D*ZdEw4^5QJMx3f>&N+NzW>^Vurhb%@2(UaV5S-x>H-KrmxC-X8c+qQ_H+S-8rd&%UF`=8Q(XVM9WahGZT6kF@p6$53XT;9O&2vN6HoTTPBE*G1p_&f-7PxuN!lHu>jdQ_!B;uNiOZ45KA z3>2!0ojW7Ol_VC#ydgtx8tW0M&=vz`IAb+N@BJM0bwhe_<0|P~ygJ1-p;L5eSOwSk zG8f1>bKBJ=T3*xy6t|6~I$eWt z9;U8DvK~`XuAYnj1ec)E!-@zSvutzA2|YBr|%w8*dC$ZK7(}8$nW`; zb;xUt`o27O^Y{4ufjj}=e4&da0x;DVrB5z0G;NX9WzX4j2)g|>q=ze+7J@bc5-W1V< 
zC$$*85kfL~zhrbA%8=2LA;V(|=oW*@AM*Rq+6bik>5nwg+%j+>8e2hA_yXgDa?%p% zEx9`XAJ-41^&|K}Z3-3;fDdV%1NgnJnIzfN=WH`Z)_dBvz4mjm+1)z>P<1n|M+%&D#r@Q2gUS7op#@)@C(X@5!$= zv{^xJV_#a=9Q+JNO=5c@wKwt{K%c+0L3oi7A@iHROYnC-vQZ}YKWAz#JlTet1)rV_ z-}ZoK+u>IZW>NNXF@YyGB7Vt%2Kcm8p2!CvEgw-t;vi}{Jgr?2&M^htFi}YSLM*dDudc{NAWOT?aLi#luwaB>UQ(@3NsV(;|neK>G zfxwZm@E_}+P!Z3P*_zm$)RjH9+%3G6r2xx)LSdkF~Z4M`Zi2 z@cF0ubLgItt6Q>1Ccz9yD;wo*kT!tM{kS%%`A_~wye*iHo{{PJS&JY0Y~EbWQ-Huavvb|X0;nl@|I=egXr8J?UNWz zB79ra`Fl%kad$GUCs`Eaz+;HV`Jrb-2Z8t}iw+#@w6iSUqnGllU%DCHwV z77xf+X+Prq_<=l1gZ|lpo=8r7mc%2{F7X+H`My5s|M%^Xm_T$Cf;uw!ICO2fe-qmi zPjAmRnAVOTVTOm;M?SxXQ6n%{np4B^Q#QIfmbc+;z_a~GB6C&yXwatpw)jTc`@8&Y zv8}D|av6W5g^ql_kHG&F8HleEuOPPdfkudy|E^u~D__K~o!e=J^o+!?_Ldw-xcQkn+A^Z@Ik*(7-8;9w})VJbL{~%vsfM#laqB^y9Rg7vdAZ z595S*2Kn;j)t%hzX8!ExG~1YR_6!mDdj@a<_@SX3z7HqV-#5tL#~;5N>c!(2IXgPJ zxX_$j`GOx(xqhRfC|5uHIw>zy7$D%_S4{^`u8fj4LI3IQ*a<0;^H ze&Zsy-?w_sA$$RN$!muB3;Z}<_$6D8S-3$s1^)8B2Z!>6-h3zzzyIh-+9toKE8u(a z130i@C_xx_o5~~ge^hU=zW{bz$!kwfqy_r#jX-+zoi7v+>(fp$lq2vA@#P7iITQ`$ ztnlXrc#{ka!hzO>U>qzLr0kAV+VLG(D8r=zl5`MWxn4TLpMLs@6DsiZT1xyR)Q>dd z%_9nfmQiNG5%59+{ewJ_mv{X;7ILe~#)ZM(h&a~Yk597>`QX2gLu*#M7CM%9%%{!$ zo_}ly{<+L}m$N+BA-}&vKCMHZ>ahO~?ob}yA%CI+eOib9o#>E1*CBtSL;geu{;?hM z)gAI?L%!EXu>SY?ln(UvlP6nnjArtdj~dFEB=kqvSiFKW&1J63aN_<;;0VSX^HHYe zre-n?{!9O_e*X1`f5*$e@!{Wi_}?*K$WGU#QnRtLdFuoN<@2|^T%MOh=dRz+b8@_v z)<5Ir`-a^AZ2zEz%QSeYSo{eNK!>jQ(;i5pJ!&7`&PbmCe2eSx+DMbX!=kB8QTj-0 z0h-{oMcNE-6kczntpOeJ`XTKMI0J70(u)9n@CG9t4EPbR0O@ML8XbyCK$`rWl4QJF zkWK;A!nf&NNE5gfzbthS=`_GQ11RbO(nWxO;e9}8uuYta_ZiZQ0Po>_jdU3x`PKVs zq^Uvp-i`MQ($Ec6jF*Z74WK(dW+)-O2+#|!7SdQ>P<{sJJ4pkE;vI@~EZ_^gW=NL- zPBNsZ2}nBwmi>fpVFVA@e+c*?%>f*W*AHo|VW{^*p%bKW_(CngD?mCJuoZ6t!DDkM z!n+>n6u?cxpcACi0QceDh4cx)XL!?)E(0_(qp1Bz6L=l(L8OZSoy~C-326e?;Jtu! 
z3Sg(<&;im~fIs8CN@xHd^IWsx#$jqfrL>FNguE+OQw|n81$Z+Q1ny)s>~Gpl+tNNHv({Ov{r8dKE=VWtD;%YZ20#s+14^b-6$5u1}#&^@*e6`kF_e4R=6_N z$xn$=dDX5ip&N5@EDN|~bl*|+tDWJC~Bl=y;Kq9X^FV`0>bB8+Z%i5>qwj^zZpe-WfOh!hM z4q?*rIzN&L<^Mka=WC!kt*wu0$8fD_AL93KZMeR`RqscovcT^Sh6z_kqA!@z6?-ecf92C5vA@zrPGPzG8va5@7QF))~ca-FR0kl)Va4>Ir% z24;7lyV@cDh{;zou#tgEhh;5wVW2Jp{+$J7SKkY&pY zzz2lM_!5S%SOG5JM84zLv22CG0l*04{HC$&@Et0oBO!I2^)o*NU!SHy9DE4^&pMVp zPcWGu7zjZe{eyrSt3WWqkPv*zV%nw_5S(4ma{my15cy)~hL3TcivvKth%O2D2M=$a zoSa%pus{0;@N5Kv5dXzOd`d!FYYEoTGgM$lKJ-!4ZjztI_Z9~52<>#wKq65q!Mflx zjX)Ue5ah!LN|DBRGs>d2Dmnx~1D*l+4%D%W*@_OKj2QfopP^HH4s(`$m!rNYI`TZ1 z|E&sTq%;lR^!z8`imdU7Kr|Hx|xDVY`IM|*;|UAPx7m^22Bz=G5vWtRXRFPO?wc9nlD`ljXE z%B}&SP>mb;Zbh{yGhg#iPChc>+(C|^JA%EE%4|=60X)D3pP~YIGFP+<$3DcvBiO?u zj3+ZN)sylFKg^ z8{j5;ga-4x$k$d6S6VGqUV+{LK{&ec@Z!M@DXJGz{2(8HUs&EF4AvyIm*X@9_z-~y z1_>b>M0wC~w5+e5!NDGa75D%)hVtJ zhab!f0_i%W5oXZxX3EElgesi<5bJw-Q&|)&zI?IJ2UPx{p~0SBJZd9SVP3QfsdNek z4d4Y))M1he@(P6iQpCER0wTplIvu=%q;p7`rq!eHh1Q$kR+E_YEkrF>V3L#g;j^h7 z3eNn{wn9pk0xgLiZnE#lWQe&SF3hIX{-IG4318Uzw{sjZ z+|)aO_62Ro;~DI45*TU{<{xANzc6X%gC+syCgvt)46zYt+rJ&^`SCo1zpF6e@4U&l z{f@>NH2?H=;0?E0j?wg8RmvN6|5x7*{F%?+-}4U#|NrXSfgkPo?YXT7uOGdCtBc{~ z+rBadw?X|5bV7lD0IsQgPk0)1@^_Yt-AuWqZvZcUacRZH!D+hO1;G8^=U)x{-=hJX zLQ-oLsaPDt$10-Ret=y7>seH+7NC&=6{`z42N0m$!=leoag-U`4v8cSXTxO8`8}4* zf#lrv5$H)#HPfCN+5tJm0aOMY3#bC<45$j|2dD-Z3#bmb9Z&=C1Yjq?8-Sew%K!n| zJu*4A|4j}V{@@Sr=YD{{@dNy$AK)cFz}No(kK^eMk4#S8AK;CCfG2mB&FlRvzgMIDzH0vgqzemlJsePL0C?c`C$O=}nYPJlh` zwUWnqg}@It@*F*}5XC)3`bOYv{~&LESSW{CX5q%7hK7*rAuTj;h0#5Lp*6u;J9GiZ zJ(y-8&xLx0_y-F(?lLIDF3`~U)&MHe7ePZUG)#tb$Ya5@PT$K5z$znDz+uE1ZW=DX zG^k-R;yD-RInOzEQ&tn#wCCeVE`8#jr)Lny6FUT<&TaVInsJ9XTQ^bA^|;kg{QGRbb)l!F_Y98bmF4q z-|XWl*PHyZB7F&zbgWIY0#9F3f3njwXPfDBhY>zMuJ04V52V{~tB*ZSFcb-8dTV@> zK%V$Pdm1KY9FB=efIl2Z_Ml0Up>-~|2uJ3pp`Z?Cs7->g91bM{5^9<|jnU9>@F7=d zWlF`(SXkc|why9h?#W?%9LoMCI4+wKJ;(s-ru^{8w)K3r?Fi2Kv@@)`9K(h0o2Rzam@H?Eh z@Ud0J^!z3UQ?qtnKm^Y zxVAIN98F#|5-aD29P|ND59ZFr7O)FKa+fQteEvaiisDcXwl=QeyWa1P*wUv?Sem2T 
zSw;LN#6dJ~<4Bz;dk?0Y%wtdW{PWTJ1r5eln+9*~e*W-gmx>#K&wdZwRo*{!aNzFi zhUpGZE-XC|wD6Ce_DhYIy}NPeO1C#p&e%QP#JjbZH)V^_41K$eLrlYJbpPz!m|xcA zL-SvCO}8nr0re{S&dXl-WeJsgFY-UWXgZ%MkhauK)=H?n+N)MmZIzjYElVv=dys~; z*6+GAs8wdy)+%Tv82_vt%W*W{ODTD->#EFAb6Iah*=%3@zmas=>?W4gm(0-Jp3l~D z)LKHNeYQz+-Lm&t_R)VWsfzd*S*;XwF+}l2bg@e0Utf2J=PyuYUConzJu&x<_8_sum)i12yd=~?bIb|dHl-;kYru0R~#+v5Ig(Wqu5qr${BtJ2k|M`y%$VV~QQeRmI0Yn7iIa@}0=x&B4B z6L;KI(qg1>-IonW5uNXEDSEta-?Z9`<;UaCmH(Can}!v&U3x^$v7cpJG|om>$18*SIWVi_UU#j886&=GEb2S7 z?nJW10gl?fMelTS51O0}SIEoy^HDm>j++pDuy~2z$lm)dC5b%W>BPP+FzlIM>wIj> z-6d-dy501BAF=ezyq8PU7tBSIT7y+Lm+hYR@?+21J?Ya1V3ez@9@?CxG+XU{T4%Ub z)cs`f#Db!a6SnEZ8BxEP-5*&0Q^g+xwUjxpL(VG49Qv@YxC@K&{WXxX))E$}D{i@I zIzK&u^LEk0A^YBLHMfdbzI9;5UP3>o*YDM-^ZZ|4J1}^|rSRVQvnTDE^L1Ik$qn1? zJUIEl^yQx+{f0yjt@qSQn^{v7)MRwtG>=mh{d@P*T3NFw>*C6?3t7E(WnNmH*xPyM z&it)OBOd+n;>y=fe+9QT&K(*sr{&|gZ8MWFv`miZG^cME_VRrGow9~)y~CTcwgtZv zC>n12bw}0S>04GDJ6NsAO~@%Px%N5?qxTu@x?%r1Jb0j0*J#5}%c|>pO^B{nYW;Ax*J=0C zT)UkQU#f;JSe>~rQ*-?2!k(?S&&<=DD%AUB=A8P8<*SXT0d`hfbBepvy!m)>?XbwI z6%$rU?9T1krsi3&%z4Qe>gv+Z)`uh`)pJ}zih}Y>EId0|8++7iE}Zl_{Lr8ktm&`! 
z&DbKlecgF5yUV;|bJS0}PfJsZtj!G#uP^)jDC+jL`V#)dvvdB;-Jxu@;}&ZMHEB}U zD(Tg`_v@2}O^EY#DrS9|AJJ{nZnIw#N{z=Is+eE@bX*?$>elBynhal7Oe#z^u`+44 z9kTYs$Gem38@m_2ta!0v(2L;b*Rtw{A2N?SS*mHYo?RFHSuOg5lb%C@o+RrlZ%pad zgIlfM+8SS*v3$Xn1tmt!(~E9i@cE-PFlWcP)>$RhzAr=>N8IjJ>@!<2985I6%XiN|EMA)V2CnM@WOsYUh7M*F-;j0{5t|Ull4&61psM2_qRa6<(+M*YAH{4@0CdvWZ)}+q8 zsux}9PgO4Jf2_$tJw-V^U%z?uIaMtc*5T=fb&Z=mpJ%AFY!b#y9o9Un#OCS8Y^j0w z1XjMWVd>{UToSNe?;o41?gpXEcY)li=I}?#w zzGZX89>v@vd*A23uwP3pJhQsA^;X8Kzjk+Wo9c07`QW{yt+itpy%z0DUnV%YN8|0r z;@QdDf*!B+Xx=z`k$BUX9KZX07ls!5T!^Ua7Tqu+`Q;9;^iABDg~wBURRZz~Z$+Ll zw%Yo3ACFZO+R)GN{eq&p#;#?8-EkT&N`)1%YB8@`lYOub+{sCievofS1hG8e1~?1_;%XW{XjciGq1TK%@QVN*ov{?_b6 zmhk*O2{N%O1Z3ww(yOaczR z9F)0GJ$Kg!m-wjP*IK3AsZGBZRlK8q*^zDKPIl5E)2FDz0y7o#t93Gdz3{No*4XaE zJ-?Ts?1{4^VaX#91_#;qsB-v~%~fr^F(~}=aTB*&86j`%c2>8H@43FhxWq|UC-Kgw zQ}bf}Xxvy{djYqU_U~(%p_^yCt!v_d`I0fUr(1elSYq-<*>&lu`txg2&$d2vxS>Mi z?YqEnc5{u;YT5m-ug+Jj)w|&1{$W(VoIQt3UsqZde+o57*)I)0ofE&lwjz&BMJw6P zn5UldhCk}w)Qx9L^Cmm(kyQ958<0T#VB;EvU;}$;`X5oBuV1o?xOIJQ`M7zXtLDcn zESqfKZRNuEYc`B4wtwiM6Wzt5E~2=1vzub^lc5$YO71!1Z->vF33 z`kRB@qd7gk+KgS6_aSc+CuH_6rPP~|PPo zdhF-tM$Oho^|!iRw<_U#VJ$Fr&WmN^pKN96Hy@rk$7HwB&8w`v(-Y3;L~Bp2ef7Cd z>iDen!}P{(onUw*)wjiXU8+XoEZ>eBjmV>V~UEQpLND*jlt zD*DMnX$;)<-@uDvu-)XSsmARQ_N}}{L=7} zpv)+`HY11Ea`ZrUGP|W_Tg{D^+o&%$1BI2AX@>RI%fs~6WHc40>6N~k+9%4Wr1Y!% zp^tCktkyN}d?pRGtB+H^p|JkRy$zpdRD8X^LU_o`Xa$B@%}KBNzg+Tbl0BlvEeySu z=rHVJ#dOp1OCS4Qo5;3|Kd`c}c>4U99<_#vX&O08TC#FouIZINyP{*WuXy#9t&^ga zsEeaw)^ZG*SHxM3+FWMu#h-i(izbfP@JscCVROqj7aQIx)MJ~^R>l&V+yIP9qA zvxYj=>WEb7@}MF71(Tgjn*(!1J06c4yD?sE=o|a%nOlTS#VyTe=g0gNZ878co06yv z6|cuk?B#oXA?w}aQ@#%}&6W>s%3a?*SGb_6JnQ9AC2D(9!!HZQjLTPCVevBE*z)(t znAhQ=p3XHsMMsDEPKh+mFU+h}__Oi8ubuzPbXUFS`<66JHL%}VW$1W0*JxLOvf1O- zTU&-#u!6!`mPAY#`l+#@=CA%|w=NV_yPvHaU3Kfa&z@tPDfUg{pDzuMiV1uEaqQfe z8R0c`M;fBfNbKLt*JwP1f$x@>eP*_A!z|A+cFo0_mPKAi*^v9Ld@0ylxU$kC`O`TRBe>V9}LMJP1uTG?lIqwuG@t-t%=B~&! 
z<-$JvtE-OfEbFdHMd-vV@nrMMr%46DVX5n)j{XweBeV2axRFrdT)*?5`h;QinmFK= z(}zoauD!|2y4$+z>_Yu77MWHBpHEH6b+ut{o^j?(&m8`T4?Pck8DYFJR%pINb>mQ0 zLgjBWYr17{GByuWAD>lw*0a9>|KZr&eLA)U-C`A@&t|upscjNkO)NPkojSchw>I;D zX2B=>8EToYM5>%!=f|!KKK(SB|9N^`bo7Lut&J}|s0cHOK6UBY?S{GutXb;R6k)1O zSJw+a^^Q(?PF@S$Fy8L0Y(w#VRH9F*XTBQX*HmDhoW3w%QRx_V?UYrcrn34=ju=HW2ePa< zR+0+03;Pcj8E>0mDh!zzTYOge{QI*{gJ0k1+IK>@R(`UcN3n0Ema#e09SkFRknG}@ovrm3?GQaVwrca<6Tle zy#7$}lF418msGDfo_6;0z}@gm!c*5d&ZwxWtg0LdFpMgVWz^myO~~NY6T(qn(~7&2ekJ--e5KUr`azPzV?pb93PN2X>YdP;)h~oi_Vc- z7O-#oD2~`8{++-4jPX8DB#XKWjDHUHk=+>6w~g(|KdK^J1~yto*l`U(GkoAm?i63M;*%fc%ysta?&He7vw zNEq6*c2jxT>zM()rp=1B9=4`XHQe+}dCsw(PYo!`ybpISSxh-e{e0ldrPjZu9_k-` z;m>X9_4r+qK98ds|I`qF2}Ln^HR5c7&6HEk@0Wy*JYGXD*VN|`O2eR@dw?jT^Fhf zMy~QW{3hp4VOVeL(3s|ovpu(;H)x5{X!$9wW-k7a=$2cB~3#@E^* z&9C3yuXTNQ-#ANjV)L!iho=YMYE_*%FLhh?%>_Eq_p2;=H^p8nRc^lW@K&QpeXWA7 z;*vA-f>wTfRH zB^jYjtcQ1wY<^$2E#NZ`p3Yt2&wt84urA7CM!r#VO-!eY0VxLzA1EAHUtD4P>+p(C z7H`Cw3JYzkS)3*L(q`YdYo&dE#WvM!@wz#!WV5lxvR3O-#0B43^|{k)$4zu|U(vH` z^va+?y?XleD&_SpdpYL*u}kNh8b=(sc(iEEx~SMM4`0+r9NwX6IrwF!a*KsQPn)M> zf{gyy8(Q#L>AdTrEc=CX9;ob^(pY<~R7s)9arM9{*4nvi3r;n&EthHMv}Wa>Y{9(u5mFnw6RJ{d&>T@g|%YA;k2H@1_w!2H|N6C#*CF8_p#ynDHqa8 zuM`e_wav20c1nHLiJi`*W33mqE}S4qKh>;MdT*-Yg!&7#@m9`LvGrU#@64tNcbw~NhF?9!(6Q<_wD4b*UQz7FWSww(y?o>J z!QJvpHy`^lV&m*`r#Ba^Gj626ve@yv-sAy=pEh3`!}B;-ow=>tE2~?C!mSvu7QdZ6 z(SXwHxze2aw*w|b{IuL4>%@_uoOzOcT|SPS8}qip_w9Y(ff@VQ{3^SH4bjIre%@n- zoir%PscT)*ZF~8Lm>1P|2KI@J-E+ighV;_Vm``y5>bV!=2F8)a*>t-VyIbm(Kl>6o z{WHh8)Z^WG?dzAc{G_dyqaPWWX?}Q}Q7X7U?P1~A<=JuPFWsA3BiY+Dzq;U4Phrc7 zsTl(`CytMv=>KF#W8bai6@eEbCQq2-=@hn#ZE@RiYR3N8XOAekmSEfDY_oRVn4SBZ zyOoq0@A%xZdcIZiC&Tg0tAY~sj(n-Q^{GlDZm(K!)~9Q7hbI zMqW;|s_pm7i<1qJL4mOwPR^#v&Ehjs$?~2$!MoL?ia!y(Z|Wa2pmyG4q07dYKUzJt zw7WDOx;?@8RYtsO)2A&;>JxKUho4w$dF*pmu~TE?o5(SmpPk)TDqeh+WBSJTo!b16 z?qMDmS8utJJHA2UR$_g}nLUTUs;`1HXw1*|03$MZhc`OQ?> znHhAd^`X;8t6c@x*4U(IeA!s0c+Di|$)B4?*Gtj_sfR>{mqVM4s|xQ$QCY3e$J89m z`Kzk(JS+Eomukn+gZ*zt%?gOKao3a1a*vrB7_IW*?UAjeGd>jRH}mY46@9*5vaVD$ 
zidXt^?5*`r_Ofqv-wuVEG&aOIU47`G`K~LkWZIsdI?-t*f1XMecx;?+m2&Kh%l#;{ z7t&fVSnqb2qV*N_%u35kck_=t9PW03dsT1I&ZsH7AGnqN_Womuv7(EeUW(?sGuzIy ztj<0hW3x3?uc@=jhKmZis$xsY#v$<&>ZC)QE~M;ld61j>Yn5taY1!s&AErJD538u# z@q0bRJvuRMM`_8ly1QQ&E_`(>v=uwt_X)-W_Mfrjzd6=DTKL>!*Qi)Odhmf=L3Pd1@%bLPxB(RybMdzjDRX2(2_ktewjI*XV^cRW_e^bcyHEm{{#M|Mxz2 zYYl!W4IY#2q`-cy+V~*Wqu@lumI#IShMk7Qp7c;~R8-wyYOrd_l8WMwDyb7*&R*IS zxFBlyzAT+?FYRk;7W56OtUTE`Z^ld26!rRzPn|N@L&|0^>o@{Akr)$i6nn~gPm}a&%?RV)bVoi<4|;!QZnXQ< zVbr1Hmfp3SH+hWS+h_N}F{x98gZ*D`9CK-iaqy(IzgGCnj21jlU%9bn0X1{VKIh4W zmRUh-2A`<1((82kX-mxUpBo$=T{${%ntIHMd+z1;mwLBkJ77vbFPAG9&$LYFq{{rcQdQ*ev`R*%6HFJLXwC^*uNktE5 z0*{~PCoU*?tg1WxTIq`g+q^vjqR*dK@*8r#F8g?n%lo;}-97sRmt4!=)Kkx)s5mWk z!=gjMsniVe_XHb%GoXA&op1VdeR;*klg&e^8%LT?V?Gkh*4q>6+v9~`eV*C<=jo~Q zt<t#WVO zGJ#E2elY*#?F(N?4^GG^Sh_Sk)JbKlPi9`=X2Ywu~iKP>3l6|1QGNsqhNW?Z!o zX}a}*8ue#(%i-XvgxwDtnzJ-A8^1i-srsb!OsmQ|MXcV;o%%UBtsijtqoG7qyY$J@ z>j&(704{gEL z=9k*T=bIj`3d{NHX26%2%D6qX6>*jeW~COq4Gm8z#F@hpy$9v_HWOP*Cb&v6*DB2M z`ZSzc<1~6^kfnOGl}*mxtZ0|a{?Vpg8;@VBh|6))s@=lszVgMR$!D5RA4zsTW|`T# z96}|Aq&Q%ARXGS%T zpXiw$bFt9TJwoU{EN6pi(-edJUq86GuVgI?)7$g5&uD|pb;%c{)8-zCZj4D+jM08S zW^UxFgy*Lp7KS#=YCiCy{KVXYRk(HgaZT8p7by1s=z4G`w%O)ou+B__)#k*lmB@6ox7?K}Tra8By z)T;B)(vqmWhmFr_XLpK_m>>I?b)J9oeRqp(sT*EIHa_`!KPzXNcCOmEWfoni!&%YY zA3WNZ+O71$am$+Q3nh;xkN-_|>v7A@yRAR>SZHypu%LVNnIp*`o|%nrO6g{tHS1Z! 
zu+sHAGamhV3A-{wX+-nbfa`90-1A?Xvo>JaGfCLcV4jsurTG`WoVwMc=OB$m2}2)7 z9jkuYJED42&atBWhu7Vw)z)G|A2+z9_F>)O^M!rFCECiK;`*>DL4%*$ zJAJ9-;dcLfau*=k`285iTcuqF&OeK@;L0^Ix?|ow`}Jbk7sI%(HT5T-q_ZV+BcD9j z++Z^<&T{AZj8}KAPmTTLcy#^E5!O|6TA!|qiT{7tdk?rMmS7eLV|!JRZ;a-}Aoj zd%ySIy$!X!(_K|vUEMR?Ra3LWXC9$zeqc=YaQWm)GIV`p?}{^(XaYS*;$^l|U*QXrl82TS`VB=K8 zUWeW8J`$o{E|%gW%00a_Q#IwbmhITHH?4lp3e`cZvASVH?8W7;Bog(5^)#+grfYfA zD5>$=%UPqTd74{LP1jDODE69U`KavHzGK+cdwFbZfF$)ogKFCc(L6LK7LGB>(`30N zSL!iQ==CvG?oo~XeZ2O>c8ZomTBm^>@v|2k)X^Nd?1!RgY`a$q`bEAJXAl+KvMi_6 zbK8a7a2jHvZ5f#9{3LU=Z}`yyZ~pyzmucJ&?7~i(+VqioevVmNqaw|S|KKaL|Bc zwhDdJg_@>|gRp#hrh*^1|=yhh)sXf%}DO1xkj_5lLPI@za(HU+Tz=pp=4o7lH1k&1 z8@w%R!&^ME`;Ddqc+11AIv)o-iKLLV@0}Nyn!91+-t*R#FbQGPPZ0t3r9R-f|Lu)#au$zFKtA zpas5MT-gb1Yw`10DqK8Tc%we(c_cSgL+FVZ=T6g-Ov)|6O~ZDrk%ChrwyT28QY<;p z58Sp$-JrdSt&xQneuWkRiW7PG`7d{CCfc&6;@eUyB)Gtzhy>^89rQExan{8x~DB~S&qU*c^NC#r}|84$LalnIz{;;6p(9mxDI0- zvAe6M{I#UwQH=dSs9{@$5Y6=EvNX;`n{X{m?BVyV6oS3wlCCl~nvy3;Wowt)c7)Y2 zM7=e@hci3W0poomT&Q<8td4u8BPY_b3e3biL`w8V^VXgPG|R16Se6{ccJ7@X8i>tr zRdoV>(@z`bF7HJ8lkoiz19&-&x|PE+f}VU6J{;V$XQFLzRXDwH{*>#yxXX9xtk%m? z<28<)*Nu*xI@0n;l&9mCe&dq5yn*fwOU5!|YIs=`Wuu ztouHjE=EOOb0J!2>-AlIE&B42ninwk>JDB84$M5*d=S|lTd#lb6}N+JAc|zg{D?%$ zTD_Lrf&;&F{T2`RyE3~=8K2ORbe>Co<>#79XXK9=j$3KY?0Q`h==ls@lHI_;vXa#? 
z|LIyun=BkT9G_r(rTN5JVsqyR`R;7Fgz0limOOldeZ%4_j;}l#qzr9KzT;Wa=g2?i zxzSGMS@%%}dwn3ATOFBLKfS-;jd-LfL)NCEq>)Yy6NL+fP4*bti{x1p&5Eofau@Lr7AyB=i(>Q3N{nCL z3oJHOy%lA4AyCP9&FZ<;XZ8`Nsm84?Y9S@QgMr@jMmx(ce5z`4i6?uB4iL+iVxgQp z)H5(dw|KB5JIMWG5unByF<4K`9fo&bZzx8PXJkYNGxj(7Dc+`mN|OOwW>JtuBzg4Cg2394pU$Y5xt~;ppnXVr!Q9=ms|O>Z+M*6Esti-VD zkIFBF%u_c~?yF<_6GnX)A9~G~2L}fvn9n&COoD;cQddOM-GNoWipst%l zYDoU_X=7iZ;@Jb=i$7H1y`s z;PO79uzu9%U_;Xh^`2U8_a&_{_lH@`PRQOVq}cqaKnq=-#zeT}d6KcvNImzK1=YGE zYuF3nwhVY|s*s5)mmAqNi3m4+x)JsIBx0+qSvhWx?&hw84Rj+3leU;!`mCZ4*zBk7 zdPZ7|^Q8B|NPwx#Jgit*q^kkvS_g9FPgoV!sI6pQ!E|!o78vZOXw|My4iFDd%sG|_ zBROWTPTY|w9KC1kz>+@evJ_F9l~&?Gua!q>y$zc;|8%kG2ys(3ma@3IxvQ|QkX_k3 z@?Lvkx#1hlc{MM}M^m#j_xX*|iq&g$+<~vF(Gc}G_f$uYYL>9>$z(luuR#xq{z_&h z%VpE6!&GF_@%@2x_-SUGPHnomJZ7d%?`6@@$4Jzb_(VUgN%s_&$E+;%qhYrr3s#rpB8bS5Fb(=Xe3>}5IsC8k+bN;BPscs$D_FpwmDtLdt$#9^sRk$mKM}Z zu0%E0NSiE=9*cU>J3yzfM3l6>bdodYYO05uhuHKJ3bS3csY)|ngDC_RMl8IF6NC2L z>T>RBeGwu#WlDdK@ykFP{S6G!<;M?`67O}oH#r-|IS1kF7sj{gAKI>Lz|PuqWslA4 z`qr8dE{_hYOIL64h`vi`(VauhJd}{geM)o3cy=W$tu2K?moiKr-*RD|)=iSm)pEI| zQ$pL-J~f?U{zZy=BeR=pQDna6C!L<>O*!WMjYXArN>0S)tc16#tc`C&%%J2P3uSTM z_8JG+!i(Op(BATxwbQkMT9}u%!VKNZYh&yzc8rqoXvk&7E8wn&Z9nsKTl>?m@Kog9Ii*rr3>J9-aYJj%xE~+kwI#gM+dd%(sWz)w94(sNKIn5gV=CRp5`s4 zj#P*FLd7a{G@);$>222mdn`#w+BHdcc*%Y}bewd>6MJ1k-Lm9FlelKhwvzkRQ$;`{b`XDC@d`aGw~jo@ADCL%2(aj@u>~j)yCrQcR-88n98Vwl#%yf zLh9QfV~zQlqr%K1O?g;PiO==*Hqr^%#oR0DLrXPdO;jo(ZK_d})!T|fJ9rCb=2SG* z=xA}(U6=h!Cm0XYoAN6{0x+S5+EgO>)P|HqFdI zv9sddcvmLCbVZIObs;bAg1&nbJ9Crp0jsIvw3xuQX>|sx8ty$K>Ivitt!s*wB`nG5 z65ZDRYvS9X@ma>{V1AmO1P0<%pT-iowQ#g{f!yJ+ zDI)PD7U-50>~yWU`?1|H0Au>u?Zo@oeR2M7n9xY^i>u3;h1dD>YdJeaDJvrF_i(OU zLj~3iYLZ)xJRBQnUJK>I*_wY6l~&z)+r6!9HJNgL)S!uU6%-8SJaDS26bh>!m3W<= zd`Dnlr4}7EW_P||1!y(7*-0FKA1g)a!I<2#sJqgr-)Nh^5||ZC>n4_8Z#ckHV!hjCkejA}Ng&vrpY7@1T|$PLv>kPZrm9OvbX|y6Cnfg5m=hABQK!4t@-&IVa~6fvUd03sFA%Nid_~_k zeI#W#=Gw5D0PR#cJ@=P`!$cCSk`40HhoaA7l0zzqN+ouYTJNdRB?G0Z-@iUiSs*G} z7`F6iX(3rttmfJpbC_r8knym(OTa+ESvjflu}1P&`Z)JHkp@j_2OCVi$>&!xu{H7g 
ze5R8XOWCoVvmGRnC#NRzLJ}w8{bHlgEIn0@s-l<8#$Mk1n9yFR$%bY0FeZA^%nwXCNeMKS!if_wzST0kcumoKa6>vKR26N z;N(^8-iZ_QuV)E6x?)#mRTkWxrB<|CD_4Ap|K%zrT#a=luyy0sjmzVo%dGq`$E!cn z7*ib_m0+HSu|K87B)#A4hd#}Z)G}1QQ*)U`-BPl0{P?XA^(y9vNyhi`TN7*z zj21@4_v+%$;-`!uOCq^K&t+{&{3wSzIG&Z+dr-rJtDoggv_O}aRoq3w)zu0_8YNMc#+%@Bpd`VVKO zXuIB3`8cC5>Pw}^q$@mQq5rtjmIlyRbC0P;I3;9v&k%c_WS)-fSw(%di@LOIc6KU< zF==@Iq|iu$Tn<^C&9aw4RAA>|hQ2Gk{9{3V!DPzk`tJJ9=?cj_#WamqN_-y5ajqzj zue#qUaXw;QGqpRv`RW?nys^b4Qs*xFrD9lETxo7`*>NYno3L8tc5KmMZ|f=PWz*lP zuzO#n^umW_cFI1y!&ljHROb?DJi0rTv)mBz^im;0G*~FX15kV*etpJPSLjoVmr0nEOW@!!jXD0Vt6Vj35Qq>dzHP-U^3bQg z&`1I%G#f>;i9!Q`Q&<`j>{N<;NbU!zb(S z_*m>31%nXED8orfr?VsrRul$L^yc$OOnXrpwWaC165J3CCTnK{L~;k)@-XP&&Tj2ge%LCa5>>?qOn7OaoI~R$^CR)YaOM6WE2)!6D zAQ`u$kb#POx8OT7vzLsuvIyk40RH(;w?rRZ|{>)B`7jeXq-|Xdj?eko_glVZ= zB^&~;F{8Z6`#o9dnh)#{Y3X@YJ4N|9HvNsg$u^_xGI+^)zU3j4u(zsfKMiI+K{BAL zz+s>ghW#u~ENAt!ZPyP~{WPOqc}{~k-qs+jcTSq=x~`?=?AZpS@Q#gH!i;K%>pfbR%g=V|# zmyc-+Ep{ySL)>qMhP~?}=hz-GzgK(9Eo3xVoeU%YN}f3g=sIZD@R3GpWfMD2x%7z6 zhf%Lwwz1je@9&Q)a+vBa66Tg=&}%r~TFf~;nQ!+nG9z_C{@YC_2m2mfeIOe(2UG>G%u-%YA%RHS$Lrvf? 
zyoGSvm49vK#N}Mieq9F*%s2h6>WP@OWlwWc1J)uhvoGZ**_mJ@?>>z$b_L7LsRbg* z^&AtQAEz%sLuoCr=R8Z8$ij9P`l{RTfgZl{dzYS*oTK~Rr0u5DlT0LG|6TBG_h64z zSbnX`eA4#zsH_pbhd1Gk1n(E;zw%(5(vxnu>Ym}<3Zo6B@ahh8D>EpwK8Io8AYD=^ zSw$N8=tPu}ys4YT9Wf^L+lF)_ob~TrrjPV$zD=wW^pbCTSM?GlSiIxQ^wrYK*}FM= zfC`SmdWdszN~{WoGJmmYBih$WE@i)LQGl_l5HEaW3OxZwOGDVOorCEH#Cr2AP2Q?0 z1^aP1^K!jk4ZjbUG{~|kfp;oZpQWoWd`NalW$6=I7(u*;h!kF~^tSmkCpN(4{zKDYncDhVRD4KJ%{^V`(cZ1@~L~DEx zGdhZL#AsMrvL%dbCs5jyfX}M8W^ZMmffm-`X6GI34ZArS_ANEUTdsbfBkH?3G$$87 z*)BLUpY_)?cg~)*CGDT>tZo6Zwe$XgCubh)h<5^>YA0Q<$T2YQCTOx6H=+$)d5i(z`;HSxX;odg7TfX8EQ+ z5Bs^*t!Sb;@cs)=&RFA#&Y-r*X*r-Wa1*6Uk)$79z73;%)wUdcXfU3}-6(_EE!nKK zQG}$>+7v&9y3~fhg15|Adbs}$2^H9#p+%7cEkoim>Pym^=bZ-`wB{zpoZ+Nri#Am1 zGu$!g)Z1_PsY{*jNGr5bSm(=!BFTrA4(yo^y|lb~?Tc}Hk^HGhy6<~QyO&E3l^mOK z@}NZ?h}d77y=(}E@iYe&foJ3F@WOz@(TWbcfH%jjCbso{mOOfb=-QJMBp#Cf_Sqbw zo~0X7+UjgZuSc3lS7H{Y4`6RpkWLZZ!b;DizNaBPHDXL5cvFsxes52(`^(wo`RAM@ z37=h5juaQ35sSUlwWIWk2bGH$>5CKNlnRXJ_!`>YNA>Qup7kJBGW3faA6vdJmK>Rf zoF(AY)OX)_BEBY8f)cwdQxd9*`e zZlv_Ypwz^5qeWVoPkoUT$nrt8Vsg#TbAW{`R24OGS!(rKAq}tNt}UA2ME~3Unp6+z zs;EDV4_tUvK$>Xor|Y4SLv&oUEyZY9Zx>uGJR5{=#E`Ii-aLyT`nvt#cCfYV?P2em zYSUZQKv`+{xL>c{9t17pdfg#8z8$CP;iF*NckqZkeYHLQ?1(VDPzy-YzP3@{(!@0n zugGQE#?9NLNeluH1LK1uB)0U5(j-NaI7#EqAO(>-?yT_1P;f?0MKoNdR0a}X{s3`fH!a6fRd6DP#WtA%HzC1 zMZ6a%i@yyj6MaB+k`MTl>z7#$OUrQpuXyp^o zU6}wnE0REaWh&^cN&x+}sbH}F4H#-F0K?7s5Q@NUD3?E!i-O;2ET&oZ!KVEqzk-*^kwsSP`7{wjZ1jY384vt z2W=2Kmhqqm!oa`)7#th~v*UeW_WK}Mm>dENQzKw;dIVr*MgeAS5-iV8f?kL_xPk{G z5T;fkVDVsXZVoKWPlLt9MXh|Bg_TZmre$;IeujfCC|BGKBRd%48ioCS6yvm&jgvWo8a2*X& z-D%t-=06bntQCSgkkV8aJjEYf_*7-LA;NP56 zhV&aE+{fc1q-Fn&0Qji$=Lj5-KFGP@@qW_M|B`-#0YL;;H>ia-|3U%yXpQsd&qE0a zN?0F!6;&o~~K3Yoy;c)KU`39|kBRgfdOSY;2o=?i9XlYso*qF0 z$bi6MeKNtxsIu9E-_lD^QBm#Op#Q;PLydBT$t3N!!QglFNlZ)#gPqrq_J4?gsF5(4 z)@S42(&O=VOiX(a2G?TN={K~1%q7A|A`E1s)+fSW+sEV0AUz@hXZ8lYo|e}7K>j4c zy>$V8M}L%wiHROzkn})LPm90o{+=F^Lk&O%g4x~(HNpa6K;W=B8x{YS zK8=lyzya#O3?F#Udoy|HA(Qck8vm9)<-h?}R;U6*hllNwa`Rxy&(F=x!$*4~^c!mY 
zTlze<1FTGt0bvxWc5H)*j0^?SR+HQZ1PKr=Y|NnF>5mO2gq)y5IFS%~gaa9hF5P=m zjaq-(K3)LQLlHI@z==p;fJlJ49Jeu_m2dw&y#+fP8=`^rK|s!gO$KCSc>Q$-=O6w& ze=hx$p5UD!J==jFgYYv0R1zFf!a&CT9sOf`J|8>eaDWYpK#$C1*n=G#Dj?9q$#7nO zU3~CY{pI4J0FVJ;5Z6a>lL6Eq(mpbmA~vprzoCZ=9Qh7zGFU$sf2aVVha4VkH2=Hy zb0LG|Tjruj5*8Lb0TbuuAdtO&aQp7v2T_~kzo&;&jga!wlaZ8^@?x_GWUmPoiN>4j z=^|4&~L`V%2GKaC>!wtq{XgvU#6oQnhHro(7%!Lqh|@ zCy5Z308bL1{T~r;Gfcy#6Re1ew%jx zwv=Y?UK)nIdl_jM8E7{3|9iyWmoo6|rQwm1;$g70g>PwFPrt|f`_lhadPW`|MjD1c z5G_)-ce3AraA|5FZMGC(-~ryTxPERrZUa`KHF^-E5) zar|qJ{MIZX{ol-g<+pkKYx|pWZOZxoo18E}|KbDj6Z1hZg7Cvn{L}BI{_H0#-FZ%WW`mmW0U-v?4X6@U1Q-$0-7&wgX{T?LT$!*8qz3jNo8##z4V0385*$NpNN z=#eIP6KDd8UO@lw6HU+&a~70_n1IR`#-J+vDyWXQ2FfCDf|e93P!($rYGQ3cd8{+^ z6FY%ViO^U4+7do3-%eq0e|QR4j9FO$KyEg z3_{R%9EgCv-xm`&5IKbd(GUmm0Vl!ZR0tW9IFJMV!U>2k7(xp47;sdp?87zD6)M@CA$v^?U3L>{uU||K1OMzr*1B^aNO(odiqsQv~1dx0x9*wy+3# zA^x{z9GG0ifjM}*u&@A@7iYlAhTnG@md`@J@8X8vcMZD&eZIK=c0ONtn*ZwrFDL&@ zv$6C4XQa(qh#Yp?!`1smCK|n2_76Ea(?fOLc`K_O_MZvRnVO0U&_f6PYjTRRLI05e z?G6253XnotF*8&3%zwsrR8?IkP(`E9NoW3pE)w&2N z)YOn}C=$GQsxQ_X`7=HmEdB9Q{(UX>$s`%nDQ=;JIb5MMY5@`nk^w5}**>|{fc z27bY3fr*NW?uo0GF7k);dPB1cr@jA*AA1nuLy3+)dVcTDojV91s$CyEkKf@V1h54v zzUweuyJmXLA|4Jlf#5g%(HsPPgkZon7C~_42Q|L}a2nh4^Px7ar1vog61`s6? 
z2%zVUocG0gFTywFdHKM9;}3m>g9B*-;v)^f77$-u{Q6iR=mh;dFy2flfk%!U-0Z?; z3*<4Hh-RIB;Quj!&}fq*hYxQO5E_6j#2|xlks!{W^`DZGm1TVbinQ5>b%|nhb#?WA zJKqojbXICsG)&K;qmQmDDHg7KDSp45&nc;?@ELti=k48cdn znQAK?`WOCx#Sh>(CcLRtDoe=zpZTYv?F0{Rj2n*?I`co_r@$*h^kci5Ha8zeqtPi) zWq;!TlYV}PmIX;5q(IRL>2LJ&1OL}AKjZ&e_selAa{Mpw3FQA}{8U00{xlzcY5%u} zKlmHOy!@~H8Enos{RhyC@ZdkDANU3jjEw#TK8ow)e<>Z^*uK;Lr?iiMFF!y3KFNLS z=NAEe_dgT;jK5Cx-{ceSf#C1D!uQ4D@ujHs+>>ZMhlPalzvdnr^hCaqqc(Cl zaR}v`IYc2G|J!_%{?~l7oy~UtUzo3(ssud|wwZr-jM%BH5Tqdduv33}pU2O3DiY81 zYb?{x_UUH)((7Mhmy%pg5p2;|Xp6?VDiUl_f*tD11ezWk1g$|Qe%PYfK7Yh7<=j); zh+jGbNeWj(PpuVXBG`4*L9UUDTwqFGp z>!<-^ogct=X!m^|XagUIhd^8304SfFgD?-uW>!Ecv>j^~VI0sRwEbW_P!Eg)LhQcz zi2(?M00ZND24O7ED2(k{x9MgHu{Oh3*6jXoJWODUidx2EkS|f5W&j&sOxW=xFsSe zAixb@X7i)|4>L~I$jj@Nn24a@1wHMH7dNFOa@b5>rzy7@$cTwRjN?cnoEnG>KkB`d zD7L|R87Zkvgrmr-5&x*?prD{oks%;JDY#)rq#=Vp>i5x8QXJOVM1UAv$A7MOrl+T; zppZ8;H@{(iLq|ps>FLk)UaSNJXlam;kWd>+K?K1+)gNMIqNkytfVK_^F)_6sL_m}X z+lTQhFxe=;dZY<*a#Dz3CM6}R^RxU8cI>RM3D}9PNDHuD2GQ{k`5l~{FRQ7YRaH`A zLqxE*u($@Rf2v3HXbYQw93XB||}82$2( zm^<+OZHWGi^!0zOHL@X?LxlRBzfH8XbaZ?6AoRbjhJ~B;2+sexgva;7%YdgObXl=% zLMgmq0fA@(0L*`d%{m0{UjuCLM?&@~$UFEDpZexLMe67G$RRo;=n=kC4aQrCTO23E zS|h&mIJ-aMs(;>_BsxnI;;E5X>S}*p(C}0U`o$GMmKPF7{pVQf+fG2 zdUzh>J~9M_PxL_M10#_A=rYKCdIc1Qn1L^$QlKSL71X~}11(W%pfm0~_>y1%x|0n- z#fwXz_LU9jO}o18?`~c9cO!cg#jnuU4Pn#QeG&S)bvAt4m!NO^>HrSzN!I<^HbXeL z7lHn5xDN>n3j;-QcL=`hlDrs@k&!{zmz1SG1Qlt4;M1Ex@FC+7Xv}^L+VY=)hFrKu zf%}xt1rea7C<1hpMuM)|w=k|b5$;b4K}lm3$oYGvFfuX%rlzLA+$7wyOpQX{^aPlm zo+j*7W)NRA+@q}foH767Ylc(*f70*vDK3_lE^dEOjhhxP=h34gF27co^Ye3a>ggkk z<)6|G0e*g7ex8f#g<=wfWO7#Vx{$yLEgdZ#Jw1e#i6fHS6v;`|goOkU63(L{uauBsi{G5{Rsi+jAtMrA%Uj}K~32opMcuzDN z+QWI%%Wyrx6V{LK@L888XwNpowPI*!2#k)7f|;>?xIT=6rG>fm=TR_=a9zL?Y*!5C zf7XRRCe{CZ`~Q*#kZZ+{bXr_W<;MX4r&#tK68Y&^oZ}!Dml`ZM27~XF1;~Vta2#M^ zVo@O+t5K4lJbHwKorOhey?!SJ6)!Kh5C@ASIHn*YBTu`Nf()97q8z8!>t*>F=&4DF zPl-W(upXup3~;egI<~=ISy_;AC;4tQIX<7c^ux z(%$3LKlR}EU-9?gdJg@nc<5{OgeZRkc#ZV@8$eWt!2kODKAoQH%XJNx*ynUNok?ig 
zO7cdDkNwf3(_X#ag{K@%9UC=w-(SgnkVq*qP>3m>^UV@V58N5s|6t`IFN&8Z_OP#a zt>=MazweCONk~qy9(?quNc2p?Qins*@<8`ZD~*>T8pE9~b^0qf2~`m{OL%B+899|R zh8^RAug)a--|?^Z?ln)}T!-V93Dzw!)Ry!qishLZ?b1}TbJAo|RNTrj-8)!)zm~n6 z()DUto2V4y-Kje8nq{gn?ecaECXZseh~wU|r%&rs)NSqtnt!k2R^5K5)xh&KAlb#3 zyxNiJEFHGALTYi^2UEJT41ZwdQ0|Rd#aI1<>Z=#R`_i~MZRTi|rr+(Dld(7+0Ja>E zHDFS_yq0t8TATRVs6eH~6t>o;&~!pf&;2UzWcwRx&FQ(DpfhXk;3T$o z=3Ivo6m>_L^231-9S$KR9V?hPj5)-!{#p+8| zwiP2>j&r#c!S!;duh+W_R-{mx7NM$y@UWTWFyt!7qtobIBwUUIAfPB z*CAor{Itg8QSIP?)Z}2uA>#d|W3SF9HQZf^RcH7vDdFYA5zi+k7|_>fwmtCd0LGD# zgX1`TSj0P75nXGd>HhRffwlgT?_PRP?J0fN7^E1Jz~6b5=1D=1ZGd{8u_3=dT6dmHX=-On2mEc#??xL^rIEnVHa z7yiS><+G$(#nviOYyMRO*Q~7MMP6F2s1x&pGPy(jk`l*!Qh6zjJ2C;j2*HCW)`Q{3B`jCpIEk%%rot`c$v9f}Y_+~78_@qIuG6tJrw^AKr85?`-2PyVrwrPSI_*>hh=}`8;aZ z6bcxCTA6TF_Tx`B*s&L=-0}NvCyyQTnT^t83NB|=IXjg`BdK|tbFDrurlRqVs~>CN z;a5ZjSM6rrr^z_>$H_R3@7*&XsaH&dDtd`dM*c(y9)SGnF=k>}}3XL{#WA@lOw zNxy(Eu66ta4*d^E;u8`AmKk^D?|lT$(XVt2VLaA8O1nVW+7im#9jnl{i#< zFiHx;Ym75D)7~m^X$s|A2O6+9X}gYcc$5W5cHDLC?krf{CoaGUPk+x-Q&#;UsuNZC zkPm}&Jc z$MlH5{B#4wJ%8`uIP#IYngx5>S-p~RrHh-Mdiv8E09z{P9li z7C={r;8MeA=L0WJT%w{BPyl>*f=UjhPq@tTyJznspxG8#z%RHVY~8C z&0Qr=h?q(PS=@=5Dn$3#pQtql43HTk(RUGkGHvYGG=su>&+&Avd+1kdo>LRbeJc)o zTs|=>FTvk}Z!t*clX&?K87bJWlGvaSM9X+F~AnR8H?#d_B9yp4-7&{nNr$v=p zM{$vtYftoEgXDf*jRarA$f4{3)FzMfs2no4VJy z_%~(J*TY>Z7EdrAYAbKr`kt~6^9%8+N!ZhQ$hzJ?$WJa%C9aS3B{9{GPu11F!y?1c z;pl72Xz_BdakooL=aL>!tME=7!Zlb8Gcmt-FH_Ib;M)Y>nlbwB#)+=sXS{)0ojpEu0TSEbCP-|Wj&E;R z=sTe)m+rtlk3J2~1Q|;Bn5*oI>4;x7(IC%zR~3o*)Z}(*7vs$8*SX5LJ|9onqD=CS z%vAmWIlbG?$EMEUU?! 
z@V09vaE(M+UU4S<=#KA{7Ry?q@4nf+E!?A0YV=7Dii3S-gyP=JQ_0D9BPRLJbr}*U z>}$kRxD?LZ*mZV^)PvRRCft(HZ>8qj%lNYLY)O}wM={>%;HB+Ob&u$)1s1rG?QBdbwh=*8<3Hck9XY;j{EG%08Hkb+Jv*FMvGa6Fj9iFZPo^wC5QT1 zMixy@2<@-}pR8pnzRPjAOA;NIJDK*_eNYHQaixZqj(78Ru#Y@v!lUV*8_$3Kb^^XBL`}R!FL3gRip*QR zuP6IqwXo$3CxbV`bBplh%16rV@OFZi3IEqJxHhp|Ki;KjV3ltW z&77i}v$M_6L;U1$&p`N+rp`_VDb!PigK%@udSNQmKVl?d6*&l z3fG>L3lhcK4SzIfa=Uv2`||CB`PbY_5{c*2M;Erm1tfSy^NzM^8A+w=GqR9!Wu{^Q z=Q%x_U!MwnAqSXy!#(_t-BNm_Qv(IaCDG!N>;@Go&$4*k7!`;9TZ1xPWN?6 zb4j8sqyCWbY0)mtPdg;8qG&?m+zZLtIDM~ps^0kQ_J~u$NWg`@l%n*T3{WDT8275n zA-aDjK6L16-XF3G*Hx1)h70@I9`6Mx2xoE z^~iQ*Ufj9uT<3GoX)-@|B~h3HL;9EZ?&y3k>~ZJVqNwhU-6ZA!>x*xw-YwxU+=S^a zciBC&nkP~0S{fQN7;V5(Kj?aWxxG;?h%BD2Er@!n)WOBGcO#uT{rWRYRflM;9F@a3(1 z7hM}Pvqv1{$Ghz?wJLInPw$LDbYFkrjS;Apg z$)e(C7tPz=$4adnQYYmk(i_iyvOQ`iv?d0eJb3D7f?RGrGtAR4e#VM38P1&j+fh%KidxC`Na*1V4$jCdoAdcRmGml%<-v8|*eykNj@VcuTqs9(dkx5pSoT+D#2?kqcYG>$BFBD7WtyhGs#q$mWRwb z$WpG{kUYbFywzD~{{?t~4hX?8O9%%Ps4vXTvGPuipO5IJcl5z~5CB zx8~5s@tlautY7eQ39(3P*wNyhr54u@1c2)bs;ZPCqM`v##rSIGzQGrAT#dHAwV%EA zt=UC71En`x&5bVI2yzQ(FBSVhNh$xHQ7w$vtxgb4FTFx*bTu%EXk2)Byjhmtlktt? 
z%%#gy9@Fo4GrO2=@l*-FGMJQ~a?of`-9QG{E{`~OUR2gYgZH5_f*I|~OUt`6HKSA~ z7VOOcAI>**Kg$K0YH*$8@^&{jw@NX*)hh1Ny(mBR-6gIhnv8r;Yulbh2OGAh=-r)H zJKC1+HTqtg4W+&)$1N#F;+6lNaxklA`-#Yww@Dlz;*J-0_)W91``e>V&(F`(`*^(2 zW2T%Cdrgpnr0$1 zr5D*mV>a2l{ccsSla$3RZggIqK`}@LJQuQxuvv>-Vrue-Zx;XTsXf?eo5}6BOQGyZ zg%v*^?~K2_+!Z6|nH$@=SjAs{m=}pWw&lW^n_KmFOGXKyo*IMaCsfFZO+Yrs9)E2X z%C&FFRTLC0WFN)eAMs=PV0)P8_{)Y((R-g?(m7uswir$9Eb`1{ZqCe+qM6<9 z+UypyxP5}pr5AlC*rO><4tLevyIZo;$l5xi`|K7{^?8qLc8ukh>%7>QKTe|1&yMc< zVk}#$Bv-S1;bsHr$G83PqShH;z2l>Z^R6YX+s1e^ilOSQd6&FY={i;Q{wUKIXLOYfi$Z z6-38#PLfLJ-!xl_c1AS?@mgNx%h-12P`{QSpQg%3=A?5YWA4`Fx9(b$dX4H!E$N;; zD@pn3638M7BPz6>epL{w@vt`|cJR=v2z54hrBZW2z4UWOOw+Y|O-?O$TwRRj(xUWQ z5!_NP$m3&;?vq-P({19l+jE%g94dv)(vM*R`U?J7p^OYTmUnX^ zvX#&DRfx{n#c*#Ed?00bg!057sAf7W##8Uz5%zI=<|9 zkIP-M(e_y&tq&&5EuYudOGAdMGQ;Vdlf$JU0anTDyYjPG^dc{|Z_zI#DB`THI zc1GmB!6CA#ekB)nH#3PyE52#;Q$NOTnx})xdSLLZY;CXB7QHoewia69K2BBaqS(|$ z(%1CiN6vOt9x^xGtHd}gW*2D zBC?^6@<5dc@kv1*rZ_l&SKft$C|Sp0KMyG~gywJae|=JJM$^hD+s#e$id}Ys^?<;X zl)U1oko*+sEXM+|fL~8dd2M9h%-2wI#Wi80t0iU*!Pa%dkBDR+n2|ghW~w^fRH52( zx>hO7kAmvnr=Zid+5OFaB{u1riNR?!;HG*k2Ip;z$U(p8TP7zMrzZ z^mSaa_#Kb@TgU6P?fnW5NN#1q?*zlC6|Xa1y7dh$xHl=wxa&uJ&#-6LgP%oc3>9(`vHuBS!oP~)^npP8!FIq&UKa!XtjeSUMA1Lad%fjEOt zC_Se=Cui1X!%H+q$h`7pQOtpFBv*fcxB@Jt0&A3QA1T;13lb}Sd}h6U=g#B z6;KNV;?K9&Wt^S7ZLy>ZH#=-iFTzh_gUCO=;xBw}bw5~hPOJoN!ZaSjTRN`t@*ZTW z#g~ABj>dyDHc1bSd{?QMCz41+I;cm+N2NOXH7KI2%_e51i1P-f51q3w>UkUGC6Ruj z&a3l*MLHR>`(Y2`^{~F*qgtxWp@MI;yh^mEz4x}f3^1{05NXge-*O6 zcs#Nv&--~uMIn;xJR-`zOw%+iOigBrEN#}FLOg_`tYvAli!!04sL&IV7Nv*^Z7S~n zoNwlvW>Vqte$V^=TyEcd=ex@}=bn4+dhVUKZ0I2V{LQWUHu3gKI=MTy`e&`mH&=#L zD3)Jq?>=|Diov=Z(=q@;6fr6ejb>XLYP{SzBllFs)JL{8GG#}#j!t1g)S z`>uWldn-ooDl}0s)L!QOEGnw-hgIx^xA()$MNvZic}@o1{MYAle`*tbtH7yA+*GBv z+G;D$+rO;7*_V4;`FV7kRhxwIX;VD((RNC13uF;lPEwxJjWA=Ka+Oh$n z#~Y094-d)^9BXYIGphW7W%t*=D0#F_Yied^<-5@Drzzz(7OAXQv!(a@*RPMHYR0+} z(K?S9ozXW+POWRY+$H6_|Ly6)$r@?8D*}>Cr#!0e-<IR}HNuw2D+70a8bMt0uycxKkV4P#X?-_;(|d@S2KdRIEzwJmU4*9I%kChBSV 
z>0uowT~5d@$B|k0>L%T%RX>kzcJt+LS)N51YnM9S_wTMeF3QA-vRt5D3ZtUnb0xkr zr|FGsIqICzj;gI;O%_iDW#&=ZE}-waAxbB?FNb#CxUOC5QRQ(m=*Ik~xHLH4rv8Ly8jb9dZz?%;o>QC{%OLg&0WW;qv2JB-L%O*!cq zRdsx-zs=ex`&>$|!_nK6_e7}WgrAv{+AaUh&mHbZC9i>!=L~e`wpgK~a3OmDmC?pE zF1*o_IX82=6(-zuzTwn`ywCV8)LT>QeP6XV$Cj`TTvapnwSL!Wy|#*uV&l^XLL&O7 zyBMC@)Zgf>pXuaBx1dG!xRLiJwsca}#f_6TR&dj=_I80ghud6U>=3=KL7B_ z{1K-*?^<>3eD~XJ)`wjaWKI$8yxK1OXJhNv`FC`rv@DD|vT2w1Ru$?v`k+9`Rc z7Bn>rP+piFn$x8F!HNrgTpn1fs-C@+Tl%~k#Tik4?$rK+dSMFhqC;M4wCNXoKyc~o zo3y8W%C8nwkGQ9BXv&7h1KfhbH&g9PJoc3LdLI^8dQ` zVEflSSXCXn?NRC+bU?Xu-jo!@KXEei_HX}Hq&a872?BK^f4eQl1=GvZjW^GFups_;!Xuti zevF!K^q}TWS8di=-Ac8H2=$xRX~7BAU7n9a23=r3%iHwwhb>xbd+k(Ow&zrrV;=iM zS&RF)6x`X-)9yco$~mhNqfWI;KJ&Qj@Q5FhBPrb;{z;uPTA#Ljr9W{GcS-x)<6V+3 z`SsTDyx?%vxj*?&Fw61S>yTlhq|TP34TWY(m1`EH%~PK}cWkbcZQ#73!5R~Mca%ox zEb$F#MVZ`ubn76@xEh6Rq87T&owh1w%~g|Xze(4(PpXa|*(PVX%cS-tXG`;_}DIxZ51 zXRAD|`fxj+pjI8)j{Kp|`Yp5gP0z2?)m91U5HwDwENAp}ifWaF_kG3+bkdy460=vR z#%mvX`#>dWE7dah{_{R}ggsgpg}+~|4eXn}LHlrre&scm2Lyx{duRmq1Y+V;CsZkLeJh_Vl?aBmr~9+u>u zR0?-$inhJqcXCLtkt;ni+Bp8I>9e`Tt4scql)bn27}G+lX-aDJ!eEaWK}uQg1NZj) z)=T}~iwzO}QAy<8+41{rFZg@9p51uW)vaZ~k^3#(v@@MMg#37|NaOy=pmn@$6_GvP z=Vte8?W*Sf{Iy4}Xz$ELOAbSExJSJ|c5~va&FA-?@9zC(PTGlRRuNgCi%8ig2LEDz ztR6T2_NfCitCaDmmVHmd}`N`v?R4ASv`2O z^o*aUdgm)WUHCLIN;Jf67(L+T40Lny%KF16q;Y017wed$bvq7J^x*aMWI1ksI{GQJi(ACyYDK!lQB${!&#&b zL)9C(g0KbcPCvV8Qv6gmaA#X54_GIvJht*ZrF>G8J@Sw76HHQPr6eqB^x{~HQF=j9 z{aFV)80|?Jr!qZI$+>7i-mmt5^xq2eBo+fYF5Y+fviay|x91NnnYi~6GAqo(TSw_Jj%o>f(6@K>&~e?9`vUC~P_>d2W| zFcU^v3L_4>!7A*kSI^%x*KFO`_(!{mrU&*dh~HonF*QE8Y}L%w%}ssA_fsxBJ$+@L zpMv_U^iw=@=E0M8(+w=oYEd*y<@X_dJz)l^ zf7kzHFTd;F0~{kUm-H_h9f8aMaPbviRX)l%E&od3v{S4Q6t5d@?Z ztx0qgU0M7tr@AZW#mLYd7L=p9Pr@9f)s%ho?!ZjfRc37;PH(32s%m#(Tb~5w`8R$% zaeeQ|fRKf^9L!dF7^NpBpw`_$eIhDs`b_HDCs_U4D(#-f%Qb8XK%XCEc6@aRve!p~GaD|z>1I30J@J%#;u)93(isK`s#OR5 zy|#81SsA?99dvxxPe1?I46beQ7d^1@?`nOK+hS!nkIf$we9Sc8V8x8RhMR1ogFYxfis-Lj>h1H0w=b}nA zyf$yXcTu-}8_(s~lX2edo}|AvUe)b*M~*+otMU8QC-^F-m)z1#O=k5{kGOd8Rmif) 
z7QTaQ2Sm;O`NsiO?)yVUf6kkiM09N;ZjTNaGD>^6HsxT~?rfX08zN5ZX=CHSFDe9Dy-s-n?UHw&+KoHLDq8U^Jax`@tQ zerNm0F?GQZQvD`;$&+#Y=A1m#M7OG4P=BMod()ymR#n|jOY-AS5=CX&#ve0YS7dxf;n9kM;2vFV#|+Pm^=&h%OCRM< zI}75Q-8F5QW@^!A+*9q?an%+llwqGuOVb_?e^YYaAKQj!dv^C8f5D#4urosm?vI+G zsCz=iD8MED)v=6}rwEqXQ&^Ah|l zJHQ&4`=a&n?bmo%l$dos+S?|4klMXwIpf?`-8<;_;MY~kkhrg9i;}n6J7#y*DoS6M zY8^PGZ%Dl2A4$}8m*!q0LVjIY21{M4P1RxzM-6Tj?(Eg5ns=~s*F!t0zzpi=sau!* z(aSAl(>nh@E;s&RRY#H6Gtb?+fK0bWt!)Q3s?2`l-iw?n?xfK+(uXn=z1Q6J=GKq- zc7|6vKbvwX+bVs5%dwzd3WCF3`fukXC@p(8y<2`k>YrzdO_shIRuZL@|Dau_=c0wM zP^pp~wD4M3({>6sMf`4YrE5BL3)julh+gXYzG`)D`7d2(j+|x~NUiO5Z(&8cm%D*( zaKU`o+YtGNr*`j$oWNUeZ%)m>H6ZrL>KXIr!S*V@lq0b#A4Q(<-ye}uu#)#A(8tGM z^r=m|niL5{?m1U7Gn#w;)ZK7V4}dKT|^H#V9Xr0Qu^(wwkNLL8mci-ZNGe@=W9`D~P{OwOc+!88(?90K) zc{66M;1~0I4Q^twW%rwF7ZXx{%zx1JQuae&v3|jYBhHa3ZLS)ahK>zLx@$T_E9$4X zIT>4f3@mFZ(nv~uG4!1MvCK{7%hJ?sM-=|=4Tv0FMm@b#b6xj6Xw)~P; zlRc-a+i6vgCK*lr71l@blLnVhpSO9$+tAYn5KZM^ai3oYpJF@ zU0Sig$B^BznJHT}yM4>2`sRO-O-)zV*B& zH`805_Mg@?hr8bW_y4$U%}|7!bIwx9!X9Bc31{bbH}`rcS~%MowxNmi_)5i{sG{GC zI3uVjkDu`N7nLeCGkug8wkhyv{+)hTRMzCEo5h@Szc$|_ghTDrid}VnqVs+eO3^D^ z$Bl4r>JTNv@^w42B6t1LZ}Eompv==S_S@cG`_)3TXQ{kwWw%h>H5Cf5zi;f2ysp$z zh1oCm_EqWf&dThCO5)72t!^&|EU=yHeYZupU(<;t3X?AXa60sKu70D;ma(%%%db}& z_M{Xk<4czBif&#o7l{JhqcldT{NPmn`fRtoEqXTRIv7t2&{Q9q*so>POUR7e@0YC% zLfYFt(N!^X_&qO^VQxKH$|uL`22#mhjr<&4RydEItFhAgn%n#1?(SO@`aQE+bFm|5 z*0}UzTELE#+-Zn@$lK^`}mBX zESpz{D~t5J!q;C`n6S2`n|-w2h_<39)YG>Q)r5_PxD3;(R1*!H@~goL=Pfz~9_phW zCQ(fcrtfd$?`xszHI4Goiak3n`D7g6ssHO9Q6yEl#&(y&FZ&wBc54#bT}fcl^p}#k zC6(F%ilN-ZdeMRfu)*R)JEb)l3dM!TS|oH+ z{^gvKiKe2-FjxbKN#KtzEakj3OluzC{8RFUv&m`4wqV`qExi?qgiD zviy(6G`=0!zUS(M`I{9hT5?`oXLp#d2|LUT?GvXe6t|*^f|t!(G~ox)WvW7-!#nhN zQxA`bt9m1RgzuFNF5VC(@}Kz4YEP^cK^{yS=T$3Gq?stgtgOq_CzMOmOxJ<2OH(dt zPa81rz1f((S$>+z#nY(nP0UtJw^TaSO0-O&+FN_j%%a#7*xsjZIwVh<@}=fC%iK$O zCEW@c*hSQqa%r9!=oRklXgBp16p8veVN@s4Flv6wO!c)x*BG}+QgUi7DpJrlUG?+a z1W-~bEfbe_5sjnr6lSZKwH9@1w{VPVV`T+>4Y*cXby$~a3yhk!~+H;phK! 
z{b{oI!U`)s8Qm_xBeIG1-gT`1iv5rJ@6Ia8TSf)B!kB@rp>@h z&!_#?>G~LcK+6d5k>Eeay@SB#!0UZ1{crU<;Eo5z?gURexR-~(ok0ZdA0q5@LMGoQ z1nyoUBmNZswged5K}6t85K{S`iA6uR!{5f=DxLM=1vno@?2CbJxL1ixO(O*El_Frj zM#iKP0{2bPw8L32G#`)`j2{o*cjgmPhEgDQhw97cTZ9A3QxWZ)N7!ESP&@TI22=U-fC&o?x7HO>uCqVcTim+2IA+sJ60_XS; zuvH|+AOl9=CjlQONMvEOL>3Hy2UCGBNu`9qeOLtU7^8Ut_lGeqvaY132%At`8^+V}TEwK>slg?p%}BiKmqel(eWT zzi$nO*d~DtTY&7EXA{B#9-u7D%@E51)&~=;4-#3x`oQQ$xTB81{d5HG;vqzq&}c5%#_H&r|8Bnr5|k5$=CK)R#pV=FeLZy-zQ+l zj3mpsJ3#+pC`;slCGY@cfh~~*tPdy)SRYvMoDa{rAQQ8|&jgt;fw}?bpphw1FR-5g zcl*`lAQka}FYr`sNGQHG?QmZvfjbZD!bu84`uQNoJmA41@KzF8!1{o)fc1e3&#&Yn zAf4v*@qnQp?n5MB!@qm?jz$~qRwQsQC4sv-$&qEl2;2`!4hRMlxW|&feU1dqpd)@&N8`l`DTn4)AF&k%dpuji1_YobeFu@enfr zD`w}~JosAWUsY8_=aL?KjD)+50v;I3^T0{WgV-Z(Bo+F9vUJP-#`^Fv-8e7f)BBAh zKrbYgh#3HX7yn)KS67Rb4cy5~;QTET?mCjdxh5n8>H*5ZF_Z(~LBw`0c?Bf__x;w( z16kcDD+_;BH^P0rj63mn(a+Rp+f@>?p!A4*N>O*V2qdts0)w>&W3Vr0^DCs-a>nM1;(^Kg)qJWJ>8Ip z`=|-rOHGzQTS>Q*4fGpp>qZ<)G4AiAzpN;az97uQ&Li3=eO@KSh z3Eas|it`_nw`H$C?#oLEDsi|An;<{x>NiStBlMeP$JCNGK;~c;F(O;CssMRh?+JqyHL&+iN8ypl4CUD*%39#2CaL+t}GZ2aI;sIp$ zB5eX^BGMl2oF{MxI)OX9$)Tc8>o>A8n9?VOe@FRazZo6sz$nj2G!HIp7!PtF=D}%{ z11S$aY!?Hue?;>@+($+p`15Oc0Ox1{55)8{dH{u`#T_btTqHd?Q0YLpwgdl>X0Y`#&cvLqF3uh}pY{A!O23XN+)+I13%ePrk#y(y@foVa<^xxw)w!vooNv9w4$xg%TlUxWVI z>3ywtWI$2&9TJ88BjCa7XX!Lg@J~YEiq9G^1}-t zLw?q#9pAsz@<-f|2TVU1X{d&BEYA6p?8rm7->FX8zXtuE)27RYt*b2RgE4(XSzaJN>T^glhQ%_k-QLyC<4eo` zbJ}#-u*i$rxO~n#(p8uLoHQ`(zm@*arL(^9UrK*{>Hb^b-{}|c>-nqWpMOg^*N5l# z^8a4`C*3zgepyMowT|&{f;pFeRHJ(Npx#Ld!*}Q_5a^Q zJK|cKexy75v_0+dccN=E@+@{6?eX{jH~MRr*Z(^GUn>6}laKG;zjNR_2flOQI|u%M zZ~*6)D=1fbiaC?&PCwl!Ww|F@bxps2_*9UYTtH@xFMQGg4yyPtyQ6YR{bYwfc*0VVb9v!j2hc-w0UOizVR#w zv}>FM&q6S*DhFE8mdgm<--$Q2asOfbi6_W8+Ha%HF52-kb{A;F&zxg06Mmx&BBIO& z`mj6-fF5uj|1|!%cOGqz8JpX=8DiUOw3{#nyqR+dtN?BcG@?AAeW}Fe+WS-Shq)N< zFYrm0mX=ArzP^dk(b0*qv9Ss0k%8WTKVwIZw!?Uyz&yYe%L305V9rxu&ecF0QLNi} zV(sb%Nzk6518}VKVq2U`TC`{pLEGlf^($7aAk5xy#97w;0s4`#|3y1+#ugssDHil5 
z?%RG0+Z@oROZ+2vR-rJ}a=0#DP*AXjp+)wI_`}|-xcb(L8iM1Rdu90>a^u)qa>8}I z)Q-Geo<*m4GY?qax zzk)yRtw9&D;QzLyNV-oNZOG7eNKAv+ zo;Dozf;~;UN#e^0IR^cjzh?u(|1|4M@n`C9oSy}~zY_PGOYITCCKc^5#X1SkX{gbwxVKnJ18sXJZ*LUp z$s#dON9(KGY2r5EWJCBrefpF>BL&x7>+_EvLnj8@y)zrd_H;6KrD$J`cC|G$)aX?_ zXF_U22=+2)D}uIjvNlsWC8c%NZL}Sf=+9LR;s5aALn6E9t-jx=gK&>0+CtI$T!99( zi4@z2%h*H8=p=fNx5O3{ZNJdwr-lZx4QO^jzO>ztZMU~We^BDI8^XV#{*LT7u?)Q@ zf1X+|p+UT7y*3TC^(xZ9oOi<5veNc@V8@C!d$Ra5dKS-ocqDFeVq|g0-+!h3V(7rV zlJB7&XT}^Nu|D&q`|iawfIVUzdKJ$sVeD?iwgzH5Ou9Y;4Yl!i1)YRC82T%4G8yUn zHStF}urGmoyso7Htt`_f2JH{pSZshUBY2U8NDj8lc)Ph4eKPddBM8H z?1A0BeS12}2>8~Yg8!%JpZfF?14sOC1umuScrI<8PGbG-X#15Kom4}ESSK-hRhOX6 z3gXZ3WZSlFd7p#-r|O^j^pZ#oI!unDaP#54r5V`S|$wB;aLKbQZXqkrH!Rk^%m^JRWx`+z(}ol=*_Z*}3##Q$9UKS%$3 zsXYLLV?)o+#lNAn{)unj@t3bp|LrpPU*P|tLx-*w78b^Ri~i`*qj4W=e>ZH{5SNjW zK_INuKYH|tAnc#^8#iwJX#au1KRi5~F#g|y|3=%9aY4?*ufkosao?g3axsWS9P8`9 z)!sAsPrTpR*o|bIFeA4&sM1gTjXq`be@O1dRU~Cg?|S>6{{+4b<;mdw8Th+Fdf&XS zPkz3`zjNS!jRPUPcC+yatp9+#Ee4^jK_LE7U zc{2Id_f(eP@ee+6%!#Wk^*wQ)mKu0N@X~!+b1_!;j#Xg1-)4yaGoE&ShQ$ zzZ`s$az%^0a1BCmS za8JNJm{-Ozgbz<(F9@zh;ob|}vw&-}xOW8iQ_TWgaQ_bOQ;?jc zj`AP^?^Zi$$^50%?!LZv4-XHLm4#ab#4Z=;$GLx8v&Z!p+>3+daSX~m2=<5Iy&nf* zT|=@i!$dQ)1_}ePBkdSId`kW}H9vc`NmnbS($hLU8{A zmItnH;@+yc8FkC%MOf1Xx8ig1$GNVnzBcr^%y=#}o=JTtDM~JX!~xfoa6b#K72{d_ zb;;&(ah;I%xj%*e_pnRy{Dz72JUPy7;`*t`k56!Y8uOL)SRS~>Sf6Z~LYaV*uJe8@ ze=Ped*vEtC^PXELUOyD$U;t;*yOA5uo{^~HiVyrTz|9`;U}SS%hIsD+?ybTK`8oW5 zSM`>jC&jfVTua9DZSb6QoTJ9QId}#vuD#;fuejEPXTBbA8bgvEBohJPAS;_JzysW? 
zy7HGjk3{x2mjCrVX7qY6g9Gjo|al`@FZSc%~re5MYIIdmdImCF@B9lMjuo7CM z;%fSiI_bHBvh>%NKf>v?yvn!a<>LgBclRR6y?vf!-UuO&0^%I~Uj+4uKc=XyG zu4Ci69j^aAfq7tDV@!ahvq$-`;8<%Pv26abW!}(loLhba=WpTO09=#CGs~HJiR%rx z&kEN+@eEcxpPE^7z`Pi~Gf&z4>ns1d;3dy9=sNKf&i?uH!ghN7iK&;kHw4$65C_bk z;p@lpXTp8Tv$Cw1UW37T|9j`R)mbk?*BC*&QFF%D*UTT+b)UqaAXopGMX$x-`A@jN zfvJ~xkI0Lx6ncK2q32`yH&o_K8nRDpm#U#%EP0UyYaof_*@GDJA~PA@pU`yIUOV_$ z{&kfz14G(bJ)LA^{5^BC_3H0^g_E!yb3&(L@;Flp85oj;V}GXdF*^J4(BP*kOpnan0X4bX7KWb4$^M|KI~OB zDZdLd5=meW@En%70hR>7!ozn1{1z;zL72V{SnDI5xtuDvn8T%x((# zOoe>@fIccX93!K5WAeeYVeP3D_Ikak?URy{N^+iLll1tL@KLWJm6w`(a^tt+&etv#_MP2PK_96bfxQ8C|;aKNVR;F~khhtvc z(~D#Oxf$YnOVIWP=V6us3|unO&`|o=kBZznhaTJFSSI>@k~Dqnv*UhroMXUw4jj+o z9g8@}f#Z58@2ZcbUkPWn;+P5d3u6ED(sm9BE3A;av&Iwnf#X&jH`8Sgv|!oafwKCP z^l`5-j*0ND4Ep{omoen*=K185*L)JTaSjPyKbu?%+(3L@RLhn9Js1$_QW9}~wsxPKPMeK_91J~oc|aGZhrWG`O{lPi0;pShxT`b>Y0dA@x4l4QjN(tT+h zSK)X9$G5oO9>)?muEg;a?gJHtY?YSHHsBe!tF_awTZSsg^HdReblI1rUigFDKDm)3 zg?f;i!5hf&m4P^C6_VNOV=AYR))sQ}YXSl5V$nvK4Z`_}Q_gF-Dz<%A| zjp(r<&Q+k?;<)fe$R>I}O-xKoC8mkEqmxaa;Y)o_-1~L!>?V?b|EhTW2{J{G!C_7) z`P?pgoCy7yiu%&8i$AikiZ?IGt%G*t@@8ZD{tZTMFL*DY@65okU@uixU;4Fqg732F zV;>#I4dt(2z@F0wq$nqqloaB=#+tK98%n<}zI@7FCs`aCN?$hIU;SNI`gP&`C9vB3 z|B|%oiYuG`r}XpwN*J;9Q`BFH`zdjl^#9FIQmjIeVrAO9K~KpIh-uhkn2&qVZ4`R! 
z5aIHW8hEB3X4nW`KGQ*eMFHl!jbW~P6|6hhgYWt|NkH4@jO{D7d(f{+1lzzfkVa$K zG*Bk%@g1P84#0LA`U1spFL)I610tbsCf66kxV7ta(&=q`&mj z4)i6S>!*-Th(}%VBQ{PE&wYVy2lf}xV_S}Wtc5@i_OHN>)~2rbC2wBQb6B)}(E7>b z-1z_!elduIT{uFHK6ps4D_}njcaW~Q9RtUvX&$qSqAs1*prB@3+XvTY!5>VN{K_>`+YzYeFqE^pUGeL znR42bT!+2v*grsfMeG~ADtu0M@7{e4aYx6*uU!WC{v`1$3iuAFj%L?O& zW7ou&%~w|UzbSr63ZJHvqUSjfx3u2)b>&|ltakpNJ2xR4|5J2UU11pd>-zR7Fed(& zJPE9MqjjzudPwL44{PXd^>}eFQ-Km!Mg^)VHGU{+8$(0<+m!0vo}v^fEf@;oTM*nk zqeQ8xN#2@LR3l0ihKl$t4&<;g)fyBQzFC8I5MNG&n(J3s=T(-ZsYdl4ML`I0nrc)~ zFN$g=NfX{6esf8hYLo|zI$KE6RHLjx_aUA^N;>d&CS^jg;O|&!I{cbQO{J#5`$YIV z4t}GHY*h3fv1ti7mnv=U%REQWUlDiwD({*C4;3Ii2TDNMPOB#~27Jq&PHw_HyB5)5myPz~!2FGa;S;V%DTP zkm^rqQv>L~Xk2Cpt&IXrrvn|<4)EZ~P#~MW8Z{ZdBDNf$2vdgq6xKuh$?#@H$3#xp z1I@@uD+uKPVLpVZQAdIH0W>$YDGmD90BRt><_p)fk)S$mIq^q#|K$)1pFOjN{N!=8-~}R1sZE2`23u7Rj0@N;)O$MWy8bDD$ z)3wJGzM`B90dIMkrKqq*`Xiidd1_0!0-+7xZn(O3e=T)2t{sPOVPj`CTz!V=IE^9d zYC;FLodui6x8n|1U(OY(k1%Mi&lU=~w&uL$Y7pN}I9%OPU^i6AvF6&cg&MXt906a* zw{+0p@NI{(g|_{dYOAZ+vh8dvxk87TvY8n)S5wn>5I71QCfZr@KQXa^>KF|o3b`Cd zfsMm*@hboX+$D~X1=qq@V6)VQ$F<@LnQyXR$2vho2cUQ|cPW>r#>2nE)!D*{c1!t- zxdL@HN1Krx4p#_tS+aRTuDT|ZioRyOS?FuZqNA@Vr3&Nf11Id+4mRdCJizzkbjk%s z_K6YDXWQHJY&eh*-%g5>(9zzWF94#any5{*7_PpOJ3vcYTSs@WhPJNGAPt>?99<2z zt|eDPOKXS)G^hhD23l%4%h2>e0t0om2M#gUV{7PI=s^OO<`x=5xC1OS*aLKQv<3{& zwbUEPZZH8I_5jNPL%0JqbO)O2Y3S(aXlt0W^*9<9gY+zP*gEV%+<`+HOhDUm(BMIw z!Gko0XmPbQbOsF6)6mm3H`f?4&_c^{kcE~7dvL=E4B={74rXf)&@i_&9|$CHhiCvn zx*9q|EG=}k*xI^my&4kqHEZ!zUsJ9C=n|9(^6~Pcg)Ra`F{?f|SA5hXXF`B3er8`J-mVA=x1|V)+eNDL( zB{HK~NA@35Y{9kX+F3x|m((VG2H-GKQq`pb$JG6a5ZnR9*eJ0zy|4pm<-a<*3sc`EVV5x)zx@xJ1a-Fl{mPh9_wq? 
zNn8yWnE))35{@eRyZg?8?;QBff$tpnU*>=zMbVp21B?UA0_+3a0)QFD6g2@J0Y^ng zP@+txdPWIHq zf}ta2FBhf_K$7Lc`~r|LxiAv}(o`-lE;#maKKuF zVHrT|+3J`BO$X0+j$(7(`faJs6 zQw@fX0iq_CS0O-J%Ry8iMy6cueF1_xq$Zt(0CAHGvm77+au6SY6oapcbrjzGevpMc z0EoU^%8en7-g5Y~0Z4!xrCk9MCP(QMfRxCkjPh;WQImrl2Z$T^nplzW79KfL)v%L=zyb<@js{5GT2G1OP#3 zP42Gn)>|(39RN|4i**1X_Htpa0OY=0m|FmGlEbGGAlKyZ=?mG00kkG}1Nb>xE}cmL zk*^nX0OBDR#u6Z5a`<=v#7r*CPJpyjk>tld%@0pHFRraES_1vYGR9(R@`Jk007Y=)4}ODK88e4T6{bqlVsc1gq-81KO1aMA+gaLJ0kUFt0{@pnM5(mZk^ozn@RJt5oczH=n~jy7 zgafAh-}vTJ%@+ahC0MYQS@L0yPDZR0SGFED{QM;=$4)L5sF$ft}f zNRgD}Kbgy3T8~UXO=zH=jOOu$TxpH0k1s-yCp#-?rVM8qtW9#fl%Tl`q*huqT&P72 zltt~@$Yfemc#cA@-~&ZL>ln7c!Odr7A%$o30tl$)Mv?yjc^$Ur${BQ zP)(MaJA!OANE)n9;8KBBbdO2X;>yrT6Znu*jn%yy#g|X+{Cb;Y-$TX z)(kf9BRs1mLbXFnCYEWXTb~pP9AQe^wnjfoB?8+P8fg|x*3%UnA&?`F%TNFu#7S<@ zcav@Gq?Fpqs_co-!iZIG-7FN#t9(h-(jQ_)CDVS?M@nr8k|V{KFRWc8s7I}_m{FKv z%@0~DMx0q3Nku}Mp`d2iiM6`LSSJqG9?J`qI>?2j#33b^sK+-cZc^UMVPV4Ka_uFN zK%H?Nu)1NT<_eg!OnE}!E8T!e142nGlPD@#7_Gc(8bfSuK^Pm>0Cnj*ITXcBj#L~O z`K6oH2ILiRHsV`2^0?UAO@XG6i8#{+^ah&;S{~OiBDs%oVMqKdX>LTB$3hipQ zP!gpuitog*2Slf)LJ6QMNn0~oK3K2{$CA;d&3qtqj(9HY0>@ ziy2)5f&s*U=qeZxGbp+YVgMD%%=hmzX9U^x-uu1p-TVE1@9l99)m>d(U0qdO9Zthe zo0mc{Z~Bg3ar zlr9^|T#72QMD<{b+QP7w5hxJU51-t*6lL@erILLVRsS8I?}smUH~72!89Js2){JY zwtK88YErxG|Ix`SK0c;wy#%_CY&~m=8~@ITD^@-;jiSs8@+qPiWX~e|EtAzgGZY!0 zxUtjb3E5mP=LuX^QQTNErUlDU%>#uZ&EH}uN=zl@3S6>?vL!DK@WT~9xMH1tFuAy! 
zNJ}z7mVb{LbpC-hg(~c|zsdwS;7C#NV_AYZ=DFPXt43FRl4@5i6yWjK!KeokV>WT=SUB`uvhycBTlJD#=Ic$9{ZQem9v0h>rvb!O;3avN$1`u zlC{Csvqhuf0#q{tk*zpZLf5l{T&UO=xGXi&`u0I^b+%qC(nNn;^jwI3I3_6{j|L`5 zCxZr>Cji8h$X0&>D7u5fR;+sy^7 zaL$>bIi4?93_S*`amD>?pd+q$;;T*2g|1vb+;P|02_8^VmP8Q|zQjDYn#(>pLReug z<%&o3M-{P3d_9Yua1S(M6?84Z2C01I@+gY7K2w|kWTKRv;0Ormc~c3XxUgzxJM-LZ zVp#J6X_2HEc7ra9b`u)&czddLqX(@~A^`3kA zjUrc6De*?3gXn7FQ}zaPx~YjFpy9^9S2KS~`=_|Go}_LdVJM&!xS|@#aJojdgGi|_ zSFEka6As)TnOS>@b(^c@Bf85n?>M1uG(xjz61v2tf1#F{ zE7n{EAUOc)d=&-K_CmrVg)6#JsR_>%U6vd|S(YAYw~j;Hh?bKX-`5=E16pzzW=Tn* zM->?dvhja_E=OHTvhKU`%I@Hr()Pfv>Y%E8O%Ee$>PlM?ojON4ex)tOeDhytxM~Jk z;fhs>`*Wd)qzn3KpyrD1JBa>t6kT@|-R6oS^{pj6Im9U*If!Z;MX$IRG{Bm1^KJF% znU*UG!072Sfbi-!pJwT*&*cv3F6@L}IWuOGeCkgGpLNa;QwfHUWH0humMW-@mQ|tg8m$mBofj@k@{Ro9CYiDe;FN@+MFZvAy@S2 z2f{_816;5A#HWJ3<^^`<1qcnBE$9kToq-$SW{)LJAd%`^+(246s_%@zVt$YAAsE|; za-km5O`9xPL$>;6SyVSt!yPperQrwZ;%rjtn4~!v%pKxyNR1o0EGgIP9yh78r6djx zQWe`SnlSO+k1KN34~JuMxH3J&QUtEoUkk_;W02YElejC^VH2-}`%iR4)CwontQ}DZ zRoUu8NsdrT0JQ2TCRs5`Nmo(EAsB>3HAx_J2H!nvPXO3*hp>bs;Q3vV%jUz;HfJ>YUdmxVbG?MCX)`Fw;ccGRCE)v-gZtglcKCJ7s) zr4PADTlBSPIWJ3Qqcae1XjUz8e!57PDVjhR2?s({((f#9u$HVrb3kyvA(jJ?_2pY- z4pUi*E+yvDvabI|I|a1aq&raM@g~&KDH9Xz&}6&6b!iaYhOK_E`90VN#-O7L?PTT! 
z@KhsW4EUn8d2W7!elz%y>5X9Ex$r9DL!=d{PUSk9y|#l$0-+gM#EB_H4u=_AzVpy9?SR<>qUQKCIB<=3P7h$0x1zdeAEw+ zeC|tP^|yF$GyB|j%%wJNHnU`|DS6uq%;~rlY)hIkF@}BUxPuW3QhJdkErt?o&LJ8g zjKHNN`e2KQYX?~ntVqw3bgY*Xpc#nvr6r>46Q!VLuQ`qJ#ZBrr2!*h*hEM!>ik-Ng zMrxh~=_Mu9mRu#uz&uZ6KSwKOqB?jxiVYe?SP_1cUsS_o6-YLa)3%tgF4Jrt zOH4rwPKgsBrc@^!5(gush=LsT%@Abi zkSYtYB(%svtw5h6)u}}+B;h?tGnDlBdL!ZaSQ3Ph!%YevfcQFEa;OiXkqL;MESa%L z6*8W?&>qW9j0ZY)G;azNJCmz`J}CGk76=R+#U>VLs!B(`eq?7(SK!89r%P3Bq{SF_?_`P?PFlLy zK->VM0CIcKayLqKZa~3=y98}Hi0a{K(@~VQlr$Y#PnO%(h=6a1@bxPN2HYgZdkX)TIT|#x zdNgtK%Ap`7+W*`ObV)h^V9dB^ z(`+6?JJ&^Yr*aPp97!m^$kKw2VPuf3*N;_0?R^-MbexcEB&j4FA2`sr(ov8CI==%` zIf$-LOZ>`C_#M@4Vl6d(nlu#gGvoi~_ArvJuWI32m6!fiBrAZTjC*IJGLeQmsbzyn7 z1}Gdc7MG(dqC8vUPh<%|d(s~$A=9KS^k6+rRE?oa_K^mwq29_C^ca*JGSaXRkQUj9 zt`UtEq7pQ+9a=Uoka%LgkvXvwlq*>bdoW|*+mPu&R7hNlO;nzw+!=)Q7;Es!2QQ3j zk$g-fz9SO?Jtio^t~=PFt<&&=tRe{@xFT(Ww1X&LVu!5bxO(;~ z5O`3@&tQDYW zGN;kn27f4=KnIm|P_j&)o4)DO8FcVmh04LOHgVy~ zezJsOr8Y^>p{36^@&SY`f^sif^*WnI(2aGX8`JSaPBzXPKNGuG`ovpO0w>8)vR}m{ zCAuQ{4M}<<%Vk~RCN0Ifw^DKq;>lvVAb=|pkX6KG$E3+DEG8VudK_zQ3t~zGFeOpt zB3wYOLwW@awHTwHP#V!5J5d)-)~3P4OX&7AdY$nZR9U{nG3SDpZ0!+4`bIDI>%sm`8wqWu*7l$?j(x7(9R4W7UzDJfEen+(jRp*-F* z`@c6>j5GWDpW5vhd{U%E7}fctG|3r<4%TX1k`4o5*(Lh(Gjt9+5!jGnpO4 z3Zl#L^-{q&%rzX*jrd9ow!V2>;~Us<<<*mULE4=gU!}$!_gPpA7YTy42vhfkmGL$7 zIgNSl_xx>>tl`#$$M>1PtLGd~kgN&W&sJ{rygs-;x^I7LeaK+x}3FZe| zw(SiF6L*z}=3Z$?&w{8y)ZXtgoS^C)W7GIXML{QQ$yAwhh|2R|)2@BF{*_(8Hh!#v zKqa3(@{Dg(5Oj}kWC^Cr^b76C`M)Sl{`x60Lo^WE5uMCCr(EI14 ze*Pry)s{CWc^BfjN&VtU+MYNX^ZcVeumi!F0|GzY6|!SqEajw9!3u zDhd417s;6}fMUcWS?9l^Q;53`WKn%tG6#i{eW-wVAUiB{=6a3#D}d14!V+UcPdaWZ zKLUpAW#e;oVLXHZJdR?_Q&&Q|bH&bT;+bj=tRiIDtJa~&CjK%Dv_>}LYJ^zXK9;1w z$npj6Um!qaNFFGTRHE=v&o8;Ua|Efu#mgJlp|0+k=EnZmt~eyvjw zjf(WH2O}t#tJtp=i%za+sv3QI;G!kfc?Q-9RcEC-Gm%1wIR$HDmmr{rm^EeowQ>M4 zome>=RmHP$K&v?iBsjs))s;*Km27^~2To#zZ2RL`$TQ1kwrsB^Q#_ZO6rgV{9>oK* z#IFME0eNaTkg}O?v%qGdjmLs3`N}_|h{PkwWpxO&otvZ-i}xBVio>|FYDr@!B*YW% 
z6U&SHJtM&QYQA!JR3i`Z8#AnfJ0Z`-jA)E=k9yVUt@zoAoty~H;_f)S&zn$hr~tGMC6Ava1*Ba!>L?Xu$|fwvVail?gCiX)V!v8vf;?@kk|O=H!?H*(ksCg_Fi6hFSv znw?mQtbl1_JoDt(d}>0D&e(3VfNXHceD1Ex{rM%^8g!6%L}vdzGDOx3VRD z>SUK#Ak?&l4af(Cgx(O2?r|mb0qvJ#f#R4HSl><-5?^e^UP=hLh_#ayl?G2u>O2xU zmJFu5qC|J)EkR12b(Zy|u+jpCYukU@h*e=LO^5TKg|*O>`lG+8t9KSD;f}k(PCf!M z7>%N0+3QmQ7^9uRCR%o!!00Xj?N$Cv_bwJ>@QSml2{NZ3jdXO1E^rD^#W=w&V&^^%&Nn*aru_TeC>4W7CH{Y(Fv?#euZ^E(GokkXdnk!Kz zIYw|vnkppy8`5MwGzT`T#mb7zTgI3*67*wH-G{AxA2F6g&&MRt8nL^&>YHJfz+zWg zL=ME~Tp{P_nq*a0xgVpDgmC)P3s>Bg9E;u(U59G z46xi&8S{iye7$&HSG1OOF!xLpRvVlaJ2CeG^$EAy%Hph;E0F3!ymf zTxnUGeJ5hi3~xGsmcvF&#HflD-XaGHWZA@VzmoEGk`hKa`Rr9CA^wDOF}gq=?<1v+2x*8@NtTnyPD`^3Svn+#fCXJBEvnkn*1lYRTX~H{YY@_FC0WOYprCRI zaI*CQ=+RS3qRr3?exkW+6YiuyT14-|zKu7U36MtLxYE^&5i2W6C0R?f&?J6V5+85Z z`Z3@nB-m6|kXW74?qp3S5A!w62u@?}gXVPwl_j9}s02^s(jPUmb&<7&i&{&p&391!yP|oNPrnnzmNM z{J9)-VjV7N9MdN$pHI>vL@1H6Ig(NxF=(pR(JoSzDBCZHmHVRz!f4RK7?F9EoQR=% zKf2CGAz`Rl?G1pHyD8>cGaPhQY5Jr%h;9(c`e5B5W{ramY_x<`(|{T5AhOnyB*DGF z<{@fIb+phl)516VLarSstSXkTub>~SP1y+~V1*fzj}2nw7*LRySqNOdav*XfX3(FO zpk$&vOizl!4`St6G^OCRL?7m^dhU}VR-PoQoVW^A#L6R)gJTQxFtRvE6X6H9i@E3r zb~R8JS#%Yul716KLOE~Hk1nJJzKwc;7oL3OmrP2kb6x-^^N07z>@1m#@-_*-^;aTB zB&!sBqXW|{@Q~`1llpqVGYdrqJT3oz{fzxj+&J0S_8Iuvem8C${k2^UI*27Ak@6L~ z`0l+W7bt7Q8yk?_z#*RsIpCX!J%_b6z3=KQnv!Ho>Eg)ihEdyiwiR1>(#=#K$(L~;Ns*WfI!>)DVP5GLVG{6 zFX4LmL5^#2Q52C^_{=9R3Q`Im_{2x4P=ed+OgnA!uiT{h%DU~00lUg3Y;ul+1!sj4 z@dLqcpjKgK+Hu!Io>gsVlOOKCW2j6G7EqIxqt~PfSj5^ew-UbT$;T!xoCxsIj#drk zX2EN$Y^=4p#&>O!W?&N0-sHgOr|56`$#Wcc8T!Evc--BPqwVzrP4pt|7yRR;2zAj7 zMjt**^Q3hdGJ%*6>rCaxLumOSGO@D*ZdEw4^5QJMx3f>&N+NzW>^Vurhb%@2(UaV5S-x>H-KrmxC-X8c+qQ_H+S-8rd&%UF`=8Q(XVM9WahGZT6kF@p6$53XT;9O&2vN6HoTTPBE*G1p_&f-7PxuN!lHu>jdQ_!B;uNiOZ45KA z3>2!0ojW7Ol_VC#ydgtx8tW0M&=vz`IAb+N@BJM0bwhe_<0|P~ygJ1-p;L5eSOwSk zG8f1>bKBJ=T3*xy6t|6~I$eWt z9;U8DvK~`XuAYnj1ec)E!-@zSvutzA2|YBr|%w8*dC$ZK7(}8$nW`; zb;xUt`o27O^Y{4ufjj}=e4&da0x;DVrB5z0G;NX9WzX4j2)g|>q=ze+7J@bc5-W1V< 
zC$$*85kfL~zhrbA%8=2LA;V(|=oW*@AM*Rq+6bik>5nwg+%j+>8e2hA_yXgDa?%p% zEx9`XAJ-41^&|K}Z3-3;fDdV%1NgnJnIzfN=WH`Z)_dBvz4mjm+1)z>P<1n|M+%&D#r@Q2gUS7op#@)@C(X@5!$= zv{^xJV_#a=9Q+JNO=5c@wKwt{K%c+0L3oi7A@iHROYnC-vQZ}YKWAz#JlTet1)rV_ z-}ZoK+u>IZW>NNXF@YyGB7Vt%2Kcm8p2!CvEgw-t;vi}{Jgr?2&M^htFi}YSLM*dDudc{NAWOT?aLi#luwaB>UQ(@3NsV(;|neK>G zfxwZm@E_}+P!Z3P*_zm$)RjH9+%3G6r2xx)LSdkF~Z4M`Zi2 z@cF0ubLgItt6Q>1Ccz9yD;wo*kT!tM{kS%%`A_~wye*iHo{{PJS&JY0Y~EbWQ-Huavvb|X0;nl@|I=egXr8J?UNWz zB79ra`Fl%kad$GUCs`Eaz+;HV`Jrb-2Z8t}iw+#@w6iSUqnGllU%DCHwV z77xf+X+Prq_<=l1gZ|lpo=8r7mc%2{F7X+H`My5s|M%^Xm_T$Cf;uw!ICO2fe-qmi zPjAmRnAVOTVTOm;M?SxXQ6n%{np4B^Q#QIfmbc+;z_a~GB6C&yXwatpw)jTc`@8&Y zv8}D|av6W5g^ql_kHG&F8HleEuOPPdfkudy|E^u~D__K~o!e=J^o+!?_Ldw-xcQkn+A^Z@Ik*(7-8;9w})VJbL{~%vsfM#laqB^y9Rg7vdAZ z595S*2Kn;j)t%hzX8!ExG~1YR_6!mDdj@a<_@SX3z7HqV-#5tL#~;5N>c!(2IXgPJ zxX_$j`GOx(xqhRfC|5uHIw>zy7$D%_S4{^`u8fj4LI3IQ*a<0;^H ze&Zsy-?w_sA$$RN$!muB3;Z}<_$6D8S-3$s1^)8B2Z!>6-h3zzzyIh-+9toKE8u(a z130i@C_xx_o5~~ge^hU=zW{bz$!kwfqy_r#jX-+zoi7v+>(fp$lq2vA@#P7iITQ`$ ztnlXrc#{ka!hzO>U>qzLr0kAV+VLG(D8r=zl5`MWxn4TLpMLs@6DsiZT1xyR)Q>dd z%_9nfmQiNG5%59+{ewJ_mv{X;7ILe~#)ZM(h&a~Yk597>`QX2gLu*#M7CM%9%%{!$ zo_}ly{<+L}m$N+BA-}&vKCMHZ>ahO~?ob}yA%CI+eOib9o#>E1*CBtSL;geu{;?hM z)gAI?L%!EXu>SY?ln(UvlP6nnjArtdj~dFEB=kqvSiFKW&1J63aN_<;;0VSX^HHYe zre-n?{!9O_e*X1`f5*$e@!{Wi_}?*K$WGU#QnRtLdFuoN<@2|^T%MOh=dRz+b8@_v z)<5Ir`-a^AZ2zEz%QSeYSo{eNK!>jQ(;i5pJ!&7`&PbmCe2eSx+DMbX!=kB8QTj-0 z0h-{oMcNE-6kczntpOeJ`XTKMI0J70(u)9n@CG9t4EPbR0O@ML8XbyCK$`rWl4QJF zkWK;A!nf&NNE5gfzbthS=`_GQ11RbO(nWxO;e9}8uuYta_ZiZQ0Po>_jdU3x`PKVs zq^Uvp-i`MQ($Ec6jF*Z74WK(dW+)-O2+#|!7SdQ>P<{sJJ4pkE;vI@~EZ_^gW=NL- zPBNsZ2}nBwmi>fpVFVA@e+c*?%>f*W*AHo|VW{^*p%bKW_(CngD?mCJuoZ6t!DDkM z!n+>n6u?cxpcACi0QceDh4cx)XL!?)E(0_(qp1Bz6L=l(L8OZSoy~C-326e?;Jtu! 
z3Sg(<&;im~fIs8CN@xHd^IWsx#$jqfrL>FNguE+OQw|n81$Z+Q1ny)s>~Gpl+tNNHv({Ov{r8dKE=VWtD;%YZ20#s+14^b-6$5u1}#&^@*e6`kF_e4R=6_N z$xn$=dDX5ip&N5@EDN|~bl*|+tDWJC~Bl=y;Kq9X^FV`0>bB8+Z%i5>qwj^zZpe-WfOh!hM z4q?*rIzN&L<^Mka=WC!kt*wu0$8fD_AL93KZMeR`RqscovcT^Sh6z_kqA!@z6?-ecf92C5vA@zrPGPzG8va5@7QF))~ca-FR0kl)Va4>Ir% z24;7lyV@cDh{;zou#tgEhh;5wVW2Jp{+$J7SKkY&pY zzz2lM_!5S%SOG5JM84zLv22CG0l*04{HC$&@Et0oBO!I2^)o*NU!SHy9DE4^&pMVp zPcWGu7zjZe{eyrSt3WWqkPv*zV%nw_5S(4ma{my15cy)~hL3TcivvKth%O2D2M=$a zoSa%pus{0;@N5Kv5dXzOd`d!FYYEoTGgM$lKJ-!4ZjztI_Z9~52<>#wKq65q!Mflx zjX)Ue5ah!LN|DBRGs>d2Dmnx~1D*l+4%D%W*@_OKj2QfopP^HH4s(`$m!rNYI`TZ1 z|E&sTq%;lR^!z8`imdU7Kr|Hx|xDVY`IM|*;|UAPx7m^22Bz=G5vWtRXRFPO?wc9nlD`ljXE z%B}&SP>mb;Zbh{yGhg#iPChc>+(C|^JA%EE%4|=60X)D3pP~YIGFP+<$3DcvBiO?u zj3+ZN)sylFKg^ z8{j5;ga-4x$k$d6S6VGqUV+{LK{&ec@Z!M@DXJGz{2(8HUs&EF4AvyIm*X@9_z-~y z1_>b>M0wC~w5+e5!NDGa75D%)hVtJ zhab!f0_i%W5oXZxX3EElgesi<5bJw-Q&|)&zI?IJ2UPx{p~0SBJZd9SVP3QfsdNek z4d4Y))M1he@(P6iQpCER0wTplIvu=%q;p7`rq!eHh1Q$kR+E_YEkrF>V3L#g;j^h7 z3eNn{wn9pk0xgLiZnE#lWQe&SF3hIX{-IG4318Uzw{sjZ z+|)aO_62Ro;~DI45*TU{<{xANzc6X%gC+syCgvt)46zYt+rJ&^`SCo1zpF6e@4U&l z{f@>NH2?H=;0?E0j?wg8RmvN6|5x7*{F%?+-}4U#|NrXSfgkPo?YXT7uOGdCtBc{~ z+rBadw?X|5bV7lD0IsQgPk0)1@^_Yt-AuWqZvZcUacRZH!D+hO1;G8^=U)x{-=hJX zLQ-oLsaPDt$10-Ret=y7>seH+7NC&=6{`z42N0m$!=leoag-U`4v8cSXTxO8`8}4* zf#lrv5$H)#HPfCN+5tJm0aOMY3#bC<45$j|2dD-Z3#bmb9Z&=C1Yjq?8-Sew%K!n| zJu*4A|4j}V{@@Sr=YD{{@dNy$AK)cFz}No(kK^eMk4#S8AK;CCfG2mB&FlRvzgMIDzH0vgqzemlJsePL0C?c`C$O=}nYPJlh` zwUWnqg}@It@*F*}5XC)3`bOYv{~&LESSW{CX5q%7hK7*rAuTj;h0#5Lp*6u;J9GiZ zJ(y-8&xLx0_y-F(?lLIDF3`~U)&MHe7ePZUG)#tb$Ya5@PT$K5z$znDz+uE1ZW=DX zG^k-R;yD-RInOzEQ&tn#wCCeVE`8#jr)Lny6FUT<&TaVInsJ9XTQ^bA^|;kg{QGRbb)l!F_Y98bmF4q z-|XWl*PHyZB7F&zbgWIY0#9F3f3njwXPfDBhY>zMuJ04V52V{~tB*ZSFcb-8dTV@> zK%V$Pdm1KY9FB=efIl2Z_Ml0Up>-~|2uJ3pp`Z?Cs7->g91bM{5^9<|jnU9>@F7=d zWlF`(SXkc|why9h?#W?%9LoMCI4+wKJ;(s-ru^{8w)K3r?Fi2Kv@@)`9K(h0o2Rzam@H?Eh z@Ud0J^!z3UQ?qtnKm^Y zxVAIN98F#|5-aD29P|ND59ZFr7O)FKa+fQteEvaiisDcXwl=QeyWa1P*wUv?Sem2T 
zSw;LN#6dJ~<4Bz;dk?0Y%wtdW{PWTJ1r5eln+9*~e*W-gmx>#K&wdZwRo*{!aNzFi zhUpGZE-XC|wD6Ce_DhYIy}NPeO1C#p&e%QP#JjbZH)V^_41K$eLrlYJbpPz!m|xcA zL-SvCO}8nr0re{S&dXl-WeJsgFY-UWXgZ%MkhauK)=H?n+N)MmZIzjYElVv=dys~; z*6+GAs8wdy)+%Tv82_vt%W*W{ODTD->#EFAb6Iah*=%3@zmas=>?W4gm(0-Jp3l~D z)LKHNeYQz+-Lm&t_R)VWsfzd*S*;XwF+}l2bg@e0Utf2J=PyuYUConzJu&x<_8_sum)i12yd=~?bIb|dHl-;kYru0R~#+v5Ig(Wqu5qr${BtJ2k|M`y%$VV~QQeRmI0Yn7iIa@}0=x&B4B z6L;KI(qg1>-IonW5uNXEDSEta-?Z9`<;UaCmH(Can}!v&U3x^$v7cpJG|om>$18*SIWVi_UU#j886&=GEb2S7 z?nJW10gl?fMelTS51O0}SIEoy^HDm>j++pDuy~2z$lm)dC5b%W>BPP+FzlIM>wIj> z-6d-dy501BAF=ezyq8PU7tBSIT7y+Lm+hYR@?+21J?Ya1V3ez@9@?CxG+XU{T4%Ub z)cs`f#Db!a6SnEZ8BxEP-5*&0Q^g+xwUjxpL(VG49Qv@YxC@K&{WXxX))E$}D{i@I zIzK&u^LEk0A^YBLHMfdbzI9;5UP3>o*YDM-^ZZ|4J1}^|rSRVQvnTDE^L1Ik$qn1? zJUIEl^yQx+{f0yjt@qSQn^{v7)MRwtG>=mh{d@P*T3NFw>*C6?3t7E(WnNmH*xPyM z&it)OBOd+n;>y=fe+9QT&K(*sr{&|gZ8MWFv`miZG^cME_VRrGow9~)y~CTcwgtZv zC>n12bw}0S>04GDJ6NsAO~@%Px%N5?qxTu@x?%r1Jb0j0*J#5}%c|>pO^B{nYW;Ax*J=0C zT)UkQU#f;JSe>~rQ*-?2!k(?S&&<=DD%AUB=A8P8<*SXT0d`hfbBepvy!m)>?XbwI z6%$rU?9T1krsi3&%z4Qe>gv+Z)`uh`)pJ}zih}Y>EId0|8++7iE}Zl_{Lr8ktm&`! 
z&DbKlecgF5yUV;|bJS0}PfJsZtj!G#uP^)jDC+jL`V#)dvvdB;-Jxu@;}&ZMHEB}U zD(Tg`_v@2}O^EY#DrS9|AJJ{nZnIw#N{z=Is+eE@bX*?$>elBynhal7Oe#z^u`+44 z9kTYs$Gem38@m_2ta!0v(2L;b*Rtw{A2N?SS*mHYo?RFHSuOg5lb%C@o+RrlZ%pad zgIlfM+8SS*v3$Xn1tmt!(~E9i@cE-PFlWcP)>$RhzAr=>N8IjJ>@!<2985I6%XiN|EMA)V2CnM@WOsYUh7M*F-;j0{5t|Ull4&61psM2_qRa6<(+M*YAH{4@0CdvWZ)}+q8 zsux}9PgO4Jf2_$tJw-V^U%z?uIaMtc*5T=fb&Z=mpJ%AFY!b#y9o9Un#OCS8Y^j0w z1XjMWVd>{UToSNe?;o41?gpXEcY)li=I}?#w zzGZX89>v@vd*A23uwP3pJhQsA^;X8Kzjk+Wo9c07`QW{yt+itpy%z0DUnV%YN8|0r z;@QdDf*!B+Xx=z`k$BUX9KZX07ls!5T!^Ua7Tqu+`Q;9;^iABDg~wBURRZz~Z$+Ll zw%Yo3ACFZO+R)GN{eq&p#;#?8-EkT&N`)1%YB8@`lYOub+{sCievofS1hG8e1~?1_;%XW{XjciGq1TK%@QVN*ov{?_b6 zmhk*O2{N%O1Z3ww(yOaczR z9F)0GJ$Kg!m-wjP*IK3AsZGBZRlK8q*^zDKPIl5E)2FDz0y7o#t93Gdz3{No*4XaE zJ-?Ts?1{4^VaX#91_#;qsB-v~%~fr^F(~}=aTB*&86j`%c2>8H@43FhxWq|UC-Kgw zQ}bf}Xxvy{djYqU_U~(%p_^yCt!v_d`I0fUr(1elSYq-<*>&lu`txg2&$d2vxS>Mi z?YqEnc5{u;YT5m-ug+Jj)w|&1{$W(VoIQt3UsqZde+o57*)I)0ofE&lwjz&BMJw6P zn5UldhCk}w)Qx9L^Cmm(kyQ958<0T#VB;EvU;}$;`X5oBuV1o?xOIJQ`M7zXtLDcn zESqfKZRNuEYc`B4wtwiM6Wzt5E~2=1vzub^lc5$YO71!1Z->vF33 z`kRB@qd7gk+KgS6_aSc+CuH_6rPP~|PPo zdhF-tM$Oho^|!iRw<_U#VJ$Fr&WmN^pKN96Hy@rk$7HwB&8w`v(-Y3;L~Bp2ef7Cd z>iDen!}P{(onUw*)wjiXU8+XoEZ>eBjmV>V~UEQpLND*jlt zD*DMnX$;)<-@uDvu-)XSsmARQ_N}}{L=7} zpv)+`HY11Ea`ZrUGP|W_Tg{D^+o&%$1BI2AX@>RI%fs~6WHc40>6N~k+9%4Wr1Y!% zp^tCktkyN}d?pRGtB+H^p|JkRy$zpdRD8X^LU_o`Xa$B@%}KBNzg+Tbl0BlvEeySu z=rHVJ#dOp1OCS4Qo5;3|Kd`c}c>4U99<_#vX&O08TC#FouIZINyP{*WuXy#9t&^ga zsEeaw)^ZG*SHxM3+FWMu#h-i(izbfP@JscCVROqj7aQIx)MJ~^R>l&V+yIP9qA zvxYj=>WEb7@}MF71(Tgjn*(!1J06c4yD?sE=o|a%nOlTS#VyTe=g0gNZ878co06yv z6|cuk?B#oXA?w}aQ@#%}&6W>s%3a?*SGb_6JnQ9AC2D(9!!HZQjLTPCVevBE*z)(t znAhQ=p3XHsMMsDEPKh+mFU+h}__Oi8ubuzPbXUFS`<66JHL%}VW$1W0*JxLOvf1O- zTU&-#u!6!`mPAY#`l+#@=CA%|w=NV_yPvHaU3Kfa&z@tPDfUg{pDzuMiV1uEaqQfe z8R0c`M;fBfNbKLt*JwP1f$x@>eP*_A!z|A+cFo0_mPKAi*^v9Ld@0ylxU$kC`O`TRBe>V9}LMJP1uTG?lIqwuG@t-t%=B~&! 
z<-$JvtE-OfEbFdHMd-vV@nrMMr%46DVX5n)j{XweBeV2axRFrdT)*?5`h;QinmFK= z(}zoauD!|2y4$+z>_Yu77MWHBpHEH6b+ut{o^j?(&m8`T4?Pck8DYFJR%pINb>mQ0 zLgjBWYr17{GByuWAD>lw*0a9>|KZr&eLA)U-C`A@&t|upscjNkO)NPkojSchw>I;D zX2B=>8EToYM5>%!=f|!KKK(SB|9N^`bo7Lut&J}|s0cHOK6UBY?S{GutXb;R6k)1O zSJw+a^^Q(?PF@S$Fy8L0Y(w#VRH9F*XTBQX*HmDhoW3w%QRx_V?UYrcrn34=ju=HW2ePa< zR+0+03;Pcj8E>0mDh!zzTYOge{QI*{gJ0k1+IK>@R(`UcN3n0Ema#e09SkFRknG}@ovrm3?GQaVwrca<6Tle zy#7$}lF418msGDfo_6;0z}@gm!c*5d&ZwxWtg0LdFpMgVWz^myO~~NY6T(qn(~7&2ekJ--e5KUr`azPzV?pb93PN2X>YdP;)h~oi_Vc- z7O-#oD2~`8{++-4jPX8DB#XKWjDHUHk=+>6w~g(|KdK^J1~yto*l`U(GkoAm?i63M;*%fc%ysta?&He7vw zNEq6*c2jxT>zM()rp=1B9=4`XHQe+}dCsw(PYo!`ybpISSxh-e{e0ldrPjZu9_k-` z;m>X9_4r+qK98ds|I`qF2}Ln^HR5c7&6HEk@0Wy*JYGXD*VN|`O2eR@dw?jT^Fhf zMy~QW{3hp4VOVeL(3s|ovpu(;H)x5{X!$9wW-k7a=$2cB~3#@E^* z&9C3yuXTNQ-#ANjV)L!iho=YMYE_*%FLhh?%>_Eq_p2;=H^p8nRc^lW@K&QpeXWA7 z;*vA-f>wTfRH zB^jYjtcQ1wY<^$2E#NZ`p3Yt2&wt84urA7CM!r#VO-!eY0VxLzA1EAHUtD4P>+p(C z7H`Cw3JYzkS)3*L(q`YdYo&dE#WvM!@wz#!WV5lxvR3O-#0B43^|{k)$4zu|U(vH` z^va+?y?XleD&_SpdpYL*u}kNh8b=(sc(iEEx~SMM4`0+r9NwX6IrwF!a*KsQPn)M> zf{gyy8(Q#L>AdTrEc=CX9;ob^(pY<~R7s)9arM9{*4nvi3r;n&EthHMv}Wa>Y{9(u5mFnw6RJ{d&>T@g|%YA;k2H@1_w!2H|N6C#*CF8_p#ynDHqa8 zuM`e_wav20c1nHLiJi`*W33mqE}S4qKh>;MdT*-Yg!&7#@m9`LvGrU#@64tNcbw~NhF?9!(6Q<_wD4b*UQz7FWSww(y?o>J z!QJvpHy`^lV&m*`r#Ba^Gj626ve@yv-sAy=pEh3`!}B;-ow=>tE2~?C!mSvu7QdZ6 z(SXwHxze2aw*w|b{IuL4>%@_uoOzOcT|SPS8}qip_w9Y(ff@VQ{3^SH4bjIre%@n- zoir%PscT)*ZF~8Lm>1P|2KI@J-E+ighV;_Vm``y5>bV!=2F8)a*>t-VyIbm(Kl>6o z{WHh8)Z^WG?dzAc{G_dyqaPWWX?}Q}Q7X7U?P1~A<=JuPFWsA3BiY+Dzq;U4Phrc7 zsTl(`CytMv=>KF#W8bai6@eEbCQq2-=@hn#ZE@RiYR3N8XOAekmSEfDY_oRVn4SBZ zyOoq0@A%xZdcIZiC&Tg0tAY~sj(n-Q^{GlDZm(K!)~9Q7hbI zMqW;|s_pm7i<1qJL4mOwPR^#v&Ehjs$?~2$!MoL?ia!y(Z|Wa2pmyG4q07dYKUzJt zw7WDOx;?@8RYtsO)2A&;>JxKUho4w$dF*pmu~TE?o5(SmpPk)TDqeh+WBSJTo!b16 z?qMDmS8utJJHA2UR$_g}nLUTUs;`1HXw1*|03$MZhc`OQ?> znHhAd^`X;8t6c@x*4U(IeA!s0c+Di|$)B4?*Gtj_sfR>{mqVM4s|xQ$QCY3e$J89m z`Kzk(JS+Eomukn+gZ*zt%?gOKao3a1a*vrB7_IW*?UAjeGd>jRH}mY46@9*5vaVD$ 
zidXt^?5*`r_Ofqv-wuVEG&aOIU47`G`K~LkWZIsdI?-t*f1XMecx;?+m2&Kh%l#;{ z7t&fVSnqb2qV*N_%u35kck_=t9PW03dsT1I&ZsH7AGnqN_Womuv7(EeUW(?sGuzIy ztj<0hW3x3?uc@=jhKmZis$xsY#v$<&>ZC)QE~M;ld61j>Yn5taY1!s&AErJD538u# z@q0bRJvuRMM`_8ly1QQ&E_`(>v=uwt_X)-W_Mfrjzd6=DTKL>!*Qi)Odhmf=L3Pd1@%bLPxB(RybMdzjDRX2(2_ktewjI*XV^cRW_e^bcyHEm{{#M|Mxz2 zYYl!W4IY#2q`-cy+V~*Wqu@lumI#IShMk7Qp7c;~R8-wyYOrd_l8WMwDyb7*&R*IS zxFBlyzAT+?FYRk;7W56OtUTE`Z^ld26!rRzPn|N@L&|0^>o@{Akr)$i6nn~gPm}a&%?RV)bVoi<4|;!QZnXQ< zVbr1Hmfp3SH+hWS+h_N}F{x98gZ*D`9CK-iaqy(IzgGCnj21jlU%9bn0X1{VKIh4W zmRUh-2A`<1((82kX-mxUpBo$=T{${%ntIHMd+z1;mwLBkJ77vbFPAG9&$LYFq{{rcQdQ*ev`R*%6HFJLXwC^*uNktE5 z0*{~PCoU*?tg1WxTIq`g+q^vjqR*dK@*8r#F8g?n%lo;}-97sRmt4!=)Kkx)s5mWk z!=gjMsniVe_XHb%GoXA&op1VdeR;*klg&e^8%LT?V?Gkh*4q>6+v9~`eV*C<=jo~Q zt<t#WVO zGJ#E2elY*#?F(N?4^GG^Sh_Sk)JbKlPi9`=X2Ywu~iKP>3l6|1QGNsqhNW?Z!o zX}a}*8ue#(%i-XvgxwDtnzJ-A8^1i-srsb!OsmQ|MXcV;o%%UBtsijtqoG7qyY$J@ z>j&(704{gEL z=9k*T=bIj`3d{NHX26%2%D6qX6>*jeW~COq4Gm8z#F@hpy$9v_HWOP*Cb&v6*DB2M z`ZSzc<1~6^kfnOGl}*mxtZ0|a{?Vpg8;@VBh|6))s@=lszVgMR$!D5RA4zsTW|`T# z96}|Aq&Q%ARXGS%T zpXiw$bFt9TJwoU{EN6pi(-edJUq86GuVgI?)7$g5&uD|pb;%c{)8-zCZj4D+jM08S zW^UxFgy*Lp7KS#=YCiCy{KVXYRk(HgaZT8p7by1s=z4G`w%O)ou+B__)#k*lmB@6ox7?K}Tra8By z)T;B)(vqmWhmFr_XLpK_m>>I?b)J9oeRqp(sT*EIHa_`!KPzXNcCOmEWfoni!&%YY zA3WNZ+O71$am$+Q3nh;xkN-_|>v7A@yRAR>SZHypu%LVNnIp*`o|%nrO6g{tHS1Z! 
zu+sHAGamhV3A-{wX+-nbfa`90-1A?Xvo>JaGfCLcV4jsurTG`WoVwMc=OB$m2}2)7 z9jkuYJED42&atBWhu7Vw)z)G|A2+z9_F>)O^M!rFCECiK;`*>DL4%*$ zJAJ9-;dcLfau*=k`285iTcuqF&OeK@;L0^Ix?|ow`}Jbk7sI%(HT5T-q_ZV+BcD9j z++Z^<&T{AZj8}KAPmTTLcy#^E5!O|6TA!|qiT{7tdk?rMmS7eLV|!JRZ;a-}Aoj zd%ySIy$!X!(_K|vUEMR?Ra3LWXC9$zeqc=YaQWm)GIV`p?}{^(XaYS*;$^l|U*QXrl82TS`VB=K8 zUWeW8J`$o{E|%gW%00a_Q#IwbmhITHH?4lp3e`cZvASVH?8W7;Bog(5^)#+grfYfA zD5>$=%UPqTd74{LP1jDODE69U`KavHzGK+cdwFbZfF$)ogKFCc(L6LK7LGB>(`30N zSL!iQ==CvG?oo~XeZ2O>c8ZomTBm^>@v|2k)X^Nd?1!RgY`a$q`bEAJXAl+KvMi_6 zbK8a7a2jHvZ5f#9{3LU=Z}`yyZ~pyzmucJ&?7~i(+VqioevVmNqaw|S|KKaL|Bc zwhDdJg_@>|gRp#hrh*^1|=yhh)sXf%}DO1xkj_5lLPI@za(HU+Tz=pp=4o7lH1k&1 z8@w%R!&^ME`;Ddqc+11AIv)o-iKLLV@0}Nyn!91+-t*R#FbQGPPZ0t3r9R-f|Lu)#au$zFKtA zpas5MT-gb1Yw`10DqK8Tc%we(c_cSgL+FVZ=T6g-Ov)|6O~ZDrk%ChrwyT28QY<;p z58Sp$-JrdSt&xQneuWkRiW7PG`7d{CCfc&6;@eUyB)Gtzhy>^89rQExan{8x~DB~S&qU*c^NC#r}|84$LalnIz{;;6p(9mxDI0- zvAe6M{I#UwQH=dSs9{@$5Y6=EvNX;`n{X{m?BVyV6oS3wlCCl~nvy3;Wowt)c7)Y2 zM7=e@hci3W0poomT&Q<8td4u8BPY_b3e3biL`w8V^VXgPG|R16Se6{ccJ7@X8i>tr zRdoV>(@z`bF7HJ8lkoiz19&-&x|PE+f}VU6J{;V$XQFLzRXDwH{*>#yxXX9xtk%m? z<28<)*Nu*xI@0n;l&9mCe&dq5yn*fwOU5!|YIs=`Wuu ztouHjE=EOOb0J!2>-AlIE&B42ninwk>JDB84$M5*d=S|lTd#lb6}N+JAc|zg{D?%$ zTD_Lrf&;&F{T2`RyE3~=8K2ORbe>Co<>#79XXK9=j$3KY?0Q`h==ls@lHI_;vXa#? 
z|LIyun=BkT9G_r(rTN5JVsqyR`R;7Fgz0limOOldeZ%4_j;}l#qzr9KzT;Wa=g2?i zxzSGMS@%%}dwn3ATOFBLKfS-;jd-LfL)NCEq>)Yy6NL+fP4*bti{x1p&5Eofau@Lr7AyB=i(>Q3N{nCL z3oJHOy%lA4AyCP9&FZ<;XZ8`Nsm84?Y9S@QgMr@jMmx(ce5z`4i6?uB4iL+iVxgQp z)H5(dw|KB5JIMWG5unByF<4K`9fo&bZzx8PXJkYNGxj(7Dc+`mN|OOwW>JtuBzg4Cg2394pU$Y5xt~;ppnXVr!Q9=ms|O>Z+M*6Esti-VD zkIFBF%u_c~?yF<_6GnX)A9~G~2L}fvn9n&COoD;cQddOM-GNoWipst%l zYDoU_X=7iZ;@Jb=i$7H1y`s z;PO79uzu9%U_;Xh^`2U8_a&_{_lH@`PRQOVq}cqaKnq=-#zeT}d6KcvNImzK1=YGE zYuF3nwhVY|s*s5)mmAqNi3m4+x)JsIBx0+qSvhWx?&hw84Rj+3leU;!`mCZ4*zBk7 zdPZ7|^Q8B|NPwx#Jgit*q^kkvS_g9FPgoV!sI6pQ!E|!o78vZOXw|My4iFDd%sG|_ zBROWTPTY|w9KC1kz>+@evJ_F9l~&?Gua!q>y$zc;|8%kG2ys(3ma@3IxvQ|QkX_k3 z@?Lvkx#1hlc{MM}M^m#j_xX*|iq&g$+<~vF(Gc}G_f$uYYL>9>$z(luuR#xq{z_&h z%VpE6!&GF_@%@2x_-SUGPHnomJZ7d%?`6@@$4Jzb_(VUgN%s_&$E+;%qhYrr3s#rpB8bS5Fb(=Xe3>}5IsC8k+bN;BPscs$D_FpwmDtLdt$#9^sRk$mKM}Z zu0%E0NSiE=9*cU>J3yzfM3l6>bdodYYO05uhuHKJ3bS3csY)|ngDC_RMl8IF6NC2L z>T>RBeGwu#WlDdK@ykFP{S6G!<;M?`67O}oH#r-|IS1kF7sj{gAKI>Lz|PuqWslA4 z`qr8dE{_hYOIL64h`vi`(VauhJd}{geM)o3cy=W$tu2K?moiKr-*RD|)=iSm)pEI| zQ$pL-J~f?U{zZy=BeR=pQDna6C!L<>O*!WMjYXArN>0S)tc16#tc`C&%%J2P3uSTM z_8JG+!i(Op(BATxwbQkMT9}u%!VKNZYh&yzc8rqoXvk&7E8wn&Z9nsKTl>?m@Kog9Ii*rr3>J9-aYJj%xE~+kwI#gM+dd%(sWz)w94(sNKIn5gV=CRp5`s4 zj#P*FLd7a{G@);$>222mdn`#w+BHdcc*%Y}bewd>6MJ1k-Lm9FlelKhwvzkRQ$;`{b`XDC@d`aGw~jo@ADCL%2(aj@u>~j)yCrQcR-88n98Vwl#%yf zLh9QfV~zQlqr%K1O?g;PiO==*Hqr^%#oR0DLrXPdO;jo(ZK_d})!T|fJ9rCb=2SG* z=xA}(U6=h!Cm0XYoAN6{0x+S5+EgO>)P|HqFdI zv9sddcvmLCbVZIObs;bAg1&nbJ9Crp0jsIvw3xuQX>|sx8ty$K>Ivitt!s*wB`nG5 z65ZDRYvS9X@ma>{V1AmO1P0<%pT-iowQ#g{f!yJ+ zDI)PD7U-50>~yWU`?1|H0Au>u?Zo@oeR2M7n9xY^i>u3;h1dD>YdJeaDJvrF_i(OU zLj~3iYLZ)xJRBQnUJK>I*_wY6l~&z)+r6!9HJNgL)S!uU6%-8SJaDS26bh>!m3W<= zd`Dnlr4}7EW_P||1!y(7*-0FKA1g)a!I<2#sJqgr-)Nh^5||ZC>n4_8Z#ckHV!hjCkejA}Ng&vrpY7@1T|$PLv>kPZrm9OvbX|y6Cnfg5m=hABQK!4t@-&IVa~6fvUd03sFA%Nid_~_k zeI#W#=Gw5D0PR#cJ@=P`!$cCSk`40HhoaA7l0zzqN+ouYTJNdRB?G0Z-@iUiSs*G} z7`F6iX(3rttmfJpbC_r8knym(OTa+ESvjflu}1P&`Z)JHkp@j_2OCVi$>&!xu{H7g 
ze5R8XOWCoVvmGRnC#NRzLJ}w8{bHlgEIn0@s-l<8#$Mk1n9yFR$%bY0FeZA^%nwXCNeMKS!if_wzST0kcumoKa6>vKR26N z;N(^8-iZ_QuV)E6x?)#mRTkWxrB<|CD_4Ap|K%zrT#a=luyy0sjmzVo%dGq`$E!cn z7*ib_m0+HSu|K87B)#A4hd#}Z)G}1QQ*)U`-BPl0{P?XA^(y9vNyhi`TN7*z zj21@4_v+%$;-`!uOCq^K&t+{&{3wSzIG&Z+dr-rJtDoggv_O}aRoq3w)zu0_8YNMc#+%@Bpd`VVKO zXuIB3`8cC5>Pw}^q$@mQq5rtjmIlyRbC0P;I3;9v&k%c_WS)-fSw(%di@LOIc6KU< zF==@Iq|iu$Tn<^C&9aw4RAA>|hQ2Gk{9{3V!DPzk`tJJ9=?cj_#WamqN_-y5ajqzj zue#qUaXw;QGqpRv`RW?nys^b4Qs*xFrD9lETxo7`*>NYno3L8tc5KmMZ|f=PWz*lP zuzO#n^umW_cFI1y!&ljHROb?DJi0rTv)mBz^im;0G*~FX15kV*etpJPSLjoVmr0nEOW@!!jXD0Vt6Vj35Qq>dzHP-U^3bQg z&`1I%G#f>;i9!Q`Q&<`j>{N<;NbU!zb(S z_*m>31%nXED8orfr?VsrRul$L^yc$OOnXrpwWaC165J3CCTnK{L~;k)@-XP&&Tj2ge%LCa5>>?qOn7OaoI~R$^CR)YaOM6WE2)!6D zAQ`u$kb#POx8OT7vzLsuvIyk40RH(;w?rRZ|{>)B`7jeXq-|Xdj?eko_glVZ= zB^&~;F{8Z6`#o9dnh)#{Y3X@YJ4N|9HvNsg$u^_xGI+^)zU3j4u(zsfKMiI+K{BAL zz+s>ghW#u~ENAt!ZPyP~{WPOqc}{~k-qs+jcTSq=x~`?=?AZpS@Q#gH!i;K%>pfbR%g=V|# zmyc-+Ep{ySL)>qMhP~?}=hz-GzgK(9Eo3xVoeU%YN}f3g=sIZD@R3GpWfMD2x%7z6 zhf%Lwwz1je@9&Q)a+vBa66Tg=&}%r~TFf~;nQ!+nG9z_C{@YC_2m2mfeIOe(2UG>G%u-%YA%RHS$Lrvf? 
zyoGSvm49vK#N}Mieq9F*%s2h6>WP@OWlwWc1J)uhvoGZ**_mJ@?>>z$b_L7LsRbg* z^&AtQAEz%sLuoCr=R8Z8$ij9P`l{RTfgZl{dzYS*oTK~Rr0u5DlT0LG|6TBG_h64z zSbnX`eA4#zsH_pbhd1Gk1n(E;zw%(5(vxnu>Ym}<3Zo6B@ahh8D>EpwK8Io8AYD=^ zSw$N8=tPu}ys4YT9Wf^L+lF)_ob~TrrjPV$zD=wW^pbCTSM?GlSiIxQ^wrYK*}FM= zfC`SmdWdszN~{WoGJmmYBih$WE@i)LQGl_l5HEaW3OxZwOGDVOorCEH#Cr2AP2Q?0 z1^aP1^K!jk4ZjbUG{~|kfp;oZpQWoWd`NalW$6=I7(u*;h!kF~^tSmkCpN(4{zKDYncDhVRD4KJ%{^V`(cZ1@~L~DEx zGdhZL#AsMrvL%dbCs5jyfX}M8W^ZMmffm-`X6GI34ZArS_ANEUTdsbfBkH?3G$$87 z*)BLUpY_)?cg~)*CGDT>tZo6Zwe$XgCubh)h<5^>YA0Q<$T2YQCTOx6H=+$)d5i(z`;HSxX;odg7TfX8EQ+ z5Bs^*t!Sb;@cs)=&RFA#&Y-r*X*r-Wa1*6Uk)$79z73;%)wUdcXfU3}-6(_EE!nKK zQG}$>+7v&9y3~fhg15|Adbs}$2^H9#p+%7cEkoim>Pym^=bZ-`wB{zpoZ+Nri#Am1 zGu$!g)Z1_PsY{*jNGr5bSm(=!BFTrA4(yo^y|lb~?Tc}Hk^HGhy6<~QyO&E3l^mOK z@}NZ?h}d77y=(}E@iYe&foJ3F@WOz@(TWbcfH%jjCbso{mOOfb=-QJMBp#Cf_Sqbw zo~0X7+UjgZuSc3lS7H{Y4`6RpkWLZZ!b;DizNaBPHDXL5cvFsxes52(`^(wo`RAM@ z37=h5juaQ35sSUlwWIWk2bGH$>5CKNlnRXJ_!`>YNA>Qup7kJBGW3faA6vdJmK>Rf zoF(AY)OX)_BEBY8f)cwdQxd9*`e zZlv_Ypwz^5qeWVoPkoUT$nrt8Vsg#TbAW{`R24OGS!(rKAq}tNt}UA2ME~3Unp6+z zs;EDV4_tUvK$>Xor|Y4SLv&oUEyZY9Zx>uGJR5{=#E`Ii-aLyT`nvt#cCfYV?P2em zYSUZQKv`+{xL>c{9t17pdfg#8z8$CP;iF*NckqZkeYHLQ?1(VDPzy-YzP3@{(!@0n zugGQE#?9NLNeluH1LK1uB)0U5(j-NaI7#EqAO(>-?yT_1P;f?0MKoNdR0a}X{s3`fH!a6fRd6DP#WtA%HzC1 zMZ6a%i@yyj6MaB+k`MTl>z7#$OUrQpuXyp^o zU6}wnE0REaWh&^cN&x+}sbH}F4H#-F0K?7s5Q@NUD3?E!i-O;2ET&oZ!KVEqzk-*^kwsSP`7{wjZ1jY384vt z2W=2Kmhqqm!oa`)7#th~v*UeW_WK}Mm>dENQzKw;dIVr*MgeAS5-iV8f?kL_xPk{G z5T;fkVDVsXZVoKWPlLt9MXh|Bg_TZmre$;IeujfCC|BGKBRd%48ioCS6yvm&jgvWo8a2*X& z-D%t-=06bntQCSgkkV8aJjEYf_*7-LA;NP56 zhV&aE+{fc1q-Fn&0Qji$=Lj5-KFGP@@qW_M|B`-#0YL;;H>ia-|3U%yXpQsd&qE0a zN?0F!6;&o~~K3Yoy;c)KU`39|kBRgfdOSY;2o=?i9XlYso*qF0 z$bi6MeKNtxsIu9E-_lD^QBm#Op#Q;PLydBT$t3N!!QglFNlZ)#gPqrq_J4?gsF5(4 z)@S42(&O=VOiX(a2G?TN={K~1%q7A|A`E1s)+fSW+sEV0AUz@hXZ8lYo|e}7K>j4c zy>$V8M}L%wiHROzkn})LPm90o{+=F^Lk&O%g4x~(HNpa6K;W=B8x{YS zK8=lyzya#O3?F#Udoy|HA(Qck8vm9)<-h?}R;U6*hllNwa`Rxy&(F=x!$*4~^c!mY 
zTlze<1FTGt0bvxWc5H)*j0^?SR+HQZ1PKr=Y|NnF>5mO2gq)y5IFS%~gaa9hF5P=m zjaq-(K3)LQLlHI@z==p;fJlJ49Jeu_m2dw&y#+fP8=`^rK|s!gO$KCSc>Q$-=O6w& ze=hx$p5UD!J==jFgYYv0R1zFf!a&CT9sOf`J|8>eaDWYpK#$C1*n=G#Dj?9q$#7nO zU3~CY{pI4J0FVJ;5Z6a>lL6Eq(mpbmA~vprzoCZ=9Qh7zGFU$sf2aVVha4VkH2=Hy zb0LG|Tjruj5*8Lb0TbuuAdtO&aQp7v2T_~kzo&;&jga!wlaZ8^@?x_GWUmPoiN>4j z=^|4&~L`V%2GKaC>!wtq{XgvU#6oQnhHro(7%!Lqh|@ zCy5Z308bL1{T~r;Gfcy#6Re1ew%jx zwv=Y?UK)nIdl_jM8E7{3|9iyWmoo6|rQwm1;$g70g>PwFPrt|f`_lhadPW`|MjD1c z5G_)-ce3AraA|5FZMGC(-~ryTxPERrZUa`KHF^-E5) zar|qJ{MIZX{ol-g<+pkKYx|pWZOZxoo18E}|KbDj6Z1hZg7Cvn{L}BI{_H0#-FZ%WW`mmW0U-v?4X6@U1Q-$0-7&wgX{T?LT$!*8qz3jNo8##z4V0385*$NpNN z=#eIP6KDd8UO@lw6HU+&a~70_n1IR`#-J+vDyWXQ2FfCDf|e93P!($rYGQ3cd8{+^ z6FY%ViO^U4+7do3-%eq0e|QR4j9FO$KyEg z3_{R%9EgCv-xm`&5IKbd(GUmm0Vl!ZR0tW9IFJMV!U>2k7(xp47;sdp?87zD6)M@CA$v^?U3L>{uU||K1OMzr*1B^aNO(odiqsQv~1dx0x9*wy+3# zA^x{z9GG0ifjM}*u&@A@7iYlAhTnG@md`@J@8X8vcMZD&eZIK=c0ONtn*ZwrFDL&@ zv$6C4XQa(qh#Yp?!`1smCK|n2_76Ea(?fOLc`K_O_MZvRnVO0U&_f6PYjTRRLI05e z?G6253XnotF*8&3%zwsrR8?IkP(`E9NoW3pE)w&2N z)YOn}C=$GQsxQ_X`7=HmEdB9Q{(UX>$s`%nDQ=;JIb5MMY5@`nk^w5}**>|{fc z27bY3fr*NW?uo0GF7k);dPB1cr@jA*AA1nuLy3+)dVcTDojV91s$CyEkKf@V1h54v zzUweuyJmXLA|4Jlf#5g%(HsPPgkZon7C~_42Q|L}a2nh4^Px7ar1vog61`s6? 
z2%zVUocG0gFTywFdHKM9;}3m>g9B*-;v)^f77$-u{Q6iR=mh;dFy2flfk%!U-0Z?; z3*<4Hh-RIB;Quj!&}fq*hYxQO5E_6j#2|xlks!{W^`DZGm1TVbinQ5>b%|nhb#?WA zJKqojbXICsG)&K;qmQmDDHg7KDSp45&nc;?@ELti=k48cdn znQAK?`WOCx#Sh>(CcLRtDoe=zpZTYv?F0{Rj2n*?I`co_r@$*h^kci5Ha8zeqtPi) zWq;!TlYV}PmIX;5q(IRL>2LJ&1OL}AKjZ&e_selAa{Mpw3FQA}{8U00{xlzcY5%u} zKlmHOy!@~H8Enos{RhyC@ZdkDANU3jjEw#TK8ow)e<>Z^*uK;Lr?iiMFF!y3KFNLS z=NAEe_dgT;jK5Cx-{ceSf#C1D!uQ4D@ujHs+>>ZMhlPalzvdnr^hCaqqc(Cl zaR}v`IYc2G|J!_%{?~l7oy~UtUzo3(ssud|wwZr-jM%BH5Tqdduv33}pU2O3DiY81 zYb?{x_UUH)((7Mhmy%pg5p2;|Xp6?VDiUl_f*tD11ezWk1g$|Qe%PYfK7Yh7<=j); zh+jGbNeWj(PpuVXBG`4*L9UUDTwqFGp z>!<-^ogct=X!m^|XagUIhd^8304SfFgD?-uW>!Ecv>j^~VI0sRwEbW_P!Eg)LhQcz zi2(?M00ZND24O7ED2(k{x9MgHu{Oh3*6jXoJWODUidx2EkS|f5W&j&sOxW=xFsSe zAixb@X7i)|4>L~I$jj@Nn24a@1wHMH7dNFOa@b5>rzy7@$cTwRjN?cnoEnG>KkB`d zD7L|R87Zkvgrmr-5&x*?prD{oks%;JDY#)rq#=Vp>i5x8QXJOVM1UAv$A7MOrl+T; zppZ8;H@{(iLq|ps>FLk)UaSNJXlam;kWd>+K?K1+)gNMIqNkytfVK_^F)_6sL_m}X z+lTQhFxe=;dZY<*a#Dz3CM6}R^RxU8cI>RM3D}9PNDHuD2GQ{k`5l~{FRQ7YRaH`A zLqxE*u($@Rf2v3HXbYQw93XB||}82$2( zm^<+OZHWGi^!0zOHL@X?LxlRBzfH8XbaZ?6AoRbjhJ~B;2+sexgva;7%YdgObXl=% zLMgmq0fA@(0L*`d%{m0{UjuCLM?&@~$UFEDpZexLMe67G$RRo;=n=kC4aQrCTO23E zS|h&mIJ-aMs(;>_BsxnI;;E5X>S}*p(C}0U`o$GMmKPF7{pVQf+fG2 zdUzh>J~9M_PxL_M10#_A=rYKCdIc1Qn1L^$QlKSL71X~}11(W%pfm0~_>y1%x|0n- z#fwXz_LU9jO}o18?`~c9cO!cg#jnuU4Pn#QeG&S)bvAt4m!NO^>HrSzN!I<^HbXeL z7lHn5xDN>n3j;-QcL=`hlDrs@k&!{zmz1SG1Qlt4;M1Ex@FC+7Xv}^L+VY=)hFrKu zf%}xt1rea7C<1hpMuM)|w=k|b5$;b4K}lm3$oYGvFfuX%rlzLA+$7wyOpQX{^aPlm zo+j*7W)NRA+@q}foH767Ylc(*f70*vDK3_lE^dEOjhhxP=h34gF27co^Ye3a>ggkk z<)6|G0e*g7ex8f#g<=wfWO7#Vx{$yLEgdZ#Jw1e#i6fHS6v;`|goOkU63(L{uauBsi{G5{Rsi+jAtMrA%Uj}K~32opMcuzDN z+QWI%%Wyrx6V{LK@L888XwNpowPI*!2#k)7f|;>?xIT=6rG>fm=TR_=a9zL?Y*!5C zf7XRRCe{CZ`~Q*#kZZ+{bXr_W<;MX4r&#tK68Y&^oZ}!Dml`ZM27~XF1;~Vta2#M^ zVo@O+t5K4lJbHwKorOhey?!SJ6)!Kh5C@ASIHn*YBTu`Nf()97q8z8!>t*>F=&4DF zPl-W(upXup3~;egI<~=ISy_;AC;4tQIX<7c^ux z(%$3LKlR}EU-9?gdJg@nc<5{OgeZRkc#ZV@8$eWt!2kODKAoQH%XJNx*ynUNok?ig 
zO7cdDkNwf3(_X#ag{K@%9UC=w-(SgnkVq*qP>3m>^UV@V58N5s|6t`IFN&8Z_OP#a zt>=MazweCONk~qy9(?quNc2p?Qins*@<8`ZD~*>T8pE9~b^0qf2~`m{OL%B+899|R zh8^RAug)a--|?^Z?ln)}T!-V93Dzw!)Ry!qishLZ?b1}TbJAo|RNTrj-8)!)zm~n6 z()DUto2V4y-Kje8nq{gn?ecaECXZseh~wU|r%&rs)NSqtnt!k2R^5K5)xh&KAlb#3 zyxNiJEFHGALTYi^2UEJT41ZwdQ0|Rd#aI1<>Z=#R`_i~MZRTi|rr+(Dld(7+0Ja>E zHDFS_yq0t8TATRVs6eH~6t>o;&~!pf&;2UzWcwRx&FQ(DpfhXk;3T$o z=3Ivo6m>_L^231-9S$KR9V?hPj5)-!{#p+8| zwiP2>j&r#c!S!;duh+W_R-{mx7NM$y@UWTWFyt!7qtobIBwUUIAfPB z*CAor{Itg8QSIP?)Z}2uA>#d|W3SF9HQZf^RcH7vDdFYA5zi+k7|_>fwmtCd0LGD# zgX1`TSj0P75nXGd>HhRffwlgT?_PRP?J0fN7^E1Jz~6b5=1D=1ZGd{8u_3=dT6dmHX=-On2mEc#??xL^rIEnVHa z7yiS><+G$(#nviOYyMRO*Q~7MMP6F2s1x&pGPy(jk`l*!Qh6zjJ2C;j2*HCW)`Q{3B`jCpIEk%%rot`c$v9f}Y_+~78_@qIuG6tJrw^AKr85?`-2PyVrwrPSI_*>hh=}`8;aZ z6bcxCTA6TF_Tx`B*s&L=-0}NvCyyQTnT^t83NB|=IXjg`BdK|tbFDrurlRqVs~>CN z;a5ZjSM6rrr^z_>$H_R3@7*&XsaH&dDtd`dM*c(y9)SGnF=k>}}3XL{#WA@lOw zNxy(Eu66ta4*d^E;u8`AmKk^D?|lT$(XVt2VLaA8O1nVW+7im#9jnl{i#< zFiHx;Ym75D)7~m^X$s|A2O6+9X}gYcc$5W5cHDLC?krf{CoaGUPk+x-Q&#;UsuNZC zkPm}&Jc z$MlH5{B#4wJ%8`uIP#IYngx5>S-p~RrHh-Mdiv8E09z{P9li z7C={r;8MeA=L0WJT%w{BPyl>*f=UjhPq@tTyJznspxG8#z%RHVY~8C z&0Qr=h?q(PS=@=5Dn$3#pQtql43HTk(RUGkGHvYGG=su>&+&Avd+1kdo>LRbeJc)o zTs|=>FTvk}Z!t*clX&?K87bJWlGvaSM9X+F~AnR8H?#d_B9yp4-7&{nNr$v=p zM{$vtYftoEgXDf*jRarA$f4{3)FzMfs2no4VJy z_%~(J*TY>Z7EdrAYAbKr`kt~6^9%8+N!ZhQ$hzJ?$WJa%C9aS3B{9{GPu11F!y?1c z;pl72Xz_BdakooL=aL>!tME=7!Zlb8Gcmt-FH_Ib;M)Y>nlbwB#)+=sXS{)0ojpEu0TSEbCP-|Wj&E;R z=sTe)m+rtlk3J2~1Q|;Bn5*oI>4;x7(IC%zR~3o*)Z}(*7vs$8*SX5LJ|9onqD=CS z%vAmWIlbG?$EMEUU?! 
z@V09vaE(M+UU4S<=#KA{7Ry?q@4nf+E!?A0YV=7Dii3S-gyP=JQ_0D9BPRLJbr}*U z>}$kRxD?LZ*mZV^)PvRRCft(HZ>8qj%lNYLY)O}wM={>%;HB+Ob&u$)1s1rG?QBdbwh=*8<3Hck9XY;j{EG%08Hkb+Jv*FMvGa6Fj9iFZPo^wC5QT1 zMixy@2<@-}pR8pnzRPjAOA;NIJDK*_eNYHQaixZqj(78Ru#Y@v!lUV*8_$3Kb^^XBL`}R!FL3gRip*QR zuP6IqwXo$3CxbV`bBplh%16rV@OFZi3IEqJxHhp|Ki;KjV3ltW z&77i}v$M_6L;U1$&p`N+rp`_VDb!PigK%@udSNQmKVl?d6*&l z3fG>L3lhcK4SzIfa=Uv2`||CB`PbY_5{c*2M;Erm1tfSy^NzM^8A+w=GqR9!Wu{^Q z=Q%x_U!MwnAqSXy!#(_t-BNm_Qv(IaCDG!N>;@Go&$4*k7!`;9TZ1xPWN?6 zb4j8sqyCWbY0)mtPdg;8qG&?m+zZLtIDM~ps^0kQ_J~u$NWg`@l%n*T3{WDT8275n zA-aDjK6L16-XF3G*Hx1)h70@I9`6Mx2xoE z^~iQ*Ufj9uT<3GoX)-@|B~h3HL;9EZ?&y3k>~ZJVqNwhU-6ZA!>x*xw-YwxU+=S^a zciBC&nkP~0S{fQN7;V5(Kj?aWxxG;?h%BD2Er@!n)WOBGcO#uT{rWRYRflM;9F@a3(1 z7hM}Pvqv1{$Ghz?wJLInPw$LDbYFkrjS;Apg z$)e(C7tPz=$4adnQYYmk(i_iyvOQ`iv?d0eJb3D7f?RGrGtAR4e#VM38P1&j+fh%KidxC`Na*1V4$jCdoAdcRmGml%<-v8|*eykNj@VcuTqs9(dkx5pSoT+D#2?kqcYG>$BFBD7WtyhGs#q$mWRwb z$WpG{kUYbFywzD~{{?t~4hX?8O9%%Ps4vXTvGPuipO5IJcl5z~5CB zx8~5s@tlautY7eQ39(3P*wNyhr54u@1c2)bs;ZPCqM`v##rSIGzQGrAT#dHAwV%EA zt=UC71En`x&5bVI2yzQ(FBSVhNh$xHQ7w$vtxgb4FTFx*bTu%EXk2)Byjhmtlktt? 
z%%#gy9@Fo4GrO2=@l*-FGMJQ~a?of`-9QG{E{`~OUR2gYgZH5_f*I|~OUt`6HKSA~ z7VOOcAI>**Kg$K0YH*$8@^&{jw@NX*)hh1Ny(mBR-6gIhnv8r;Yulbh2OGAh=-r)H zJKC1+HTqtg4W+&)$1N#F;+6lNaxklA`-#Yww@Dlz;*J-0_)W91``e>V&(F`(`*^(2 zW2T%Cdrgpnr0$1 zr5D*mV>a2l{ccsSla$3RZggIqK`}@LJQuQxuvv>-Vrue-Zx;XTsXf?eo5}6BOQGyZ zg%v*^?~K2_+!Z6|nH$@=SjAs{m=}pWw&lW^n_KmFOGXKyo*IMaCsfFZO+Yrs9)E2X z%C&FFRTLC0WFN)eAMs=PV0)P8_{)Y((R-g?(m7uswir$9Eb`1{ZqCe+qM6<9 z+UypyxP5}pr5AlC*rO><4tLevyIZo;$l5xi`|K7{^?8qLc8ukh>%7>QKTe|1&yMc< zVk}#$Bv-S1;bsHr$G83PqShH;z2l>Z^R6YX+s1e^ilOSQd6&FY={i;Q{wUKIXLOYfi$Z z6-38#PLfLJ-!xl_c1AS?@mgNx%h-12P`{QSpQg%3=A?5YWA4`Fx9(b$dX4H!E$N;; zD@pn3638M7BPz6>epL{w@vt`|cJR=v2z54hrBZW2z4UWOOw+Y|O-?O$TwRRj(xUWQ z5!_NP$m3&;?vq-P({19l+jE%g94dv)(vM*R`U?J7p^OYTmUnX^ zvX#&DRfx{n#c*#Ed?00bg!057sAf7W##8Uz5%zI=<|9 zkIP-M(e_y&tq&&5EuYudOGAdMGQ;Vdlf$JU0anTDyYjPG^dc{|Z_zI#DB`THI zc1GmB!6CA#ekB)nH#3PyE52#;Q$NOTnx})xdSLLZY;CXB7QHoewia69K2BBaqS(|$ z(%1CiN6vOt9x^xGtHd}gW*2D zBC?^6@<5dc@kv1*rZ_l&SKft$C|Sp0KMyG~gywJae|=JJM$^hD+s#e$id}Ys^?<;X zl)U1oko*+sEXM+|fL~8dd2M9h%-2wI#Wi80t0iU*!Pa%dkBDR+n2|ghW~w^fRH52( zx>hO7kAmvnr=Zid+5OFaB{u1riNR?!;HG*k2Ip;z$U(p8TP7zMrzZ z^mSaa_#Kb@TgU6P?fnW5NN#1q?*zlC6|Xa1y7dh$xHl=wxa&uJ&#-6LgP%oc3>9(`vHuBS!oP~)^npP8!FIq&UKa!XtjeSUMA1Lad%fjEOt zC_Se=Cui1X!%H+q$h`7pQOtpFBv*fcxB@Jt0&A3QA1T;13lb}Sd}h6U=g#B z6;KNV;?K9&Wt^S7ZLy>ZH#=-iFTzh_gUCO=;xBw}bw5~hPOJoN!ZaSjTRN`t@*ZTW z#g~ABj>dyDHc1bSd{?QMCz41+I;cm+N2NOXH7KI2%_e51i1P-f51q3w>UkUGC6Ruj z&a3l*MLHR>`(Y2`^{~F*qgtxWp@MI;yh^mEz4x}f3^1{05NXge-*O6 zcs#Nv&--~uMIn;xJR-`zOw%+iOigBrEN#}FLOg_`tYvAli!!04sL&IV7Nv*^Z7S~n zoNwlvW>Vqte$V^=TyEcd=ex@}=bn4+dhVUKZ0I2V{LQWUHu3gKI=MTy`e&`mH&=#L zD3)Jq?>=|Diov=Z(=q@;6fr6ejb>XLYP{SzBllFs)JL{8GG#}#j!t1g)S z`>uWldn-ooDl}0s)L!QOEGnw-hgIx^xA()$MNvZic}@o1{MYAle`*tbtH7yA+*GBv z+G;D$+rO;7*_V4;`FV7kRhxwIX;VD((RNC13uF;lPEwxJjWA=Ka+Oh$n z#~Y094-d)^9BXYIGphW7W%t*=D0#F_Yied^<-5@Drzzz(7OAXQv!(a@*RPMHYR0+} z(K?S9ozXW+POWRY+$H6_|Ly6)$r@?8D*}>Cr#!0e-<IR}HNuw2D+70a8bMt0uycxKkV4P#X?-_;(|d@S2KdRIEzwJmU4*9I%kChBSV 
z>0uowT~5d@$B|k0>L%T%RX>kzcJt+LS)N51YnM9S_wTMeF3QA-vRt5D3ZtUnb0xkr zr|FGsIqICzj;gI;O%_iDW#&=ZE}-waAxbB?FNb#CxUOC5QRQ(m=*Ik~xHLH4rv8Ly8jb9dZz?%;o>QC{%OLg&0WW;qv2JB-L%O*!cq zRdsx-zs=ex`&>$|!_nK6_e7}WgrAv{+AaUh&mHbZC9i>!=L~e`wpgK~a3OmDmC?pE zF1*o_IX82=6(-zuzTwn`ywCV8)LT>QeP6XV$Cj`TTvapnwSL!Wy|#*uV&l^XLL&O7 zyBMC@)Zgf>pXuaBx1dG!xRLiJwsca}#f_6TR&dj=_I80ghud6U>=3=KL7B_ z{1K-*?^<>3eD~XJ)`wjaWKI$8yxK1OXJhNv`FC`rv@DD|vT2w1Ru$?v`k+9`Rc z7Bn>rP+piFn$x8F!HNrgTpn1fs-C@+Tl%~k#Tik4?$rK+dSMFhqC;M4wCNXoKyc~o zo3y8W%C8nwkGQ9BXv&7h1KfhbH&g9PJoc3LdLI^8dQ` zVEflSSXCXn?NRC+bU?Xu-jo!@KXEei_HX}Hq&a872?BK^f4eQl1=GvZjW^GFups_;!Xuti zevF!K^q}TWS8di=-Ac8H2=$xRX~7BAU7n9a23=r3%iHwwhb>xbd+k(Ow&zrrV;=iM zS&RF)6x`X-)9yco$~mhNqfWI;KJ&Qj@Q5FhBPrb;{z;uPTA#Ljr9W{GcS-x)<6V+3 z`SsTDyx?%vxj*?&Fw61S>yTlhq|TP34TWY(m1`EH%~PK}cWkbcZQ#73!5R~Mca%ox zEb$F#MVZ`ubn76@xEh6Rq87T&owh1w%~g|Xze(4(PpXa|*(PVX%cS-tXG`;_}DIxZ51 zXRAD|`fxj+pjI8)j{Kp|`Yp5gP0z2?)m91U5HwDwENAp}ifWaF_kG3+bkdy460=vR z#%mvX`#>dWE7dah{_{R}ggsgpg}+~|4eXn}LHlrre&scm2Lyx{duRmq1Y+V;CsZkLeJh_Vl?aBmr~9+u>u zR0?-$inhJqcXCLtkt;ni+Bp8I>9e`Tt4scql)bn27}G+lX-aDJ!eEaWK}uQg1NZj) z)=T}~iwzO}QAy<8+41{rFZg@9p51uW)vaZ~k^3#(v@@MMg#37|NaOy=pmn@$6_GvP z=Vte8?W*Sf{Iy4}Xz$ELOAbSExJSJ|c5~va&FA-?@9zC(PTGlRRuNgCi%8ig2LEDz ztR6T2_NfCitCaDmmVHmd}`N`v?R4ASv`2O z^o*aUdgm)WUHCLIN;Jf67(L+T40Lny%KF16q;Y017wed$bvq7J^x*aMWI1ksI{GQJi(ACyYDK!lQB${!&#&b zL)9C(g0KbcPCvV8Qv6gmaA#X54_GIvJht*ZrF>G8J@Sw76HHQPr6eqB^x{~HQF=j9 z{aFV)80|?Jr!qZI$+>7i-mmt5^xq2eBo+fYF5Y+fviay|x91NnnYi~6GAqo(TSw_Jj%o>f(6@K>&~e?9`vUC~P_>d2W| zFcU^v3L_4>!7A*kSI^%x*KFO`_(!{mrU&*dh~HonF*QE8Y}L%w%}ssA_fsxBJ$+@L zpMv_U^iw=@=E0M8(+w=oYEd*y<@X_dJz)l^ zf7kzHFTd;F0~{kUm-H_h9f8aMaPbviRX)l%E&od3v{S4Q6t5d@?Z ztx0qgU0M7tr@AZW#mLYd7L=p9Pr@9f)s%ho?!ZjfRc37;PH(32s%m#(Tb~5w`8R$% zaeeQ|fRKf^9L!dF7^NpBpw`_$eIhDs`b_HDCs_U4D(#-f%Qb8XK%XCEc6@aRve!p~GaD|z>1I30J@J%#;u)93(isK`s#OR5 zy|#81SsA?99dvxxPe1?I46beQ7d^1@?`nOK+hS!nkIf$we9Sc8V8x8RhMR1ogFYxfis-Lj>h1H0w=b}nA zyf$yXcTu-}8_(s~lX2edo}|AvUe)b*M~*+otMU8QC-^F-m)z1#O=k5{kGOd8Rmif) 
z7QTaQ2Sm;O`NsiO?)yVUf6kkiM09N;ZjTNaGD>^6HsxT~?rfX08zN5ZX=CHSFDe9Dy-s-n?UHw&+KoHLDq8U^Jax`@tQ zerNm0F?GQZQvD`;$&+#Y=A1m#M7OG4P=BMod()ymR#n|jOY-AS5=CX&#ve0YS7dxf;n9kM;2vFV#|+Pm^=&h%OCRM< zI}75Q-8F5QW@^!A+*9q?an%+llwqGuOVb_?e^YYaAKQj!dv^C8f5D#4urosm?vI+G zsCz=iD8MED)v=6}rwEqXQ&^Ah|l zJHQ&4`=a&n?bmo%l$dos+S?|4klMXwIpf?`-8<;_;MY~kkhrg9i;}n6J7#y*DoS6M zY8^PGZ%Dl2A4$}8m*!q0LVjIY21{M4P1RxzM-6Tj?(Eg5ns=~s*F!t0zzpi=sau!* z(aSAl(>nh@E;s&RRY#H6Gtb?+fK0bWt!)Q3s?2`l-iw?n?xfK+(uXn=z1Q6J=GKq- zc7|6vKbvwX+bVs5%dwzd3WCF3`fukXC@p(8y<2`k>YrzdO_shIRuZL@|Dau_=c0wM zP^pp~wD4M3({>6sMf`4YrE5BL3)julh+gXYzG`)D`7d2(j+|x~NUiO5Z(&8cm%D*( zaKU`o+YtGNr*`j$oWNUeZ%)m>H6ZrL>KXIr!S*V@lq0b#A4Q(<-ye}uu#)#A(8tGM z^r=m|niL5{?m1U7Gn#w;)ZK7V4}dKT|^H#V9Xr0Qu^(wwkNLL8mci-ZNGe@=W9`D~P{OwOc+!88(?90K) zc{66M;1~0I4Q^twW%rwF7ZXx{%zx1JQuae&v3|jYBhHa3ZLS)ahK>zLx@$T_E9$4X zIT>4f3@mFZ(nv~uG4!1MvCK{7%hJ?sM-=|=4Tv0FMm@b#b6xj6Xw)~P; zlRc-a+i6vgCK*lr71l@blLnVhpSO9$+tAYn5KZM^ai3oYpJF@ zU0Sig$B^BznJHT}yM4>2`sRO-O-)zV*B& zH`805_Mg@?hr8bW_y4$U%}|7!bIwx9!X9Bc31{bbH}`rcS~%MowxNmi_)5i{sG{GC zI3uVjkDu`N7nLeCGkug8wkhyv{+)hTRMzCEo5h@Szc$|_ghTDrid}VnqVs+eO3^D^ z$Bl4r>JTNv@^w42B6t1LZ}Eompv==S_S@cG`_)3TXQ{kwWw%h>H5Cf5zi;f2ysp$z zh1oCm_EqWf&dThCO5)72t!^&|EU=yHeYZupU(<;t3X?AXa60sKu70D;ma(%%%db}& z_M{Xk<4czBif&#o7l{JhqcldT{NPmn`fRtoEqXTRIv7t2&{Q9q*so>POUR7e@0YC% zLfYFt(N!^X_&qO^VQxKH$|uL`22#mhjr<&4RydEItFhAgn%n#1?(SO@`aQE+bFm|5 z*0}UzTELE#+-Zn@$lK^`}mBX zESpz{D~t5J!q;C`n6S2`n|-w2h_<39)YG>Q)r5_PxD3;(R1*!H@~goL=Pfz~9_phW zCQ(fcrtfd$?`xszHI4Goiak3n`D7g6ssHO9Q6yEl#&(y&FZ&wBc54#bT}fcl^p}#k zC6(F%ilN-ZdeMRfu)*R)JEb)l3dM!TS|oH+ z{^gvKiKe2-FjxbKN#KtzEakj3OluzC{8RFUv&m`4wqV`qExi?qgiD zviy(6G`=0!zUS(M`I{9hT5?`oXLp#d2|LUT?GvXe6t|*^f|t!(G~ox)WvW7-!#nhN zQxA`bt9m1RgzuFNF5VC(@}Kz4YEP^cK^{yS=T$3Gq?stgtgOq_CzMOmOxJ<2OH(dt zPa81rz1f((S$>+z#nY(nP0UtJw^TaSO0-O&+FN_j%%a#7*xsjZIwVh<@}=fC%iK$O zCEW@c*hSQqa%r9!=oRklXgBp16p8veVN@s4Flv6wO!c)x*BG}+QgUi7DpJrlUG?+a z1W-~bEfbe_5sjnr6lSZKwH9@1w{VPVV`T+>4Y*cXby$~a3yhk!~+H;phK! 
z{b{oI!U`)s8Qm_xBeIG1-gT`1iv5rJ@6Ia8TSf)B!kB@rp>@h z&!_#?>G~LcK+6d5k>Eeay@SB#!0UZ1{crU<;Eo5z?gURexR-~(ok0ZdA0q5@LMGoQ z1nyoUBmNZswged5K}6t85K{S`iA6uR!{5f=DxLM=1vno@?2CbJxL1ixO(O*El_Frj zM#iKP0{2bPw8L32G#`)`j2{o*cjgmPhEgDQhw97cTZ9A3QxWZ)N7!ESP&@TI22=U-fC&o?x7HO>uCqVcTim+2IA+sJ60_XS; zuvH|+AOl9=CjlQONMvEOL>3Hy2UCGBNu`9qeOLtU7^8Ut_lGeqvaY132%At`8^+V}TEwK>slg?p%}BiKmqel(eWT zzi$nO*d~DtTY&7EXA{B#9-u7D%@E51)&~=;4-#3x`oQQ$xTB81{d5HG;vqzq&}c5%#_H&r|8Bnr5|k5$=CK)R#pV=FeLZy-zQ+l zj3mpsJ3#+pC`;slCGY@cfh~~*tPdy)SRYvMoDa{rAQQ8|&jgt;fw}?bpphw1FR-5g zcl*`lAQka}FYr`sNGQHG?QmZvfjbZD!bu84`uQNoJmA41@KzF8!1{o)fc1e3&#&Yn zAf4v*@qnQp?n5MB!@qm?jz$~qRwQsQC4sv-$&qEl2;2`!4hRMlxW|&feU1dqpd)@&N8`l`DTn4)AF&k%dpuji1_YobeFu@enfr zD`w}~JosAWUsY8_=aL?KjD)+50v;I3^T0{WgV-Z(Bo+F9vUJP-#`^Fv-8e7f)BBAh zKrbYgh#3HX7yn)KS67Rb4cy5~;QTET?mCjdxh5n8>H*5ZF_Z(~LBw`0c?Bf__x;w( z16kcDD+_;BH^P0rj63mn(a+Rp+f@>?p!A4*N>O*V2qdts0)w>&W3Vr0^DCs-a>nM1;(^Kg)qJWJ>8Ip z`=|-rOHGzQTS>Q*4fGpp>qZ<)G4AiAzpN;az97uQ&Li3=eO@KSh z3Eas|it`_nw`H$C?#oLEDsi|An;<{x>NiStBlMeP$JCNGK;~c;F(O;CssMRh?+JqyHL&+iN8ypl4CUD*%39#2CaL+t}GZ2aI;sIp$ zB5eX^BGMl2oF{MxI)OX9$)Tc8>o>A8n9?VOe@FRazZo6sz$nj2G!HIp7!PtF=D}%{ z11S$aY!?Hue?;>@+($+p`15Oc0Ox1{55)8{dH{u`#T_btTqHd?Q0YLpwgdl>X0Y`#&cvLqF3uh}pY{A!O23XN+)+I13%ePrk#y(y@foVa<^xxw)w!vooNv9w4$xg%TlUxWVI z>3ywtWI$2&9TJ88BjCa7XX!Lg@J~YEiq9G^1}-t zLw?q#9pAsz@<-f|2TVU1X{d&BEYA6p?8rm7->FX8zXtuE)27RYt*b2RgE4(XSzaJN>T^glhQ%_k-QLyC<4eo` zbJ}#-u*i$rxO~n#(p8uLoHQ`(zm@*arL(^9UrK*{>Hb^b-{}|c>-nqWpMOg^*N5l# z^8a4`C*3zgepyMowT|&{f;pFeRHJ(Npx#Ld!*}Q_5a^Q zJK|cKexy75v_0+dccN=E@+@{6?eX{jH~MRr*Z(^GUn>6}laKG;zjNR_2flOQI|u%M zZ~*6)D=1fbiaC?&PCwl!Ww|F@bxps2_*9UYTtH@xFMQGg4yyPtyQ6YR{bYwfc*0VVb9v!j2hc-w0UOizVR#w zv}>FM&q6S*DhFE8mdgm<--$Q2asOfbi6_W8+Ha%HF52-kb{A;F&zxg06Mmx&BBIO& z`mj6-fF5uj|1|!%cOGqz8JpX=8DiUOw3{#nyqR+dtN?BcG@?AAeW}Fe+WS-Shq)N< zFYrm0mX=ArzP^dk(b0*qv9Ss0k%8WTKVwIZw!?Uyz&yYe%L305V9rxu&ecF0QLNi} zV(sb%Nzk6518}VKVq2U`TC`{pLEGlf^($7aAk5xy#97w;0s4`#|3y1+#ugssDHil5 
z?%RG0+Z@oROZ+2vR-rJ}a=0#DP*AXjp+)wI_`}|-xcb(L8iM1Rdu90>a^u)qa>8}I z)Q-Geo<*m4GY?qax zzk)yRtw9&D;QzLyNV-oNZOG7eNKAv+ zo;Dozf;~;UN#e^0IR^cjzh?u(|1|4M@n`C9oSy}~zY_PGOYITCCKc^5#X1SkX{gbwxVKnJ18sXJZ*LUp z$s#dON9(KGY2r5EWJCBrefpF>BL&x7>+_EvLnj8@y)zrd_H;6KrD$J`cC|G$)aX?_ zXF_U22=+2)D}uIjvNlsWC8c%NZL}Sf=+9LR;s5aALn6E9t-jx=gK&>0+CtI$T!99( zi4@z2%h*H8=p=fNx5O3{ZNJdwr-lZx4QO^jzO>ztZMU~We^BDI8^XV#{*LT7u?)Q@ zf1X+|p+UT7y*3TC^(xZ9oOi<5veNc@V8@C!d$Ra5dKS-ocqDFeVq|g0-+!h3V(7rV zlJB7&XT}^Nu|D&q`|iawfIVUzdKJ$sVeD?iwgzH5Ou9Y;4Yl!i1)YRC82T%4G8yUn zHStF}urGmoyso7Htt`_f2JH{pSZshUBY2U8NDj8lc)Ph4eKPddBM8H z?1A0BeS12}2>8~Yg8!%JpZfF?14sOC1umuScrI<8PGbG-X#15Kom4}ESSK-hRhOX6 z3gXZ3WZSlFd7p#-r|O^j^pZ#oI!unDaP#54r5V`S|$wB;aLKbQZXqkrH!Rk^%m^JRWx`+z(}ol=*_Z*}3##Q$9UKS%$3 zsXYLLV?)o+#lNAn{)unj@t3bp|LrpPU*P|tLx-*w78b^Ri~i`*qj4W=e>ZH{5SNjW zK_INuKYH|tAnc#^8#iwJX#au1KRi5~F#g|y|3=%9aY4?*ufkosao?g3axsWS9P8`9 z)!sAsPrTpR*o|bIFeA4&sM1gTjXq`be@O1dRU~Cg?|S>6{{+4b<;mdw8Th+Fdf&XS zPkz3`zjNS!jRPUPcC+yatp9+#Ee4^jK_LE7U zc{2Id_f(eP@ee+6%!#Wk^*wQ)mKu0N@X~!+b1_!;j#Xg1-)4yaGoE&ShQ$ zzZ`s$az%^0a1BCmS za8JNJm{-Ozgbz<(F9@zh;ob|}vw&-}xOW8iQ_TWgaQ_bOQ;?jc zj`AP^?^Zi$$^50%?!LZv4-XHLm4#ab#4Z=;$GLx8v&Z!p+>3+daSX~m2=<5Iy&nf* zT|=@i!$dQ)1_}ePBkdSId`kW}H9vc`NmnbS($hLU8{A zmItnH;@+yc8FkC%MOf1Xx8ig1$GNVnzBcr^%y=#}o=JTtDM~JX!~xfoa6b#K72{d_ zb;;&(ah;I%xj%*e_pnRy{Dz72JUPy7;`*t`k56!Y8uOL)SRS~>Sf6Z~LYaV*uJe8@ ze=Ped*vEtC^PXELUOyD$U;t;*yOA5uo{^~HiVyrTz|9`;U}SS%hIsD+?ybTK`8oW5 zSM`>jC&jfVTua9DZSb6QoTJ9QId}#vuD#;fuejEPXTBbA8bgvEBohJPAS;_JzysW? 
zy7HGjk3{x2mjCrVX7qY6g9Gjo|al`@FZSc%~re5MYIIdmdImCF@B9lMjuo7CM z;%fSiI_bHBvh>%NKf>v?yvn!a<>LgBclRR6y?vf!-UuO&0^%I~Uj+4uKc=XyG zu4Ci69j^aAfq7tDV@!ahvq$-`;8<%Pv26abW!}(loLhba=WpTO09=#CGs~HJiR%rx z&kEN+@eEcxpPE^7z`Pi~Gf&z4>ns1d;3dy9=sNKf&i?uH!ghN7iK&;kHw4$65C_bk z;p@lpXTp8Tv$Cw1UW37T|9j`R)mbk?*BC*&QFF%D*UTT+b)UqaAXopGMX$x-`A@jN zfvJ~xkI0Lx6ncK2q32`yH&o_K8nRDpm#U#%EP0UyYaof_*@GDJA~PA@pU`yIUOV_$ z{&kfz14G(bJ)LA^{5^BC_3H0^g_E!yb3&(L@;Flp85oj;V}GXdF*^J4(BP*kOpnan0X4bX7KWb4$^M|KI~OB zDZdLd5=meW@En%70hR>7!ozn1{1z;zL72V{SnDI5xtuDvn8T%x((# zOoe>@fIccX93!K5WAeeYVeP3D_Ikak?URy{N^+iLll1tL@KLWJm6w`(a^tt+&etv#_MP2PK_96bfxQ8C|;aKNVR;F~khhtvc z(~D#Oxf$YnOVIWP=V6us3|unO&`|o=kBZznhaTJFSSI>@k~Dqnv*UhroMXUw4jj+o z9g8@}f#Z58@2ZcbUkPWn;+P5d3u6ED(sm9BE3A;av&Iwnf#X&jH`8Sgv|!oafwKCP z^l`5-j*0ND4Ep{omoen*=K185*L)JTaSjPyKbu?%+(3L@RLhn9Js1$_QW9}~wsxPKPMeK_91J~oc|aGZhrWG`O{lPi0;pShxT`b>Y0dA@x4l4QjN(tT+h zSK)X9$G5oO9>)?muEg;a?gJHtY?YSHHsBe!tF_awTZSsg^HdReblI1rUigFDKDm)3 zg?f;i!5hf&m4P^C6_VNOV=AYR))sQ}YXSl5V$nvK4Z`_}Q_gF-Dz<%A| zjp(r<&Q+k?;<)fe$R>I}O-xKoC8mkEqmxaa;Y)o_-1~L!>?V?b|EhTW2{J{G!C_7) z`P?pgoCy7yiu%&8i$AikiZ?IGt%G*t@@8ZD{tZTMFL*DY@65okU@uixU;4Fqg732F zV;>#I4dt(2z@F0wq$nqqloaB=#+tK98%n<}zI@7FCs`aCN?$hIU;SNI`gP&`C9vB3 z|B|%oiYuG`r}XpwN*J;9Q`BFH`zdjl^#9FIQmjIeVrAO9K~KpIh-uhkn2&qVZ4`R! 
z5aIHW8hEB3X4nW`KGQ*eMFHl!jbW~P6|6hhgYWt|NkH4@jO{D7d(f{+1lzzfkVa$K zG*Bk%@g1P84#0LA`U1spFL)I610tbsCf66kxV7ta(&=q`&mj z4)i6S>!*-Th(}%VBQ{PE&wYVy2lf}xV_S}Wtc5@i_OHN>)~2rbC2wBQb6B)}(E7>b z-1z_!elduIT{uFHK6ps4D_}njcaW~Q9RtUvX&$qSqAs1*prB@3+XvTY!5>VN{K_>`+YzYeFqE^pUGeL znR42bT!+2v*grsfMeG~ADtu0M@7{e4aYx6*uU!WC{v`1$3iuAFj%L?O& zW7ou&%~w|UzbSr63ZJHvqUSjfx3u2)b>&|ltakpNJ2xR4|5J2UU11pd>-zR7Fed(& zJPE9MqjjzudPwL44{PXd^>}eFQ-Km!Mg^)VHGU{+8$(0<+m!0vo}v^fEf@;oTM*nk zqeQ8xN#2@LR3l0ihKl$t4&<;g)fyBQzFC8I5MNG&n(J3s=T(-ZsYdl4ML`I0nrc)~ zFN$g=NfX{6esf8hYLo|zI$KE6RHLjx_aUA^N;>d&CS^jg;O|&!I{cbQO{J#5`$YIV z4t}GHY*h3fv1ti7mnv=U%REQWUlDiwD({*C4;3Ii2TDNMPOB#~27Jq&PHw_HyB5)5myPz~!2FGa;S;V%DTP zkm^rqQv>L~Xk2Cpt&IXrrvn|<4)EZ~P#~MW8Z{ZdBDNf$2vdgq6xKuh$?#@H$3#xp z1I@@uD+uKPVLpVZQAdIH0W>$YDGmD90BRt><_p)fk)S$mIq^q#|K$)1pFOjN{N!=8-~}R1sZE2`23u7Rj0@N;)O$MWy8bDD$ z)3wJGzM`B90dIMkrKqq*`Xiidd1_0!0-+7xZn(O3e=T)2t{sPOVPj`CTz!V=IE^9d zYC;FLodui6x8n|1U(OY(k1%Mi&lU=~w&uL$Y7pN}I9%OPU^i6AvF6&cg&MXt906a* zw{+0p@NI{(g|_{dYOAZ+vh8dvxk87TvY8n)S5wn>5I71QCfZr@KQXa^>KF|o3b`Cd zfsMm*@hboX+$D~X1=qq@V6)VQ$F<@LnQyXR$2vho2cUQ|cPW>r#>2nE)!D*{c1!t- zxdL@HN1Krx4p#_tS+aRTuDT|ZioRyOS?FuZqNA@Vr3&Nf11Id+4mRdCJizzkbjk%s z_K6YDXWQHJY&eh*-%g5>(9zzWF94#any5{*7_PpOJ3vcYTSs@WhPJNGAPt>?99<2z zt|eDPOKXS)G^hhD23l%4%h2>e0t0om2M#gUV{7PI=s^OO<`x=5xC1OS*aLKQv<3{& zwbUEPZZH8I_5jNPL%0JqbO)O2Y3S(aXlt0W^*9<9gY+zP*gEV%+<`+HOhDUm(BMIw z!Gko0XmPbQbOsF6)6mm3H`f?4&_c^{kcE~7dvL=E4B={74rXf)&@i_&9|$CHhiCvn zx*9q|EG=}k*xI^my&4kqHEZ!zUsJ9C=n|9(^6~Pcg)Ra`F{?f|SA5hXXF`B3er8`J-mVA=x1|V)+eNDL( zB{HK~NA@35Y{9kX+F3x|m((VG2H-GKQq`pb$JG6a5ZnR9*eJ0zy|4pm<-a<*3sc`EVV5x)zx@xJ1a-Fl{mPh9_wq? 
zNn8yWnE))35{@eRyZg?8?;QBff$tpnU*>=zMbVp21B?UA0_+3a0)QFD6g2@J0Y^ng zP@+txdPWIHq zf}ta2FBhf_K$7Lc`~r|LxiAv}(o`-lE;#maKKuF zVHrT|+3J`BO$X0+j$(7(`faJs6 zQw@fX0iq_CS0O-J%Ry8iMy6cueF1_xq$Zt(0CAHGvm77+au6SY6oapcbrjzGevpMc z0EoU^%8en7-g5Y~0Z4!xrCk9MCP(QMfRxCkjPh;WQImrl2Z$T^nplzW79KfL)v%L=zyb<@js{5GT2G1OP#3 zP42Gn)>|(39RN|4i**1X_Htpa0OY=0m|FmGlEbGGAlKyZ=?mG00kkG}1Nb>xE}cmL zk*^nX0OBDR#u6Z5a`<=v#7r*CPJpyjk>tld%@0pHFRraES_1vYGR9(R@`Jk007Y=)4}ODK88e4T6{bqlVsc1gq-81KO1aMA+gaLJ0kUFt0{@pnM5(mZk^ozn@RJt5oczH=n~jy7 zgafAh-}vTJ%@+ahC0MYQS@L0yPDZR0SGFED{QM;=$4)L5sF$ft}f zNRgD}Kbgy3T8~UXO=zH=jOOu$TxpH0k1s-yCp#-?rVM8qtW9#fl%Tl`q*huqT&P72 zltt~@$Yfemc#cA@-~&ZL>ln7c!Odr7A%$o30tl$)Mv?yjc^$Ur${BQ zP)(MaJA!OANE)n9;8KBBbdO2X;>yrT6Znu*jn%yy#g|X+{Cb;Y-$TX z)(kf9BRs1mLbXFnCYEWXTb~pP9AQe^wnjfoB?8+P8fg|x*3%UnA&?`F%TNFu#7S<@ zcav@Gq?Fpqs_co-!iZIG-7FN#t9(h-(jQ_)CDVS?M@nr8k|V{KFRWc8s7I}_m{FKv z%@0~DMx0q3Nku}Mp`d2iiM6`LSSJqG9?J`qI>?2j#33b^sK+-cZc^UMVPV4Ka_uFN zK%H?Nu)1NT<_eg!OnE}!E8T!e142nGlPD@#7_Gc(8bfSuK^Pm>0Cnj*ITXcBj#L~O z`K6oH2ILiRHsV`2^0?UAO@XG6i8#{+^ah&;S{~OiBDs%oVMqKdX>LTB$3hipQ zP!gpuitog*2Slf)LJ6QMNn0~oK3K2{$CA;d&3qtqj(9HY0>@_x#wEoqolNZgenaZ@^J(Ua-mq^hOUFaWGM&u&lA%@{Li@TqHBk7IA&6aGuH12G z&3SU*mekM8Z$3FkFIyTbOFyr?5p-W%$(8K&?DujRdHS(;9;sL7C6va$Zc(%BLf`d; zJz*%E6@1OG$SwbIe2!fNNPrFKx`Fuw$>MVnx&@AYh?e8Y)X1+q56qcE+m&RVU8oiO z$>qO#Pe*cgU%U-DEUkGzR7R)-%W1W~@YIUW@66Ldw$ukNTU{W@NH0!aD7+$ve#D?I zG5-e@y5zg-uadHhW97Wy)lgB#^pguq!`C|M~6)xffZq%ZekxYY|uu?jm`4dQg+gGK;(Pwvzx7j z3LMlcd)<~V=nZAdBZ7We(CR#s{1heKTT=wZNclzM>Z4P?Sv#R&xUC!8l)m3SsVO?! 
z`T>i_h_}WH~h_m*?AmE18vi6!^~asFdw zl3hG0)nB>%T3>)1Ni!^KIUQg^vjKm_#|bJFb)uFu?~M&9JkdI~@P`Vrz-nP^c}?gL z&I3-Q+Bj63)OJJ?dF@Z>@Lj&|=iRPtzf2Xot>74XBr3upUl~zGlxkwu?%i%1{Vn){ zunx5U3F;4}UA}?LLMgPF!5mq7KT-d(qd^B`Mad;W%`q!#9pBy&SNi$X)_TL+7Gi!h zTcu}`ztXQFRH%v_up#`Lda!4YO1f4|i*uo_H(i(j4L0pGr7yk?Zn=d{)RPIdYSB9d zIr@ZJh1c6vXlaL8ZXYB+yT~jMRyAzTmPJ&-oWN^(KVkCAu=3=3a4c(WwmnE(=)>&6 z3xS3_2Qx8zaq?)GI}mSSISOE>bjWf+e~N|0H}Nv*oJJX5tpTW#O5LR$r$*^^XdlAJ zaf6b*p^`Ck{2UWpwEjW=M3;-;l)vU3+S#`_Be40E@8mG~*&i;8naTdgnbzo>Y%yl$ zi*Kt)AynLG1#)y}^f9PUctG&f=AVPdjw)o%N0(n^#sOFx5u4o+^=eQV`)$MV@EW$; zk&L^2adGCfa62G1A+B>uNd9Wm(s%y`@v0#A8mxPOi%I>qlvBG%h80VOj%?HX)9)g= zz|gvx7zAg8xIj>l30p9P;wZ#`nY1b3h(xA+ovZj~W)N6A&pNW$?RT`mDY4V%k{@zy zAfq-j7^b&lQt?Ti4Po@`0QLd%Fnu1#XWz}%8P_i%4xUU?Ek+&e(*0~e?l?Q-C~%_S2YFHl>y$FJxHq$gB?}De;u<9gMOK6Pzs@2Y2zwD- zI6@v}r3OUKxl;Mx_fheQew6itQ0nuTaV*ZiyG>eGV(Ei{2iRqENY3e3Ze^G$ZvCV{DQ!|o3wB7{xs35}H6zA0Zu=6gH;^*wj8>1~3=mre2Fkqhz$W)IS2 zK4kUsJ^Sl?E`|HMDn@c9)0nl^UOfgFGp)PTj(E@5SVvmc-)Boxq{wHKf`2fK!Mvj~ zlNK6W-*#4@3i+bcYdZW8;hFGmX+Te`xTgx%9#K)zEXDb*4K@fW)S4k3%%8*8Dmp(- zLRmVD7mx)0JpDk*qtGg^A!n}W4miI+X@@R}ku6RouvS|y&l$9k{!wPgSkEK^;Dk2; zaAno`&M%I$=1rsPDlQgH4Ehja0%{Z;py~=#xTq5#HYr{w_)SE9Fi?}lCHCB25_}zL z80xA^YblQGj+8KLlT*XRXyR`t=>M{OLrSk1@f+??R44_FjGiqpSVD{?2~oY?q#k*b z?g2aCn0|z%DyWt;P@4`K8uP|(vUh>4w7(Nk6G$D9Ex~wbS#M`&b;4)pTCQcmUbp1i z7vu{cKr%m=Du#AvFiBCe*VYL&8}=_X zJ&$$v=MJf=3U%#L!q@SVh`i!xlY${<*5ZRvr+Zn}qV^J|G<1i`xZ7D;<5G;2U_U*vq9={lFPiHNT?F{l-lB1b)uI zJ*G=_L~#44gtJ?~XGKV%B5S@i51V^OuJ0!J={Ulb+om(G0TdxuB?QEi`gv=#0&WOn zOuuIN$>XgJfNgZC*>+O@OsR=bez|{BKv(Tu=T7S&NWD*P{x4lV0+19YK_TQPOzSH; ze@NsL8@PmC{-u*l#r3h_dD-`e0r&<6sA!iVN)& zTR@r`!B>M4u9)S8N7VIlw@$}GqMY6 z^wVYt3HPLj-b))rZT)ir=S_nnr`33ke&4lo*esO!5-YT*H#_%dd*NvoDLzsFvc4J@jYWF@kJ_)Z- zcK`EJAYfW@>B6ReVEFIlgLUHtJ7cX9k3l(A*qYA1gI~P?EL!F3#1jw!$Asea2=e?_ z-u9gJ{JStw#GXY@rlJqqKx-F6$FPBIV`1~WzW>ma37X2<=#6M0Qh1(!%K zaC_i)X<&c4>?^4dvT2i|x z?$U+Z5_QXUGy7?t<&*k4;9+Rk^9>Q{DIs@#zds}4zS4IdSZ!W+4QwdltRp5eUPJW1 
z$W&?lD-q}bPZp09EBgn>zJQ8Q7qR8iIwI|R_LKj0>-94(t#TgDQpHhipx?ge=uyVU zlb-W2GZWj@&FGe3x(LQo(oA!1=5i^UV$g}W<2~JS6gS!%y7#<(=V9r=eY6Pu609el5H@T9_qI&lWrf9#5!xeBU8Lob2h~_ge6v1jkDIBuQx~@#}k{Ju3h$hp< zWP_gPZbxB521Q*Jj*SBR&ba&7mc5D2`=gqv=Qmkdp*^3OTlcZxGUi#mf&YOZL{Q4< zvKvR}OuZmz`lBaFUVfs9@QXCkr&#f?_w?_#qwn2t6(SVHNuAsJG>}w(FSd9M{ypMO zCKvZ`ovDcrEiTOE{A9nC-OXi178$=`dH>9m{9@x6mpS?fAd2ndYRGcSEN?9~n;wCm zLjUGu{&9~l{)>B9LDb=f^k49^wxeDV4Dc0XonBuy)3^e-;+0gsR9iXr-J@9?WrO{9 zFtMK1T5P1`um4%XUME0XRFHZq>Dw=US=U@K-R3$kS0_(o;Tou4kUa_P<%~mMnQ$~3I zjNz%x;=8;w4>z~>MW~QoQWf{+!_b}mc01u`H)bl66^OvTBWs!;1=6LP*$jhIo0Kf< zu2oDE!!-QrK-O5+Qp4;Mjlz(oxBiP^^d+A&N=g@dp*dHUU3RRB-4|x7iaYO1F!5-7 z5zW)teJYvOacW^0n)3^JHGW;rhJ7&+fU%WzI_sD#FJBcFy*+#sEFe7V*S39hrx03` z$UAL;2`|GCDilk|V~N~AMP%d3-l#6vZ%L}R_^tlZhQBb=Z>jbgDkr0qIxxOb`j9)u zY#sGxdGCu&Fx-IvjE^&@MBYH>lhcMA^}o!lr?d!ye`a0C^><~2Etpj!^@QNczjto* zgxa@+f!lKuYAvvqNmH`zq9_&(9Xr2orzIp?(v;QEY2OJAFyz4 z8a8x0n7!4w8i|VF{UPdw1*U*GtI#1cl21%*AA}b^F>Ua+7rC*g!5ot>E3!~+0OW;k zh4qe-5h>t^NtvB&;ZWh!g6ij?t88kYgTZ~J_l=v$lPh~<{J*KiG83Puc&z;9R(?(7 zbCA^Izc(%>xEL-xPE^IG-dP+nYz2q;y{ck)Vw0C%vwC@zw>n|!^bU>YZ z`Wp-gGs^Q6u9i?pR`*)+{;{!kSr`72s*m&%xbdePveV!fiW&IT$Knv zZ}!XuxX#jM>mc0+jax9{4YKs5s7j8pp{GT6{Uh=};Gujod{fNukfZez4=X)C{+ZX5 z%hL&X=gg$o_UG^xv4c-*XHATZTmsugcD`rw92#)CG4gq=jvW6l{*4s)O4_IsGzK8^ zdA5O>9y~4v*2$qylUN4~cJ1+>gyk*!n@AyNIZQQMmqb;Yf5%pPY0i1_}kHV@DvB-~ z0zIi-Auf}p}t;g(y+$@U;k!E-!CH)`4 zDwj(i(KX(r=KDN4R+~{?grRghz*qnxVas*kXRn}ZiR5AO1*y5y@p#LkgRyD9|EPi9 z7&RUX}~=pUpGjx-P$7%Y6X0*2Mc~#qfXY04A0KGJF|S$Z)a0 zQB_moB|-&T+i?z4ER*Cz0z8k@~OK*G!e4(+x8B4WX` z%9H_5K-e=X14$R~FDyY4w9_MU^#k7dp&5^4Wv{;25I5k|;|rp^P^A=o!nfJ$yoF=4 zxBQVfy_L2&VI!lRIXv>;Y2Kg(<-~<&vJcGK=75&k>Qdqo8aVmq0e%= zmGo+_C>$sV%_dlw|8Ob8yjMcqb;;yST;NR&KoU5N$pf3~cV?hyT(*@rMb)@BLdMya5dRap}2FRmH~9DCcvey$6fJ`8aKU8u8UI z`l{c?Hw9bg+U@&GF{~))(0A6EAM^{bhXBK0=v27!$oU6#Ut^@>(Ve6-s0t-tXuA)r zKj_tNK7WhzVKtDl}?*nZ+ji6FLb@AG-C{RrX-rUsEmh_FR?*Evdil>Iyy zA`umbjP;6%$K+(wB%0XCU%az^UGiNY6|#ZTl`J{ 
z%Nm&mY*eo7vd^t()hW(s-o+&WJv~!QbjSZ;3WZ-j^}_`LOCsAocBgGi6Xh*0yLJ^* zPbeWS(_LThsgHodoOp|M?Zk$w#7r2p5jOMi?!4}K-LJJ4>k0Jp9pzJZ-V>?~GnM`I zK>>w&Z)?0{%@<3FN_`a4M{E)f8UCy%KjHp-(5QdlvuLvAmuI_#=*9zw?v-3iM zMUBZbBGjn7XUtc4j^kAx8a`O;-0a>NczI;M_H^^L#GaxoN-x?D;hl%6S z!P2}KB-`}GCaEbuSq5SmBM2;<4c}E8Fs5Umfiu<`kSSCQ&GQNQ{=DxJj*?VogGn#$ z+I{P_p$@r@s3>2oPm{VJ>dZ(BM4;TaO;5GP>#sSGi=8=JErv&O#l*vuAQDUU*)KF0 z>UHP1g<}43Jcne`2Fi*Rstw@ALjKtre_Wx@R4dU3thX9TYWPS`CQwuCHL~3wvoC+& zSP29e%XYaQr}CdD2-ekIsmTU#PQfblqQ7;pl%c;h<9iL2-d%?^i`!|E5)AYv{GOqxZ`3Kwh`yu zBdS~lbe$KnVSl~XbR_gGiv85d{vVP2v`8Vw1I8u)B$v(;x9Zs%)|J{7xd5gRE$r4? z)LZ(~F_FDvgyK@3Bnku|4nQcS%<^%7$X0ope@*!aTeSeHw(I3O>J6C|Uj_k}0kO4% zV9{aT0=c$biqmuj+1nfHu%pH?9YCGk%Hf@opk8;GGe%!G=ZVjrnJot$%9ms~-jWN` z!}Q`Z1KGB98sK)yqi}NtGmBH@rCmhWXvB%7H;z#~8os1}+e#=Ol*Si<_EQE~Efm57nOB(@M4s|EVnhOD8{ zC8i#OrfGa+EcFg{W-sC*fa#Y^x(izGN4B!dLfF4HK8e|G@)O~ubq^8p0q&6Z6nhSl zhiN65m9zi46xedt%B`uK{%X{9C@;CVKa7}fA?8vz zd-%P-vd}3R-N5UVj)ng=NiXp5RT6|*`D~-C0JpNgszw`EmmO?FSG+?|mhIiaLn_)s zHKR_ZC|*79b*!`#lCvAUu@|skC3tA+3Wko!bk0^kN4!}I!`ITKVRg>hfZeCd5^@kN zr2O+E&OiDuQM>=P)Q#m+6Lc=3&t4W!cy`3y^F!Oh=Y=F&YcMX&0%kpWvK;R+AGD-c z_THikjw;834{UZ5h0$V4ZywR0s*4HAy)f>~`eZl;LZ?{XbWy=0Cs_}nF2P4;&fC^> zH7Y+}Ls*ec2ie=vQP1wvlw9W1r5_h0!`q8-MLpY=E1G+QAL|zzb47 z1qS|`Moue~-n^EJ0?kUo4&|GN&&)zwzv88_ZF|PeX)VIr*<`&Q{+Nf45m}?y9SdES zIShzAgi6Rfj7q|qj+u7gPY2XCJ!5Sh%+~|^{3ZRv2_2957MX~wB>5*5@Y@t!-y z3!umow3p}tncd^`_*uEnc{*i!P5cEL|DNg|@k@a|Crq_VQFfL!Muq+iJ>KmW2GQ}W z>lkLqYT@jODV5qO^I2*XzCg;aAx=<6|0Zv%o%aUJyAC+cK3Hqmr^}ong3H%mIkOL` zASL8wnLG#$uu~@wi3W1}yH}!ZIR_%<%6VRM;PpX^m8qaU>k`vepNNKq2wgKdM&U?O zxbK6Q_REeV*53yM8wXYI%*JtpEH1c7@JoHTTI@I>R^+dW`c_aMeCMX^0t0agZMZrY z-oa>97kXX@N$O(1ZO4Zt`12#*l*a03dY4>(Pkk34hhAa{x*wo+?5G@7OQ0Yk$!%q zaGZl@FXp$(p$i4}Li!_HAHjq9r~PZrQluy#i%I@YkIVFk@Lsj?WiK)~A<82n=80iq zJo}ko$G3B54W#K3z#@y3ZrF&MN7N-871@*7cTJe#d9L``l&@oup0p`>{1EVxQEn~Z z(9A{9^sD?RonoT=K|p$n4!2nXJ9aK0kA4U@yJQe9IU@D$0>#1%U1pt3P8|0Ea!zO&6rQ{L2i>pcatAShOXeM`=q*sjv8+6Y6E?;bzpRB_`3>(R 
zoP|CU6nu69pDd4=&p=*Lge|LsQv~mf4!k(xC#bkS^xFYu_A5dx?Eky~9P2P80L6s8 z9Eq~uuWyMTW|y|l7;Mk{1kD+;tNF7S2F;yv7or&qhuG1R&ge>ooo7egj%Awr{S)lR z87^Kivqb1u|?R~ z03ptD7<#kDMn-Cg7V7FG7mpj*Z@&sioEG9h43A4%^?@>M$E6~8{Jakc5`g>GPsslk zh7|$&sJ72w<-<~ddM42DwV`(e7fPqn%G&>L|A@^1lhYO8#Q}6^r}m9HlW>MDX%UguDGj@`e&)`{M0(qBO>ss8AcIfhKz%J zd;m*e!e5cOi-1Ozx-a;wj8%jncSw8|pZ2`r@zEq}?)LKMjLYv&l!WK$fQYRIqOC}dw!jri~o%-r6i%>WJUD`lBFL2c1(kHwRQL@L}~-D?xLSLTt=m-)e4g?6`f{TC{G z%l7=`e+%aw@E1g1P|uKIN`klD=bk}hYmk>Uu0%y^KLXTH9d_*pQ4h%ekto@H@aqPi zfk{4YI}Poo3B8E<-Pbnn;TLkLmoHAKrL|oKrH;ygX@mmoP?rL$Dl8Fmb&wN?zy%+xPs6Q@o))Or*5kU7_Z=TWYohOfNHq`D%H7J z0sSmVilpDxPw@?MH}TAWnL@7>QvD%0#PCH+l|}16T$H=;&j0HO;|cdHW#=y60#_FQ zMo0%{X(^%nz4|IPhWZb{wa%%o1*qN*NCJ%4|~Oy>_&WJ=s*8Y zr+Yl?DG#W^J(0gc@Aq!24|D41p@a*z@&lf1?2;elrUp(?vbCQmra@v72;XSsy@ZFa zidP#2f+FDLqYI^H0)UZA7Xwas?tn{vjGdnd8FYED>6S*Tx=juh7+r9Rsy)X{_|B`? zXnE(3n0RT(Hvf0FOHxFAE$7|ecvP4xA7CJ?JYf)H497>eo-jHpm@dpTQFgHINVtdWYpyGn-N&{I;Sn+$ z-*m2k7c=7gWiyH{lFot`j~(jqm9u1gsT16Yx`xt4-bK#bBlwahrpDq8e=y6!^wqk) zx+Hz#K73a44{?_Gl0=TQ#RH@0&i4YR)bzaVXm5ghP!;aKc6 z^M_wCAbckdX_3Op|9R%#!0nJ))pns@WsX9&pS<9JFmZlZ!gE>K7mKYRb3h%B^GDYd z2MK!RhuO9HFPaVT*0qdwTm4sMObn?Af+RMsY*gbvL!CqUk}T2g#ong5HR)oTUp$gf zXBaW?uqkWU`x>@j(2(Bo*k+(God90Pr96cLGnTJ zVPit%%O@Qc&MQa%edgnq4?oUZR1gCRZ69VRpL{1-sd!@Z^0&NNtE$JPuuPV&0N{$q zmDYl@`J`=IrCp-BXPDSVw35+dsPX$B$OP>oN$vAAPxM=95&UU9&S|nso}{O1pBmkStq0!_{MaVG9#Z97o*Yy(-Ax*y@ zd=`A;B>BLC#L^a%)-!3zSd ztOo7=tu0j!S8CSCsN}%jIRo0K{PB_#wMWz5h74hmnZB4%#xG&o~g(GH&2vvgx zH$agsdhPFgO!t3qUJ6Ln3htXX+$W7=k%JJ(cE-Ag5l^!o6bNjn*kB_ZO#%tdI0QIzX9eqvB|1FGE2}-2kdh2OiqG75;%@vJwXbKT(|P;KVzc85~J)5NMyDz6kBkYrpKvUx0|g^?C5mX=q$@=|m2 z@zFnGYVOZtW68@SZfcxSNh$x7j8BhW5-&j%y#KZ?%HXL7v@IkT?zU;(lmIktI7!eQ z2&3`^Gyy%n+B>0EVNDMPXM!|R_Pa!ev6OHC`rO)U{E=&K*tfF?RW`r{T?zk;3UBYk z)?y3E?DERo-Q}St@7};LZv@PevX(y1>i=4RmlJ3md!VoY!^i9s=xC|k3ym-+It6Bv zt@B|%rF=g7T)H~QaO$i~F1!!+&9)1!)V}VU{y9m+FO8jRKJ9RwDT}9xYb0A7`;wX^ zsWFz)RP63QHGHNXd<5<$BKIP11r`1;5tk#*JJH_q 
z14a{+3}a*e%eTx*6xZH*c^_}@P;8y^a@X&=gdd3@j5cLh=1&z&Zq=o6_pn@v&CSh$ z@+Mm7jeJvNJETahej&c%43ReJSFA^BNXrfS33oMxT$emluD6AbnhEG_Y^^la-*DSm z5CfyO0`}BXgLdQ~WUJs!NZACTza?rX0?M&$*f{H=knbHfke_X!{{i;tZ3y{n^sS4Q!yB)&Ip=e6?_d}$3pC0m_vZTKR;Sd` zpR1&+9;Q6!#>E%3Wq!Sl_N*7wZq;jrGZ0}DwYOHhRr=pcOYN6}iRWCJqhv%imc4>r za@Stk)*ePRnqp#`I+7L8$0QiBG7tBLH7Bc=3GyB|_xD=hDX55=ijzD94@?L#TMfWq zq#q<2h{^%qNS1FNul!k_EAx4JdBYAvUjL)CVbprX4Tbw7br9{9ydC3qzWBGJC9B2R zG0lfDQ5S;-Z@mkDIy!2Yt0Y4jT{u7)jE6j~(?j?qp_Ci# z%}2^0m-3tMFVxT4R1Q0sWWe-i;_b_i0TpvLd*iW_>^~HmM)A)DlBB5ocek!F)870l za%~7mWM(kbgyM7g*TTSMgt2`ODlX>k?3kk#rX;MIL3PM1WJ`M7@pl85l-WMgsh+Z! zHqcG-bktcXIrmM%N-X91g9L4vv4hVI<(bvO%$kZmR004_?zcYU_k&>a=k|qw@Pnq? zmV|liTSz$VJ}x1MR3E#$Xnr_f8LXCyPl?f2Q*eFg{5D5^$E|eMv!gZ{mSTm^f8iHr zAg<>b5H2iu&+WYy=DLlH-#QF)Awo?>nDsF>)F^^>(x6NUQ@x{YPl=Eq!wvN@-9e|; z`FAvr;q&y{>+`Ak&!+^ZNec||XGTI@U$yuDG)hnZ~`8%Un~#HYva z?L|Dm*``Ahz&rCP{Wvg}{Fb==$#ADf31!X?0G=FW?L4g=swCi67ZSEC==J!f)`+ey z&h#wuJ$R=UBp3r^Y{v}h#sJDU+KL6L8bF{e(^?nD_E8)lYosUUyERN1&yb<;0Z%F& z<{HM2`JUvS?S(!=C?Th+ug$-OY@NRrSRQ{4i#U+KSX(hxa1Dih6+|RFIw|m4XRH1z zUVb9Vwvm}Zm0-j@+EptS+uE4*#l2s;`-Kt1`Xu?BDWK3%7+`>OepiG0eD>R`8ppqi zG;C}D-dHThTgWER+f+op<;K+yi4nqIE@Bn!%j8=kgvZLE$-eL%cYx4)BWH4fdP~^f z342M}tZI)<&w>1G6spguLJlZNfAbD{Snu^yGN+MSt1te(40+PPH9=~{@4eqAThkhO5K|m@w&x%H*{3YFD`p47Z%*;r-Nk;7k z?x(y3PXeA5pKg%!pHy};PTCVy)pKWj*r8FhrkoL5i_-1d=Q{_Y#ekn?5xpgyECXhB zE$_Sm$rw^b0fB>KF;o}*IPMBB;)DUUt9Wn}vRvX5O8;?(yE0I)vs{agF5@i9Th|xh zk_V`WjrJWjQ9lRBNq4bqY~)`Vn+{E*=Whk{xm{gvF2fkqexFJ`(O1lmWrGxu}>OYO&&Cau=OeJL&)b53}M^~>9c z)~xm6|o!5=XCdBb||x-38Ec9k1e_&0B1c_#An8^j?ef2Z?82b^ zk!^4?4wtl9vWLN)uubA>88XA8K#D^A$mmhJ1N$bvoK@5rCT;1^9M z6a0^b!K7}QSXZVOYUw&T_wf2B0?Db##pg(3^BNeHOlZLEk$rxxQu6n_=wUv~nP(3} zU8ybL?QR*P5Ayfm7cY@uToC(?`Uh*yV|_Da<(mv$-;`>*nN)md5##eS}OM*qyrn%DM! 
zZvyYaQCe&&E^*!=1pikT{%WGG1K^!es{;N^IVMM&3)vswkT-U@^TMNp)hOPfiShAI zreL<)PG>i0->YHT2f5(|(s$6tRY0< z9WQ^HPOHB1p9H6h=BVt>WYEKaJ0FzGTPc(}`kfg-$JxpuPh3W5_SMBcy!OHg^i@;< zVsRdoS=-X-jt-B0cxb4MXB<^rZyz63lVT>{#E2i4XhFcDEZcl@1C zDR+J28GzgzJQr^uw-+uRge#ERvpH+GFC{%JwJCX!=bX1@JZ;3tg3waz@;3}b{v1(CLX+f6@e^&t6NYr%`Ev0^*ztVoB= z;w4_uMiftN47a$CxW5?mo)d;^ez)_bwIwSzuFrXL%mZozn@0qG3?apL9bZB}A9tBo zVI9xf3D|D~!pFQk^*R4mU&1pV&9rA_pyzbiA;87&Ld@t8U#M~0y&K^h|EQZ>SwP%q zd$;_ya;PfYxl1xxsk}VlkDgm3?8z_<}+Ds2V~V7oC-qQW+89V z$Em(x`SC<5T)|x*)g=kBSqM|nuR$CoMT(IdhiieVfpCQ{R6d}dvUBglEZe4RLJ$b4 z`e!R!p;pv87xZ)cH0jfJyjQ#?urA_m}9JEzMDiJsdjTN2NgfkwuVmE6lU z4nmu42?eL_{mPb@9bG6<8N!AM%7NFb0N?x%DKl~l?}kg;9s~%{?-;Q(!eZ0bvKnAh zR(B1cV)xLMv2V0R2z-npyi4mY1%TZ+yR#hwF04kcR7S76gtjK_#JeDBCR)jzM~Fif z&O0|={ONYKM4`7{OiWJ~?c+f4CRPNgxT(qDzq$W_8k%Xl!lshUkaRled^VUE3HT_c zPiL)$Ah$2P=yu@2buhKHrgZGiy%&9Sa9;lIPo>ocDgTe~+@NGkid%Z7+~+hnd|x;* z{+VWVo_f))=2E&v{6KF~JFWYit>eUz;-zkdw44gTWb5uJ`NyjCRST z95-t1x(GXrb$!RJeqXGGmEwL~ewJz_jx}~xAYCGo()UW&>pjab=PDeNmB>H17BfUg zqC*TMuOI8x>d31PfYbY)mz#%ex=GLjPNXSp}ooK%wd8Zc`I6d#S z5JH>1oa#A0n)Aj-r$dkU-UO)O=;rlR^+!yP94xy$KCw_xwAtFgZG%1SYkMxcyR7Fq zGIL32Q|qR*`c0j=V_7G2e-htta3%!$3=yQufB&s9YoRq(NvOolExx$do&04%XXEk( zshGMf@0w1_tVXpJR8H}mgkJkLcc^jn(B5U`?GO8Wva9Z^-<6VyZQI}KE^a-{^Mh1L zVM9dnA310|j%p$GY+s6!oIXGta0GxrgWsr&w` zS9KTTP{x{0D-I{5@CiofFvaZ4|Jcz^ue!vCGtq?+n0{usXCA=;s=s`1V|mJOU@ls+ zq2ksYL{^k(U3VTqT7>FyymkzZd6nUq8824LT6gNr!~pH+_U=*26Q-jTxrK!_N)=OJ8le zmB5I*1gj$*GRf|RB%ZZa-oX9-a+r1QI~ln5yr?LuN=fEBi}YyQ&8nh*eRFhpj&MeSeeYfpJ2G$&s9*bJyP0+4YcE^x&HgOZ0=f5H4mK&9NsHZb<8i*cpJ==a&AbUP+ zYOTc%9Fx;w@wlx6_Tew-zVxP>q_ZJf%jUB|#!h{#@|eo3R~ck3;zv8MW1{;?J3(to zvYhW%q3@2^An5XV%=vbS|G>yNj;Y8NOB{MheCy)+$W=D$a zVOelLBL>z#W_Y`|p#AU}S3s5{`b%@)nANHCqq_4l+eLNdKKefxbQ_+3dr(u>$*Bjt zkPc)AwN>a*#5H93LcPb+{I~k3vi9|y5#EK}rfcE0dd_Lq@T%od2@bpp`*HY{1zB39 zfaFx;zH~B89-3cR%U#l2`9Ob;9U6AYDSS4;oBo4uCy~Sy{2x)T zbYY4AXEt4>>5&R2N$99p>1ootgjFxZ!fgL-?b%zQ(;@GHJ5b2Lqsb^-rI2fOo+Ms#UhIQ 
zz5LOOfItOrcK}0pgG!`+#++Nvqq)dS-W#v)im7wDs!^EIVzo59?jY`>|!G{Zm(v z3j01xOv4xh$uAYBHSfAD3M3LAD%npHNA?G#u>ySu)J> z9_g3L$ljI7R=2EmxiGP0Q{?sUZ>mtAj7QaH6qA3&6N@N^8ei5RiP^#|`b*Ewi3)k~ zE=hlL-y4Gh?@6vt6Pp18W1s1(hItI4A5jHNI?` zl`i7<|8V>^c%9ukw#vGx;r&%iDVv$8ulfhgUCyjouUXJC_t7q``U{S7E&{2#`tT>8 zRHmIayd@;IG4wMfqq zg1VUT$#*BGlr{4G+lym7td%yrMaq@XLdO$`A{xl>OZi)$?3#Zd%ZW;=j&}&oJ{h(I z&rh?@S=CqK$es5wZyt`01W2b4wGt774KtQPE|K#vl{Ch+=KF1X`f>ku1Irg8l0py2 z2Z;9YcTj>Rp$=2VA{(8hh~q~&UzqteDbjv#eGpq7VF&kGOtwF%KY#s2CIoC2f7HIlMStdXW!m{Vr-cT5z52dzv!2);@KMgcxJk9%!b4jx z|M-}oU;WZx5r9())G>Q?_W_!3#tiY7*VH@$)NKJo8;D4$x5Lf$?V|>gnrLqypHNDj z=H>3#;_gcHuhdTB)Bi`)cLuWgynlkO(J4PeV{9T7w-Y<-|Ne*ViUZ-8s5YcuBZ&{UA;===uZh)e?zpsK}=prSfqHWhaa|giCt5cq+*Enwxf+(tyCOFgBIXc{fqkjR zd+|6W+jT2}?+1o#dE0SH=zw0wM%e}AMTBG&m;DI?2G(nP#u8#cb=wz)*tcy!#B*Nh zp~)h#3;SQLMz-SJJ=%bHZuI$Y4iVC-5;p_(zd&SEft8!!`1`c9`$ag&HF=#a{ldTh zgS3lWtv)3m>iD!6S-YWodP#v3I0T!QL5Yoq`%lJ+WxwKmtMoF_0D{N{3826SpjgD} zn2q*)_-engFzeN;tFy?uBoZ$RHxZt0_?mCURe2j0ul|kyKEbwC2=Z{p?*}>l6&i`p6#?95(_J@X;=CGZN zzjhSx0h#}3-P1EOj1-u7%QvQNz9&N(n%+oyk#a7B*HhDb{ZEHW=9JN{{#hgb-fylx zl-YoE4W!SZ#O4QXE}}Mksh~6S+dq>2gcViiA6D&H<7F=a;OfZD>GF7%0gZi4GQRao zXbhixf|(+v_;PM%xA6CctQy{gaJkf)!S$u3LVi<|DoK$<{o{#7s(T5-GGvA9xEkrE zD&PBJG2@o$SasDqE1fLiL8wF+1~UF@HS%Yu%wGBkA>?@9Y5%oGy zrh(-Jv`fn+&J%{E-wMTn-z$HfQ!LD*CuZMh-e+%E)ke_TNQ)J_dzM|x~pX)aFy)1S|_F%K0LJ5FD+GB zM8rJnw{Vb>4WRcHN$!GMvQ_GCAzO&wbH0FV2L^xlMb74Kw}ZpJVuK&eMUWd+QrpUZ zKl-Hi&qJgWs??g#$qD#cb{jPvr)mA(d-6b~QKh{%GklO`fK#En;KGD^{Af4>OzA5F ztIN7b20*XJmDYlQfBY%eCKoj^n_RuqsDOtz^~&$=%YxKjGu=20#bKtG$XuDPJad=$ z!UMjBI%*M;Jl5H@$7Xq!5!WhQrm$z3wtBVGyeZJ-*&b5l}0d4jBoU zKRSCdW4<5rzReKC3AMho%?0t$eW|*w-wV1#!7U4=xDavy*N6RH zV}7k)n2#YUY|O^?=*PM`+Q!p%f0>1!zWPm7S`GJftiOLrWkpg)8R@=M6ecfF=3nJ) z!^W|d6B!LRAwXu=LZM;8d=3#%(BAu@Y=`T9_-D}oPk@$zP4a(5e(0X}VutgKqUZC+ zzeFy6)`G8eO$ux++@{wJujiJo)>n#M#$!G95!S%3%Pdd3ZJ}A$bcKvMRcMy$f+1y) zs49897f3rYnS-pg9kR9cvi@-!8)HCGRMtzEDKrZ)W||Qh`lnaoI{0wfT!!x;@I!xGdc<9{}3Hc=}vN|1zu4f>lry0rec5NwLzo5n_ 
z*|TXN)AM+t7CuuQjv3@XBwDoh!9DwyY(KSqE1a>%H_eZRvAx$zn0v6?+w7CTdwl;~ zT_y&|qc|tmd84>mzvmkiMVK-xRsesK&N~s}dW?{~xvBS|7(f+Z@IfNNMKCszRuf>(h2!v5e^FD0#OJ*$!?eqqSVn$%^+ z2p&LRc}*>|@jDu@wiKD0z5!RgJkHjyP*}jK_SLwv8T8?t@&YG-`V_|m>D#Mn+1KYH zUtzLTsnZr$!*{7Odh$(c#Ix@xU#rm2BI_NL-`#x`hYo=%Yl$3u@?9!)lVE7fQ^7($ z5e$jskoG=!1FHtx64Os*KG?R@-fsV`t zJ3_0vVoKVSPuMuq*DMp|4rA_-xD0zlXT=N5ty||!cU&dYE{IHI+U`^?J4{}W3+$pE zZ92ar_pLj0;o!1XAqiKXf-+xCg}J9ciRhZtnP~Sij*Vhau?2VBfxjVN!G-eU#d`;! z&wJ6gH`VpZK&u2H&=A9TpyBwjx9YeIc)*TV{8w;j7whT5z#OJM+hQ$Jsgq!s2v}7_ zLcZtke|PWo-;K4EQ9D9)ARb=@kz^NM#Co(?;$gRX$uq{Z83%~M<^)$$dc}~1X(oTQC+rF0aQcWdZ7S7Uzh`ra2Veqg}i{x|+b z*lTw{J~aDfp9Z{Fm1(j><*Mbu;$~!3LaPwRi7y|a@y;6UdL^NU(jCAPoJMF;=HI(u zAcgd;I$NVWu%i77TYLO;_~)B=&y3Ox013Vw&ast(pAH@&+y?IHnPo>ZaC?4qhJI=d z&u?k3lvdi!zDHoplmak*&FDZet}p(mWXK%?ql)32df&CrsSF0eKKZ1E{i<1sWlwCe zsvP!LRm^0E?8%rZLJCtmOsg8|+}k@dZ`2)YcDiDt?7SSR3_Nm5ggK%kYqJFQS&p11 zjX!m`t;L46#sr3(W(4Xtr!qUtNGQ0?X}LY6*5eybqI%%R<{hf&fFKF-sKV z)w59;0aZmx?XAd&2Ga4~6UJwL76TP(QX*|ghtv&9l zddX0~z}~;AJ?w%0IBNr3xca3Jo^9q`o`G7t8bO-BR160hj3QzWzoFj5cP%q}9ljwUPw|Y)x)5HpdMw38w)~ZuKkZm)>>l z*c`kOIL57YcV3d2W&o2qN3rB}P6~q1aDCHV=R8!A&f=I4ZKEmtm--DlQrwGvQjb|O zAN3~;D)Mmu_dY-LW_2_drsT8;l1=V;vE)TueiW@3VYI;uVO{jYlY8dt|t+$-HSyeC6Y&|W_?!E~fiCtn4 zKko&-e$z$2n@DPae_V{zmNVd)ciKgHn}Q_H@ubzdMr-b+4*|@cnOH#hjckpQKXX5s zwdt@7W`B0Ilz<}XafToN{M2Oyefj%P!=fC)rYmgqJ#^AuN}wg2K0k^qM(OgzDXt&l zdN-LIjyPS&zUdMxYXDcXE#qv4RnI}i}%Z@TeMN{ED@n%4)A+q%D)|1zi@ z)F1pf#R~-@_xqTe4r0UOw(RhlHug!WFPPl#(!gr8oF>@XF8ih~+=m8$uOo(yGjT3- z4X&9HyWT&>oDk^n-K*U+^9Yx-3v+)QHnLL#k_XS;!{;?{wt?44^ja9k2 zDa9z6%Y%YxI!$=*kzmH$oGmUacz-WOAq;ruDO#kB(QJ+zbVR%rw|oJ#v0TPspJ;l; zW;=WPyH5q90i9#_i=F=}4_N^J#=7hl197#TPjnk}{IuP5wNcKqr?R}_Ksk?sl+Jpf zhAG1}M@p+@DOQZYhkH3fOy<7}xa;yw_)8|b^QpbN>o#&KJy{Hn#GKK3himkUVJC3^ z^ZWWCIw^1#Toa-{8Ipg>HGhs=*h`qqat2u^G983A}aG>K4+^BxEW-&et8-hYzQt} z?P7dQ-z!V zjO}Xm{HgMCiOAR$2Mc^tTcZ7uXuLwSR&7h3HW+Uxw-hkT%0}B zQ&YJT#nRIQa62|$@fu#T3{Jwwe$DlFK%f5m(~2Zz 
zCXfgoVCPl-sPXSDK~t!tRo;E3-K?v6wt${M(`2&33JR|n5sS9N(P4KU%r~RMa@%V&R5R5KxMlE^NpwOq^rIDkGJ*o9$Um6;z?V%-j|qg zi4*>AbGbvWIHIEym^M=uvLExzCdKWKu<5= zR-c!LiToVndZSYPv)8?#`?yYgDg?U>CZ2x)TK5jhS&r6=Khy>RD=BO<8lS!dD&nBG ztvS(QSk|xF0}IIO?$+ra;UK_TxjBB|0x2ar-}k;pX%k#Qgfx`s?;i}sb-Cz z1PNcJjH~RiysFA_fi@LtQk|*KIFC-YV!Akm*W#`mtt)p#;$Dciu@G%~w{(9MtNlD^ zu73;jUH5&YR7%N{bP-!)cJWQKCv{+4dKSTjs`o4_?v>AF-W zmskBGvC#|AY}g&aP#i?-?S$nL=}XLw@pp2+UX!XCOsBEA&#dUO_j^_FwkjC(qA9+j zDUzMio}&l(3%6R^>MoDLE)S6_FVFxNlfmx(V9g5PFz$gFKx ztz7PxF}RP2)PA)rY;7j|cGf^bt7vOa1_Wf7@A6k*N0CE$EGIM z??SirVi%fjX1>IqmQh%IlAW3_h7z0@(Sk+H02YQ@dQ4Fgb)(AsjYgrZsAIL+w73L| zEZD&iLvQciU<-4FmK8?5L@^b*+BDYisy8k^od4ZdZ351|Uxx3tJ#_nL?O;uld;7qz zRLi{B?;z{#$LpF| zO#@p+bkc*^iUgUwL&4F1%wc3Ow*Y7IUe;krWdkzm7|HF0zQE%)y3k0q!wmIoy;-S1 zo!$!|u-i*7?5j-dv&2e#R4_J3l!i+zPTj4=F8hC?x|oE?i7&ORV9hq?uMD$GZOJZ; zdx%&&b-(E402oThE$4FzOt4mcYxWQ@U&~#0?n!(=jN*X7EoocGwZ8{Dn3D@9gBUW5 zuEAJY3STE-PDM{nP@q(%?d{}MI%f^eO+^5y_ozwr6wX)ge0`HL$-npVg;Z&$-oppFfG_9B>gq3}(jGgf)v^Z(S;^lA z6VN$y= zaTvSPbUdHh6mYoXUEJ;<<4BB<$`SGOC+Af64E|h%w7;=&Uv?DKyCpH7{@K8^x>dP? 
z;Fo}mG9(f}7QF^z#qwwCw3;X$br!Q;%oF)yb=aIj;6Yd(OP)YOXsyDH3)K~iC0=u_bhWk)c+GA`&sjVp;O&giT1^<-qg0Fm3xTKphd3HJ| zK@AA_E%=u&sI>dkxjx!wh~Wxaz9H~^rd8MIJBc!!uL!1iFCxPl8isRWAL>TkGp*nI z?&k!V-!0Cs^%*!PUC2s*o$%!Is+*1ERnu-&C^F?lG5POw)#6iqt&?hxM*=gexMNe1 zw|bXxjIW6Gr%%< zEv6#riDzu@+9K9X{>{1|sOSl;@0Ia_iagEUyW{pPQG=HZb)7rOFc&1=Dc3%?$LVjO zlqqvU{(9CkuXM7&9^=XLKda?fp+1D*g1K8`0&Px$#-c)`<4s@^9XmbG!1N#qNsXG> zn8Y=+(#|LV$@;9I{$tPToij7Ey> zZPudmifWKx zTp^zzZ=by9z2x&`5h*wrB6nBORo74C-Fzxoa%Kb5lbNchSGmB-w#h@cg}q?;0czn3 zs(aTmgoAZMFx;nTRazw7=WcjU@X&pOeam>6y<2x0Wkl$Cb)dYEI{|+a=)UFByz;=5 zG)*dfAct^%eMG9eH@GVN=>g~>UdARJA_i*}|9$Pna@C_Ux-xLRaP0$C7S%BOCrBEN z=7s31Akt2q4RmX*x?KT3b4VtFZLMmWsjLo}7?Cy{boNKM^e z1RtnQWdv^m5XN`H#oFkVcH9<6)(tlT@&irrP08cmDT!R@x*NGvDP#jx_Q^n-Q-c%G zc-vv%z<1hOgE*;MzcRp)Z2Xt%qD#);gOmeJ4A%&ZFn$xju(MgJv#Kgx&ZlNesa?|U ziH`7nMi++Ab^8Uj6y1Bp9HQMJwxH5Mq2o`2e4=+c5{~Ok;AxacKG5X5xfkQB z^kguY@!nI$xjo@*#HTH+)vwnhRQF-k-f8ssrgFh)I$XYRPi7pB_wW4j{8igm-#$iI zIi%aF#)__rd*YHAjiEb4(mliVpR6(K7Z|;J$}=|?x$Nxuf%P|vRc1V?%kd(mv1BY zT;U%#YLF4MY5*~D{kc9HsCCEQDYaEO8dfQP`^@Bo?~}r|B8oTRqr$MiU70lI(YwKr zPC$izR^sFcEZL<$gFk28iqGY9euEmVlhzA3S+6fPtvW zAUh0plg+m`D*2D_7wQEG%C}$2G}VX#lUNANI<%FB1ozva7$xlu&HU06-FXeF*Uqf< zys$e|qqUKuL^3&T{z9uIcHz#QH#hd(_15;3tz<ocrNd{dx@-nI;5TA|Qt z>dPgBG18cdHY2BRs;%PGp_s5i#<%wu3a!DCncnGjmmTsV)N-e#RDB{_&Z}0Va#lP5 zJ@3il?=oK#4)IgFX{%)t!7K1%JgeXC?tT}iDCJH+Wj~ga_P#;m=!K8zBPB6mx)t3A ztBWln`+u)O0FPa@s~9y}bQbs`7aJm@3B0W#IfEq%)z`t7u61UH#Q=8&cz`GA5!FR@ zfF3S`PwcxcjVd=7IC(>Rv!k18=9z*u7g6WrAOelp!Y$`EBD80g)k7LhRFtKa;AA5A zW+4F|* zBXU~AzA_*(|8^1ga)S$;eD95)+fAuIrJfz!{KJM510uzQ zln^RIYwA3(eedP1Rh4uj$! 
zL)wtr-AY~Nt5sAdN<0z#T+3OUBAM7@2G3moB9=PS_46141NW%`!P_2XG7>!CN=hHj zYxwxbai)S#sY~rw0b``LO1|HKYQgj#%kWRsTXN0t4f3=nHFJ|v`k}kPK7i9m1o%EW zU>Z-fOs*I-a3X|!Xk+Ha3E@khZ$0uS2Z?NjbaVnxwZ~pD$G8_*RCj%eOdSfzsXE2a z9Fpq|=$=02R1?^!UUddVg#-fcPjUm@s}U)tT9Gr8#b~{wdNaycJNBBwssT6E;O|lJ zE+UR7H4d-)fxsT@Y3-&oxDwU=!mmseP zFGC;)i0&3pAKY1N=ZFg8|A`lhx<)PSa9`?}+y+bszfrv>#aC)wtfC-wUb?!S?pCY+ zi*}pqe2`fVhh%D?FW+)d{!&n%9-@=M!0RPI0(iaxG?b{(#xGj7XlyfG88JHxXf0at zXWO^VY6J$4y{`{kG#C`oqKJP?Hb&RPYFX=z^dBAF+Uh9uDUfYL_KYX@D&_e+c@{1Q zn13G$b~x{mIvrG}5XpOs*}arh#(>x=T)0V!84(zC1IsWikZHEW(JiVmA72C)xR8Y* z%)fsA+vD`poEJ)9r;O7F&nmQZUC<-i(*!nE>34j?xQllNENkF~ed6v7S+`#TlLmQq ztfx!m&4)V`59`b{4#y3tRaY>@HJFDzHTr`xIV605=+OPfbH~P5jswUc&e_{>-pce z+oG`$y79^AlSC!F^M~2oyj24kTJ0KwJ;`D(rQd5alF^AI)Rfeqq>%8;#w2J>CDH4z zmFeK~-uOig!Y_&z!teTW{8pOJ-|^FO5QB6cE87RBy>3u1gY~mx zwWOJ#zg}t@rKoFOfSz7TEuP3HK=NQe(yKXo&QqH21BmeR!5sj-0ihdrQB#>*2Ord9m-Ug^g{On%#81)q~SW;d4X3~9W$*g+;>0t{5~%Fvss|| zDkH%lJn=3jq+RwX9`BDn{;qvKpwsVLRfz4y+z0}1H_7iCXO=Bx`s>rdfcNMgZ}^>~ zArTw%URrvzHiN0lQ@2=_HOg@9o(-IV^w^GZ`|fUx*--mZ?8j_)aLqtcuXd*PjrT93 zn>3~r_UKT{+%U&pBkt{tah0s93lI7&$6oyTmko58VX3UuQPdIRhVQ#yI}!8;hxu<* zp~3E#KT~hI2o}0oaRQhmX3WI)*iUyww?)423#235yHASmGg5)uttkeQ+ry!F@Ao(&WW2eh01k;IxQ?}hTAS8(jbiJDXYsv z(OCnN;r4)5$Sw34b6;WnT2Ct>$QDF0*EGC1bz%nawv^}03KSRDX z{?=vg{*C3ursXOhJ7=i2#Xcr{@zN>$e$T0S_MS<=Lzt)yAQ8@ZV#VbiNBwPUP>39- zA^RKs?WDf>7doO;)0+TW@k><`^+OW@w0ZoND%~uNtxXm=uy1=Y_LmUWS1xP8=F7D# zaQQuaB(yNic|rbbMmUup6_DE^e>_Y5MF5;Cm`P6 zojVz34lod5D4%BVz-8#VI?M><&*vYMPCc6uSw7C%TW}5Vp0d~h*@2fx4BHD!{BKpu zB|mgL@1hpszIeFx*ehcqy1s;beJw^B-^rlF3VTr5RLb;a)C#yVWpIbOW?0{kho{0H zWR_o3tKdDjSG@=Up3%G873^`{f1~uayD52}J2H7hNd`+EydQmfkG#4yAgX@(iXJ=u zWe{Fm_8+yGtRm_<4r6hl?5h{|K8Dl>w1ndxfM4L=SUau0yGwL#_kWah zK7fNCw$@O`>gM#<=rwFOa$SDuYrM_fmQe)0hq-_0d8U+s&UL_veJNJ@kaE_GQA7=! 
z=lp^uYW@0BF7gkB#b~~uB@-@CPzKLIn=Fe>b*xC#m5{oa7Pz9(^&kF&3{NoPQdtir ztf60qc}}kz^9TENdL~!g!?EC)WNnjdrLtad99k|{E4U*DamW^MsNt!j!T#p1V+8K> z9<}~xkF<04aLyWFNmQ2;-v|&c-M-js!~TloByU?2B8|q*oafmwwvm1>$Tx<&;v}(3 znB3W`YRI~&Gwn8kpjoNDvVFcBY5I|cFH6~0aqC!ERB=r5$NazJoMeCGDW{3J;W?B& zS1cjd>-z%RKc!2^@&%R(~_#^Qj(!UBL2*|+kAJr_|WwY z;IVT<^(L5uPcc`Kat0F_@)@Kv(5B(1f#|EZ)oZ1nKQQnv?k2_`)hZ@wH{GH!`JPbY zzrr=2*Xr^I?msfQR*KV=Q=2~nN}jS{s5+*;`7zb(CH8RrzcWqe&pjibU>_vO@AVkY zxq_ipYZ_(XuN@?#5q7&<^D_tU5l%YJa27G!?aZ(#%Hx3Hljz;-I+QJN_nLfh^uV(X z$G60@Z+6ev2NXBA@06m0j*IegKxg&z%orM1KVEQ!`hRua|3wBQTX+rY4Y`cp@920(cory6qala zZ%}J-Z#nHDQTRSzme}^YEIgsaC=Dyrm3% zFpI%#|H+U>UXJ^4#>{ul7t&MlCeP5fe; z)A@+o_GxCMyY5o}v3$Cvcja*-Ib_8l;&hNvcG7sfZL`4mq@1qdu_s^NeL*K6C1F=j z{CbN)`;Eu1aZvJ-{|m?R%TO1qD03O7Av}I0(`vub0Pz9;!~amd%EYH+QBnpZ*DR8W zZKp8g%L0GmQMaX$A$RbOsD5g`M55;W(Y^<5S@YDuFPqsJ@VM^QSLK)K#gOtW04${N z-%D6Unq$d~i}xge!1&)K zFhW%ZdpzDL-|H~jkbCq(^Xa9`cP++Dcla~zaYQke9!GQ^t6?7&RkGN+o;+@orFjzY zq11>`M3T$bz0$KR%XO)=5|I350vq}6g3O7%m$cuk9kcxgg-p4?hfS!(2WJR^`c^2{ z3#YMIHtPlC{z)L3p?_|bl@EAJgX(}C5Lu!hi}nqj2;gOfSOf}O`ZV@(SY9I@K1kpu zJSI@eh`cJl#t$oCv>R9vcZ+P~I;`5MT;+!Hk)G&3;0}`k(Y;%sKapFj+E@K}i>*O+ z-pa$-vXQQE5Jf7b&(W=L2GFtrnE~R(l_Qe*wCclaTTt4{HJ9_bF3XCh*OGPt@5SJC zOUqo~a4>e^%PHI3J0&$+s4KQ$9hp?WqcOk*ZaNiPo_L4I-8TN6b-@HGXf;zxb%}vb z3|7n`UlE*F%-2wnCkAU3Ggu(~jG?BGr9zVN9$#25Aiq}S27Bu1d#9e()udv;4oz4ymwEQNb620>B&5}gHREn=G=LaY)FwKolcQlxiDp<$2OFim1Bs`G+J&D zpfGW>$G40ypt$Y8PaG=-2dVk*Xm^vGH3nykd<)HeQ=bDzXJ?C!D7BQt{8yx6XoZ73 zM8@_6Oh05V-Qsb^(W#<}Ov=VnL7BLx!4^-RsIxG41rORLrnPICj$gccr{ODm#EG?; z>AsK%{;QfM`*kyq%bhG{1N2eCCK5C_~gRX+m|ojy?r}{?0w{IkcExj zRkAD?D)QPIkx`M!s;@AE8f45Itd3xcjb8!>)B4gP<&GOqC%##1M1vCZFh~ zi1YGLjb0tpugcy{%+~NjT`G?rE>FEgC<+~_rv0$60-8=Ct?F-6`zd0hj;B(Jl4+}bwVy*_Rz*itx{< zM{;#=L*GBs68&r+Kq7V-x0lfK*z#KFj;KyTy;2YkQ8{Gqn& zYB6#d8pT%IqJr(JO;Rs^V!yMNepYy%=5#y-G`IgUBT>g$dwUwlJk8kV!L03@ct>Kj z*m!pHQCR{AG)xxiaE)rIDt2j~RIZJ2DLV9RnP*%_&& zdf0|CHC7Df6D21u?Tv#JSw2U;?2p&(s;bT&owRe1Z${yQE%pvBRc@|Mj*fPrf()S+ zAY0N+O_lh`gQ6tbd-UWP9jh&UXJ*VF 
z1$dg0ze`wiREr$N@|UA^)APkoP3<N*GdNZ(m-@8 z-nq7Bg5P%g^zbTQ07TS;4hO;oK3u&}jR3uDyv(iK%y~_}^eFl0{utG#YVnw7s-j{o zYZH_E+Je;Mw6Skd(<7c4Ce`yA{a8Tls#OKFXLa&SL`%qpuaxJJ%c3^-j*5E45g|V^ zm{o7?d>v-*+b4Z;RAC#4%H2Rr5+5%>4O^aA?VQ(GZv0Q+gBTSbnhZZxL7aAzOM0c^ z+s>J1W5|=!3~cvU#reYN-cFz^)5tsQkTG;Osq~m(vSv zNS2Z;d8B`DgviYqhZ~zq!+Wn3`EM>0%|`vim2_VscI-6)T2BEM_0^&!H$In4NZ1#{ z;XR4{JxjD%2aYg_ykkm|?ZzJCA2jC&L~VpcMcdh?$>>UQ^t}%keTMq#_6jCUYv-xg z{;RLi-5{_XToWcAb;+wSC1dWNfJCeEFxg8AVB|+6@~wXS=y!TfQB0KIE=Amw!&;tfi5!v5HYlajON$h0^doeM zFa~f!;$qwL<=NYlY&l)+HJnMxB@$Ti)VcuC;tSM`aGlAlaSn7vo9CF;Bsts}Av^Mi z54fLZpJ71IXEX;#Rzc1#O@qrMd=W?Tebw_*Qk>7iw9vR+AcTth3+EV6u{!$rn@Ty? z>K66FHYbjKE~s;IQ#&XmXQ;ZjLe**e#JXg*0dK!CvO}}h1BQ_6AgIkCb>zun^q6~c znd;_V6^6!-3n(E49PUA8^T|TDth98hwC|ayxx){uV5)Zadaw_<89VPk=P2H$@;H0^ z5t540@tFR8Xt+|?Z-th+88UcE-K^DpbQ5EJ6m7RbU~aSPpVN3UD5SDwX%#&TrrXCa zN?fcn136aN!pPbbzDto~#87t_F1WLI^GNJPo4M@Cb?sOFA%};XhcA~-sK&M%)QTZu z`x8_GguWHlj=@^V!r-=4P*FY@mcEBzYejk>REswV{JYu{aBt54^4p>p$%tG(@x)!Z z*EITxmHwi!X8Qwnox2q!rpUf=;&;IPF(aY7;-3tQq6#OK`TDo7vljB&L`6vfJ9qa$ zfp`s_Cub3)eR$?s)OPWW$lE^QT9%&VB_B2Uib zlbdC34*VVQSQXXH(%uQOs8#Lu;`7KUM0@a%*)Ky|@O~!IfZ=UMmXgnNm_3NQ7M{Cc zz6t|31jk%ow~C5^_;t2HyY}%H5P`wV7rA5Dt>187%aS)Q-AXRH1iKwo;rxfnpJb>^ zRIjnC&5}5aLGT)msk|>ed|rccQWiENKLs+RF5cUnsJbCKaNK{T-aU^kq?_S+X2(60 z%?xWBK4+X=xLH{&@Ab>1ri~lfsVp4PGrYglw0)b$4ljvBtdPYDm)Kn%KzaKijZTpd z`#I#8{0meO9{K3Ka5CvqHRl_zpgt!1m;w73VE=9RR8<|#|BCpo-o1K0H1+l9_;_~t z4GCf@y+VVWA9Z}8sj2U?*wy>K z#saxIpNj`|=v@gavbuWUeLHI)2d;jei(4@J$B*%8`QkQGf(=^jL44Q4CP2!g&Fi=A z&b|yWYPIC(e`~0AJGA!=tCO>n1zS$dXhL)6XLGIr#T-+%TYl-LZujV&OGpO^kB*Mr zp3>yK^X5VBSoXG&Ay;OD^Gb{JMYuKNw>2%Ml<;QC@ik>j|OD1n<=Oij=_Bm=7XdGUPlk~8C z6mlV9IH~gR-w(*mF;9;S=}B;Y(x(tmH2+LDgiEM4r>;C1Kq^yMD*`@BQolAy7e1JY zvL_)+y$i-<->ZA6kto+sj9^dfPb3>%l>}>Ra?9I1?(TlOZe;T_$#nsOsZi0$%=+Y{5eHAm&`DF?EV2vcGln1 z=*@E!eAi(3>*WaK5~}LRm%lPDN)L+ISh)pH?tVxRA&C?b6aV=i|1PTB zcushrm#&~#)8lLg(8}OCJUaSbLH?KW9$Jq=<{$25!yKL5w3qYN`XlWAd+O0^+pYZH zwpB@Qlk!B*qlyX^B1OReq&a2)`$uleaa3OJae&X|+qtQ~A4)K_lDOy@Db-jtrPzg% 
z7q8gnT-S2@N}mvjpv<3ejR3HQYfhcyYmDm#W9>l+z>s+0a}%GPLQl9}kwm7k(QLor zcnHH9!@XuQ6Tf_XK0V;5D(}J%=Tg(Ap(@wMep(pxWX@~!tzG)JTG)MN8XFtWZS-^c z3Keubjqybo`5w*mtx5lDPm122?Dxie%Z)Qe16RDS7m*S+tM2Vbo-eOqJBF@WNvxAb zHUF+c=cpNlDStk`^O`@a{-0t4S%QO0k6meG=}+|;4^xcIUBL(6Gn6#{V^)gS&$S^BZmU7r$BOd)OA4s{*m*Rw{`MK2$S!w zw!W*1&)4hc{^RB9x9*QmDd!76cRD#|CuK%1r)xI(zn*7T zcW?F2{KPl?!)*KE_Dn0!5W6h@TiB*@0EV6q z>OBAPI=N=98Es9RVKl`|!LIxK3Z+lq*T{c2OMEUmF_Ay-ftKZcE3w=$U!T@5)0gPW zQ1`P(A(#Fr+53MO=+wR@L7M+rW)q}hNi%# z%jvf@|MOUF-$kC85=|A9Gu`HDrOrLHzeRYS4ZRo+KWll(QW7Hccb!ABA}SyFT%qddM(LNBku`Me^f| zVn8D_DueX-x)?pkI|;jDbw$^mQ*@5*B%zH${=5D9$Kb2av(Il9y?7MD;%UWit84l^ zf?45NzJCli*Bb}b+nNJ4is9b|>?I}t7`qrhRk97Il7m>p=?^Txv+*i@W^#JY#U}lK zEcpLiMdVRB1<&Z_u~v`TNjb