fix(reference): improve line attribution accuracy and add Claude Code model extraction

muraalee · muraalee · commit 7962a282a611 · 2026-02-02T18:11:26.000-08:00
The reference implementation had two issues affecting trace accuracy:

1. Line Attribution: When processing edits with context lines (common in
   Claude Code's Edit tool), the entire new_string was attributed to AI,
   including unchanged surrounding lines. This produced inflated attribution
   ranges.

2. Model Identification: Claude Code does not include the model identifier
   in hook payloads. Traces were created with missing model_id, making it
   impossible to distinguish which model produced the code.

Changes:
- Add diffToFindChangedLines() to compute actual changed lines by comparing
  old_string and new_string, excluding context lines from attribution
- Add extractModelFromTranscript() to parse Claude Code's JSONL transcript
  files and extract the model identifier from message entries
- Add resolveModel() helper to transparently handle model resolution for
  both Cursor (direct payload) and Claude Code (transcript extraction)
- Update PostToolUse, SessionStart, and SessionEnd handlers to use the
  new model resolution logic
diff --git a/reference/trace-hook.ts b/reference/trace-hook.ts
@@ -1,11 +1,23 @@
 #!/usr/bin/env bun
 
+/**
+ * Agent Trace Hook Handler
+ *
+ * This script processes hook events from AI coding tools (Cursor, Claude Code)
+ * and generates trace records for attribution tracking. It reads JSON input
+ * from stdin and dispatches to the appropriate handler based on hook_event_name.
+ *
+ * Supported tools:
+ * - Cursor: afterFileEdit, afterTabFileEdit, afterShellExecution, sessionStart, sessionEnd
+ * - Claude Code: PostToolUse, SessionStart, SessionEnd
+ */
+
 import {
   createTrace,
   appendTrace,
   computeRangePositions,
   tryReadFile,
-  type ContributorType,
+  extractModelFromTranscript,
   type FileEdit,
 } from "./trace-store";
 
@@ -32,6 +44,25 @@ interface HookInput {
   cwd?: string;
 }
 
+/**
+ * Determines which model produced the activity described by a hook payload.
+ *
+ * Resolution order:
+ * 1. `input.model`, when the payload carries one (Cursor sends it directly).
+ * 2. Otherwise, the model recorded in the transcript at `input.transcript_path`
+ *    (Claude Code leaves the model out of its hook payloads).
+ *
+ * @param input - Parsed hook payload
+ * @returns The model identifier, or `undefined` when neither source yields one
+ */
+function resolveModel(input: HookInput): string | undefined {
+  const { model, transcript_path: transcriptPath } = input;
+  if (model) return model;
+  return transcriptPath ? extractModelFromTranscript(transcriptPath) : undefined;
+}
+
 const handlers: Record<string, (input: HookInput) => void> = {
   afterFileEdit: (input) => {
     const rangePositions = computeRangePositions(input.edits ?? [], tryReadFile(input.file_path!));
@@ -108,7 +139,7 @@ const handlers: Record<string, (input: HookInput) => void> = {
       : undefined;
 
     appendTrace(createTrace("ai", file, {
-      model: input.model,
+      model: resolveModel(input),
       rangePositions,
       transcript: input.transcript_path,
       metadata: {
@@ -122,14 +153,14 @@ const handlers: Record<string, (input: HookInput) => void> = {
 
+  // Claude Code SessionStart hook: appends an "ai" trace with event "session_start",
+  // the session id and the payload's `source`. ".sessions" is passed where the
+  // file-edit handlers pass a file path. The model comes from resolveModel(), so it
+  // may be read from the transcript when the payload has no `model`.
   SessionStart: (input) => {
     appendTrace(createTrace("ai", ".sessions", {
-      model: input.model,
+      model: resolveModel(input),
       metadata: { event: "session_start", session_id: input.session_id, source: input.source },
     }));
   },
 
+  // Claude Code SessionEnd hook: appends an "ai" trace with event "session_end",
+  // the session id and the payload's `reason`. ".sessions" is passed where the
+  // file-edit handlers pass a file path. The model comes from resolveModel(), so it
+  // may be read from the transcript when the payload has no `model`.
   SessionEnd: (input) => {
     appendTrace(createTrace("ai", ".sessions", {
-      model: input.model,
+      model: resolveModel(input),
       metadata: { event: "session_end", session_id: input.session_id, reason: input.reason },
     }));
   },
diff --git a/reference/trace-store.ts b/reference/trace-store.ts
@@ -1,5 +1,5 @@
 import { execFileSync } from "child_process";
-import { existsSync, mkdirSync, appendFileSync, readFileSync } from "fs";
+import { existsSync, mkdirSync, appendFileSync, readFileSync, openSync, fstatSync, readSync, closeSync } from "fs";
 import { join, relative } from "path";
 
 export interface Range {
@@ -94,30 +94,179 @@ export function normalizeModelId(model?: string): string | undefined {
   return model;
 }
 
+/**
+ * Extracts the model identifier from a Claude Code transcript file.
+ *
+ * Claude Code stores conversation transcripts as JSONL files; entries that carry
+ * a model record it at `entry.message.model`. Lines are scanned from the end of
+ * the file so the most recent model wins, which covers sessions where the model
+ * changed part-way through.
+ *
+ * Only the tail of the file is read. The window starts at 8 KiB and doubles (up
+ * to 1 MiB) while no model is found, because a single entry holding a large
+ * payload can be longer than the initial window and would otherwise hide every
+ * earlier entry.
+ *
+ * This function never throws: unreadable files and malformed lines yield
+ * `undefined`, and the file descriptor is always released.
+ *
+ * @param transcriptPath - Absolute path to the Claude Code transcript JSONL file
+ * @returns The model identifier (e.g., "claude-opus-4-5-20251101") or undefined if not found
+ *
+ * @example
+ * ```typescript
+ * const model = extractModelFromTranscript("/path/to/transcript.jsonl");
+ * // Returns: "claude-opus-4-5-20251101"
+ * ```
+ */
+export function extractModelFromTranscript(transcriptPath: string): string | undefined {
+  const initialTailBytes = 8 * 1024;
+  const maxTailBytes = 1024 * 1024;
+
+  let fd: number | undefined;
+  try {
+    fd = openSync(transcriptPath, "r");
+    const { size } = fstatSync(fd);
+    const largestWindow = Math.min(size, maxTailBytes);
+    let windowBytes = Math.min(size, initialTailBytes);
+
+    for (;;) {
+      const buffer = Buffer.alloc(windowBytes);
+      const bytesRead = readSync(fd, buffer, 0, windowBytes, size - windowBytes);
+      const lines = buffer.toString("utf-8", 0, bytesRead).split("\n");
+
+      // Iterate from the end to get the most recent model
+      for (let i = lines.length - 1; i >= 0; i--) {
+        const line = lines[i].trim();
+        if (!line) continue;
+
+        try {
+          const entry: unknown = JSON.parse(line);
+          const model = (entry as { message?: { model?: unknown } } | null)?.message?.model;
+          // Only a non-empty string satisfies the declared return type
+          if (typeof model === "string" && model) {
+            return model;
+          }
+        } catch {
+          // Skip malformed/partial JSON lines (the first line of the window may be truncated)
+        }
+      }
+
+      if (windowBytes >= largestWindow) {
+        // The whole file (or the maximum tail) has been scanned without a hit
+        return undefined;
+      }
+      windowBytes = Math.min(largestWindow, windowBytes * 2);
+    }
+  } catch {
+    // File doesn't exist or isn't readable
+    return undefined;
+  } finally {
+    // Release the descriptor on every path, including a throw from fstatSync/readSync
+    if (fd !== undefined) {
+      try {
+        closeSync(fd);
+      } catch {
+        // Nothing useful can be done if close fails; keep the never-throws contract
+      }
+    }
+  }
+}
+
+/**
+ * A span of lines in a file, as returned by computeRangePositions.
+ *
+ * Ranges that computeRangePositions derives from file content are 1-based and
+ * inclusive at both ends. Ranges copied from an edit's explicit `range` are
+ * passed through unchanged.
+ * NOTE(review): presumably those use the same numbering; confirm per tool.
+ */
 export interface RangePosition {
   start_line: number;
   end_line: number;
 }
 
+/**
+ * Computes which lines in `newStr` are actually new or modified compared to `oldStr`.
+ *
+ * Both strings are split on "\n" and aligned with a longest-common-subsequence
+ * (LCS) diff:
+ * - Context lines: lines of `newStr` that are part of the LCS (not attributed)
+ * - Changed lines: every other line of `newStr` (attributed to AI)
+ *
+ * This is necessary because some tools (like Claude Code's Edit tool) include
+ * surrounding context lines in both `old_string` and `new_string`. Without this
+ * diff, we would incorrectly attribute unchanged context lines to the AI.
+ *
+ * An LCS alignment is used rather than a greedy forward scan because a greedy
+ * scan can jump to a distant occurrence of a common line (a blank line, a lone
+ * "}") and then report the unchanged lines it skipped over as changed.
+ *
+ * Cost is O(rows * cols) time and memory over the lines left after trimming the
+ * common prefix and suffix.
+ *
+ * @param oldStr - The original string before the edit
+ * @param newStr - The new string after the edit
+ * @returns Ascending 0-indexed line offsets within `newStr` that are new or modified
+ *
+ * @example
+ * ```typescript
+ * // old: "line1\nline2\nline3"
+ * // new: "line1\nNEW LINE\nline3"
+ * diffToFindChangedLines(old, new); // Returns [1] - only the middle line changed
+ * ```
+ */
+function diffToFindChangedLines(oldStr: string, newStr: string): number[] {
+  const oldLines = oldStr.split("\n");
+  const newLines = newStr.split("\n");
+
+  // Trim the shared prefix: these lines are context and shrink the LCS table.
+  const shortest = Math.min(oldLines.length, newLines.length);
+  let prefix = 0;
+  while (prefix < shortest && oldLines[prefix] === newLines[prefix]) {
+    prefix++;
+  }
+
+  // Trim the shared suffix, never overlapping the prefix already consumed.
+  let oldEnd = oldLines.length;
+  let newEnd = newLines.length;
+  while (oldEnd > prefix && newEnd > prefix && oldLines[oldEnd - 1] === newLines[newEnd - 1]) {
+    oldEnd--;
+    newEnd--;
+  }
+
+  const rows = oldEnd - prefix;
+  const cols = newEnd - prefix;
+  const stride = cols + 1;
+
+  // lcs[i * stride + j] = LCS length of oldLines[prefix + i ..] and newLines[prefix + j ..]
+  // within the trimmed middle. The extra row/column stay 0 as the base case.
+  const lcs = new Uint32Array((rows + 1) * stride);
+  for (let i = rows - 1; i >= 0; i--) {
+    for (let j = cols - 1; j >= 0; j--) {
+      lcs[i * stride + j] =
+        oldLines[prefix + i] === newLines[prefix + j]
+          ? lcs[(i + 1) * stride + j + 1] + 1
+          : Math.max(lcs[(i + 1) * stride + j], lcs[i * stride + j + 1]);
+    }
+  }
+
+  // Walk the table forward; lines of newStr that are not matched are the changes.
+  const changedOffsets: number[] = [];
+  let i = 0;
+  let j = 0;
+  while (i < rows && j < cols) {
+    if (oldLines[prefix + i] === newLines[prefix + j]) {
+      // Matching line - this is context, not a change
+      i++;
+      j++;
+    } else if (lcs[(i + 1) * stride + j] >= lcs[i * stride + j + 1]) {
+      // Dropping the old line loses nothing: it was deleted by the edit
+      i++;
+    } else {
+      // Line is genuinely new or modified - attribute to AI
+      changedOffsets.push(prefix + j);
+      j++;
+    }
+  }
+
+  // Whatever remains in the new middle section has no counterpart in oldStr.
+  for (; j < cols; j++) {
+    changedOffsets.push(prefix + j);
+  }
+
+  return changedOffsets;
+}
+
+/**
+ * Maps a list of edits to the line ranges that should be attributed.
+ *
+ * Edits whose `new_string` is empty or missing are ignored. Each remaining edit
+ * yields zero or more ranges:
+ * 1. An edit with `range` yields that range as-is.
+ * 2. An edit with `old_string`, when `fileContent` is available and contains
+ *    `new_string`, yields only the lines reported by diffToFindChangedLines,
+ *    with consecutive lines merged into one range (no ranges if nothing changed).
+ * 3. Otherwise the whole `new_string` is attributed: at its position in
+ *    `fileContent` when found there, else as lines 1..lineCount.
+ *
+ * @param edits - Edits reported by the tool
+ * @param fileContent - Content of the edited file, when available
+ *   (NOTE(review): presumably read after the edit was applied, since `new_string`
+ *   is searched for in it; confirm against callers)
+ * @returns Line ranges, 1-based and inclusive for the content-derived cases
+ */
 export function computeRangePositions(edits: FileEdit[], fileContent?: string): RangePosition[] {
   return edits
     .filter((e) => e.new_string)
-    .map((edit) => {
+    .flatMap((edit) => {
+      // Case 1: Has explicit range from tool → use it
       if (edit.range) {
-        return {
+        return [{
           start_line: edit.range.start_line_number,
           end_line: edit.range.end_line_number,
-        };
+        }];
+      }
+
+      // Case 2: Has both old_string and new_string → diff them to find actual changes
+      if (edit.old_string && edit.new_string && fileContent) {
+        // NOTE(review): indexOf returns the first occurrence, so an edit whose
+        // new_string also appears earlier in the file is located at that earlier spot.
+        const idx = fileContent.indexOf(edit.new_string);
+        if (idx !== -1) {
+          // Number of lines up to and including the one where new_string starts (1-based)
+          const startLine = fileContent.substring(0, idx).split("\n").length;
+          const changedOffsets = diffToFindChangedLines(edit.old_string, edit.new_string);
+
+          // Nothing but context lines: this edit contributes no ranges
+          if (changedOffsets.length === 0) {
+            return [];
+          }
+
+          // Convert offsets to line ranges, merging adjacent lines
+          const ranges: RangePosition[] = [];
+          let rangeStart = changedOffsets[0];
+          let rangeEnd = changedOffsets[0];
+
+          for (let i = 1; i < changedOffsets.length; i++) {
+            if (changedOffsets[i] === rangeEnd + 1) {
+              // Consecutive offset: extend the current run
+              rangeEnd = changedOffsets[i];
+            } else {
+              // Gap: close the current run and start a new one
+              ranges.push({
+                start_line: startLine + rangeStart,
+                end_line: startLine + rangeEnd,
+              });
+              rangeStart = changedOffsets[i];
+              rangeEnd = changedOffsets[i];
+            }
+          }
+
+          // Close the final run
+          ranges.push({
+            start_line: startLine + rangeStart,
+            end_line: startLine + rangeEnd,
+          });
+
+          return ranges;
+        }
       }
+
+      // Case 3: Fallback - attribute entire new_string (original behavior)
       const lineCount = edit.new_string.split("\n").length;
       if (fileContent) {
         const idx = fileContent.indexOf(edit.new_string);
         if (idx !== -1) {
           const startLine = fileContent.substring(0, idx).split("\n").length;
-          return { start_line: startLine, end_line: startLine + lineCount - 1 };
+          return [{ start_line: startLine, end_line: startLine + lineCount - 1 }];
         }
       }
-      return { start_line: 1, end_line: lineCount };
+      // new_string could not be located in the file (or no content was given):
+      // report it as if it started at line 1
+      return [{ start_line: 1, end_line: lineCount }];
     });
 }