
Commit f91133b

Andrei Bratu committed: QA pass over eval run
1 parent 8de2fe9

20 files changed, 1306 additions and 310 deletions

.fernignore (3 additions, 1 deletion)

@@ -16,9 +16,11 @@ tests
 
 .github/workflows/ci.yml
 
-# Prettier
+# Config files
 
 .prettierrc.yml
+babel.config.js
+jest.config.js
 
 # Package Scripts
 
babel.config.js (new file, 12 additions)

@@ -0,0 +1,12 @@
+module.exports = {
+    presets: [
+        [
+            "@babel/preset-env",
+            { targets: { node: "current" } }
+        ],
+        "@babel/preset-typescript"
+    ],
+    plugins: [
+        "@babel/plugin-transform-modules-commonjs"
+    ]
+};
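
For context: ts-jest alone does not transpile ESM-only dependencies such as p-map, which is why Babel is wired in to rewrite their import/export syntax to CommonJS when Jest runs. A minimal smoke test that would trip over this without the config above (file name and assertion are illustrative, not part of the commit):

// __tests__/p-map.smoke.test.ts (hypothetical)
import pMap from "p-map"; // ESM-only package; babel-jest rewrites its module syntax to CJS

test("p-map maps with bounded concurrency", async () => {
    const doubled = await pMap([1, 2, 3], async (n) => n * 2, { concurrency: 2 });
    expect(doubled).toEqual([2, 4, 6]);
});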

jest.config.js (8 additions)

@@ -2,4 +2,12 @@
 module.exports = {
     preset: "ts-jest",
     testEnvironment: "node",
+    // If Jest complains about an unknown symbol when running tests, you are dealing with a
+    // dependency written as an ES module rather than in CJS format. Add the dependency to the
+    // exclusion group in the regex below: every module NOT matching the pattern (hence the
+    // negative lookahead) is passed to Babel for transpilation before the tests are run.
+    transformIgnorePatterns: ["<rootDir>/node_modules/(?!p-map/)"],
+    transform: {
+        "\\.js$": "babel-jest"
+    }
 };
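
If more ESM-only dependencies trip Jest up later, they join the same exclusion group. A hedged sketch of the extended pattern (the second package name is purely illustrative):

// jest.config.js, hypothetical follow-up change
transformIgnorePatterns: ["<rootDir>/node_modules/(?!(p-map|some-esm-only-pkg)/)"],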

package.json (4 additions)

@@ -38,6 +38,9 @@
     },
     "devDependencies": {
         "@anthropic-ai/sdk": "^0.32.1",
+        "@babel/core": "^7.26.0",
+        "@babel/plugin-transform-modules-commonjs": "^7.26.3",
+        "@babel/preset-env": "^7.26.0",
         "@trivago/prettier-plugin-sort-imports": "^5.2.0",
         "@types/cli-progress": "^3.11.6",
         "@types/jest": "29.5.5",
@@ -46,6 +49,7 @@
         "@types/qs": "6.9.8",
         "@types/readable-stream": "^4.0.15",
         "@types/url-join": "4.0.1",
+        "babel-jest": "^29.7.0",
         "cohere-ai": "^7.15.0",
         "dotenv": "^16.4.6",
         "fetch-mock-jest": "^1.5.1",

src/eval_utils/context.ts (9 additions, 1 deletion)

@@ -3,11 +3,13 @@ import hash from "stable-hash";
 import { FlowLogRequest, PromptLogRequest } from "../api";
 import { DatapointResponse } from "../api";
 import { Humanloop } from "../index";
+import { Version } from "./types";
 
 type EvaluationContextState = {
     fileId?: string;
     path?: string;
     uploadCallback: (logId: string, datapoint: DatapointResponse) => void;
+    evaluatedVersion?: Version;
 };
 
 type EvaluationContextKey = {
@@ -45,6 +47,7 @@ class EvaluationContext {
             : {
                   fileId: this.state.fileId,
                   path: this.state.path,
+                  evaluatedVersion: this.state.evaluatedVersion,
              };
    }
 
@@ -66,9 +69,14 @@ class EvaluationContext {
    }
 
    public getDatapoint(key: EvaluationContextKey): EvaluationContextValue {
+        if (key.inputs !== undefined && "inputs" in key.inputs) {
+            key = { ...key, inputs: key.inputs.inputs as Record<string, unknown> };
+        }
        const mappings = this.inputMappings.get(hash(key));
        if (!mappings || mappings.length === 0) {
-            throw new Error(`No input mappings found for: ${JSON.stringify(key)}`);
+            throw new Error(
+                `No input mappings found for: ${JSON.stringify(key)}. Try using peekDatapoint() first.`,
+            );
        }
        return mappings.pop()!;
    }
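
The new guard in getDatapoint normalizes lookup keys whose inputs arrive nested one level too deep: { inputs: { inputs: {...} } } hashes differently from { inputs: {...} }, so the mapping lookup would otherwise miss. A standalone sketch of the same normalization (type and function names are illustrative, not part of the commit):

type LookupKey = { inputs?: Record<string, unknown>; messages?: unknown[] };

function normalizeKey(key: LookupKey): LookupKey {
    // Unwrap an accidentally nested payload so stable-hash sees the same
    // shape that was registered when the datapoint was added.
    if (key.inputs !== undefined && "inputs" in key.inputs) {
        return { ...key, inputs: key.inputs.inputs as Record<string, unknown> };
    }
    return key;
}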

src/eval_utils/run.ts (87 additions, 14 deletions)

@@ -9,7 +9,6 @@
  */
 import cliProgress from "cli-progress";
 import { Humanloop, HumanloopClient } from "index";
-import { AsyncFunction } from "otel";
 import pMap from "p-map";
 
 import {
@@ -62,11 +61,13 @@ export function overloadLog<T extends Flows | Prompts>(client: T): T {
        ) => {
            let response: LogResponse | undefined;
            if (evaluationContext.isEvaluatedFile(request)) {
+                const state = evaluationContext.getState();
                const { runId, sourceDatapointId, uploadCallback } =
                    evaluationContext.getDatapoint({
                        inputs: request.inputs,
                        messages: request.messages,
                    });
+
                if (request.runId === undefined) {
                    request = {
                        ...request,
@@ -79,9 +80,52 @@ export function overloadLog<T extends Flows | Prompts>(client: T): T {
                        sourceDatapointId: sourceDatapointId,
                    };
                }
+                if (client instanceof Flows) {
+                    request = {
+                        ...request,
+                        traceStatus: "complete",
+                    };
+                }
 
-                // @ts-ignore
-                response = await originalLog(request, options);
+                if ("flow" in request) {
+                    if (
+                        JSON.stringify(state!.evaluatedVersion) !==
+                        JSON.stringify(request.flow)
+                    ) {
+                        response = await originalLog(
+                            {
+                                ...request,
+                                // @ts-ignore Log under the version expected by the evaluation, not the
+                                // one determined by decorators. Otherwise the evaluation would go stale
+                                flow: state?.evaluatedVersion,
+                                output: undefined,
+                                error: `The version of the evaluated Flow must match the version of the callable. Expected: ${JSON.stringify(state!.evaluatedVersion)}, got: ${JSON.stringify(request.flow)}`,
+                            },
+                            options,
+                        );
+                    }
+                }
+
+                if ("prompt" in request) {
+                    if (
+                        JSON.stringify(state!.evaluatedVersion) !==
+                        JSON.stringify(request.prompt)
+                    ) {
+                        response = await originalLog({
+                            ...request,
+                            // @ts-ignore Log under the version expected by the evaluation, not the
+                            // one determined by decorators. Otherwise the evaluation would go stale
+                            prompt: state?.evaluatedVersion,
+                            output: undefined,
+                            error: `The version of the evaluated Prompt must match the version of the callable. Expected: ${JSON.stringify(state!.evaluatedVersion)}, got: ${JSON.stringify(request.prompt)}`,
+                        });
+                    }
+                }
+
+                if (response === undefined) {
+                    // Version validation passed, make a normal request
+                    response = await originalLog(request, options);
+                }
 
                // @ts-ignore
                uploadCallback(response.id);
@@ -111,6 +155,28 @@ export async function runEval(
        throw new Error("You must provide a path or id in your `file`.");
    }
 
+    if (file.callable && "path" in file.callable) {
+        if (file.path !== file.callable.path) {
+            throw new Error(
+                "The path of the evaluated `file` must match the path of your decorated `callable`. Expected path: " +
+                    file.path +
+                    ", got: " +
+                    file.callable.path,
+            );
+        }
+    }
+
+    if (file.callable && "version" in file.callable) {
+        if (file.version !== file.callable.version) {
+            throw new Error(
+                "The version of the evaluated `file` must match the version of your decorated `callable`. Expected version: " +
+                    JSON.stringify(file.version) +
+                    ", got: " +
+                    JSON.stringify(file.callable.version),
+            );
+        }
+    }
+
    let type: FileType;
    if (file.type) {
        type = file.type;
@@ -308,7 +374,9 @@ export async function runEval(
        path: hlFile.path,
        uploadCallback: async (logId: string, datapoint: DatapointResponse) => {
            await runLocalEvaluators(client, logId, datapoint, localEvaluators);
+            progressBar.increment();
        },
+        evaluatedVersion: file.version,
    });
 
    async function processDatapoint(
@@ -327,11 +395,15 @@
        try {
            evaluationContext.addDatapoint(datapoint, runId);
            let output: string;
-            if ("messages" in datapoint && datapoint.messages !== undefined) {
-                output = await function_!(datapoint.inputs, datapoint.messages);
-            } else {
-                output = await function_!(datapoint.inputs);
+            if (datapoint.inputs === undefined) {
+                throw new Error(`Datapoint 'inputs' attribute is undefined.`);
            }
+            output = await function_!(
+                // @ts-ignore
+                datapoint.inputs,
+                datapoint.messages,
+            );
+
            if (typeof output !== "string") {
                try {
                    output = JSON.stringify(output);
@@ -354,7 +426,8 @@
            // The log function will take care of the sourceDatapointId and runId from the context
            // See overloadLog in this module for more details
            await logFunc({
-                inputs: datapoint.inputs,
+                inputs: { ...datapoint.inputs },
+                messages: datapoint.messages,
                output: output,
                startTime: start_time,
                endTime: new Date(),
@@ -363,7 +436,7 @@
        } catch (e) {
            const errorMessage = e instanceof Error ? e.message : String(e);
            await logFunc({
-                inputs: datapoint.inputs,
+                inputs: { ...datapoint.inputs },
                error: errorMessage,
                sourceDatapointId: datapoint.id,
                startTime: start_time,
@@ -395,7 +468,6 @@
        hlDataset.datapoints!,
        async (datapoint) => {
            await processDatapoint(datapoint, runId);
-            progressBar.increment();
        },
        { concurrency: workers },
    );
@@ -471,8 +543,8 @@
        // TODO: why does the Log `id` field refer to the file ID in the API?
        // Why are both `id` and `version_id` needed in the API?
        id: fileId,
-        versionId: versionId,
-        runId: runId,
+        versionId,
+        runId,
    };
 
    switch (type) {
@@ -501,7 +573,7 @@ async function runLocalEvaluators(
    client: HumanloopClient,
    logId: string,
    datapoint: DatapointResponse | undefined,
-    localEvaluators: [EvaluatorResponse, Function | AsyncFunction][],
+    localEvaluators: [EvaluatorResponse, Function][],
) {
    const log = await client.logs.get(logId);
 
@@ -512,7 +584,7 @@
        if (evaluator.spec.argumentsType === "target_required") {
            judgment = await evalFunction(log, datapoint);
        } else {
-            judgment = evalFunction(log);
+            judgment = await evalFunction(log);
        }
 
        await client.evaluators.log({
@@ -525,6 +597,7 @@
            });
        } catch (e) {
            await client.evaluators.log({
+                path: evaluator.path,
                versionId: evaluator.versionId,
                parentId: logId,
                error: e instanceof Error ? e.message : String(e),
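
Two behaviors in this file are worth illustrating. First, the new path/version validation expects the decorated callable to carry metadata, matching the widened File callable type in types.ts below. A hedged sketch of a callable that would pass (the Object.assign pattern here is illustrative, not the SDK's actual decorator):

// Illustrative stand-in for what a decorated callable is expected to look like
const callable = Object.assign(
    async (inputs: any, messages?: any[]) => `Answered: ${inputs.question}`,
    {
        path: "qa/answer-flow", // must equal file.path
        version: { attributes: { variant: "v1" } }, // must equal file.version
    },
);

Second, the added await in the target-free branch of runLocalEvaluators matters because an async evaluator would otherwise hand a pending Promise to client.evaluators.log as its judgment:

const nonEmpty = async (log: { output?: string }) => (log.output ?? "").length > 0;

const wrong = nonEmpty({ output: "hi" });       // Promise<boolean>, not a judgment
const right = await nonEmpty({ output: "hi" }); // true

Note also that the version checks compare via JSON.stringify, which is key-order sensitive: two structurally equal versions serialized from differently ordered objects would be reported as a mismatch.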

src/eval_utils/types.ts (27 additions, 21 deletions)

@@ -6,7 +6,6 @@ import {
    CreatePromptLogResponse,
    CreateToolLogResponse,
    CreateDatapointRequest as DatapointRequest,
-    EvaluatorArgumentsType,
    EvaluatorResponse,
    EvaluatorReturnTypeEnum,
    EvaluatorsRequest,
@@ -26,16 +25,16 @@ import {
    ToolRequest,
    ToolResponse,
    UpdateDatesetAction as UpdateDatasetAction,
-} from "api";
-
+} from "../api";
+import { DatapointResponse, EvaluatorArgumentsType } from "../api/types";
 import { FileType } from "../api/types/FileType";
 
 type EvaluatorVersion =
    | LlmEvaluatorRequest
    | HumanEvaluatorRequest
    | CodeEvaluatorRequest
    | ExternalEvaluatorRequest;
-type Version =
+export type Version =
    | FlowKernelRequest
    | PromptKernelRequest
    | ToolKernelRequest
@@ -75,7 +74,15 @@ export interface File extends Identifiers {
     * `output = callable(datapoint.inputs, messages=datapoint.messages)`.
     * It should return a single string output. If not, you must provide a custom_logger.
     */
-    callable?: (...args: any[]) => string | Promise<string>;
+    callable?:
+        | ((inputs: any, messages?: any[]) => string | Promise<string>)
+        // Decorated callables carry metadata about path and version,
+        // which should match the ones provided in the File
+        | {
+              (inputs: any, messages?: any[]): string | Promise<string>;
+              version: Version;
+              path: string;
+          };
 }
 
 export interface Dataset extends Identifiers {
@@ -89,26 +96,25 @@ export interface Dataset extends Identifiers {
 }
 
 export interface Evaluator extends Identifiers {
-    /** The type of arguments the Evaluator expects - only required for local Evaluators. */
-    argsType?: EvaluatorArgumentsType;
    /** The type of return value the Evaluator produces - only required for local Evaluators. */
    returnType?: EvaluatorReturnTypeEnum;
-    /** The function to run on the logs to produce the judgment - only required for local Evaluators. */
-    callable?: (...args: any[]) => any; // TODO define explicitly the args and return type
-    /**
-     * Optional function that logs the output judgment from your Evaluator to Humanloop.
-     * If provided, it will be called as follows:
-     *
-     * ```typescript
-     * judgment = callable(log);
-     * log = custom_logger(client, judgment);
-     * ```
-     *
-     * Inside the custom_logger, you can use the Humanloop client to log the judgment to Humanloop.
-     * If not provided, your function must return a single string, and by default, the code will be used to inform the version of the external Evaluator on Humanloop.
-     */
    /** The threshold to check the Evaluator against. If the aggregate value of the Evaluator is below this threshold, the check will fail. */
    threshold?: number;
+    callable: Function;
+    argsType: EvaluatorArgumentsType;
+}
+
+export interface TargetFreeEvaluator extends Evaluator {
+    argsType: "target_free";
+    callable: (log: LogResponse) => string | number | boolean;
+}
+
+export interface TargetedEvaluator extends Evaluator {
+    argsType: "target_required";
+    callable: (
+        inputs: LogResponse,
+        target: DatapointResponse,
+    ) => string | number | boolean;
 }
 
 /**
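A hedged sketch of local Evaluator definitions that satisfy the two new interfaces from types.ts (the paths, return types, and log/datapoint field access are illustrative assumptions, not taken from the commit):

const exactMatch: TargetedEvaluator = {
    path: "evals/exact-match", // assumes Identifiers carries a path
    argsType: "target_required",
    returnType: "boolean",
    callable: (log, target) =>
        // Field names on the log and datapoint are illustrative
        (log as any).output === (target as any).target?.output,
};

const nonEmptyOutput: TargetFreeEvaluator = {
    path: "evals/non-empty-output",
    argsType: "target_free",
    returnType: "boolean",
    callable: (log) => ((log as any).output ?? "").length > 0,
};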