Eval works with simple callables

Andrei Bratu · Andrei Bratu · commit 8de2fe916769 · 2024-12-14T19:40:28.000Z
diff --git a/package.json b/package.json
@@ -13,14 +13,6 @@
         "test": "jest --detectOpenHandles --forceExit"
     },
     "dependencies": {
-        "form-data": "^4.0.0",
-        "form-data-encoder": "^4.0.2",
-        "formdata-node": "^6.0.3",
-        "node-fetch": "2.7.0",
-        "qs": "6.11.2",
-        "readable-stream": "^4.5.2",
-        "ts-json-schema-generator": "^2.3.0",
-        "url-join": "4.0.1",
         "@opentelemetry/api": "^1.9.0",
         "@opentelemetry/auto-instrumentations-node": "^0.53.0",
         "@opentelemetry/sdk-metrics": "^1.28.0",
@@ -30,32 +22,42 @@
         "@traceloop/instrumentation-anthropic": "^0.11.1",
         "@traceloop/instrumentation-cohere": "^0.11.1",
         "@traceloop/instrumentation-openai": "^0.11.3",
-        "uuid": "^11.0.3",
+        "cli-progress": "^3.12.0",
+        "form-data": "^4.0.0",
+        "form-data-encoder": "^4.0.2",
+        "formdata-node": "^6.0.3",
         "nanoid": "^5.0.9",
-        "cli-progress": "^3.12.0"
+        "node-fetch": "2.7.0",
+        "p-map": "^7.0.3",
+        "qs": "6.11.2",
+        "readable-stream": "^4.5.2",
+        "stable-hash": "^0.0.4",
+        "ts-json-schema-generator": "^2.3.0",
+        "url-join": "4.0.1",
+        "uuid": "^11.0.3"
     },
     "devDependencies": {
+        "@anthropic-ai/sdk": "^0.32.1",
+        "@trivago/prettier-plugin-sort-imports": "^5.2.0",
+        "@types/cli-progress": "^3.11.6",
         "@types/jest": "29.5.5",
         "@types/node": "17.0.33",
         "@types/node-fetch": "2.6.9",
         "@types/qs": "6.9.8",
         "@types/readable-stream": "^4.0.15",
         "@types/url-join": "4.0.1",
+        "cohere-ai": "^7.15.0",
+        "dotenv": "^16.4.6",
         "fetch-mock-jest": "^1.5.1",
         "jest": "29.7.0",
         "jest-environment-jsdom": "29.7.0",
+        "jsonschema": "^1.4.1",
+        "openai": "^4.74.0",
+        "prettier": "^3.4.2",
         "ts-jest": "29.1.1",
         "ts-loader": "^9.3.1",
         "typescript": "4.6.4",
-        "webpack": "^5.94.0",
-        "openai": "^4.74.0",
-        "@anthropic-ai/sdk": "^0.32.1",
-        "cohere-ai": "^7.15.0",
-        "dotenv": "^16.4.6",
-        "jsonschema": "^1.4.1",
-        "@trivago/prettier-plugin-sort-imports": "^5.2.0",
-        "prettier": "^3.4.2",
-        "@types/cli-progress": "^3.11.6"
+        "webpack": "^5.94.0"
     },
     "browser": {
         "fs": false,
diff --git a/src/eval_utils/context.ts b/src/eval_utils/context.ts
@@ -1,3 +1,5 @@
+import hash from "stable-hash";
+
 import { FlowLogRequest, PromptLogRequest } from "../api";
 import { DatapointResponse } from "../api";
 import { Humanloop } from "../index";
@@ -22,8 +24,7 @@ type EvaluationContextValue = {
 class EvaluationContext {
     private state?: EvaluationContextState;
     private static instance: EvaluationContext;
-    private inputMappings: Map<EvaluationContextKey, EvaluationContextValue[]> =
-        new Map();
+    private inputMappings: Map<string, EvaluationContextValue[]> = new Map();
 
     private constructor() {}
 
@@ -51,7 +52,7 @@ class EvaluationContext {
         if (this.state === undefined) {
             throw new Error("EvaluationContext state is not set");
         }
-        const key = { inputs: datapoint.inputs, messages: datapoint.messages };
+        const key = hash({ inputs: datapoint.inputs, messages: datapoint.messages });
 
         if (!this.inputMappings.has(key)) {
             this.inputMappings.set(key, []);
@@ -65,21 +66,22 @@ class EvaluationContext {
     }
 
     public getDatapoint(key: EvaluationContextKey): EvaluationContextValue {
-        const mappings = this.inputMappings.get(key);
+        const mappings = this.inputMappings.get(hash(key));
         if (!mappings || mappings.length === 0) {
             throw new Error(`No input mappings found for: ${JSON.stringify(key)}`);
         }
         return mappings.pop()!;
     }
 
     public peekDatapoint(key: EvaluationContextKey): boolean {
-        const mappings = this.inputMappings.get(key);
+        const mappings = this.inputMappings.get(hash(key));
         return mappings !== undefined && mappings.length > 0;
     }
 
     public isEvaluatedFile(args: FlowLogRequest | PromptLogRequest) {
         return (
-            this.state && this.state.fileId === args.id && this.state.path === args.path
+            this.state &&
+            (this.state.fileId === args.id || this.state.path === args.path)
         );
     }
 }
diff --git a/src/eval_utils/run.ts b/src/eval_utils/run.ts
@@ -10,10 +10,10 @@
 import cliProgress from "cli-progress";
 import { Humanloop, HumanloopClient } from "index";
 import { AsyncFunction } from "otel";
+import pMap from "p-map";
 
 import {
     BooleanEvaluatorStatsResponse,
-    CreateEvaluatorLogRequest,
     CreateEvaluatorLogResponse,
     CreateFlowLogResponse,
     CreatePromptLogResponse,
@@ -51,13 +51,10 @@ type LogResponse =
     | CreatePromptLogResponse
     | CreateToolLogResponse
     | CreateEvaluatorLogResponse;
-type LogRequest =
-    | FlowLogRequest
-    | PromptLogRequest
-    | ToolLogRequest
-    | CreateEvaluatorLogRequest;
 
 export function overloadLog<T extends Flows | Prompts>(client: T): T {
+    const originalLog = client.log.bind(client);
+
     // @ts-ignore
     const _overloadedLog: T["log"] = async (
         request: FlowLogRequest | PromptLogRequest,
@@ -83,20 +80,22 @@ export function overloadLog<T extends Flows | Prompts>(client: T): T {
                 };
             }
 
-            response = await client.log(request, options);
+            // @ts-ignore
+            response = await originalLog(request, options);
 
+            // @ts-ignore
             uploadCallback(response.id);
         } else {
-            response = await client.log(request, options);
+            // @ts-ignore
+            response = await originalLog(request, options);
         }
 
         return response;
     };
 
-    return {
-        ...client,
-        log: _overloadedLog,
-    };
+    client.log = _overloadedLog.bind(client);
+
+    return client;
 }
 
 export async function runEval(
@@ -105,6 +104,7 @@ export async function runEval(
     dataset: Dataset,
     name?: string,
     evaluators: Evaluator[] = [],
+    workers: number = 8,
 ): Promise<EvaluatorCheck[]> {
     // Get or create the file on Humanloop
     if (!file.path && !file.id) {
@@ -145,6 +145,7 @@ export async function runEval(
             }
             const updatedData = { ...rest, ...version } as FlowRequest;
             hlFile = await client.flows.upsert(updatedData);
+            break;
         }
         case "prompt": {
             hlFile = await client.prompts.upsert({
@@ -307,7 +308,6 @@ export async function runEval(
         path: hlFile.path,
         uploadCallback: async (logId: string, datapoint: DatapointResponse) => {
             await runLocalEvaluators(client, logId, datapoint, localEvaluators);
-            progressBar.increment();
         },
     });
 
@@ -327,11 +327,8 @@ export async function runEval(
         try {
             evaluationContext.addDatapoint(datapoint, runId);
             let output: string;
-            if ("messages" in datapoint) {
-                output = await function_!({
-                    ...datapoint.inputs,
-                    messages: datapoint.messages,
-                });
+            if ("messages" in datapoint && datapoint.messages !== undefined) {
+                output = await function_!(datapoint.inputs, datapoint.messages);
             } else {
                 output = await function_!(datapoint.inputs);
             }
@@ -356,10 +353,7 @@ export async function runEval(
 
                 // The log function will take care of the sourceDatapointId and runId from the context
                 // See overloadLog in this module for more details
-                console.debug(
-                    `function_ ${function_} is a simple callable, datapoint context was not consumed`,
-                );
-                logFunc({
+                await logFunc({
                     inputs: datapoint.inputs,
                     output: output,
                     startTime: start_time,
@@ -368,13 +362,14 @@ export async function runEval(
             }
         } catch (e) {
             const errorMessage = e instanceof Error ? e.message : String(e);
-            logFunc({
+            await logFunc({
                 inputs: datapoint.inputs,
                 error: errorMessage,
                 sourceDatapointId: datapoint.id,
                 startTime: start_time,
                 endTime: new Date(),
             });
+            // console.log(e);
             console.warn(
                 `\nYour ${type}'s callable failed for Datapoint: ${datapoint.id}.\nError: ${errorMessage}`,
             );
@@ -396,11 +391,14 @@ export async function runEval(
         );
         const totalDatapoints = hlDataset.datapoints!.length;
         progressBar.start(totalDatapoints, 0);
-        const promises = hlDataset.datapoints!.map(async (datapoint) => {
-            await processDatapoint(datapoint, runId);
-            progressBar.increment();
-        });
-        await Promise.all(promises);
+        await pMap(
+            hlDataset.datapoints!,
+            async (datapoint) => {
+                await processDatapoint(datapoint, runId);
+                progressBar.increment();
+            },
+            { concurrency: workers },
+        );
         progressBar.stop();
     } else {
         // TODO: trigger run when updated API is available
@@ -466,8 +464,9 @@ function getLogFunction(
     fileId: string,
     versionId: string,
     runId: string,
-): (args: LogRequest) => Promise<LogResponse> {
+) {
     /** Returns the appropriate log function pre-filled with common parameters. */
+
     const logRequest = {
         // TODO: why does the Log `id` field refer to the file ID in the API?
         // Why are both `id` and `version_id` needed in the API?
@@ -478,22 +477,21 @@ function getLogFunction(
 
     switch (type) {
         case "flow":
-            return (args: FlowLogRequest) =>
-                client.flows.log({
+            return async (args: FlowLogRequest) =>
+                await client.flows.log({
                     ...logRequest,
                     traceStatus: "complete",
                     ...args,
                 });
         case "prompt":
-            return (args: PromptLogRequest) =>
-                client.prompts.log({ ...logRequest, ...args });
-        case "evaluator":
-            // @ts-ignore
-            return (args: CreateEvaluatorLogRequest) =>
-                client.evaluators.log({ ...logRequest, ...args });
+            return async (args: PromptLogRequest) =>
+                await client.prompts.log({ ...logRequest, ...args });
+        // case "evaluator":
+        //     return (args: CreateEvaluatorLogRequest) =>
+        //         client.evaluators.log({ ...logRequest, ...args });
         case "tool":
-            return (args: ToolLogRequest) =>
-                client.tools.log({ ...logRequest, ...args });
+            return async (args: ToolLogRequest) =>
+                await client.tools.log({ ...logRequest, ...args });
         default:
             throw new Error(`Unsupported File version: ${type}`);
     }
@@ -517,15 +515,16 @@ async function runLocalEvaluators(
                 judgment = evalFunction(log);
             }
 
-            client.evaluators.log({
+            await client.evaluators.log({
+                path: evaluator.path,
                 versionId: evaluator.versionId,
                 parentId: logId,
                 judgment: judgment,
                 startTime: startTime,
                 endTime: new Date(),
             });
         } catch (e) {
-            client.evaluators.log({
+            await client.evaluators.log({
                 versionId: evaluator.versionId,
                 parentId: logId,
                 error: e instanceof Error ? e.message : String(e),
diff --git a/src/humanloop.client.ts b/src/humanloop.client.ts
@@ -2,18 +2,16 @@ import { NodeTracerProvider, Tracer } from "@opentelemetry/sdk-trace-node";
 import { AnthropicInstrumentation } from "@traceloop/instrumentation-anthropic";
 import { CohereInstrumentation } from "@traceloop/instrumentation-cohere";
 import { OpenAIInstrumentation } from "@traceloop/instrumentation-openai";
-import CohereAI from "cohere-ai";
 
-import { Dataset, Evaluator, EvaluatorCheck, File } from "../eval_utils/types";
 import { HumanloopClient as BaseHumanloopClient } from "./Client";
 import { Evaluations as BaseEvaluations } from "./api/resources/evaluations/client/Client";
 import { Flows } from "./api/resources/flows/client/Client";
 import { Prompts } from "./api/resources/prompts/client/Client";
 import { FlowKernelRequest } from "./api/types/FlowKernelRequest";
 import { ToolKernelRequest } from "./api/types/ToolKernelRequest";
 import { overloadLog, runEval } from "./eval_utils/run";
+import { Dataset, Evaluator, EvaluatorCheck, File } from "./eval_utils/types";
 import { HumanloopSpanExporter } from "./otel/exporter";
-import { moduleIsInstalled } from "./otel/helpers";
 import { HumanloopSpanProcessor } from "./otel/processor";
 import { flowUtilityFactory } from "./utilities/flow";
 import { UtilityPromptKernel, promptUtilityFactory } from "./utilities/prompt";
@@ -32,8 +30,9 @@ class ExtendedEvaluations extends BaseEvaluations {
         dataset: Dataset,
         name?: string,
         evaluators: Evaluator[] = [],
+        workers: number = 8,
     ): Promise<EvaluatorCheck[]> {
-        return runEval(this._client, file, dataset, name, evaluators);
+        return runEval(this._client, file, dataset, name, evaluators, workers);
     }
 }
 
diff --git a/src/otel/helpers.ts b/src/otel/helpers.ts
@@ -194,21 +194,6 @@ export function isHumanloopSpan(span: ReadableSpan): boolean {
     return span.attributes[HUMANLOOP_FILE_TYPE_KEY] !== undefined;
 }
 
-/**
- * Determines if the current Node.js environment has a specific module installed.
- *
- * @param moduleName - Name of the module to check
- * @returns True if the module is installed, false otherwise
- */
-export function moduleIsInstalled(moduleName: string): boolean {
-    try {
-        require.resolve(moduleName);
-        return true;
-    } catch {
-        return false;
-    }
-}
-
 /**
  * Generates a unique span ID.
  *
diff --git a/yarn.lock b/yarn.lock