
Commit f91133b

Andrei Bratu committed: QA pass over eval run
1 parent 8de2fe9

20 files changed, 1306 additions and 310 deletions

.fernignore (3 additions, 1 deletion)

@@ -16,9 +16,11 @@ tests
 
 .github/workflows/ci.yml
 
-# Prettier
+# Config files
 
 .prettierrc.yml
+babel.config.js
+jest.config.js
 
 # Package Scripts
 
babel.config.js (new file, 12 additions)

@@ -0,0 +1,12 @@
+module.exports = {
+    presets: [
+        [
+            "@babel/preset-env",
+            { targets: { node: "current" } }
+        ],
+        "@babel/preset-typescript"
+    ],
+    plugins: [
+        "@babel/plugin-transform-modules-commonjs"
+    ]
+};
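
For context: ts-jest alone does not transpile ESM-only dependencies such as p-map, which is why Babel is wired in to rewrite their import/export syntax to CommonJS when Jest runs. A minimal smoke test that would trip over this without the config above (file name and assertion are illustrative, not part of the commit):

// __tests__/p-map.smoke.test.ts (hypothetical)
import pMap from "p-map"; // ESM-only package; babel-jest rewrites its module syntax to CJS

test("p-map maps with bounded concurrency", async () => {
    const doubled = await pMap([1, 2, 3], async (n) => n * 2, { concurrency: 2 });
    expect(doubled).toEqual([2, 4, 6]);
});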

jest.config.js (8 additions)

@@ -2,4 +2,12 @@
 module.exports = {
     preset: "ts-jest",
     testEnvironment: "node",
+    // If Jest complains about an unknown symbol when running tests, you are dealing with a
+    // dependency written as an ES module rather than in CJS format. Add the dependency to the
+    // exclusion group in the regex below: every module NOT matching the pattern (hence the
+    // negative lookahead) is passed to Babel for transpilation before the tests are run.
+    transformIgnorePatterns: ["<rootDir>/node_modules/(?!p-map/)"],
+    transform: {
+        "\\.js$": "babel-jest"
+    }
 };
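
If more ESM-only dependencies trip Jest up later, they join the same exclusion group. A hedged sketch of the extended pattern (the second package name is purely illustrative):

// jest.config.js, hypothetical follow-up change
transformIgnorePatterns: ["<rootDir>/node_modules/(?!(p-map|some-esm-only-pkg)/)"],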

package.json (4 additions)

@@ -38,6 +38,9 @@
     },
     "devDependencies": {
         "@anthropic-ai/sdk": "^0.32.1",
+        "@babel/core": "^7.26.0",
+        "@babel/plugin-transform-modules-commonjs": "^7.26.3",
+        "@babel/preset-env": "^7.26.0",
         "@trivago/prettier-plugin-sort-imports": "^5.2.0",
         "@types/cli-progress": "^3.11.6",
         "@types/jest": "29.5.5",
@@ -46,6 +49,7 @@
         "@types/qs": "6.9.8",
         "@types/readable-stream": "^4.0.15",
         "@types/url-join": "4.0.1",
+        "babel-jest": "^29.7.0",
         "cohere-ai": "^7.15.0",
         "dotenv": "^16.4.6",
         "fetch-mock-jest": "^1.5.1",

src/eval_utils/context.ts (9 additions, 1 deletion)

@@ -3,11 +3,13 @@ import hash from "stable-hash";
 import { FlowLogRequest, PromptLogRequest } from "../api";
 import { DatapointResponse } from "../api";
 import { Humanloop } from "../index";
+import { Version } from "./types";
 
 type EvaluationContextState = {
     fileId?: string;
     path?: string;
     uploadCallback: (logId: string, datapoint: DatapointResponse) => void;
+    evaluatedVersion?: Version;
 };
 
 type EvaluationContextKey = {
@@ -45,6 +47,7 @@ class EvaluationContext {
             : {
                   fileId: this.state.fileId,
                   path: this.state.path,
+                  evaluatedVersion: this.state.evaluatedVersion,
              };
    }
 
@@ -66,9 +69,14 @@ class EvaluationContext {
    }
 
    public getDatapoint(key: EvaluationContextKey): EvaluationContextValue {
+        if (key.inputs !== undefined && "inputs" in key.inputs) {
+            key = { ...key, inputs: key.inputs.inputs as Record<string, unknown> };
+        }
        const mappings = this.inputMappings.get(hash(key));
        if (!mappings || mappings.length === 0) {
-            throw new Error(`No input mappings found for: ${JSON.stringify(key)}`);
+            throw new Error(
+                `No input mappings found for: ${JSON.stringify(key)}. Try using peekDatapoint() first.`,
+            );
        }
        return mappings.pop()!;
    }
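
The new guard in getDatapoint normalizes lookup keys whose inputs arrive nested one level too deep: { inputs: { inputs: {...} } } hashes differently from { inputs: {...} }, so the mapping lookup would otherwise miss. A standalone sketch of the same normalization (type and function names are illustrative, not part of the commit):

type LookupKey = { inputs?: Record<string, unknown>; messages?: unknown[] };

function normalizeKey(key: LookupKey): LookupKey {
    // Unwrap an accidentally nested payload so stable-hash sees the same
    // shape that was registered when the datapoint was added.
    if (key.inputs !== undefined && "inputs" in key.inputs) {
        return { ...key, inputs: key.inputs.inputs as Record<string, unknown> };
    }
    return key;
}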

src/eval_utils/run.ts (87 additions, 14 deletions)

@@ -9,7 +9,6 @@
  */
 import cliProgress from "cli-progress";
 import { Humanloop, HumanloopClient } from "index";
-import { AsyncFunction } from "otel";
 import pMap from "p-map";
 
 import {
@@ -62,11 +61,13 @@ export function overloadLog<T extends Flows | Prompts>(client: T): T {
        ) => {
            let response: LogResponse | undefined;
            if (evaluationContext.isEvaluatedFile(request)) {
+                const state = evaluationContext.getState();
                const { runId, sourceDatapointId, uploadCallback } =
                    evaluationContext.getDatapoint({
                        inputs: request.inputs,
                        messages: request.messages,
                    });
+
                if (request.runId === undefined) {
                    request = {
                        ...request,
@@ -79,9 +80,52 @@ export function overloadLog<T extends Flows | Prompts>(client: T): T {
                        sourceDatapointId: sourceDatapointId,
                    };
                }
+                if (client instanceof Flows) {
+                    request = {
+                        ...request,
+                        traceStatus: "complete",
+                    };
+                }
 
-                // @ts-ignore
-                response = await originalLog(request, options);
+                if ("flow" in request) {
+                    if (
+                        JSON.stringify(state!.evaluatedVersion) !==
+                        JSON.stringify(request.flow)
+                    ) {
+                        response = await originalLog(
+                            {
+                                ...request,
+                                // @ts-ignore Log under the version expected by the evaluation, not the
+                                // one determined by decorators. Otherwise the evaluation would go stale
+                                flow: state?.evaluatedVersion,
+                                output: undefined,
+                                error: `The version of the evaluated Flow must match the version of the callable. Expected: ${JSON.stringify(state!.evaluatedVersion)}, got: ${JSON.stringify(request.flow)}`,
+                            },
+                            options,
+                        );
+                    }
+                }
+
+                if ("prompt" in request) {
+                    if (
+                        JSON.stringify(state!.evaluatedVersion) !==
+                        JSON.stringify(request.prompt)
+                    ) {
+                        response = await originalLog({
+                            ...request,
+                            // @ts-ignore Log under the version expected by the evaluation, not the
+                            // one determined by decorators. Otherwise the evaluation would go stale
+                            prompt: state?.evaluatedVersion,
+                            output: undefined,
+                            error: `The version of the evaluated Prompt must match the version of the callable. Expected: ${JSON.stringify(state!.evaluatedVersion)}, got: ${JSON.stringify(request.prompt)}`,
+                        });
+                    }
+                }
+
+                if (response === undefined) {
+                    // Version validation passed, make a normal request
+                    response = await originalLog(request, options);
+                }
 
                // @ts-ignore
                uploadCallback(response.id);
@@ -111,6 +155,28 @@ export async function runEval(
        throw new Error("You must provide a path or id in your `file`.");
    }
 
+    if (file.callable && "path" in file.callable) {
+        if (file.path !== file.callable.path) {
+            throw new Error(
+                "The path of the evaluated `file` must match the path of your decorated `callable`. Expected path: " +
+                    file.path +
+                    ", got: " +
+                    file.callable.path,
+            );
+        }
+    }
+
+    if (file.callable && "version" in file.callable) {
+        if (file.version !== file.callable.version) {
+            throw new Error(
+                "The version of the evaluated `file` must match the version of your decorated `callable`. Expected version: " +
+                    JSON.stringify(file.version) +
+                    ", got: " +
+                    JSON.stringify(file.callable.version),
+            );
+        }
+    }
+
    let type: FileType;
    if (file.type) {
        type = file.type;
@@ -308,7 +374,9 @@ export async function runEval(
        path: hlFile.path,
        uploadCallback: async (logId: string, datapoint: DatapointResponse) => {
            await runLocalEvaluators(client, logId, datapoint, localEvaluators);
+            progressBar.increment();
        },
+        evaluatedVersion: file.version,
    });
 
    async function processDatapoint(
@@ -327,11 +395,15 @@
        try {
            evaluationContext.addDatapoint(datapoint, runId);
            let output: string;
-            if ("messages" in datapoint && datapoint.messages !== undefined) {
-                output = await function_!(datapoint.inputs, datapoint.messages);
-            } else {
-                output = await function_!(datapoint.inputs);
+            if (datapoint.inputs === undefined) {
+                throw new Error(`Datapoint 'inputs' attribute is undefined.`);
            }
+            output = await function_!(
+                // @ts-ignore
+                datapoint.inputs,
+                datapoint.messages,
+            );
+
            if (typeof output !== "string") {
                try {
                    output = JSON.stringify(output);
@@ -354,7 +426,8 @@
            // The log function will take care of the sourceDatapointId and runId from the context
            // See overloadLog in this module for more details
            await logFunc({
-                inputs: datapoint.inputs,
+                inputs: { ...datapoint.inputs },
+                messages: datapoint.messages,
                output: output,
                startTime: start_time,
                endTime: new Date(),
@@ -363,7 +436,7 @@
        } catch (e) {
            const errorMessage = e instanceof Error ? e.message : String(e);
            await logFunc({
-                inputs: datapoint.inputs,
+                inputs: { ...datapoint.inputs },
                error: errorMessage,
                sourceDatapointId: datapoint.id,
                startTime: start_time,
@@ -395,7 +468,6 @@
        hlDataset.datapoints!,
        async (datapoint) => {
            await processDatapoint(datapoint, runId);
-            progressBar.increment();
        },
        { concurrency: workers },
    );
@@ -471,8 +543,8 @@
        // TODO: why does the Log `id` field refer to the file ID in the API?
        // Why are both `id` and `version_id` needed in the API?
        id: fileId,
-        versionId: versionId,
-        runId: runId,
+        versionId,
+        runId,
    };
 
    switch (type) {
@@ -501,7 +573,7 @@ async function runLocalEvaluators(
    client: HumanloopClient,
    logId: string,
    datapoint: DatapointResponse | undefined,
-    localEvaluators: [EvaluatorResponse, Function | AsyncFunction][],
+    localEvaluators: [EvaluatorResponse, Function][],
) {
    const log = await client.logs.get(logId);
 
@@ -512,7 +584,7 @@
        if (evaluator.spec.argumentsType === "target_required") {
            judgment = await evalFunction(log, datapoint);
        } else {
-            judgment = evalFunction(log);
+            judgment = await evalFunction(log);
        }
 
        await client.evaluators.log({
@@ -525,6 +597,7 @@
            });
        } catch (e) {
            await client.evaluators.log({
+                path: evaluator.path,
                versionId: evaluator.versionId,
                parentId: logId,
                error: e instanceof Error ? e.message : String(e),
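
Two behaviors in this file are worth illustrating. First, the new path/version validation expects the decorated callable to carry metadata, matching the widened File callable type in types.ts below. A hedged sketch of a callable that would pass (the Object.assign pattern here is illustrative, not the SDK's actual decorator):

// Illustrative stand-in for what a decorated callable is expected to look like
const callable = Object.assign(
    async (inputs: any, messages?: any[]) => `Answered: ${inputs.question}`,
    {
        path: "qa/answer-flow", // must equal file.path
        version: { attributes: { variant: "v1" } }, // must equal file.version
    },
);

Second, the added await in the target-free branch of runLocalEvaluators matters because an async evaluator would otherwise hand a pending Promise to client.evaluators.log as its judgment:

const nonEmpty = async (log: { output?: string }) => (log.output ?? "").length > 0;

const wrong = nonEmpty({ output: "hi" });       // Promise<boolean>, not a judgment
const right = await nonEmpty({ output: "hi" }); // true

Note also that the version checks compare via JSON.stringify, which is key-order sensitive: two structurally equal versions serialized from differently ordered objects would be reported as a mismatch.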

src/eval_utils/types.ts (27 additions, 21 deletions)

@@ -6,7 +6,6 @@ import {
    CreatePromptLogResponse,
    CreateToolLogResponse,
    CreateDatapointRequest as DatapointRequest,
-    EvaluatorArgumentsType,
    EvaluatorResponse,
    EvaluatorReturnTypeEnum,
    EvaluatorsRequest,
@@ -26,16 +25,16 @@ import {
    ToolRequest,
    ToolResponse,
    UpdateDatesetAction as UpdateDatasetAction,
-} from "api";
-
+} from "../api";
+import { DatapointResponse, EvaluatorArgumentsType } from "../api/types";
 import { FileType } from "../api/types/FileType";
 
 type EvaluatorVersion =
    | LlmEvaluatorRequest
    | HumanEvaluatorRequest
    | CodeEvaluatorRequest
    | ExternalEvaluatorRequest;
-type Version =
+export type Version =
    | FlowKernelRequest
    | PromptKernelRequest
    | ToolKernelRequest
@@ -75,7 +74,15 @@ export interface File extends Identifiers {
     * `output = callable(datapoint.inputs, messages=datapoint.messages)`.
     * It should return a single string output. If not, you must provide a custom_logger.
     */
-    callable?: (...args: any[]) => string | Promise<string>;
+    callable?:
+        | ((inputs: any, messages?: any[]) => string | Promise<string>)
+        // Decorated callables carry metadata about path and version,
+        // which should match the ones provided in the File
+        | {
+              (inputs: any, messages?: any[]): string | Promise<string>;
+              version: Version;
+              path: string;
+          };
 }
 
 export interface Dataset extends Identifiers {
@@ -89,26 +96,25 @@ export interface Dataset extends Identifiers {
 }
 
 export interface Evaluator extends Identifiers {
-    /** The type of arguments the Evaluator expects - only required for local Evaluators. */
-    argsType?: EvaluatorArgumentsType;
    /** The type of return value the Evaluator produces - only required for local Evaluators. */
    returnType?: EvaluatorReturnTypeEnum;
-    /** The function to run on the logs to produce the judgment - only required for local Evaluators. */
-    callable?: (...args: any[]) => any; // TODO define explicitly the args and return type
-    /**
-     * Optional function that logs the output judgment from your Evaluator to Humanloop.
-     * If provided, it will be called as follows:
-     *
-     * ```typescript
-     * judgment = callable(log);
-     * log = custom_logger(client, judgment);
-     * ```
-     *
-     * Inside the custom_logger, you can use the Humanloop client to log the judgment to Humanloop.
-     * If not provided, your function must return a single string, and by default, the code will be used to inform the version of the external Evaluator on Humanloop.
-     */
    /** The threshold to check the Evaluator against. If the aggregate value of the Evaluator is below this threshold, the check will fail. */
    threshold?: number;
+    callable: Function;
+    argsType: EvaluatorArgumentsType;
+}
+
+export interface TargetFreeEvaluator extends Evaluator {
+    argsType: "target_free";
+    callable: (log: LogResponse) => string | number | boolean;
+}
+
+export interface TargetedEvaluator extends Evaluator {
+    argsType: "target_required";
+    callable: (
+        inputs: LogResponse,
+        target: DatapointResponse,
+    ) => string | number | boolean;
 }
 
 /**
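A hedged sketch of local Evaluator definitions that satisfy the two new interfaces from types.ts (the paths, return types, and log/datapoint field access are illustrative assumptions, not taken from the commit):

const exactMatch: TargetedEvaluator = {
    path: "evals/exact-match", // assumes Identifiers carries a path
    argsType: "target_required",
    returnType: "boolean",
    callable: (log, target) =>
        // Field names on the log and datapoint are illustrative
        (log as any).output === (target as any).target?.output,
};

const nonEmptyOutput: TargetFreeEvaluator = {
    path: "evals/non-empty-output",
    argsType: "target_free",
    returnType: "boolean",
    callable: (log) => ((log as any).output ?? "").length > 0,
};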