Skip to content

Commit 3af78c2

Browse files
authored
Merge pull request #142 from salesforcecli/er/add-metrics-to-test-results-summary
W-18507456: add Metric Pass percentage to test results summary
2 parents 5a45104 + 0a88bdf commit 3af78c2

4 files changed

Lines changed: 276 additions & 0 deletions

File tree

src/handleTestResults.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ export function humanFormat(results: AgentTestResultsResponse): string {
7272
const ux = new Ux();
7373

7474
const tables: string[] = [];
75+
const metricResults = [];
7576
for (const testCase of results.testCases) {
7677
let table = ux.makeTable({
7778
title: `${ansis.bold(`Test Case #${testCase.testNumber}`)}\n${ansis.dim('Utterance')}: ${
@@ -122,6 +123,8 @@ export function humanFormat(results: AgentTestResultsResponse): string {
122123
width: '100%',
123124
});
124125
tables.push(table);
126+
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
127+
metricResults.push(...metrics);
125128
}
126129
// add a line break between end of the first table and the utterance of the next
127130
tables.push('\n');
@@ -145,6 +148,12 @@ export function humanFormat(results: AgentTestResultsResponse): string {
145148
}, 0);
146149
const outcomePassPercent = (outcomePassCount / results.testCases.length) * 100;
147150

151+
// eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
152+
const metricPassCount = metricResults.filter(
153+
(f) => f.result === 'PASS' || f.name === 'output_latency_milliseconds'
154+
).length;
155+
const metricPassPercent = metricResults.length > 0 ? (metricPassCount / metricResults.length) * 100 : 0;
156+
148157
const final = {
149158
Status: results.status,
150159
Duration: results.endTime
@@ -153,6 +162,7 @@ export function humanFormat(results: AgentTestResultsResponse): string {
153162
'Topic Pass %': `${topicPassPercent.toFixed(2)}%`,
154163
'Action Pass %': `${actionPassPercent.toFixed(2)}%`,
155164
'Outcome Pass %': `${outcomePassPercent.toFixed(2)}%`,
165+
...(metricResults.length ? { 'Metric Pass %': `${metricPassPercent.toFixed(2)}%` } : {}),
156166
};
157167

158168
const resultsTable = makeSimpleTable(final, ansis.bold.blue('Test Results'));

test/handleTestResults.test.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,25 @@ describe('truncate', () => {
8181
expect(truncate(0, 0)).to.equal('0');
8282
});
8383
});
84+
85+
describe('metric calculations', () => {
86+
it('should handle test cases with no metrics', async () => {
87+
const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/4.json', 'utf8');
88+
const input = JSON.parse(raw) as AgentTestResultsResponse;
89+
const output = humanFormat(input);
90+
expect(output).to.not.include('Metric Pass %');
91+
});
92+
it('should correctly calculate metric pass percentage', async () => {
93+
const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/5.json', 'utf8');
94+
const input = JSON.parse(raw) as AgentTestResultsResponse;
95+
const output = humanFormat(input);
96+
expect(output).to.include('Metric Pass % 33.33%');
97+
});
98+
99+
it('should handle test cases where all metrics fail', async () => {
100+
const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/6.json', 'utf8');
101+
const input = JSON.parse(raw) as AgentTestResultsResponse;
102+
const output = humanFormat(input);
103+
expect(output).to.include('Metric Pass % 0.00%');
104+
});
105+
});
test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/5.json

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{
2+
"status": "COMPLETED",
3+
"startTime": "2025-01-07T12:00:00Z",
4+
"endTime": "2025-01-07T12:00:10.35Z",
5+
"errorMessage": null,
6+
"subjectName": "test-subject",
7+
"testCases": [
8+
{
9+
"testNumber": 1,
10+
"status": "COMPLETED",
11+
"inputs": { "utterance": "test1" },
12+
"testResults": [
13+
{
14+
"actualValue": "Local_Events_Information",
15+
"endTime": "2025-05-15T17:24:07Z",
16+
"errorCode": 0,
17+
"expectedValue": "Local_Events_Information",
18+
"metricLabel": "topic_assertion",
19+
"name": "topic_assertion",
20+
"result": "PASS",
21+
"score": 1,
22+
"startTime": "2025-05-15T17:23:57Z",
23+
"status": "COMPLETED"
24+
},
25+
{
26+
"actualValue": "[]",
27+
"endTime": "2025-05-15T17:24:07Z",
28+
"errorCode": 0,
29+
"expectedValue": "[]",
30+
"metricLabel": "actions_assertion",
31+
"name": "actions_assertion",
32+
"result": "PASS",
33+
"score": 1,
34+
"startTime": "2025-05-15T17:23:57Z",
35+
"status": "COMPLETED"
36+
},
37+
{
38+
"actualValue": "Could you please specify the type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
39+
"endTime": "2025-05-15T17:24:07Z",
40+
"errorCode": 0,
41+
"expectedValue": "Could you let me know what type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
42+
"metricLabel": "output_validation",
43+
"name": "output_validation",
44+
"result": "PASS",
45+
"score": 1,
46+
"startTime": "2025-05-15T17:23:57Z",
47+
"status": "COMPLETED"
48+
},
49+
{
50+
"endTime": "2025-05-15T17:24:08Z",
51+
"errorCode": 0,
52+
"metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user's query.",
53+
"metricLabel": "completeness",
54+
"name": "completeness",
55+
"result": "FAILURE",
56+
"score": 0.2,
57+
"startTime": "2025-05-15T17:23:57Z",
58+
"status": "COMPLETED",
59+
"actualValue": "",
60+
"expectedValue": ""
61+
}
62+
],
63+
"startTime": "2024-01-01T00:00:00Z",
64+
"generatedData": {
65+
"actionsSequence": [],
66+
"outcome": "test outcome",
67+
"topic": "test topic"
68+
}
69+
},
70+
{
71+
"testNumber": 2,
72+
"status": "COMPLETED",
73+
"inputs": { "utterance": "test2" },
74+
"testResults": [
75+
{
76+
"actualValue": "Weather_and_Temperature_Information",
77+
"endTime": "2025-05-15T17:24:08Z",
78+
"errorCode": 0,
79+
"expectedValue": "Weather_and_Temperature_Information",
80+
"metricLabel": "topic_assertion",
81+
"name": "topic_assertion",
82+
"result": "PASS",
83+
"score": 1,
84+
"startTime": "2025-05-15T17:23:57Z",
85+
"status": "COMPLETED"
86+
},
87+
{
88+
"endTime": "2025-05-15T17:24:08Z",
89+
"errorCode": 0,
90+
"metricExplainability": "The answer is a short sentence that does not provide any weather information, and it does not seem to be related to the question. It is difficult to understand what the answer is trying to say, and it does not contain any grammar errors.",
91+
"metricLabel": "coherence",
92+
"name": "coherence",
93+
"result": "FAILURE",
94+
"score": 0.2,
95+
"startTime": "2025-05-15T17:23:57Z",
96+
"status": "COMPLETED",
97+
"actualValue": "",
98+
"expectedValue": ""
99+
},
100+
{
101+
"endTime": "2025-05-15T17:24:08Z",
102+
"errorCode": 0,
103+
"metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user's query.",
104+
"metricLabel": "completeness",
105+
"name": "completeness",
106+
"result": "PASS",
107+
"score": 0.9,
108+
"startTime": "2025-05-15T17:23:57Z",
109+
"status": "COMPLETED",
110+
"actualValue": "",
111+
"expectedValue": ""
112+
}
113+
],
114+
"startTime": "2024-01-01T00:00:00Z",
115+
"generatedData": {
116+
"actionsSequence": [],
117+
"outcome": "test outcome",
118+
"topic": "test topic"
119+
}
120+
}
121+
]
122+
}
test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/6.json

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{
2+
"status": "COMPLETED",
3+
"startTime": "2025-01-07T12:00:00Z",
4+
"endTime": "2025-01-07T12:00:10.35Z",
5+
"errorMessage": null,
6+
"subjectName": "test-subject",
7+
"testCases": [
8+
{
9+
"testNumber": 1,
10+
"status": "COMPLETED",
11+
"inputs": { "utterance": "test1" },
12+
"testResults": [
13+
{
14+
"actualValue": "Local_Events_Information",
15+
"endTime": "2025-05-15T17:24:07Z",
16+
"errorCode": 0,
17+
"expectedValue": "Local_Events_Information",
18+
"metricLabel": "topic_assertion",
19+
"name": "topic_assertion",
20+
"result": "PASS",
21+
"score": 1,
22+
"startTime": "2025-05-15T17:23:57Z",
23+
"status": "COMPLETED"
24+
},
25+
{
26+
"actualValue": "[]",
27+
"endTime": "2025-05-15T17:24:07Z",
28+
"errorCode": 0,
29+
"expectedValue": "[]",
30+
"metricLabel": "actions_assertion",
31+
"name": "actions_assertion",
32+
"result": "PASS",
33+
"score": 1,
34+
"startTime": "2025-05-15T17:23:57Z",
35+
"status": "COMPLETED"
36+
},
37+
{
38+
"actualValue": "Could you please specify the type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
39+
"endTime": "2025-05-15T17:24:07Z",
40+
"errorCode": 0,
41+
"expectedValue": "Could you let me know what type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
42+
"metricLabel": "output_validation",
43+
"name": "output_validation",
44+
"result": "PASS",
45+
"score": 1,
46+
"startTime": "2025-05-15T17:23:57Z",
47+
"status": "COMPLETED"
48+
},
49+
{
50+
"endTime": "2025-05-15T17:24:08Z",
51+
"errorCode": 0,
52+
"metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user's query.",
53+
"metricLabel": "completeness",
54+
"name": "completeness",
55+
"result": "FAILURE",
56+
"score": 0.2,
57+
"startTime": "2025-05-15T17:23:57Z",
58+
"status": "COMPLETED",
59+
"actualValue": "",
60+
"expectedValue": ""
61+
}
62+
],
63+
"startTime": "2024-01-01T00:00:00Z",
64+
"generatedData": {
65+
"actionsSequence": [],
66+
"outcome": "test outcome",
67+
"topic": "test topic"
68+
}
69+
},
70+
{
71+
"testNumber": 2,
72+
"status": "COMPLETED",
73+
"inputs": { "utterance": "test2" },
74+
"testResults": [
75+
{
76+
"actualValue": "Weather_and_Temperature_Information",
77+
"endTime": "2025-05-15T17:24:08Z",
78+
"errorCode": 0,
79+
"expectedValue": "Weather_and_Temperature_Information",
80+
"metricLabel": "topic_assertion",
81+
"name": "topic_assertion",
82+
"result": "PASS",
83+
"score": 1,
84+
"startTime": "2025-05-15T17:23:57Z",
85+
"status": "COMPLETED"
86+
},
87+
{
88+
"endTime": "2025-05-15T17:24:08Z",
89+
"errorCode": 0,
90+
"metricExplainability": "The answer is a short sentence that does not provide any weather information, and it does not seem to be related to the question. It is difficult to understand what the answer is trying to say, and it does not contain any grammar errors.",
91+
"metricLabel": "coherence",
92+
"name": "coherence",
93+
"result": "FAILURE",
94+
"score": 0.2,
95+
"startTime": "2025-05-15T17:23:57Z",
96+
"status": "COMPLETED",
97+
"actualValue": "",
98+
"expectedValue": ""
99+
},
100+
{
101+
"endTime": "2025-05-15T17:24:08Z",
102+
"errorCode": 0,
103+
"metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user's query.",
104+
"metricLabel": "completeness",
105+
"name": "completeness",
106+
"result": "FAILURE",
107+
"score": 0.2,
108+
"startTime": "2025-05-15T17:23:57Z",
109+
"status": "COMPLETED",
110+
"actualValue": "",
111+
"expectedValue": ""
112+
}
113+
],
114+
"startTime": "2024-01-01T00:00:00Z",
115+
"generatedData": {
116+
"actionsSequence": [],
117+
"outcome": "test outcome",
118+
"topic": "test topic"
119+
}
120+
}
121+
]
122+
}

0 commit comments

Comments
 (0)