Merge pull request #142 from salesforcecli/er/add-metrics-to-test-results-summary

shetzel · web-flow · commit 3af78c2b6cc6 · 2025-05-20T14:40:42.000-06:00
W-18507456: add Metric Pass percentage to test results summary
diff --git a/src/handleTestResults.ts b/src/handleTestResults.ts
@@ -72,6 +72,7 @@ export function humanFormat(results: AgentTestResultsResponse): string {
   const ux = new Ux();
 
   const tables: string[] = [];
+  const metricResults = [];
   for (const testCase of results.testCases) {
     let table = ux.makeTable({
       title: `${ansis.bold(`Test Case #${testCase.testNumber}`)}\n${ansis.dim('Utterance')}: ${
@@ -122,6 +123,8 @@ export function humanFormat(results: AgentTestResultsResponse): string {
         width: '100%',
       });
       tables.push(table);
+      // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
+      metricResults.push(...metrics);
     }
     // add a line break between end of the first table and the utterance of the next
     tables.push('\n');
@@ -145,6 +148,12 @@ export function humanFormat(results: AgentTestResultsResponse): string {
   }, 0);
   const outcomePassPercent = (outcomePassCount / results.testCases.length) * 100;
 
+  // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
+  const metricPassCount = metricResults.filter(
+    (f) => f.result === 'PASS' || f.name === 'output_latency_milliseconds'
+  ).length;
+  const metricPassPercent = metricResults.length > 0 ? (metricPassCount / metricResults.length) * 100 : 0;
+
   const final = {
     Status: results.status,
     Duration: results.endTime
@@ -153,6 +162,7 @@ export function humanFormat(results: AgentTestResultsResponse): string {
     'Topic Pass %': `${topicPassPercent.toFixed(2)}%`,
     'Action Pass %': `${actionPassPercent.toFixed(2)}%`,
     'Outcome Pass %': `${outcomePassPercent.toFixed(2)}%`,
+    ...(metricResults.length ? { 'Metric Pass %': `${metricPassPercent.toFixed(2)}%` } : {}),
   };
 
   const resultsTable = makeSimpleTable(final, ansis.bold.blue('Test Results'));
diff --git a/test/handleTestResults.test.ts b/test/handleTestResults.test.ts
@@ -81,3 +81,25 @@ describe('truncate', () => {
     expect(truncate(0, 0)).to.equal('0');
   });
 });
+
+describe('metric calculations', () => {
+  it('should handle test cases with no metrics', async () => {
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/4.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
+    const output = humanFormat(input);
+    expect(output).to.not.include('Metric Pass %');
+  });
+  it('should correctly calculate metric pass percentage', async () => {
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/5.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
+    const output = humanFormat(input);
+    expect(output).to.include('Metric Pass %   33.33%');
+  });
+
+  it('should handle test cases where all metrics fail', async () => {
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/6.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
+    const output = humanFormat(input);
+    expect(output).to.include('Metric Pass %   0.00%');
+  });
+});
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/5.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/5.json
@@ -0,0 +1,122 @@
+{
+  "status": "COMPLETED",
+  "startTime": "2025-01-07T12:00:00Z",
+  "endTime": "2025-01-07T12:00:10.35Z",
+  "errorMessage": null,
+  "subjectName": "test-subject",
+  "testCases": [
+    {
+      "testNumber": 1,
+      "status": "COMPLETED",
+      "inputs": { "utterance": "test1" },
+      "testResults": [
+        {
+          "actualValue": "Local_Events_Information",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "Local_Events_Information",
+          "metricLabel": "topic_assertion",
+          "name": "topic_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "actualValue": "[]",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "[]",
+          "metricLabel": "actions_assertion",
+          "name": "actions_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "actualValue": "Could you please specify the type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "Could you let me know what type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
+          "metricLabel": "output_validation",
+          "name": "output_validation",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user&#39;s query.",
+          "metricLabel": "completeness",
+          "name": "completeness",
+          "result": "FAILURE",
+          "score": 0.2,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        }
+      ],
+      "startTime": "2024-01-01T00:00:00Z",
+      "generatedData": {
+        "actionsSequence": [],
+        "outcome": "test outcome",
+        "topic": "test topic"
+      }
+    },
+    {
+      "testNumber": 2,
+      "status": "COMPLETED",
+      "inputs": { "utterance": "test2" },
+      "testResults": [
+        {
+          "actualValue": "Weather_and_Temperature_Information",
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "expectedValue": "Weather_and_Temperature_Information",
+          "metricLabel": "topic_assertion",
+          "name": "topic_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer is a short sentence that does not provide any weather information, and it does not seem to be related to the question. It is difficult to understand what the answer is trying to say, and it does not contain any grammar errors.",
+          "metricLabel": "coherence",
+          "name": "coherence",
+          "result": "FAILURE",
+          "score": 0.2,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user&#39;s query.",
+          "metricLabel": "completeness",
+          "name": "completeness",
+          "result": "PASS",
+          "score": 0.9,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        }
+      ],
+      "startTime": "2024-01-01T00:00:00Z",
+      "generatedData": {
+        "actionsSequence": [],
+        "outcome": "test outcome",
+        "topic": "test topic"
+      }
+    }
+  ]
+}
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/6.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results/6.json
@@ -0,0 +1,122 @@
+{
+  "status": "COMPLETED",
+  "startTime": "2025-01-07T12:00:00Z",
+  "endTime": "2025-01-07T12:00:10.35Z",
+  "errorMessage": null,
+  "subjectName": "test-subject",
+  "testCases": [
+    {
+      "testNumber": 1,
+      "status": "COMPLETED",
+      "inputs": { "utterance": "test1" },
+      "testResults": [
+        {
+          "actualValue": "Local_Events_Information",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "Local_Events_Information",
+          "metricLabel": "topic_assertion",
+          "name": "topic_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "actualValue": "[]",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "[]",
+          "metricLabel": "actions_assertion",
+          "name": "actions_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "actualValue": "Could you please specify the type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
+          "endTime": "2025-05-15T17:24:07Z",
+          "errorCode": 0,
+          "expectedValue": "Could you let me know what type of event you're interested in? For example, are you looking for cultural events, family-friendly activities, or something else?",
+          "metricLabel": "output_validation",
+          "name": "output_validation",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user&#39;s query.",
+          "metricLabel": "completeness",
+          "name": "completeness",
+          "result": "FAILURE",
+          "score": 0.2,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        }
+      ],
+      "startTime": "2024-01-01T00:00:00Z",
+      "generatedData": {
+        "actionsSequence": [],
+        "outcome": "test outcome",
+        "topic": "test topic"
+      }
+    },
+    {
+      "testNumber": 2,
+      "status": "COMPLETED",
+      "inputs": { "utterance": "test2" },
+      "testResults": [
+        {
+          "actualValue": "Weather_and_Temperature_Information",
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "expectedValue": "Weather_and_Temperature_Information",
+          "metricLabel": "topic_assertion",
+          "name": "topic_assertion",
+          "result": "PASS",
+          "score": 1,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED"
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer is a short sentence that does not provide any weather information, and it does not seem to be related to the question. It is difficult to understand what the answer is trying to say, and it does not contain any grammar errors.",
+          "metricLabel": "coherence",
+          "name": "coherence",
+          "result": "FAILURE",
+          "score": 0.2,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        },
+        {
+          "endTime": "2025-05-15T17:24:08Z",
+          "errorCode": 0,
+          "metricExplainability": "The answer does not provide any weather information, which is the main request. It only provides a general response without addressing the user&#39;s query.",
+          "metricLabel": "completeness",
+          "name": "completeness",
+          "result": "FAILURE",
+          "score": 0.2,
+          "startTime": "2025-05-15T17:23:57Z",
+          "status": "COMPLETED",
+          "actualValue": "",
+          "expectedValue": ""
+        }
+      ],
+      "startTime": "2024-01-01T00:00:00Z",
+      "generatedData": {
+        "actionsSequence": [],
+        "outcome": "test outcome",
+        "topic": "test topic"
+      }
+    }
+  ]
+}