Skip to content

Commit 7a6f68e

Browse files
authored
make ollamaInferenceEngine handle return_meta_data (#1956)
* make ollamaInferenceEngine handle return_meta_data
  Signed-off-by: lilacheden <lilach.edel@gmail.com>
* fix wml inference tests to use supported models
  Signed-off-by: lilacheden <lilach.edel@gmail.com>
* more model fixes to test_inference_engine
  Signed-off-by: lilacheden <lilach.edel@gmail.com>
* allow small diff in metric test
  Signed-off-by: lilacheden <lilach.edel@gmail.com>
---------
Signed-off-by: lilacheden <lilach.edel@gmail.com>
1 parent 3cab0a5 commit 7a6f68e

3 files changed

Lines changed: 17 additions & 6 deletions

File tree

src/unitxt/inference.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1461,7 +1461,18 @@ def _infer(
14611461
options=args,
14621462
)
14631463
results.append(response)
1464-
1464+
if return_meta_data:
1465+
return [
1466+
TextGenerationInferenceOutput(
1467+
prediction=element["message"]["content"],
1468+
generated_text=element["message"]["content"],
1469+
input_tokens=element.get("prompt_eval_count", 0),
1470+
output_tokens=element.get("eval_count", 0),
1471+
model_name=self.model,
1472+
inference_type=self.label,
1473+
)
1474+
for element in results
1475+
]
14651476
return [element["message"]["content"] for element in results]
14661477

14671478

tests/inference/test_inference_engine.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def test_llava_inference_engine(self):
159159

160160
def test_watsonx_inference(self):
161161
model = WMLInferenceEngineGeneration(
162-
model_name="google/flan-t5-xl",
162+
model_name="ibm/granite-3-8b-instruct",
163163
data_classification_policy=["public"],
164164
random_seed=111,
165165
min_new_tokens=1,
@@ -193,7 +193,7 @@ def test_watsonx_inference_with_external_client(self):
193193
from ibm_watsonx_ai.client import APIClient, Credentials
194194

195195
model = WMLInferenceEngineGeneration(
196-
model_name="google/flan-t5-xl",
196+
model_name="ibm/granite-3-8b-instruct",
197197
data_classification_policy=["public"],
198198
random_seed=111,
199199
min_new_tokens=1,
@@ -279,7 +279,7 @@ def test_option_selecting_by_log_prob_inference_engines(self):
279279
]
280280

281281
watsonx_engine = WMLInferenceEngineGeneration(
282-
model_name="meta-llama/llama-3-2-1b-instruct"
282+
model_name="ibm/granite-3-8b-instruct"
283283
)
284284

285285
for engine in [watsonx_engine]:
@@ -383,7 +383,7 @@ def test_lite_llm_inference_engine(self):
383383

384384
def test_lite_llm_inference_engine_without_task_data_not_failing(self):
385385
LiteLLMInferenceEngine(
386-
model="watsonx/meta-llama/llama-3-2-1b-instruct",
386+
model="watsonx/meta-llama/llama-3-2-11b-vision-instruct",
387387
max_tokens=2,
388388
temperature=0,
389389
top_p=1,

tests/library/test_metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2708,7 +2708,7 @@ def test_perplexity(self):
27082708
metric=perplexity_question, predictions=prediction, references=references
27092709
)
27102710
self.assertAlmostEqual(
2711-
first_instance_target, outputs[0]["score"]["instance"]["score"]
2711+
first_instance_target, outputs[0]["score"]["instance"]["score"], places=5
27122712
)
27132713

27142714
def test_fuzzyner(self):

0 commit comments

Comments (0)