Eval utilities tweaks (#20)

peadaroh · fern-api[bot] · web-flow · commit 12d36660ba4f · 2024-10-13T16:17:21.000-04:00
Extends the base Evaluations client with a `run` method to avoid type errors, updates the local runner to use the datapoint dict instead of the Pydantic object, and tweaks some of the warning and log message wording.
---------

Co-authored-by: fern-api <115122769+fern-api[bot]@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ client.prompts.log(
     messages=[{"role": "user", "content": "What really happened at Roswell?"}],
     inputs={"person": "Trump"},
     created_at=datetime.datetime.fromisoformat(
-        "2024-07-19 00:29:35.178000+00:00",
+        "2024-07-18 23:29:35.178000+00:00",
     ),
     provider_latency=6.5931549072265625,
     output_message={
@@ -88,7 +88,7 @@ async def main() -> None:
         ],
         inputs={"person": "Trump"},
         created_at=datetime.datetime.fromisoformat(
-            "2024-07-19 00:29:35.178000+00:00",
+            "2024-07-18 23:29:35.178000+00:00",
         ),
         provider_latency=6.5931549072265625,
         output_message={
diff --git a/reference.md b/reference.md
@@ -56,7 +56,7 @@ client.prompts.log(
     messages=[{"role": "user", "content": "What really happened at Roswell?"}],
     inputs={"person": "Trump"},
     created_at=datetime.datetime.fromisoformat(
-        "2024-07-19 00:29:35.178000+00:00",
+        "2024-07-18 23:29:35.178000+00:00",
     ),
     provider_latency=6.5931549072265625,
     output_message={
@@ -6258,10 +6258,10 @@ client.flows.log(
     output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
     trace_status="incomplete",
     start_time=datetime.datetime.fromisoformat(
-        "2024-07-08 22:40:35+00:00",
+        "2024-07-08 21:40:35+00:00",
     ),
     end_time=datetime.datetime.fromisoformat(
-        "2024-07-08 22:40:39+00:00",
+        "2024-07-08 21:40:39+00:00",
     ),
 )
 
diff --git a/src/humanloop/client.py b/src/humanloop/client.py
@@ -1,11 +1,46 @@
 import typing
+from typing import Optional, List, Sequence
 import os
 import httpx
-from functools import partial
 
 from .base_client import BaseHumanloop, AsyncBaseHumanloop
 from .environment import HumanloopEnvironment
-from .eval_utils import _run_eval
+from .eval_utils import _run_eval, Dataset, File, Evaluator, EvaluatorCheck
+from .base_client import EvaluationsClient
+
+class ExtendedEvalsClient(EvaluationsClient):
+
+    client: BaseHumanloop
+
+    def run(
+        self,
+        file: File,
+        name: Optional[str],
+        dataset: Dataset,
+        evaluators: Optional[Sequence[Evaluator]] = None,
+        # logs: typing.Sequence[dict] | None = None,
+        workers: int = 4,
+    ) -> List[EvaluatorCheck]:
+        """
+        Evaluate your function for a given `Dataset` and set of `Evaluators`.
+        :param file: the Humanloop file being evaluated, including a function to run over the dataset.
+        :param name: the name of the Evaluation to run. If it does not exist, a new Evaluation will be created under your File.
+        :param dataset: the dataset to map your function over to produce the outputs required by the Evaluation.
+        :param evaluators: define how judgments are provided for this Evaluation.
+        :param workers: the number of threads to process datapoints using your function concurrently.
+        :return: per Evaluator checks.
+        """
+        if self.client is None:
+            raise ValueError("Need Humanloop client defined to run evals")
+
+        return _run_eval(
+            client=self.client,
+            file=file,
+            name=name,
+            dataset=dataset,
+            evaluators=evaluators,
+            workers=workers,
+        )
 
 
 class Humanloop(BaseHumanloop):
@@ -37,7 +72,9 @@ def __init__(
             follow_redirects=follow_redirects,
             httpx_client=httpx_client,
         )
-        self.evaluations.run = partial(_run_eval, client=self)  # type: ignore[attr-defined]
+        eval_client = ExtendedEvalsClient(client_wrapper=self._client_wrapper)
+        eval_client.client = self
+        self.evaluations = eval_client
 
 
 class AsyncHumanloop(AsyncBaseHumanloop):
diff --git a/src/humanloop/eval_utils.py b/src/humanloop/eval_utils.py
@@ -205,7 +205,6 @@ def _run_eval(
         try:
             Flow.parse_obj(version)
         except ValidationError:
-            logger.warning("Invalid Flow `version` in your `file` request. Setting your version payload as flow `attributes`.")
             flow_version = {"attributes": version}
             file_dict = {**file, **flow_version}
         hl_file = client.flows.upsert(**file_dict)
@@ -307,11 +306,12 @@ def _run_eval(
     # Define the function to execute your function in parallel and Log to Humanloop
     def process_datapoint(datapoint: Datapoint):
         start_time = datetime.now()
+        datapoint_dict = datapoint.dict()
         try:
-            if datapoint.messages:
-                output = function_(**datapoint.inputs, messages=datapoint.messages)
+            if "messages" in datapoint_dict:
+                output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"])
             else:
-                output = function_(**datapoint.inputs)
+                output = function_(**datapoint_dict["inputs"])
             if custom_logger:
                 log = function_(client=client, output=output)
             else:
@@ -340,7 +340,7 @@ def process_datapoint(datapoint: Datapoint):
                 start_time = datetime.now()
                 eval_function = local_evaluator["callable"]
                 if local_evaluator["args_type"] == "target_required":
-                    judgment = eval_function(log.dict(), datapoint.target)
+                    judgment = eval_function(log.dict(), datapoint_dict["target"])
                 else:
                     judgment = eval_function(log.dict())
 
@@ -375,7 +375,7 @@ def process_datapoint(datapoint: Datapoint):
 
     # Generate locally if a file `callable` is provided
     if function_:
-        logger.info(f"{CYAN}\nRunning {hl_file.name} {type_} callable over {hl_dataset.name}{RESET} Dataset using {workers} workers")
+        logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET} ")
         completed_tasks = 0
         with ThreadPoolExecutor(max_workers=workers) as executor:
             futures = [
@@ -387,7 +387,7 @@ def process_datapoint(datapoint: Datapoint):
                 _progress_bar(total_datapoints, completed_tasks)
     else:
         # TODO: trigger run when updated API is available
-        logger.info(f"{CYAN}\nRunning {type_} {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
+        logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
 
     # Wait for the Evaluation to complete then print the results
     complete = False
@@ -471,7 +471,7 @@ def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Un
     elif isinstance(stat, NumericStats):
         score = round(stat.mean, 2)
     else:
-        raise ValueError("Invalid Evaluator Stat type.")
+        pass
     return score
 
 
diff --git a/src/humanloop/flows/client.py b/src/humanloop/flows/client.py
@@ -197,10 +197,10 @@ def log(
             output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
             trace_status="incomplete",
             start_time=datetime.datetime.fromisoformat(
-                "2024-07-08 22:40:35+00:00",
+                "2024-07-08 21:40:35+00:00",
             ),
             end_time=datetime.datetime.fromisoformat(
-                "2024-07-08 22:40:39+00:00",
+                "2024-07-08 21:40:39+00:00",
             ),
         )
         """
@@ -1366,10 +1366,10 @@ async def main() -> None:
                 output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
                 trace_status="incomplete",
                 start_time=datetime.datetime.fromisoformat(
-                    "2024-07-08 22:40:35+00:00",
+                    "2024-07-08 21:40:35+00:00",
                 ),
                 end_time=datetime.datetime.fromisoformat(
-                    "2024-07-08 22:40:39+00:00",
+                    "2024-07-08 21:40:39+00:00",
                 ),
             )
 
diff --git a/src/humanloop/prompts/client.py b/src/humanloop/prompts/client.py
@@ -236,7 +236,7 @@ def log(
             messages=[{"role": "user", "content": "What really happened at Roswell?"}],
             inputs={"person": "Trump"},
             created_at=datetime.datetime.fromisoformat(
-                "2024-07-19 00:29:35.178000+00:00",
+                "2024-07-18 23:29:35.178000+00:00",
             ),
             provider_latency=6.5931549072265625,
             output_message={
@@ -2117,7 +2117,7 @@ async def main() -> None:
                 ],
                 inputs={"person": "Trump"},
                 created_at=datetime.datetime.fromisoformat(
-                    "2024-07-19 00:29:35.178000+00:00",
+                    "2024-07-18 23:29:35.178000+00:00",
                 ),
                 provider_latency=6.5931549072265625,
                 output_message={