diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 6bc0b2b728..60311ac719 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -1329,6 +1329,15 @@ def _resolve_evaluation_run_metrics(
     for metric_instance in metrics:
         if isinstance(metric_instance, types.EvaluationRunMetric):
             resolved_metrics_list.append(metric_instance)
+        elif isinstance(metric_instance, str) and metric_instance.startswith(
+            "projects/"
+        ):
+            resolved_metrics_list.append(
+                types.EvaluationRunMetric(
+                    metric=metric_instance.split("/")[-1],
+                    metric_resource_name=metric_instance,
+                )
+            )
         elif isinstance(
             metric_instance, _evals_metric_loaders.LazyLoadedPrebuiltMetric
         ):
@@ -1409,6 +1418,15 @@ def _resolve_metrics(
                 raise
         elif isinstance(metric_instance, types.Metric):
             resolved_metrics_list.append(metric_instance)
+        elif isinstance(metric_instance, str) and metric_instance.startswith(
+            "projects/"
+        ):
+            # Wrap the string in a Metric object to satisfy Pydantic validation
+            # and extract the ID for the 'name' property
+            metric_id = metric_instance.split("/")[-1]
+            resolved_metrics_list.append(
+                types.Metric(name=metric_id, metric_resource_name=metric_instance)
+            )
         else:
             try:
                 metric_name_str = str(metric_instance)
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 671471be81..7d9f1c340e 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -1027,7 +1027,7 @@ def get_metric_result(
         for attempt in range(_MAX_RETRIES):
             try:
                 api_response = self.module._evaluate_instances(
-                    metrics=[self.metric],
+                    metrics_sources=[self.metric],
                     instance=payload.get("instance"),
                     autorater_config=payload.get("autorater_config"),
                 )
@@ -1164,7 +1164,7 @@ def get_metric_result(
         for attempt in range(_MAX_RETRIES):
             try:
                 api_response = self.module._evaluate_instances(
-                    metrics=[self.metric],
+                    metrics_sources=[self.metric],
                     instance=payload.get("instance"),
                 )
                 break
@@ -1242,6 +1242,14 @@ def aggregate(
     )


+class RegisteredMetricHandler(PredefinedMetricHandler):
+    """Metric handler for registered metrics."""
+
+    def __init__(self, module: "evals.Evals", metric: types.Metric):
+        # Skip the parent check for SUPPORTED_PREDEFINED_METRICS
+        MetricHandler.__init__(self, module=module, metric=metric)
+
+
 _METRIC_HANDLER_MAPPING = [
     (
         lambda m: hasattr(m, "remote_custom_function") and m.remote_custom_function,
@@ -1251,6 +1259,10 @@ def aggregate(
         lambda m: m.custom_function and isinstance(m.custom_function, Callable),
         CustomMetricHandler,
     ),
+    (
+        lambda m: getattr(m, "metric_resource_name", None) is not None,
+        RegisteredMetricHandler,
+    ),
     (
         lambda m: m.name in ComputationMetricHandler.SUPPORTED_COMPUTATION_METRICS,
         ComputationMetricHandler,
diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py
index a7bad99bba..d7f1b10be0 100644
--- a/vertexai/_genai/_transformers.py
+++ b/vertexai/_genai/_transformers.py
@@ -31,12 +31,19 @@ def t_metrics(
     Args:
         metrics: A list of metrics used for evaluation.
         set_default_aggregation_metrics: Whether to set default aggregation metrics.
+
     Returns:
         A list of resolved metric payloads for the evaluation request.
     """
     metrics_payload = []
     for metric in metrics:
+        # Case 1: Registered Metric Resource Name
+        if isinstance(metric, str) and metric.startswith("projects/"):
+            metrics_payload.append({"metric_resource_name": metric})
+            continue
+
+        # Case 2: Inline Metric Configuration
         metric_payload_item: dict[str, Any] = {}
         metric_name = getv(metric, ["name"]).lower()
@@ -83,5 +90,23 @@ def t_metrics(
             raise ValueError(
                 f"Unsupported metric type or invalid metric name: {metric_name}"
             )
-        metrics_payload.append(metric_payload_item)
+        metrics_payload.append({"metric": metric_payload_item})
     return metrics_payload
+
+
+def t_metric_sources(metrics: list[Any]) -> list[dict[str, Any]]:
+    """Prepares the MetricSource payload for the evaluation request."""
+    sources_payload = []
+    for metric in metrics:
+        # Check if the 'metric' is a resource name string or contains one
+        resource_name = getattr(metric, "metric_resource_name", None)
+        if not resource_name and isinstance(metric, str) and metric.startswith("projects/"):
+            resource_name = metric
+
+        if resource_name:
+            sources_payload.append({"metric_resource_name": resource_name})
+        else:
+            # Fallback to existing Metric spec transformation
+            metric_payload = t_metrics([metric])[0]
+            sources_payload.append({"metric": metric_payload})
+    return sources_payload
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index ebff5e6366..82042bb124 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -238,11 +238,11 @@ def _EvaluateInstancesRequestParameters_to_vertex(
     if getv(from_object, ["autorater_config"]) is not None:
         setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"]))

-    if getv(from_object, ["metrics"]) is not None:
+    if getv(from_object, ["metrics_sources"]) is not None:
         setv(
             to_object,
-            ["metrics"],
-            [item for item in t.t_metrics(getv(from_object, ["metrics"]))],
+            ["metricSources"],
+            t.t_metric_sources(getv(from_object, ["metrics_sources"])),
         )

     if getv(from_object, ["instance"]) is not None:
@@ -399,6 +399,13 @@ def _EvaluationRunMetric_from_vertex(
         _UnifiedMetric_from_vertex(getv(from_object, ["metricConfig"]), to_object),
     )

+    if getv(from_object, ["metricResourceName"]) is not None:
+        setv(
+            to_object,
+            ["metric_resource_name"],
+            getv(from_object, ["metricResourceName"]),
+        )
+
     return to_object


@@ -417,6 +424,13 @@ def _EvaluationRunMetric_to_vertex(
         _UnifiedMetric_to_vertex(getv(from_object, ["metric_config"]), to_object),
     )

+    if getv(from_object, ["metric_resource_name"]) is not None:
+        setv(
+            to_object,
+            ["metricResourceName"],
+            getv(from_object, ["metric_resource_name"]),
+        )
+
     return to_object


@@ -512,6 +526,13 @@ def _GenerateInstanceRubricsRequest_to_vertex(
             ),
         )

+    if getv(from_object, ["metric_resource_name"]) is not None:
+        setv(
+            to_object,
+            ["metricResourceName"],
+            getv(from_object, ["metric_resource_name"]),
+        )
+
     if getv(from_object, ["config"]) is not None:
         setv(to_object, ["config"], getv(from_object, ["config"]))

@@ -976,7 +997,7 @@ def _evaluate_instances(
         ] = None,
         rubric_based_metric_input: Optional[types.RubricBasedMetricInputOrDict] = None,
         autorater_config: Optional[genai_types.AutoraterConfigOrDict] = None,
-        metrics: Optional[list[types.MetricOrDict]] = None,
+        metrics_sources: Optional[list[types.MetricOrDict]] = None,
         instance: Optional[types.EvaluationInstanceOrDict] = None,
         config: Optional[types.EvaluateInstancesConfigOrDict] = None,
     ) -> types.EvaluateInstancesResponse:
@@ -996,7 +1017,7 @@ def _evaluate_instances(
             tool_parameter_kv_match_input=tool_parameter_kv_match_input,
             rubric_based_metric_input=rubric_based_metric_input,
             autorater_config=autorater_config,
-            metrics=metrics,
+            metrics_sources=metrics_sources,
             instance=instance,
             config=config,
         )
@@ -1049,6 +1070,7 @@ def _generate_rubrics(
             types.PredefinedMetricSpecOrDict
         ] = None,
         rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
+        metric_resource_name: Optional[str] = None,
         config: Optional[types.RubricGenerationConfigOrDict] = None,
     ) -> types.GenerateInstanceRubricsResponse:
         """
@@ -1059,6 +1081,7 @@ def _generate_rubrics(
             contents=contents,
             predefined_rubric_generation_spec=predefined_rubric_generation_spec,
             rubric_generation_spec=rubric_generation_spec,
+            metric_resource_name=metric_resource_name,
             config=config,
         )

@@ -2230,7 +2253,7 @@ async def _evaluate_instances(
         ] = None,
         rubric_based_metric_input: Optional[types.RubricBasedMetricInputOrDict] = None,
         autorater_config: Optional[genai_types.AutoraterConfigOrDict] = None,
-        metrics: Optional[list[types.MetricOrDict]] = None,
+        metrics_sources: Optional[list[types.MetricOrDict]] = None,
         instance: Optional[types.EvaluationInstanceOrDict] = None,
         config: Optional[types.EvaluateInstancesConfigOrDict] = None,
     ) -> types.EvaluateInstancesResponse:
@@ -2250,7 +2273,7 @@ async def _evaluate_instances(
             tool_parameter_kv_match_input=tool_parameter_kv_match_input,
             rubric_based_metric_input=rubric_based_metric_input,
             autorater_config=autorater_config,
-            metrics=metrics,
+            metrics_sources=metrics_sources,
             instance=instance,
             config=config,
         )
@@ -2305,6 +2328,7 @@ async def _generate_rubrics(
             types.PredefinedMetricSpecOrDict
         ] = None,
         rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
+        metric_resource_name: Optional[str] = None,
         config: Optional[types.RubricGenerationConfigOrDict] = None,
     ) -> types.GenerateInstanceRubricsResponse:
         """
@@ -2315,6 +2339,7 @@ async def _generate_rubrics(
             contents=contents,
             predefined_rubric_generation_spec=predefined_rubric_generation_spec,
             rubric_generation_spec=rubric_generation_spec,
+            metric_resource_name=metric_resource_name,
             config=config,
         )

diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py
index 4a7512d0fe..f91a4f7aaa 100644
--- a/vertexai/_genai/types/common.py
+++ b/vertexai/_genai/types/common.py
@@ -2326,6 +2326,9 @@ class LLMBasedMetricSpec(_common.BaseModel):
         default=None,
         description="""Dynamically generate rubrics using this specification.""",
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None, description="""The resource name of the metric definition."""
+    )


 class LLMBasedMetricSpecDict(TypedDict, total=False):
@@ -2350,6 +2353,9 @@ class LLMBasedMetricSpecDict(TypedDict, total=False):
     rubric_generation_spec: Optional[RubricGenerationSpecDict]
     """Dynamically generate rubrics using this specification."""

+    metric_resource_name: Optional[str]
+    """The resource name of the metric definition."""
+

 LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict]


@@ -2482,6 +2488,9 @@ class EvaluationRunMetric(_common.BaseModel):
     metric_config: Optional[UnifiedMetric] = Field(
         default=None, description="""The unified metric used for evaluation run."""
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None, description="""The resource name of the metric definition."""
+    )


 class EvaluationRunMetricDict(TypedDict, total=False):
@@ -2493,6 +2502,9 @@ class EvaluationRunMetricDict(TypedDict, total=False):
     metric_config: Optional[UnifiedMetricDict]
     """The unified metric used for evaluation run."""

+    metric_resource_name: Optional[str]
+    """The resource name of the metric definition."""
+

 EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict]


@@ -4439,6 +4451,9 @@ class Metric(_common.BaseModel):
         default=None,
         description="""Optional steering instruction parameters for the automated predefined metric.""",
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None, description="""The resource name of the metric definition."""
+    )

     # Allow extra fields to support metric-specific config fields.
     model_config = ConfigDict(extra="allow")
@@ -4643,6 +4658,9 @@ class MetricDict(TypedDict, total=False):
     metric_spec_parameters: Optional[dict[str, Any]]
     """Optional steering instruction parameters for the automated predefined metric."""

+    metric_resource_name: Optional[str]
+    """The resource name of the metric definition."""
+

 MetricOrDict = Union[Metric, MetricDict]


@@ -4679,11 +4697,9 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
         default=None, description=""""""
     )
-    metrics: Optional[list[Metric]] = Field(
+    metrics_sources: Optional[list[Metric]] = Field(
         default=None,
-        description="""The metrics used for evaluation.
-      Currently, we only support evaluating a single metric. If multiple metrics
-      are provided, only the first one will be evaluated.""",
+        description="""The metrics used for evaluation. Each can be an inline configuration or a registered metric resource name.""",
     )
     instance: Optional[EvaluationInstance] = Field(
         default=None, description="""The instance to be evaluated."""
     )
@@ -4727,10 +4743,8 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
     autorater_config: Optional[genai_types.AutoraterConfigDict]
     """"""

-    metrics: Optional[list[MetricDict]]
-    """The metrics used for evaluation.
-      Currently, we only support evaluating a single metric. If multiple metrics
-      are provided, only the first one will be evaluated."""
+    metrics_sources: Optional[list[MetricDict]]
+    """The metrics used for evaluation. Each can be an inline configuration or a registered metric resource name."""

     instance: Optional[EvaluationInstanceDict]
     """The instance to be evaluated."""
@@ -5354,6 +5368,10 @@ class _GenerateInstanceRubricsRequest(_common.BaseModel):
         default=None,
         description="""Specification for how the rubrics should be generated.""",
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None,
+        description="""Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored.""",
+    )
     config: Optional[RubricGenerationConfig] = Field(default=None, description="""""")


@@ -5374,6 +5392,9 @@ class _GenerateInstanceRubricsRequestDict(TypedDict, total=False):
     rubric_generation_spec: Optional[RubricGenerationSpecDict]
     """Specification for how the rubrics should be generated."""

+    metric_resource_name: Optional[str]
+    """Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored."""
+
     config: Optional[RubricGenerationConfigDict]
     """"""
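
Usage sketch (not part of the patch): the new `t_metric_sources` transformer accepts either a bare registered-metric resource name or a `Metric` that carries `metric_resource_name`, and maps both to a `metric_resource_name` payload entry; any other metric falls back to the existing `t_metrics` spec transformation wrapped as `{"metric": ...}`. A minimal illustration under assumptions: the import style below and the resource name are illustrative only; the code in this change only relies on the `projects/` prefix and the trailing ID segment.

from vertexai._genai import _transformers as t
from vertexai._genai import types

# Illustrative resource name; only the "projects/" prefix and the final path
# segment (used as the metric ID) are significant to this change.
resource_name = "projects/my-project/locations/us-central1/metrics/my_metric"

payload = t.t_metric_sources(
    [
        resource_name,  # bare resource-name string
        types.Metric(name="my_metric", metric_resource_name=resource_name),
    ]
)
# Both entries resolve to {"metric_resource_name": resource_name}.
print(payload)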