18 changes: 18 additions & 0 deletions vertexai/_genai/_evals_common.py
@@ -1329,6 +1329,15 @@ def _resolve_evaluation_run_metrics(
for metric_instance in metrics:
if isinstance(metric_instance, types.EvaluationRunMetric):
resolved_metrics_list.append(metric_instance)
elif isinstance(metric_instance, str) and metric_instance.startswith(
"projects/"
):
resolved_metrics_list.append(
types.EvaluationRunMetric(
metric=metric_instance.split("/")[-1],
metric_resource_name=metric_instance,
)
)
elif isinstance(
metric_instance, _evals_metric_loaders.LazyLoadedPrebuiltMetric
):
@@ -1409,6 +1418,15 @@ def _resolve_metrics(
raise
elif isinstance(metric_instance, types.Metric):
resolved_metrics_list.append(metric_instance)
elif isinstance(metric_instance, str) and metric_instance.startswith(
"projects/"
):
# Wrap the string in a Metric object to satisfy Pydantic validation
# and extract the ID for the 'name' property
metric_id = metric_instance.split("/")[-1]
resolved_metrics_list.append(
types.Metric(name=metric_id, metric_resource_name=metric_instance)
)
else:
try:
metric_name_str = str(metric_instance)
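Both resolver branches added above follow the same pattern: a bare `projects/...` string is wrapped in the appropriate typed object, with the metric ID taken from the last segment of the resource path. A minimal sketch of that wrapping, assuming the `types` module as changed in this PR (the resource path itself is hypothetical):

```python
from vertexai._genai import types

# Hypothetical registered-metric resource name; the new branches only
# check for the "projects/" prefix.
resource_name = "projects/my-project/locations/us-central1/metrics/fluency"

# Mirrors the new elif branch in _resolve_metrics: the ID is the last
# path segment, and the full path is kept on metric_resource_name.
metric = types.Metric(
    name=resource_name.split("/")[-1],
    metric_resource_name=resource_name,
)

assert metric.name == "fluency"
assert metric.metric_resource_name == resource_name
```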
16 changes: 14 additions & 2 deletions vertexai/_genai/_evals_metric_handlers.py
@@ -1027,7 +1027,7 @@ def get_metric_result(
for attempt in range(_MAX_RETRIES):
try:
api_response = self.module._evaluate_instances(
-metrics=[self.metric],
+metrics_sources=[self.metric],
instance=payload.get("instance"),
autorater_config=payload.get("autorater_config"),
)
@@ -1164,7 +1164,7 @@ def get_metric_result(
for attempt in range(_MAX_RETRIES):
try:
api_response = self.module._evaluate_instances(
-metrics=[self.metric],
+metrics_sources=[self.metric],
instance=payload.get("instance"),
)
break
@@ -1242,6 +1242,14 @@ def aggregate(
)


class RegisteredMetricHandler(PredefinedMetricHandler):
"""Metric handler for registered metrics."""

def __init__(self, module: "evals.Evals", metric: types.Metric):
# Skip the parent check for SUPPORTED_PREDEFINED_METRICS
MetricHandler.__init__(self, module=module, metric=metric)


_METRIC_HANDLER_MAPPING = [
(
lambda m: hasattr(m, "remote_custom_function") and m.remote_custom_function,
@@ -1251,6 +1259,10 @@ def aggregate(
lambda m: m.custom_function and isinstance(m.custom_function, Callable),
CustomMetricHandler,
),
(
lambda m: getattr(m, "metric_resource_name", None) is not None,
RegisteredMetricHandler,
),
(
lambda m: m.name in ComputationMetricHandler.SUPPORTED_COMPUTATION_METRICS,
ComputationMetricHandler,
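Two details worth noting here. First, `RegisteredMetricHandler` deliberately calls `MetricHandler.__init__` rather than `PredefinedMetricHandler.__init__`, so a registered metric skips the `SUPPORTED_PREDEFINED_METRICS` check while still inheriting the predefined handler's result processing. Second, `_METRIC_HANDLER_MAPPING` is an ordered list and the first matching predicate wins, so the `metric_resource_name` check must sit above the name-based checks. A sketch of that first-match dispatch, with stand-in names (the real module's lookup is assumed to scan the list the same way, as the structure above implies):

```python
from typing import Any, Callable

# (predicate, handler) pairs, mirroring the shape of _METRIC_HANDLER_MAPPING.
HandlerMapping = list[tuple[Callable[[Any], bool], type]]

def pick_handler(metric: Any, mapping: HandlerMapping) -> type:
    """Return the handler class of the first predicate that matches."""
    for predicate, handler_cls in mapping:
        if predicate(metric):
            return handler_cls
    raise ValueError(f"Unsupported metric: {metric!r}")
```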
27 changes: 26 additions & 1 deletion vertexai/_genai/_transformers.py
@@ -31,12 +31,19 @@ def t_metrics(
Args:
metrics: A list of metrics used for evaluation.
set_default_aggregation_metrics: Whether to set default aggregation metrics.

Returns:
A list of resolved metric payloads for the evaluation request.
"""
metrics_payload = []

for metric in metrics:
# Case 1: Registered Metric Resource Name
if isinstance(metric, str) and metric.startswith("projects/"):
metrics_payload.append({"metric_resource_name": metric})
continue

# Case 2: Inline Metric Configuration
metric_payload_item: dict[str, Any] = {}

metric_name = getv(metric, ["name"]).lower()
@@ -83,5 +90,23 @@ def t_metrics(
raise ValueError(
f"Unsupported metric type or invalid metric name: {metric_name}"
)
-metrics_payload.append(metric_payload_item)
+metrics_payload.append({"metric": metric_payload_item})
return metrics_payload


def t_metric_sources(metrics: list[Any]) -> list[dict[str, Any]]:
"""Prepares the MetricSource payload for the evaluation request."""
sources_payload = []
for metric in metrics:
# Check if the 'metric' is a resource name string or contains one
resource_name = getattr(metric, "metric_resource_name", None)
if not resource_name and isinstance(metric, str) and metric.startswith("projects/"):
resource_name = metric

if resource_name:
sources_payload.append({"metric_resource_name": resource_name})
else:
# Fallback to existing Metric spec transformation
metric_payload = t_metrics([metric])[0]
sources_payload.append({"metric": metric_payload})
return sources_payload
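`t_metric_sources` emits one of two payload shapes per entry: a plain resource reference for registered metrics, or an inline spec produced by the existing `t_metrics` transformation. An illustration under the assumption that these dicts map directly onto the request JSON (module-private imports; the resource path is hypothetical):

```python
from vertexai._genai import types
from vertexai._genai._transformers import t_metric_sources

# Registered metric: the bare string (or a Metric carrying
# metric_resource_name) becomes a resource reference.
t_metric_sources(["projects/p/locations/l/metrics/fluency"])
# -> [{"metric_resource_name": "projects/p/locations/l/metrics/fluency"}]

# Inline metric: falls back to t_metrics for the spec transformation,
# wrapped under the "metric" key.
t_metric_sources([types.Metric(name="exact_match")])
# -> [{"metric": t_metrics([types.Metric(name="exact_match")])[0]}]
```

Note that `t_metrics` itself now wraps each item under a `"metric"` key (see the change above), so the fallback path yields `{"metric": {"metric": ...}}`; it is worth confirming that the MetricSource message expects that nesting.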
39 changes: 32 additions & 7 deletions vertexai/_genai/evals.py
@@ -238,11 +238,11 @@ def _EvaluateInstancesRequestParameters_to_vertex(
if getv(from_object, ["autorater_config"]) is not None:
setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"]))

if getv(from_object, ["metrics"]) is not None:
if getv(from_object, ["metrics_sources"]) is not None:
setv(
to_object,
["metrics"],
[item for item in t.t_metrics(getv(from_object, ["metrics"]))],
["metricSources"],
t.t_metric_sources(getv(from_object, ["metrics_sources"])),
)

if getv(from_object, ["instance"]) is not None:
@@ -399,6 +399,13 @@ def _EvaluationRunMetric_from_vertex(
_UnifiedMetric_from_vertex(getv(from_object, ["metricConfig"]), to_object),
)

if getv(from_object, ["metricResourceName"]) is not None:
setv(
to_object,
["metric_resource_name"],
getv(from_object, ["metricResourceName"]),
)

return to_object


@@ -417,6 +424,13 @@ def _EvaluationRunMetric_to_vertex(
_UnifiedMetric_to_vertex(getv(from_object, ["metric_config"]), to_object),
)

if getv(from_object, ["metric_resource_name"]) is not None:
setv(
to_object,
["metricResourceName"],
getv(from_object, ["metric_resource_name"]),
)

return to_object


@@ -512,6 +526,13 @@ def _GenerateInstanceRubricsRequest_to_vertex(
),
)

if getv(from_object, ["metric_resource_name"]) is not None:
setv(
to_object,
["metricResourceName"],
getv(from_object, ["metric_resource_name"]),
)

if getv(from_object, ["config"]) is not None:
setv(to_object, ["config"], getv(from_object, ["config"]))

@@ -976,7 +997,7 @@ def _evaluate_instances(
] = None,
rubric_based_metric_input: Optional[types.RubricBasedMetricInputOrDict] = None,
autorater_config: Optional[genai_types.AutoraterConfigOrDict] = None,
-metrics: Optional[list[types.MetricOrDict]] = None,
+metrics_sources: Optional[list[types.MetricOrDict]] = None,
instance: Optional[types.EvaluationInstanceOrDict] = None,
config: Optional[types.EvaluateInstancesConfigOrDict] = None,
) -> types.EvaluateInstancesResponse:
@@ -996,7 +1017,7 @@ def _evaluate_instances(
tool_parameter_kv_match_input=tool_parameter_kv_match_input,
rubric_based_metric_input=rubric_based_metric_input,
autorater_config=autorater_config,
-metrics=metrics,
+metrics_sources=metrics_sources,
instance=instance,
config=config,
)
@@ -1049,6 +1070,7 @@ def _generate_rubrics(
types.PredefinedMetricSpecOrDict
] = None,
rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
metric_resource_name: Optional[str] = None,
config: Optional[types.RubricGenerationConfigOrDict] = None,
) -> types.GenerateInstanceRubricsResponse:
"""
@@ -1059,6 +1081,7 @@ def _generate_rubrics(
contents=contents,
predefined_rubric_generation_spec=predefined_rubric_generation_spec,
rubric_generation_spec=rubric_generation_spec,
metric_resource_name=metric_resource_name,
config=config,
)

@@ -2230,7 +2253,7 @@ async def _evaluate_instances(
] = None,
rubric_based_metric_input: Optional[types.RubricBasedMetricInputOrDict] = None,
autorater_config: Optional[genai_types.AutoraterConfigOrDict] = None,
-metrics: Optional[list[types.MetricOrDict]] = None,
+metrics_sources: Optional[list[types.MetricOrDict]] = None,
instance: Optional[types.EvaluationInstanceOrDict] = None,
config: Optional[types.EvaluateInstancesConfigOrDict] = None,
) -> types.EvaluateInstancesResponse:
@@ -2250,7 +2273,7 @@ async def _evaluate_instances(
tool_parameter_kv_match_input=tool_parameter_kv_match_input,
rubric_based_metric_input=rubric_based_metric_input,
autorater_config=autorater_config,
-metrics=metrics,
+metrics_sources=metrics_sources,
instance=instance,
config=config,
)
@@ -2305,6 +2328,7 @@ async def _generate_rubrics(
types.PredefinedMetricSpecOrDict
] = None,
rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
metric_resource_name: Optional[str] = None,
config: Optional[types.RubricGenerationConfigOrDict] = None,
) -> types.GenerateInstanceRubricsResponse:
"""
@@ -2315,6 +2339,7 @@ async def _generate_rubrics(
contents=contents,
predefined_rubric_generation_spec=predefined_rubric_generation_spec,
rubric_generation_spec=rubric_generation_spec,
metric_resource_name=metric_resource_name,
config=config,
)

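On the client surface, `_evaluate_instances` (sync and async) now takes `metrics_sources`, and `_generate_rubrics` accepts `metric_resource_name`. A hedged usage sketch, assuming the `vertexai.Client` entry point; these are private helpers, and the resource path, instance fields, and content shape are illustrative:

```python
import vertexai
from vertexai._genai import types

client = vertexai.Client(project="my-project", location="us-central1")

# Hypothetical registered-metric resource name.
METRIC = "projects/my-project/locations/us-central1/metrics/fluency"

# Express the registered metric as a types.Metric, matching what the
# metric handlers pass via metrics_sources=[self.metric].
registered = types.Metric(name="fluency", metric_resource_name=METRIC)

response = client.evals._evaluate_instances(
    metrics_sources=[registered],
    instance={"prompt": "Say hi.", "response": "Hi there!"},
)

# Rubric generation driven by the registered metric's stored definition;
# per the request field description below, inline rubric specs are
# ignored when metric_resource_name is set.
rubrics = client.evals._generate_rubrics(
    contents=[{"role": "user", "parts": [{"text": "Summarize the doc."}]}],
    metric_resource_name=METRIC,
)
```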
37 changes: 29 additions & 8 deletions vertexai/_genai/types/common.py
@@ -2326,6 +2326,9 @@ class LLMBasedMetricSpec(_common.BaseModel):
default=None,
description="""Dynamically generate rubrics using this specification.""",
)
metric_resource_name: Optional[str] = Field(
default=None, description="""The resource name of the metric definition."""
)


class LLMBasedMetricSpecDict(TypedDict, total=False):
@@ -2350,6 +2353,9 @@ class LLMBasedMetricSpecDict(TypedDict, total=False):
rubric_generation_spec: Optional[RubricGenerationSpecDict]
"""Dynamically generate rubrics using this specification."""

metric_resource_name: Optional[str]
"""The resource name of the metric definition."""


LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict]

@@ -2482,6 +2488,9 @@ class EvaluationRunMetric(_common.BaseModel):
metric_config: Optional[UnifiedMetric] = Field(
default=None, description="""The unified metric used for evaluation run."""
)
metric_resource_name: Optional[str] = Field(
default=None, description="""The resource name of the metric definition."""
)


class EvaluationRunMetricDict(TypedDict, total=False):
@@ -2493,6 +2502,9 @@ class EvaluationRunMetricDict(TypedDict, total=False):
metric_config: Optional[UnifiedMetricDict]
"""The unified metric used for evaluation run."""

metric_resource_name: Optional[str]
"""The resource name of the metric definition."""


EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict]

@@ -4439,6 +4451,9 @@ class Metric(_common.BaseModel):
default=None,
description="""Optional steering instruction parameters for the automated predefined metric.""",
)
metric_resource_name: Optional[str] = Field(
default=None, description="""The resource name of the metric definition."""
)

# Allow extra fields to support metric-specific config fields.
model_config = ConfigDict(extra="allow")
@@ -4643,6 +4658,9 @@ class MetricDict(TypedDict, total=False):
metric_spec_parameters: Optional[dict[str, Any]]
"""Optional steering instruction parameters for the automated predefined metric."""

metric_resource_name: Optional[str]
"""The resource name of the metric definition."""


MetricOrDict = Union[Metric, MetricDict]

@@ -4679,11 +4697,9 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
default=None, description=""""""
)
-metrics: Optional[list[Metric]] = Field(
+metrics_sources: Optional[list[Metric]] = Field(
default=None,
description="""The metrics used for evaluation.
Currently, we only support evaluating a single metric. If multiple metrics
are provided, only the first one will be evaluated.""",
description="""The metrics used for evaluation. Each can be an inline configuration or a registered metric resource name.""",
)
instance: Optional[EvaluationInstance] = Field(
default=None, description="""The instance to be evaluated."""
@@ -4727,10 +4743,8 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
autorater_config: Optional[genai_types.AutoraterConfigDict]
""""""

-metrics: Optional[list[MetricDict]]
-"""The metrics used for evaluation.
-Currently, we only support evaluating a single metric. If multiple metrics
-are provided, only the first one will be evaluated."""
+metrics_sources: Optional[list[MetricDict]]
+"""The metrics used for evaluation. Each can be an inline configuration or a registered metric resource name."""

instance: Optional[EvaluationInstanceDict]
"""The instance to be evaluated."""
@@ -5354,6 +5368,10 @@ class _GenerateInstanceRubricsRequest(_common.BaseModel):
default=None,
description="""Specification for how the rubrics should be generated.""",
)
metric_resource_name: Optional[str] = Field(
default=None,
description="""Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored.""",
)
config: Optional[RubricGenerationConfig] = Field(default=None, description="""""")


@@ -5374,6 +5392,9 @@ class _GenerateInstanceRubricsRequestDict(TypedDict, total=False):
rubric_generation_spec: Optional[RubricGenerationSpecDict]
"""Specification for how the rubrics should be generated."""

metric_resource_name: Optional[str]
"""Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored."""

config: Optional[RubricGenerationConfigDict]
""""""

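Taken together, the type additions expose `metric_resource_name` at every layer: `LLMBasedMetricSpec`, `EvaluationRunMetric`, `Metric`, and the two request parameter models. A sketch of the two ways an evaluation run metric can now be specified (the resource path is hypothetical):

```python
from vertexai._genai import types

# Inline configuration (existing path): name plus a UnifiedMetric config.
inline = types.EvaluationRunMetric(
    metric="fluency",
    # metric_config=types.UnifiedMetric(...),  # inline spec, unchanged
)

# Registered metric (new path): reference the stored definition instead
# of configuring the metric inline.
registered = types.EvaluationRunMetric(
    metric="fluency",
    metric_resource_name="projects/p/locations/l/metrics/fluency",
)
```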