From b7cb1aa6f53921cb0484d95cb7385cb7308a0f13 Mon Sep 17 00:00:00 2001
From: Chris Hagglund
Date: Wed, 29 Apr 2026 10:24:28 -0600
Subject: [PATCH 1/7] implement canonical metrics, feature-flagged to preserve
 legacy metric names where they were already released

---
 harness/main.py                               |  23 +-
 harness/manifests/deployment.yaml             |   4 +
 .../client/automator/async_task_runner.py     |  23 +-
 .../client/automator/task_handler.py          |  16 +-
 src/conductor/client/automator/task_runner.py |  23 +-
 .../settings/metrics_settings.py              |  19 +-
 .../client/orkes/orkes_base_client.py         |   5 +-
 .../client/orkes/orkes_workflow_client.py     |  44 +-
 src/conductor/client/orkes_clients.py         |   7 +-
 .../telemetry/canonical_metrics_collector.py  | 215 ++++
 .../telemetry/legacy_metrics_collector.py     | 304 ++++++
 .../client/telemetry/metrics_collector.py     | 947 +-----------------
 .../telemetry/metrics_collector_base.py       | 467 +++++++++
 .../client/telemetry/metrics_factory.py       |  45 +
 .../telemetry/model/metric_documentation.py   |   9 +
 .../client/telemetry/model/metric_label.py    |   3 +-
 .../client/telemetry/model/metric_name.py     |   9 +
 .../workflow/executor/workflow_executor.py    |   6 +-
 .../test_canonical_metrics_collector.py       | 216 ++++
 tests/unit/telemetry/test_metrics_factory.py  | 117 +++
 20 files changed, 1554 insertions(+), 948 deletions(-)
 create mode 100644 src/conductor/client/telemetry/canonical_metrics_collector.py
 create mode 100644 src/conductor/client/telemetry/legacy_metrics_collector.py
 create mode 100644 src/conductor/client/telemetry/metrics_collector_base.py
 create mode 100644 src/conductor/client/telemetry/metrics_factory.py
 create mode 100644 tests/unit/telemetry/test_canonical_metrics_collector.py
 create mode 100644 tests/unit/telemetry/test_metrics_factory.py

diff --git a/harness/main.py b/harness/main.py
index 28a92b3df..2be7d8e6c 100644
--- a/harness/main.py
+++ b/harness/main.py
@@ -9,6 +9,7 @@
 from conductor.client.configuration.settings.metrics_settings import MetricsSettings
 from conductor.client.http.models.task_def import TaskDef
 from conductor.client.orkes_clients import OrkesClients
+from conductor.client.telemetry.metrics_factory import create_metrics_collector
 from conductor.client.workflow.conductor_workflow import ConductorWorkflow
 from conductor.client.workflow.task.simple_task import SimpleTask

@@ -67,7 +68,16 @@ def register_metadata(clients: OrkesClients) -> None:

 def main() -> None:
     configuration = Configuration()
-    clients = OrkesClients(configuration)
+
+    metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991)
+    metrics_settings = MetricsSettings(http_port=metrics_port)
+    print(f"Prometheus metrics will be served on port {metrics_port}")
+
+    # Main-process MetricsCollector: writes workflow-client / HTTP metrics into
+    # the shared PROMETHEUS_MULTIPROC_DIR so they merge with worker subprocess
+    # metrics in the exported /metrics payload.
+    metrics_collector = create_metrics_collector(metrics_settings)
+    clients = OrkesClients(configuration, metrics_collector=metrics_collector)

     register_metadata(clients)

@@ -80,10 +90,6 @@ def main() -> None:
         worker = SimulatedTaskWorker(task_name, codename, sleep_seconds, batch_size, poll_interval_ms)
         workers.append(worker)

-    metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991)
-    metrics_settings = MetricsSettings(http_port=metrics_port)
-    print(f"Prometheus metrics will be served on port {metrics_port}")
-
     task_handler = TaskHandler(
         workers=workers,
         configuration=configuration,
@@ -93,7 +99,6 @@ def main() -> None:

     workflow_executor = clients.get_workflow_executor()
     governor = WorkflowGovernor(workflow_executor, WORKFLOW_NAME, workflows_per_sec)
-    governor.start()

     main_pid = os.getpid()
     shutting_down = False
@@ -111,7 +116,13 @@ def shutdown(signum, frame):
     signal.signal(signal.SIGINT, shutdown)
     signal.signal(signal.SIGTERM, shutdown)

+    # Fork worker processes BEFORE starting the governor thread. The parent's
+    # MetricsCollector uses prometheus_client's multiprocess mode, which guards
+    # its mmapped value files with a threading.Lock. If the governor thread is
+    # already running at fork() time, a child can inherit that lock in a held
+    # state and deadlock on its first metric write.
     task_handler.start_processes()
+    governor.start()

     task_handler.join_processes()
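The comment in `main()` leans on prometheus_client's multiprocess mode. A minimal sketch of what that merging looks like from the outside (the directory path is illustrative, not from this patch):

```python
# Any process that shares PROMETHEUS_MULTIPROC_DIR can render a merged view of
# every writer's samples. The env var must be set before prometheus_client is
# imported.
import os
os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/conductor-metrics"  # illustrative path

from prometheus_client import CollectorRegistry, generate_latest
from prometheus_client.multiprocess import MultiProcessCollector

registry = CollectorRegistry()
MultiProcessCollector(registry)  # aggregates the per-pid .db files in the directory
print(generate_latest(registry).decode())
```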
diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml
index 048559faf..0043716b0 100644
--- a/harness/manifests/deployment.yaml
+++ b/harness/manifests/deployment.yaml
@@ -51,6 +51,10 @@ spec:
             - name: HARNESS_POLL_INTERVAL_MS
               value: "100"

+            # === METRICS MODE ===
+            - name: WORKER_CANONICAL_METRICS
+              value: "true"
+
           ports:
             - name: metrics
               containerPort: 9991
diff --git a/src/conductor/client/automator/async_task_runner.py b/src/conductor/client/automator/async_task_runner.py
index ba3e36534..116c6ce36 100644
--- a/src/conductor/client/automator/async_task_runner.py
+++ b/src/conductor/client/automator/async_task_runner.py
@@ -27,7 +27,7 @@
 from conductor.client.http.rest import AuthorizationException, ApiException
 from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient
 from conductor.client.orkes.orkes_schema_client import OrkesSchemaClient
-from conductor.client.telemetry.metrics_collector import MetricsCollector
+from conductor.client.telemetry.metrics_factory import create_metrics_collector
 from conductor.client.worker.worker_interface import WorkerInterface
 from conductor.client.worker.worker_config import resolve_worker_config, get_worker_config_oneline
 from conductor.client.worker.exception import NonRetryableException
@@ -88,7 +88,7 @@ def __init__(

         self.metrics_collector = None
         if metrics_settings is not None:
-            self.metrics_collector = MetricsCollector(
+            self.metrics_collector = create_metrics_collector(
                 metrics_settings
             )
             # Register metrics collector as event listener
@@ -863,6 +863,7 @@ async def __async_update_task(self, task_result: TaskResult):
             if attempt > 0:
                 # Exponential backoff: [10s, 20s, 30s] before retry
                 await asyncio.sleep(attempt * 10)
+            update_start = time.time()
             try:
                 if self._use_update_v2:
                     next_task = await self.async_task_client.update_task_v2(body=task_result)
@@ -873,6 +874,10 @@ async def __async_update_task(self, task_result: TaskResult):
                         task_definition_name,
                         next_task.task_id if next_task else None
                     )
+                    if self.metrics_collector is not None:
+                        self.metrics_collector.record_task_update_time(
+                            task_definition_name, time.time() - update_start, status="SUCCESS"
+                        )
                     return next_task
                 else:
                     await self.async_task_client.update_task(body=task_result)
@@ -882,6 +887,10 @@ async def __async_update_task(self, task_result: TaskResult):
                         task_result.workflow_instance_id,
                         task_definition_name,
                     )
+                    if self.metrics_collector is not None:
+                        self.metrics_collector.record_task_update_time(
+                            task_definition_name, time.time() - update_start, status="SUCCESS"
+                        )
                     return None
             except ApiException as e:
                 if e.status in (404, 405) and self._use_update_v2:
@@ -895,12 +904,19 @@ async def __async_update_task(self, task_result: TaskResult):
                     # Retry immediately with v1
                     try:
                         await self.async_task_client.update_task(body=task_result)
+                        if self.metrics_collector is not None:
+                            self.metrics_collector.record_task_update_time(
+                                task_definition_name, time.time() - update_start, status="SUCCESS"
+                            )
                         return None
                     except Exception as fallback_e:
                         last_exception = fallback_e
                         continue
                 last_exception = e
                 if self.metrics_collector is not None:
+                    self.metrics_collector.record_task_update_time(
+                        task_definition_name, time.time() - update_start, status="FAILURE"
+                    )
                     self.metrics_collector.increment_task_update_error(
                         task_definition_name, type(e)
                     )
@@ -917,6 +933,9 @@ async def __async_update_task(self, task_result: TaskResult):
             except Exception as e:
                 last_exception = e
                 if self.metrics_collector is not None:
+                    self.metrics_collector.record_task_update_time(
+                        task_definition_name, time.time() - update_start, status="FAILURE"
+                    )
                     self.metrics_collector.increment_task_update_error(
                         task_definition_name, type(e)
                     )
diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py
index 2e3616f11..6a8255bd0 100644
--- a/src/conductor/client/automator/task_handler.py
+++ b/src/conductor/client/automator/task_handler.py
@@ -18,7 +18,7 @@
 from conductor.client.event.task_runner_events import TaskRunnerEvent
 from conductor.client.event.sync_event_dispatcher import SyncEventDispatcher
 from conductor.client.event.sync_listener_register import register_task_runner_listener
-from conductor.client.telemetry.metrics_collector import MetricsCollector
+from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase
 from conductor.client.telemetry.model.metric_documentation import MetricDocumentation
 from conductor.client.telemetry.model.metric_label import MetricLabel
 from conductor.client.telemetry.model.metric_name import MetricName
@@ -36,10 +36,14 @@
 _mp_fork_set = False
 if not _mp_fork_set:
     try:
-        if platform == "win32":
-            set_start_method("spawn")
-        else:
-            set_start_method("fork")
+        # The prometheus_client library holds a module-level threading lock;
+        # forking while that lock is held causes a deadlock in child processes.
+        # Set CONDUCTOR_MP_START_METHOD=spawn to avoid this if you hit the
+        # deadlock. Default is fork for backward compatibility (spawn requires
+        # all Process arguments to be picklable).
+        _default_method = "spawn" if platform == "win32" else "fork"
+        _method = os.environ.get("CONDUCTOR_MP_START_METHOD", _default_method).strip().lower()
+        set_start_method(_method)
         _mp_fork_set = True
     except Exception as e:
         logger.info("error when setting multiprocessing.set_start_method - maybe the context is set %s", e.args)
@@ -345,7 +349,7 @@ def __create_metrics_provider_process(self, metrics_settings: MetricsSettings) -
             self.metrics_provider_process = None
             return
         self.metrics_provider_process = Process(
-            target=MetricsCollector.provide_metrics,
+            target=MetricsCollectorBase.provide_metrics,
             args=(metrics_settings,)
         )
         logger.info("Created MetricsProvider process")
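For deployments that do hit the fork-time deadlock described in the comment above, the new escape hatch is the `CONDUCTOR_MP_START_METHOD` environment variable. A hypothetical usage sketch (the caveat about picklable `Process` arguments comes from the patch itself):

```python
# Force the safer "spawn" start method before task_handler is imported; the
# module applies set_start_method() at import time.
import os
os.environ["CONDUCTOR_MP_START_METHOD"] = "spawn"

from conductor.client.automator.task_handler import TaskHandler  # noqa: E402

# ... construct TaskHandler as usual; with spawn, every argument handed to the
# worker processes must be picklable.
```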
diff --git a/src/conductor/client/automator/task_runner.py b/src/conductor/client/automator/task_runner.py
index af566de10..25df5961e 100644
--- a/src/conductor/client/automator/task_runner.py
+++ b/src/conductor/client/automator/task_runner.py
@@ -30,7 +30,7 @@
 from conductor.client.http.rest import AuthorizationException, ApiException
 from conductor.client.orkes.orkes_metadata_client import OrkesMetadataClient
 from conductor.client.orkes.orkes_schema_client import OrkesSchemaClient
-from conductor.client.telemetry.metrics_collector import MetricsCollector
+from conductor.client.telemetry.metrics_factory import create_metrics_collector
 from conductor.client.worker.worker import ASYNC_TASK_RUNNING
 from conductor.client.worker.worker_interface import WorkerInterface
 from conductor.client.worker.worker_config import resolve_worker_config, get_worker_config_oneline
@@ -69,7 +69,7 @@ def __init__(

         self.metrics_collector = None
         if metrics_settings is not None:
-            self.metrics_collector = MetricsCollector(
+            self.metrics_collector = create_metrics_collector(
                 metrics_settings
             )
             # Register metrics collector as event listener
@@ -972,6 +972,7 @@ def __update_task(self, task_result: TaskResult):
             if attempt > 0:
                 # Exponential backoff: [10s, 20s, 30s] before retry
                 time.sleep(attempt * 10)
+            update_start = time.time()
             try:
                 if self._use_update_v2:
                     next_task = self.task_client.update_task_v2(body=task_result)
@@ -982,6 +983,10 @@ def __update_task(self, task_result: TaskResult):
                         task_definition_name,
                         next_task.task_id if next_task else None
                     )
+                    if self.metrics_collector is not None:
+                        self.metrics_collector.record_task_update_time(
+                            task_definition_name, time.time() - update_start, status="SUCCESS"
+                        )
                     return next_task
                 else:
                     self.task_client.update_task(body=task_result)
@@ -991,6 +996,10 @@ def __update_task(self, task_result: TaskResult):
                         task_result.workflow_instance_id,
                         task_definition_name,
                     )
+                    if self.metrics_collector is not None:
+                        self.metrics_collector.record_task_update_time(
+                            task_definition_name, time.time() - update_start, status="SUCCESS"
+                        )
                     return None
             except ApiException as e:
                 if e.status in (404, 405) and self._use_update_v2:
@@ -1004,12 +1013,19 @@ def __update_task(self, task_result: TaskResult):
                     # Retry immediately with v1
                     try:
                         self.task_client.update_task(body=task_result)
+                        if self.metrics_collector is not None:
+                            self.metrics_collector.record_task_update_time(
+                                task_definition_name, time.time() - update_start, status="SUCCESS"
+                            )
                         return None
                     except Exception as fallback_e:
                         last_exception = fallback_e
                         continue
                 last_exception = e
                 if self.metrics_collector is not None:
+                    self.metrics_collector.record_task_update_time(
+                        task_definition_name, time.time() - update_start, status="FAILURE"
+                    )
                     self.metrics_collector.increment_task_update_error(
                         task_definition_name, type(e)
                     )
@@ -1044,6 +1060,9 @@ def __update_task(self, task_result: TaskResult):
             except Exception as e:
                 last_exception = e
                 if self.metrics_collector is not None:
+                    self.metrics_collector.record_task_update_time(
+                        task_definition_name, time.time() - update_start, status="FAILURE"
+                    )
                     self.metrics_collector.increment_task_update_error(
                         task_definition_name, type(e)
                     )
diff --git a/src/conductor/client/configuration/settings/metrics_settings.py b/src/conductor/client/configuration/settings/metrics_settings.py
index 18a4c96bc..4d4bb6eec 100644
--- a/src/conductor/client/configuration/settings/metrics_settings.py
+++ b/src/conductor/client/configuration/settings/metrics_settings.py
@@ -24,7 +24,8 @@ def __init__(
             directory: Optional[str] = None,
             file_name: str = "metrics.log",
             update_interval: float = 0.1,
-            http_port: Optional[int] = None):
+            http_port: Optional[int] = None,
+            clean_directory: bool = True):
         """
         Configure metrics collection settings.

@@ -40,6 +41,10 @@ def __init__(
                 If None:
                     - Metrics will be written to file at {directory}/{file_name}
                     - No HTTP server will be started
+            clean_directory: If True (default), remove stale prometheus_client
+                .db files from the directory on init. Without this, metric
+                names from previous configurations persist in the multiprocess
+                directory and appear in /metrics output.
         """
         if directory is None:
             directory = get_default_temporary_folder()
@@ -47,6 +52,8 @@ def __init__(
         self.file_name = file_name
         self.update_interval = update_interval
         self.http_port = http_port
+        if clean_directory:
+            self._clean_stale_db_files()

     def __set_dir(self, dir: str) -> None:
         if not os.path.isdir(dir):
@@ -57,3 +64,13 @@ def __set_dir(self, dir: str) -> None:
                     "Failed to create metrics temporary folder, reason: %s", e)

         self.directory = dir
+
+    def _clean_stale_db_files(self) -> None:
+        """Remove leftover prometheus_client multiprocess .db files."""
+        import glob
+        pattern = os.path.join(self.directory, "*.db")
+        for path in glob.glob(pattern):
+            try:
+                os.remove(path)
+            except Exception as e:
+                logger.debug("Could not remove stale metrics db file %s: %s", path, e)
diff --git a/src/conductor/client/orkes/orkes_base_client.py b/src/conductor/client/orkes/orkes_base_client.py
index 2e04533f8..3c5314ad8 100644
--- a/src/conductor/client/orkes/orkes_base_client.py
+++ b/src/conductor/client/orkes/orkes_base_client.py
@@ -22,8 +22,9 @@


 class OrkesBaseClient(object):
-    def __init__(self, configuration: Configuration):
-        self.api_client = ApiClient(configuration)
+    def __init__(self, configuration: Configuration, metrics_collector=None):
+        self.metrics_collector = metrics_collector
+        self.api_client = ApiClient(configuration, metrics_collector=metrics_collector)
         self.logger = logging.getLogger(
             Configuration.get_logging_formatted_name(__name__)
         )
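The `clean_directory` flag exists because prometheus_client's multiprocess mode persists samples in per-pid `.db` files; series from an earlier run would otherwise reappear in `/metrics`. A short sketch of the two behaviours (the directory path is illustrative):

```python
from conductor.client.configuration.settings.metrics_settings import MetricsSettings

# Default: stale *.db files in the multiprocess directory are removed on init.
settings = MetricsSettings(directory="/tmp/conductor-metrics", http_port=9991)

# Opt out to keep samples across restarts -- previously exported series will
# show up again in the /metrics payload.
settings = MetricsSettings(directory="/tmp/conductor-metrics", clean_directory=False)
```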
diff --git a/src/conductor/client/orkes/orkes_workflow_client.py b/src/conductor/client/orkes/orkes_workflow_client.py
index 24472260e..7ef501c07 100644
--- a/src/conductor/client/orkes/orkes_workflow_client.py
+++ b/src/conductor/client/orkes/orkes_workflow_client.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import json
 from typing import Optional, List, Dict

 from conductor.client.configuration.configuration import Configuration
@@ -18,9 +19,10 @@
 class OrkesWorkflowClient(OrkesBaseClient, WorkflowClient):
     def __init__(
             self,
-            configuration: Configuration
+            configuration: Configuration,
+            metrics_collector=None,
     ):
-        super(OrkesWorkflowClient, self).__init__(configuration)
+        super(OrkesWorkflowClient, self).__init__(configuration, metrics_collector=metrics_collector)

     def start_workflow_by_name(
             self,
@@ -38,10 +40,44 @@ def start_workflow_by_name(
         if priority:
             kwargs.update({"priority": priority})

-        return self.workflowResourceApi.start_workflow1(input, name, **kwargs)
+        self._record_workflow_input_payload_size(name, version, input)
+        try:
+            return self.workflowResourceApi.start_workflow1(input, name, **kwargs)
+        except Exception as e:
+            self._record_workflow_start_error(name, e)
+            raise

     def start_workflow(self, start_workflow_request: StartWorkflowRequest) -> str:
-        return self.workflowResourceApi.start_workflow(start_workflow_request)
+        self._record_workflow_input_payload_size(
+            start_workflow_request.name,
+            start_workflow_request.version,
+            getattr(start_workflow_request, "input", None),
+        )
+        try:
+            return self.workflowResourceApi.start_workflow(start_workflow_request)
+        except Exception as e:
+            self._record_workflow_start_error(start_workflow_request.name, e)
+            raise
+
+    def _record_workflow_input_payload_size(self, name: str, version, workflow_input) -> None:
+        if self.metrics_collector is None:
+            return
+        try:
+            encoded = json.dumps(workflow_input if workflow_input is not None else {}, default=str)
+            size_bytes = len(encoded.encode("utf-8"))
+            self.metrics_collector.record_workflow_input_payload_size(
+                name, str(version) if version is not None else "", size_bytes,
+            )
+        except Exception:
+            pass
+
+    def _record_workflow_start_error(self, name: str, exception: Exception) -> None:
+        if self.metrics_collector is None:
+            return
+        try:
+            self.metrics_collector.increment_workflow_start_error(name, exception)
+        except Exception:
+            pass

     def execute_workflow(
             self,
diff --git a/src/conductor/client/orkes_clients.py b/src/conductor/client/orkes_clients.py
index 51078b195..50c773907 100644
--- a/src/conductor/client/orkes_clients.py
+++ b/src/conductor/client/orkes_clients.py
@@ -21,13 +21,14 @@


 class OrkesClients:
-    def __init__(self, configuration: Configuration = None):
+    def __init__(self, configuration: Configuration = None, metrics_collector=None):
         if configuration is None:
             configuration = Configuration()
         self.configuration = configuration
+        self.metrics_collector = metrics_collector

     def get_workflow_client(self) -> WorkflowClient:
-        return OrkesWorkflowClient(self.configuration)
+        return OrkesWorkflowClient(self.configuration, metrics_collector=self.metrics_collector)

     def get_authorization_client(self) -> AuthorizationClient:
         return OrkesAuthorizationClient(self.configuration)
@@ -48,7 +49,7 @@ def get_integration_client(self) -> IntegrationClient:
         return OrkesIntegrationClient(self.configuration)

     def get_workflow_executor(self) -> WorkflowExecutor:
-        return WorkflowExecutor(self.configuration)
+        return WorkflowExecutor(self.configuration, metrics_collector=self.metrics_collector)

     def get_prompt_client(self) -> PromptClient:
         return OrkesPromptClient(self.configuration)
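`_record_workflow_input_payload_size` measures the payload as UTF-8 bytes of the JSON encoding, with `default=str` absorbing non-serializable values. A quick check of that computation (the payload values are illustrative):

```python
import json

payload = {"order_id": 42, "note": "rush"}
encoded = json.dumps(payload, default=str)
print(len(encoded.encode("utf-8")))  # 32 bytes for this payload
```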
diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py
new file mode 100644
index 000000000..f77a8890e
--- /dev/null
+++ b/src/conductor/client/telemetry/canonical_metrics_collector.py
@@ -0,0 +1,215 @@
+"""
+Canonical Prometheus metrics implementation following the harmonized metric
+catalog (see sdk-metrics-harmonization.md).
+
+Uses real Prometheus Histograms for timing and size metrics, bounded-cardinality
+exception labels, and canonical metric names (_total suffixed counters, _seconds
+suffixed time histograms, _bytes suffixed size histograms).
+
+Events / metrics that have no canonical equivalent (legacy-only gauges) are
+consumed as no-ops. This class is selected at runtime when
+WORKER_CANONICAL_METRICS=true.
+"""
+
+from conductor.client.configuration.settings.metrics_settings import MetricsSettings
+from conductor.client.telemetry.metrics_collector_base import (
+    MetricsCollectorBase,
+    _exception_label,
+)
+from conductor.client.telemetry.model.metric_documentation import MetricDocumentation
+from conductor.client.telemetry.model.metric_label import MetricLabel
+from conductor.client.telemetry.model.metric_name import MetricName
+
+TIME_BUCKETS = (0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
+SIZE_BUCKETS = (100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000)
+
+
+class CanonicalMetricsCollector(MetricsCollectorBase):
+
+    def __init__(self, settings: MetricsSettings):
+        super().__init__(settings)
+
+    # ------------------------------------------------------------------
+    # Counters
+    # ------------------------------------------------------------------
+
+    def increment_task_poll(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_POLL,
+            documentation=MetricDocumentation.TASK_POLL,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_poll_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_POLL_ERROR,
+            documentation=MetricDocumentation.TASK_POLL_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    def increment_task_execution_started(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_EXECUTION_STARTED,
+            documentation=MetricDocumentation.TASK_EXECUTION_STARTED,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_execution_queue_full(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_EXECUTION_QUEUE_FULL,
+            documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_uncaught_exception(self, exception=None) -> None:
+        self._increment_counter(
+            name=MetricName.THREAD_UNCAUGHT_EXCEPTION,
+            documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION,
+            labels={
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    def increment_worker_restart(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.WORKER_RESTART,
+            documentation=MetricDocumentation.WORKER_RESTART,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_paused(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_PAUSED,
+            documentation=MetricDocumentation.TASK_PAUSED,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_execution_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_EXECUTE_ERROR,
+            documentation=MetricDocumentation.TASK_EXECUTE_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    def increment_task_ack_failed(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_ACK_FAILED,
+            documentation=MetricDocumentation.TASK_ACK_FAILED,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_ack_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_ACK_ERROR,
+            documentation=MetricDocumentation.TASK_ACK_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    def increment_task_update_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_UPDATE_ERROR,
+            documentation=MetricDocumentation.TASK_UPDATE_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.EXTERNAL_PAYLOAD_USED,
+            documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED,
+            labels={
+                MetricLabel.ENTITY_NAME: entity_name,
+                MetricLabel.OPERATION: operation,
+                MetricLabel.PAYLOAD_TYPE: payload_type,
+            },
+        )
+
+    def increment_workflow_start_error(self, workflow_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.WORKFLOW_START_ERROR,
+            documentation=MetricDocumentation.WORKFLOW_START_ERROR,
+            labels={
+                MetricLabel.WORKFLOW_TYPE: workflow_type,
+                MetricLabel.EXCEPTION: _exception_label(exception),
+            },
+        )
+
+    # ------------------------------------------------------------------
+    # Timing (real Prometheus Histograms)
+    # ------------------------------------------------------------------
+
+    def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._observe_histogram(
+            name=MetricName.TASK_POLL_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+            buckets=TIME_BUCKETS,
+        )
+
+    def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._observe_histogram(
+            name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAAM if False else MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+            buckets=TIME_BUCKETS,
+        )
+
+    def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._observe_histogram(
+            name=MetricName.TASK_UPDATE_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+            buckets=TIME_BUCKETS,
+        )
+
+    def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None:
+        self._observe_histogram(
+            name=MetricName.API_REQUEST_TIME_CANONICAL,
+            documentation=MetricDocumentation.API_REQUEST_TIME_CANONICAL,
+            labels={
+                MetricLabel.METHOD: method,
+                MetricLabel.URI: uri,
+                MetricLabel.STATUS: status,
+            },
+            value=time_spent,
+            buckets=TIME_BUCKETS,
+        )
+
+    # ------------------------------------------------------------------
+    # Sizes (real Prometheus Histograms with size buckets)
+    # ------------------------------------------------------------------
+
+    def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None:
+        self._observe_histogram(
+            name=MetricName.TASK_RESULT_SIZE_BYTES,
+            documentation=MetricDocumentation.TASK_RESULT_SIZE_BYTES,
+            labels={MetricLabel.TASK_TYPE: task_type},
+            value=payload_size,
+            buckets=SIZE_BUCKETS,
+        )
+
+    def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None:
+        self._observe_histogram(
+            name=MetricName.WORKFLOW_INPUT_SIZE_BYTES,
+            documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE_BYTES,
+            labels={
+                MetricLabel.WORKFLOW_TYPE: workflow_type,
+                MetricLabel.WORKFLOW_VERSION: version,
+            },
+            value=payload_size,
+            buckets=SIZE_BUCKETS,
+        )
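`metrics_factory.py` itself is not shown in this hunk, but per the collector docstrings the `WORKER_CANONICAL_METRICS` flag decides which implementation `create_metrics_collector` returns. A hedged usage sketch:

```python
import os

from conductor.client.configuration.settings.metrics_settings import MetricsSettings
from conductor.client.telemetry.metrics_factory import create_metrics_collector

os.environ["WORKER_CANONICAL_METRICS"] = "true"
collector = create_metrics_collector(MetricsSettings(http_port=9991))
# "true" -> CanonicalMetricsCollector (real histograms, canonical names);
# anything else -> LegacyMetricsCollector, per the module docstrings above.
```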
diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py
new file mode 100644
index 000000000..e819810b9
--- /dev/null
+++ b/src/conductor/client/telemetry/legacy_metrics_collector.py
@@ -0,0 +1,304 @@
+"""
+Legacy Prometheus metrics implementation preserving the metric names, label
+conventions, and quantile-gauge timing shape from the original python-sdk.
+
+Events / metrics that have no legacy equivalent are consumed as no-ops.
+This class is selected at runtime when WORKER_CANONICAL_METRICS is not true.
+"""
+
+from collections import deque
+from typing import Any, Dict, List
+
+from conductor.client.configuration.settings.metrics_settings import MetricsSettings
+from conductor.client.telemetry import metrics_collector_base as _mcb
+from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase
+from conductor.client.telemetry.model.metric_documentation import MetricDocumentation
+from conductor.client.telemetry.model.metric_label import MetricLabel
+from conductor.client.telemetry.model.metric_name import MetricName
+
+
+class LegacyMetricsCollector(MetricsCollectorBase):
+
+    QUANTILE_WINDOW_SIZE = 1000
+
+    def __init__(self, settings: MetricsSettings):
+        super().__init__(settings)
+        self.quantile_metrics: Dict[str, Any] = {}
+        self.quantile_data: Dict[str, deque] = {}
+
+    # ------------------------------------------------------------------
+    # Counters
+    # ------------------------------------------------------------------
+
+    def increment_task_poll(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_POLL,
+            documentation=MetricDocumentation.TASK_POLL,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_poll_error(self, task_type: str, exception) -> None:
+        pass  # no legacy metric for poll errors
+
+    def increment_task_execution_started(self, task_type: str) -> None:
+        pass  # no legacy metric for execution-started
+
+    def increment_task_execution_queue_full(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_EXECUTION_QUEUE_FULL,
+            documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_uncaught_exception(self, exception=None) -> None:
+        self._increment_counter(
+            name=MetricName.THREAD_UNCAUGHT_EXCEPTION,
+            documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION,
+            labels={},
+        )
+
+    def increment_worker_restart(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.WORKER_RESTART,
+            documentation=MetricDocumentation.WORKER_RESTART,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_paused(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_PAUSED,
+            documentation=MetricDocumentation.TASK_PAUSED,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_execution_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_EXECUTE_ERROR,
+            documentation=MetricDocumentation.TASK_EXECUTE_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: str(exception),
+            },
+        )
+
+    def increment_task_ack_failed(self, task_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_ACK_FAILED,
+            documentation=MetricDocumentation.TASK_ACK_FAILED,
+            labels={MetricLabel.TASK_TYPE: task_type},
+        )
+
+    def increment_task_ack_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_ACK_ERROR,
+            documentation=MetricDocumentation.TASK_ACK_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: str(exception),
+            },
+        )
+
+    def increment_task_update_error(self, task_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.TASK_UPDATE_ERROR,
+            documentation=MetricDocumentation.TASK_UPDATE_ERROR,
+            labels={
+                MetricLabel.TASK_TYPE: task_type,
+                MetricLabel.EXCEPTION: str(exception),
+            },
+        )
+
+    def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None:
+        self._increment_counter(
+            name=MetricName.EXTERNAL_PAYLOAD_USED,
+            documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED,
+            labels={
+                MetricLabel.ENTITY_NAME: entity_name,
+                MetricLabel.OPERATION: operation,
+                MetricLabel.PAYLOAD_TYPE_LEGACY: payload_type,
+            },
+        )
+
+    def increment_workflow_start_error(self, workflow_type: str, exception) -> None:
+        self._increment_counter(
+            name=MetricName.WORKFLOW_START_ERROR,
+            documentation=MetricDocumentation.WORKFLOW_START_ERROR,
+            labels={
+                MetricLabel.WORKFLOW_TYPE: workflow_type,
+                MetricLabel.EXCEPTION: str(exception),
+            },
+        )
+
+    # ------------------------------------------------------------------
+    # Timing (last-value gauges + quantile gauges)
+    # ------------------------------------------------------------------
+
+    def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._record_gauge(
+            name=MetricName.TASK_POLL_TIME,
+            documentation=MetricDocumentation.TASK_POLL_TIME,
+            labels={MetricLabel.TASK_TYPE: task_type},
+            value=time_spent,
+        )
+        self._record_quantiles(
+            name=MetricName.TASK_POLL_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+        )
+
+    def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._record_gauge(
+            name=MetricName.TASK_EXECUTE_TIME,
+            documentation=MetricDocumentation.TASK_EXECUTE_TIME,
+            labels={MetricLabel.TASK_TYPE: task_type},
+            value=time_spent,
+        )
+        self._record_quantiles(
+            name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+        )
+
+    def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
+        self._record_quantiles(
+            name=MetricName.TASK_UPDATE_TIME_HISTOGRAM,
+            documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM,
+            labels={MetricLabel.TASK_TYPE: task_type, MetricLabel.STATUS: status},
+            value=time_spent,
+        )
+
+    def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None:
+        self._record_quantiles(
+            name=MetricName.API_REQUEST_TIME,
+            documentation=MetricDocumentation.API_REQUEST_TIME,
+            labels={
+                MetricLabel.METHOD: method,
+                MetricLabel.URI: uri,
+                MetricLabel.STATUS: status,
+            },
+            value=time_spent,
+        )
+
+    # ------------------------------------------------------------------
+    # Sizes (last-value gauges)
+    # ------------------------------------------------------------------
+
+    def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None:
+        self._record_gauge(
+            name=MetricName.TASK_RESULT_SIZE,
+            documentation=MetricDocumentation.TASK_RESULT_SIZE,
+            labels={MetricLabel.TASK_TYPE: task_type},
+            value=payload_size,
+        )
+
+    def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None:
+        self._record_gauge(
+            name=MetricName.WORKFLOW_INPUT_SIZE,
+            documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE,
+            labels={
+                MetricLabel.WORKFLOW_TYPE: workflow_type,
+                MetricLabel.WORKFLOW_VERSION: version,
+            },
+            value=payload_size,
+        )
+
+    # ------------------------------------------------------------------
+    # Quantile-gauge machinery (legacy timing shape)
+    # ------------------------------------------------------------------
+
+    def _record_quantiles(
+            self,
+            name: MetricName,
+            documentation: MetricDocumentation,
+            labels: Dict[MetricLabel, str],
+            value: float
+    ) -> None:
+        if not self.must_collect_metrics:
+            return
+
+        with self._lock:
+            label_values = tuple(labels.values())
+            data_key = f"{name}_{label_values}"
+
+            if data_key not in self.quantile_data:
+                self.quantile_data[data_key] = deque(maxlen=self.QUANTILE_WINDOW_SIZE)
+
+            self.quantile_data[data_key].append(value)
+
+            observations = sorted(self.quantile_data[data_key])
+            n = len(observations)
+
+            if n > 0:
+                quantiles = [0.5, 0.75, 0.9, 0.95, 0.99]
+                for q in quantiles:
+                    quantile_value = self._calculate_quantile(observations, q)
+                    gauge = self._get_quantile_gauge(
+                        name=name,
+                        documentation=documentation,
+                        labelnames=[label.value for label in labels.keys()] + ["quantile"],
+                    )
+                    gauge.labels(*labels.values(), str(q)).set(quantile_value)
+
+                self._update_summary_aggregates(
+                    name=name,
+                    documentation=documentation,
+                    labels=labels,
+                    observations=list(self.quantile_data[data_key]),
+                )
+
+    @staticmethod
+    def _calculate_quantile(sorted_values: List[float], quantile: float) -> float:
+        if not sorted_values:
+            return 0.0
+        n = len(sorted_values)
+        index = quantile * (n - 1)
+        if index.is_integer():
+            return sorted_values[int(index)]
+        lower_index = int(index)
+        upper_index = min(lower_index + 1, n - 1)
+        fraction = index - lower_index
+        return sorted_values[lower_index] + fraction * (sorted_values[upper_index] - sorted_values[lower_index])
+
+    def _get_quantile_gauge(self, name, documentation, labelnames):
+        if name not in self.quantile_metrics:
+            _mcb._ensure_prometheus_imported()
+            self.quantile_metrics[name] = _mcb.Gauge(
+                name=name,
+                documentation=documentation,
+                labelnames=labelnames,
+                registry=self.registry,
+                multiprocess_mode='all',
+            )
+        return self.quantile_metrics[name]
+
+    def _update_summary_aggregates(self, name, documentation, labels, observations):
+        if not observations:
+            return
+        _mcb._ensure_prometheus_imported()
+        base_name = name.value if hasattr(name, 'value') else str(name)
+        doc_str = documentation.value if hasattr(documentation, 'value') else str(documentation)
+
+        count_name = f"{base_name}_count"
+        if count_name not in self.gauges:
+            self.gauges[count_name] = _mcb.Gauge(
+                name=count_name,
+                documentation=f"{doc_str} - count",
+                labelnames=[label.value for label in labels.keys()],
+                registry=self.registry,
+                multiprocess_mode='all',
+            )
+
+        sum_name = f"{base_name}_sum"
+        if sum_name not in self.gauges:
+            self.gauges[sum_name] = _mcb.Gauge(
+                name=sum_name,
+                documentation=f"{doc_str} - sum",
+                labelnames=[label.value for label in labels.keys()],
+                registry=self.registry,
+                multiprocess_mode='all',
+            )
+
+        self.gauges[count_name].labels(*labels.values()).set(len(observations))
+        self.gauges[sum_name].labels(*labels.values()).set(sum(observations))
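A worked example of the linear interpolation in `_calculate_quantile` (calling the private static helper directly is for illustration only):

```python
import math

from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector

values = [0.10, 0.20, 0.40, 0.80]  # already sorted, n = 4
# p95: index = 0.95 * (4 - 1) = 2.85 -> between values[2] and values[3]
# result = 0.40 + 0.85 * (0.80 - 0.40) = 0.74
p95 = LegacyMetricsCollector._calculate_quantile(values, 0.95)
assert math.isclose(p95, 0.74)
```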
diff --git a/src/conductor/client/telemetry/metrics_collector.py b/src/conductor/client/telemetry/metrics_collector.py
index 7e3b6c579..0773326d6 100644
--- a/src/conductor/client/telemetry/metrics_collector.py
+++ b/src/conductor/client/telemetry/metrics_collector.py
@@ -1,927 +1,38 @@
-import logging
-import os
-import threading
-import time
-from collections import deque
-from typing import Any, ClassVar, Dict, List, Tuple
+"""
+Backward-compatibility shim.
-
-# Lazy imports - these will be imported when first needed
-# This is necessary for multiprocess mode where PROMETHEUS_MULTIPROC_DIR
-# must be set before prometheus_client is imported
-CollectorRegistry = None
-Counter = None
-Gauge = None
-Histogram = None
-Summary = None
-write_to_textfile = None
-MultiProcessCollector = None
+Existing code that does::
-
-def _ensure_prometheus_imported():
-    """Lazy import of prometheus_client to ensure PROMETHEUS_MULTIPROC_DIR is set first."""
-    global CollectorRegistry, Counter, Gauge, Histogram, Summary, write_to_textfile, MultiProcessCollector
+    from conductor.client.telemetry.metrics_collector import MetricsCollector
-
-    if CollectorRegistry is None:
-        from prometheus_client import CollectorRegistry as _CollectorRegistry
-        from prometheus_client import Counter as _Counter
-        from prometheus_client import Gauge as _Gauge
-        from prometheus_client import Histogram as _Histogram
-        from prometheus_client import Summary as _Summary
-        from prometheus_client import write_to_textfile as _write_to_textfile
-        from prometheus_client.multiprocess import MultiProcessCollector as _MultiProcessCollector
+will continue to work -- ``MetricsCollector`` is the legacy implementation.
-
-        CollectorRegistry = _CollectorRegistry
-        Counter = _Counter
-        Gauge = _Gauge
-        Histogram = _Histogram
-        Summary = _Summary
-        write_to_textfile = _write_to_textfile
-        MultiProcessCollector = _MultiProcessCollector
+``task_handler.py`` accesses ``mc._ensure_prometheus_imported()``,
+``mc.Counter``, ``mc.CollectorRegistry`` etc. via
+``from conductor.client.telemetry import metrics_collector as mc``.
+These are lazily-initialised module-level globals in the base module,
+so we use ``__getattr__`` to forward attribute lookups dynamically
+(a plain ``from … import Counter`` would capture ``None`` at import time).
+"""
-
-from conductor.client.configuration.configuration import Configuration
-from conductor.client.configuration.settings.metrics_settings import MetricsSettings
-from conductor.client.telemetry.model.metric_documentation import MetricDocumentation
-from conductor.client.telemetry.model.metric_label import MetricLabel
-from conductor.client.telemetry.model.metric_name import MetricName
+# Re-export the legacy implementation under its original name.
+from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector as MetricsCollector  # noqa: F401
-
-# Event system imports (for new event-driven architecture)
-from conductor.client.event.task_runner_events import (
-    PollStarted,
-    PollCompleted,
-    PollFailure,
-    TaskExecutionStarted,
-    TaskExecutionCompleted,
-    TaskExecutionFailure,
-)
-from conductor.client.event.workflow_events import (
-    WorkflowStarted,
-    WorkflowInputPayloadSize,
-    WorkflowPayloadUsed,
-)
-from conductor.client.event.task_events import (
-    TaskResultPayloadSize,
-    TaskPayloadUsed,
-)
+from conductor.client.telemetry import metrics_collector_base as _base  # noqa: F401
-
-logger = logging.getLogger(
-    Configuration.get_logging_formatted_name(
-        __name__
-    )
-)
+_FORWARDED = {
+    "_ensure_prometheus_imported",
+    "CollectorRegistry",
+    "Counter",
+    "Gauge",
+    "Histogram",
+    "Summary",
+    "write_to_textfile",
+    "MultiProcessCollector",
+}
-
-class MetricsCollector:
-    """
-    Prometheus-based metrics collector for Conductor operations.
-
-    This class implements the event listener protocols (TaskRunnerEventsListener,
-    WorkflowEventsListener, TaskEventsListener) via structural subtyping (duck typing),
-    matching the Java SDK's MetricsCollector interface.
-
-    Supports both usage patterns:
-    1. Direct method calls (backward compatible):
-       metrics.increment_task_poll(task_type)
-
-    2. Event-driven (new):
-       dispatcher.register(PollStarted, metrics.on_poll_started)
-       dispatcher.publish(PollStarted(...))
-
-    Thread Safety:
-        This class is thread-safe. Internal dictionaries (counters, gauges, histograms, etc.)
-        are protected by a lock to prevent race conditions when accessed from multiple threads
-        (e.g., worker threads and monitor threads).
-
-    Note: Uses Python's Protocol for structural subtyping rather than explicit
-    inheritance to avoid circular imports and maintain backward compatibility.
-    """
-    QUANTILE_WINDOW_SIZE = 1000  # Keep last 1000 observations for quantile calculation
-
-    def __init__(self, settings: MetricsSettings):
-        # Instance state (avoid cross-test/dir interference from class-level caches).
-        self.counters: Dict[str, Counter] = {}
-        self.gauges: Dict[str, Gauge] = {}
-        self.histograms: Dict[str, Histogram] = {}
-        self.summaries: Dict[str, Summary] = {}
-        self.quantile_metrics: Dict[str, Gauge] = {}  # metric_name -> Gauge with quantile label (used as summary)
-        self.quantile_data: Dict[str, deque] = {}  # metric_name+labels -> deque of values
-        self.registry = None
-        self.must_collect_metrics = False
-        self._lock = threading.RLock()  # Reentrant lock for thread-safe access to internal dictionaries
-
-        if settings is None:
-            return
-
-        os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory
-
-        # Import prometheus_client NOW (after PROMETHEUS_MULTIPROC_DIR is set).
-        _ensure_prometheus_imported()
-
-        # Each MetricsCollector instance gets its own registry so callers/tests can
-        # safely use different PROMETHEUS_MULTIPROC_DIR values in the same process.
-        self.registry = CollectorRegistry()
-
-        self.must_collect_metrics = True
-        logger.debug(
-            "MetricsCollector initialized with directory=%s, must_collect=%s",
-            settings.directory,
-            self.must_collect_metrics,
-        )
-
-    @staticmethod
-    def provide_metrics(settings: MetricsSettings) -> None:
-        if settings is None:
-            return
-
-        # Set environment variable for this process
-        os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory
-
-        # Import prometheus_client in this process too (after setting env var)
-        _ensure_prometheus_imported()
-
-        OUTPUT_FILE_PATH = os.path.join(
-            settings.directory,
-            settings.file_name
-        )
-
-        # Wait a bit for worker processes to start and create initial metrics
-        time.sleep(0.5)
-
-        registry = CollectorRegistry()
-        # Use custom collector that removes pid label and aggregates across processes
-        from prometheus_client.multiprocess import MultiProcessCollector as MPCollector
-        from prometheus_client.samples import Sample
-        from prometheus_client.metrics_core import Metric
-
-        class NoPidCollector(MPCollector):
-            """Custom collector that removes pid label and aggregates metrics across processes."""
-            def collect(self):
-                for metric in super().collect():
-                    # Group samples by label set (excluding pid)
-                    aggregated = {}
-
-                    for sample in metric.samples:
-                        # Remove pid from labels
-                        labels = {k: v for k, v in sample.labels.items() if k != 'pid'}
-                        # Create key from sample name and labels
-                        label_items = tuple(sorted(labels.items()))
-                        key = (sample.name, label_items)
-
-                        if key not in aggregated:
-                            aggregated[key] = {
-                                'labels': labels,
-                                'values': [],
-                                'name': sample.name,
-                                'timestamp': sample.timestamp,
-                                'exemplar': sample.exemplar
-                            }
-
-                        aggregated[key]['values'].append(sample.value)
-
-                    # Create consolidated samples
-                    filtered_samples = []
-                    for key, data in aggregated.items():
-                        # For counters and _count/_sum metrics: sum the values
-                        # For gauges with quantiles: take the mean (approximation)
-                        # For other gauges: take the last value
-                        if metric.type == 'counter' or data['name'].endswith('_count') or data['name'].endswith('_sum'):
-                            # Sum values for counters
-                            value = sum(data['values'])
-                        elif 'quantile' in data['labels']:
-                            # For quantile metrics, take the mean across processes
-                            value = sum(data['values']) / len(data['values'])
-                        else:
-                            # For other gauges, take the last value
-                            value = data['values'][-1]
-
-                        filtered_samples.append(
-                            Sample(data['name'], data['labels'], value, data['timestamp'], data['exemplar'])
-                        )
-
-                    # Create new metric and assign filtered samples
-                    new_metric = Metric(metric.name, metric.documentation, metric.type)
-                    new_metric.samples = filtered_samples
-                    yield new_metric
-
-        NoPidCollector(registry)
-
-        # Start HTTP server if port is specified
-        http_server = None
-        if settings.http_port is not None:
-            http_server = MetricsCollector._start_http_server(settings.http_port, registry)
-            logger.info(f"Metrics HTTP server mode: serving from memory (no file writes) (pid={os.getpid()})")
-
-            # When HTTP server is enabled, don't write to file - just keep updating registry in memory
-            # The HTTP server reads directly from the registry
-            while True:
-                time.sleep(settings.update_interval)
-        else:
-            # File-based mode: write metrics to file periodically
-            logger.info(f"Metrics file mode: writing to {OUTPUT_FILE_PATH} (pid={os.getpid()})")
-            while True:
-                try:
-                    write_to_textfile(
-                        OUTPUT_FILE_PATH,
-                        registry
-                    )
-                except Exception as e:
-                    # Log error but continue - metrics files might be in inconsistent state
-                    logger.debug(f"Error writing metrics (will retry): {e}")
-
-                time.sleep(settings.update_interval)
-
-    @staticmethod
-    def _start_http_server(port: int, registry: 'CollectorRegistry') -> 'HTTPServer':
-        """Start HTTP server to expose metrics endpoint for Prometheus scraping."""
-        from http.server import HTTPServer, BaseHTTPRequestHandler
-        import threading
-
-        class MetricsHTTPHandler(BaseHTTPRequestHandler):
-            """HTTP handler to serve Prometheus metrics."""
-
-            def do_GET(self):
-                """Handle GET requests for /metrics endpoint."""
-                if self.path == '/metrics':
-                    try:
-                        # Generate metrics in Prometheus text format
-                        from prometheus_client import generate_latest
-                        metrics_content = generate_latest(registry)
-
-                        # Send response
-                        self.send_response(200)
-                        self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8')
-                        self.end_headers()
-                        self.wfile.write(metrics_content)
-
-                    except Exception as e:
-                        logger.error(f"Error serving metrics: {e}")
-                        self.send_response(500)
-                        self.send_header('Content-Type', 'text/plain')
-                        self.end_headers()
-                        self.wfile.write(f'Error: {str(e)}'.encode('utf-8'))
-
-                elif self.path == '/' or self.path == '/health':
-                    # Health check endpoint
-                    self.send_response(200)
-                    self.send_header('Content-Type', 'text/plain')
-                    self.end_headers()
-                    self.wfile.write(b'OK')
-
-                else:
-                    self.send_response(404)
-                    self.send_header('Content-Type', 'text/plain')
-                    self.end_headers()
-                    self.wfile.write(b'Not Found - Try /metrics')
-
-            def log_message(self, format, *args):
-                """Override to use our logger instead of stderr."""
-                logger.debug(f"HTTP {self.address_string()} - {format % args}")
-
-        server = HTTPServer(('', port), MetricsHTTPHandler)
-        logger.info(f"Started metrics HTTP server on port {port} (pid={os.getpid()})")
-        logger.info(f"Metrics available at: http://localhost:{port}/metrics")
-
-        # Run server in daemon thread
-        server_thread = threading.Thread(target=server.serve_forever, daemon=True)
-        server_thread.start()
-
-        return server
-
-    def increment_task_poll(self, task_type: str) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_POLL,
-            documentation=MetricDocumentation.TASK_POLL,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            }
-        )
-
-    def increment_task_execution_queue_full(self, task_type: str) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_EXECUTION_QUEUE_FULL,
-            documentation=MetricDocumentation.TASK_EXECUTION_QUEUE_FULL,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            }
-        )
-
-    def increment_uncaught_exception(self):
-        self.__increment_counter(
-            name=MetricName.THREAD_UNCAUGHT_EXCEPTION,
-            documentation=MetricDocumentation.THREAD_UNCAUGHT_EXCEPTION,
-            labels={}
-        )
-
-    def increment_worker_restart(self, task_type: str) -> None:
-        """Incremented each time TaskHandler restarts a worker subprocess."""
-        self.__increment_counter(
-            name=MetricName.WORKER_RESTART,
-            documentation=MetricDocumentation.WORKER_RESTART,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            }
-        )
-
-    def increment_task_poll_error(self, task_type: str, exception: Exception) -> None:
-        # No-op: Poll errors are already tracked via task_poll_time_seconds_count with status=FAILURE
-        pass
-
-    def increment_task_paused(self, task_type: str) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_PAUSED,
-            documentation=MetricDocumentation.TASK_PAUSED,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            }
-        )
-
-    def increment_task_execution_error(self, task_type: str, exception: Exception) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_EXECUTE_ERROR,
-            documentation=MetricDocumentation.TASK_EXECUTE_ERROR,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.EXCEPTION: str(exception)
-            }
-        )
-
-    def increment_task_ack_failed(self, task_type: str) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_ACK_FAILED,
-            documentation=MetricDocumentation.TASK_ACK_FAILED,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            }
-        )
-
-    def increment_task_ack_error(self, task_type: str, exception: Exception) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_ACK_ERROR,
-            documentation=MetricDocumentation.TASK_ACK_ERROR,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.EXCEPTION: str(exception)
-            }
-        )
-
-    def increment_task_update_error(self, task_type: str, exception: Exception) -> None:
-        self.__increment_counter(
-            name=MetricName.TASK_UPDATE_ERROR,
-            documentation=MetricDocumentation.TASK_UPDATE_ERROR,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.EXCEPTION: str(exception)
-            }
-        )
-
-    def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None:
-        self.__increment_counter(
-            name=MetricName.EXTERNAL_PAYLOAD_USED,
-            documentation=MetricDocumentation.EXTERNAL_PAYLOAD_USED,
-            labels={
-                MetricLabel.ENTITY_NAME: entity_name,
-                MetricLabel.OPERATION: operation,
-                MetricLabel.PAYLOAD_TYPE: payload_type
-            }
-        )
-
-    def increment_workflow_start_error(self, workflow_type: str, exception: Exception) -> None:
-        self.__increment_counter(
-            name=MetricName.WORKFLOW_START_ERROR,
-            documentation=MetricDocumentation.WORKFLOW_START_ERROR,
-            labels={
-                MetricLabel.WORKFLOW_TYPE: workflow_type,
-                MetricLabel.EXCEPTION: str(exception)
-            }
-        )
-
-    def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None:
-        self.__record_gauge(
-            name=MetricName.WORKFLOW_INPUT_SIZE,
-            documentation=MetricDocumentation.WORKFLOW_INPUT_SIZE,
-            labels={
-                MetricLabel.WORKFLOW_TYPE: workflow_type,
-                MetricLabel.WORKFLOW_VERSION: version
-            },
-            value=payload_size
-        )
-
-    def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None:
-        self.__record_gauge(
-            name=MetricName.TASK_RESULT_SIZE,
-            documentation=MetricDocumentation.TASK_RESULT_SIZE,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            },
-            value=payload_size
-        )
-
-    def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
-        self.__record_gauge(
-            name=MetricName.TASK_POLL_TIME,
-            documentation=MetricDocumentation.TASK_POLL_TIME,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            },
-            value=time_spent
-        )
-        # Record as quantile gauges for percentile tracking
-        self.__record_quantiles(
-            name=MetricName.TASK_POLL_TIME_HISTOGRAM,
-            documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
-        self.__record_gauge(
-            name=MetricName.TASK_EXECUTE_TIME,
-            documentation=MetricDocumentation.TASK_EXECUTE_TIME,
-            labels={
-                MetricLabel.TASK_TYPE: task_type
-            },
-            value=time_spent
-        )
-        # Record as quantile gauges for percentile tracking
-        self.__record_quantiles(
-            name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM,
-            documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def record_task_poll_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
-        """Record task poll time with quantile gauges for percentile tracking."""
-        self.__record_quantiles(
-            name=MetricName.TASK_POLL_TIME_HISTOGRAM,
-            documentation=MetricDocumentation.TASK_POLL_TIME_HISTOGRAM,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def record_task_execute_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
-        """Record task execution time with quantile gauges for percentile tracking."""
-        self.__record_quantiles(
-            name=MetricName.TASK_EXECUTE_TIME_HISTOGRAM,
-            documentation=MetricDocumentation.TASK_EXECUTE_TIME_HISTOGRAM,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def record_task_update_time_histogram(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None:
-        """Record task update time with quantile gauges for percentile tracking."""
-        self.__record_quantiles(
-            name=MetricName.TASK_UPDATE_TIME_HISTOGRAM,
-            documentation=MetricDocumentation.TASK_UPDATE_TIME_HISTOGRAM,
-            labels={
-                MetricLabel.TASK_TYPE: task_type,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None:
-        """Record API request time with quantile gauges for percentile tracking."""
-        self.__record_quantiles(
-            name=MetricName.API_REQUEST_TIME,
-            documentation=MetricDocumentation.API_REQUEST_TIME,
-            labels={
-                MetricLabel.METHOD: method,
-                MetricLabel.URI: uri,
-                MetricLabel.STATUS: status
-            },
-            value=time_spent
-        )
-
-    def __increment_counter(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str]
-    ) -> None:
-        if not self.must_collect_metrics:
-            return
-        with self._lock:
-            counter = self.__get_counter(
-                name=name,
-                documentation=documentation,
-                labelnames=[label.value for label in labels.keys()]
-            )
-            counter.labels(*labels.values()).inc()
-
-    def __record_gauge(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str],
-            value: Any
-    ) -> None:
-        if not self.must_collect_metrics:
-            return
-        with self._lock:
-            gauge = self.__get_gauge(
-                name=name,
-                documentation=documentation,
-                labelnames=[label.value for label in labels.keys()]
-            )
-            gauge.labels(*labels.values()).set(value)
-
-    def __get_counter(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Counter:
-        if name not in self.counters:
-            self.counters[name] = self.__generate_counter(
-                name, documentation, labelnames
-            )
-        return self.counters[name]
-
-    def __get_gauge(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Gauge:
-        if name not in self.gauges:
-            self.gauges[name] = self.__generate_gauge(
-                name, documentation, labelnames
-            )
-        return self.gauges[name]
-
-    def __generate_counter(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Counter:
-        return Counter(
-            name=name,
-            documentation=documentation,
-            labelnames=labelnames,
-            registry=self.registry
-        )
-
-    def __generate_gauge(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Gauge:
-        return Gauge(
-            name=name,
-            documentation=documentation,
-            labelnames=labelnames,
-            registry=self.registry,
-            multiprocess_mode='all'  # Aggregate across processes, don't include pid
-        )
-
-    def __observe_histogram(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str],
-            value: Any
-    ) -> None:
-        if not self.must_collect_metrics:
-            return
-        with self._lock:
-            histogram = self.__get_histogram(
-                name=name,
-                documentation=documentation,
-                labelnames=[label.value for label in labels.keys()]
-            )
-            histogram.labels(*labels.values()).observe(value)
-
-    def __get_histogram(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Histogram:
-        if name not in self.histograms:
-            self.histograms[name] = self.__generate_histogram(
-                name, documentation, labelnames
-            )
-        return self.histograms[name]
-
-    def __generate_histogram(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Histogram:
-        # Standard buckets for timing metrics: 1ms to 10s
-        return Histogram(
-            name=name,
-            documentation=documentation,
-            labelnames=labelnames,
-            buckets=(0.001, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
-            registry=self.registry
-        )
-
-    def __observe_summary(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str],
-            value: Any
-    ) -> None:
-        if not self.must_collect_metrics:
-            return
-        with self._lock:
-            summary = self.__get_summary(
-                name=name,
-                documentation=documentation,
-                labelnames=[label.value for label in labels.keys()]
-            )
-            summary.labels(*labels.values()).observe(value)
-
-    def __get_summary(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Summary:
-        if name not in self.summaries:
-            self.summaries[name] = self.__generate_summary(
-                name, documentation, labelnames
-            )
-        return self.summaries[name]
-
-    def __generate_summary(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[MetricLabel]
-    ) -> Summary:
-        # Create summary metric
-        # Note: Prometheus Summary metrics provide count and sum by default
-        # For percentiles, use histogram buckets or calculate server-side
-        return Summary(
-            name=name,
-            documentation=documentation,
-            labelnames=labelnames,
-            registry=self.registry
-        )
-
-    def __record_quantiles(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str],
-            value: float
-    ) -> None:
-        """
-        Record a value and update quantile gauges (p50, p75, p90, p95, p99).
-        Also maintains _count and _sum for proper summary metrics.
-
-        Maintains a sliding window of observations and calculates quantiles.
-        """
-        if not self.must_collect_metrics:
-            return
-
-        with self._lock:
-            # Create a key for this metric+labels combination
-            label_values = tuple(labels.values())
-            data_key = f"{name}_{label_values}"
-
-            # Initialize data window if needed
-            if data_key not in self.quantile_data:
-                self.quantile_data[data_key] = deque(maxlen=self.QUANTILE_WINDOW_SIZE)
-
-            # Add new observation
-            self.quantile_data[data_key].append(value)
-
-            # Calculate and update quantiles
-            observations = sorted(self.quantile_data[data_key])
-            n = len(observations)
-
-            if n > 0:
-                quantiles = [0.5, 0.75, 0.9, 0.95, 0.99]
-                for q in quantiles:
-                    quantile_value = self.__calculate_quantile(observations, q)
-
-                    # Get or create gauge for this quantile
-                    gauge = self.__get_quantile_gauge(
-                        name=name,
-                        documentation=documentation,
-                        labelnames=[label.value for label in labels.keys()] + ["quantile"],
-                        quantile=q
-                    )
-
-                    # Set gauge value with labels + quantile
-                    gauge.labels(*labels.values(), str(q)).set(quantile_value)
-
-                # Also publish _count and _sum for proper summary metrics
-                self.__update_summary_aggregates(
-                    name=name,
-                    documentation=documentation,
-                    labels=labels,
-                    observations=list(self.quantile_data[data_key])
-                )
-
-    def __calculate_quantile(self, sorted_values: List[float], quantile: float) -> float:
-        """Calculate quantile from sorted list of values."""
-        if not sorted_values:
-            return 0.0
-
-        n = len(sorted_values)
-        index = quantile * (n - 1)
-
-        if index.is_integer():
-            return sorted_values[int(index)]
-        else:
-            # Linear interpolation
-            lower_index = int(index)
-            upper_index = min(lower_index + 1, n - 1)
-            fraction = index - lower_index
-            return sorted_values[lower_index] + fraction * (sorted_values[upper_index] - sorted_values[lower_index])
-
-    def __get_quantile_gauge(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labelnames: List[str],
-            quantile: float
-    ) -> Gauge:
-        """Get or create a gauge for quantiles (single gauge with quantile label)."""
-        if name not in self.quantile_metrics:
-            # Create a single gauge with quantile as a label
-            # This gauge will be shared across all quantiles for this metric
-            # Note: In multiprocess mode, prometheus_client automatically adds 'pid' label
-            # We use multiprocess_mode='all' to aggregate across processes and remove pid
-            self.quantile_metrics[name] = Gauge(
-                name=name,
-                documentation=documentation,
-                labelnames=labelnames,
-                registry=self.registry,
-                multiprocess_mode='all'  # Aggregate across processes, don't include pid
-            )
-
-        return self.quantile_metrics[name]
-
-    def __update_summary_aggregates(
-            self,
-            name: MetricName,
-            documentation: MetricDocumentation,
-            labels: Dict[MetricLabel, str],
-            observations: List[float]
-    ) -> None:
-        """
-        Update _count and _sum gauges for proper summary metric format.
-        This makes the metrics compatible with Prometheus summary type.
-
-        Note: This method should only be called while holding self._lock
-        (called from __record_quantiles which already holds the lock).
- """ - if not observations: - return - - # Convert enum to string value - base_name = name.value if hasattr(name, 'value') else str(name) - - # Convert documentation enum to string - doc_str = documentation.value if hasattr(documentation, 'value') else str(documentation) - - # Get or create _count gauge - count_name = f"{base_name}_count" - if count_name not in self.gauges: - self.gauges[count_name] = Gauge( - name=count_name, - documentation=f"{doc_str} - count", - labelnames=[label.value for label in labels.keys()], - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - # Get or create _sum gauge - sum_name = f"{base_name}_sum" - if sum_name not in self.gauges: - self.gauges[sum_name] = Gauge( - name=sum_name, - documentation=f"{doc_str} - sum", - labelnames=[label.value for label in labels.keys()], - registry=self.registry, - multiprocess_mode='all' # Aggregate across processes, don't include pid - ) - - # Update values - self.gauges[count_name].labels(*labels.values()).set(len(observations)) - self.gauges[sum_name].labels(*labels.values()).set(sum(observations)) - - # ========================================================================= - # Event Listener Protocol Implementation (TaskRunnerEventsListener) - # ========================================================================= - # These methods allow MetricsCollector to be used as an event listener - # in the new event-driven architecture, while maintaining backward - # compatibility with existing direct method calls. - - def on_poll_started(self, event: PollStarted) -> None: - """ - Handle poll started event. - Maps to increment_task_poll() for backward compatibility. - """ - self.increment_task_poll(event.task_type) - - def on_poll_completed(self, event: PollCompleted) -> None: - """ - Handle poll completed event. - Maps to record_task_poll_time() for backward compatibility. - """ - self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") - - def on_poll_failure(self, event: PollFailure) -> None: - """ - Handle poll failure event. - Maps to increment_task_poll_error() for backward compatibility. - Also records poll time with FAILURE status. - """ - self.increment_task_poll_error(event.task_type, event.cause) - # Record poll time with failure status if duration is available - if hasattr(event, 'duration_ms') and event.duration_ms is not None: - self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="FAILURE") - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - """ - Handle task execution started event. - No direct metric equivalent in old system - could be used for - tracking in-flight tasks in the future. - """ - pass # No corresponding metric in existing system - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - """ - Handle task execution completed event. - Maps to record_task_execute_time() and record_task_result_payload_size(). - """ - self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") - if event.output_size_bytes is not None: - self.record_task_result_payload_size(event.task_type, event.output_size_bytes) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - """ - Handle task execution failure event. - Maps to increment_task_execution_error() for backward compatibility. - Also records execution time with FAILURE status. 
- """ - self.increment_task_execution_error(event.task_type, event.cause) - # Record execution time with failure status if duration is available - if hasattr(event, 'duration_ms') and event.duration_ms is not None: - self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") - - # ========================================================================= - # Event Listener Protocol Implementation (WorkflowEventsListener) - # ========================================================================= - - def on_workflow_started(self, event: WorkflowStarted) -> None: - """ - Handle workflow started event. - Maps to increment_workflow_start_error() if workflow failed to start. - """ - if not event.success and event.cause is not None: - self.increment_workflow_start_error(event.name, event.cause) - - def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: - """ - Handle workflow input payload size event. - Maps to record_workflow_input_payload_size(). - """ - version_str = str(event.version) if event.version is not None else "1" - self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) - - def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: - """ - Handle workflow external payload usage event. - Maps to increment_external_payload_used(). - """ - self.increment_external_payload_used(event.name, event.operation, event.payload_type) - - # ========================================================================= - # Event Listener Protocol Implementation (TaskEventsListener) - # ========================================================================= - - def on_task_result_payload_size(self, event: TaskResultPayloadSize) -> None: - """ - Handle task result payload size event. - Maps to record_task_result_payload_size(). - """ - self.record_task_result_payload_size(event.task_type, event.size_bytes) - - def on_task_payload_used(self, event: TaskPayloadUsed) -> None: - """ - Handle task external payload usage event. - Maps to increment_external_payload_used(). - """ - self.increment_external_payload_used(event.task_type, event.operation, event.payload_type) +def __getattr__(name): + if name in _FORWARDED: + return getattr(_base, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py new file mode 100644 index 000000000..edb72d6a5 --- /dev/null +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -0,0 +1,467 @@ +import abc +import logging +import os +import signal +import threading +import time +from typing import Any, Dict, List + +# Lazy imports - these will be imported when first needed. +# PROMETHEUS_MULTIPROC_DIR must be set before prometheus_client is imported. 
+CollectorRegistry = None +Counter = None +Gauge = None +Histogram = None +Summary = None +write_to_textfile = None +MultiProcessCollector = None + + +def _ensure_prometheus_imported(): + """Lazy import of prometheus_client to ensure PROMETHEUS_MULTIPROC_DIR is set first.""" + global CollectorRegistry, Counter, Gauge, Histogram, Summary, write_to_textfile, MultiProcessCollector + + if CollectorRegistry is None: + from prometheus_client import CollectorRegistry as _CollectorRegistry + from prometheus_client import Counter as _Counter + from prometheus_client import Gauge as _Gauge + from prometheus_client import Histogram as _Histogram + from prometheus_client import Summary as _Summary + from prometheus_client import write_to_textfile as _write_to_textfile + from prometheus_client.multiprocess import MultiProcessCollector as _MultiProcessCollector + + CollectorRegistry = _CollectorRegistry + Counter = _Counter + Gauge = _Gauge + Histogram = _Histogram + Summary = _Summary + write_to_textfile = _write_to_textfile + MultiProcessCollector = _MultiProcessCollector + + +def _exception_label(exception) -> str: + """ + Return a bounded-cardinality label value for an exception. + + Reduces to the bare class name (mirroring the Java / Go SDK convention) + to prevent unbounded label cardinality from error messages. + """ + if exception is None: + return "None" + if isinstance(exception, type): + return exception.__name__ + if isinstance(exception, BaseException): + return type(exception).__name__ + return str(exception) + + +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.model.metric_documentation import MetricDocumentation +from conductor.client.telemetry.model.metric_label import MetricLabel +from conductor.client.telemetry.model.metric_name import MetricName + +from conductor.client.event.task_runner_events import ( + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, +) +from conductor.client.event.workflow_events import ( + WorkflowStarted, + WorkflowInputPayloadSize, + WorkflowPayloadUsed, +) +from conductor.client.event.task_events import ( + TaskResultPayloadSize, + TaskPayloadUsed, +) + +logger = logging.getLogger( + Configuration.get_logging_formatted_name( + __name__ + ) +) + + +class MetricsCollectorBase(abc.ABC): + """ + Abstract base class for Conductor metrics collectors. + + Provides shared Prometheus infrastructure (registry, lazy imports, multiprocess + aggregation, HTTP server) and concrete event-handler implementations that + delegate to abstract metric methods. Subclasses implement the abstract methods + to emit either legacy or canonical metric shapes. + + Satisfies the MetricsCollector Protocol in event/listeners.py via duck typing. 
+ """ + + def __init__(self, settings: MetricsSettings): + self.counters: Dict[str, Any] = {} + self.gauges: Dict[str, Any] = {} + self.histograms: Dict[str, Any] = {} + self.summaries: Dict[str, Any] = {} + self.registry = None + self.must_collect_metrics = False + self._lock = threading.RLock() + + if settings is None: + return + + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + + _ensure_prometheus_imported() + + self.registry = CollectorRegistry() + + self.must_collect_metrics = True + logger.debug( + "MetricsCollector initialized with directory=%s, must_collect=%s", + settings.directory, + self.must_collect_metrics, + ) + + # ========================================================================= + # Static: Multiprocess metrics aggregation + HTTP serving + # ========================================================================= + + @staticmethod + def provide_metrics(settings: MetricsSettings) -> None: + if settings is None: + return + + # Ignore SIGINT in this subprocess -- Ctrl-C is handled by the parent + # via SIGTERM from TaskHandler.stop_processes(). Without this, Ctrl-C + # produces noisy KeyboardInterrupt tracebacks from time.sleep(). + try: + signal.signal(signal.SIGINT, signal.SIG_IGN) + except Exception: + pass + + os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory + + _ensure_prometheus_imported() + + OUTPUT_FILE_PATH = os.path.join( + settings.directory, + settings.file_name + ) + + time.sleep(0.5) + + registry = CollectorRegistry() + from prometheus_client.multiprocess import MultiProcessCollector as MPCollector + from prometheus_client.samples import Sample + from prometheus_client.metrics_core import Metric + + class NoPidCollector(MPCollector): + """Custom collector that removes pid label and aggregates metrics across processes.""" + def collect(self): + for metric in super().collect(): + aggregated = {} + + for sample in metric.samples: + labels = {k: v for k, v in sample.labels.items() if k != 'pid'} + label_items = tuple(sorted(labels.items())) + key = (sample.name, label_items) + + if key not in aggregated: + aggregated[key] = { + 'labels': labels, + 'values': [], + 'name': sample.name, + 'timestamp': sample.timestamp, + 'exemplar': sample.exemplar + } + + aggregated[key]['values'].append(sample.value) + + filtered_samples = [] + for key, data in aggregated.items(): + if metric.type == 'counter' or data['name'].endswith('_count') or data['name'].endswith('_sum'): + value = sum(data['values']) + elif metric.type == 'histogram' or '_bucket' in data['name']: + value = sum(data['values']) + elif 'quantile' in data['labels']: + value = sum(data['values']) / len(data['values']) + else: + value = data['values'][-1] + + filtered_samples.append( + Sample(data['name'], data['labels'], value, data['timestamp'], data['exemplar']) + ) + + new_metric = Metric(metric.name, metric.documentation, metric.type) + new_metric.samples = filtered_samples + yield new_metric + + NoPidCollector(registry) + + http_server = None + if settings.http_port is not None: + http_server = MetricsCollectorBase._start_http_server(settings.http_port, registry) + logger.info(f"Metrics HTTP server mode: serving from memory (no file writes) (pid={os.getpid()})") + + while True: + time.sleep(settings.update_interval) + else: + logger.info(f"Metrics file mode: writing to {OUTPUT_FILE_PATH} (pid={os.getpid()})") + while True: + try: + write_to_textfile( + OUTPUT_FILE_PATH, + registry + ) + except Exception as e: + logger.debug(f"Error writing metrics (will retry): {e}") + + 
time.sleep(settings.update_interval) + + @staticmethod + def _start_http_server(port: int, registry) -> 'HTTPServer': + """Start HTTP server to expose metrics endpoint for Prometheus scraping.""" + from http.server import HTTPServer, BaseHTTPRequestHandler + import threading as _threading + + class MetricsHTTPHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/metrics': + try: + from prometheus_client import generate_latest + metrics_content = generate_latest(registry) + + self.send_response(200) + self.send_header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8') + self.end_headers() + self.wfile.write(metrics_content) + + except Exception as e: + logger.error(f"Error serving metrics: {e}") + self.send_response(500) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(f'Error: {str(e)}'.encode('utf-8')) + + elif self.path == '/' or self.path == '/health': + self.send_response(200) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(b'OK') + + else: + self.send_response(404) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(b'Not Found - Try /metrics') + + def log_message(self, format, *args): + logger.debug(f"HTTP {self.address_string()} - {format % args}") + + server = HTTPServer(('', port), MetricsHTTPHandler) + logger.info(f"Started metrics HTTP server on port {port} (pid={os.getpid()})") + logger.info(f"Metrics available at: http://localhost:{port}/metrics") + + server_thread = _threading.Thread(target=server.serve_forever, daemon=True) + server_thread.start() + + return server + + # ========================================================================= + # Shared Prometheus helpers (used by subclasses) + # ========================================================================= + + def _increment_counter( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str] + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + counter = self._get_counter( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()] + ) + counter.labels(*labels.values()).inc() + + def _record_gauge( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str], + value: Any + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + gauge = self._get_gauge( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()] + ) + gauge.labels(*labels.values()).set(value) + + def _observe_histogram( + self, + name: MetricName, + documentation: MetricDocumentation, + labels: Dict[MetricLabel, str], + value: Any, + buckets=None + ) -> None: + if not self.must_collect_metrics: + return + with self._lock: + histogram = self._get_histogram( + name=name, + documentation=documentation, + labelnames=[label.value for label in labels.keys()], + buckets=buckets + ) + histogram.labels(*labels.values()).observe(value) + + def _get_counter(self, name, documentation, labelnames): + if name not in self.counters: + self.counters[name] = Counter( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry + ) + return self.counters[name] + + def _get_gauge(self, name, documentation, labelnames): + if name not in self.gauges: + self.gauges[name] = Gauge( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry, + 
multiprocess_mode='all' + ) + return self.gauges[name] + + def _get_histogram(self, name, documentation, labelnames, buckets=None): + if name not in self.histograms: + kwargs = dict( + name=name, + documentation=documentation, + labelnames=labelnames, + registry=self.registry, + ) + if buckets is not None: + kwargs['buckets'] = buckets + self.histograms[name] = Histogram(**kwargs) + return self.histograms[name] + + # ========================================================================= + # Abstract metric methods -- the full union surface. + # Subclasses implement each with real logic or pass (no-op). + # ========================================================================= + + @abc.abstractmethod + def increment_task_poll(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_poll_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_execution_started(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_execution_queue_full(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_uncaught_exception(self, exception=None) -> None: ... + + @abc.abstractmethod + def increment_worker_restart(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_paused(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_execution_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_ack_failed(self, task_type: str) -> None: ... + + @abc.abstractmethod + def increment_task_ack_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_task_update_error(self, task_type: str, exception) -> None: ... + + @abc.abstractmethod + def increment_external_payload_used(self, entity_name: str, operation: str, payload_type: str) -> None: ... + + @abc.abstractmethod + def increment_workflow_start_error(self, workflow_type: str, exception) -> None: ... + + @abc.abstractmethod + def record_task_poll_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_task_execute_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_task_update_time(self, task_type: str, time_spent: float, status: str = "SUCCESS") -> None: ... + + @abc.abstractmethod + def record_api_request_time(self, method: str, uri: str, status: str, time_spent: float) -> None: ... + + @abc.abstractmethod + def record_task_result_payload_size(self, task_type: str, payload_size: int) -> None: ... + + @abc.abstractmethod + def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: ... + + # ========================================================================= + # Concrete event handlers -- delegate to the abstract metric methods. + # These satisfy the event listener protocols in event/listeners.py. 
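+    #
+    # Illustrative flow (event constructors as exercised in the unit tests):
+    #
+    #     collector.on_poll_started(
+    #         PollStarted(task_type="t", worker_id="w", poll_count=1))
+    #     collector.on_poll_completed(
+    #         PollCompleted(task_type="t", duration_ms=12.0, tasks_received=1))
+    #
+    # which a subclass observes as increment_task_poll("t") followed by
+    # record_task_poll_time("t", 0.012, status="SUCCESS").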
+ # ========================================================================= + + def on_poll_started(self, event: PollStarted) -> None: + self.increment_task_poll(event.task_type) + + def on_poll_completed(self, event: PollCompleted) -> None: + self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") + + def on_poll_failure(self, event: PollFailure) -> None: + self.increment_task_poll_error(event.task_type, event.cause) + if hasattr(event, 'duration_ms') and event.duration_ms is not None: + self.record_task_poll_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + + def on_task_execution_started(self, event: TaskExecutionStarted) -> None: + self.increment_task_execution_started(event.task_type) + + def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: + self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") + if event.output_size_bytes is not None: + self.record_task_result_payload_size(event.task_type, event.output_size_bytes) + + def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: + self.increment_task_execution_error(event.task_type, event.cause) + if hasattr(event, 'duration_ms') and event.duration_ms is not None: + self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + + def on_workflow_started(self, event: WorkflowStarted) -> None: + if not event.success and event.cause is not None: + self.increment_workflow_start_error(event.name, event.cause) + + def on_workflow_input_payload_size(self, event: WorkflowInputPayloadSize) -> None: + version_str = str(event.version) if event.version is not None else "1" + self.record_workflow_input_payload_size(event.name, version_str, event.size_bytes) + + def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: + self.increment_external_payload_used(event.name, event.operation, event.payload_type) + + def on_task_result_payload_size(self, event: TaskResultPayloadSize) -> None: + self.record_task_result_payload_size(event.task_type, event.size_bytes) + + def on_task_payload_used(self, event: TaskPayloadUsed) -> None: + self.increment_external_payload_used(event.task_type, event.operation, event.payload_type) diff --git a/src/conductor/client/telemetry/metrics_factory.py b/src/conductor/client/telemetry/metrics_factory.py new file mode 100644 index 000000000..3b3e4187c --- /dev/null +++ b/src/conductor/client/telemetry/metrics_factory.py @@ -0,0 +1,45 @@ +""" +Factory that selects the correct MetricsCollector implementation based on +environment variables. + + WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector + WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default during deprecation) + +If WORKER_CANONICAL_METRICS is true it takes priority regardless of the value +of WORKER_LEGACY_METRICS. 
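+
+Example (illustrative; the directory below is arbitrary):
+
+    os.environ["WORKER_CANONICAL_METRICS"] = "true"
+    collector = create_metrics_collector(
+        MetricsSettings(directory="/tmp/conductor-metrics"))
+    # -> CanonicalMetricsCollector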
+""" + +import logging +import os + +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.metrics_collector_base import MetricsCollectorBase + +logger = logging.getLogger( + Configuration.get_logging_formatted_name(__name__) +) + + +def _env_bool(name: str, default: bool) -> bool: + value = os.environ.get(name, "") + if not value: + return default + return value.strip().lower() in ("true", "1", "yes") + + +def create_metrics_collector(settings: MetricsSettings) -> MetricsCollectorBase: + """ + Create the metrics collector selected by environment variables. + + Returns a fully-initialised collector (legacy or canonical) that satisfies + the MetricsCollector Protocol and can be registered as an event listener. + """ + if _env_bool("WORKER_CANONICAL_METRICS", default=False): + from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector + logger.info("WORKER_CANONICAL_METRICS is true — using CanonicalMetricsCollector") + return CanonicalMetricsCollector(settings) + + from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector + logger.info("Using LegacyMetricsCollector (set WORKER_CANONICAL_METRICS=true for canonical metrics)") + return LegacyMetricsCollector(settings) diff --git a/src/conductor/client/telemetry/model/metric_documentation.py b/src/conductor/client/telemetry/model/metric_documentation.py index d76ad063c..47d8c2736 100644 --- a/src/conductor/client/telemetry/model/metric_documentation.py +++ b/src/conductor/client/telemetry/model/metric_documentation.py @@ -2,6 +2,7 @@ class MetricDocumentation(str, Enum): + # Shared / legacy documentation strings API_REQUEST_TIME = "API request duration in seconds with quantiles" EXTERNAL_PAYLOAD_USED = "Incremented each time external payload storage is used" TASK_ACK_ERROR = "Task ack has encountered an exception" @@ -21,3 +22,11 @@ class MetricDocumentation(str, Enum): WORKER_RESTART = "Worker subprocess restarted" WORKFLOW_START_ERROR = "Counter for workflow start errors" WORKFLOW_INPUT_SIZE = "Records input payload size of a workflow" + + # Canonical-only documentation strings + API_REQUEST_TIME_CANONICAL = "Latency of HTTP API client requests in seconds" + TASK_EXECUTION_STARTED = "Incremented when a polled task is dispatched to the worker function" + TASK_POLL_ERROR = "Incremented when a poll request fails client-side" + TASK_RESULT_SIZE_BYTES = "Serialized byte size of task result output" + WORKFLOW_INPUT_SIZE_BYTES = "Serialized byte size of workflow input" + ACTIVE_WORKERS = "Current number of worker threads actively executing a task" diff --git a/src/conductor/client/telemetry/model/metric_label.py b/src/conductor/client/telemetry/model/metric_label.py index 7aeae21ef..8d4a63f70 100644 --- a/src/conductor/client/telemetry/model/metric_label.py +++ b/src/conductor/client/telemetry/model/metric_label.py @@ -6,7 +6,8 @@ class MetricLabel(str, Enum): EXCEPTION = "exception" METHOD = "method" OPERATION = "operation" - PAYLOAD_TYPE = "payload_type" + PAYLOAD_TYPE = "payloadType" + PAYLOAD_TYPE_LEGACY = "payload_type" STATUS = "status" TASK_TYPE = "taskType" URI = "uri" diff --git a/src/conductor/client/telemetry/model/metric_name.py b/src/conductor/client/telemetry/model/metric_name.py index 89810865d..d8485a267 100644 --- a/src/conductor/client/telemetry/model/metric_name.py +++ b/src/conductor/client/telemetry/model/metric_name.py @@ -2,6 +2,7 
@@ class MetricName(str, Enum): + # Legacy metric names (pre-harmonization) API_REQUEST_TIME = "http_api_client_request" EXTERNAL_PAYLOAD_USED = "external_payload_used" TASK_ACK_ERROR = "task_ack_error" @@ -21,3 +22,11 @@ class MetricName(str, Enum): WORKER_RESTART = "worker_restart" WORKFLOW_INPUT_SIZE = "workflow_input_size" WORKFLOW_START_ERROR = "workflow_start_error" + + # Canonical-only metric names (harmonization additions) + API_REQUEST_TIME_CANONICAL = "http_api_client_request_seconds" + TASK_EXECUTION_STARTED = "task_execution_started" + TASK_POLL_ERROR = "task_poll_error" + TASK_RESULT_SIZE_BYTES = "task_result_size_bytes" + WORKFLOW_INPUT_SIZE_BYTES = "workflow_input_size_bytes" + ACTIVE_WORKERS = "active_workers" diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index 20074de09..c8ff1f939 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -27,11 +27,11 @@ class WorkflowExecutor: - def __init__(self, configuration: Configuration) -> Self: - api_client = ApiClient(configuration) + def __init__(self, configuration: Configuration, metrics_collector=None) -> Self: + api_client = ApiClient(configuration, metrics_collector=metrics_collector) self.metadata_client = MetadataResourceApi(api_client) self.task_client = TaskResourceApi(api_client) - self.workflow_client = OrkesWorkflowClient(configuration) + self.workflow_client = OrkesWorkflowClient(configuration, metrics_collector=metrics_collector) def register_workflow(self, workflow: WorkflowDef, overwrite: Optional[bool] = None) -> object: """Create a new workflow definition""" diff --git a/tests/unit/telemetry/test_canonical_metrics_collector.py b/tests/unit/telemetry/test_canonical_metrics_collector.py new file mode 100644 index 000000000..16983ae24 --- /dev/null +++ b/tests/unit/telemetry/test_canonical_metrics_collector.py @@ -0,0 +1,216 @@ +""" +Tests for CanonicalMetricsCollector. + +Verifies that canonical metric names, types, labels, and bucket sets are +correct per the sdk-metrics-harmonization spec. 
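+
+Bucket boundaries the tests below assert (per the spec):
+
+    time histograms: 0.001, 0.005, 0.01, 0.025, 0.05, 0.1,
+                     0.25, 0.5, 1.0, 2.5, 5.0, 10.0 (seconds)
+    size histograms: 100, 1000, 10000, 100000, 1e+06, 1e+07 (bytes)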
+""" + +import os +import shutil +import tempfile +import unittest + +from prometheus_client import write_to_textfile + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector +from conductor.client.telemetry.metrics_collector_base import _exception_label +from conductor.client.event.task_runner_events import ( + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, +) + + +class TestCanonicalMetricsCollector(unittest.TestCase): + + def setUp(self): + self.metrics_dir = tempfile.mkdtemp() + self.settings = MetricsSettings(directory=self.metrics_dir) + self.collector = CanonicalMetricsCollector(self.settings) + + def tearDown(self): + if os.path.exists(self.metrics_dir): + shutil.rmtree(self.metrics_dir) + + def _get_metrics_text(self): + path = os.path.join(self.metrics_dir, "test_out.prom") + write_to_textfile(path, self.collector.registry) + with open(path) as f: + return f.read() + + # ------------------------------------------------------------------ + # Counters + # ------------------------------------------------------------------ + + def test_task_poll_counter(self): + self.collector.increment_task_poll("my_task") + text = self._get_metrics_text() + self.assertIn('task_poll_total{taskType="my_task"}', text) + + def test_task_poll_error_counter(self): + self.collector.increment_task_poll_error("my_task", RuntimeError("oops")) + text = self._get_metrics_text() + self.assertIn("task_poll_error_total", text) + self.assertIn('exception="RuntimeError"', text) + + def test_task_execution_started_counter(self): + self.collector.increment_task_execution_started("my_task") + text = self._get_metrics_text() + self.assertIn('task_execution_started_total{taskType="my_task"}', text) + + def test_uncaught_exception_with_label(self): + self.collector.increment_uncaught_exception(ValueError("bad")) + text = self._get_metrics_text() + self.assertIn("thread_uncaught_exceptions_total", text) + self.assertIn('exception="ValueError"', text) + + def test_external_payload_uses_camelcase_label(self): + self.collector.increment_external_payload_used("ent", "READ", "TASK_INPUT") + text = self._get_metrics_text() + self.assertIn("external_payload_used_total", text) + self.assertIn('payloadType="TASK_INPUT"', text) + + def test_exception_label_uses_class_name(self): + self.collector.increment_task_execution_error("t", ValueError("x")) + text = self._get_metrics_text() + self.assertIn('exception="ValueError"', text) + + def test_workflow_start_error(self): + self.collector.increment_workflow_start_error("wf", Exception("fail")) + text = self._get_metrics_text() + self.assertIn("workflow_start_error_total", text) + self.assertIn('exception="Exception"', text) + + # ------------------------------------------------------------------ + # Time Histograms + # ------------------------------------------------------------------ + + def test_task_poll_time_is_histogram(self): + self.collector.record_task_poll_time("my_task", 0.05, status="SUCCESS") + text = self._get_metrics_text() + self.assertIn("task_poll_time_seconds_bucket", text) + self.assertIn("task_poll_time_seconds_count", text) + self.assertIn("task_poll_time_seconds_sum", text) + self.assertIn('le="0.1"', text) + + def test_task_execute_time_is_histogram(self): + self.collector.record_task_execute_time("my_task", 0.5, status="SUCCESS") + text = self._get_metrics_text() + 
self.assertIn("task_execute_time_seconds_bucket", text) + + def test_task_update_time_is_histogram(self): + self.collector.record_task_update_time("my_task", 0.2, status="FAILURE") + text = self._get_metrics_text() + self.assertIn("task_update_time_seconds_bucket", text) + self.assertIn('status="FAILURE"', text) + + def test_api_request_time_canonical_name(self): + self.collector.record_api_request_time("GET", "/api/tasks", "200", 0.1) + text = self._get_metrics_text() + self.assertIn("http_api_client_request_seconds_bucket", text) + self.assertIn("http_api_client_request_seconds_count", text) + + def test_time_histogram_bucket_set(self): + """Canonical time histograms use the spec bucket set.""" + self.collector.record_task_poll_time("my_task", 0.001) + text = self._get_metrics_text() + for boundary in ("0.001", "0.005", "0.01", "0.025", "0.05", "0.1", + "0.25", "0.5", "1.0", "2.5", "5.0", "10.0"): + self.assertIn(f'le="{boundary}"', text) + + # ------------------------------------------------------------------ + # Size Histograms + # ------------------------------------------------------------------ + + def test_task_result_size_bytes_histogram(self): + self.collector.record_task_result_payload_size("my_task", 5000) + text = self._get_metrics_text() + self.assertIn("task_result_size_bytes_bucket", text) + self.assertIn("task_result_size_bytes_count", text) + + def test_workflow_input_size_bytes_histogram(self): + self.collector.record_workflow_input_payload_size("wf", "1", 50000) + text = self._get_metrics_text() + self.assertIn("workflow_input_size_bytes_bucket", text) + + def test_size_histogram_bucket_set(self): + """Canonical size histograms use the spec bucket set.""" + self.collector.record_task_result_payload_size("t", 500) + text = self._get_metrics_text() + # prometheus_client uses scientific notation for large values + for boundary in ("100.0", "1000.0", "10000.0", "100000.0", + "1e+06", "1e+07"): + self.assertIn(f'le="{boundary}"', text) + + # ------------------------------------------------------------------ + # Event handler integration + # ------------------------------------------------------------------ + + def test_event_poll_started_increments_counter(self): + event = PollStarted(task_type="t", worker_id="w", poll_count=1) + self.collector.on_poll_started(event) + text = self._get_metrics_text() + self.assertIn('task_poll_total{taskType="t"}', text) + + def test_event_poll_completed_records_histogram(self): + event = PollCompleted(task_type="t", duration_ms=100.0, tasks_received=1) + self.collector.on_poll_completed(event) + text = self._get_metrics_text() + self.assertIn("task_poll_time_seconds_bucket", text) + + def test_event_poll_failure_records_error_and_histogram(self): + event = PollFailure(task_type="t", duration_ms=50.0, cause=RuntimeError("x")) + self.collector.on_poll_failure(event) + text = self._get_metrics_text() + self.assertIn("task_poll_error_total", text) + self.assertIn("task_poll_time_seconds_bucket", text) + + def test_event_execution_started(self): + event = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(event) + text = self._get_metrics_text() + self.assertIn("task_execution_started_total", text) + + def test_event_execution_completed_records_histogram_and_size(self): + event = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=200.0, output_size_bytes=1024, + ) + 
self.collector.on_task_execution_completed(event) + text = self._get_metrics_text() + self.assertIn("task_execute_time_seconds_bucket", text) + self.assertIn("task_result_size_bytes_bucket", text) + + def test_event_execution_failure_records_counter_and_histogram(self): + event = TaskExecutionFailure( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", cause=ValueError("bad"), duration_ms=100.0, + ) + self.collector.on_task_execution_failure(event) + text = self._get_metrics_text() + self.assertIn("task_execute_error_total", text) + self.assertIn("task_execute_time_seconds_bucket", text) + + +class TestExceptionLabel(unittest.TestCase): + """Test the _exception_label helper.""" + + def test_none(self): + self.assertEqual(_exception_label(None), "None") + + def test_exception_instance(self): + self.assertEqual(_exception_label(ValueError("x")), "ValueError") + + def test_exception_class(self): + self.assertEqual(_exception_label(RuntimeError), "RuntimeError") + + def test_string_passthrough(self): + self.assertEqual(_exception_label("SomeError"), "SomeError") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py new file mode 100644 index 000000000..586047aa5 --- /dev/null +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -0,0 +1,117 @@ +""" +Tests for the metrics factory and gated metrics selection. +""" + +import os +import shutil +import tempfile +import unittest + +from conductor.client.configuration.settings.metrics_settings import MetricsSettings +from conductor.client.telemetry.metrics_factory import create_metrics_collector +from conductor.client.telemetry.legacy_metrics_collector import LegacyMetricsCollector +from conductor.client.telemetry.canonical_metrics_collector import CanonicalMetricsCollector + + +class TestMetricsFactory(unittest.TestCase): + + def setUp(self): + self.metrics_dir = tempfile.mkdtemp() + self.settings = MetricsSettings(directory=self.metrics_dir) + self._saved_env = {} + for key in ("WORKER_CANONICAL_METRICS", "WORKER_LEGACY_METRICS"): + self._saved_env[key] = os.environ.pop(key, None) + + def tearDown(self): + for key, val in self._saved_env.items(): + if val is None: + os.environ.pop(key, None) + else: + os.environ[key] = val + if os.path.exists(self.metrics_dir): + shutil.rmtree(self.metrics_dir) + + def test_default_returns_legacy(self): + """With no env vars set, factory returns LegacyMetricsCollector.""" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, LegacyMetricsCollector) + + def test_canonical_true_returns_canonical(self): + """WORKER_CANONICAL_METRICS=true selects CanonicalMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_canonical_1_returns_canonical(self): + """WORKER_CANONICAL_METRICS=1 selects CanonicalMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "1" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_canonical_false_returns_legacy(self): + """WORKER_CANONICAL_METRICS=false selects LegacyMetricsCollector.""" + os.environ["WORKER_CANONICAL_METRICS"] = "false" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, LegacyMetricsCollector) + + def test_canonical_takes_priority_over_legacy(self): + 
"""WORKER_CANONICAL_METRICS=true wins even if WORKER_LEGACY_METRICS=true.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + os.environ["WORKER_LEGACY_METRICS"] = "true" + collector = create_metrics_collector(self.settings) + self.assertIsInstance(collector, CanonicalMetricsCollector) + + def test_both_implementations_satisfy_same_interface(self): + """Both implementations have the same public method surface.""" + legacy = LegacyMetricsCollector(self.settings) + os.environ["WORKER_CANONICAL_METRICS"] = "true" + canonical = create_metrics_collector( + MetricsSettings(directory=tempfile.mkdtemp()) + ) + + required_methods = [ + "increment_task_poll", + "increment_task_poll_error", + "increment_task_execution_started", + "increment_task_execution_queue_full", + "increment_uncaught_exception", + "increment_worker_restart", + "increment_task_paused", + "increment_task_execution_error", + "increment_task_ack_failed", + "increment_task_ack_error", + "increment_task_update_error", + "increment_external_payload_used", + "increment_workflow_start_error", + "record_task_poll_time", + "record_task_execute_time", + "record_task_update_time", + "record_api_request_time", + "record_task_result_payload_size", + "record_workflow_input_payload_size", + "on_poll_started", + "on_poll_completed", + "on_poll_failure", + "on_task_execution_started", + "on_task_execution_completed", + "on_task_execution_failure", + "on_workflow_started", + "on_workflow_input_payload_size", + "on_workflow_payload_used", + "on_task_result_payload_size", + "on_task_payload_used", + ] + + for method_name in required_methods: + self.assertTrue( + hasattr(legacy, method_name) and callable(getattr(legacy, method_name)), + f"LegacyMetricsCollector missing method: {method_name}", + ) + self.assertTrue( + hasattr(canonical, method_name) and callable(getattr(canonical, method_name)), + f"CanonicalMetricsCollector missing method: {method_name}", + ) + + +if __name__ == "__main__": + unittest.main() From 9d671cafeae9c3f115a1c01d84dda04a3e79a01d Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 11:24:07 -0600 Subject: [PATCH 2/7] validate the multiprocessing start method choice --- src/conductor/client/automator/task_handler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/conductor/client/automator/task_handler.py b/src/conductor/client/automator/task_handler.py index 6a8255bd0..74dc62731 100644 --- a/src/conductor/client/automator/task_handler.py +++ b/src/conductor/client/automator/task_handler.py @@ -33,6 +33,7 @@ ) _decorated_functions = {} +_VALID_MP_START_METHODS = {"spawn", "fork", "forkserver"} _mp_fork_set = False if not _mp_fork_set: try: @@ -42,7 +43,13 @@ # deadlock. Default is fork for backward compatibility (spawn requires # all Process arguments to be picklable). 
_default_method = "spawn" if platform == "win32" else "fork" - _method = os.environ.get("CONDUCTOR_MP_START_METHOD", _default_method).strip().lower() + _method = os.environ.get("CONDUCTOR_MP_START_METHOD", "").strip().lower() or _default_method + if _method not in _VALID_MP_START_METHODS: + logger.warning( + "Ignoring invalid CONDUCTOR_MP_START_METHOD=%r; falling back to %r", + _method, _default_method, + ) + _method = _default_method set_start_method(_method) _mp_fork_set = True except Exception as e: From 98cf8ace68f1cbf0d81d3cc8bd1d4e84bcd1008f Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 12:32:13 -0600 Subject: [PATCH 3/7] remove unnecessary sleep --- src/conductor/client/telemetry/metrics_collector_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index edb72d6a5..a539a25d0 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -149,8 +149,6 @@ def provide_metrics(settings: MetricsSettings) -> None: settings.file_name ) - time.sleep(0.5) - registry = CollectorRegistry() from prometheus_client.multiprocess import MultiProcessCollector as MPCollector from prometheus_client.samples import Sample From 6fed50502e878ca1c6c6df56a014624b1e1dce4a Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Thu, 30 Apr 2026 13:07:32 -0600 Subject: [PATCH 4/7] fixes for a few canonical metrics --- .../telemetry/canonical_metrics_collector.py | 12 ++++ .../telemetry/legacy_metrics_collector.py | 7 +++ .../telemetry/metrics_collector_base.py | 11 ++++ .../test_canonical_metrics_collector.py | 57 +++++++++++++++++++ .../unit/telemetry/test_metrics_collector.py | 39 +++++++++++++ 5 files changed, 126 insertions(+) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index f77a8890e..eaf876085 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -145,6 +145,18 @@ def increment_workflow_start_error(self, workflow_type: str, exception) -> None: }, ) + # ------------------------------------------------------------------ + # Gauges + # ------------------------------------------------------------------ + + def set_active_workers(self, task_type: str, count: int) -> None: + self._record_gauge( + name=MetricName.ACTIVE_WORKERS, + documentation=MetricDocumentation.ACTIVE_WORKERS, + labels={MetricLabel.TASK_TYPE: task_type}, + value=count, + ) + # ------------------------------------------------------------------ # Timing (real Prometheus Histograms) # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index e819810b9..114f2a26d 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -129,6 +129,13 @@ def increment_workflow_start_error(self, workflow_type: str, exception) -> None: }, ) + # ------------------------------------------------------------------ + # Gauges (canonical-only, no-op in legacy) + # ------------------------------------------------------------------ + + def set_active_workers(self, task_type: str, count: int) -> None: + pass + # 
------------------------------------------------------------------ # Timing (last-value gauges + quantile gauges) # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index a539a25d0..c81b26f67 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -1,4 +1,5 @@ import abc +import collections import logging import os import signal @@ -106,6 +107,7 @@ def __init__(self, settings: MetricsSettings): self.registry = None self.must_collect_metrics = False self._lock = threading.RLock() + self._active_worker_counts: Dict[str, int] = collections.defaultdict(int) if settings is None: return @@ -418,6 +420,9 @@ def record_task_result_payload_size(self, task_type: str, payload_size: int) -> @abc.abstractmethod def record_workflow_input_payload_size(self, workflow_type: str, version: str, payload_size: int) -> None: ... + @abc.abstractmethod + def set_active_workers(self, task_type: str, count: int) -> None: ... + # ========================================================================= # Concrete event handlers -- delegate to the abstract metric methods. # These satisfy the event listener protocols in event/listeners.py. @@ -436,16 +441,22 @@ def on_poll_failure(self, event: PollFailure) -> None: def on_task_execution_started(self, event: TaskExecutionStarted) -> None: self.increment_task_execution_started(event.task_type) + self._active_worker_counts[event.task_type] += 1 + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="SUCCESS") if event.output_size_bytes is not None: self.record_task_result_payload_size(event.task_type, event.output_size_bytes) + self._active_worker_counts[event.task_type] = max(0, self._active_worker_counts[event.task_type] - 1) + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: self.increment_task_execution_error(event.task_type, event.cause) if hasattr(event, 'duration_ms') and event.duration_ms is not None: self.record_task_execute_time(event.task_type, event.duration_ms / 1000, status="FAILURE") + self._active_worker_counts[event.task_type] = max(0, self._active_worker_counts[event.task_type] - 1) + self.set_active_workers(event.task_type, self._active_worker_counts[event.task_type]) def on_workflow_started(self, event: WorkflowStarted) -> None: if not event.success and event.cause is not None: diff --git a/tests/unit/telemetry/test_canonical_metrics_collector.py b/tests/unit/telemetry/test_canonical_metrics_collector.py index 16983ae24..3b0ea4e73 100644 --- a/tests/unit/telemetry/test_canonical_metrics_collector.py +++ b/tests/unit/telemetry/test_canonical_metrics_collector.py @@ -195,6 +195,63 @@ def test_event_execution_failure_records_counter_and_histogram(self): self.assertIn("task_execute_error_total", text) self.assertIn("task_execute_time_seconds_bucket", text) + # ------------------------------------------------------------------ + # active_workers gauge + # ------------------------------------------------------------------ + + def test_active_workers_gauge_increments_on_start(self): + event = TaskExecutionStarted(task_type="t", 
task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(event) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"}', text) + self.assertIn("1.0", text) + + def test_active_workers_gauge_decrements_on_complete(self): + start = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(start) + self.collector.on_task_execution_started(start) + + complete = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=100.0, output_size_bytes=None, + ) + self.collector.on_task_execution_completed(complete) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 1.0', text) + + def test_active_workers_gauge_decrements_on_failure(self): + start = TaskExecutionStarted(task_type="t", task_id="id", worker_id="w", workflow_instance_id="wf") + self.collector.on_task_execution_started(start) + + failure = TaskExecutionFailure( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", cause=ValueError("x"), duration_ms=50.0, + ) + self.collector.on_task_execution_failure(failure) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 0.0', text) + + def test_active_workers_gauge_floors_at_zero(self): + complete = TaskExecutionCompleted( + task_type="t", task_id="id", worker_id="w", + workflow_instance_id="wf", duration_ms=100.0, output_size_bytes=None, + ) + self.collector.on_task_execution_completed(complete) + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="t"} 0.0', text) + + def test_active_workers_tracks_multiple_task_types(self): + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="a", task_id="1", worker_id="w", workflow_instance_id="wf")) + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="a", task_id="2", worker_id="w", workflow_instance_id="wf")) + self.collector.on_task_execution_started( + TaskExecutionStarted(task_type="b", task_id="3", worker_id="w", workflow_instance_id="wf")) + + text = self._get_metrics_text() + self.assertIn('active_workers{taskType="a"} 2.0', text) + self.assertIn('active_workers{taskType="b"} 1.0', text) + class TestExceptionLabel(unittest.TestCase): """Test the _exception_label helper.""" diff --git a/tests/unit/telemetry/test_metrics_collector.py b/tests/unit/telemetry/test_metrics_collector.py index 5471b745a..3d1ee834d 100644 --- a/tests/unit/telemetry/test_metrics_collector.py +++ b/tests/unit/telemetry/test_metrics_collector.py @@ -500,6 +500,45 @@ def test_quantile_sliding_window(self): # Note: _calculate_percentile is not a public method and percentile calculation # is handled internally by prometheus_client Summary objects + # ========================================================================= + # active_workers tracking (base class counter, legacy no-op gauge) + # ========================================================================= + + def test_active_workers_count_tracks_in_base_class(self): + collector = MetricsCollector(self.metrics_settings) + + start = TaskExecutionStarted(task_type='test_task', task_id='t1', worker_id='w', workflow_instance_id='wf') + collector.on_task_execution_started(start) + collector.on_task_execution_started(start) + self.assertEqual(collector._active_worker_counts['test_task'], 2) + + complete = TaskExecutionCompleted( + task_type='test_task', task_id='t1', worker_id='w', + 
workflow_instance_id='wf', duration_ms=100.0, output_size_bytes=None, + ) + collector.on_task_execution_completed(complete) + self.assertEqual(collector._active_worker_counts['test_task'], 1) + + def test_active_workers_count_floors_at_zero(self): + collector = MetricsCollector(self.metrics_settings) + + complete = TaskExecutionCompleted( + task_type='test_task', task_id='t1', worker_id='w', + workflow_instance_id='wf', duration_ms=100.0, output_size_bytes=None, + ) + collector.on_task_execution_completed(complete) + self.assertEqual(collector._active_worker_counts['test_task'], 0) + + def test_active_workers_not_emitted_in_legacy_mode(self): + collector = MetricsCollector(self.metrics_settings) + + start = TaskExecutionStarted(task_type='test_task', task_id='t1', worker_id='w', workflow_instance_id='wf') + collector.on_task_execution_started(start) + + self._write_metrics(collector) + metrics_content = self._read_metrics_file() + self.assertNotIn('active_workers', metrics_content) + # ========================================================================= # Edge Cases and Boundary Conditions # ========================================================================= From 63974df9c27d02227ac001215ea60b3435f7b5c6 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 09:36:57 -0600 Subject: [PATCH 5/7] cleaner reporting on which metrics implementation is used in the test harness worker --- harness/main.py | 5 +---- .../telemetry/canonical_metrics_collector.py | 3 +++ .../client/telemetry/legacy_metrics_collector.py | 3 +++ .../client/telemetry/metrics_collector_base.py | 5 +++++ tests/unit/telemetry/test_metrics_factory.py | 14 ++++++++++++++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/harness/main.py b/harness/main.py index 2be7d8e6c..f556fabed 100644 --- a/harness/main.py +++ b/harness/main.py @@ -71,12 +71,9 @@ def main() -> None: metrics_port = env_int_or_default("HARNESS_METRICS_PORT", 9991) metrics_settings = MetricsSettings(http_port=metrics_port) - print(f"Prometheus metrics will be served on port {metrics_port}") - # Main-process MetricsCollector: writes workflow-client / HTTP metrics into - # the shared PROMETHEUS_MULTIPROC_DIR so they merge with worker subprocess - # metrics in the exported /metrics payload. 
metrics_collector = create_metrics_collector(metrics_settings) + print(f"Prometheus metrics server started on port {metrics_port} ({metrics_collector.collector_name()} metrics)") clients = OrkesClients(configuration, metrics_collector=metrics_collector) register_metadata(clients) diff --git a/src/conductor/client/telemetry/canonical_metrics_collector.py b/src/conductor/client/telemetry/canonical_metrics_collector.py index eaf876085..42562a8bb 100644 --- a/src/conductor/client/telemetry/canonical_metrics_collector.py +++ b/src/conductor/client/telemetry/canonical_metrics_collector.py @@ -29,6 +29,9 @@ class CanonicalMetricsCollector(MetricsCollectorBase): def __init__(self, settings: MetricsSettings): super().__init__(settings) + def collector_name(self) -> str: + return "canonical" + # ------------------------------------------------------------------ # Counters # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/legacy_metrics_collector.py b/src/conductor/client/telemetry/legacy_metrics_collector.py index 114f2a26d..418e56da0 100644 --- a/src/conductor/client/telemetry/legacy_metrics_collector.py +++ b/src/conductor/client/telemetry/legacy_metrics_collector.py @@ -26,6 +26,9 @@ def __init__(self, settings: MetricsSettings): self.quantile_metrics: Dict[str, Any] = {} self.quantile_data: Dict[str, deque] = {} + def collector_name(self) -> str: + return "legacy" + # ------------------------------------------------------------------ # Counters # ------------------------------------------------------------------ diff --git a/src/conductor/client/telemetry/metrics_collector_base.py b/src/conductor/client/telemetry/metrics_collector_base.py index c81b26f67..358012d8b 100644 --- a/src/conductor/client/telemetry/metrics_collector_base.py +++ b/src/conductor/client/telemetry/metrics_collector_base.py @@ -363,6 +363,11 @@ def _get_histogram(self, name, documentation, labelnames, buckets=None): # Subclasses implement each with real logic or pass (no-op). # ========================================================================= + @abc.abstractmethod + def collector_name(self) -> str: + """Return the name of this collector implementation ('legacy', 'canonical').""" + ... + @abc.abstractmethod def increment_task_poll(self, task_type: str) -> None: ... 
diff --git a/tests/unit/telemetry/test_metrics_factory.py b/tests/unit/telemetry/test_metrics_factory.py index 586047aa5..ebe17864c 100644 --- a/tests/unit/telemetry/test_metrics_factory.py +++ b/tests/unit/telemetry/test_metrics_factory.py @@ -61,6 +61,19 @@ def test_canonical_takes_priority_over_legacy(self): collector = create_metrics_collector(self.settings) self.assertIsInstance(collector, CanonicalMetricsCollector) + def test_legacy_collector_name(self): + """LegacyMetricsCollector.collector_name() returns 'legacy'.""" + collector = create_metrics_collector(self.settings) + self.assertEqual(collector.collector_name(), "legacy") + + def test_canonical_collector_name(self): + """CanonicalMetricsCollector.collector_name() returns 'canonical'.""" + os.environ["WORKER_CANONICAL_METRICS"] = "true" + collector = create_metrics_collector( + MetricsSettings(directory=tempfile.mkdtemp()) + ) + self.assertEqual(collector.collector_name(), "canonical") + def test_both_implementations_satisfy_same_interface(self): """Both implementations have the same public method surface.""" legacy = LegacyMetricsCollector(self.settings) @@ -70,6 +83,7 @@ def test_both_implementations_satisfy_same_interface(self): ) required_methods = [ + "collector_name", "increment_task_poll", "increment_task_poll_error", "increment_task_execution_started", From 9bca75401a43e9f9977496ea554f155c8d6aba2b Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 11:40:58 -0600 Subject: [PATCH 6/7] update docs regarding metrics --- METRICS.md | 527 ++++++++---------- README.md | 2 +- WORKER_CONFIGURATION.md | 7 +- docs/design/WORKER_DESIGN.md | 115 +--- .../design/WORKER_SDK_IMPLEMENTATION_GUIDE.md | 43 +- .../design/event_driven_interceptor_system.md | 403 ++------------ 6 files changed, 313 insertions(+), 784 deletions(-) diff --git a/METRICS.md b/METRICS.md index 2201d1c01..28266498d 100644 --- a/METRICS.md +++ b/METRICS.md @@ -1,333 +1,280 @@ -# Metrics Documentation - -The Conductor Python SDK includes built-in metrics collection using Prometheus to monitor worker performance, API requests, and task execution. 
- -## Table of Contents - -- [Quick Reference](#quick-reference) -- [Configuration](#configuration) -- [Metric Types](#metric-types) -- [Examples](#examples) - -## Quick Reference - -| Metric Name | Type | Labels | Description | -|------------|------|--------|-------------| -| `api_request_time_seconds` | Timer (quantile gauge) | `method`, `uri`, `status`, `quantile` | API request latency to Conductor server | -| `api_request_time_seconds_count` | Gauge | `method`, `uri`, `status` | Total number of API requests | -| `api_request_time_seconds_sum` | Gauge | `method`, `uri`, `status` | Total time spent in API requests | -| `task_poll_total` | Counter | `taskType` | Number of task poll attempts | -| `task_poll_time` | Gauge | `taskType` | Most recent poll duration (legacy) | -| `task_poll_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task poll latency distribution | -| `task_poll_time_seconds_count` | Gauge | `taskType`, `status` | Total number of poll attempts by status | -| `task_poll_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent polling | -| `task_execute_time` | Gauge | `taskType` | Most recent execution duration (legacy) | -| `task_execute_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task execution latency distribution | -| `task_execute_time_seconds_count` | Gauge | `taskType`, `status` | Total number of task executions by status | -| `task_execute_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent executing tasks | -| `task_execute_error_total` | Counter | `taskType`, `exception` | Number of task execution errors | -| `task_update_time_seconds` | Timer (quantile gauge) | `taskType`, `status`, `quantile` | Task update latency distribution | -| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Total number of task updates by status | -| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Total time spent updating tasks | -| `task_update_error_total` | Counter | `taskType`, `exception` | Number of task update errors | -| `task_result_size` | Gauge | `taskType` | Size of task result payload (bytes) | -| `task_execution_queue_full_total` | Counter | `taskType` | Number of times execution queue was full | -| `task_paused_total` | Counter | `taskType` | Number of polls while worker paused | -| `worker_restart_total` | Counter | `taskType` | Number of times TaskHandler restarted a worker subprocess | -| `external_payload_used_total` | Counter | `taskType`, `payloadType` | External payload storage usage count | -| `workflow_input_size` | Gauge | `workflowType`, `version` | Workflow input payload size (bytes) | -| `workflow_start_error_total` | Counter | `workflowType`, `exception` | Workflow start error count | - -### Label Values - -**`status`**: `SUCCESS`, `FAILURE` -**`method`**: `GET`, `POST`, `PUT`, `DELETE` -**`uri`**: API endpoint path (e.g., `/tasks/poll/batch/{taskType}`, `/tasks/update-v2`) -**`status` (HTTP)**: HTTP response code (`200`, `401`, `404`, `500`) or `error` -**`quantile`**: `0.5` (p50), `0.75` (p75), `0.9` (p90), `0.95` (p95), `0.99` (p99) -**`payloadType`**: `input`, `output` -**`exception`**: Exception type or error message - -### Example Metrics Output +# Python SDK Metrics -```prometheus -# API Request Metrics -api_request_time_seconds{method="GET",uri="/tasks/poll/batch/myTask",status="200",quantile="0.5"} 0.112 -api_request_time_seconds{method="GET",uri="/tasks/poll/batch/myTask",status="200",quantile="0.99"} 0.245 
-api_request_time_seconds_count{method="GET",uri="/tasks/poll/batch/myTask",status="200"} 1000.0 -api_request_time_seconds_sum{method="GET",uri="/tasks/poll/batch/myTask",status="200"} 114.5 - -# Task Poll Metrics -task_poll_total{taskType="myTask"} 10264.0 -task_poll_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.95"} 0.025 -task_poll_time_seconds_count{taskType="myTask",status="SUCCESS"} 1000.0 -task_poll_time_seconds_count{taskType="myTask",status="FAILURE"} 95.0 - -# Task Execution Metrics -task_execute_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.99"} 0.017 -task_execute_time_seconds_count{taskType="myTask",status="SUCCESS"} 120.0 -task_execute_error_total{taskType="myTask",exception="TimeoutError"} 3.0 - -# Task Update Metrics -task_update_time_seconds{taskType="myTask",status="SUCCESS",quantile="0.95"} 0.096 -task_update_time_seconds_count{taskType="myTask",status="SUCCESS"} 15.0 -``` +The Conductor Python SDK can expose Prometheus metrics for worker polling, task +execution, task updates, workflow starts, external payload usage, and generated +API-client HTTP calls. -## Configuration +The SDK currently has two mutually exclusive metric surfaces: -### Enabling Metrics +- **Legacy metrics** are the default. They preserve the pre-harmonization Python + SDK names and shapes, including sliding-window quantile gauges for timing + metrics. +- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They + use the cross-SDK canonical names, labels, units, and Prometheus histogram + shapes. -Metrics are enabled by providing a `MetricsSettings` object when creating a `TaskHandler`: +Only one collector is active in a worker process. The SDK does not emit legacy +and canonical metrics at the same time. -```python -from conductor.client.configuration.configuration import Configuration -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.automator.task_handler import TaskHandler - -# Configure metrics -metrics_settings = MetricsSettings( - directory='/path/to/metrics', # Directory where metrics file will be written - file_name='conductor_metrics.prom', # Metrics file name (default: 'conductor_metrics.prom') - update_interval=10 # Update interval in seconds (default: 10) -) +Metric names below are the names exposed in Prometheus text output. The Python +`prometheus_client` library appends `_total` to counters in exposition output. -# Configure Conductor connection -api_config = Configuration( - server_api_url='http://localhost:8080/api', - debug=False -) - -# Create task handler with metrics -with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, - workers=[...] -) as task_handler: - task_handler.start_processes() -``` - -### AsyncIO Workers +## Configuration -Usage with TaskHandler: +Enable metrics by passing `MetricsSettings` to `TaskHandler`. 
```python from conductor.client.automator.task_handler import TaskHandler +from conductor.client.configuration.configuration import Configuration +from conductor.client.configuration.settings.metrics_settings import MetricsSettings + +config = Configuration() +metrics = MetricsSettings( + directory="/tmp/conductor-metrics", + http_port=8000, +) with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, + configuration=config, + metrics_settings=metrics, scan_for_annotated_workers=True, - import_modules=['your_module'] ) as task_handler: task_handler.start_processes() task_handler.join_processes() ``` -### Metrics File Cleanup +With `http_port` set, the SDK starts a Prometheus-compatible HTTP endpoint: -For multiprocess workers using Prometheus multiprocess mode, clean the metrics directory on startup to avoid stale data: +```shell +curl http://localhost:8000/metrics +curl http://localhost:8000/health +``` + +Without `http_port`, the SDK writes Prometheus text output to +`{directory}/{file_name}` at `update_interval` seconds: ```python -import os -import shutil - -metrics_dir = '/path/to/metrics' -if os.path.exists(metrics_dir): - shutil.rmtree(metrics_dir) -os.makedirs(metrics_dir, exist_ok=True) - -metrics_settings = MetricsSettings( - directory=metrics_dir, - file_name='conductor_metrics.prom', - update_interval=10 +metrics = MetricsSettings( + directory="/tmp/conductor-metrics", + file_name="conductor_metrics.prom", + update_interval=10, + http_port=None, ) ``` +`MetricsSettings` cleans stale Prometheus multiprocess `.db` files by default. +Use a dedicated metrics directory per worker process group. -## Metric Types - -### Quantile Gauges (Timers) +## Selecting Canonical Metrics -All timing metrics use quantile gauges to track latency distribution: +Set `WORKER_CANONICAL_METRICS` before the worker starts: -- **Quantile labels**: Each metric includes 5 quantiles (p50, p75, p90, p95, p99) -- **Count suffix**: `{metric_name}_count` tracks total number of observations -- **Sum suffix**: `{metric_name}_sum` tracks total time spent - -**Example calculation (average):** -``` -average = task_poll_time_seconds_sum / task_poll_time_seconds_count -average = 18.75 / 1000.0 = 0.01875 seconds +```shell +WORKER_CANONICAL_METRICS=true python my_worker.py ``` -**Why quantiles instead of histograms?** -- More accurate percentile tracking with sliding window (last 1000 observations) -- No need to pre-configure bucket boundaries -- Lower memory footprint -- Direct percentile values without interpolation +Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other +value, or an unset variable, selects legacy metrics. The variable is read when +the metrics collector is created, so changing it requires a worker restart. + +## Canonical Metrics + +Canonical timing values are seconds. Canonical size values are bytes. Label +names use camelCase. + +Metrics are created lazily. A metric appears in `/metrics` only after the +corresponding worker event or collector method records it. Some low-level +surface metrics, such as ack, queue-full, paused, and uncaught-exception +counters, may not appear in normal worker runs unless that path is exercised. + +### Canonical Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. 
| +| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. | +| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. | +| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. | +| `thread_uncaught_exceptions_total` | `exception` | Collector surface for uncaught worker-thread exceptions. | +| `worker_restart_total` | `taskType` | Python-only counter for TaskHandler subprocess restarts. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Canonical Time Histograms + +All canonical time histograms use buckets: +`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`, +`5`, `10`. + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. | +| `http_api_client_request_seconds` | `method`, `uri`, `status` | Generated API-client HTTP request latency. | + +Each histogram exposes Prometheus series such as: -### Sliding Window - -Quantile metrics use a sliding window of the last 1000 observations to calculate percentiles. This provides: -- Recent performance data (not cumulative) -- Accurate percentile estimation -- Bounded memory usage - -## Examples - -### Querying Metrics with PromQL - -**Average API request latency:** -```promql -rate(api_request_time_seconds_sum[5m]) / rate(api_request_time_seconds_count[5m]) +```prometheus +task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42.0 +task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50.0 +task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3 ``` -**API error rate:** -```promql -sum(rate(api_request_time_seconds_count{status=~"4..|5.."}[5m])) -/ -sum(rate(api_request_time_seconds_count[5m])) -``` +### Canonical Size Histograms + +All canonical size histograms use buckets: +`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`. + +| Metric | Labels | Description | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. | + +### Canonical Gauges + +| Metric | Labels | Description | +|---|---|---| +| `active_workers` | `taskType` | Current number of workers actively executing a task. | + +## Legacy Metrics + +Legacy mode is the default so existing dashboards and alerts continue to work. +Timing metrics are sliding-window quantile gauges over the latest 1,000 +observations. 
Legacy timing metrics also expose `_count` and `_sum` gauge
+series for the current sliding window.
+
+As in canonical mode, metrics are created lazily, and rare or surface-only
+counters appear only when the corresponding code path records them.
+
+### Legacy Counters
+
+| Metric | Labels | Description |
+|---|---|---|
+| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. |
+| `task_execute_error_total` | `taskType`, `exception` | Task execution errors. `exception` is `str(exception)`. |
+| `task_update_error_total` | `taskType`, `exception` | Task update errors. `exception` is `str(exception)`. |
+| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. `exception` is `str(exception)`. |
+| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. |
+| `task_execution_queue_full_total` | `taskType` | Collector surface for execution queue saturation events. |
+| `task_paused_total` | `taskType` | Collector surface for polls skipped while the worker is paused. |
+| `thread_uncaught_exceptions_total` | none | Collector surface for uncaught worker-thread exceptions. |
+| `worker_restart_total` | `taskType` | TaskHandler subprocess restarts. |
+| `external_payload_used_total` | `entityName`, `operation`, `payload_type` | External payload storage usage. |
+| `workflow_start_error_total` | `workflowType`, `exception` | Workflow start errors. `exception` is `str(exception)`. |
+
+Legacy mode does not emit `task_poll_error_total`,
+`task_execution_started_total`, or `active_workers`.
+
+### Legacy Time Metrics
+
+| Metric | Type | Labels | Description |
+|---|---|---|---|
+| `task_poll_time` | Gauge | `taskType` | Most recent poll duration, in seconds. |
+| `task_poll_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window poll latency quantiles. |
+| `task_poll_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window poll observation count. |
+| `task_poll_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window poll duration sum. |
+| `task_execute_time` | Gauge | `taskType` | Most recent task execution duration, in seconds. |
+| `task_execute_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window execution latency quantiles. |
+| `task_execute_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window execution observation count. |
+| `task_execute_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window execution duration sum. |
+| `task_update_time_seconds` | Quantile gauge | `taskType`, `status`, `quantile` | Sliding-window task update latency quantiles. |
+| `task_update_time_seconds_count` | Gauge | `taskType`, `status` | Sliding-window task update observation count. |
+| `task_update_time_seconds_sum` | Gauge | `taskType`, `status` | Sliding-window task update duration sum. |
+| `http_api_client_request` | Quantile gauge | `method`, `uri`, `status`, `quantile` | Sliding-window API-client request latency quantiles. |
+| `http_api_client_request_count` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request observation count. |
+| `http_api_client_request_sum` | Gauge | `method`, `uri`, `status` | Sliding-window API-client request duration sum. |
+
+### Legacy Size Gauges
+
+| Metric | Labels | Description |
+|---|---|---|
+| `task_result_size` | `taskType` | Most recent serialized task result output size, in bytes. 
|
+| `workflow_input_size` | `workflowType`, `version` | Most recent serialized workflow input size, in bytes. |
+
+## Labels
+
+| Label | Used by | Values |
+|---|---|---|
+| `taskType` | Worker metrics | Task definition name. |
+| `workflowType` | Workflow metrics | Workflow definition name. |
+| `version` | `workflow_input_size`, `workflow_input_size_bytes` | Workflow version as a string. If absent, the SDK uses `1`. |
+| `status` | Task time and HTTP metrics | `SUCCESS` or `FAILURE` for task time metrics. For HTTP metrics, the response code as a string, the exception's `status` or `code` attribute on failure, or `error` when neither is available. |
+| `exception` | Error counters | Legacy uses `str(exception)`. Canonical uses the exception class name, such as `TimeoutError`. |
+| `entityName` | `external_payload_used_total` | Task type or workflow name associated with the external payload. |
+| `operation` | `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. |
+| `payload_type` | Legacy `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. |
+| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. |
+| `method` | HTTP metrics | HTTP verb. |
+| `uri` | HTTP metrics | Request path passed by the generated API client. |
+| `quantile` | Legacy time metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. |
+
+## Migrating From Legacy to Canonical
+
+Canonical mode is opt-in during the deprecation period. Before switching a
+production worker, update dashboards and alerts against a staging worker with
+`WORKER_CANONICAL_METRICS=true`.
+
+Key changes:
+
+- Time and size distribution metrics are real Prometheus histograms in
+  canonical mode. Query `_bucket` series with `histogram_quantile()` instead of
+  reading `{quantile="..."}` gauges.
+- Legacy last-value gauges `task_poll_time`, `task_execute_time`,
+  `task_result_size`, and `workflow_input_size` are not emitted in canonical
+  mode.
+- Canonical adds `task_poll_error_total`, `task_execution_started_total`, and
+  `active_workers`.
+- `external_payload_used_total` changes label `payload_type` to `payloadType`.
+- Canonical `exception` labels are bounded to exception class names. Legacy
+  error counters may include raw exception messages. 
+ +Common replacements: + +| Legacy | Canonical | +|---|---| +| `task_poll_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | +| `task_execute_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | +| `task_update_time_seconds{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_update_time_seconds_bucket[5m])))` | +| `http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | +| `task_result_size` | `task_result_size_bytes_bucket`, `_count`, and `_sum` | +| `workflow_input_size` | `workflow_input_size_bytes_bucket`, `_count`, and `_sum` | +| `external_payload_used_total{payload_type="TASK_OUTPUT"}` | `external_payload_used_total{payloadType="TASK_OUTPUT"}` | + +Average latency queries continue to use `_sum` divided by `_count`, but the +canonical series are cumulative histogram counters: -**Task poll success rate:** ```promql -sum(rate(task_poll_time_seconds_count{status="SUCCESS"}[5m])) +sum(rate(task_execute_time_seconds_sum[5m])) by (taskType) / -sum(rate(task_poll_time_seconds_count[5m])) -``` - -**p95 task execution time:** -```promql -task_execute_time_seconds{quantile="0.95"} -``` - -**Slowest API endpoints (p99):** -```promql -topk(10, api_request_time_seconds{quantile="0.99"}) -``` - -### Complete Example - -```python -import os -import shutil -from conductor.client.configuration.configuration import Configuration -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.automator.task_handler import TaskHandler -from conductor.client.worker.worker_interface import WorkerInterface - -# Clean metrics directory -metrics_dir = os.path.join(os.path.expanduser('~'), 'conductor_metrics') -if os.path.exists(metrics_dir): - shutil.rmtree(metrics_dir) -os.makedirs(metrics_dir, exist_ok=True) - -# Configure metrics -metrics_settings = MetricsSettings( - directory=metrics_dir, - file_name='conductor_metrics.prom', - update_interval=10 # Update file every 10 seconds -) - -# Configure Conductor -api_config = Configuration( - server_api_url='http://localhost:8080/api', - debug=False -) - -# Define worker -class MyWorker(WorkerInterface): - def execute(self, task): - return {'status': 'completed'} - - def get_task_definition_name(self): - return 'my_task' - -# Start with metrics -with TaskHandler( - configuration=api_config, - metrics_settings=metrics_settings, - workers=[MyWorker()] -) as task_handler: - task_handler.start_processes() +sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ``` -### Scraping with Prometheus - -Configure Prometheus to scrape the metrics file: - -```yaml -# prometheus.yml -scrape_configs: - - job_name: 'conductor-python-sdk' - static_configs: - - targets: ['localhost:8000'] # Use file_sd or custom exporter - metric_relabel_configs: - - source_labels: [taskType] - target_label: task_type -``` - -**Note:** Since metrics are written to a file, you'll need to either: -1. Use Prometheus's `textfile` collector with Node Exporter -2. Create a simple HTTP server to expose the metrics file -3. 
Use a custom exporter to read and serve the file - -### Example HTTP Metrics Server - -```python -from http.server import HTTPServer, SimpleHTTPRequestHandler -import os - -class MetricsHandler(SimpleHTTPRequestHandler): - def do_GET(self): - if self.path == '/metrics': - metrics_file = '/path/to/conductor_metrics.prom' - if os.path.exists(metrics_file): - with open(metrics_file, 'rb') as f: - content = f.read() - self.send_response(200) - self.send_header('Content-Type', 'text/plain; version=0.0.4') - self.end_headers() - self.wfile.write(content) - else: - self.send_response(404) - self.end_headers() - else: - self.send_response(404) - self.end_headers() - -# Run server -httpd = HTTPServer(('0.0.0.0', 8000), MetricsHandler) -httpd.serve_forever() -``` - -## Best Practices +## Troubleshooting -1. **Clean metrics directory on startup** to avoid stale multiprocess metrics -2. **Monitor disk space** as metrics files can grow with many task types -3. **Use appropriate update_interval** (10-60 seconds recommended) -4. **Set up alerts** on error rates and high latencies -5. **Monitor queue saturation** (`task_execution_queue_full_total`) for backpressure -6. **Track API errors** by status code to identify authentication or server issues -7. **Use p95/p99 latencies** for SLO monitoring rather than averages +### Metrics Are Empty -## Troubleshooting +- Verify that `metrics_settings` is passed to `TaskHandler`. +- Verify workers have polled or executed tasks. Metrics are created lazily when + the relevant event occurs. +- Check that the metrics directory is writable. -### Metrics file is empty -- Ensure `MetricsCollector` is registered as an event listener -- Check that workers are actually polling and executing tasks -- Verify the metrics directory has write permissions +### Stale or Unexpected Series -### Stale metrics after restart -- Clean the metrics directory on startup (see Configuration section) -- Prometheus's `multiprocess` mode requires cleanup between runs +- Use a dedicated `directory` for each worker process group. +- Leave `clean_directory=True` on `MetricsSettings` unless another process owns + the same Prometheus multiprocess directory. +- Restart workers after changing `WORKER_CANONICAL_METRICS`. -### High memory usage -- Reduce the sliding window size (default: 1000 observations) -- Increase `update_interval` to write less frequently -- Limit the number of unique label combinations +### High Cardinality -### Missing metrics -- Verify `metrics_settings` is passed to TaskHandler -- Check that the SDK version supports the metric you're looking for -- Ensure workers are properly registered and running +- Prefer canonical mode for bounded `exception` labels. +- Watch the `uri` label on HTTP metrics. The Python SDK records the request path + available at the generated API-client call site. +- Avoid embedding user identifiers or unbounded values in task type, workflow + type, or external payload labels. diff --git a/README.md b/README.md index 958dfbe35..0ea4e91ec 100644 --- a/README.md +++ b/README.md @@ -320,7 +320,7 @@ with TaskHandler(configuration=config, metrics_settings=metrics, scan_for_annota curl http://localhost:8000/metrics ``` -See [examples/metrics_example.py](examples/metrics_example.py) and [METRICS.md](METRICS.md) for details on all tracked metrics. +Legacy metrics are emitted by default. Set `WORKER_CANONICAL_METRICS=true` before starting workers to use the canonical metric catalog. 
See [examples/metrics_example.py](examples/metrics_example.py) and [METRICS.md](METRICS.md) for the full legacy and canonical reference. ### Managing Workflow Executions diff --git a/WORKER_CONFIGURATION.md b/WORKER_CONFIGURATION.md index 9d8ab1a01..39bb0bc28 100644 --- a/WORKER_CONFIGURATION.md +++ b/WORKER_CONFIGURATION.md @@ -330,7 +330,6 @@ export conductor.worker.process_order.paused=true When a worker is paused: - It stops polling for new tasks - Already-executing tasks complete normally -- The `task_paused_total` metric is incremented for each skipped poll - No code changes or process restarts required **Use cases:** @@ -346,11 +345,7 @@ unset conductor.worker.all.paused export conductor.worker.all.paused=false ``` -**Monitor paused workers** using the `task_paused_total` metric: -```promql -# Check how many times workers were paused -task_paused_total{taskType="process_order"} -``` +See [METRICS.md](METRICS.md) for the current Python SDK metrics catalog. ### Multi-Region Deployment diff --git a/docs/design/WORKER_DESIGN.md b/docs/design/WORKER_DESIGN.md index 98bcf62ba..fdc0b1af4 100644 --- a/docs/design/WORKER_DESIGN.md +++ b/docs/design/WORKER_DESIGN.md @@ -767,114 +767,13 @@ Workers package - all worker modules auto-discovered ## Metrics & Monitoring -The SDK provides comprehensive Prometheus metrics collection with two deployment modes: +This design document describes how worker events flow through the SDK. The +current user-facing metrics setup, legacy and canonical metric catalogs, +`WORKER_CANONICAL_METRICS` behavior, Prometheus examples, and migration guidance +are maintained in [`../../METRICS.md`](../../METRICS.md). -### Configuration - -**HTTP Mode (Recommended - Metrics served from memory):** -```python -from conductor.client.configuration.settings.metrics_settings import MetricsSettings - -metrics_settings = MetricsSettings( - directory="/tmp/conductor-metrics", # .db files for multiprocess coordination - update_interval=0.1, # Update every 100ms - http_port=8000 # Expose metrics via HTTP -) - -with TaskHandler( - configuration=config, - metrics_settings=metrics_settings -) as handler: - handler.start_processes() -``` - -**File Mode (Metrics written to file):** -```python -metrics_settings = MetricsSettings( - directory="/tmp/conductor-metrics", - file_name="metrics.prom", - update_interval=1.0, - http_port=None # No HTTP server - write to file instead -) -``` - -### Modes - -| Mode | HTTP Server | File Writes | Use Case | -|------|-------------|-------------|----------| -| HTTP (`http_port` set) | ✅ Built-in | ❌ Disabled | Prometheus scraping, production | -| File (`http_port=None`) | ❌ Disabled | ✅ Enabled | File-based monitoring, testing | - -**HTTP Mode Benefits:** -- Metrics served directly from memory (no file I/O) -- Built-in HTTP server with `/metrics` and `/health` endpoints -- Automatic aggregation across worker processes (no PID labels) -- Ready for Prometheus scraping out-of-the-box - -### Key Metrics - -**Task Metrics:** -- `task_poll_time_seconds{taskType,quantile}` - Poll latency (includes batch polling) -- `task_execute_time_seconds{taskType,quantile}` - Actual execution time (async tasks: from submission to completion) -- `task_execute_error_total{taskType,exception}` - Execution errors by type -- `task_poll_total{taskType}` - Total poll count -- `task_result_size_bytes{taskType,quantile}` - Task output size - -**API Metrics:** -- `http_api_client_request{method,uri,status,quantile}` - API request latency -- 
`http_api_client_request_count{method,uri,status}` - Request count by endpoint -- `http_api_client_request_sum{method,uri,status}` - Total request time - -**Labels:** -- `taskType`: Task definition name -- `method`: HTTP method (GET, POST, PUT) -- `uri`: API endpoint path -- `status`: HTTP status code -- `exception`: Exception type (for errors) -- `quantile`: 0.5, 0.75, 0.9, 0.95, 0.99 - -**Important Notes:** -- **No PID labels**: Metrics are automatically aggregated across processes -- **Async execution time**: Includes actual execution time, not just coroutine submission time -- **Multiprocess safe**: Uses SQLite .db files in `directory` for coordination - -### Prometheus Integration - -**Scrape Config:** -```yaml -scrape_configs: - - job_name: 'conductor-workers' - static_configs: - - targets: ['localhost:8000'] - scrape_interval: 15s -``` - -**Accessing Metrics:** -```bash -# Metrics endpoint -curl http://localhost:8000/metrics - -# Health check -curl http://localhost:8000/health - -# Watch specific metric -watch -n 1 'curl -s http://localhost:8000/metrics | grep task_execute_time_seconds' -``` - -**PromQL Examples:** -```promql -# Average execution time -rate(task_execute_time_seconds_sum[5m]) / rate(task_execute_time_seconds_count[5m]) - -# Success rate -sum(rate(task_execute_time_seconds_count{status="SUCCESS"}[5m])) / sum(rate(task_execute_time_seconds_count[5m])) - -# p95 latency -task_execute_time_seconds{quantile="0.95"} - -# Error rate -sum(rate(task_execute_error_total[5m])) by (taskType) -``` +Keep metric names and PromQL examples out of this design document so the SDK has +one source of truth for legacy and canonical metrics. --- @@ -1264,8 +1163,8 @@ class CostTracker(TaskRunnerEventsListener): ```python handler = TaskHandler( configuration=config, + metrics_settings=metrics_settings, event_listeners=[ - PrometheusMetricsCollector(), SLAMonitor(threshold_ms=5000), CostTracker(cost_per_second={'ml_task': 0.05}), CustomAuditLogger() diff --git a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md index 3f28504df..8c2db407a 100644 --- a/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md +++ b/docs/design/WORKER_SDK_IMPLEMENTATION_GUIDE.md @@ -1789,44 +1789,15 @@ Response: void ## 15. Metrics & Monitoring -### 15.1 Required Metrics +The Python SDK's current metrics behavior is documented in +[`../../METRICS.md`](../../METRICS.md). 
That file is the source of truth for: -**Via Event System (Recommended):** +- Enabling metrics with `MetricsSettings` +- Selecting canonical metrics with `WORKER_CANONICAL_METRICS` +- The complete legacy and canonical Prometheus catalogs +- Migration guidance from legacy quantile gauges to canonical histograms -Implement MetricsCollector as EventListener: - -``` -class MetricsCollector implements TaskRunnerEventsListener { - on_poll_started(event): - increment_counter("task_poll_total", labels={taskType: event.taskType}) - - on_poll_completed(event): - record_histogram("task_poll_time_seconds", event.durationMs / 1000) - increment_counter("task_poll_total", labels={taskType: event.taskType}) - - on_task_execution_completed(event): - record_histogram("task_execute_time_seconds", event.durationMs / 1000) - record_histogram("task_result_size_bytes", event.outputSizeBytes) - - on_task_execution_failure(event): - increment_counter("task_execute_error_total", - labels={taskType: event.taskType, exception: event.cause.type}) - - on_task_update_failure(event): - increment_counter("task_update_failed_total", - labels={taskType: event.taskType}) - // CRITICAL: Alert operations team -} -``` - -**Metric Names (Prometheus format):** -- `task_poll_total{taskType}` -- `task_poll_time_seconds{taskType,quantile}` -- `task_execute_time_seconds{taskType,quantile}` -- `task_execute_error_total{taskType,exception}` -- `task_result_size_bytes{taskType,quantile}` -- `task_update_error_total{taskType,exception}` -- `task_update_failed_total{taskType}` ← CRITICAL metric +Do not duplicate metric names or PromQL examples in this implementation guide. --- diff --git a/docs/design/event_driven_interceptor_system.md b/docs/design/event_driven_interceptor_system.md index 011bdb85d..ce01b08b9 100644 --- a/docs/design/event_driven_interceptor_system.md +++ b/docs/design/event_driven_interceptor_system.md @@ -1,5 +1,11 @@ # Event-Driven Interceptor System - Design Document +> Historical design note: this document describes the event-driven interceptor +> design and includes some pre-harmonization metrics examples. The current +> Python SDK metrics setup, complete legacy and canonical catalogs, +> `WORKER_CANONICAL_METRICS` behavior, and migration guidance are maintained in +> [`../../METRICS.md`](../../METRICS.md). + ## Table of Contents - [Overview](#overview) - [Current State Analysis](#current-state-analysis) @@ -53,28 +59,10 @@ Design and implement an event-driven interceptor system that: ### Existing Metrics System -**Location**: `src/conductor/client/telemetry/metrics_collector.py` - -```python -class MetricsCollector: - def __init__(self, settings: MetricsSettings): - os.environ["PROMETHEUS_MULTIPROC_DIR"] = settings.directory - MultiProcessCollector(self.registry) - - def increment_task_poll(self, task_type: str) -> None: - self.__increment_counter( - name=MetricName.TASK_POLL, - documentation=MetricDocumentation.TASK_POLL, - labels={MetricLabel.TASK_TYPE: task_type} - ) -``` - -**Current Usage** in `task_runner_asyncio.py`: - -```python -if self.metrics_collector is not None: - self.metrics_collector.increment_task_poll(task_definition_name) -``` +The original design coupled task runner code directly to a metrics collector. +The current implementation now routes metrics through telemetry collector +classes and worker/client events. See [`../../METRICS.md`](../../METRICS.md) for +current setup and metric names. 
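+
+A minimal sketch of that flow, hedged: it assumes the
+`conductor.client.events.task_runner_events` module path shown in the file
+layout later in this document, and it calls one collector handler directly,
+where a running worker publishes through the event dispatcher instead.
+
+```python
+from conductor.client.configuration.settings.metrics_settings import MetricsSettings
+from conductor.client.events.task_runner_events import TaskExecutionStarted
+from conductor.client.telemetry.metrics_factory import create_metrics_collector
+
+# The factory returns the legacy or canonical collector depending on
+# WORKER_CANONICAL_METRICS; both implement the event listener protocols.
+collector = create_metrics_collector(MetricsSettings(directory="/tmp/conductor-metrics"))
+
+# Handlers on the shared collector base class consume events; this one bumps
+# the per-task-type active-worker count (and, in canonical mode, the
+# task_execution_started_total counter). Field values are illustrative.
+collector.on_task_execution_started(
+    TaskExecutionStarted(
+        task_type="my_task",
+        task_id="task-1",
+        worker_id="worker-1",
+        workflow_instance_id="wf-1",
+    )
+)
+```
+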
### Problems with Current Approach @@ -86,18 +74,10 @@ if self.metrics_collector is not None: | Limited data | Can't access full context | Medium | | No filtering | All-or-nothing | Low | -### Available Metrics (Current) +### Current Metrics Reference -**Counters:** -- `task_poll`, `task_poll_error`, `task_execution_queue_full` -- `task_execute_error`, `task_ack_error`, `task_ack_failed` -- `task_update_error`, `task_paused` -- `thread_uncaught_exceptions`, `workflow_start_error` -- `external_payload_used` - -**Gauges:** -- `task_poll_time`, `task_execute_time` -- `task_result_size`, `workflow_input_size` +The current Prometheus metric names, labels, and legacy/canonical mode behavior +are maintained in [`../../METRICS.md`](../../METRICS.md). --- @@ -760,13 +740,12 @@ class TaskResultSize(TaskClientEvent): ## Metrics Collection Flow -### Old Flow (Current) +### Earlier Direct Flow ``` TaskRunner.poll_tasks() - └─> metrics_collector.increment_task_poll(task_type) - └─> counter.labels(task_type).inc() - └─> Prometheus registry + └─> direct metrics collector call + └─> Prometheus registry ``` **Problems:** @@ -774,14 +753,13 @@ TaskRunner.poll_tasks() - Synchronous call - Can't add custom logic without modifying SDK -### New Flow (Proposed) +### Event-Driven Flow ``` TaskRunner.poll_tasks() └─> event_dispatcher.publish(PollStarted(...)) └─> asyncio.create_task(dispatch_to_listeners()) - ├─> PrometheusCollector.on_poll_started() - │ └─> counter.labels(task_type).inc() + ├─> Metrics listener ├─> DatadogCollector.on_poll_started() │ └─> datadog.increment('poll.started') └─> CustomListener.on_poll_started() @@ -796,15 +774,7 @@ TaskRunner.poll_tasks() ### Integration with TaskRunnerAsyncIO -**Current code** (`task_runner_asyncio.py`): - -```python -# OLD - Direct metrics call -if self.metrics_collector is not None: - self.metrics_collector.increment_task_poll(task_definition_name) -``` - -**New code** (with events): +**Event publishing:** ```python # NEW - Event publishing @@ -815,211 +785,12 @@ self.event_dispatcher.publish(PollStarted( )) ``` -### Adapter Pattern for Backward Compatibility - -**Location**: `src/conductor/client/telemetry/metrics_collector_adapter.py` - -```python -""" -Adapter to make old MetricsCollector work with new event system. -""" - -from conductor.client.telemetry.metrics_collector import MetricsCollector as OldMetricsCollector -from conductor.client.events.listeners import MetricsCollector as NewMetricsCollector -from conductor.client.events.task_runner_events import * - - -class MetricsCollectorAdapter(NewMetricsCollector): - """ - Adapter that wraps old MetricsCollector and implements new protocol. - - This allows existing metrics collection to work with new event system - without any code changes. 
- """ - - def __init__(self, old_collector: OldMetricsCollector): - self.collector = old_collector +### Current Metrics Implementation - def on_poll_started(self, event: PollStarted) -> None: - self.collector.increment_task_poll(event.task_type) - - def on_poll_completed(self, event: PollCompleted) -> None: - self.collector.record_task_poll_time(event.task_type, event.duration_ms / 1000.0) - - def on_poll_failure(self, event: PollFailure) -> None: - # Create exception-like object for old API - error = type(event.error_type, (Exception,), {})() - self.collector.increment_task_poll_error(event.task_type, error) - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - # Old collector doesn't have this metric - pass - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - self.collector.record_task_execute_time( - event.task_type, - event.duration_ms / 1000.0 - ) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - error = type(event.error_type, (Exception,), {})() - self.collector.increment_task_execution_error(event.task_type, error) - - # Implement other protocol methods... -``` - -### New Prometheus Collector (Reference Implementation) - -**Location**: `src/conductor/client/telemetry/prometheus/prometheus_metrics_collector.py` - -```python -""" -Reference implementation: Prometheus metrics collector using event system. -""" - -from typing import Optional -from prometheus_client import Counter, Histogram, CollectorRegistry -from conductor.client.events.listeners import MetricsCollector -from conductor.client.events.task_runner_events import * -from conductor.client.events.workflow_events import * -from conductor.client.events.task_client_events import * - - -class PrometheusMetricsCollector(MetricsCollector): - """ - Prometheus metrics collector implementing the MetricsCollector protocol. - - Exposes metrics in Prometheus format for scraping. 
- - Usage: - collector = PrometheusMetricsCollector() - - # Register with task handler - handler = TaskHandler( - configuration=config, - event_listeners=[collector] - ) - """ - - def __init__( - self, - registry: Optional[CollectorRegistry] = None, - namespace: str = "conductor" - ): - self.registry = registry or CollectorRegistry() - self.namespace = namespace - - # Define metrics - self._poll_started_counter = Counter( - f'{namespace}_task_poll_started_total', - 'Total number of task polling attempts', - ['task_type', 'worker_id'], - registry=self.registry - ) - - self._poll_duration_histogram = Histogram( - f'{namespace}_task_poll_duration_seconds', - 'Task polling duration in seconds', - ['task_type', 'status'], # status: success, failure - registry=self.registry - ) - - self._task_execution_started_counter = Counter( - f'{namespace}_task_execution_started_total', - 'Total number of task executions started', - ['task_type', 'worker_id'], - registry=self.registry - ) - - self._task_execution_duration_histogram = Histogram( - f'{namespace}_task_execution_duration_seconds', - 'Task execution duration in seconds', - ['task_type', 'status'], # status: completed, failed - registry=self.registry - ) - - self._task_execution_failure_counter = Counter( - f'{namespace}_task_execution_failures_total', - 'Total number of task execution failures', - ['task_type', 'error_type', 'retryable'], - registry=self.registry - ) - - self._workflow_started_counter = Counter( - f'{namespace}_workflow_started_total', - 'Total number of workflow start attempts', - ['workflow_name', 'status'], # status: success, failure - registry=self.registry - ) - - # Task Runner Event Handlers - - def on_poll_started(self, event: PollStarted) -> None: - self._poll_started_counter.labels( - task_type=event.task_type, - worker_id=event.worker_id - ).inc() - - def on_poll_completed(self, event: PollCompleted) -> None: - self._poll_duration_histogram.labels( - task_type=event.task_type, - status='success' - ).observe(event.duration_ms / 1000.0) - - def on_poll_failure(self, event: PollFailure) -> None: - self._poll_duration_histogram.labels( - task_type=event.task_type, - status='failure' - ).observe(event.duration_ms / 1000.0) - - def on_task_execution_started(self, event: TaskExecutionStarted) -> None: - self._task_execution_started_counter.labels( - task_type=event.task_type, - worker_id=event.worker_id - ).inc() - - def on_task_execution_completed(self, event: TaskExecutionCompleted) -> None: - self._task_execution_duration_histogram.labels( - task_type=event.task_type, - status='completed' - ).observe(event.duration_ms / 1000.0) - - def on_task_execution_failure(self, event: TaskExecutionFailure) -> None: - self._task_execution_duration_histogram.labels( - task_type=event.task_type, - status='failed' - ).observe(event.duration_ms / 1000.0) - - self._task_execution_failure_counter.labels( - task_type=event.task_type, - error_type=event.error_type, - retryable=str(event.is_retryable) - ).inc() - - # Workflow Event Handlers - - def on_workflow_started(self, event: WorkflowStarted) -> None: - self._workflow_started_counter.labels( - workflow_name=event.workflow_name, - status='success' if event.success else 'failure' - ).inc() - - def on_workflow_input_size(self, event: WorkflowInputSize) -> None: - # Could add histogram for input sizes - pass - - def on_workflow_payload_used(self, event: WorkflowPayloadUsed) -> None: - # Could track external storage usage - pass - - # Task Client Event Handlers - - def 
on_task_payload_used(self, event: TaskPayloadUsed) -> None: - pass - - def on_task_result_size(self, event: TaskResultSize) -> None: - pass -``` +The implemented SDK metrics collector is selected by `MetricsSettings` and, for +canonical mode, `WORKER_CANONICAL_METRICS`. It listens to worker and client +events through the current telemetry collector classes. See +[`../../METRICS.md`](../../METRICS.md) for current setup and metric names. --- @@ -1045,50 +816,45 @@ class PrometheusMetricsCollector(MetricsCollector): **Tasks:** 1. Add event_dispatcher to TaskRunnerAsyncIO 2. Add event_dispatcher to TaskRunner (multiprocessing) -3. Publish events alongside existing metrics calls -4. Create MetricsCollectorAdapter +3. Publish events for worker lifecycle changes +4. Register metrics collectors as event listeners 5. Integration tests -**Backward Compatible**: Both old and new APIs work simultaneously - -```python -# Both work at the same time -if self.metrics_collector: - self.metrics_collector.increment_task_poll(task_type) # OLD - -self.event_dispatcher.publish(PollStarted(...)) # NEW -``` +**Backward Compatible**: Existing metrics setup continues to work while event +publishing is introduced. ### Phase 3: Reference Implementation (Week 3) **Goal**: New Prometheus collector using events **Tasks:** -1. Implement PrometheusMetricsCollector (new) +1. Implement the built-in metrics collector 2. Create example collectors (Datadog, CloudWatch) 3. Documentation and examples 4. Performance benchmarks -**Backward Compatible**: Users can choose old or new collector +**Backward Compatible**: Users can select legacy or canonical metrics through +the documented metrics factory behavior. ### Phase 4: Deprecation (Future Release) -**Goal**: Mark old API as deprecated +**Goal**: Deprecate pre-harmonization metric shapes when canonical metrics +become the default. **Tasks:** -1. Add deprecation warnings to old MetricsCollector -2. Update all examples to use new API -3. Migration guide +1. Announce canonical metrics as the default +2. Update examples to use documented metrics setup +3. Maintain migration guidance in `METRICS.md` **Timeline**: 6 months deprecation period ### Phase 5: Removal (Future Major Version) -**Goal**: Remove old metrics API +**Goal**: Remove legacy metric shapes in a future major version. **Tasks:** -1. Remove old MetricsCollector implementation -2. Remove adapter +1. Remove legacy collector implementation +2. Keep `METRICS.md` aligned with the released surface 3. 
Update major version **Timeline**: Next major version (2.0.0) @@ -1132,9 +898,9 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW - [ ] Publish events (same as AsyncIO) - [ ] Handle multiprocess event publishing -**Day 4: Adapter Pattern** -- [ ] Implement MetricsCollectorAdapter -- [ ] Tests for adapter +**Day 4: Compatibility** +- [ ] Verify existing metrics setup continues to work +- [ ] Tests for compatibility behavior **Day 5: Integration Tests** - [ ] End-to-end tests with events @@ -1143,8 +909,8 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW ### Week 3: Reference Implementation & Examples -**Day 1-2: New Prometheus Collector** -- [ ] Implement PrometheusMetricsCollector using events +**Day 1-2: Built-in Metrics Collector** +- [ ] Implement metrics collection using events - [ ] HTTP server for metrics endpoint - [ ] Tests @@ -1163,51 +929,29 @@ self.event_dispatcher.publish(PollStarted(...)) # NEW ## Examples -### Example 1: Basic Usage (Prometheus) +### Example 1: Current Metrics Usage ```python from conductor.client.configuration.configuration import Configuration from conductor.client.automator.task_handler import TaskHandler -from conductor.client.telemetry.prometheus.prometheus_metrics_collector import ( - PrometheusMetricsCollector -) +from conductor.client.configuration.settings.metrics_settings import MetricsSettings config = Configuration() - -# Create Prometheus collector -prometheus = PrometheusMetricsCollector() +metrics_settings = MetricsSettings(directory="/tmp/conductor-metrics", http_port=8000) # Create task handler with metrics with TaskHandler( configuration=config, - event_listeners=[prometheus] # NEW API + metrics_settings=metrics_settings, ) as handler: handler.start_processes() handler.join_processes() ``` -### Example 2: Multiple Collectors - -```python -from conductor.client.telemetry.prometheus.prometheus_metrics_collector import ( - PrometheusMetricsCollector -) -from my_app.metrics.datadog_collector import DatadogCollector -from my_app.monitoring.sla_monitor import SLAMonitor - -# Create multiple collectors -prometheus = PrometheusMetricsCollector() -datadog = DatadogCollector(api_key=os.getenv('DATADOG_API_KEY')) -sla_monitor = SLAMonitor(thresholds={'critical_task': 30.0}) - -# Register all collectors -handler = TaskHandler( - configuration=config, - event_listeners=[prometheus, datadog, sla_monitor] -) -``` +For the current Prometheus metrics catalog and canonical mode selection, see +[`../../METRICS.md`](../../METRICS.md). 
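+
+A quick way to confirm which implementation a process will emit, mirroring the
+factory tests in this patch series. The directory value is illustrative, and
+the environment variable must be set before the collector is created:
+
+```python
+import os
+
+from conductor.client.configuration.settings.metrics_settings import MetricsSettings
+from conductor.client.telemetry.metrics_factory import create_metrics_collector
+
+os.environ["WORKER_CANONICAL_METRICS"] = "true"  # opt in before creating the collector
+
+collector = create_metrics_collector(MetricsSettings(directory="/tmp/conductor-metrics"))
+print(collector.collector_name())  # "canonical"; "legacy" when the variable is unset
+```
+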
-### Example 3: Custom Event Listener +### Example 2: Custom Event Listener ```python from conductor.client.events.listeners import TaskRunnerEventsListener @@ -1240,7 +984,7 @@ handler = TaskHandler( ) ``` -### Example 4: Selective Listening (Lambda) +### Example 3: Selective Listening (Lambda) ```python from conductor.client.events.event_dispatcher import EventDispatcher @@ -1259,7 +1003,7 @@ dispatcher.register_sync( ) ``` -### Example 5: Cost Tracking +### Example 4: Cost Tracking ```python from decimal import Decimal @@ -1295,27 +1039,6 @@ handler = TaskHandler( ) ``` -### Example 6: Backward Compatibility - -```python -from conductor.client.configuration.settings.metrics_settings import MetricsSettings -from conductor.client.telemetry.metrics_collector import MetricsCollector -from conductor.client.telemetry.metrics_collector_adapter import MetricsCollectorAdapter - -# OLD API (still works) -metrics_settings = MetricsSettings(directory="/tmp/metrics") -old_collector = MetricsCollector(metrics_settings) - -# Wrap old collector with adapter -adapter = MetricsCollectorAdapter(old_collector) - -# Use with new event system -handler = TaskHandler( - configuration=config, - event_listeners=[adapter] # OLD collector works with NEW system! -) -``` - --- ## Performance Considerations @@ -1482,16 +1205,13 @@ prometheus.start_http_server(port=9991, path='/metrics') ## Appendix A: API Comparison -### Old API (Current) +### Earlier Direct Metrics API -```python -# Direct coupling to metrics collector -if self.metrics_collector: - self.metrics_collector.increment_task_poll(task_type) - self.metrics_collector.record_task_poll_time(task_type, duration) -``` +The earlier design coupled task runner code directly to metric recording calls. +Current user-facing metrics setup is documented in +[`../../METRICS.md`](../../METRICS.md). -### New API (Proposed) +### Event-Driven API ```python # Event-driven, decoupled @@ -1520,11 +1240,8 @@ src/conductor/client/ │ └── task_client_events.py # Task client event types │ ├── telemetry/ -│ ├── metrics_collector.py # OLD (keep for compatibility) -│ ├── metrics_collector_adapter.py # Adapter for old → new -│ └── prometheus/ -│ ├── __init__.py -│ └── prometheus_metrics_collector.py # NEW reference implementation +│ ├── metrics_collector.py # Compatibility shim +│ └── metrics_collector_base.py # Shared collector infrastructure │ └── automator/ ├── task_handler_asyncio.py # Modified to publish events From a6e721d17c74bec348d32ecb30168088c0f1cd28 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 11:53:13 -0600 Subject: [PATCH 7/7] additional update of docs re: metrics --- METRICS.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/METRICS.md b/METRICS.md index 28266498d..24bc39ac3 100644 --- a/METRICS.md +++ b/METRICS.md @@ -234,7 +234,18 @@ Key changes: - Canonical `exception` labels are bounded to exception class names. Legacy error counters may include raw exception messages. -Common replacements: +Legacy metrics that change name or type in canonical mode: + +| Legacy metric | Canonical metric | Change | +|---|---|---| +| `task_poll_time` (gauge) | — | Removed; use the histogram instead. | +| `task_execute_time` (gauge) | — | Removed; use the histogram instead. | +| `task_result_size` (gauge) | `task_result_size_bytes` (histogram) | Renamed; gauge becomes histogram with buckets. | +| `workflow_input_size` (gauge) | `workflow_input_size_bytes` (histogram) | Renamed; gauge becomes histogram with buckets. 
| +| `http_api_client_request` (quantile gauge) | `http_api_client_request_seconds` (histogram) | Renamed with `_seconds` suffix; quantile gauge becomes histogram. | +| `external_payload_used_total{payload_type=…}` | `external_payload_used_total{payloadType=…}` | Label renamed from `payload_type` to `payloadType`. | + +Common PromQL replacements: | Legacy | Canonical | |---|---|