|
1 | 1 | # (C) 2026 GoodData Corporation |
2 | | -"""SDK wrapper for the AI Lake long-running-operation surface. |
| 2 | +"""SDK wrapper for the AI Lake API surface. |
3 | 3 |
|
4 | | -Today this exposes only the operations needed by aggregate-aware LDMs: |
| 4 | +Currently exposed operations: |
5 | 5 |
|
6 | | -- `analyze_statistics` triggers `ANALYZE TABLE` over a database instance so |
7 | | - CBO statistics catch up after a schema or data change. Required after |
8 | | - registering a pre-aggregation table whose dim attributes the platform will |
9 | | - later resolve via filter pushdown. |
| 6 | +- `list_object_storages` lists ObjectStorages registered for the organization. |
| 7 | + Use the returned names as ``source_storage_name`` in `create_pipe_table`. |
| 8 | +- `create_pipe_table` registers a pipe table in a database instance, with |
| 9 | + optional `CatalogColumnExpression` overrides for HLL / BITMAP columns. |
| 10 | +- `analyze_statistics` triggers ``ANALYZE TABLE`` over a database instance so |
| 11 | + CBO statistics catch up after a schema or data change. |
10 | 12 | - `get_operation` and `wait_for_operation` cover the polling side of the |
11 | 13 | long-running operation contract that `analyze_statistics` returns. |
12 | | -
|
13 | | -The full AI Lake API surface (database provisioning, pipe-table |
14 | | -registration, service commands) is not yet wrapped here; consumers that |
15 | | -need those should call `client.ai_lake_api.<method>` directly until a |
16 | | -ticket adds typed wrappers. |
17 | 14 | """ |
18 | 15 |
|
19 | 16 | from __future__ import annotations |
|
25 | 22 | from attrs import define |
26 | 23 | from gooddata_api_client.api.ai_lake_api import AILakeApi |
27 | 24 | from gooddata_api_client.model.analyze_statistics_request import AnalyzeStatisticsRequest |
| 25 | +from gooddata_api_client.model.create_pipe_table_request import CreatePipeTableRequest |
28 | 26 |
|
| 27 | +from gooddata_sdk.catalog.ai_lake.entity_model.column_expression import CatalogColumnExpression |
| 28 | +from gooddata_sdk.catalog.ai_lake.entity_model.object_storage import CatalogObjectStorageInfo |
29 | 29 | from gooddata_sdk.catalog.base import Base |
30 | 30 | from gooddata_sdk.client import GoodDataApiClient |
31 | 31 |
|
@@ -76,6 +76,105 @@ def __init__(self, api_client: GoodDataApiClient) -> None: |
76 | 76 | self._client = api_client |
77 | 77 | self._ai_lake_api: AILakeApi = AILakeApi(api_client._api_client) |
78 | 78 |
|
| 79 | + # ------------------------------------------------------------------ |
| 80 | + # ObjectStorage listing |
| 81 | + # ------------------------------------------------------------------ |
| 82 | + |
| 83 | + def list_object_storages(self) -> list[CatalogObjectStorageInfo]: |
| 84 | + """List ObjectStorages registered for the organization. |
| 85 | +
|
| 86 | + Provider credentials are stripped server-side — only safe descriptors |
| 87 | + (id, name, type, bucket, region, endpoint, …) are returned. |
| 88 | +
|
| 89 | + Use the returned :attr:`~CatalogObjectStorageInfo.name` as |
| 90 | + ``source_storage_name`` when calling :meth:`create_pipe_table`, or |
| 91 | + pass :attr:`~CatalogObjectStorageInfo.storage_id` to the |
| 92 | + ``ProvisionDatabase`` ``storageIds`` list. |
| 93 | +
|
| 94 | + Returns: |
| 95 | + List of :class:`CatalogObjectStorageInfo`, ordered by name. |
| 96 | + """ |
| 97 | + response = self._ai_lake_api.list_ai_lake_object_storages(_check_return_type=False) |
| 98 | + data = response.to_dict() if hasattr(response, "to_dict") else dict(response) |
| 99 | + return [CatalogObjectStorageInfo.from_dict(s) for s in data.get("storages", [])] |
| 100 | + |
| 101 | + # ------------------------------------------------------------------ |
| 102 | + # Pipe-table management |
| 103 | + # ------------------------------------------------------------------ |
| 104 | + |
| 105 | + def create_pipe_table( |
| 106 | + self, |
| 107 | + instance_id: str, |
| 108 | + table_name: str, |
| 109 | + source_storage_name: str, |
| 110 | + path_prefix: str, |
| 111 | + *, |
| 112 | + column_expressions: dict[str, CatalogColumnExpression] | None = None, |
| 113 | + column_overrides: dict[str, str] | None = None, |
| 114 | + aggregation_overrides: dict[str, str] | None = None, |
| 115 | + max_varchar_length: int | None = None, |
| 116 | + polling_interval_seconds: int | None = None, |
| 117 | + table_properties: dict[str, str] | None = None, |
| 118 | + ) -> None: |
| 119 | + """Register a new pipe table in an AI Lake database instance. |
| 120 | +
|
| 121 | + Args: |
| 122 | + instance_id: Database instance name (preferred) or UUID. |
| 123 | + table_name: OLAP table name. Must match ``^[a-z][a-z0-9_-]{0,62}$``. |
| 124 | + source_storage_name: Name of a registered ObjectStorage (use |
| 125 | + :meth:`list_object_storages` to discover available names). |
| 126 | + path_prefix: Path prefix to the parquet files in the storage |
| 127 | + (e.g. ``'my-dataset/year=2024/'``). |
| 128 | + column_expressions: Per-target-column projection overrides. Each |
| 129 | + key is the target column name; the value is a |
| 130 | + :class:`CatalogColumnExpression` that emits |
| 131 | + ``<function>(<column>) AS <key>`` in the generated |
| 132 | + ``CREATE PIPE … AS INSERT`` SELECT list. Required for |
| 133 | + AGGREGATE-KEY tables that include native HLL or BITMAP columns. |
| 134 | + column_overrides: Override inferred column types, e.g. |
| 135 | + ``{"year": "INT", "event_date": "DATE"}``. |
| 136 | + aggregation_overrides: Maps non-key column names to their StarRocks |
| 137 | + aggregation function (``SUM``, ``MIN``, ``MAX``, ``REPLACE``, |
| 138 | + ``HLL_UNION``, ``BITMAP_UNION``, …). Required for every |
| 139 | + non-key column when ``key_config`` type is ``'aggregate'``. |
| 140 | + max_varchar_length: Cap VARCHAR(N) columns to this length; 0 means |
| 141 | + no cap. |
| 142 | + polling_interval_seconds: How often (in seconds) the pipe polls for |
| 143 | + new files; 0 or ``None`` uses the server default. |
| 144 | + table_properties: ``CREATE TABLE PROPERTIES`` key-value pairs. |
| 145 | + Defaults to ``{"replication_num": "1"}`` server-side. |
| 146 | + """ |
| 147 | + kwargs: dict[str, Any] = {} |
| 148 | + if column_expressions is not None: |
| 149 | + kwargs["column_expressions"] = {k: v.as_api_model() for k, v in column_expressions.items()} |
| 150 | + if column_overrides is not None: |
| 151 | + kwargs["column_overrides"] = column_overrides |
| 152 | + if aggregation_overrides is not None: |
| 153 | + kwargs["aggregation_overrides"] = aggregation_overrides |
| 154 | + if max_varchar_length is not None: |
| 155 | + kwargs["max_varchar_length"] = max_varchar_length |
| 156 | + if polling_interval_seconds is not None: |
| 157 | + kwargs["polling_interval_seconds"] = polling_interval_seconds |
| 158 | + if table_properties is not None: |
| 159 | + kwargs["table_properties"] = table_properties |
| 160 | + |
| 161 | + request = CreatePipeTableRequest( |
| 162 | + table_name=table_name, |
| 163 | + source_storage_name=source_storage_name, |
| 164 | + path_prefix=path_prefix, |
| 165 | + _check_type=False, |
| 166 | + **kwargs, |
| 167 | + ) |
| 168 | + self._ai_lake_api.create_ai_lake_pipe_table( |
| 169 | + instance_id, |
| 170 | + request, |
| 171 | + _check_return_type=False, |
| 172 | + ) |
| 173 | + |
| 174 | + # ------------------------------------------------------------------ |
| 175 | + # Statistics |
| 176 | + # ------------------------------------------------------------------ |
| 177 | + |
79 | 178 | def analyze_statistics( |
80 | 179 | self, |
81 | 180 | instance_id: str, |
|
0 commit comments