Skip to content

Commit 0c299a0

Browse files
authored
Merge pull request #1608 from gooddata/aben/composite-refs
feat(gooddata-pipelines): support composite key references on parent datasets
2 parents 1c4dfe4 + ff2c33d commit 0c299a0

6 files changed

Lines changed: 268 additions & 19 deletions

File tree

docs/content/en/latest/pipelines/ldm_extension/_index.md

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ The custom dataset represents a new dataset appended to the child LDM. It is def
4040
| dataset_source_table | string | Name of the table in the Physical Data Model. |
4141
| dataset_source_sql | string \| None | SQL query defining the dataset. |
4242
| parent_dataset_reference | string \| None | ID of the parent dataset to which the custom one will be connected. |
43-
| parent_dataset_reference_attribute_id | string | ID of the attribute used for creating the relationship in the parent dataset. |
44-
| dataset_reference_source_column | string | Name of the column used for creating the relationship in the custom dataset. |
45-
| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) | Column data type. |
43+
| parent_dataset_reference_attribute_id | string \| None | **Deprecated** — use `parent_dataset_references` instead. |
44+
| dataset_reference_source_column | string \| None | **Deprecated** — use `parent_dataset_references` instead. |
45+
| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) \| None | **Deprecated** — use `parent_dataset_references` instead. |
46+
| parent_dataset_references | [ParentDatasetReference](#parentdatasetreference)[] \| None | List of references to the parent dataset, one entry per join column. Required unless the deprecated single-column fields are used. |
4647
| workspace_data_filter_id | string | ID of the workspace data filter to use. |
4748
| workspace_data_filter_column_name | string | Name of the column in custom dataset used for filtering. |
4849
| dataset_description | string \| None | Optional declarative description on the custom dataset. |
@@ -52,6 +53,18 @@ The custom dataset represents a new dataset appended to the child LDM. It is def
5253

5354
Either `dataset_source_table` or `dataset_source_sql` must be specified with a truthy value, but not both. An exception will be raised if both parameters are falsy or if both have truthy values.
5455

56+
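Provide either `parent_dataset_references` (with at least one entry) or all three deprecated single-column fields (`parent_dataset_reference_attribute_id`, `dataset_reference_source_column`, `dataset_reference_source_column_data_type`), but not both. An exception will be raised if neither form or both forms are supplied.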
57+
58+
#### ParentDatasetReference
59+
60+
Bundles one column of a (possibly composite) join to the parent dataset. Pass a list of these on `CustomDatasetDefinition.parent_dataset_references`, one entry per join column.
61+
62+
| name | type | description |
63+
|------|------|-------------|
64+
| attribute_id | string | ID of the attribute on the parent dataset that this column joins to. |
65+
| source_column | string | Name of the column on this dataset used to join to the parent. |
66+
| data_type | [ColumnDataType](#columndatatype) | Data type of the source column. |
67+
5568
### Custom Field Definitions
5669

5770
The custom fields define the individual fields in the custom datasets defined above. Each custom field needs to be specified with the following parameters:
@@ -162,6 +175,7 @@ from gooddata_pipelines import (
162175
CustomFieldDefinition,
163176
CustomFieldType,
164177
LdmExtensionManager,
178+
ParentDatasetReference,
165179
)
166180

167181
import logging
@@ -188,9 +202,13 @@ custom_dataset_definitions = [
188202
dataset_source_table="products_custom",
189203
dataset_source_sql=None,
190204
parent_dataset_reference="products",
191-
parent_dataset_reference_attribute_id="products.product_id",
192-
dataset_reference_source_column="product_id",
193-
dataset_reference_source_column_data_type=ColumnDataType.INT,
205+
parent_dataset_references=[
206+
ParentDatasetReference(
207+
attribute_id="products.product_id",
208+
source_column="product_id",
209+
data_type=ColumnDataType.INT,
210+
),
211+
],
194212
workspace_data_filter_id="wdf_id",
195213
workspace_data_filter_column_name="wdf_column",
196214
)

packages/gooddata-pipelines/src/gooddata_pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
CustomDatasetDefinition,
2727
CustomFieldDefinition,
2828
CustomFieldType,
29+
ParentDatasetReference,
2930
)
3031

3132
# -------- Provisioning --------
@@ -93,6 +94,7 @@
9394
"CustomFieldDefinition",
9495
"ColumnDataType",
9596
"CustomFieldType",
97+
"ParentDatasetReference",
9698
"provision",
9799
"WorkflowType",
98100
"__version__",

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,46 @@ def _date_ref_from_field(
154154
],
155155
)
156156

157+
@staticmethod
158+
def _build_parent_reference_sources(
159+
definition: CustomDatasetDefinition,
160+
) -> list[CatalogDeclarativeReferenceSource]:
161+
"""Build the reference sources from either the new list or the legacy triple."""
162+
if definition.parent_dataset_references:
163+
return [
164+
CatalogDeclarativeReferenceSource(
165+
column=ref.source_column,
166+
data_type=ref.data_type.value,
167+
target=CatalogGrainIdentifier(
168+
id=ref.attribute_id,
169+
type=CustomFieldType.ATTRIBUTE.value,
170+
),
171+
)
172+
for ref in definition.parent_dataset_references
173+
]
174+
175+
# `check_reference_form` on the model only guarantees that at least one
176+
# legacy field is set when `parent_dataset_references` is empty, so verify all three here.
177+
if (
178+
definition.parent_dataset_reference_attribute_id is None
179+
or definition.dataset_reference_source_column is None
180+
or definition.dataset_reference_source_column_data_type is None
181+
):
182+
raise ValueError(
183+
"Legacy reference fields must be set when "
184+
"`parent_dataset_references` is not provided."
185+
)
186+
return [
187+
CatalogDeclarativeReferenceSource(
188+
column=definition.dataset_reference_source_column,
189+
data_type=definition.dataset_reference_source_column_data_type.value,
190+
target=CatalogGrainIdentifier(
191+
id=definition.parent_dataset_reference_attribute_id,
192+
type=CustomFieldType.ATTRIBUTE.value,
193+
),
194+
)
195+
]
196+
157197
@staticmethod
158198
def _get_sources(
159199
dataset: CustomDataset,
@@ -253,6 +293,10 @@ def datasets_to_ldm(
253293
# Get the data source info
254294
dataset_source_table_id, dataset_sql = self._get_sources(dataset)
255295

296+
parent_reference_sources = self._build_parent_reference_sources(
297+
dataset.definition
298+
)
299+
256300
# Construct the declarative dataset object and append it to the list.
257301
declarative_datasets.append(
258302
CatalogDeclarativeDataset(
@@ -265,16 +309,7 @@ def datasets_to_ldm(
265309
id=dataset.definition.parent_dataset_reference,
266310
),
267311
multivalue=True,
268-
sources=[
269-
CatalogDeclarativeReferenceSource(
270-
column=dataset.definition.dataset_reference_source_column,
271-
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
272-
target=CatalogGrainIdentifier(
273-
id=dataset.definition.parent_dataset_reference_attribute_id,
274-
type=CustomFieldType.ATTRIBUTE.value,
275-
),
276-
)
277-
],
312+
sources=parent_reference_sources,
278313
),
279314
]
280315
+ date_references,

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,25 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition":
6161
return self
6262

6363

64+
class ParentDatasetReference(BaseModel):
65+
"""One column of a (possibly composite) join to the parent dataset.
66+
67+
A list of these on ``CustomDatasetDefinition.parent_dataset_references``
68+
supports multi-column foreign keys. Each entry binds a source column on the
69+
new dataset to a grain attribute on the parent.
70+
"""
71+
72+
attribute_id: str = Field(
73+
description="Attribute ID on the parent dataset that this column joins to.",
74+
)
75+
source_column: str = Field(
76+
description="Column name on this dataset used to join to the parent.",
77+
)
78+
data_type: ColumnDataType = Field(
79+
description="Data type of the source column.",
80+
)
81+
82+
6483
class CustomDatasetDefinition(BaseModel):
6584
"""Input model for custom dataset definition."""
6685

@@ -71,9 +90,31 @@ class CustomDatasetDefinition(BaseModel):
7190
dataset_source_table: str | None
7291
dataset_source_sql: str | None
7392
parent_dataset_reference: str
74-
parent_dataset_reference_attribute_id: str
75-
dataset_reference_source_column: str
76-
dataset_reference_source_column_data_type: ColumnDataType
93+
parent_dataset_reference_attribute_id: str | None = Field(
94+
default=None,
95+
deprecated=(
96+
"Use `parent_dataset_references` instead. "
97+
"This field will be removed in a future release."
98+
),
99+
)
100+
dataset_reference_source_column: str | None = Field(
101+
default=None,
102+
deprecated=(
103+
"Use `parent_dataset_references` instead. "
104+
"This field will be removed in a future release."
105+
),
106+
)
107+
dataset_reference_source_column_data_type: ColumnDataType | None = Field(
108+
default=None,
109+
deprecated=(
110+
"Use `parent_dataset_references` instead. "
111+
"This field will be removed in a future release."
112+
),
113+
)
114+
parent_dataset_references: list[ParentDatasetReference] | None = Field(
115+
default=None,
116+
description="List of references to the parent dataset.",
117+
)
77118
workspace_data_filter_id: str
78119
workspace_data_filter_column_name: str
79120
dataset_description: str | None = Field(
@@ -98,6 +139,31 @@ def check_source(self) -> "CustomDatasetDefinition":
98139
)
99140
return self
100141

142+
@model_validator(mode="after")
143+
def check_reference_form(self) -> "CustomDatasetDefinition":
144+
"""Exactly one reference form must be set: either the new list or the legacy triple."""
145+
has_new = bool(self.parent_dataset_references)
146+
has_legacy = (
147+
self.parent_dataset_reference_attribute_id is not None
148+
or self.dataset_reference_source_column is not None
149+
or self.dataset_reference_source_column_data_type is not None
150+
)
151+
if has_new and has_legacy:
152+
raise ValueError(
153+
"Set either `parent_dataset_references` or the legacy single-column "
154+
"fields (`parent_dataset_reference_attribute_id`, "
155+
"`dataset_reference_source_column`, "
156+
"`dataset_reference_source_column_data_type`), not both."
157+
)
158+
if not has_new and not has_legacy:
159+
raise ValueError(
160+
"Provide either `parent_dataset_references` or the legacy single-column "
161+
"fields (`parent_dataset_reference_attribute_id`, "
162+
"`dataset_reference_source_column`, "
163+
"`dataset_reference_source_column_data_type`)."
164+
)
165+
return self
166+
101167

102168
class CustomDataset(BaseModel):
103169
"""Custom dataset with its definition and custom fields."""

packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,52 @@ def test_datasets_to_ldm(mock_custom_dataset):
129129
assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1"
130130
assert len(ldm.date_instances) == 1
131131
assert ldm.date_instances[0].id == "date1"
132+
133+
134+
def test_datasets_to_ldm_parent_dataset_references_composite():
135+
"""Multi-column references via `parent_dataset_references` produce N sources."""
136+
from gooddata_pipelines.ldm_extension.models.custom_data_object import (
137+
CustomDatasetDefinition,
138+
ParentDatasetReference,
139+
)
140+
141+
definition = CustomDatasetDefinition(
142+
workspace_id="workspace1",
143+
dataset_id="ds_composite",
144+
dataset_name="Composite Dataset",
145+
dataset_source_table="table1",
146+
dataset_datasource_id="ds_source",
147+
dataset_source_sql=None,
148+
parent_dataset_reference="parent_ds",
149+
parent_dataset_references=[
150+
ParentDatasetReference(
151+
attribute_id="parent_pk1",
152+
source_column="src_col1",
153+
data_type=ColumnDataType.STRING,
154+
),
155+
ParentDatasetReference(
156+
attribute_id="parent_pk2",
157+
source_column="src_col2",
158+
data_type=ColumnDataType.INT,
159+
),
160+
],
161+
workspace_data_filter_id="wdf1",
162+
workspace_data_filter_column_name="col1",
163+
)
164+
ds = CustomDataset(definition=definition, custom_fields=[])
165+
processor = LdmExtensionDataProcessor()
166+
model = processor.datasets_to_ldm({"ds_composite": ds})
167+
parent_ref = model.ldm.datasets[0].references[0]
168+
assert len(parent_ref.sources) == 2
169+
assert [s.column for s in parent_ref.sources] == ["src_col1", "src_col2"]
170+
171+
172+
def test_datasets_to_ldm_legacy_reference_fallback(mock_dataset_definition):
173+
"""When `parent_dataset_references` is not set, fall back to legacy fields."""
174+
mock_dataset_definition.parent_dataset_references = None
175+
ds = CustomDataset(definition=mock_dataset_definition, custom_fields=[])
176+
processor = LdmExtensionDataProcessor()
177+
model = processor.datasets_to_ldm({"ds1": ds})
178+
parent_ref = model.ldm.datasets[0].references[0]
179+
assert len(parent_ref.sources) == 1
180+
assert parent_ref.sources[0].column == "ref_col"

packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
CustomDatasetDefinition,
99
CustomFieldDefinition,
1010
CustomFieldType,
11+
ParentDatasetReference,
1112
)
1213

1314

@@ -100,3 +101,81 @@ def test_custom_dataset_model():
100101
assert dataset.definition.dataset_id == "ds1"
101102
assert len(dataset.custom_fields) == 1
102103
assert dataset.custom_fields[0].custom_field_id == "cf1"
104+
105+
106+
def test_custom_dataset_definition_parent_dataset_references_optional():
107+
"""The new composite-reference field is optional and defaults to None."""
108+
ds = CustomDatasetDefinition(**make_valid_dataset_def())
109+
assert ds.parent_dataset_references is None
110+
111+
112+
def test_custom_dataset_definition_parent_dataset_references_accepted():
113+
"""Composite references can be provided via the new list field."""
114+
refs = [
115+
ParentDatasetReference(
116+
attribute_id="parent_pk1",
117+
source_column="src_col1",
118+
data_type=ColumnDataType.STRING,
119+
),
120+
ParentDatasetReference(
121+
attribute_id="parent_pk2",
122+
source_column="src_col2",
123+
data_type=ColumnDataType.INT,
124+
),
125+
]
126+
data = make_valid_dataset_def(
127+
parent_dataset_reference_attribute_id=None,
128+
dataset_reference_source_column=None,
129+
dataset_reference_source_column_data_type=None,
130+
parent_dataset_references=refs,
131+
)
132+
ds = CustomDatasetDefinition(**data)
133+
assert ds.parent_dataset_references is not None
134+
assert len(ds.parent_dataset_references) == 2
135+
assert ds.parent_dataset_references[1].data_type == ColumnDataType.INT
136+
137+
138+
def test_custom_dataset_definition_no_reference_form_raises():
139+
"""Providing neither the legacy fields nor `parent_dataset_references` is rejected."""
140+
data = make_valid_dataset_def(
141+
parent_dataset_reference_attribute_id=None,
142+
dataset_reference_source_column=None,
143+
dataset_reference_source_column_data_type=None,
144+
)
145+
with pytest.raises(ValidationError) as exc:
146+
CustomDatasetDefinition(**data)
147+
assert "Provide either" in str(exc.value)
148+
149+
150+
def test_custom_dataset_definition_mixed_reference_forms_raises():
151+
"""Setting both legacy fields and `parent_dataset_references` is rejected."""
152+
data = make_valid_dataset_def(
153+
parent_dataset_references=[
154+
ParentDatasetReference(
155+
attribute_id="parent_pk",
156+
source_column="src_col",
157+
data_type=ColumnDataType.STRING,
158+
)
159+
],
160+
)
161+
with pytest.raises(ValidationError) as exc:
162+
CustomDatasetDefinition(**data)
163+
assert "not both" in str(exc.value)
164+
165+
166+
def test_custom_dataset_definition_legacy_reference_fields_optional():
167+
data = make_valid_dataset_def(
168+
parent_dataset_reference_attribute_id=None,
169+
dataset_reference_source_column=None,
170+
dataset_reference_source_column_data_type=None,
171+
parent_dataset_references=[
172+
ParentDatasetReference(
173+
attribute_id="parent_pk",
174+
source_column="src_col",
175+
data_type=ColumnDataType.STRING,
176+
)
177+
],
178+
)
179+
ds = CustomDatasetDefinition(**data)
180+
assert ds.dataset_reference_source_column is None
181+
assert ds.parent_dataset_references is not None

0 commit comments

Comments
 (0)