Skip to content

Commit b13864a

Browse files
benkeanna and claude committed
feat(gooddata-pipelines): support composite key references on parent datasets
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent efc6b2a commit b13864a

4 files changed

Lines changed: 249 additions & 14 deletions

File tree

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,49 @@ def _date_ref_from_field(
154154
],
155155
)
156156

157+
@staticmethod
def _build_parent_reference_sources(
    definition: CustomDatasetDefinition,
) -> list[CatalogDeclarativeReferenceSource]:
    """Build the reference sources that join a dataset to its parent.

    Resolution order:

    * A set, non-empty ``parent_dataset_references`` list wins; each entry
      becomes one source, in list order.
    * Otherwise the three legacy single-column fields are used, but only
      when all of them are set, giving a one-element list.
    * In every other case the result is an empty list.
    """

    def make_source(column, data_type, attribute_id):
        # Every source targets an attribute-typed identifier on the parent.
        return CatalogDeclarativeReferenceSource(
            column=column,
            data_type=data_type.value,
            target=CatalogGrainIdentifier(
                id=attribute_id,
                type=CustomFieldType.ATTRIBUTE.value,
            ),
        )

    composite_refs = definition.parent_dataset_references
    if composite_refs:
        return [
            make_source(ref.source_column, ref.data_type, ref.attribute_id)
            for ref in composite_refs
        ]

    # Guard clause: an incomplete legacy reference produces no sources.
    if (
        definition.dataset_reference_source_column is None
        or definition.dataset_reference_source_column_data_type is None
        or definition.parent_dataset_reference_attribute_id is None
    ):
        return []

    return [
        make_source(
            definition.dataset_reference_source_column,
            definition.dataset_reference_source_column_data_type,
            definition.parent_dataset_reference_attribute_id,
        )
    ]
199+
157200
@staticmethod
158201
def _get_sources(
159202
dataset: CustomDataset,
@@ -253,6 +296,13 @@ def datasets_to_ldm(
253296
# Get the data source info
254297
dataset_source_table_id, dataset_sql = self._get_sources(dataset)
255298

299+
# Build the parent reference source list. The composite-friendly
300+
# `parent_dataset_references` list takes precedence when set and
301+
# non-empty; otherwise fall back to the legacy single-column fields.
302+
parent_reference_sources = self._build_parent_reference_sources(
303+
dataset.definition
304+
)
305+
256306
# Construct the declarative dataset object and append it to the list.
257307
declarative_datasets.append(
258308
CatalogDeclarativeDataset(
@@ -265,16 +315,7 @@ def datasets_to_ldm(
265315
id=dataset.definition.parent_dataset_reference,
266316
),
267317
multivalue=True,
268-
sources=[
269-
CatalogDeclarativeReferenceSource(
270-
column=dataset.definition.dataset_reference_source_column,
271-
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
272-
target=CatalogGrainIdentifier(
273-
id=dataset.definition.parent_dataset_reference_attribute_id,
274-
type=CustomFieldType.ATTRIBUTE.value,
275-
),
276-
)
277-
],
318+
sources=parent_reference_sources,
278319
),
279320
]
280321
+ date_references,

packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,38 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition":
6161
return self
6262

6363

64+
class ParentDatasetReference(BaseModel):
    """A single column pairing in the join from a dataset to its parent.

    ``CustomDatasetDefinition.parent_dataset_references`` holds a list of
    these entries; several entries together describe a multi-column
    (composite) foreign key. Each entry ties one source column of the new
    dataset to one grain attribute of the parent dataset.
    """

    # All three fields are required; the explicit ``...`` marks that.
    attribute_id: str = Field(
        ...,
        description="Attribute ID on the parent dataset that this column joins to.",
    )
    source_column: str = Field(
        ...,
        description="Column name on this dataset used to join to the parent.",
    )
    data_type: ColumnDataType = Field(
        ...,
        description="Data type of the source column.",
    )
81+
82+
6483
class CustomDatasetDefinition(BaseModel):
65-
"""Input model for custom dataset definition."""
84+
"""Input model for custom dataset definition.
85+
86+
The reference to the parent dataset can be expressed in two ways:
87+
88+
* The legacy single-column form via ``parent_dataset_reference_attribute_id``,
89+
``dataset_reference_source_column`` and ``dataset_reference_source_column_data_type``.
90+
All three must be provided together.
91+
* The composite-friendly form via ``parent_dataset_references``: a list of
92+
``ParentDatasetReference`` entries, one per join column.
93+
94+
Exactly one of the two forms must be used; mixing both is rejected.
95+
"""
6696

6797
workspace_id: str
6898
dataset_id: str
@@ -71,9 +101,34 @@ class CustomDatasetDefinition(BaseModel):
71101
dataset_source_table: str | None
72102
dataset_source_sql: str | None
73103
parent_dataset_reference: str
74-
parent_dataset_reference_attribute_id: str
75-
dataset_reference_source_column: str
76-
dataset_reference_source_column_data_type: ColumnDataType
104+
parent_dataset_reference_attribute_id: str | None = Field(
105+
default=None,
106+
deprecated=(
107+
"Use `parent_dataset_references` for richer (composite-key) joins. "
108+
"This field will be removed in a future release."
109+
),
110+
)
111+
dataset_reference_source_column: str | None = Field(
112+
default=None,
113+
deprecated=(
114+
"Use `parent_dataset_references` for richer (composite-key) joins. "
115+
"This field will be removed in a future release."
116+
),
117+
)
118+
dataset_reference_source_column_data_type: ColumnDataType | None = Field(
119+
default=None,
120+
deprecated=(
121+
"Use `parent_dataset_references` for richer (composite-key) joins. "
122+
"This field will be removed in a future release."
123+
),
124+
)
125+
parent_dataset_references: list[ParentDatasetReference] | None = Field(
126+
default=None,
127+
description=(
128+
"Composite-key reference to the parent dataset. When provided and "
129+
"non-empty, supersedes the legacy single-column reference fields."
130+
),
131+
)
77132
workspace_data_filter_id: str
78133
workspace_data_filter_column_name: str
79134
dataset_description: str | None = Field(
@@ -98,6 +153,29 @@ def check_source(self) -> "CustomDatasetDefinition":
98153
)
99154
return self
100155

156+
@model_validator(mode="after")
def check_reference_form_exclusive(self) -> "CustomDatasetDefinition":
    """Require exactly one complete form of the parent reference.

    Two situations are rejected:

    * Mixing any legacy single-column field with a non-empty
      ``parent_dataset_references`` list. Without this check the list
      would quietly win and the legacy values would be ignored, which is
      easy to miss when debugging.
    * Providing neither form completely, i.e. no non-empty
      ``parent_dataset_references`` list and at least one of the three
      legacy fields missing. Such a definition would otherwise validate
      and only fail later with an empty list of reference sources.

    Returns:
        The validated model instance.

    Raises:
        ValueError: If both forms are mixed or neither form is complete.
    """
    legacy_values = (
        self.parent_dataset_reference_attribute_id,
        self.dataset_reference_source_column,
        self.dataset_reference_source_column_data_type,
    )
    has_new = bool(self.parent_dataset_references)
    has_legacy = any(value is not None for value in legacy_values)

    if has_new and has_legacy:
        raise ValueError(
            "Set either `parent_dataset_references` or the legacy single-column "
            "fields (`parent_dataset_reference_attribute_id`, "
            "`dataset_reference_source_column`, "
            "`dataset_reference_source_column_data_type`), not both."
        )

    # The legacy form only counts when all three fields are present.
    if not has_new and any(value is None for value in legacy_values):
        raise ValueError(
            "Provide either a non-empty `parent_dataset_references` list or all "
            "three legacy single-column fields "
            "(`parent_dataset_reference_attribute_id`, "
            "`dataset_reference_source_column`, "
            "`dataset_reference_source_column_data_type`) together."
        )

    return self
178+
101179

102180
class CustomDataset(BaseModel):
103181
"""Custom dataset with its definition and custom fields."""

packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,52 @@ def test_datasets_to_ldm(mock_custom_dataset):
129129
assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1"
130130
assert len(ldm.date_instances) == 1
131131
assert ldm.date_instances[0].id == "date1"
132+
133+
134+
def test_datasets_to_ldm_parent_dataset_references_composite():
    """A composite `parent_dataset_references` list yields one source per entry."""
    from gooddata_pipelines.ldm_extension.models.custom_data_object import (
        CustomDatasetDefinition,
        ParentDatasetReference,
    )

    # (parent attribute id, source column, column data type) per join column.
    join_columns = [
        ("parent_pk1", "src_col1", ColumnDataType.STRING),
        ("parent_pk2", "src_col2", ColumnDataType.INT),
    ]
    composite_definition = CustomDatasetDefinition(
        workspace_id="workspace1",
        dataset_id="ds_composite",
        dataset_name="Composite Dataset",
        dataset_source_table="table1",
        dataset_datasource_id="ds_source",
        dataset_source_sql=None,
        parent_dataset_reference="parent_ds",
        parent_dataset_references=[
            ParentDatasetReference(
                attribute_id=attribute_id,
                source_column=source_column,
                data_type=data_type,
            )
            for attribute_id, source_column, data_type in join_columns
        ],
        workspace_data_filter_id="wdf1",
        workspace_data_filter_column_name="col1",
    )
    dataset = CustomDataset(definition=composite_definition, custom_fields=[])

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm(
        {"ds_composite": dataset}
    )

    # The parent reference is the first reference of the only dataset.
    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 2
    assert [source.column for source in sources] == ["src_col1", "src_col2"]
170+
171+
172+
def test_datasets_to_ldm_legacy_reference_fallback(mock_dataset_definition):
    """With `parent_dataset_references` unset, the legacy fields supply the source."""
    mock_dataset_definition.parent_dataset_references = None
    dataset = CustomDataset(definition=mock_dataset_definition, custom_fields=[])

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm({"ds1": dataset})

    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 1
    assert sources[0].column == "ref_col"

packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
CustomDatasetDefinition,
99
CustomFieldDefinition,
1010
CustomFieldType,
11+
ParentDatasetReference,
1112
)
1213

1314

@@ -100,3 +101,69 @@ def test_custom_dataset_model():
100101
assert dataset.definition.dataset_id == "ds1"
101102
assert len(dataset.custom_fields) == 1
102103
assert dataset.custom_fields[0].custom_field_id == "cf1"
104+
105+
106+
def test_custom_dataset_definition_parent_dataset_references_optional():
    """The composite-reference field is optional; its default is None."""
    payload = make_valid_dataset_def()
    definition = CustomDatasetDefinition(**payload)
    assert definition.parent_dataset_references is None
110+
111+
112+
def test_custom_dataset_definition_parent_dataset_references_accepted():
    """A list of composite references is accepted through the new list field."""
    first_ref = ParentDatasetReference(
        attribute_id="parent_pk1",
        source_column="src_col1",
        data_type=ColumnDataType.STRING,
    )
    second_ref = ParentDatasetReference(
        attribute_id="parent_pk2",
        source_column="src_col2",
        data_type=ColumnDataType.INT,
    )

    definition = CustomDatasetDefinition(
        **make_valid_dataset_def(
            parent_dataset_reference_attribute_id=None,
            dataset_reference_source_column=None,
            dataset_reference_source_column_data_type=None,
            parent_dataset_references=[first_ref, second_ref],
        )
    )

    stored_refs = definition.parent_dataset_references
    assert stored_refs is not None
    assert len(stored_refs) == 2
    assert stored_refs[1].data_type == ColumnDataType.INT
136+
137+
138+
def test_custom_dataset_definition_mixed_reference_forms_raises():
    """Combining legacy fields with `parent_dataset_references` fails validation."""
    single_ref = ParentDatasetReference(
        attribute_id="parent_pk",
        source_column="src_col",
        data_type=ColumnDataType.STRING,
    )
    payload = make_valid_dataset_def(parent_dataset_references=[single_ref])

    with pytest.raises(ValidationError) as exc_info:
        CustomDatasetDefinition(**payload)

    assert "not both" in str(exc_info.value)
152+
153+
154+
def test_custom_dataset_definition_legacy_reference_fields_optional():
    """Legacy reference fields may all be None when composite references are given."""
    single_ref = ParentDatasetReference(
        attribute_id="parent_pk",
        source_column="src_col",
        data_type=ColumnDataType.STRING,
    )
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_id=None,
        dataset_reference_source_column=None,
        dataset_reference_source_column_data_type=None,
        parent_dataset_references=[single_ref],
    )

    definition = CustomDatasetDefinition(**payload)

    assert definition.dataset_reference_source_column is None
    assert definition.parent_dataset_references is not None

0 commit comments

Comments
 (0)