|
35 | 35 | from collections.abc import Iterable as abcIterable |
36 | 36 | from dataclasses import dataclass, field |
37 | 37 | from deprecated.sphinx import deprecated |
38 | | -from typing import Any, List, Set, Union, Generator, MutableMapping, Optional, Dict, Type |
| 38 | +from typing import Any, Mapping, List, Set, Union, Generator, MutableMapping, Optional, Dict, Type |
39 | 39 |
|
40 | 40 | import copy |
41 | 41 | import re |
@@ -192,14 +192,77 @@ def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset: |
192 | 192 | "associated with a string") |
193 | 193 | return LabeledSubset(label, set(subset), description) |
194 | 194 |
|
def to_primitives(self, sorter: Optional[Mapping[str, Any]] = None) -> Dict[str, Union[List[str], str]]:
    """Convert to a representation used in yaml serialization

    Parameters
    ----------
    sorter : `Mapping` [ `str`, `object` ], optional
        Mapping from task or subset label to a comparable object that
        should be used to sort those labels.  When `None`, the labels are
        sorted alphabetically so that the output does not depend on `set`
        iteration order.

    Returns
    -------
    primitives : `dict`
        Dictionary with a ``"subset"`` entry holding the ordered list of
        labels, plus a ``"description"`` entry when a description is set.

    Raises
    ------
    KeyError
        Raised if ``sorter`` is given but lacks an entry for one of the
        labels in this subset.
    """
    if sorter is None:
        contents = sorted(self.subset)
    else:
        # The label itself breaks ties between equal sort keys; without it
        # the relative order of tied labels would follow set iteration order.
        contents = sorted(self.subset, key=lambda label: (sorter[label], label))
    accumulate: Dict[str, Union[List[str], str]] = {"subset": contents}
    if self.description is not None:
        accumulate["description"] = self.description
    return accumulate
202 | 211 |
|
@staticmethod
def expand_nested(original_subsets: Mapping[str, LabeledSubset]) -> Dict[str, LabeledSubset]:
    """Recursively expand subsets that contain the labels of other subsets.

    Parameters
    ----------
    original_subsets : `Mapping` [ `str`, `LabeledSubset` ]
        Mapping from string label to a labeled subset definition; keys must
        match the `label` attribute of the corresponding value.

    Returns
    -------
    new_subsets : `Dict` [ `str`, `LabeledSubset` ]
        Mapping of new labeled subset definitions.  Guaranteed to contain
        new instances (even when an old instance did not contain any subset
        labels) with only task labels.

    Raises
    ------
    RuntimeError
        Raised if a reference cycle is detected.
    """
    done: Dict[str, LabeledSubset] = {}
    in_progress: Set[str] = set()

    def expand_one_subset(subset_label: str) -> Set[str]:
        """Expand the subset with the given label if it has not already
        been done, returning its recursively-expanded (i.e. task label
        only) contents.

        This updates `done` in-place, and uses `in_progress` to detect
        cycles.
        """
        if (already_expanded := done.get(subset_label)) is not None:
            return already_expanded.subset
        if subset_label in in_progress:
            raise RuntimeError(f"Cycle detected in subset definitions involving {subset_label}.")
        in_progress.add(subset_label)
        original = original_subsets[subset_label]
        # Any member that is itself a key of the input mapping is a nested
        # subset reference; everything else is treated as a task label.
        nested_labels = {label for label in original.subset if label in original_subsets}
        new = LabeledSubset(
            label=subset_label,
            subset=set(original.subset) - nested_labels,
            description=original.description,
        )
        for nested_subset_label in nested_labels:
            # Recurse on the *label*: `done` and `in_progress` are keyed by
            # label strings, so passing the `LabeledSubset` object itself
            # would break both lookups.
            new.subset.update(expand_one_subset(nested_subset_label))
        in_progress.remove(subset_label)
        done[subset_label] = new
        return new.subset

    for k in original_subsets:
        expand_one_subset(k)
    return done
| 265 | + |
203 | 266 |
|
204 | 267 | @dataclass |
205 | 268 | class ParametersIR: |
@@ -918,31 +981,99 @@ def to_file(self, filename: str): |
918 | 981 | """ |
919 | 982 | self.write_to_uri(filename) |
920 | 983 |
|
def write_to_uri(
    self,
    uri: Union[ButlerURI, str],
    *,
    expanded_tasks: Optional[Mapping[str, TaskIR]] = None,
):
    """Serialize this `PipelineIR` object into a yaml formatted string and
    write the output to a file at the specified uri.

    Parameters
    ----------
    uri: `str` or `ButlerURI`
        Location of document to write a `PipelineIR` object.
    expanded_tasks : `Mapping` [ `str`, `TaskIR` ], optional
        Mapping containing replacement `TaskIR` objects that capture the
        fully-expanded configuration rather than a set of overrides, in
        deterministic order.  It is forwarded unchanged to `to_primitives`;
        when it is not `None`, the ``instrument`` and ``parameters``
        sections are not written and the other sections are sorted using
        the order of the given tasks.
    """
    destination = ButlerURI(uri)
    with destination.open("w") as stream:
        # Build the primitives only after the target is open, then dump them
        # with key sorting disabled so the dict's insertion order is kept.
        primitives = self.to_primitives(expanded_tasks=expanded_tasks)
        yaml.dump(primitives, stream, sort_keys=False)
932 | 1007 |
|
def to_primitives(self, expanded_tasks: Optional[Mapping[str, TaskIR]] = None) -> Dict[str, Any]:
    """Convert to a representation used in yaml serialization

    Parameters
    ----------
    expanded_tasks : `Mapping` [ `str`, `TaskIR` ], optional
        Mapping containing replacement `TaskIR` objects that capture the
        fully-expanded configuration rather than a set of overrides, in
        deterministic order.  When this is not `None`, the ``instrument``
        and ``parameters`` sections are not written and all other sections
        are sorted (using the order of the given tasks) to maximize the
        extent to which equivalent pipelines will be written identically.

    Returns
    -------
    primitives : `dict`
        Dictionary that maps directly to the serialized YAML form.

    Raises
    ------
    KeyError
        Raised if ``expanded_tasks`` is given and an expanded labeled subset
        contains a label that is not a key of ``expanded_tasks``.
    """
    accumulate: Dict[str, Any] = {"description": self.description}
    sorter: Optional[Dict[str, int]] = None
    if expanded_tasks is None:
        tasks = self.tasks
        labeled_subsets = self.labeled_subsets
        contracts = self.contracts
        # Instrument and parameters are only included in non-expanded form,
        # because they'll have already been applied to configs in expanded
        # form.  It might be nice to include the instrument to check that
        # the expanded pipeline is only used on data from that instrument,
        # but we can't risk having the instrument's config overrides
        # applied (again) on top of other configs that should supersede
        # them.
        if self.instrument is not None:
            accumulate['instrument'] = self.instrument
        if self.parameters:
            accumulate['parameters'] = self.parameters.to_primitives()
    else:
        tasks = expanded_tasks
        # Make a dict that maps task labels to their position in the
        # (presumably ordered) mapping we were given.
        sorter = {label: n for n, label in enumerate(expanded_tasks)}
        # Expand out subsets that reference other subsets, so that they
        # all only reference task labels directly.  Nothing to expand when
        # there are no subsets at all.
        expanded_subsets = (
            LabeledSubset.expand_nested(self.labeled_subsets) if self.labeled_subsets else {}
        )
        # An empty subset has no first task; give it a position past the
        # last task so it sorts after every non-empty subset instead of
        # making `min` raise on an empty sequence.
        past_the_end = len(sorter)
        # Sort the labeled subsets themselves by the position of their
        # first task in the overall pipeline, followed by their own label
        # to break ties.  Note that we sort the tasks within them only
        # later when we convert them to primitives, because at this point
        # they hold `set` objects, not lists.
        labeled_subsets = {
            subset.label: subset
            for subset in sorted(
                expanded_subsets.values(),
                key=lambda s: (min((sorter[t] for t in s.subset), default=past_the_end), s.label),
            )
        }
        # Sort contracts by the string expression itself, just for as much
        # determinism as we can manage; can't help it if someone rewrites
        # an expression to something equivalent that changes the sort
        # order.
        contracts = sorted(self.contracts, key=lambda c: c.contract)

    # Get primitives for sections common to expanded and non-expanded
    # forms.
    accumulate['tasks'] = {m: t.to_primitives() for m, t in tasks.items()}
    if contracts:
        accumulate['contracts'] = [c.to_primitives() for c in contracts]
    if labeled_subsets:
        accumulate['subsets'] = {k: v.to_primitives(sorter) for k, v in labeled_subsets.items()}
    return accumulate
947 | 1078 |
|
948 | 1079 | def __str__(self) -> str: |
|
0 commit comments