aws-durable-execution-sdk-python/src/aws_durable_execution_sdk_python/config.py at 619b09e1cb8e753ba12fc2de4e8dae8b7cf334ac · aws/aws-durable-execution-sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
"""Configuration types."""

from __future__ import annotations

import random
from dataclasses import dataclass, field
from enum import Enum, StrEnum
from typing import TYPE_CHECKING, Generic, TypeVar

from aws_durable_execution_sdk_python.exceptions import ValidationError

P = TypeVar("P")  # Payload type
R = TypeVar("R")  # Result type
T = TypeVar("T")
U = TypeVar("U")

if TYPE_CHECKING:
    from collections.abc import Callable
    from concurrent.futures import Future

    from aws_durable_execution_sdk_python.lambda_service import OperationSubType
    from aws_durable_execution_sdk_python.retries import RetryDecision
    from aws_durable_execution_sdk_python.serdes import SerDes
    from aws_durable_execution_sdk_python.types import SummaryGenerator


Numeric = int | float  # deliberately leaving off complex


@dataclass(frozen=True)
class Duration:
    """Represents a duration stored as total seconds."""

    seconds: int = 0

    def __post_init__(self):
        if self.seconds < 0:
            msg = "Duration seconds must be positive"
            raise ValidationError(msg)

    def to_seconds(self) -> int:
        """Convert the duration to total seconds."""
        return self.seconds

    @classmethod
    def from_seconds(cls, value: float) -> Duration:
        """Create a Duration from total seconds."""
        return cls(seconds=int(value))

    @classmethod
    def from_minutes(cls, value: float) -> Duration:
        """Create a Duration from minutes."""
        return cls(seconds=int(value * 60))

    @classmethod
    def from_hours(cls, value: float) -> Duration:
        """Create a Duration from hours."""
        return cls(seconds=int(value * 3600))

    @classmethod
    def from_days(cls, value: float) -> Duration:
        """Create a Duration from days."""
        return cls(seconds=int(value * 86400))


@dataclass(frozen=True)
class BatchedInput(Generic[T, U]):
    batch_input: T
    items: list[U]


class TerminationMode(Enum):
    TERMINATE = "TERMINATE"
    CANCEL = "CANCEL"
    WAIT = "WAIT"
    ABANDON = "ABANDON"


@dataclass(frozen=True)
class CompletionConfig:
    """Configuration for determining when parallel/map operations complete.

    This class defines the success/failure criteria for operations that process
    multiple items or branches concurrently.

    Args:
        min_successful: Minimum number of successful completions required.
            If None, no minimum is enforced. Use this to implement "at least N
            must succeed" semantics.

        tolerated_failure_count: Maximum number of failures allowed before
            the operation is considered failed. If None, no limit on failure count.
            Use this to implement "fail fast after N failures" semantics.

        tolerated_failure_percentage: Maximum percentage of failures allowed
            (0.0 to 100.0). If None, no percentage limit is enforced.
            Use this to implement "fail if more than X% fail" semantics.

    Note:
        The operation completes when any of the completion criteria are met:
        - Enough successes (min_successful reached)
        - Too many failures (tolerated limits exceeded)
        - All items/branches completed

    Example:
        # Succeed if at least 3 succeed, fail if more than 2 fail
        config = CompletionConfig(
            min_successful=3,
            tolerated_failure_count=2
        )
    """

    min_successful: int | None = None
    tolerated_failure_count: int | None = None
    tolerated_failure_percentage: int | float | None = None

    # TODO: reevaluate this
    # @staticmethod
    # def first_completed():
    #     return CompletionConfig(
    #         min_successful=None, tolerated_failure_count=None, tolerated_failure_percentage=None
    #     )

    @staticmethod
    def first_successful():
        return CompletionConfig(
            min_successful=1,
            tolerated_failure_count=None,
            tolerated_failure_percentage=None,
        )

    @staticmethod
    def all_completed():
        return CompletionConfig(
            min_successful=None,
            tolerated_failure_count=None,
            tolerated_failure_percentage=None,
        )

    @staticmethod
    def all_successful():
        return CompletionConfig(
            min_successful=None,
            tolerated_failure_count=0,
            tolerated_failure_percentage=0,
        )


@dataclass(frozen=True)
class ParallelConfig:
    """Configuration options for parallel execution operations.

    This class configures how parallel operations are executed, including
    concurrency limits, completion criteria, and serialization behavior.

    Args:
        max_concurrency: Maximum number of parallel branches to execute concurrently.
            If None, no limit is imposed and all branches run concurrently.
            Use this to control resource usage and prevent overwhelming the system.

        completion_config: Defines when the parallel operation should complete.
            Controls success/failure criteria for the overall parallel operation.
            Default is CompletionConfig.all_successful() which requires all branches
            to succeed. Other options include first_successful() and all_completed().

        serdes: Custom serialization/deserialization configuration for BatchResult.
            Applied at the handler level to serialize the entire BatchResult object.
            If None, uses the default JSON serializer for BatchResult.

            Backward Compatibility: If only 'serdes' is provided (no item_serdes),
            it will be used for both individual functions AND BatchResult serialization
            to maintain existing behavior.

        item_serdes: Custom serialization/deserialization configuration for individual functions.
            Applied to each function's result as tasks complete in child contexts.
            If None, uses the default JSON serializer for individual function results.

            When both 'serdes' and 'item_serdes' are provided:
            - item_serdes: Used for individual function results in child contexts
            - serdes: Used for the entire BatchResult at handler level

        summary_generator: Function to generate compact summaries for large results (>256KB).
            When the serialized result exceeds CHECKPOINT_SIZE_LIMIT, this generator
            creates a JSON summary instead of checkpointing the full result. The operation
            is marked with ReplayChildren=true to reconstruct the full result during replay.

            Used internally by map/parallel operations to handle large BatchResult payloads.
            Signature: (result: T) -> str

    Example:
        # Run at most 3 branches concurrently, succeed if any one succeeds
        config = ParallelConfig(
            max_concurrency=3,
            completion_config=CompletionConfig.first_successful()
        )
    """

    max_concurrency: int | None = None
    completion_config: CompletionConfig = field(
        default_factory=CompletionConfig.all_successful
    )
    serdes: SerDes | None = None
    item_serdes: SerDes | None = None
    summary_generator: SummaryGenerator | None = None


class StepSemantics(Enum):
    AT_MOST_ONCE_PER_RETRY = "AT_MOST_ONCE_PER_RETRY"
    AT_LEAST_ONCE_PER_RETRY = "AT_LEAST_ONCE_PER_RETRY"


@dataclass(frozen=True)
class StepConfig:
    """Configuration for a step."""

    retry_strategy: Callable[[Exception, int], RetryDecision] | None = None
    step_semantics: StepSemantics = StepSemantics.AT_LEAST_ONCE_PER_RETRY
    serdes: SerDes | None = None


class CheckpointMode(Enum):
    NO_CHECKPOINT = ("NO_CHECKPOINT",)
    CHECKPOINT_AT_FINISH = ("CHECKPOINT_AT_FINISH",)
    CHECKPOINT_AT_START_AND_FINISH = "CHECKPOINT_AT_START_AND_FINISH"


@dataclass(frozen=True)
class ChildConfig(Generic[T]):
    """Configuration options for child context operations.

    This class configures how child contexts are executed and checkpointed,
    matching the TypeScript ChildConfig interface behavior.

    Args:
        serdes: Custom serialization/deserialization configuration for BatchResult.
            Applied at the handler level to serialize the entire BatchResult object.
            If None, uses the default JSON serializer for BatchResult.

            Backward Compatibility: If only 'serdes' is provided (no item_serdes),
            it will be used for both individual items AND BatchResult serialization
            to maintain existing behavior.

        item_serdes: Custom serialization/deserialization configuration for individual items.
            Applied to each item's result as tasks complete in child contexts.
            If None, uses the default JSON serializer for individual items.

            When both 'serdes' and 'item_serdes' are provided:
            - item_serdes: Used for individual item results in child contexts
            - serdes: Used for the entire BatchResult at handler level

        sub_type: Operation subtype identifier used for tracking and debugging.
            Examples: OperationSubType.MAP_ITERATION, OperationSubType.PARALLEL_BRANCH.
            Used internally by the execution engine for operation classification.

        summary_generator: Function to generate compact summaries for large results (>256KB).
            When the serialized result exceeds CHECKPOINT_SIZE_LIMIT, this generator
            creates a JSON summary instead of checkpointing the full result. The operation
            is marked with ReplayChildren=true to reconstruct the full result during replay.

            Used internally by map/parallel operations to handle large BatchResult payloads.
            Signature: (result: T) -> str
    Note:
        checkpoint_mode field is commented out as it's not currently implemented.
        When implemented, it will control when checkpoints are created:
        - CHECKPOINT_AT_START_AND_FINISH: Checkpoint at both start and completion (default)
        - CHECKPOINT_AT_FINISH: Only checkpoint when operation completes
        - NO_CHECKPOINT: No automatic checkpointing

    See TypeScript reference: aws-durable-execution-sdk-js/src/types/index.ts
    """

    # checkpoint_mode: CheckpointMode = CheckpointMode.CHECKPOINT_AT_START_AND_FINISH
    serdes: SerDes | None = None
    item_serdes: SerDes | None = None
    sub_type: OperationSubType | None = None
    summary_generator: SummaryGenerator | None = None


class ItemsPerBatchUnit(Enum):
    COUNT = ("COUNT",)
    BYTES = "BYTES"


@dataclass(frozen=True)
class ItemBatcher(Generic[T]):
    """Configuration for batching items in map operations.

    This class defines how individual items should be grouped together into batches
    for more efficient processing in map operations.

    Args:
        max_items_per_batch: Maximum number of items to include in a single batch.
            If 0 (default), no item count limit is applied. Use this to control
            batch size when processing many small items.

        max_item_bytes_per_batch: Maximum total size in bytes for items in a batch.
            If 0 (default), no size limit is applied. Use this to control memory
            usage when processing large items or when items vary significantly in size.

        batch_input: Additional data to include with each batch.
            This data is passed to the processing function along with the batched items.
            Useful for providing context or configuration that applies to all items
            in the batch.

    Example:
        # Batch up to 100 items or 1MB, whichever comes first
        batcher = ItemBatcher(
            max_items_per_batch=100,
            max_item_bytes_per_batch=1024*1024,
            batch_input={"processing_mode": "fast"}
        )
    """

    max_items_per_batch: int = 0
    max_item_bytes_per_batch: int | float = 0
    batch_input: T | None = None


@dataclass(frozen=True)
class MapConfig:
    """Configuration options for map operations over collections.

    This class configures how map operations process collections of items,
    including concurrency, batching, completion criteria, and serialization.

    Args:
        max_concurrency: Maximum number of items to process concurrently.
            If None, no limit is imposed and all items are processed concurrently.
            Use this to control resource usage when processing large collections.

        item_batcher: Configuration for batching multiple items together for processing.
            Allows grouping items by count or size to optimize processing efficiency.
            Default is no batching (each item processed individually).

        completion_config: Defines when the map operation should complete.
            Controls success/failure criteria for the overall map operation.
            Default allows any number of failures. Use CompletionConfig.all_successful()
            to require all items to succeed.

        serdes: Custom serialization/deserialization configuration for BatchResult.
            Applied at the handler level to serialize the entire BatchResult object.
            If None, uses the default JSON serializer for BatchResult.

            Backward Compatibility: If only 'serdes' is provided (no item_serdes),
            it will be used for both individual items AND BatchResult serialization
            to maintain existing behavior.

        item_serdes: Custom serialization/deserialization configuration for individual items.
            Applied to each item's result as tasks complete in child contexts.
            If None, uses the default JSON serializer for individual items.

            When both 'serdes' and 'item_serdes' are provided:
            - item_serdes: Used for individual item results in child contexts
            - serdes: Used for the entire BatchResult at handler level

        summary_generator: Function to generate compact summaries for large results (>256KB).
            When the serialized result exceeds CHECKPOINT_SIZE_LIMIT, this generator
            creates a JSON summary instead of checkpointing the full result. The operation
            is marked with ReplayChildren=true to reconstruct the full result during replay.

            Used internally by map/parallel operations to handle large BatchResult payloads.
            Signature: (result: T) -> str

    Example:
        # Process 5 items at a time, batch by count, require all to succeed
        config = MapConfig(
            max_concurrency=5,
            item_batcher=ItemBatcher(max_items_per_batch=10),
            completion_config=CompletionConfig.all_successful()
        )
    """

    max_concurrency: int | None = None
    item_batcher: ItemBatcher = field(default_factory=ItemBatcher)
    completion_config: CompletionConfig = field(default_factory=CompletionConfig)
    serdes: SerDes | None = None
    item_serdes: SerDes | None = None
    summary_generator: SummaryGenerator | None = None


@dataclass(frozen=True)
class InvokeConfig(Generic[P, R]):
    """
    Configuration for invoke operations.

    This class configures how function invocations are executed, including
    serialization and tenant isolation.

    Args:
        serdes_payload: Custom serialization/deserialization for the payload
            sent to the invoked function. Defaults to DEFAULT_JSON_SERDES when
            not set.

        serdes_result: Custom serialization/deserialization for the result
            returned from the invoked function. Defaults to DEFAULT_JSON_SERDES when
            not set.

        tenant_id: Optional tenant identifier for multi-tenant isolation.
            If provided, the invocation will be scoped to this tenant.
    """

    # retry_strategy: Callable[[Exception, int], RetryDecision] | None = None
    serdes_payload: SerDes[P] | None = None
    serdes_result: SerDes[R] | None = None
    tenant_id: str | None = None


@dataclass(frozen=True)
class CallbackConfig:
    """Configuration for callbacks."""

    timeout: Duration = field(default_factory=Duration)
    heartbeat_timeout: Duration = field(default_factory=Duration)
    serdes: SerDes | None = None

    @property
    def timeout_seconds(self) -> int:
        """Get timeout in seconds."""
        return self.timeout.to_seconds()

    @property
    def heartbeat_timeout_seconds(self) -> int:
        """Get heartbeat timeout in seconds."""
        return self.heartbeat_timeout.to_seconds()


@dataclass(frozen=True)
class WaitForCallbackConfig(CallbackConfig):
    """Configuration for wait for callback."""

    retry_strategy: Callable[[Exception, int], RetryDecision] | None = None


class StepFuture(Generic[T]):
    """A future that will block on result() until the step returns."""

    def __init__(self, future: Future[T], name: str | None = None):
        self.name = name
        self.future = future

    def result(self, timeout_seconds: int | None = None) -> T:
        """Return the result of the Future."""
        return self.future.result(timeout=timeout_seconds)


# region Jitter


class JitterStrategy(StrEnum):
    """
    Jitter strategies are used to introduce noise when attempting to retry
    an invoke. We introduce noise to prevent a thundering-herd effect where
    a group of accesses (e.g. invokes) happen at once.

    Jitter is meant to be used to spread operations across time.

    Based on AWS Architecture Blog: https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/

    members:
        :NONE: No jitter; use the exact calculated delay
        :FULL: Full jitter; random delay between 0 and calculated delay
        :HALF: Equal jitter; random delay between 0.5x and 1.0x of the calculated delay
    """

    NONE = "NONE"
    FULL = "FULL"
    HALF = "HALF"

    def apply_jitter(self, delay: float) -> float:
        """Apply jitter to a delay value and return the final delay.

        Args:
            delay: The base delay value to apply jitter to

        Returns:
            The final delay after applying jitter strategy
        """
        match self:
            case JitterStrategy.NONE:
                return delay
            case JitterStrategy.HALF:
                # Equal jitter: delay/2 + random(0, delay/2)
                return delay / 2 + random.random() * (delay / 2)  # noqa: S311
            case _:  # default is FULL
                # Full jitter: random(0, delay)
                return random.random() * delay  # noqa: S311


# endregion Jitter