From c4adb748dc51fc945ac389164a4320c4f6c7d11c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 6 Feb 2026 15:54:58 +0100 Subject: [PATCH 01/15] WIP, many open quiestions. Have to start testing to see the real situations --- src/apify/_actor.py | 4 +++ src/apify/_configuration.py | 20 +++++++++-- .../_apify/_alias_resolving.py | 36 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 4fd2989e..ca934838 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -36,6 +36,7 @@ from apify.events import ApifyEventManager, EventManager, LocalEventManager from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient, SmartApifyStorageClient +from apify.storage_clients._apify._alias_resolving import AliasResolver from apify.storage_clients._file_system import ApifyFileSystemStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -203,6 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() + + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self._configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 856c5fd0..264eb145 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -5,9 +5,9 @@ from decimal import Decimal from logging import getLogger from pathlib import Path -from typing import Annotated, Any +from typing import Annotated, Any, Required, TypedDict -from pydantic import AliasChoices, BeforeValidator, Field, model_validator +from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, field_validator, model_validator from typing_extensions import Self, deprecated from crawlee import service_locator @@ -34,6 +34,13 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') +class ActorStorageIds(TypedDict): + """Storage IDs for different storage types used by an Actor.""" + keyValueStores: dict[str, str] + datasets: dict[str, str] + requestQueues: dict[str, str] + + @docs_group('Configuration') class Configuration(CrawleeConfiguration): """A class for specifying the configuration of an Actor. @@ -446,6 +453,15 @@ class Configuration(CrawleeConfiguration): BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), ] = None + actor_storage_ids: Annotated[ + ActorStorageIds | None, + Field( + alias='apify_actor_storage_ids', + description='Storage IDs for the actor', + ), + BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), + ] = None + @model_validator(mode='after') def disable_browser_sandbox_on_platform(self) -> Self: """Disable the browser sandbox mode when running on the Apify platform. diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index e357333f..54cedb90 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -262,3 +262,39 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore raise ValueError("'Configuration.default_key_value_store_id' must be set.") return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) + + + @classmethod + async def register_aliases(cls, configuration: Configuration) -> None: + """Load alias mapping from dictionary to the default kvs.""" + def convert_name(name: str): + """Convert from mapping name to storage type name used in the alias mapping.""" + return {"datasets": "Dataset", "keyValueStores": "KeyValueStore", "requestQueues": "RequestQueue"}[name] + + configuration_mapping = {} + + if configuration.default_dataset_id != configuration.actor_storage_ids["datasets"].get("default", configuration.default_dataset_id): + raise RuntimeError( + f"Conflicting default dataset ids: {configuration.default_dataset_id=}," + f" {configuration.actor_storage_ids['datasets'].get('default')=}") + + for config_storage_type, mapping in configuration.actor_storage_ids.items(): + for storage_alias, storage_id in mapping.items(): + if storage_alias == "default": + # This is how the default storage is stored in the default kvs + storage_alias="__default__" + + configuration_mapping[AliasResolver( + storage_type=convert_name(config_storage_type), + alias=storage_alias, + configuration=configuration, + )._storage_key] = storage_id + + # Aliased storage can be also default storage!!! + # Should we store such second alias to the default storage or ignore it in such case? Probably + + # What if existing default dataset already has conflicting keys? + # Just override it, that will teach it to have conflicting values! + client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = (await client.get_record(cls._ALIAS_MAPPING_KEY) or {"value":{}}).get("value") + await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) From 19113e709e1d3f4c576aca6a1992d23006a5830c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 13 Feb 2026 16:08:15 +0100 Subject: [PATCH 02/15] Add debug --- src/apify/_actor.py | 2 ++ src/apify/_configuration.py | 4 ++-- src/apify/storage_clients/_apify/_alias_resolving.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index ca934838..7f4172dc 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os import sys import warnings from contextlib import suppress @@ -206,6 +207,7 @@ async def __aenter__(self) -> Self: await self.open_key_value_store() # Load non-default aliased storages from configuration + self.log.warning('\n'.join(f'{k}={v}' for k, v in os.environ.items())) await AliasResolver.register_aliases(configuration=self._configuration) return self diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 264eb145..2469310e 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -453,10 +453,10 @@ class Configuration(CrawleeConfiguration): BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), ] = None - actor_storage_ids: Annotated[ + actor_storages: Annotated[ ActorStorageIds | None, Field( - alias='apify_actor_storage_ids', + alias='actor_storages_json', description='Storage IDs for the actor', ), BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 54cedb90..06e58b40 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -273,12 +273,12 @@ def convert_name(name: str): configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storage_ids["datasets"].get("default", configuration.default_dataset_id): + if configuration.default_dataset_id != configuration.actor_storages["datasets"].get("default", configuration.default_dataset_id): raise RuntimeError( f"Conflicting default dataset ids: {configuration.default_dataset_id=}," - f" {configuration.actor_storage_ids['datasets'].get('default')=}") + f" {configuration.actor_storages['datasets'].get('default')=}") - for config_storage_type, mapping in configuration.actor_storage_ids.items(): + for config_storage_type, mapping in configuration.actor_storages.items(): for storage_alias, storage_id in mapping.items(): if storage_alias == "default": # This is how the default storage is stored in the default kvs From b12e27ee3418a7372da3c7bfa7bb4fc1fcb0899c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 16 Feb 2026 15:50:54 +0100 Subject: [PATCH 03/15] Fix type issues, prepare for tests. Merge first --- src/apify/_actor.py | 3 +- src/apify/_configuration.py | 30 ++++++++++--- .../_apify/_alias_resolving.py | 44 ++++++++++--------- tests/unit/actor/test_configuration.py | 2 + 4 files changed, 49 insertions(+), 30 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 7f4172dc..ce74c688 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -207,8 +207,7 @@ async def __aenter__(self) -> Self: await self.open_key_value_store() # Load non-default aliased storages from configuration - self.log.warning('\n'.join(f'{k}={v}' for k, v in os.environ.items())) - await AliasResolver.register_aliases(configuration=self._configuration) + #await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 2469310e..5b02115d 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -1,13 +1,14 @@ from __future__ import annotations +import dataclasses import json from datetime import datetime, timedelta from decimal import Decimal from logging import getLogger from pathlib import Path -from typing import Annotated, Any, Required, TypedDict +from typing import Annotated, Any -from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, field_validator, model_validator +from pydantic import AliasChoices, BeforeValidator, Field, model_validator from typing_extensions import Self, deprecated from crawlee import service_locator @@ -34,11 +35,26 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') -class ActorStorageIds(TypedDict): +@dataclasses.dataclass +class ActorStorages: """Storage IDs for different storage types used by an Actor.""" - keyValueStores: dict[str, str] + + key_value_stores: dict[str, str] datasets: dict[str, str] - requestQueues: dict[str, str] + request_queues: dict[str, str] + + +def _load_storage_keys(data: None | str | dict) -> ActorStorages | None: + """Load storage keys from environment.""" + if data is None: + return None + + storage_mapping = data if isinstance(data, dict) else json.loads(data) + return ActorStorages( + key_value_stores=storage_mapping.get('keyValueStores', {}), + datasets=storage_mapping.get('datasets', {}), + request_queues=storage_mapping.get('requestQueues', {}), + ) @docs_group('Configuration') @@ -454,12 +470,12 @@ class Configuration(CrawleeConfiguration): ] = None actor_storages: Annotated[ - ActorStorageIds | None, + ActorStorages | None, Field( alias='actor_storages_json', description='Storage IDs for the actor', ), - BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), + BeforeValidator(_load_storage_keys), ] = None @model_validator(mode='after') diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 06e58b40..546fbd14 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -263,32 +263,33 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) - @classmethod async def register_aliases(cls, configuration: Configuration) -> None: - """Load alias mapping from dictionary to the default kvs.""" - def convert_name(name: str): - """Convert from mapping name to storage type name used in the alias mapping.""" - return {"datasets": "Dataset", "keyValueStores": "KeyValueStore", "requestQueues": "RequestQueue"}[name] - + """Load any alias mapping from configuration to the default kvs.""" + if configuration.actor_storages is None: + return configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storages["datasets"].get("default", configuration.default_dataset_id): - raise RuntimeError( - f"Conflicting default dataset ids: {configuration.default_dataset_id=}," - f" {configuration.actor_storages['datasets'].get('default')=}") + if configuration.default_dataset_id != configuration.actor_storages.datasets.get( + 'default'): + logger.warning( + f'Conflicting default dataset ids: {configuration.default_dataset_id=},' + f" {configuration.actor_storages.datasets.get('default')=}" + ) - for config_storage_type, mapping in configuration.actor_storages.items(): + for mapping, storage_type in ( + (configuration.actor_storages.key_value_stores, 'KeyValueStore'), + (configuration.actor_storages.datasets, 'Dataset'), + (configuration.actor_storages.request_queues, 'RequestQueue'), + ): for storage_alias, storage_id in mapping.items(): - if storage_alias == "default": - # This is how the default storage is stored in the default kvs - storage_alias="__default__" - - configuration_mapping[AliasResolver( - storage_type=convert_name(config_storage_type), - alias=storage_alias, - configuration=configuration, - )._storage_key] = storage_id + configuration_mapping[ + cls( # noqa: SLF001# It is ok in own classmethod. + storage_type=storage_type, + alias='__default__' if storage_alias == 'default' else storage_alias, + configuration=configuration, + )._storage_key + ] = storage_id # Aliased storage can be also default storage!!! # Should we store such second alias to the default storage or ignore it in such case? Probably @@ -296,5 +297,6 @@ def convert_name(name: str): # What if existing default dataset already has conflicting keys? # Just override it, that will teach it to have conflicting values! client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = (await client.get_record(cls._ALIAS_MAPPING_KEY) or {"value":{}}).get("value") + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', + {}) await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index 7f01c48e..e36771fa 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -7,6 +8,7 @@ from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.crawlers import BasicCrawler from crawlee.errors import ServiceConflictError +from crawlee.storage_clients import MemoryStorageClient from apify import Actor from apify import Configuration as ApifyConfiguration From 3b36459186da3c214183270a595aea98e0d974d0 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Feb 2026 09:47:42 +0100 Subject: [PATCH 04/15] Adapt to new test structure --- src/apify/_actor.py | 7 +++-- .../_apify/_alias_resolving.py | 14 +++------- tests/e2e/test_schema_storages/__init__.py | 0 .../actor_source/actor.json | 24 +++++++++++++++++ .../test_schema_storages/actor_source/main.py | 7 +++++ .../test_schema_storages.py | 26 +++++++++++++++++++ tests/unit/actor/test_configuration.py | 2 -- 7 files changed, 64 insertions(+), 16 deletions(-) create mode 100644 tests/e2e/test_schema_storages/__init__.py create mode 100644 tests/e2e/test_schema_storages/actor_source/actor.json create mode 100644 tests/e2e/test_schema_storages/actor_source/main.py create mode 100644 tests/e2e/test_schema_storages/test_schema_storages.py diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 3319fe0b..5bb91149 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os import sys import warnings from contextlib import suppress @@ -205,9 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - - # Load non-default aliased storages from configuration - #await AliasResolver.register_aliases(configuration=self.configuration) + else: + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 546fbd14..aa6dcb47 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -265,13 +265,13 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore @classmethod async def register_aliases(cls, configuration: Configuration) -> None: - """Load any alias mapping from configuration to the default kvs.""" + """Load alias mapping from configuration to the default kvs.""" if configuration.actor_storages is None: return + configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storages.datasets.get( - 'default'): + if configuration.default_dataset_id != configuration.actor_storages.datasets.get('default'): logger.warning( f'Conflicting default dataset ids: {configuration.default_dataset_id=},' f" {configuration.actor_storages.datasets.get('default')=}" @@ -291,12 +291,6 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - # Aliased storage can be also default storage!!! - # Should we store such second alias to the default storage or ignore it in such case? Probably - - # What if existing default dataset already has conflicting keys? - # Just override it, that will teach it to have conflicting values! client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', - {}) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) diff --git a/tests/e2e/test_schema_storages/__init__.py b/tests/e2e/test_schema_storages/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e/test_schema_storages/actor_source/actor.json b/tests/e2e/test_schema_storages/actor_source/actor.json new file mode 100644 index 00000000..f83b04ec --- /dev/null +++ b/tests/e2e/test_schema_storages/actor_source/actor.json @@ -0,0 +1,24 @@ +{ + "actorSpecification": 1, + "version": "0.0", + "storages": { + "datasets": { + "default": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + }, + "custom": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + } + } + } +} diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py new file mode 100644 index 00000000..ebed9ba4 --- /dev/null +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -0,0 +1,7 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + assert Actor.configuration.actor_storages + assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages.datasets['custom'] diff --git a/tests/e2e/test_schema_storages/test_schema_storages.py b/tests/e2e/test_schema_storages/test_schema_storages.py new file mode 100644 index 00000000..8ad98b7a --- /dev/null +++ b/tests/e2e/test_schema_storages/test_schema_storages.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source' + + +def read_actor_source(filename: str) -> str: + return (_ACTOR_SOURCE_DIR / filename).read_text() + + +async def test_configuration_storages(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='schema_storages', + source_files={ + 'src/main.py': read_actor_source('main.py'), + '.actor/actor.json': read_actor_source('actor.json'), + }, + ) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index b1a201ac..ccd20849 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -1,4 +1,3 @@ -import json from pathlib import Path import pytest @@ -8,7 +7,6 @@ from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.crawlers import BasicCrawler from crawlee.errors import ServiceConflictError -from crawlee.storage_clients import MemoryStorageClient from apify import Actor from apify import Configuration as ApifyConfiguration From 72c2f353199b618cdcb3d304a3250302abdc2d11 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Feb 2026 16:57:36 +0100 Subject: [PATCH 05/15] Add uni tests, WIP TODO - how should it behave locally? --- src/apify/_actor.py | 6 ++-- src/apify/_configuration.py | 5 +-- .../_apify/_alias_resolving.py | 15 +++++++-- tests/unit/actor/test_configuration.py | 23 +++++++++++++ .../storage_clients/test_alias_resolver.py | 33 ++++++++++++++++++- 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 5bb91149..d23de91a 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -204,9 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - else: - # Load non-default aliased storages from configuration - await AliasResolver.register_aliases(configuration=self.configuration) + + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 5b02115d..00bc89df 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -44,11 +44,12 @@ class ActorStorages: request_queues: dict[str, str] -def _load_storage_keys(data: None | str | dict) -> ActorStorages | None: +def _load_storage_keys(data: None | str | dict | ActorStorages) -> ActorStorages | None: """Load storage keys from environment.""" if data is None: return None - + if isinstance(data, ActorStorages): + return data storage_mapping = data if isinstance(data, dict) else json.loads(data) return ActorStorages( key_value_stores=storage_mapping.get('keyValueStores', {}), diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index aa6dcb47..58da173b 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -291,6 +291,15 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) - await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) + if configuration.is_at_home: + # Bulk update the mapping in the default KVS with the configuration mapping. + client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) + # Update the existing mapping with the configuration mapping. + existing_mapping.update(configuration_mapping) + # Store the updated mapping back in the KVS and in memory. + await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) + cls._alias_map = existing_mapping + else: + # Update only in-memory mapping when not at home + cls._alias_map.update(configuration_mapping) diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index ccd20849..e59fa5b9 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -300,3 +300,26 @@ def test_actor_pricing_info_from_json_env_var(monkeypatch: pytest.MonkeyPatch) - config = ApifyConfiguration() assert config.actor_pricing_info is not None assert config.actor_pricing_info.pricing_model == 'PAY_PER_EVENT' + + +def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that actor_storages_json is parsed from JSON env var.""" + import json + + datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + + actor_storages_json = json.dumps( + { + 'datasets': datasets, + 'requestQueues': request_queues, + 'keyValueStores': key_value_stores, + } + ) + monkeypatch.setenv('ACTOR_STORAGES_JSON', actor_storages_json) + config = ApifyConfiguration() + assert config.actor_storages + assert config.actor_storages.datasets == datasets + assert config.actor_storages.request_queues == request_queues + assert config.actor_storages.key_value_stores == key_value_stores diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 56821ca1..3cfa0fa3 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,6 +1,10 @@ from __future__ import annotations -from apify._configuration import Configuration +from crawlee import service_locator + +from apify import Actor +from apify._configuration import Configuration, ActorStorages +from apify.storage_clients import SmartApifyStorageClient, ApifyStorageClient from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -76,3 +80,30 @@ async def test_get_alias_map_returns_in_memory_map() -> None: AliasResolver._alias_map = {} result = await AliasResolver._get_alias_map(config) assert result == {} + + +async def test_register_aliases() -> None: + """Test that _get_alias_map loads the map from KVS when at home. + + AliasResolver works locally only """ + + + datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + + config = Configuration(is_at_home=False, + token='test-token', + actor_storages= ActorStorages( + datasets = datasets, + request_queues = request_queues, + key_value_stores = key_value_stores + ), + ) + storage_client = ApifyStorageClient() + service_locator.set_storage_client( + SmartApifyStorageClient(local_storage_client=storage_client, cloud_storage_client=storage_client) + ) + async with Actor(configuration=config): + d = await Actor.open_dataset(alias='default') + assert d From b7604cba954bcbd7c2373928702672810bf3732c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Feb 2026 13:26:06 +0100 Subject: [PATCH 06/15] Finalize tests --- src/apify/_actor.py | 7 +-- .../_apify/_alias_resolving.py | 21 ++++---- tests/integration/test_storages.py | 53 +++++++++++++++++++ tests/unit/actor/test_configuration.py | 6 +-- .../storage_clients/test_alias_resolver.py | 33 +----------- 5 files changed, 70 insertions(+), 50 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index d23de91a..7e3da0ba 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -204,9 +204,10 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - - # Load non-default aliased storages from configuration - await AliasResolver.register_aliases(configuration=self.configuration) + else: + # Load pre-existing non-default aliased storages from configuration + # Supported only on the Apify platform, where those storages are pre-created by the platform. + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 58da173b..ba75c199 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -291,15 +291,12 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - if configuration.is_at_home: - # Bulk update the mapping in the default KVS with the configuration mapping. - client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) - # Update the existing mapping with the configuration mapping. - existing_mapping.update(configuration_mapping) - # Store the updated mapping back in the KVS and in memory. - await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) - cls._alias_map = existing_mapping - else: - # Update only in-memory mapping when not at home - cls._alias_map.update(configuration_mapping) + # Bulk update the mapping in the default KVS with the configuration mapping. + client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) + + # Update the existing mapping with the configuration mapping. + existing_mapping.update(configuration_mapping) + # Store the updated mapping back in the KVS and in memory. + await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) + cls._alias_map.update(existing_mapping) diff --git a/tests/integration/test_storages.py b/tests/integration/test_storages.py index 7ad807fe..05002a80 100644 --- a/tests/integration/test_storages.py +++ b/tests/integration/test_storages.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from typing import cast import pytest @@ -8,7 +9,9 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue from apify import Actor, Configuration +from apify._configuration import ActorStorages from apify.storage_clients import ApifyStorageClient, MemoryStorageClient, SmartApifyStorageClient +from apify.storage_clients._apify._alias_resolving import AliasResolver @pytest.mark.parametrize( @@ -125,3 +128,53 @@ async def test_actor_implicit_storage_init(apify_token: str) -> None: assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_storages_alias_resolving(apify_token: str) -> None: + """Test that `AliasResolver.register_aliases` correctly updates default KVS with Actor storages.""" + + # Actor storages + datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + + # Set up the configuration and storage client for the test + configuration = Configuration( + default_key_value_store_id='default_kvs_id', + token=apify_token, + actor_storages=ActorStorages( + datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores + ), + ) + storage_client = ApifyStorageClient() + service_locator.set_configuration(configuration) + service_locator.set_storage_client(storage_client) + + client_cache_key = cast('tuple', storage_client.get_storage_client_cache_key(configuration))[-1] + # Add some unrelated pre-existing alias mapping (it should be preserved after registering aliases) + pre_existing_mapping = {f'KeyValueStore,pre_existing_alias,{client_cache_key}': 'pre_existing_id'} + + default_kvs = await KeyValueStore.open(configuration=configuration, storage_client=storage_client) + await default_kvs.set_value(AliasResolver._ALIAS_MAPPING_KEY, pre_existing_mapping) + + # Construct the expected mapping + expected_mapping = {} + for storage_type, storage_map in ( + ('Dataset', datasets), + ('KeyValueStore', key_value_stores), + ('RequestQueue', request_queues), + ): + for storage_alias, storage_id in storage_map.items(): + expected_mapping[ + ','.join( + (storage_type, '__default__' if storage_alias == 'default' else storage_alias, client_cache_key) + ) + ] = storage_id + expected_mapping.update(pre_existing_mapping) + + try: + configuration.default_key_value_store_id = default_kvs.id + await AliasResolver.register_aliases(configuration=configuration) + assert await default_kvs.get_value(AliasResolver._ALIAS_MAPPING_KEY) == expected_mapping + finally: + await default_kvs.drop() diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index e59fa5b9..461a103a 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -306,9 +306,9 @@ def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: """Test that actor_storages_json is parsed from JSON env var.""" import json - datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} actor_storages_json = json.dumps( { diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 3cfa0fa3..56821ca1 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,10 +1,6 @@ from __future__ import annotations -from crawlee import service_locator - -from apify import Actor -from apify._configuration import Configuration, ActorStorages -from apify.storage_clients import SmartApifyStorageClient, ApifyStorageClient +from apify._configuration import Configuration from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -80,30 +76,3 @@ async def test_get_alias_map_returns_in_memory_map() -> None: AliasResolver._alias_map = {} result = await AliasResolver._get_alias_map(config) assert result == {} - - -async def test_register_aliases() -> None: - """Test that _get_alias_map loads the map from KVS when at home. - - AliasResolver works locally only """ - - - datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - - config = Configuration(is_at_home=False, - token='test-token', - actor_storages= ActorStorages( - datasets = datasets, - request_queues = request_queues, - key_value_stores = key_value_stores - ), - ) - storage_client = ApifyStorageClient() - service_locator.set_storage_client( - SmartApifyStorageClient(local_storage_client=storage_client, cloud_storage_client=storage_client) - ) - async with Actor(configuration=config): - d = await Actor.open_dataset(alias='default') - assert d From aa3b9ea91bd17e7abf3d2184e87a5d10abc63622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Thu, 19 Feb 2026 08:44:28 +0100 Subject: [PATCH 07/15] Apply suggestions from code review Co-authored-by: Vlada Dusek Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../_apify/_alias_resolving.py | 2 +- .../actor_source/actor.json | 22 +++++++++---------- tests/integration/test_storages.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index ba75c199..57a75c54 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -284,7 +284,7 @@ async def register_aliases(cls, configuration: Configuration) -> None: ): for storage_alias, storage_id in mapping.items(): configuration_mapping[ - cls( # noqa: SLF001# It is ok in own classmethod. + cls( # noqa: SLF001 # It is ok in own classmethod. storage_type=storage_type, alias='__default__' if storage_alias == 'default' else storage_alias, configuration=configuration, diff --git a/tests/e2e/test_schema_storages/actor_source/actor.json b/tests/e2e/test_schema_storages/actor_source/actor.json index f83b04ec..8501bd99 100644 --- a/tests/e2e/test_schema_storages/actor_source/actor.json +++ b/tests/e2e/test_schema_storages/actor_source/actor.json @@ -3,21 +3,21 @@ "version": "0.0", "storages": { "datasets": { - "default": { + "default": { "actorSpecification": 1, - "fields": { - "properties": { - "id": { "type": "string" } - } - } + "fields": { + "properties": { + "id": { "type": "string" } + } + } }, - "custom": { + "custom": { "actorSpecification": 1, "fields": { - "properties": { - "id": { "type": "string" } - } - } + "properties": { + "id": { "type": "string" } + } + } } } } diff --git a/tests/integration/test_storages.py b/tests/integration/test_storages.py index 05002a80..6eac1079 100644 --- a/tests/integration/test_storages.py +++ b/tests/integration/test_storages.py @@ -135,8 +135,8 @@ async def test_actor_storages_alias_resolving(apify_token: str) -> None: # Actor storages datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} - request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} - key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_request_queue_id', 'custom': 'custom_request_queue_id'} + key_value_stores = {'default': 'default_key_value_store_id', 'custom': 'custom_key_value_store_id'} # Set up the configuration and storage client for the test configuration = Configuration( From aa5624f401683555b5f1b444901c1cdda61396c7 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 09:10:38 +0100 Subject: [PATCH 08/15] Review comments --- .../_apify/_alias_resolving.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 57a75c54..5e3956f5 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -136,10 +136,11 @@ def __init__( alias: str, configuration: Configuration, ) -> None: - self._storage_type = storage_type self._alias = alias self._configuration = configuration - self._additional_cache_key = hash_api_base_url_and_token(configuration) + self._storage_key = self.get_storage_key( + storage_type=storage_type, alias=alias, additional_cache_key=hash_api_base_url_and_token(configuration) + ) async def __aenter__(self) -> AliasResolver: """Context manager to prevent race condition in alias creation.""" @@ -236,17 +237,6 @@ async def store_mapping(self, storage_id: str) -> None: except Exception as exc: logger.warning(f'Error storing alias mapping for {self._alias}: {exc}') - @property - def _storage_key(self) -> str: - """Get a unique storage key used for storing the alias in the mapping.""" - return self._ALIAS_STORAGE_KEY_SEPARATOR.join( - [ - self._storage_type, - self._alias, - self._additional_cache_key, - ] - ) - @staticmethod async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStoreClientAsync: """Get a client for the default key-value store.""" @@ -263,6 +253,18 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) + @classmethod + def get_storage_key( + cls, storage_type: Literal['Dataset', 'KeyValueStore', 'RequestQueue'], alias: str, additional_cache_key: str + ) -> str: + return cls._ALIAS_STORAGE_KEY_SEPARATOR.join( + [ + storage_type, + alias, + additional_cache_key, + ] + ) + @classmethod async def register_aliases(cls, configuration: Configuration) -> None: """Load alias mapping from configuration to the default kvs.""" @@ -276,6 +278,7 @@ async def register_aliases(cls, configuration: Configuration) -> None: f'Conflicting default dataset ids: {configuration.default_dataset_id=},' f" {configuration.actor_storages.datasets.get('default')=}" ) + additional_cache_key = hash_api_base_url_and_token(configuration) for mapping, storage_type in ( (configuration.actor_storages.key_value_stores, 'KeyValueStore'), @@ -284,11 +287,11 @@ async def register_aliases(cls, configuration: Configuration) -> None: ): for storage_alias, storage_id in mapping.items(): configuration_mapping[ - cls( # noqa: SLF001 # It is ok in own classmethod. - storage_type=storage_type, - alias='__default__' if storage_alias == 'default' else storage_alias, - configuration=configuration, - )._storage_key + cls.get_storage_key( + storage_type, + '__default__' if storage_alias == 'default' else storage_alias, + additional_cache_key, + ) ] = storage_id # Bulk update the mapping in the default KVS with the configuration mapping. From 17288ca06bf40056f63e480b91fcd70e79568e8b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 09:48:40 +0100 Subject: [PATCH 09/15] Remove unnecessary record handling --- .../_apify/_alias_resolving.py | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 5e3956f5..61cacd69 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -184,15 +184,7 @@ async def _get_alias_map(cls, configuration: Configuration) -> dict[str, str]: default_kvs_client = await cls._get_default_kvs_client(configuration) record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY) - - # get_record can return {key: ..., value: ..., content_type: ...} - if isinstance(record, dict): - if 'value' in record and isinstance(record['value'], dict): - cls._alias_map = record['value'] - else: - cls._alias_map = record - else: - cls._alias_map = dict[str, str]() + cls._alias_map = record.get('value', {}) if record else {} return cls._alias_map @@ -221,19 +213,11 @@ async def store_mapping(self, storage_id: str) -> None: try: record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY) - - # get_record can return {key: ..., value: ..., content_type: ...} - if isinstance(record, dict) and 'value' in record: - record = record['value'] - - # Update or create the record with the new alias mapping - if isinstance(record, dict): - record[self._storage_key] = storage_id - else: - record = {self._storage_key: storage_id} + value = record.get('value', {}) if record else {} + value[self._storage_key] = storage_id # Store the mapping back in the KVS. - await default_kvs_client.set_record(self._ALIAS_MAPPING_KEY, record) + await default_kvs_client.set_record(key=self._ALIAS_MAPPING_KEY, value=value) except Exception as exc: logger.warning(f'Error storing alias mapping for {self._alias}: {exc}') @@ -296,7 +280,8 @@ async def register_aliases(cls, configuration: Configuration) -> None: # Bulk update the mapping in the default KVS with the configuration mapping. client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) + record = await client.get_record(cls._ALIAS_MAPPING_KEY) + existing_mapping = record.get('value', {}) if record else {} # Update the existing mapping with the configuration mapping. existing_mapping.update(configuration_mapping) From a7e645fa5c3c1fab51f00dd123d17c36e89f9156 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 10:15:30 +0100 Subject: [PATCH 10/15] Review comments --- .../_apify/_alias_resolving.py | 78 ++++++++++--------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 61cacd69..039e8e1f 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -252,39 +252,45 @@ def get_storage_key( @classmethod async def register_aliases(cls, configuration: Configuration) -> None: """Load alias mapping from configuration to the default kvs.""" - if configuration.actor_storages is None: - return - - configuration_mapping = {} - - if configuration.default_dataset_id != configuration.actor_storages.datasets.get('default'): - logger.warning( - f'Conflicting default dataset ids: {configuration.default_dataset_id=},' - f" {configuration.actor_storages.datasets.get('default')=}" - ) - additional_cache_key = hash_api_base_url_and_token(configuration) - - for mapping, storage_type in ( - (configuration.actor_storages.key_value_stores, 'KeyValueStore'), - (configuration.actor_storages.datasets, 'Dataset'), - (configuration.actor_storages.request_queues, 'RequestQueue'), - ): - for storage_alias, storage_id in mapping.items(): - configuration_mapping[ - cls.get_storage_key( - storage_type, - '__default__' if storage_alias == 'default' else storage_alias, - additional_cache_key, - ) - ] = storage_id - - # Bulk update the mapping in the default KVS with the configuration mapping. - client = await cls._get_default_kvs_client(configuration=configuration) - record = await client.get_record(cls._ALIAS_MAPPING_KEY) - existing_mapping = record.get('value', {}) if record else {} - - # Update the existing mapping with the configuration mapping. - existing_mapping.update(configuration_mapping) - # Store the updated mapping back in the KVS and in memory. - await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) - cls._alias_map.update(existing_mapping) + async with await cls._get_alias_init_lock(): + # Skip if no mapping or just default storages + if configuration.actor_storages is None or set( + configuration.actor_storages.datasets.keys() + | configuration.actor_storages.key_value_stores.keys() + | configuration.actor_storages.request_queues.keys() + ) == {'default'}: + return + + configuration_mapping = {} + + if configuration.default_dataset_id != configuration.actor_storages.datasets.get('default'): + logger.warning( + f'Conflicting default dataset ids: {configuration.default_dataset_id=},' + f" {configuration.actor_storages.datasets.get('default')=}" + ) + additional_cache_key = hash_api_base_url_and_token(configuration) + + for mapping, storage_type in ( + (configuration.actor_storages.key_value_stores, 'KeyValueStore'), + (configuration.actor_storages.datasets, 'Dataset'), + (configuration.actor_storages.request_queues, 'RequestQueue'), + ): + for storage_alias, storage_id in mapping.items(): + configuration_mapping[ + cls.get_storage_key( + storage_type, + '__default__' if storage_alias == 'default' else storage_alias, + additional_cache_key, + ) + ] = storage_id + + # Bulk update the mapping in the default KVS with the configuration mapping. + client = await cls._get_default_kvs_client(configuration=configuration) + record = await client.get_record(cls._ALIAS_MAPPING_KEY) + existing_mapping = record.get('value', {}) if record else {} + + # Update the existing mapping with the configuration mapping. + existing_mapping.update(configuration_mapping) + # Store the updated mapping back in the KVS and in memory. + await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) + cls._alias_map.update(existing_mapping) From a996dcad782691c70af566718457667b3de9e88b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 11:02:16 +0100 Subject: [PATCH 11/15] Simplify and avoid unnecessary API calls --- src/apify/_actor.py | 5 -- .../_apify/_alias_resolving.py | 78 +++++-------------- .../actor_source/actor.json | 38 ++++++++- .../test_schema_storages/actor_source/main.py | 10 ++- tests/integration/test_storages.py | 53 ------------- .../storage_clients/test_alias_resolver.py | 26 ++++++- 6 files changed, 92 insertions(+), 118 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 7e3da0ba..76f4475b 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -36,7 +36,6 @@ from apify.events import ApifyEventManager, EventManager, LocalEventManager from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient, SmartApifyStorageClient -from apify.storage_clients._apify._alias_resolving import AliasResolver from apify.storage_clients._file_system import ApifyFileSystemStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -204,10 +203,6 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - else: - # Load pre-existing non-default aliased storages from configuration - # Supported only on the Apify platform, where those storages are pre-created by the platform. - await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 039e8e1f..4abc73e7 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -5,6 +5,8 @@ from logging import getLogger from typing import TYPE_CHECKING, ClassVar, Literal, overload +from propcache import cached_property + from apify_client import ApifyClientAsync from ._utils import hash_api_base_url_and_token @@ -136,11 +138,9 @@ def __init__( alias: str, configuration: Configuration, ) -> None: + self._storage_type = storage_type self._alias = alias self._configuration = configuration - self._storage_key = self.get_storage_key( - storage_type=storage_type, alias=alias, additional_cache_key=hash_api_base_url_and_token(configuration) - ) async def __aenter__(self) -> AliasResolver: """Context manager to prevent race condition in alias creation.""" @@ -194,6 +194,18 @@ async def resolve_id(self) -> str | None: Returns: Storage id if it exists, None otherwise. """ + # First try to find the alias in the configuration mapping to avoid any API calls. + # This mapping is maintained by the Apify platform and does not have to be maintained in the default KVS. + if self._configuration.actor_storages and self._alias != 'default': + storage_maps = { + 'Dataset': self._configuration.actor_storages.datasets, + 'KeyValueStore': self._configuration.actor_storages.key_value_stores, + 'RequestQueue': self._configuration.actor_storages.request_queues, + } + if storage_id := storage_maps.get(self._storage_type, {}).get(self._alias): + return storage_id + + # Fallback to the mapping saved in the default KVS return (await self._get_alias_map(self._configuration)).get(self._storage_key, None) async def store_mapping(self, storage_id: str) -> None: @@ -237,60 +249,12 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) - @classmethod - def get_storage_key( - cls, storage_type: Literal['Dataset', 'KeyValueStore', 'RequestQueue'], alias: str, additional_cache_key: str - ) -> str: - return cls._ALIAS_STORAGE_KEY_SEPARATOR.join( + @cached_property + def _storage_key(self) -> str: + return self._ALIAS_STORAGE_KEY_SEPARATOR.join( [ - storage_type, - alias, - additional_cache_key, + self._storage_type, + self._alias, + hash_api_base_url_and_token(self._configuration), ] ) - - @classmethod - async def register_aliases(cls, configuration: Configuration) -> None: - """Load alias mapping from configuration to the default kvs.""" - async with await cls._get_alias_init_lock(): - # Skip if no mapping or just default storages - if configuration.actor_storages is None or set( - configuration.actor_storages.datasets.keys() - | configuration.actor_storages.key_value_stores.keys() - | configuration.actor_storages.request_queues.keys() - ) == {'default'}: - return - - configuration_mapping = {} - - if configuration.default_dataset_id != configuration.actor_storages.datasets.get('default'): - logger.warning( - f'Conflicting default dataset ids: {configuration.default_dataset_id=},' - f" {configuration.actor_storages.datasets.get('default')=}" - ) - additional_cache_key = hash_api_base_url_and_token(configuration) - - for mapping, storage_type in ( - (configuration.actor_storages.key_value_stores, 'KeyValueStore'), - (configuration.actor_storages.datasets, 'Dataset'), - (configuration.actor_storages.request_queues, 'RequestQueue'), - ): - for storage_alias, storage_id in mapping.items(): - configuration_mapping[ - cls.get_storage_key( - storage_type, - '__default__' if storage_alias == 'default' else storage_alias, - additional_cache_key, - ) - ] = storage_id - - # Bulk update the mapping in the default KVS with the configuration mapping. - client = await cls._get_default_kvs_client(configuration=configuration) - record = await client.get_record(cls._ALIAS_MAPPING_KEY) - existing_mapping = record.get('value', {}) if record else {} - - # Update the existing mapping with the configuration mapping. - existing_mapping.update(configuration_mapping) - # Store the updated mapping back in the KVS and in memory. - await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) - cls._alias_map.update(existing_mapping) diff --git a/tests/e2e/test_schema_storages/actor_source/actor.json b/tests/e2e/test_schema_storages/actor_source/actor.json index 8501bd99..f72ae6b2 100644 --- a/tests/e2e/test_schema_storages/actor_source/actor.json +++ b/tests/e2e/test_schema_storages/actor_source/actor.json @@ -11,7 +11,43 @@ } } }, - "custom": { + "custom_d": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + } + }, + "key_value_stores": { + "default": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + }, + "custom_kvs": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + } + }, + "request_queues": { + "default": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + }, + "custom_rq": { "actorSpecification": 1, "fields": { "properties": { diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py index ebed9ba4..c068288d 100644 --- a/tests/e2e/test_schema_storages/actor_source/main.py +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -4,4 +4,12 @@ async def main() -> None: async with Actor: assert Actor.configuration.actor_storages - assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages.datasets['custom'] + assert (await Actor.open_dataset(alias='custom_d')).id == Actor.configuration.actor_storages.datasets[ + 'custom_d' + ] + assert (await Actor.open_dataset(alias='custom_kvs')).id == Actor.configuration.actor_storages.datasets[ + 'custom_kvs' + ] + assert (await Actor.open_dataset(alias='custom_rq')).id == Actor.configuration.actor_storages.datasets[ + 'custom_rq' + ] diff --git a/tests/integration/test_storages.py b/tests/integration/test_storages.py index 6eac1079..7ad807fe 100644 --- a/tests/integration/test_storages.py +++ b/tests/integration/test_storages.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -from typing import cast import pytest @@ -9,9 +8,7 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue from apify import Actor, Configuration -from apify._configuration import ActorStorages from apify.storage_clients import ApifyStorageClient, MemoryStorageClient, SmartApifyStorageClient -from apify.storage_clients._apify._alias_resolving import AliasResolver @pytest.mark.parametrize( @@ -128,53 +125,3 @@ async def test_actor_implicit_storage_init(apify_token: str) -> None: assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) - - -async def test_actor_storages_alias_resolving(apify_token: str) -> None: - """Test that `AliasResolver.register_aliases` correctly updates default KVS with Actor storages.""" - - # Actor storages - datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} - request_queues = {'default': 'default_request_queue_id', 'custom': 'custom_request_queue_id'} - key_value_stores = {'default': 'default_key_value_store_id', 'custom': 'custom_key_value_store_id'} - - # Set up the configuration and storage client for the test - configuration = Configuration( - default_key_value_store_id='default_kvs_id', - token=apify_token, - actor_storages=ActorStorages( - datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores - ), - ) - storage_client = ApifyStorageClient() - service_locator.set_configuration(configuration) - service_locator.set_storage_client(storage_client) - - client_cache_key = cast('tuple', storage_client.get_storage_client_cache_key(configuration))[-1] - # Add some unrelated pre-existing alias mapping (it should be preserved after registering aliases) - pre_existing_mapping = {f'KeyValueStore,pre_existing_alias,{client_cache_key}': 'pre_existing_id'} - - default_kvs = await KeyValueStore.open(configuration=configuration, storage_client=storage_client) - await default_kvs.set_value(AliasResolver._ALIAS_MAPPING_KEY, pre_existing_mapping) - - # Construct the expected mapping - expected_mapping = {} - for storage_type, storage_map in ( - ('Dataset', datasets), - ('KeyValueStore', key_value_stores), - ('RequestQueue', request_queues), - ): - for storage_alias, storage_id in storage_map.items(): - expected_mapping[ - ','.join( - (storage_type, '__default__' if storage_alias == 'default' else storage_alias, client_cache_key) - ) - ] = storage_id - expected_mapping.update(pre_existing_mapping) - - try: - configuration.default_key_value_store_id = default_kvs.id - await AliasResolver.register_aliases(configuration=configuration) - assert await default_kvs.get_value(AliasResolver._ALIAS_MAPPING_KEY) == expected_mapping - finally: - await default_kvs.drop() diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 56821ca1..088457f6 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,6 +1,6 @@ from __future__ import annotations -from apify._configuration import Configuration +from apify._configuration import ActorStorages, Configuration from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -76,3 +76,27 @@ async def test_get_alias_map_returns_in_memory_map() -> None: AliasResolver._alias_map = {} result = await AliasResolver._get_alias_map(config) assert result == {} + + +async def test_actor_storages_alias_resolving() -> None: + """Test that `AliasResolver.register_aliases` correctly updates default KVS with Actor storages.""" + + # Actor storages + datasets = {'default': 'default_dataset_id', 'custom': 'custom_Dataset_id'} + request_queues = {'default': 'default_request_queue_id', 'custom': 'custom_RequestQueue_id'} + key_value_stores = {'default': 'default_key_value_store_id', 'custom': 'custom_KeyValueStore_id'} + + # Set up the configuration and storage client for the test + configuration = Configuration( + default_key_value_store_id='default_kvs_id', + actor_storages=ActorStorages( + datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores + ), + ) + + # Construct the expected mapping + for storage_type in ('Dataset', 'KeyValueStore', 'RequestQueue'): + assert ( + await AliasResolver(storage_type=storage_type, alias='custom', configuration=configuration).resolve_id() + == f'custom_{storage_type}_id' + ) From f738e5804c4a89ddacab81ba2581bd18413029fa Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 19 Feb 2026 11:24:11 +0100 Subject: [PATCH 12/15] Finalize --- .../_apify/_alias_resolving.py | 20 +++++----- .../actor_source/actor.json | 38 +------------------ .../test_schema_storages/actor_source/main.py | 10 +---- .../storage_clients/test_alias_resolver.py | 9 ++--- 4 files changed, 16 insertions(+), 61 deletions(-) diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 4abc73e7..b2ad3df4 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -233,6 +233,16 @@ async def store_mapping(self, storage_id: str) -> None: except Exception as exc: logger.warning(f'Error storing alias mapping for {self._alias}: {exc}') + @cached_property + def _storage_key(self) -> str: + return self._ALIAS_STORAGE_KEY_SEPARATOR.join( + [ + self._storage_type, + self._alias, + hash_api_base_url_and_token(self._configuration), + ] + ) + @staticmethod async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStoreClientAsync: """Get a client for the default key-value store.""" @@ -248,13 +258,3 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore raise ValueError("'Configuration.default_key_value_store_id' must be set.") return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) - - @cached_property - def _storage_key(self) -> str: - return self._ALIAS_STORAGE_KEY_SEPARATOR.join( - [ - self._storage_type, - self._alias, - hash_api_base_url_and_token(self._configuration), - ] - ) diff --git a/tests/e2e/test_schema_storages/actor_source/actor.json b/tests/e2e/test_schema_storages/actor_source/actor.json index f72ae6b2..8501bd99 100644 --- a/tests/e2e/test_schema_storages/actor_source/actor.json +++ b/tests/e2e/test_schema_storages/actor_source/actor.json @@ -11,43 +11,7 @@ } } }, - "custom_d": { - "actorSpecification": 1, - "fields": { - "properties": { - "id": { "type": "string" } - } - } - } - }, - "key_value_stores": { - "default": { - "actorSpecification": 1, - "fields": { - "properties": { - "id": { "type": "string" } - } - } - }, - "custom_kvs": { - "actorSpecification": 1, - "fields": { - "properties": { - "id": { "type": "string" } - } - } - } - }, - "request_queues": { - "default": { - "actorSpecification": 1, - "fields": { - "properties": { - "id": { "type": "string" } - } - } - }, - "custom_rq": { + "custom": { "actorSpecification": 1, "fields": { "properties": { diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py index c068288d..ebed9ba4 100644 --- a/tests/e2e/test_schema_storages/actor_source/main.py +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -4,12 +4,4 @@ async def main() -> None: async with Actor: assert Actor.configuration.actor_storages - assert (await Actor.open_dataset(alias='custom_d')).id == Actor.configuration.actor_storages.datasets[ - 'custom_d' - ] - assert (await Actor.open_dataset(alias='custom_kvs')).id == Actor.configuration.actor_storages.datasets[ - 'custom_kvs' - ] - assert (await Actor.open_dataset(alias='custom_rq')).id == Actor.configuration.actor_storages.datasets[ - 'custom_rq' - ] + assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages.datasets['custom'] diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 088457f6..28a757b8 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -78,23 +78,22 @@ async def test_get_alias_map_returns_in_memory_map() -> None: assert result == {} -async def test_actor_storages_alias_resolving() -> None: - """Test that `AliasResolver.register_aliases` correctly updates default KVS with Actor storages.""" +async def test_configuration_storages_alias_resolving() -> None: + """Test that `AliasResolver` correctly resolves ids of storages registered in Configuration.""" # Actor storages datasets = {'default': 'default_dataset_id', 'custom': 'custom_Dataset_id'} request_queues = {'default': 'default_request_queue_id', 'custom': 'custom_RequestQueue_id'} key_value_stores = {'default': 'default_key_value_store_id', 'custom': 'custom_KeyValueStore_id'} - # Set up the configuration and storage client for the test + # Set up the configuration with the storage mapping configuration = Configuration( - default_key_value_store_id='default_kvs_id', actor_storages=ActorStorages( datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores ), ) - # Construct the expected mapping + # Check that id of each non-default storage saved in the mapping is resolved for storage_type in ('Dataset', 'KeyValueStore', 'RequestQueue'): assert ( await AliasResolver(storage_type=storage_type, alias='custom', configuration=configuration).resolve_id() From bc7d13ddc89ca524bfa5e2282910c0092df82785 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 20 Feb 2026 08:10:08 +0100 Subject: [PATCH 13/15] Review comments --- src/apify/_configuration.py | 6 ++---- src/apify/storage_clients/_apify/_alias_resolving.py | 4 ++-- tests/unit/actor/test_configuration.py | 8 +++----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 61110b8b..002394a4 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -1,6 +1,5 @@ from __future__ import annotations -import dataclasses import json from datetime import datetime, timedelta from decimal import Decimal @@ -8,7 +7,7 @@ from pathlib import Path from typing import Annotated, Any -from pydantic import AliasChoices, BeforeValidator, Field, model_validator +from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, model_validator from typing_extensions import Self, deprecated from crawlee import service_locator @@ -35,8 +34,7 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') -@dataclasses.dataclass -class ActorStorages: +class ActorStorages(BaseModel): """Storage IDs for different storage types used by an Actor.""" key_value_stores: dict[str, str] diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index b2ad3df4..9736b67c 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -2,11 +2,10 @@ import logging from asyncio import Lock +from functools import cached_property from logging import getLogger from typing import TYPE_CHECKING, ClassVar, Literal, overload -from propcache import cached_property - from apify_client import ApifyClientAsync from ._utils import hash_api_base_url_and_token @@ -235,6 +234,7 @@ async def store_mapping(self, storage_id: str) -> None: @cached_property def _storage_key(self) -> str: + """Get a unique storage key used for storing the alias in the mapping.""" return self._ALIAS_STORAGE_KEY_SEPARATOR.join( [ self._storage_type, diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index 461a103a..c8991be1 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -286,7 +287,6 @@ def test_max_total_charge_usd_decimal_parsing(monkeypatch: pytest.MonkeyPatch) - def test_actor_pricing_info_from_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: """Test that actor_pricing_info is parsed from JSON env var.""" - import json pricing_json = json.dumps( { @@ -304,11 +304,9 @@ def test_actor_pricing_info_from_json_env_var(monkeypatch: pytest.MonkeyPatch) - def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: """Test that actor_storages_json is parsed from JSON env var.""" - import json - datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} - request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} - key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_request_queue_id', 'custom': 'custom_request_queue_id'} + key_value_stores = {'default': 'default_key_value_store_id', 'custom': 'custom_key_value_store_id'} actor_storages_json = json.dumps( { From e2df85345a76adf198ef18ae8460fc5d70e9c84c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 20 Feb 2026 12:59:09 +0100 Subject: [PATCH 14/15] Change to TypedDict --- src/apify/_configuration.py | 28 +++++++++---------- .../_apify/_alias_resolving.py | 6 ++-- .../test_schema_storages/actor_source/main.py | 2 +- tests/unit/actor/test_configuration.py | 6 ++-- .../storage_clients/test_alias_resolver.py | 6 ++-- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 002394a4..bd02c650 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -7,8 +7,8 @@ from pathlib import Path from typing import Annotated, Any -from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, model_validator -from typing_extensions import Self, deprecated +from pydantic import AliasChoices, BeforeValidator, Field, model_validator +from typing_extensions import Self, TypedDict, deprecated from crawlee import service_locator from crawlee._utils.models import timedelta_ms @@ -34,26 +34,24 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') -class ActorStorages(BaseModel): - """Storage IDs for different storage types used by an Actor.""" - +class ActorStorages(TypedDict): key_value_stores: dict[str, str] datasets: dict[str, str] request_queues: dict[str, str] -def _load_storage_keys(data: None | str | dict | ActorStorages) -> ActorStorages | None: - """Load storage keys from environment.""" +def _load_storage_keys( + data: None | str | ActorStorages, +) -> ActorStorages | None: + """Load storage keys.""" if data is None: return None - if isinstance(data, ActorStorages): - return data - storage_mapping = data if isinstance(data, dict) else json.loads(data) - return ActorStorages( - key_value_stores=storage_mapping.get('keyValueStores', {}), - datasets=storage_mapping.get('datasets', {}), - request_queues=storage_mapping.get('requestQueues', {}), - ) + storage_mapping = json.loads(data) if isinstance(data, str) else data + return { + 'key_value_stores': storage_mapping.get('keyValueStores', storage_mapping.get('key_value_stores', {})), + 'datasets': storage_mapping.get('datasets', storage_mapping.get('datasets', {})), + 'request_queues': storage_mapping.get('requestQueues', storage_mapping.get('request_queues', {})), + } @docs_group('Configuration') diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 9736b67c..1170193b 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -197,9 +197,9 @@ async def resolve_id(self) -> str | None: # This mapping is maintained by the Apify platform and does not have to be maintained in the default KVS. if self._configuration.actor_storages and self._alias != 'default': storage_maps = { - 'Dataset': self._configuration.actor_storages.datasets, - 'KeyValueStore': self._configuration.actor_storages.key_value_stores, - 'RequestQueue': self._configuration.actor_storages.request_queues, + 'Dataset': self._configuration.actor_storages['datasets'], + 'KeyValueStore': self._configuration.actor_storages['key_value_stores'], + 'RequestQueue': self._configuration.actor_storages['request_queues'], } if storage_id := storage_maps.get(self._storage_type, {}).get(self._alias): return storage_id diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py index ebed9ba4..fcb7ef73 100644 --- a/tests/e2e/test_schema_storages/actor_source/main.py +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -4,4 +4,4 @@ async def main() -> None: async with Actor: assert Actor.configuration.actor_storages - assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages.datasets['custom'] + assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages['datasets']['custom'] diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index c8991be1..5199a07b 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -318,6 +318,6 @@ def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv('ACTOR_STORAGES_JSON', actor_storages_json) config = ApifyConfiguration() assert config.actor_storages - assert config.actor_storages.datasets == datasets - assert config.actor_storages.request_queues == request_queues - assert config.actor_storages.key_value_stores == key_value_stores + assert config.actor_storages['datasets'] == datasets + assert config.actor_storages['request_queues'] == request_queues + assert config.actor_storages['key_value_stores'] == key_value_stores diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 28a757b8..0cfe3103 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,6 +1,6 @@ from __future__ import annotations -from apify._configuration import ActorStorages, Configuration +from apify._configuration import Configuration from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -88,9 +88,7 @@ async def test_configuration_storages_alias_resolving() -> None: # Set up the configuration with the storage mapping configuration = Configuration( - actor_storages=ActorStorages( - datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores - ), + actor_storages={'datasets': datasets, 'request_queues': request_queues, 'key_value_stores': key_value_stores} ) # Check that id of each non-default storage saved in the mapping is resolved From a7306a85eb6c3ccce9f0abf91434ad72278f644b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Mon, 23 Feb 2026 16:52:08 +0100 Subject: [PATCH 15/15] Apply suggestions from code review Co-authored-by: Vlada Dusek --- src/apify/_configuration.py | 27 +++++++++++++++---- .../test_schema_storages/actor_source/main.py | 4 ++- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index bd02c650..005b8e28 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -35,15 +35,32 @@ def _transform_to_list(value: Any) -> list[str] | None: class ActorStorages(TypedDict): + """Mapping of storage aliases to their IDs, grouped by storage type. + + Populated from the `ACTOR_STORAGES_JSON` env var that the Apify platform sets when an Actor declares + named storages in its `actor.json` schema. Each key maps a user-defined alias (e.g. `'custom'`) + to the platform-assigned storage ID. + """ + key_value_stores: dict[str, str] datasets: dict[str, str] request_queues: dict[str, str] -def _load_storage_keys( - data: None | str | ActorStorages, -) -> ActorStorages | None: - """Load storage keys.""" +def _load_storage_keys(data: None | str | ActorStorages) -> ActorStorages | None: + """Parse the `ACTOR_STORAGES_JSON` value into a normalized `ActorStorages` dict. + + The platform provides this as a JSON string with camelCase keys (`keyValueStores`, `requestQueues`, `datasets`). + This validator deserializes the JSON when needed and normalizes the keys to snake_case, falling back to empty + dicts for missing storage types. + + Args: + data: Raw value - `None` when the env var is unset, a JSON string from the env var, or an already-parsed + `ActorStorages` dict when set programmatically. + + Returns: + Normalized storage mapping, or `None` if the input is `None`. + """ if data is None: return None storage_mapping = json.loads(data) if isinstance(data, str) else data @@ -470,7 +487,7 @@ class Configuration(CrawleeConfiguration): ActorStorages | None, Field( alias='actor_storages_json', - description='Storage IDs for the actor', + description='Mapping of storage aliases to their platform-assigned IDs.', ), BeforeValidator(_load_storage_keys), ] = None diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py index fcb7ef73..43a82207 100644 --- a/tests/e2e/test_schema_storages/actor_source/main.py +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -4,4 +4,6 @@ async def main() -> None: async with Actor: assert Actor.configuration.actor_storages - assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages['datasets']['custom'] + dataset = await Actor.open_dataset(alias='custom') + expected_id = Actor.configuration.actor_storages['datasets']['custom'] + assert dataset.id == expected_id