diff --git a/apify-api/openapi/components/schemas/actor-runs/Run.yaml b/apify-api/openapi/components/schemas/actor-runs/Run.yaml index e950d4ff9d..9ccc75b47d 100644 --- a/apify-api/openapi/components/schemas/actor-runs/Run.yaml +++ b/apify-api/openapi/components/schemas/actor-runs/Run.yaml @@ -97,6 +97,40 @@ properties: type: string examples: [FL35cSF7jrxr3BY39] description: ID of the default request queue for this run. + storageIds: + type: object + description: Storage IDs associated with this run, organized by storage type. + properties: + datasets: + type: object + description: Dataset storage IDs. + properties: + default: + type: string + examples: [wmKPijuyDnPZAPRMk] + description: ID of the default dataset for this run. + additionalProperties: + type: string + keyValueStores: + type: object + description: Key-value store storage IDs. + properties: + default: + type: string + examples: [eJNzqsbPiopwJcgGQ] + description: ID of the default key-value store for this run. + additionalProperties: + type: string + requestQueues: + type: object + description: Request queue storage IDs. + properties: + default: + type: string + examples: [FL35cSF7jrxr3BY39] + description: ID of the default request queue for this run. 
+ additionalProperties: + type: string buildNumber: type: string examples: [0.0.36] diff --git a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}.yaml b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}.yaml index a6bd87858f..fce9ce2eb5 100644 --- a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}.yaml +++ b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}.yaml @@ -148,6 +148,13 @@ get: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true @@ -292,6 +299,13 @@ put: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@abort.yaml b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@abort.yaml index 0419c9fa67..e73ab8bc9d 100644 --- a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@abort.yaml +++ b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@abort.yaml @@ -80,6 +80,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 isContainerServerReady: false gitBranchName: master usage: diff --git a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@metamorph.yaml b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@metamorph.yaml 
index 61de7b46d6..36339a35f6 100644 --- a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@metamorph.yaml +++ b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@metamorph.yaml @@ -101,6 +101,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 metamorphs: - createdAt: "2019-11-30T07:39:24.202Z" actorId: nspoEjklmnsF2oosD diff --git a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@reboot.yaml b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@reboot.yaml index e4e6adb00f..f2b23d4735 100644 --- a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@reboot.yaml +++ b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@reboot.yaml @@ -71,6 +71,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@resurrect.yaml b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@resurrect.yaml index c639fe92d8..c3ce09560a 100644 --- a/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@resurrect.yaml +++ b/apify-api/openapi/paths/actor-runs/actor-runs@{runId}@resurrect.yaml @@ -100,6 +100,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" 
isContainerServerReady: true diff --git a/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs.yaml b/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs.yaml index 684bec8375..60ac2ffef1 100644 --- a/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs.yaml +++ b/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs.yaml @@ -316,6 +316,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.2.2 containerUrl: "https://nwfcc4btrgqt.runs.apify.com" isContainerServerReady: false diff --git a/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs@last.yaml b/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs@last.yaml index b85a53c2fa..6beea31a7e 100644 --- a/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs@last.yaml +++ b/apify-api/openapi/paths/actor-tasks/actor-tasks@{actorTaskId}@runs@last.yaml @@ -123,6 +123,13 @@ get: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs.yaml index 244936cd64..476f08c45c 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs.yaml @@ -333,6 +333,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: 
+ default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs@last.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs@last.yaml index 4e558aeb17..32ac2f4a0b 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs@last.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs@last.yaml @@ -125,6 +125,13 @@ get: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}.yaml index b1718338c1..b9d53be04c 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}.yaml @@ -97,6 +97,13 @@ get: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@abort.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@abort.yaml index eecfc7783a..676580126d 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@abort.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@abort.yaml @@ -89,6 +89,13 @@ 
post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 isContainerServerReady: false gitBranchName: master usage: diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@metamorph.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@metamorph.yaml index b0bf9cd7ac..7f9753f444 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@metamorph.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@metamorph.yaml @@ -113,6 +113,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 metamorphs: - createdAt: "2019-11-30T07:39:24.202Z" actorId: nspoEjklmnsF2oosD diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@resurrect.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@resurrect.yaml index b610d759e2..4cea0cb00e 100644 --- a/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@resurrect.yaml +++ b/apify-api/openapi/paths/actors/acts@{actorId}@runs@{runId}@resurrect.yaml @@ -128,6 +128,13 @@ post: defaultKeyValueStoreId: eJNzqsbPiopwJcgGQ defaultDatasetId: wmKPijuyDnPZAPRMk defaultRequestQueueId: FL35cSF7jrxr3BY39 + storageIds: + datasets: + default: wmKPijuyDnPZAPRMk + keyValueStores: + default: eJNzqsbPiopwJcgGQ + requestQueues: + default: FL35cSF7jrxr3BY39 buildNumber: 0.0.36 containerUrl: "https://g8kd8kbc5ge8.runs.apify.net" isContainerServerReady: true diff --git a/sources/platform/actors/development/actor_definition/dataset_schema/multiple_datasets.md 
b/sources/platform/actors/development/actor_definition/dataset_schema/multiple_datasets.md new file mode 100644 index 0000000000..80f5abf4a4 --- /dev/null +++ b/sources/platform/actors/development/actor_definition/dataset_schema/multiple_datasets.md @@ -0,0 +1,108 @@ +--- +title: Multiple datasets +description: Learn how to use multiple datasets within your Actors to organize and store different types of data separately. +slug: /actors/development/actor-definition/dataset-schema/multiple-datasets +--- + +**Specify datasets with different structures.** + +--- + +Some Actors produce data with different structures. In some cases, it's convenient to store the data in separate datasets, instead of pushing all data to the default one. Multiple datasets allow you to specify those datasets upfront and enforce validation rules. + +New datasets are created when the run starts, and follow its data retention. + + +## Defining multiple datasets + +Multiple datasets may be defined in the Actor schema using the `datasets` object: + +```json title=".actor/actor.json" +{ + "actorSpecification": 1, + "name": "this-is-e-commerce-scraper", + "title": "E-Commerce Scraper", + "version": "1.0.0", + "storages": { + "datasets": { + "default": "./products_dataset_schema.json", + "categories": "./categories_dataset_schema.json" + } + } +} +``` +Schemas of individual datasets can be provided as a file reference or inlined. + +The keys of the `datasets` object are **aliases**, which can be used to refer to the specific datasets. In the example above, we have two datasets, aliased as `default` and `categories`. + +:::info + +Alias and **name** are not the same thing. Named datasets have specific behavior on the Apify platform (e.g., the automatic data retention policy does not apply to them). Aliased datasets follow the data retention of their respective run. Aliases stay local to the run they belong to. + +::: + +The `datasets` object has to contain the `default` alias. 
+ +The `datasets` and `dataset` objects are mutually exclusive; the schema can contain only one of them. + +## Accessing the datasets in Actor code + +The mapping of aliases to storage IDs is passed to the Actor in the JSON-encoded `ACTOR_STORAGES_JSON` environment variable. + +```javascript +const storageIds = JSON.parse(process.env.ACTOR_STORAGES_JSON) +const defaultDataset = await Actor.openDataset(); +// For the default dataset, it's also possible to use the following syntax: +// const defaultDataset = await Actor.openDataset(storageIds.datasets.default); +const categoriesDataset = await Actor.openDataset(storageIds.datasets.categories); + +``` + +```sh +echo $ACTOR_STORAGES_JSON | jq '.datasets.categories' +``` + +Support in the JS and Python SDKs is coming; the expected syntax is as follows: + +```javascript +const categoriesDataset = await Actor.openDataset({alias: 'categories'}); +``` + +```python +categories_dataset = await Actor.open_dataset(alias='categories') +``` + +## Showing data to users + +### Run Storages tab + +The Storage tab of the Actor run view displays all the datasets defined by the Actor and the datasets that were used by the run (up to some limit). + +This makes the data accessible, but not very user-friendly. To make the datasets more accessible to users, use an output schema. + +### Output schema + +Actors with an output schema can refer to the datasets through variables using aliases: + +```json +{ + "actorOutputSchemaVersion": 1, + "title": "Output schema", + "properties": { + "products": { + "type": "string", + "title": "Products", + "template": "{{storages.datasets.default.apiUrl}}/items" + }, + "categories": { + "type": "string", + "title": "Categories", + "template": "{{storages.datasets.categories.apiUrl}}/items" + } + } +} +``` + +## Billing implications + +The `apify-default-dataset-item` synthetic event is only charged for items in the dataset aliased as `default`. Charging for items in other datasets needs to be implemented in the Actor code. 
diff --git a/sources/platform/actors/development/programming_interface/environment_variables.md b/sources/platform/actors/development/programming_interface/environment_variables.md index b295748a64..92df9b7955 100644 --- a/sources/platform/actors/development/programming_interface/environment_variables.md +++ b/sources/platform/actors/development/programming_interface/environment_variables.md @@ -44,6 +44,7 @@ Here's a table of key system environment variables: | `ACTOR_BUILD_TAGS` | A comma-separated list of tags of the Actor build used in the run. Note that this environment variable is assigned at the time of start of the Actor and doesn't change over time, even if the assigned build tags change. | | `ACTOR_TASK_ID` | ID of the Actor task. Empty if Actor is run outside of any task, e.g. directly using the API. | | `ACTOR_EVENTS_WEBSOCKET_URL` | Websocket URL where Actor may listen for [events](/platform/actors/development/programming-interface/system-events) from Actor platform. | +| `ACTOR_STORAGES_JSON` | JSON-encoded unique identifiers of storages associated with the current Actor run. | | `ACTOR_DEFAULT_DATASET_ID` | Unique identifier for the default dataset associated with the current Actor run. | | `ACTOR_DEFAULT_KEY_VALUE_STORE_ID` | Unique identifier for the default key-value store associated with the current Actor run. | | `ACTOR_DEFAULT_REQUEST_QUEUE_ID` | Unique identifier for the default request queue associated with the current Actor run. |