From 370d229fa312b2da7bae7b49572cf9ce8c2257c2 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Thu, 19 Mar 2026 13:39:27 +0100 Subject: [PATCH 1/3] Add GCS file repository support Implements GCSFileRepository with Application Default Credentials and lazy client init. Accessible via gcs:// URL. Install with pip install ingestify[gcs]. --- .../infra/store/file/gcs_file_repository.py | 49 +++++++++++++++++++ ingestify/main.py | 5 ++ setup.py | 5 +- 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 ingestify/infra/store/file/gcs_file_repository.py diff --git a/ingestify/infra/store/file/gcs_file_repository.py b/ingestify/infra/store/file/gcs_file_repository.py new file mode 100644 index 0000000..d5e29dd --- /dev/null +++ b/ingestify/infra/store/file/gcs_file_repository.py @@ -0,0 +1,49 @@ +from pathlib import Path +from typing import BinaryIO + +from ingestify.domain import Dataset +from ingestify.domain.models import FileRepository + + +class GCSFileRepository(FileRepository): + _client = None + + @property + def client(self): + if not self._client: + from google.cloud import storage + + self._client = storage.Client() + return self._client + + def __getstate__(self): + return { + "base_dir": self.base_dir, + "_client": None, + "identifier_transformer": self.identifier_transformer, + } + + def save_content( + self, + bucket: str, + dataset: Dataset, + revision_id: int, + filename: str, + stream: BinaryIO, + ) -> Path: + key = self.get_write_path(bucket, dataset, revision_id, filename) + gcs_bucket = key.parts[0] + blob_name = str(Path(*key.parts[1:])) + self.client.bucket(gcs_bucket).blob(blob_name).upload_from_file(stream) + return key + + def load_content(self, storage_path: str) -> BinaryIO: + key = self.get_read_path(storage_path) + gcs_bucket = key.parts[0] + blob_name = str(Path(*key.parts[1:])) + blob = self.client.bucket(gcs_bucket).blob(blob_name) + return blob.open("rb") + + @classmethod + def supports(cls, url: str) -> bool: + return url.startswith("gcs://") diff --git a/ingestify/main.py b/ingestify/main.py index 4ba90b2..83ddaa1 100644 --- a/ingestify/main.py +++ b/ingestify/main.py @@ -22,6 +22,7 @@ from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer from ingestify.exceptions import ConfigurationError from ingestify.infra import S3FileRepository, LocalFileRepository +from ingestify.infra.store.file.gcs_file_repository import GCSFileRepository from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository from ingestify.infra.store.dataset.sqlalchemy.repository import ( SqlAlchemySessionProvider, @@ -67,6 +68,10 @@ def build_file_repository(file_url: str, identifier_transformer) -> FileReposito repository = S3FileRepository( url=file_url, identifier_transformer=identifier_transformer ) + elif file_url.startswith("gcs://"): + repository = GCSFileRepository( + url=file_url, identifier_transformer=identifier_transformer + ) elif file_url.startswith("file://"): repository = LocalFileRepository( url=file_url, identifier_transformer=identifier_transformer diff --git a/setup.py b/setup.py index 3915265..d4c3ea0 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,10 @@ def setup_package(): "boto3", "pydantic>=2.0.0", ], - extras_require={"test": ["pytest>=6.2.5,<7", "pytz"]}, + extras_require={ + "gcs": ["google-cloud-storage>=2.0.0"], + "test": ["pytest>=6.2.5,<7", "pytz"], + }, ) From 8b90285ef90a2125dadd6fad5ab7774a383ce130 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Thu, 19 Mar 2026 13:44:36 +0100 Subject: [PATCH 2/3] Align GCSFileRepository with S3/LocalFileRepository conventions - Export via infra/store/file/__init__.py like other repositories - Import from ingestify.infra in main.py - Align blob_name path handling with S3FileRepository --- ingestify/infra/store/file/__init__.py | 1 + ingestify/infra/store/file/gcs_file_repository.py | 15 ++++++++------- ingestify/main.py | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ingestify/infra/store/file/__init__.py b/ingestify/infra/store/file/__init__.py index e439861..dd3c910 100644 --- a/ingestify/infra/store/file/__init__.py +++ b/ingestify/infra/store/file/__init__.py @@ -1,2 +1,3 @@ +from .gcs_file_repository import GCSFileRepository from .local_file_repository import LocalFileRepository from .s3_file_repository import S3FileRepository diff --git a/ingestify/infra/store/file/gcs_file_repository.py b/ingestify/infra/store/file/gcs_file_repository.py index d5e29dd..4df2923 100644 --- a/ingestify/infra/store/file/gcs_file_repository.py +++ b/ingestify/infra/store/file/gcs_file_repository.py @@ -32,17 +32,18 @@ def save_content( stream: BinaryIO, ) -> Path: key = self.get_write_path(bucket, dataset, revision_id, filename) - gcs_bucket = key.parts[0] - blob_name = str(Path(*key.parts[1:])) - self.client.bucket(gcs_bucket).blob(blob_name).upload_from_file(stream) + gcs_bucket = Path(key.parts[0]) + self.client.bucket(str(gcs_bucket)).blob( + str(key.relative_to(gcs_bucket)) + ).upload_from_file(stream) return key def load_content(self, storage_path: str) -> BinaryIO: key = self.get_read_path(storage_path) - gcs_bucket = key.parts[0] - blob_name = str(Path(*key.parts[1:])) - blob = self.client.bucket(gcs_bucket).blob(blob_name) - return blob.open("rb") + gcs_bucket = Path(key.parts[0]) + return self.client.bucket(str(gcs_bucket)).blob( + str(key.relative_to(gcs_bucket)) + ).open("rb") @classmethod def supports(cls, url: str) -> bool: diff --git a/ingestify/main.py b/ingestify/main.py index 83ddaa1..e60b09a 100644 --- a/ingestify/main.py +++ b/ingestify/main.py @@ -21,8 +21,7 @@ from ingestify.domain.models.fetch_policy import FetchPolicy from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer from ingestify.exceptions import ConfigurationError -from ingestify.infra import S3FileRepository, LocalFileRepository -from ingestify.infra.store.file.gcs_file_repository import GCSFileRepository +from ingestify.infra import S3FileRepository, LocalFileRepository, GCSFileRepository from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository from ingestify.infra.store.dataset.sqlalchemy.repository import ( SqlAlchemySessionProvider, From a5267a56af348deb95eb3592af764786aa675c50 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Thu, 19 Mar 2026 14:22:19 +0100 Subject: [PATCH 3/3] Code formatting --- ingestify/infra/store/file/gcs_file_repository.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ingestify/infra/store/file/gcs_file_repository.py b/ingestify/infra/store/file/gcs_file_repository.py index 4df2923..a92ed61 100644 --- a/ingestify/infra/store/file/gcs_file_repository.py +++ b/ingestify/infra/store/file/gcs_file_repository.py @@ -41,9 +41,11 @@ def save_content( def load_content(self, storage_path: str) -> BinaryIO: key = self.get_read_path(storage_path) gcs_bucket = Path(key.parts[0]) - return self.client.bucket(str(gcs_bucket)).blob( - str(key.relative_to(gcs_bucket)) - ).open("rb") + return ( + self.client.bucket(str(gcs_bucket)) + .blob(str(key.relative_to(gcs_bucket))) + .open("rb") + ) @classmethod def supports(cls, url: str) -> bool: