diff --git a/ingestify/infra/store/file/__init__.py b/ingestify/infra/store/file/__init__.py index e439861..dd3c910 100644 --- a/ingestify/infra/store/file/__init__.py +++ b/ingestify/infra/store/file/__init__.py @@ -1,2 +1,3 @@ +from .gcs_file_repository import GCSFileRepository from .local_file_repository import LocalFileRepository from .s3_file_repository import S3FileRepository diff --git a/ingestify/infra/store/file/gcs_file_repository.py b/ingestify/infra/store/file/gcs_file_repository.py new file mode 100644 index 0000000..a92ed61 --- /dev/null +++ b/ingestify/infra/store/file/gcs_file_repository.py @@ -0,0 +1,52 @@ +from pathlib import Path +from typing import BinaryIO + +from ingestify.domain import Dataset +from ingestify.domain.models import FileRepository + + +class GCSFileRepository(FileRepository): + _client = None + + @property + def client(self): + if not self._client: + from google.cloud import storage + + self._client = storage.Client() + return self._client + + def __getstate__(self): + return { + "base_dir": self.base_dir, + "_client": None, + "identifier_transformer": self.identifier_transformer, + } + + def save_content( + self, + bucket: str, + dataset: Dataset, + revision_id: int, + filename: str, + stream: BinaryIO, + ) -> Path: + key = self.get_write_path(bucket, dataset, revision_id, filename) + gcs_bucket = Path(key.parts[0]) + self.client.bucket(str(gcs_bucket)).blob( + str(key.relative_to(gcs_bucket)) + ).upload_from_file(stream) + return key + + def load_content(self, storage_path: str) -> BinaryIO: + key = self.get_read_path(storage_path) + gcs_bucket = Path(key.parts[0]) + return ( + self.client.bucket(str(gcs_bucket)) + .blob(str(key.relative_to(gcs_bucket))) + .open("rb") + ) + + @classmethod + def supports(cls, url: str) -> bool: + return url.startswith("gcs://") diff --git a/ingestify/main.py b/ingestify/main.py index 4ba90b2..e60b09a 100644 --- a/ingestify/main.py +++ b/ingestify/main.py @@ -21,7 +21,7 @@ from ingestify.domain.models.fetch_policy import FetchPolicy from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer from ingestify.exceptions import ConfigurationError -from ingestify.infra import S3FileRepository, LocalFileRepository +from ingestify.infra import S3FileRepository, LocalFileRepository, GCSFileRepository from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository from ingestify.infra.store.dataset.sqlalchemy.repository import ( SqlAlchemySessionProvider, @@ -67,6 +67,10 @@ def build_file_repository(file_url: str, identifier_transformer) -> FileReposito repository = S3FileRepository( url=file_url, identifier_transformer=identifier_transformer ) + elif file_url.startswith("gcs://"): + repository = GCSFileRepository( + url=file_url, identifier_transformer=identifier_transformer + ) elif file_url.startswith("file://"): repository = LocalFileRepository( url=file_url, identifier_transformer=identifier_transformer diff --git a/setup.py b/setup.py index 3915265..d4c3ea0 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,10 @@ def setup_package(): "boto3", "pydantic>=2.0.0", ], - extras_require={"test": ["pytest>=6.2.5,<7", "pytz"]}, + extras_require={ + "gcs": ["google-cloud-storage>=2.0.0"], + "test": ["pytest>=6.2.5,<7", "pytz"], + }, )