Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ingestify/infra/store/file/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .gcs_file_repository import GCSFileRepository
from .local_file_repository import LocalFileRepository
from .s3_file_repository import S3FileRepository
52 changes: 52 additions & 0 deletions ingestify/infra/store/file/gcs_file_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from pathlib import Path
from typing import BinaryIO

from ingestify.domain import Dataset
from ingestify.domain.models import FileRepository


class GCSFileRepository(FileRepository):
_client = None

@property
def client(self):
if not self._client:
from google.cloud import storage

self._client = storage.Client()
return self._client

def __getstate__(self):
return {
"base_dir": self.base_dir,
"_client": None,
"identifier_transformer": self.identifier_transformer,
}

def save_content(
self,
bucket: str,
dataset: Dataset,
revision_id: int,
filename: str,
stream: BinaryIO,
) -> Path:
key = self.get_write_path(bucket, dataset, revision_id, filename)
gcs_bucket = Path(key.parts[0])
self.client.bucket(str(gcs_bucket)).blob(
str(key.relative_to(gcs_bucket))
).upload_from_file(stream)
return key

def load_content(self, storage_path: str) -> BinaryIO:
key = self.get_read_path(storage_path)
gcs_bucket = Path(key.parts[0])
return (
self.client.bucket(str(gcs_bucket))
.blob(str(key.relative_to(gcs_bucket)))
.open("rb")
)

@classmethod
def supports(cls, url: str) -> bool:
return url.startswith("gcs://")
6 changes: 5 additions & 1 deletion ingestify/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from ingestify.domain.models.fetch_policy import FetchPolicy
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.exceptions import ConfigurationError
from ingestify.infra import S3FileRepository, LocalFileRepository
from ingestify.infra import S3FileRepository, LocalFileRepository, GCSFileRepository
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import (
SqlAlchemySessionProvider,
Expand Down Expand Up @@ -67,6 +67,10 @@ def build_file_repository(file_url: str, identifier_transformer) -> FileReposito
repository = S3FileRepository(
url=file_url, identifier_transformer=identifier_transformer
)
elif file_url.startswith("gcs://"):
repository = GCSFileRepository(
url=file_url, identifier_transformer=identifier_transformer
)
elif file_url.startswith("file://"):
repository = LocalFileRepository(
url=file_url, identifier_transformer=identifier_transformer
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ def setup_package():
"boto3",
"pydantic>=2.0.0",
],
extras_require={"test": ["pytest>=6.2.5,<7", "pytz"]},
extras_require={
"gcs": ["google-cloud-storage>=2.0.0"],
"test": ["pytest>=6.2.5,<7", "pytz"],
},
)


Expand Down
Loading