|
1 | 1 | import importlib.metadata |
2 | 2 | import json |
| 3 | +import mimetypes |
3 | 4 | import uuid |
4 | 5 | import logging |
5 | 6 | from itertools import count |
@@ -509,6 +510,84 @@ def ingest_upload( |
509 | 510 | backoff(ae, attempt) |
510 | 511 | return {} |
511 | 512 |
|
def signed_url_upload(
    self,
    collection_id: str,
    file_path: Optional[Path] = None,
    metadata: Optional[Dict] = None,
    index: bool = True,
) -> Dict:
    """
    Upload a document using the signed URL workflow.

    When ``file_path`` is None or points to a directory there is no file
    content to upload, so the call is delegated to ``ingest_upload``.

    The workflow is:
      1. POST /file/uploadUrl -> {url, id}
      2. PUT file content to the signed url
      3. POST /collections/{id}/document with the upload_id and metadata

    The three steps are retried as a unit: when an ``AlephException`` is
    flagged as transient and the attempt number does not exceed
    ``self.retries``, ``backoff`` is called and the workflow starts over
    with a freshly requested signed URL.

    params
    ------
    collection_id: id of the collection to upload to
    file_path: path of the file to upload. None while creating folders
    metadata: dict containing metadata for the file or folders
    index: whether to index the document after creation

    returns
    -------
    The response of the document creation request, or
    ``{"id": upload_id, "status": "ok"}`` when that response is empty.

    raises
    ------
    AlephException: on a non-transient failure, when the retries are
        exhausted, or when the upload URL endpoint answers with 404.
    """
    if not file_path or file_path.is_dir():
        return self.ingest_upload(
            collection_id, file_path, metadata=metadata, index=index
        )

    mime_type = mimetypes.guess_type(file_path.name)[0] or MIME
    # Copy so the caller's metadata dict is never mutated.
    meta = dict(metadata or {})
    meta["file_name"] = file_path.name
    meta["mime_type"] = mime_type

    for attempt in count(1):
        try:
            # Step 1: request a signed upload URL
            upload_url = self._make_url("file/uploadUrl")
            try:
                result = self._request("POST", upload_url)
            except AlephException as ae:
                if ae.status == 404:
                    raise AlephException(
                        "Upload endpoint not found. Is this an Aleph Pro instance?"
                    ) from ae
                raise
            signed_url = result["url"]
            upload_id = result["id"]
            log.info("Signed URL id [%s]: %s", upload_id, file_path.name)

            # Step 2: PUT file content to the signed URL
            try:
                with file_path.open("rb") as fh:
                    # The guessed mime type is only recorded in the metadata;
                    # the PUT itself is sent as a generic byte stream.
                    # NOTE(review): presumably the signed URL expects exactly
                    # this Content-Type - confirm before changing it.
                    response = self.session.put(
                        signed_url,
                        data=fh,
                        headers={"Content-Type": "application/octet-stream"},
                    )
                response.raise_for_status()
            except (RequestException, HTTPError) as exc:
                raise AlephException(exc) from exc

            # Step 3: create the document record
            doc_url_path = f"collections/{collection_id}/document"
            doc_url = self._make_url(doc_url_path, params={"index": index})
            # NOTE(review): the "Meta" key is capitalised - confirm this is
            # the spelling the document endpoint expects.
            payload = {"upload_id": upload_id, "Meta": meta}
            result = self._request("POST", doc_url, json=payload)
            if not result:
                return {"id": upload_id, "status": "ok"}
            return result
        except AlephException as ae:
            if not ae.transient or attempt > self.retries:
                # Bare re-raise keeps the traceback and any cause attached
                # above (e.g. the original 404 error) intact.
                raise
            backoff(ae, attempt)
    # Not reached in practice (count() never ends); kept so every code path
    # visibly returns a Dict.
    return {}
| 590 | + |
512 | 591 | def create_entityset( |
513 | 592 | self, collection_id: str, type: str, label: str, summary: Optional[str] |
514 | 593 | ) -> Dict: |
|
0 commit comments