-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathfile.py
More file actions
225 lines (190 loc) · 7.68 KB
/
file.py
File metadata and controls
225 lines (190 loc) · 7.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
from io import BytesIO, StringIO
import os
from typing import List, Optional, Union, IO
from pydantic import BaseModel, ConfigDict, Field
from pydantic.fields import PrivateAttr
from dvuploader.checksum import Checksum, ChecksumTypes
class File(BaseModel):
    """
    Represents a file with its properties and methods for uploading to Dataverse.

    Attributes:
        filepath (str): The path to the file.
        handler (Union[BytesIO, StringIO, IO, None]): File handler for reading the file contents.
        description (Optional[str]): The description of the file.
        directory_label (Optional[str]): The label of the directory where the file is stored.
        mimeType (str): The MIME type of the file.
        categories (Optional[List[str]]): The categories associated with the file.
        restrict (bool): Indicates if the file is restricted.
        checksum_type (ChecksumTypes): The type of checksum used for the file.
        storageIdentifier (Optional[str]): The identifier of the storage where the file is stored.
        file_name (Optional[str]): The name of the file.
        checksum (Optional[Checksum]): The checksum of the file.
        file_id (Optional[Union[str, int]]): The ID of the file to replace.
        tab_ingest (bool): Indicates if tabular ingest should be performed.
        to_replace (bool): Indicates if the file should be replaced.

    Private Attributes:
        _size (int): Size of the file in bytes.
        _unchanged_data (bool): Indicates if the file data has not changed since last upload.
        _enforce_metadata_update (bool): Indicates if metadata update is enforced.
        _is_inside_zip (bool): Indicates if the file is packaged inside a zip archive.

    Methods:
        extract_file_name(): Derives the file name and size and initializes the checksum.
        get_handler(): Returns the supplied handler or opens the file at ``filepath``.
        _validate_filepath(path): Validates if the file path exists and is a file.
        apply_checksum(): Calculates and applies the checksum for the file.
        update_checksum_chunked(blocksize): Feeds the file contents into the checksum.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        arbitrary_types_allowed=True,
    )

    filepath: str = Field(
        ...,
        exclude=True,
        description="The path to the file",
    )
    handler: Union[BytesIO, StringIO, IO, None] = Field(
        default=None,
        exclude=True,
        description="File handler for reading the file contents",
    )
    description: Optional[str] = Field(
        default=None,
        alias="description",
        description="The description of the file",
    )
    directory_label: Optional[str] = Field(
        default=None,
        alias="directoryLabel",
        description="The label of the directory where the file is stored",
    )
    mimeType: str = Field(
        default="application/octet-stream",
        description="The MIME type of the file",
    )
    categories: Optional[List[str]] = Field(
        default=["DATA"],
        alias="categories",
        description="The categories associated with the file",
    )
    restrict: bool = Field(
        default=False,
        alias="restrict",
        description="Indicates if the file is restricted",
    )
    checksum_type: ChecksumTypes = Field(
        default=ChecksumTypes.MD5,
        exclude=True,
        description="The type of checksum used for the file",
    )
    storageIdentifier: Optional[str] = Field(
        default=None,
        description="The identifier of the storage where the file is stored",
    )
    file_name: Optional[str] = Field(
        default=None,
        alias="fileName",
        description="The name of the file",
    )
    checksum: Optional[Checksum] = Field(
        default=None,
        description="The checksum of the file",
    )
    file_id: Optional[Union[str, int]] = Field(
        default=None,
        alias="fileToReplaceId",
        description="The ID of the file to replace",
    )
    tab_ingest: bool = Field(
        default=True,
        alias="tabIngest",
        description="Indicates if tabular ingest should be performed",
    )
    to_replace: bool = Field(
        default=False,
        description="Indicates if the file should be replaced",
    )

    _size: int = PrivateAttr(default=0)
    _unchanged_data: bool = PrivateAttr(default=False)
    _enforce_metadata_update: bool = PrivateAttr(default=False)
    _is_inside_zip: bool = PrivateAttr(default=False)

    def extract_file_name(self):
        """
        Derives the file name and size and prepares the checksum.

        For path-based files the path is validated and the size is taken
        from the file system. For handler-based files the size is measured by
        reading the handler, ``directory_label`` is set to the directory part
        of ``filepath``, and the handler is rewound to the start.

        Returns:
            self: The current instance of the class.

        Raises:
            FileNotFoundError: If no handler is given and the filepath does not exist.
            IsADirectoryError: If no handler is given and the filepath is not a file.
        """
        # The enum value unpacks into the two arguments of Checksum.from_algo.
        hash_algo, hash_fun = self.checksum_type.value

        if self.handler is None:
            self._validate_filepath(self.filepath)
            self._size = os.path.getsize(self.filepath)
        else:
            # Measure from the start of the stream so that a handler which
            # was already partially consumed does not under-report its size.
            self.handler.seek(0)
            # NOTE(review): for text handlers (e.g. StringIO) this counts
            # characters rather than encoded bytes - confirm against callers.
            self._size = len(self.handler.read())
            self.directory_label = os.path.dirname(self.filepath)
            # Leave the handler ready for the next reader.
            self.handler.seek(0)

        if self.file_name is None:
            self.file_name = os.path.basename(self.filepath)

        self.checksum = Checksum.from_algo(
            hash_fun=hash_fun,
            hash_algo=hash_algo,
        )

        return self

    def get_handler(self) -> IO:
        """
        Returns a readable handler for the file contents.

        Returns:
            IO: The handler supplied on the instance if there is one,
            otherwise a newly opened binary handle on ``filepath``. A newly
            opened handle is not stored on the instance, so the caller is
            responsible for closing it.
        """
        if self.handler is not None:
            return self.handler

        return open(self.filepath, "rb")

    @staticmethod
    def _validate_filepath(path):
        """
        Validates if the given filepath exists and is a file.

        Args:
            path (str): The filepath to be validated.

        Raises:
            FileNotFoundError: If the filepath does not exist.
            IsADirectoryError: If the filepath points to a directory instead of a file.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Filepath {path} does not exist.")
        elif not os.path.isfile(path):
            raise IsADirectoryError(f"Filepath {path} is not a file.")

    def apply_checksum(self):
        """
        Calculates and applies the checksum for the file.

        Must be called after extract_file_name() has initialized the checksum
        and all data has been fed into the checksum hash function.

        Raises:
            AssertionError: If checksum is not initialized or hash function is not set.
        """
        assert self.checksum is not None, "Checksum is not calculated."
        assert self.checksum._hash_fun is not None, "Checksum hash function is not set."

        self.checksum.apply_checksum()

    def update_checksum_chunked(self, blocksize=2**20):
        """Updates the checksum with data read from a file-like object in chunks.

        Args:
            blocksize (int, optional): Size of chunks to read. Defaults to 1MB (2**20)

        Raises:
            AssertionError: If the checksum or its hash function has not been initialized

        Note:
            A handler supplied on the instance is rewound to the start after
            reading. A handle opened from ``filepath`` is closed. Both happen
            even if reading or hashing raises.
        """
        assert self.checksum is not None, "Checksum is not initialized."
        assert self.checksum._hash_fun is not None, "Checksum hash function is not set."

        handler = self.get_handler()
        # Path-based handles are opened just-in-time by get_handler(), so this
        # method owns them and must close them.
        opened_here = self.handler is None

        try:
            while True:
                buf = handler.read(blocksize)

                if not buf:
                    break

                # Text handlers yield str; the hash function needs bytes.
                if not isinstance(buf, bytes):
                    buf = buf.encode()

                self.checksum._hash_fun.update(buf)
        finally:
            if opened_here:
                handler.close()
            else:
                # A supplied handler is reused later, so rewind it.
                handler.seek(0)

    def __del__(self):
        """Closes the file handler, if any, when the instance is collected."""
        # If model validation failed, the instance may never have received its
        # fields; a plain attribute access would then raise inside the
        # finalizer and only produce "Exception ignored" noise.
        handler = getattr(self, "handler", None)

        if handler is not None:
            handler.close()