-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtechnical-appraisal-logs.py
More file actions
294 lines (238 loc) · 11.7 KB
/
technical-appraisal-logs.py
File metadata and controls
294 lines (238 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
"""File Manifest and Deletion Log Generator
This script generates a CSV manifest of all the digital files received in an accession,
including the file name, relevant dates, and MD5 hash. It also identifies file paths
that may break other scripts and saves those paths to a separate log for review.
Using the "compare" argument compares the initial manifest to the files left in the
accession after technical appraisal and generates an additional CSV log of any files
that were deleted in the process. The script can be run multiple times with this
argument and any additional deletions will be added to the existing log.
This script requires an installation of 'pandas' in your Python environment.
Script usage: python /path/to/script /path/to/accession/directory [compare]
"""
import os
import csv
import sys
import hashlib
import pandas as pd
from datetime import datetime
# Accession directory to catalogue, supplied as the first CLI argument.
dir_to_log = sys.argv[1]
# Today's date in YYYYMMDD form, used to stamp generated CSV file names.
date = datetime.now().strftime("%Y%m%d")
# Column headers shared by the manifest, the review log, and the deletion log.
header = ['File', 'SizeKB', 'DateCreated', 'DateModified', 'MD5', 'Notes']
def scan_full_dir(dirpath):
    """Scans a directory tree and gets os.DirEntry objects for all its files and subdirectories

    Adapted from code by Ben Hoyt on Stackoverflow: https://stackoverflow.com/a/33135143

    Parameters
    -----------
    dirpath : str
        The file path of the directory to scan

    Yields
    -----------
    os.DirEntry
        Individual os.DirEntry objects for each file in the tree, as they are generated.
        Description of os.DirEntry attributes: https://docs.python.org/3/library/os.html#os.DirEntry
    """
    not_found = []
    for entry in os.scandir(dirpath):
        try:
            if entry.is_dir():
                yield from scan_full_dir(entry.path)
            else:
                yield entry
        except FileNotFoundError:
            # Typically a Windows path longer than MAX_PATH; queue it for a
            # retry with the extended-length path prefix below.
            print(f'FileNotFoundError: {entry}')
            not_found.append(entry)
    for missed in not_found:
        # The \\?\ prefix lifts the 260-character path limit on Windows.
        ext_path = f'\\\\?\\{missed.path}'
        if missed.is_dir():
            yield from scan_full_dir(ext_path)
        else:
            # BUG FIX: the original yielded the bare str `ext_path` here, but
            # every consumer expects an os.DirEntry (calls .stat()/.path), so a
            # retried file crashed with AttributeError. Re-scan the parent
            # directory under the extended prefix and yield the matching entry.
            parent, name = os.path.split(ext_path)
            with os.scandir(parent) as retry:
                for retry_entry in retry:
                    if retry_entry.name == name:
                        yield retry_entry
def find_init_manifest(dirpath):
    """Scans a directory and identifies a CSV file manifest created by this script

    Parameters
    -----------
    dirpath : str
        The file path of the directory to scan

    Returns
    -----------
    str or None
        The file path of a manifest whose name contains 'initialmanifest', or
        None when the directory holds no manifest. (The original raised
        NameError when nothing matched; callers test the result with
        ``if man:``, so None is the expected sentinel.)
    """
    with os.scandir(dirpath) as entries:
        for entry in entries:
            # Match on the entry's file name, not the repr of the DirEntry.
            if 'initialmanifest' in entry.name:
                return entry.path
    return None
def find_deletion_log(dirpath):
    """Scans a directory and identifies a CSV deletion log created by this script

    Parameters
    -----------
    dirpath : str
        The file path of the directory to scan

    Returns
    -----------
    str or None
        The file path of a log whose name contains 'deletionlog', or None
        when the directory holds no deletion log. (The original raised
        NameError when nothing matched; callers test the result with
        ``if del_log:``, so None is the expected sentinel.)
    """
    with os.scandir(dirpath) as entries:
        for entry in entries:
            # Match on the entry's file name, not the repr of the DirEntry.
            if 'deletionlog' in entry.name:
                return entry.path
    return None
def get_file_info(entry):
    """Aggregates relevant attributes from the os.DirEntry object for a file

    Description of the chained stat() method: https://docs.python.org/3/library/os.html#os.DirEntry.stat

    Parameters
    -----------
    entry : os.DirEntry object
        The object yielded by the scandir() iterator for the file

    Returns
    -----------
    list
        [path, size in KB, date created, date modified] in str/float format,
        ordered to match the CSV ``header`` columns
        ('File', 'SizeKB', 'DateCreated', 'DateModified').
    """
    stat_result = entry.stat()
    path = entry.path
    # Decimal kilobytes, rounded to one decimal place.
    sizeKB = round(stat_result.st_size / 1000, 1)
    # fromtimestamp() can raise OSError for out-of-range timestamps (e.g.
    # corrupt filesystem metadata); fall back to a placeholder string.
    # NOTE(review): st_ctime is creation time on Windows but metadata-change
    # time on POSIX — this script appears Windows-oriented (backslash paths).
    try:
        date_created = datetime.fromtimestamp(stat_result.st_ctime).strftime('%Y-%m-%d')
    except OSError:
        date_created = 'date_created_not_calculated'
    try:
        date_modified = datetime.fromtimestamp(stat_result.st_mtime).strftime('%Y-%m-%d')
    except OSError:
        date_modified = 'date_modified_not_calculated'
    # BUG FIX: the original appended date_modified before date_created,
    # swapping the DateCreated/DateModified columns in every generated CSV.
    return [path, sizeKB, date_created, date_modified]
if __name__ == "__main__":
log_docs = ['deletionlog_', 'initialmanifest_', 'filestoreview_']
# Check for a "compare" argument provided by the user
try:
if sys.argv[2].lower() == "compare":
start_compare = sys.argv[2]
else:
start_compare = None
print(f'\nERROR: "{sys.argv[2]}" is an unrecognized argument\n\nScript usage: python /path/to/script '
f'/path/to/accession/directory [compare]')
quit()
except IndexError:
start_compare = None
# If no "compare" argument, check for an existing manifest
if start_compare is None:
man = find_init_manifest(dir_to_log)
# Alert the user if one is found with the same date and give them the option to
# cancel the process and prevent overwriting
if man:
if date in man:
todaysfile = man.rsplit('\\', 1)[-1]
check = input(f'\nA file called "{todaysfile}" already exists in this location. Do you wish to overwrite'
f' it? Type Y or N: ')
if check.lower() in ['n', 'no']:
print(f'\nProcess cancelled. \n\nTo create a log of deleted files, run this script again with the '
f'"compare" parameter: python /path/to/script /path/to/accession/directory compare')
quit()
if check.lower() in ['y', 'yes']:
print('\nFile manifest will be saved to the accession folder. Working...')
# Create the manifest CSV and a log of files to review
with open(f'{dir_to_log}\\initialmanifest_{date}.csv', "w", encoding="utf-8", newline='') as manifest, \
open(f'{dir_to_log}\\filestoreview_{date}.csv', "w", encoding="utf-8", newline='') as review_log:
wr_initman = csv.writer(manifest)
wr_revlog = csv.writer(review_log)
wr_initman.writerow(header)
wr_revlog.writerow(header)
# Scan through the full directory tree and write the relevant file information to the manifest
for entry in scan_full_dir(dir_to_log):
try:
data = get_file_info(entry)
# Skip over log documents
if any(x in data[0] for x in log_docs):
continue
else:
# Generate MD5 checksum for each file in initial manifest
path = data[0]
with open(path, 'rb') as f:
file_data = f.read()
md5 = hashlib.md5(file_data).hexdigest()
md5_generated = md5.upper()
data.append(md5_generated)
wr_initman.writerow(data)
# Look at the file path and check for any problematic characters
filepath = data[0]
path = str(filepath)
probchars = ['&', '$', '*', '?']
smartquotes = ['“', '”', '’']
# If the path contains any of these substrings, write the relevant file info to the review log and
# include the reason
if any (c in path for c in probchars):
data.append("Path contains special characters")
wr_revlog.writerow(data)
if any (q in path for q in smartquotes):
data.append("Path contains smart quotes or apostrophes")
wr_revlog.writerow(data)
# Check the path length to see if it exceeds the Windows max path length
path.replace('\\', '\\\\')
if len(path) > 260:
data.append("Path exceeds 260 characters")
wr_revlog.writerow(data)
except FileNotFoundError:
data = [f"Path not found: {entry}", None, None, None, "FileNotFoundError"]
wr_initman.writerow(data)
continue
# If there's a "compare" argument, scan the directory and put current file information into a new dataframe
if start_compare == "compare":
new_df = pd.DataFrame(columns=header[:-2])
for entry in scan_full_dir(dir_to_log):
data = get_file_info(entry)
# Skip over log docs
if any(x in data[0] for x in log_docs):
continue
else:
new_df.loc[len(new_df)] = data
print('\nDeletion log will be saved to the accession folder. Working...')
# Find the initial file manifest and read it to a pandas dataframe
man_df = pd.DataFrame(columns=header)
man = find_init_manifest(dir_to_log)
df = pd.read_csv(man)
man_df = pd.concat([man_df, df], axis=0)
# Concatenate the two dataframes and exclude logs
deleted = pd.concat([man_df, new_df], ignore_index=True)
deleted = deleted[~deleted['File'].str.contains('|'.join(log_docs))]
# Compare the file name and parent folder from the file paths in each dataframe to identify
# and drop any duplicates
deleted['FName'] = deleted['File'].astype(str).str.split('\\', n=-2).str[-1].str.strip()
deleted = deleted.drop_duplicates('FName', keep=False)
# Add a "Date Deleted" column with today's date
deleted.drop(['DateModified'], axis=1, inplace=True)
deleted.insert(3, 'DateDeleted', datetime.now().strftime("%Y-%m-%d"))
deleted.drop(['FName'], axis=1, inplace=True)
# Check to see if a deletion log already exists
del_log = find_deletion_log(dir_to_log)
if del_log:
logfile = del_log.rsplit('\\', 1)[-1]
print(f'\nA file called "{logfile}" already exists in this location. If any additional deletions are found,'
f' they will be added to this file.')
del_df = pd.read_csv(del_log)
# Concatenate the two dataframes and exclude logs
new_deletions = pd.concat([deleted,del_df], ignore_index=True)
new_deletions = new_deletions[~new_deletions['File'].str.contains('|'.join(log_docs))]
# Compare the file name and parent folder from the file paths in each dataframe
# to identify and drop any duplicates
new_deletions['FName'] = new_deletions['File'].astype(str).str.split('\\', n=-2).str[-1].str.strip()
new_deletions = new_deletions.drop_duplicates('FName', keep=False)
new_deletions['DateDeleted'] = datetime.now().strftime("%Y-%m-%d")
new_deletions.drop(['FName'], axis=1, inplace=True)
# Append new deletion information to the existing CSV
new_deletions.to_csv(del_log, mode='a', header=False, index=False)
# Update CSV file name with today's date
split_fn = del_log.rsplit('_', 1)[0]
updated_fn = f'{split_fn}_{date}.csv'
os.rename(del_log, updated_fn)
# If no existing deletion log, write the dataframe of deleted files to a new CSV
else:
deleted.to_csv(f'{dir_to_log}\\deletionlog_{date}.csv', encoding="utf-8", index=False)
print(f'\nScript is finished running.')