-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathschema_validators.py
More file actions
1023 lines (903 loc) · 48.1 KB
/
schema_validators.py
File metadata and controls
1023 lines (903 loc) · 48.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import re
import yaml
import logging
import requests
from datetime import datetime
# Local modules
from schema import schema_manager
from schema import schema_errors
from schema import schema_neo4j_queries
from schema.schema_constants import SchemaConstants
from hubmap_commons import hm_auth
logger = logging.getLogger(__name__)
####################################################################################################
## Entity Level Validators
####################################################################################################
"""
Validate the application specified in the custom HTTP header
for creating a new entity via POST.
Parameters
----------
options_dict : dict
A dictionary of data needed by this entity-level validator based upon the create/POST or
update/PUT actions. The dictionary will always have 'http_request' and will have
'existing_entity_dict' for a PUT request.
"""
def validate_application_header_before_entity_create(options_dict):
if 'http_request' in options_dict:
request = options_dict['http_request']
else:
logger.error(f"validate_application_header_before_entity_create() expected 'http_request' in"
f" options_dict, but it was missing in {str(options_dict)}.")
raise KeyError("Entity validator internal misconfiguration.")
# A list of applications allowed to create this new entity or update Dataset and Upload
# Use lowercase for comparison
applications_allowed = [
SchemaConstants.INGEST_API_APP,
SchemaConstants.INGEST_PIPELINE_APP,
SchemaConstants.INGEST_UI,
SchemaConstants.ENTITY_API_APP
]
_validate_application_header(applications_allowed, request.headers)
"""
Validate required conditions prior to allowing update of an existing entity via PUT.
Parameters
----------
options_dict : dict
A dictionary of data needed by this entity-level validator based upon the create/POST or
update/PUT actions. The dictionary will always have 'http_request' and will have
'existing_entity_dict' for a PUT request.
"""
def validate_entity_not_locked_before_update(options_dict):
if 'existing_entity_dict' in options_dict:
existing_entity_dict = options_dict['existing_entity_dict']
else:
logger.error(f"validate_entity_not_locked_before_update() expected 'existing_entity_dict' in"
f" options_dict, but it was missing in {str(options_dict)}.")
raise KeyError("Entity validator internal misconfiguration.")
_is_entity_locked_against_update(existing_entity_dict)
####################################################################################################
## Property Level Validators
####################################################################################################
"""
Validate the specified value for a Dataset's dataset_type is in the valueset UBKG recognizes.
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_recognized_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# If the proposed Dataset dataset_type ends with something in square brackets, anything inside
# those square brackets are acceptable at the end of the string. Simply validate the start.
proposed_dataset_type_prefix = re.sub(pattern='(\S)\s\[.*\]$', repl=r'\1', string=new_data_dict['dataset_type'])
target_list = schema_manager.get_dataset_type_valueset_list()
# TODO This is a temporary bypass because the UBKG does not support publication as a dataset_type yet. Remove once its added
target_list.append("Publication")
if proposed_dataset_type_prefix not in target_list:
raise ValueError(f"Proposed Dataset dataset_type '{proposed_dataset_type_prefix}'"
f" is not recognized in the existing ontology."
f" Valid values are: {str(target_list)}.")
"""
Validate the specified value for an Upload's intended_dataset_type is in the valueset UBKG recognizes.
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_intended_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# If the proposed Upload intended_dataset_type ends with something in square brackets, anything inside
# those square brackets are acceptable at the end of the string. Simply validate the start.
proposed_dataset_type_prefix = re.sub(pattern='(\S)\s\[.*\]$', repl=r'\1', string=new_data_dict['intended_dataset_type'])
target_list = schema_manager.get_dataset_type_valueset_list()
if proposed_dataset_type_prefix not in target_list:
raise ValueError(f"Proposed Upload intended_dataset_type '{proposed_dataset_type_prefix}'"
f" is not recognized in the existing ontology."
f" Valid values are: {str(target_list)}.")
"""
Validate the target list has no duplicated items
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_no_duplicates_in_list(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# Use lowercase for comparison via list comprehensions
target_list = [v.lower() for v in new_data_dict[property_key]]
if len(set(target_list)) != len(target_list):
raise ValueError(f"The {property_key} field must only contain unique items")
"""
Validate all the provided uuids exist and all are Datasets when updating the target Upload
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_ids_exist_and_datasets(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
all_uuids_list = new_data_dict[property_key]
qualified_uuids_list = schema_neo4j_queries.get_found_dataset_uuids(neo4j_driver_instance, all_uuids_list)
unqualified_uuids_list = [item for item in all_uuids_list if item not in qualified_uuids_list]
if unqualified_uuids_list:
raise ValueError(f"The following {len(unqualified_uuids_list)} uuids are either not found or not Dataset type: {str(unqualified_uuids_list)}.")
"""
Validate that a given dataset is not a component of a multi-assay split parent dataset fore allowing status to be
updated. If a component dataset needs to be updated, update it via its parent multi-assay dataset
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_dataset_not_component(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
headers = request.headers
if not headers.get(SchemaConstants.INTERNAL_TRIGGER) == SchemaConstants.COMPONENT_DATASET:
neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
uuid = existing_data_dict['uuid']
creation_action = schema_neo4j_queries.get_entity_creation_action_activity(neo4j_driver_instance, uuid)
if creation_action == 'Multi-Assay Split':
raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
f" {existing_data_dict['uuid']}. Can not change status on component datasets directly. Status"
f"change must occur on parent multi-assay split dataset")
"""
If the provided previous revision is already a revision of another dataset, disallow
"""
def validate_if_revision_is_unique(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
previous_revision = new_data_dict['previous_revision_uuid']
neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
next_revision = schema_neo4j_queries.get_next_revision_uuid(neo4j_driver_instance, previous_revision)
if next_revision:
raise ValueError(f"Dataset marked as previous revision is already the previous revision of another dataset. "
f"Each dataset may only be the previous revision of one other dataset")
"""
If an entity has a DOI, do not allow it to be updated
"""
def halt_update_if_DOI_exists(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'doi_url' in existing_data_dict or 'registered_doi' in existing_data_dict:
raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
f" {existing_data_dict['uuid']} due to DOI already exists.")
"""
Do not allow a Collection to be created or updated with DOI information if it does not meet all the
criteria of being a public entity.
"""
def halt_DOI_if_collection_missing_elements(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'contacts' not in existing_data_dict:
raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
f" {existing_data_dict['uuid']} for DOI because it has no contacts.")
if 'contributors' not in existing_data_dict:
raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
f" {existing_data_dict['uuid']} for DOI because it has no contributors.")
# Count up other validations to check 'datasets', since a transient property
"""
Do not allow a Collection to be created or updated with DOI information if any Dataset in the Collection is not public.
"""
def halt_DOI_if_unpublished_dataset(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# If the request is not trying to create/update DOI, simply return so the request can proceed.
if 'doi_url' not in new_data_dict or 'registered_doi' not in new_data_dict:
return
neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
distinct_dataset_levels = []
if 'dataset_uuids' in new_data_dict:
# For a Create POST request, or for an Update PUT request with 'dataset_uuids' specified,
# retrieve all the existing Datasets specified with the request.
dataset_uuids = existing_data_dict['dataset_uuids']
collection_datasets = []
for dataset_uuid in dataset_uuids:
try:
ds = schema_neo4j_queries.get_entity(neo4j_driver_instance
,dataset_uuid)
if ds['data_access_level'] not in distinct_dataset_levels:
distinct_dataset_levels.append(ds['data_access_level'])
except Exception as nfe:
raise ValueError(f"Unable to modify existing {new_data_dict['entity_type']}"
f" {new_data_dict['uuid']} since"
f" Dataset {dataset_uuid} could not be found to verify.")
else:
# For an Update PUT request without 'dataset_uuids' specified,
# simply get the existing, distinct 'data_access_level' setting for all the Datasets in the Collection
distinct_dataset_statuses = schema_neo4j_queries.get_collection_datasets_statuses(neo4j_driver_instance
,existing_data_dict['uuid'])
if len( distinct_dataset_statuses) != 1 or \
distinct_dataset_statuses[0].lower() != SchemaConstants.DATASET_STATUS_PUBLISHED:
raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
f" {existing_data_dict['uuid']} for DOI since it contains unpublished Datasets.")
"""
Validate the DOI parameters are presented as a pair during creation or modification.
Even if one is populated already, disallow setting the other, so the data is consciously synced.
Verify the values are compatible with each other.
"""
def verify_DOI_pair(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# Disallow providing one DOI parameter but not the other
if ('doi_url' in new_data_dict and 'registered_doi' not in new_data_dict) or \
('doi_url' not in new_data_dict and 'registered_doi' in new_data_dict):
raise ValueError( f"The properties 'doi_url' and 'registered_doi' must both be set in the same operation.")
# Since both DOI parameters are present, make sure neither is the empty string
if new_data_dict['doi_url'] == '' or new_data_dict['registered_doi'] == '':
raise ValueError( f"The properties 'doi_url' and 'registered_doi' cannot be empty, when specified.")
# Check if doi_url matches registered_doi with the expected prefix
try:
expected_doi_url = SchemaConstants.DOI_BASE_URL + new_data_dict['registered_doi']
except Exception as e:
# If SchemaConstants.DOI_BASE_URL is not set, or there is some other
# problem, give up and fail this validation.
logger.error(f"During verify_DOI_pair schema validator, unexpected exception e={str(e)}")
raise ValueError(f"An unexpected error occurred during evaluation of DOI parameters. See logs.")
if expected_doi_url and new_data_dict['doi_url'] != expected_doi_url:
raise ValueError( f"The 'doi_url' property should match the 'registered_doi' property, after"
f" the prefix {SchemaConstants.DOI_BASE_URL}.")
"""
Validate every entity in a list is of entity_type accepted
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def collection_entities_are_existing_datasets(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# `dataset_uuids` is required for creating a Collection
# Verify each UUID specified exists in the uuid-api, exists in Neo4j, and is for a Dataset before
# proceeding with creation of Collection.
bad_dataset_uuids = []
for dataset_uuid in new_data_dict['dataset_uuids']:
try:
## The following code duplicates some functionality existing in app.py, in
## query_target_entity(), which also deals with caching. In the future, the
## validation logic shared by this file and app.py should become a utility
## module, shared by validators as well as app.py. But for now, the code
## is repeated for the following.
# Get cached ids if exist otherwise retrieve from UUID-API. Expect an
# Exception to be raised if not found.
dataset_uuid_entity = schema_manager.get_hubmap_ids(id=dataset_uuid)
# If the uuid exists per the uuid-api, make sure it also exists as a Neo4j entity.
uuid = dataset_uuid_entity['uuid']
entity_dict = schema_neo4j_queries.get_entity(schema_manager.get_neo4j_driver_instance(), dataset_uuid)
# If dataset_uuid is not found in Neo4j or is not for a Dataset, fail the validation.
if not entity_dict:
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
f" but not found in Neo4j.")
bad_dataset_uuids.append(dataset_uuid)
elif entity_dict['entity_type'] != 'Dataset':
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
f" but entity_type={entity_dict['entity_type']}, not Dataset.")
bad_dataset_uuids.append(dataset_uuid)
except Exception as nfe:
# If the dataset_uuid is not found, fail the validation.
logger.info(f"Request for {dataset_uuid} inclusion in Collection"
f" failed uuid-api retrieval.")
bad_dataset_uuids.append(dataset_uuid)
# If any uuids in the request dataset_uuids are not for an existing Dataset entity which
# exists in uuid-api and Neo4j, raise an Exception so the validation fails and the
# operation can be rejected.
if bad_dataset_uuids:
raise ValueError(f"Unable to find Datasets for {bad_dataset_uuids}.")
"""
Validate the provided value of Dataset.status on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Dataset
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_application_header_before_property_update(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# A list of applications allowed to update Dataset.status or Upload.status
# Use lowercase for comparison
applications_allowed = [
SchemaConstants.INGEST_API_APP,
SchemaConstants.INGEST_PIPELINE_APP,
SchemaConstants.INGEST_UI,
SchemaConstants.ENTITY_API_APP
]
_validate_application_header(applications_allowed, request.headers)
"""
Validate the provided value of Dataset.status on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Dataset
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_dataset_status_value(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# Use lowercase for comparison
accepted_status_values = [
'new', 'processing', 'published', 'qa', 'error', 'hold', 'invalid', 'submitted', 'incomplete'
]
new_status = new_data_dict[property_key].lower()
if new_status not in accepted_status_values:
raise ValueError("The provided status value of Dataset is not valid")
if 'status' not in existing_data_dict:
raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'validate_dataset_status_value()' validator method.")
# If status == 'Published' already in Neo4j, then fail for any changes at all
# Because once published, the dataset should be read-only
if existing_data_dict['status'].lower() == SchemaConstants.DATASET_STATUS_PUBLISHED:
raise ValueError(f"The status of this {normalized_entity_type} is already 'Published', status change is not allowed")
# HTTP header names are case-insensitive
# request.headers.get('X-Hubmap-Application') returns None if the header doesn't exist
app_header = request.headers.get(SchemaConstants.HUBMAP_APP_HEADER)
# Change status to 'Published' can only happen via ingest-api
# because file system changes are needed
if (new_status == SchemaConstants.DATASET_STATUS_PUBLISHED) and (app_header.lower() != SchemaConstants.INGEST_API_APP):
raise ValueError(f"Dataset status change to 'Published' can only be made via {SchemaConstants.INGEST_API_APP}")
"""
Validate that status, if included in new_data_dict, is different from the existing status value
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_status_changed(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'status' not in existing_data_dict:
raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'validate_status_changed()' validator method.")
# Only allow 'status' in new_data_dict if its different than the existing status value
if existing_data_dict['status'].lower() == new_data_dict['status'].lower():
raise ValueError(f"Status value is already {existing_data_dict['status']}, cannot change to {existing_data_dict['status']}. If no change, do not include status field in update")
"""
Validate the sub_status field is also provided when Dataset.retraction_reason is provided on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_if_retraction_permitted(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'status' not in existing_data_dict:
raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'validate_if_retraction_permitted()' validator method.")
# Only published dataset can be retracted
if existing_data_dict['status'].lower() != SchemaConstants.DATASET_STATUS_PUBLISHED:
raise ValueError("This dataset is not published, retraction is not allowed")
# Only token in HuBMAP-Data-Admin group can retract a published dataset
try:
# The property 'hmgroupids' is ALWAYS in the output with using schema_manager.get_user_info()
# when the token in request is a nexus_token
user_info = schema_manager.get_user_info(request)
hubmap_admin_group_uuid = schema_manager.get_auth_helper_instance().groupNameToId('HuBMAP-Data-Admin')['uuid']
except Exception as e:
# Log the full stack trace, prepend a line with our message
logger.exception(e)
# If the token is not a nexus token, no group information available
# The commons.hm_auth.AuthCache would return a Response with 500 error message
# We treat such cases as the user not in the HuBMAP-READ group
raise ValueError("Failed to parse the permission based on token, retraction is not allowed")
if hubmap_admin_group_uuid not in user_info['hmgroupids']:
raise ValueError("Permission denied, retraction is not allowed")
"""
Validate the sub_status field is also provided when Dataset.retraction_reason is provided on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_sub_status_provided(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'sub_status' not in new_data_dict:
raise ValueError("Missing sub_status field when retraction_reason is provided")
"""
Validate the reaction_reason field is also provided when Dataset.sub_status is provided on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_retraction_reason_provided(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'retraction_reason' not in new_data_dict:
raise ValueError("Missing retraction_reason field when sub_status is provided")
"""
Validate the provided value of Dataset.sub_status on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_retracted_dataset_sub_status_value(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# Use lowercase for comparison
accepted_sub_status_values = ['retracted']
sub_status = new_data_dict[property_key].lower()
if sub_status not in accepted_sub_status_values:
raise ValueError("Invalid sub_status value of the Dataset to be retracted")
"""
Validate the provided value of Upload.status on update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_upload_status_value(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# Use lowercase for comparison
accepted_status_values = [
'new', 'valid', 'invalid', 'error', 'reorganized', 'processing', 'submitted', 'incomplete'
]
new_status = new_data_dict[property_key].lower()
if new_status not in accepted_status_values:
raise ValueError(f"Invalid status value: {new_status}")
"""
Validate the anticipated_complete_data string provided for an Upload
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_anticipated_complete_date(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
MAX_ANTICIPATED_COMPLETE_DATE = '2026-12'
anticipated_complete_date_str = new_data_dict[property_key]
if not re.fullmatch(pattern=r'^\d{4}-\d{2}$', string=anticipated_complete_date_str):
raise ValueError(f"Format of '{anticipated_complete_date_str}' does not match the format YYYY-MM")
anticipated_year, anticipated_month = map(int, anticipated_complete_date_str.split("-"))
if anticipated_month < 1 or anticipated_month > 12:
raise ValueError(f"Anticipated completion month of '{anticipated_complete_date_str[5:]}' is not valid")
now = datetime.now()
current_year = now.year
current_month = now.month
if anticipated_year < current_year or \
(anticipated_year == current_year and anticipated_month < current_month):
raise ValueError( f"Anticipated complete date '{anticipated_complete_date_str}'"
f" cannot be before the current month.")
max_anticipated_year, max_anticipated_month = map(int, MAX_ANTICIPATED_COMPLETE_DATE.split("-"))
if anticipated_year > max_anticipated_year:
raise ValueError( f"Anticipated complete date '{anticipated_complete_date_str}'"
f" cannot be after '{MAX_ANTICIPATED_COMPLETE_DATE}'.")
"""
Validate the anticipated_dataset_count integer provided for an Upload
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_anticipated_dataset_count(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
# anticipated_dataset_count of type int, assured by provenance_schema.yaml "type: integer"
anticipated_dataset_count = new_data_dict[property_key]
if anticipated_dataset_count <= 0:
raise ValueError(f"{property_key} must be positive integer when specified.")
"""
Validate the provided value of Sample.sample_category on create via POST and update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_sample_category(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
defined_tissue_types = ["organ", "block", "section", "suspension"]
sample_category = new_data_dict[property_key].lower()
if sample_category not in defined_tissue_types:
raise ValueError(f"Invalid sample_category: {sample_category}."
f" Should be one of {', '.join(defined_tissue_types)}.")
# Given the sample_category is a defined_tissue_types element, assure the request has
# the proper case for storage
if new_data_dict[property_key] != sample_category:
raise ValueError(f"The case of sample_category '{new_data_dict[property_key]}'"
f" must be specified as '{sample_category}'.")
"""
Validate the provided value of Dataset.direct_ancestor on create via POST and update via PUT
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_ancestor_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
allowed_ancestor_types = ["Dataset", "Sample"]
for allowed_ancestor in list(allowed_ancestor_types):
subclasses = schema_manager.get_entity_subclasses(schema_manager.normalize_entity_type(allowed_ancestor))
allowed_ancestor_types.extend(subclasses)
direct_ancestor_uuids = new_data_dict[property_key]
disallowed_properties = [{"property": "sample_category", "value": "organ"}]
invalid_uuids = schema_neo4j_queries.validate_direct_ancestors(schema_manager.get_neo4j_driver_instance(), direct_ancestor_uuids, allowed_ancestor_types, disallowed_properties)
if invalid_uuids:
raise ValueError(f"Invalid or not-found direct_ancestor_uuid(s). Allowed entity_types are: {', '.join(allowed_ancestor_types)}. For samples, 'organ' is not allowed. Invalid uuids: {', '.join(invalid_uuids)}")
"""
Validate the provided value of Publication.publication_date is in the correct format against ISO 8601 Format:
'2022-10-31T09:00:00Z' for example, but we only care the date part 'YYYY-MM-DD'
on create via POST and update via PUT
Note: we allow users to use a future date value
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_publication_date(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
try:
# The user provided date string is valid if we can convert it to a datetime object
# base on the ISO 8601 format, 'YYYY-MM-DD', it's fine if the user entered the time part
date_obj = datetime.fromisoformat(new_data_dict[property_key])
except ValueError:
raise ValueError(f"Invalid {property_key} format, must be YYYY-MM-DD")
"""
Validate that the id for the given entity is not included in the direct ancestor uuid's to prevent loops.
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_id_not_in_direct_ancestor(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
if 'uuid' not in existing_data_dict:
raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'validate_id_not_in_direct_ancestor()' validator method.")
entity_uuid = existing_data_dict.get("uuid")
ancestors = new_data_dict.get(property_key)
if entity_uuid in ancestors:
raise ValueError(f"Entity uuid may not be included in {property_key}.")
"""
Validate the provided value of the activity creation action. Only very specific
values are allowed.
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_creation_action(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
accepted_creation_action_values = ["central process", "lab process", "external process"]
creation_action = new_data_dict[property_key].lower()
if creation_action and creation_action not in accepted_creation_action_values:
raise ValueError("Invalid {} value. Accepted values are: {}".format(property_key, ", ".join(accepted_creation_action_values)))
if creation_action == '':
raise ValueError(f"The property {property_key} cannot be empty, when specified.")
if creation_action == 'external process':
direct_ancestor_uuids = new_data_dict.get('direct_ancestor_uuids')
entity_types_dict = schema_neo4j_queries.filter_ancestors_by_type(schema_manager.get_neo4j_driver_instance(), direct_ancestor_uuids, "dataset")
if entity_types_dict:
raise ValueError(f"If 'creation_action' field is given, all ancestor uuids must belong to datasets. The following entities belong to non-dataset entities \
{entity_types_dict}")
"""
Validate the provided value of the activity creation action before updating direct ancestors. Certain values prohibited
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_not_invalid_creation_action(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
prohibited_creation_action_values = ["Central Process", "Multi-Assay Split"]
entity_uuid = existing_data_dict.get('uuid')
creation_action = schema_neo4j_queries.get_entity_creation_action_activity(schema_manager.get_neo4j_driver_instance(), entity_uuid)
if creation_action and creation_action in prohibited_creation_action_values:
raise ValueError("Cannot update {} value if creation_action of parent activity is {}".format(property_key, ", ".join(prohibited_creation_action_values)))
"""
Validate that the user is in Hubmap-Data-Admin before creating or updating field 'assigned_to_group_name'
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_in_admin_group(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
try:
# The property 'hmgroupids' is ALWAYS in the output with using schema_manager.get_user_info()
# when the token in request is a nexus_token
user_info = schema_manager.get_user_info(request)
hubmap_admin_group_uuid = schema_manager.get_auth_helper_instance().groupNameToId('HuBMAP-Data-Admin')['uuid']
except Exception as e:
# Log the full stack trace, prepend a line with our message
logger.exception(e)
# If the token is not a groups token, no group information available
# The commons.hm_auth.AuthCache would return a Response with 500 error message
# We treat such cases as the user not in the HuBMAP-Data group
raise ValueError("Failed to parse the permission based on token, retraction is not allowed")
if hubmap_admin_group_uuid not in user_info['hmgroupids']:
raise ValueError(f"Permission denied, not permitted to set property {property_key}")
"""
Validate that the provided group_name is one of the group name 'shortname' values where data_provider == true available
from hubmap-commons in the xxx-globus-groups.json file on entity creation
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_group_name(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
assigned_to_group_name = new_data_dict['assigned_to_group_name']
# If method is PUT, an empty string is allowed, thus validation is skipped. But if a value is given, it must still be validated.
if not (request.method == "PUT" and (not assigned_to_group_name or not str(assigned_to_group_name).strip())):
globus_groups = schema_manager.get_auth_helper_instance().getHuBMAPGroupInfo()
group_dict = next((entry for entry in globus_groups.values() if entry.get("displayname") == assigned_to_group_name), None)
if group_dict is None:
raise ValueError("Invalid value for 'assigned_to_group_name'")
is_data_provider = group_dict.get('data_provider')
if not is_data_provider:
raise ValueError("Invalid group in 'assigned_to_group_name'. Must be a data provider")
"""
Trigger event method to verify tracking fields new_associated_multi_assay_uuid and
superseded_associated_processed_component_uuids are set coherently on Multi-Assay Datasets and
their component Datasets.
Parameters
----------
property_key : str
The target property key
normalized_type : str
One of the types defined in the schema yaml: Dataset
user_token: str
The user's globus nexus token
existing_data_dict : dict
A dictionary that contains all existing entity properties as Neo4j data types.
N.B. elements are not Python data types and must be converted with utilities like schema_manager.convert_str_literal()
new_data_dict : dict
The request input data as Python data structures converted from JSON, which has passed schema validation, entity
validation, and possibly other property validations.
"""
def verify_multi_assay_dataset_components(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
if 'superseded_associated_processed_component_uuids' in existing_data_dict \
and 'superseded_associated_processed_component_uuids' in new_data_dict:
raise ValueError( f"'superseded_associated_processed_component_uuids' is already set on"
f" {existing_data_dict['uuid']}.")
if 'new_associated_multi_assay_uuid' in existing_data_dict \
and 'new_associated_multi_assay_uuid' in new_data_dict:
raise ValueError( f"'new_associated_multi_assay_uuid' is already set on"
f" {existing_data_dict['uuid']}.")
if 'superseded_associated_processed_component_uuids' in new_data_dict \
and 'new_associated_multi_assay_uuid' in new_data_dict:
raise ValueError( f"'superseded_associated_processed_component_uuids' and 'new_associated_multi_assay_uuid'"
f" cannot both be specified on a single Dataset.")
if 'superseded_associated_processed_component_uuids' in new_data_dict \
and 'new_associated_multi_assay_uuid' in existing_data_dict:
raise ValueError( f"'superseded_associated_processed_component_uuids' cannot be set on"
f" existing Dataset {existing_data_dict['uuid']} because it is a component Dataset of"
f" {existing_data_dict['new_associated_multi_assay_uuid']}.")
if 'new_associated_multi_assay_uuid' in new_data_dict \
and 'superseded_associated_processed_component_uuids' in existing_data_dict:
# Convert the string from Neo4j the Python list
supersededComponentDatasets = schema_manager.convert_str_literal(existing_data_dict['superseded_associated_processed_component_uuids'])
raise ValueError( f"'new_associated_multi_assay_uuid' cannot be set on"
f" existing Dataset {existing_data_dict['uuid']} because it is a Multi-Assay Dataset"
f" with {len(supersededComponentDatasets)}"
f" component Datasets it supersedes.")
# If no contradictions above have caused a ValueError, check if new data contains UUIDs for valid entities.
if 'new_associated_multi_assay_uuid' in new_data_dict:
proposedMultiAssayDataset = schema_neo4j_queries.get_entity(schema_manager.get_neo4j_driver_instance()
, new_data_dict['new_associated_multi_assay_uuid'])
if len(proposedMultiAssayDataset) < 1:
raise ValueError( f"'new_associated_multi_assay_uuid' value"
f" {new_data_dict['new_associated_multi_assay_uuid']}"
f" does not exist.")
if 'superseded_associated_processed_component_uuids' in new_data_dict:
for uuid in new_data_dict['superseded_associated_processed_component_uuids']:
proposedComponentDataset = schema_neo4j_queries.get_entity( schema_manager.get_neo4j_driver_instance()
, uuid)
if len(proposedComponentDataset) < 1:
raise ValueError(f"'superseded_associated_processed_component_uuids' entry with value"
f" {uuid} does not exist.")
# fall out successfully if no raise() occurred.
return
"""
Validate the specified value for an Upload's priority_project_list is in a recognized value
Parameters
----------
property_key : str
The target property key
normalized_type : str
Submission
request: Flask request object
The instance of Flask request passed in from application request
existing_data_dict : dict
A dictionary that contains all existing entity properties
new_data_dict : dict
The json data in request body, already after the regular validations
"""
def validate_priority_project(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
allowed_priority_projects = SchemaConstants.ALLOWED_PRIORITY_PROJECTS
for priority_project in new_data_dict.get('priority_project_list'):
if priority_project not in allowed_priority_projects:
raise ValueError(f"Provided priority_project_list contains unrecognized value: {priority_project}. Allowed values are {', '.join(allowed_priority_projects)}. These are case-sensitive values.")
####################################################################################################
## Internal Functions
####################################################################################################
"""
Validate the application specified in the custom HTTP header
Parameters
----------
applications_allowed : list
A list of applications allowed, use lowercase for comparison
request_headers: Flask request.headers object, behaves like a dict
The instance of Flask request.headers passed in from application request
"""
def _validate_application_header(applications_allowed, request_headers):
# HTTP header names are case-insensitive
# request_headers.get('X-Hubmap-Application') returns None if the header doesn't exist
app_header = request_headers.get(SchemaConstants.HUBMAP_APP_HEADER)
if not app_header:
msg = f"Unable to proceed due to missing {SchemaConstants.HUBMAP_APP_HEADER} header from request"
raise schema_errors.MissingApplicationHeaderException(msg)
# Use lowercase for comparing the application header value against the yaml
if app_header.lower() not in applications_allowed:
msg = f"Unable to proceed due to invalid {SchemaConstants.HUBMAP_APP_HEADER} header value: {app_header}"
raise schema_errors.InvalidApplicationHeaderException(msg)
"""
Indicate if the entity meets a criteria to lock out modification updates