@@ -55,20 +55,6 @@ def init_spark_and_db(app_name: str, database: str) -> SparkSession:
5555 return spark
5656
5757
58- # ---------------------------------------------------------------------
59- # CDM TABLE SCHEMAS
60- # ---------------------------------------------------------------------
61- # Using centralized schemas
62- IDENTIFIER_SCHEMA = cdm_schemas ["Identifier" ]
63- NAME_SCHEMA = cdm_schemas ["Name" ]
64- FEATURE_SCHEMA = cdm_schemas ["Feature" ]
65- CONTIG_COLLECTION_X_FEATURE_SCHEMA = cdm_schemas ["ContigCollection_x_Feature" ]
66- CONTIG_COLLECTION_X_PROTEIN_SCHEMA = cdm_schemas ["ContigCollection_x_Protein" ]
67- FEATURE_X_PROTEIN_SCHEMA = cdm_schemas ["Feature_x_Protein" ]
68- CONTIG_SCHEMA = cdm_schemas ["Contig" ]
69- CONTIG_X_CONTIG_COLLECTION_SCHEMA = cdm_schemas ["Contig_x_ContigCollection" ]
70-
71-
7258# ---------------------------------------------------------------------
7359# CDM PREFIX NORMALIZATION
7460# ---------------------------------------------------------------------
@@ -160,19 +146,21 @@ def load_feature_records(data: dict) -> list[tuple]:
160146 "minus" : "negative" ,
161147 "unstranded" : "unstranded" ,
162148 }.get (r .get ("orientation" ), "unknown" )
163- features .append ((
164- feature_id ,
165- None ,
166- None ,
167- None ,
168- to_int (r .get ("end" )),
169- None ,
170- to_int (r .get ("begin" )),
171- strand ,
172- "RefSeq" ,
173- None ,
174- "gene" ,
175- ))
149+ features .append (
150+ (
151+ feature_id ,
152+ None ,
153+ None ,
154+ None ,
155+ to_int (r .get ("end" )),
156+ None ,
157+ to_int (r .get ("begin" )),
158+ strand ,
159+ "RefSeq" ,
160+ None ,
161+ "gene" ,
162+ )
163+ )
176164 return features
177165
178166
@@ -270,10 +258,12 @@ def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]:
270258 assembly = annotations [0 ].get ("assembly_accession" )
271259
272260 if contig and assembly :
273- links .append ((
274- f"refseq:{ contig } " ,
275- apply_prefix (assembly ),
276- ))
261+ links .append (
262+ (
263+ f"refseq:{ contig } " ,
264+ apply_prefix (assembly ),
265+ )
266+ )
277267
278268 return links
279269
@@ -285,27 +275,27 @@ def write_to_table(
285275 spark : SparkSession ,
286276 records : list [tuple ],
287277 table_name : str ,
288- schema : StructType ,
289278 database : str = "default" ,
290279) -> None :
291280 if records :
292- spark .createDataFrame (records , schema ).write .format ("delta" ).mode ("overwrite" ).option (
281+ spark .createDataFrame (records , cdm_schemas [ table_name ] ).write .format ("delta" ).mode ("overwrite" ).option (
293282 "overwriteSchema" , "true"
294283 ).saveAsTable (f"{ database } .{ table_name } " )
295284
296285
297286# ---------------------------------------------------------------------
298287# SQL PREVIEW
299288# ---------------------------------------------------------------------
289+
300290CDM_TABLES = [
301- "cdm_identifiers " ,
302- "cdm_names " ,
303- "cdm_features " ,
304- "cdm_contig_collection_x_feature " ,
305- "cdm_contig_collection_x_protein " ,
306- "cdm_feature_x_protein " ,
307- "cdm_contigs " ,
308- "cdm_contig_x_contig_collection " ,
291+ "Identifier " ,
292+ "Name " ,
293+ "Feature " ,
294+ "ContigCollection_x_Feature " ,
295+ "ContigCollection_x_Protein " ,
296+ "Feature_x_Protein " ,
297+ "Contig " ,
298+ "Contig_x_ContigCollection " ,
309299]
310300
311301
@@ -316,6 +306,68 @@ def run_sql_query(spark: SparkSession, database: str = "default") -> None:
316306 spark .sql (f"SELECT * FROM { table } LIMIT 20" ).show (truncate = False )
317307
318308
def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None:
    """Parse all datasets and write their records to the CDM tables.

    Each CDM table is populated by running its loader over every dataset,
    concatenating the resulting records, and writing them with a single
    ``write_to_table`` call.

    ``write_to_table`` saves with ``mode("overwrite")``, so writing once per
    dataset would replace the table each time and leave only the records of
    the last dataset that produced any. Collecting first and writing once
    keeps the records from every dataset. With a single dataset the result
    is the same as writing that dataset's records directly.

    Args:
        spark: Active Spark session, passed through to ``write_to_table``.
        datasets: Parsed annotation documents, one dict per input file.
        namespace: Database the tables are saved into (passed as the
            ``database`` argument of ``write_to_table``).
    """
    # Table name -> loader that extracts that table's records from one dataset.
    # The table name is also the key write_to_table uses to look up the schema.
    table_loaders = {
        "Identifier": load_identifiers,
        "Name": load_names,
        "Feature": load_feature_records,
        "ContigCollection_x_Feature": load_contig_collection_x_feature,
        "ContigCollection_x_Protein": load_contig_collection_x_protein,
        "Feature_x_Protein": load_feature_x_protein,
        "Contig": load_contigs,
        "Contig_x_ContigCollection": load_contig_x_contig_collection,
    }

    for table_name, loader in table_loaders.items():
        records: list[tuple] = []
        for data in datasets:
            # `or []` tolerates a loader returning None/empty for a dataset.
            records.extend(loader(data) or [])

        # write_to_table is a no-op for an empty record list, so a table with
        # no records in any dataset is left untouched, as before.
        write_to_table(spark, records, table_name, namespace)
369+
370+
319371# ---------------------------------------------------------------------
320372# CLI ENTRY
321373# ---------------------------------------------------------------------
@@ -383,73 +435,7 @@ def main():
383435 with open (path ) as f :
384436 datasets .append (json .load (f ))
385437
386- # -----------------------------------------
387- # Parse and write CDM tables
388- # -----------------------------------------
389- for data in datasets :
390- write_to_table (
391- spark ,
392- load_identifiers (data ),
393- "cdm_identifiers" ,
394- IDENTIFIER_SCHEMA ,
395- args .namespace ,
396- )
397-
398- write_to_table (
399- spark ,
400- load_names (data ),
401- "cdm_names" ,
402- NAME_SCHEMA ,
403- args .namespace ,
404- )
405-
406- write_to_table (
407- spark ,
408- load_feature_records (data ),
409- "cdm_features" ,
410- FEATURE_SCHEMA ,
411- args .namespace ,
412- )
413-
414- write_to_table (
415- spark ,
416- load_contig_collection_x_feature (data ),
417- "cdm_contig_collection_x_feature" ,
418- CONTIG_COLLECTION_X_FEATURE_SCHEMA ,
419- args .namespace ,
420- )
421-
422- write_to_table (
423- spark ,
424- load_contig_collection_x_protein (data ),
425- "cdm_contig_collection_x_protein" ,
426- CONTIG_COLLECTION_X_PROTEIN_SCHEMA ,
427- args .namespace ,
428- )
429-
430- write_to_table (
431- spark ,
432- load_feature_x_protein (data ),
433- "cdm_feature_x_protein" ,
434- FEATURE_X_PROTEIN_SCHEMA ,
435- args .namespace ,
436- )
437-
438- write_to_table (
439- spark ,
440- load_contigs (data ),
441- "cdm_contigs" ,
442- CONTIG_SCHEMA ,
443- args .namespace ,
444- )
445-
446- write_to_table (
447- spark ,
448- load_contig_x_contig_collection (data ),
449- "cdm_contig_x_contig_collection" ,
450- CONTIG_X_CONTIG_COLLECTION_SCHEMA ,
451- args .namespace ,
452- )
438+ parse_annotation_data (spark , datasets , args .namespace )
453439
454440 # -----------------------------------------
455441 # SQL preview
0 commit comments