kbase
diff --git a/‎importers/djornl/parser.py‎
Lines changed: 185 additions & 110 deletions b/‎importers/djornl/parser.py‎
Lines changed: 185 additions & 110 deletions
diff --git a/‎importers/test/test_djornl_parser.py‎
Lines changed: 86 additions & 12 deletions b/‎importers/test/test_djornl_parser.py‎
Lines changed: 86 additions & 12 deletions
diff --git a/‎relation_engine_server/utils/bulk_import.py‎
Lines changed: 3 additions & 3 deletions b/‎relation_engine_server/utils/bulk_import.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎spec/collections/djornl/djornl_edge.yaml‎
Lines changed: 6 additions & 5 deletions b/‎spec/collections/djornl/djornl_edge.yaml‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎spec/collections/djornl/djornl_node.yaml‎
Lines changed: 1 addition & 0 deletions b/‎spec/collections/djornl/djornl_node.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎spec/datasets/djornl/csv_cluster.yaml‎
Lines changed: 21 additions & 0 deletions b/‎spec/datasets/djornl/csv_cluster.yaml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎spec/datasets/djornl/csv_edge.yaml‎
Lines changed: 23 additions & 0 deletions b/‎spec/datasets/djornl/csv_edge.yaml‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎spec/datasets/djornl/csv_node.yaml‎
Lines changed: 53 additions & 0 deletions b/‎spec/datasets/djornl/csv_node.yaml‎
Lines changed: 53 additions & 0 deletions
@@ -64,9 +64,24 @@ def test_load_empty_files(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
-        self.assertEqual(parser.load_node_metadata(), {"nodes": []})
-        self.assertEqual(parser.load_cluster_data(), {"nodes": []})
+        # header only, no content
+        err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_node_metadata()
+
+        # comments only
+        err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_edges()
+
+        # mix of problems
+        err_str = "\n".join([
+            'cluster_data/headers_only.tsv: no valid data found',
+            'cluster_data/no_content.tsv: no header line found',
+            'cluster_data/comment_only.tsv: no header line found',
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_cluster_data()
 
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
@@ -77,37 +92,61 @@ def test_load_missing_files(self):
         with self.assertRaisesRegex(RuntimeError, err_str):
             self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-    def test_load_invalid_types(self):
+    def test_load_invalid_edges(self):
         """ test file format errors """
 
         # path: test/djornl/invalid_types
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
+        # invalid edge type, invalid scores
+        edge_err_msg = "\n".join([
+            r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas",
+            r"edges.tsv line 7: '2.' does not match .*?",
+            r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas",
+            r"edges.tsv line 10: 'score!' does not match .*?"
+        ])
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
+    def test_load_invalid_nodes(self):
+        """ test node files containing invalid data """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         # invalid node type
-        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
+        node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
+    def test_load_invalid_clusters(self):
+        """ test cluster files containing invalid data """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        # invalid cluster ID
+        cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
+        with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
+            parser.load_cluster_data()
+
     def test_load_col_count_errors(self):
         """ test files with invalid numbers of columns """
 
         # path: test/djornl/col_count_errors
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'line 6: expected 5 cols, found 3'
+        # not enough cols
+        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
-        # invalid node type
-        node_err_msg = 'line 3: expected 20 cols, found 22'
+        # too many cols
+        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
@@ -144,10 +183,45 @@ def test_load_valid_cluster_data(self):
 
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
 
-        parser.check_data_delta()
+    def test_duplicate_edge_data(self):
+        """ test files with duplicate edge data, which should throw an error """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        err_msg = "\n".join([
+            "hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__high-throughput-ppi",
+            "hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__pairwise-gene-coexpression"
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_edges()
+
+    def test_duplicate_node_data(self):
+        """ test files with duplicate node data, which should throw an error """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_node_metadata()
+
+    def test_duplicate_cluster_data(self):
+        """ test files with duplicate cluster data, which should be seamlessly merged """
+
+        # path: test/djornl/duplicate_data
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        cluster_data = parser.load_cluster_data()
+        self.assertEqual(
+            cluster_data,
+            self.json_data["load_cluster_data"]
+        )
@@ -6,7 +6,7 @@
 import hashlib
 
 from relation_engine_server.utils.json_validation import get_schema_validator
-from relation_engine_server.utils import spec_loader
+from relation_engine_server.utils.spec_loader import get_collection
 from relation_engine_server.utils.arango_client import import_from_file
 
 
@@ -16,8 +16,8 @@ def bulk_import(query_params):
     schema, then write them into a temporary file that can be passed into the
     arango client.
     """
-    schema = spec_loader.get_collection(query_params['collection'])
-    validator = get_schema_validator(schema=schema['schema'])
+    schema_file = get_collection(query_params['collection'], path_only=True)
+    validator = get_schema_validator(schema_file=schema_file, validate_at='/schema')
     # We can't use a context manager here
     # We need to close the file to have the file contents readable
     #  and we need to prevent deletion of the temp file on close (default behavior of tempfiles)
 
@@ -14,6 +14,7 @@ schema:
   description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
   type: object
   required: [score, edge_type, _from, _to, _key]
+  additionalProperties: false
   properties:
     _key:
       type: string
@@ -32,21 +33,21 @@ schema:
       title: Edge Type
       type: string
       oneOf:
-        - const: domain_co_occur
+        - const: domain-co-occurrence
           title: AraNetv2-DC_domain-co-occurrence
           description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: gene_coexpr
+        - const: pairwise-gene-coexpression
           title: AraNetv2-CX_pairwise-gene-coexpression
           description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
             calculated from Pearson correlation coefficients to normalize the data
             for comparison across studies and different types of data layers (Lee et
             al, 2015).
-        - const: pheno_assn
+        - const: phenotype_associations
           title: AraGWAS-Phenotype_Associations
           description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
-        - const: ppi_hithru
+        - const: high-throughput-ppi
           title: AraNetv2-HT_high-throughput-ppi
           description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: ppi_liter
+        - const: lit-curated-ppi
           title: AraNetv2-LC_lit-curated-ppi
           description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
@@ -12,6 +12,7 @@ schema:
   description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
   type: object
   required: [_key]
+  additionalProperties: false
   properties:
     _key:
       type: string
 
@@ -0,0 +1,21 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_cluster
+title: Cluster data
+description: Cluster ID to node ID mappings
+type: object
+required: [cluster_id, node_ids]
+additionalProperties: false
+properties:
+  cluster_id:
+    type: string
+    format: regex
+    pattern: "^Cluster\\d+"
+#    $ref: definitions.yaml#definitions/cluster_id
+  node_ids:
+    type: string
+#    type: array
+#    title: Node IDs
+#    sep: ","
+#    items:
+#      $ref: definitions.yaml#definitions/djornl_node/_key
+#    examples: ["AT1G01010,AT1G01020,AT1G01030", "AT1G01040,AT1G01050"]
@@ -0,0 +1,23 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_edge
+title: Arabidopsis gene-gene or gene-phenotype edge
+description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
+type: object
+required: [node1, node2, edge, layer_descrip]
+properties:
+  node1:
+    $ref: definitions.yaml#definitions/djornl_edge/_from
+  node2:
+    $ref: definitions.yaml#definitions/djornl_edge/_to
+  edge:
+    type: string
+    format: regex
+    pattern: "^\\d*(\\.\\d+)?$"
+  layer_descrip:
+    type: string
+    oneOf:
+      - const: AraNetv2-DC_domain-co-occurrence
+      - const: AraNetv2-CX_pairwise-gene-coexpression
+      - const: AraGWAS-Phenotype_Associations
+      - const: AraNetv2-HT_high-throughput-ppi
+      - const: AraNetv2-LC_lit-curated-ppi
@@ -0,0 +1,53 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_node
+title: CSV node file syntax
+description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
+type: object
+required: [node_id, node_type]
+additionalProperties: false
+properties:
+  node_id:
+    $ref: definitions.yaml#definitions/djornl_node/_key
+  node_type:
+    $ref: definitions.yaml#definitions/djornl_node/node_type
+  clusters:
+    $ref: definitions.yaml#definitions/djornl_node/clusters
+  transcript:
+    $ref: definitions.yaml#definitions/djornl_node/transcript
+  gene_symbol:
+    $ref: definitions.yaml#definitions/djornl_node/gene_symbol
+  gene_full_name:
+    $ref: definitions.yaml#definitions/djornl_node/gene_full_name
+  gene_model_type:
+    $ref: definitions.yaml#definitions/djornl_node/gene_model_type
+  tair_computational_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_computational_description
+  tair_curator_summary:
+    $ref: definitions.yaml#definitions/djornl_node/tair_curator_summary
+  tair_short_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_short_description
+  go_terms:
+    type: string
+    format: regex
+    pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$"
+#    $ref: definitions.yaml#definitions/djornl_node/go_terms
+  go_description:
+    $ref: definitions.yaml#definitions/djornl_node/go_description
+  mapman_bin:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_bin
+  mapman_name:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_name
+  mapman_description:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_description
+  pheno_aragwas_id:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id
+  pheno_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_description
+  pheno_pto_name:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_name
+  pheno_pto_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_description
+  pheno_ref:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_ref
+  user_notes:
+    $ref: definitions.yaml#definitions/djornl_node/user_notes