kbase
diff --git a/‎importers/djornl/parser.py‎
Lines changed: 218 additions & 127 deletions b/‎importers/djornl/parser.py‎
Lines changed: 218 additions & 127 deletions
diff --git a/‎importers/test/test_djornl_parser.py‎
Lines changed: 94 additions & 12 deletions b/‎importers/test/test_djornl_parser.py‎
Lines changed: 94 additions & 12 deletions
diff --git a/‎relation_engine_server/utils/bulk_import.py‎
Lines changed: 3 additions & 3 deletions b/‎relation_engine_server/utils/bulk_import.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎spec/collections/djornl/djornl_edge.yaml‎
Lines changed: 6 additions & 30 deletions b/‎spec/collections/djornl/djornl_edge.yaml‎
Lines changed: 6 additions & 30 deletions
diff --git a/‎spec/collections/djornl/djornl_node.yaml‎
Lines changed: 3 additions & 2 deletions b/‎spec/collections/djornl/djornl_node.yaml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎spec/datasets/djornl/csv_cluster.yaml‎
Lines changed: 15 additions & 0 deletions b/‎spec/datasets/djornl/csv_cluster.yaml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎spec/datasets/djornl/csv_edge.yaml‎
Lines changed: 23 additions & 0 deletions b/‎spec/datasets/djornl/csv_edge.yaml‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎spec/datasets/djornl/csv_node.yaml‎
Lines changed: 52 additions & 0 deletions b/‎spec/datasets/djornl/csv_node.yaml‎
Lines changed: 52 additions & 0 deletions
@@ -64,9 +64,24 @@ def test_load_empty_files(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
-        self.assertEqual(parser.load_node_metadata(), {"nodes": []})
-        self.assertEqual(parser.load_cluster_data(), {"nodes": []})
+        # header only, no content
+        err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_node_metadata()
+
+        # comments only
+        err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_edges()
+
+        # mix of problems
+        err_str = "\n".join([
+            'cluster_data/headers_only.tsv: no valid data found',
+            'cluster_data/no_content.tsv: no header line found',
+            'cluster_data/comment_only.tsv: no header line found',
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_cluster_data()
 
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
@@ -77,37 +92,61 @@ def test_load_missing_files(self):
         with self.assertRaisesRegex(RuntimeError, err_str):
             self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-    def test_load_invalid_types(self):
+    def test_load_invalid_edges(self):
         """ test file format errors """
 
         # path: test/djornl/invalid_types
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
+        # invalid edge type, invalid scores
+        edge_err_msg = "\n".join([
+            r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas",
+            r"edges.tsv line 7: '2.' does not match .*?",
+            r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas",
+            r"edges.tsv line 10: 'score!' does not match .*?"
+        ])
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
+    def test_load_invalid_nodes(self):
+        """ test node files containing an invalid node type """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         # invalid node type
-        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
+        node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
+    def test_load_invalid_clusters(self):
+        """ test cluster files containing an invalid cluster ID """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        # invalid cluster ID
+        cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
+        with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
+            parser.load_cluster_data()
+
     def test_load_col_count_errors(self):
         """ test files with invalid numbers of columns """
 
         # path: test/djornl/col_count_errors
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'line 6: expected 5 cols, found 3'
+        # not enough cols
+        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
-        # invalid node type
-        node_err_msg = 'line 3: expected 20 cols, found 22'
+        # too many cols
+        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
@@ -144,10 +183,53 @@ def test_load_valid_cluster_data(self):
 
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
 
-        parser.check_data_delta()
+    def test_duplicate_edge_data(self):
+        """ test files with duplicate edge data, which should throw an error """
+
+        # path: test/djornl/duplicate_data
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        # err_msg is matched as a regular expression; `.*?` stands in for the
+        # remainder of each edge key
+        err_msg = "\n".join([
+            "hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__AraNetv2-HT_.*?",
+            "hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__AraNetv2-CX_.*?"
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_edges()
+
+    def test_duplicate_node_data(self):
+        """ test files with duplicate node data, which should throw an error """
+
+        # path: test/djornl/duplicate_data
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_node_metadata()
+
+    def test_duplicate_cluster_data(self):
+        """ test files with duplicate cluster data, which should be seamlessly merged """
+
+        # path: test/djornl/duplicate_data
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        cluster_data = parser.load_cluster_data()
+        self.assertEqual(
+            cluster_data,
+            self.json_data["load_cluster_data"]
+        )
+
+    def test_the_full_shebang(self):
+        """ test a complete load_data() run on the files in test_data """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        # NOTE(review): load_data() is called twice and only the second result is
+        # checked -- confirm whether the repeat call is deliberate
+        parser.load_data()
+        self.assertEqual(True, parser.load_data())
@@ -6,7 +6,7 @@
 import hashlib
 
 from relation_engine_server.utils.json_validation import get_schema_validator
-from relation_engine_server.utils import spec_loader
+from relation_engine_server.utils.spec_loader import get_collection
 from relation_engine_server.utils.arango_client import import_from_file
 
 
@@ -16,8 +16,8 @@ def bulk_import(query_params):
     schema, then write them into a temporary file that can be passed into the
     arango client.
     """
-    schema = spec_loader.get_collection(query_params['collection'])
-    validator = get_schema_validator(schema=schema['schema'])
+    schema_file = get_collection(query_params['collection'], path_only=True)
+    validator = get_schema_validator(schema_file=schema_file, validate_at='/schema')
     # We can't use a context manager here
     # We need to close the file to have the file contents readable
     #  and we need to prevent deletion of the temp file on close (default behavior of tempfiles)
 
@@ -14,39 +14,15 @@ schema:
   description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
   type: object
   required: [score, edge_type, _from, _to, _key]
+  additionalProperties: false
   properties:
     _key:
-      type: string
-      title: Key
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_key
     _from:
-      type: string
-      title: Gene ID
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_from
     _to:
-      type: string
-      title: Gene or Phenotype ID
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_to
     score:
-      title: Edge Score (Weight)
-      # (float)
-      type: number
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/score
     edge_type:
-      title: Edge Type
-      type: string
-      oneOf:
-        - const: domain_co_occur
-          title: AraNetv2-DC_domain-co-occurrence
-          description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: gene_coexpr
-          title: AraNetv2-CX_pairwise-gene-coexpression
-          description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
-            calculated from Pearson correlation coefficients to normalize the data
-            for comparison across studies and different types of data layers (Lee et
-            al, 2015).
-        - const: pheno_assn
-          title: AraGWAS-Phenotype_Associations
-          description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
-        - const: ppi_hithru
-          title: AraNetv2-HT_high-throughput-ppi
-          description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: ppi_liter
-          title: AraNetv2-LC_lit-curated-ppi
-          description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/edge_type
@@ -3,15 +3,16 @@ type: vertex
 delta: false
 
 indexes:
- - type: hash
-   fields: ["clusters[*]"]
+  - type: hash
+    fields: ["clusters[*]"]
 
 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
   description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
   type: object
   required: [_key]
+  additionalProperties: false
   properties:
     _key:
       type: string
 
@@ -0,0 +1,15 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_cluster
+title: Cluster data
+description: Cluster ID to node ID mappings
+type: object
+required: [cluster_id, node_ids]
+additionalProperties: false
+properties:
+  cluster_id:
+    type: string
+    # NOTE(review): `format: regex` constrains the value itself to be a valid
+    # regular expression; `pattern` applies without it -- confirm it is intended
+    format: regex
+    # NOTE(review): anchored at the start only, so text after the digits is
+    # still accepted -- confirm whether a trailing `$` is wanted
+    pattern: "^Cluster\\d+"
+  # pre-transform node_ids
+  node_ids:
+    type: string
@@ -0,0 +1,23 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_edge
+title: Arabidopsis gene-gene or gene-phenotype edge
+description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
+type: object
+required: [node1, node2, edge, layer_descrip]
+properties:
+  node1:
+    $ref: definitions.yaml#definitions/djornl_edge/_from
+  node2:
+    $ref: definitions.yaml#definitions/djornl_edge/_to
+  edge:
+    type: string
+    # NOTE(review): `format: regex` constrains the value itself to be a valid
+    # regular expression; `pattern` applies without it -- confirm it is intended
+    format: regex
+    # pre-transform score: digits with an optional `.digits` fraction
+    # NOTE(review): both parts are optional, so the empty string also matches -- confirm
+    pattern: "^\\d*(\\.\\d+)?$"
+  layer_descrip:
+    type: string
+    oneOf:
+      - const: AraNetv2-DC_domain-co-occurrence
+      - const: AraNetv2-CX_pairwise-gene-coexpression
+      - const: AraGWAS-Phenotype_Associations
+      - const: AraNetv2-HT_high-throughput-ppi
+      - const: AraNetv2-LC_lit-curated-ppi
@@ -0,0 +1,52 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_node
+title: CSV node file syntax
+description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
+type: object
+required: [node_id, node_type]
+additionalProperties: false
+properties:
+  node_id:
+    $ref: definitions.yaml#definitions/djornl_node/_key
+  node_type:
+    $ref: definitions.yaml#definitions/djornl_node/node_type
+  clusters:
+    $ref: definitions.yaml#definitions/djornl_node/clusters
+  transcript:
+    $ref: definitions.yaml#definitions/djornl_node/transcript
+  gene_symbol:
+    $ref: definitions.yaml#definitions/djornl_node/gene_symbol
+  gene_full_name:
+    $ref: definitions.yaml#definitions/djornl_node/gene_full_name
+  gene_model_type:
+    $ref: definitions.yaml#definitions/djornl_node/gene_model_type
+  tair_computational_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_computational_description
+  tair_curator_summary:
+    $ref: definitions.yaml#definitions/djornl_node/tair_curator_summary
+  tair_short_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_short_description
+  go_terms:
+    type: string
+    # NOTE(review): `format: regex` constrains the value itself to be a valid
+    # regular expression; `pattern` applies without it -- confirm it is intended
+    format: regex
+    # zero or more `GO:` + seven-digit terms, separated by a comma and an
+    # optional single space; the empty string also matches
+    pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$"
+  go_description:
+    $ref: definitions.yaml#definitions/djornl_node/go_description
+  mapman_bin:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_bin
+  mapman_name:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_name
+  mapman_description:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_description
+  pheno_aragwas_id:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id
+  pheno_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_description
+  pheno_pto_name:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_name
+  pheno_pto_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_description
+  pheno_ref:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_ref
+  user_notes:
+    $ref: definitions.yaml#definitions/djornl_node/user_notes