Skip to content

Commit 9d50c6c

Browse files
committed
Refactoring parser and adding validation
1 parent f4e868c commit 9d50c6c

40 files changed

Lines changed: 800 additions & 274 deletions

importers/djornl/parser.py

Lines changed: 185 additions & 110 deletions
Large diffs are not rendered by default.

importers/test/test_djornl_parser.py

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,24 @@ def test_load_empty_files(self):
6464
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
6565
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
6666

67-
self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
68-
self.assertEqual(parser.load_node_metadata(), {"nodes": []})
69-
self.assertEqual(parser.load_cluster_data(), {"nodes": []})
67+
# header only, no content
68+
err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
69+
with self.assertRaisesRegex(RuntimeError, err_str):
70+
parser.load_node_metadata()
71+
72+
# comments only
73+
err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
74+
with self.assertRaisesRegex(RuntimeError, err_str):
75+
parser.load_edges()
76+
77+
# mix of problems
78+
err_str = "\n".join([
79+
'cluster_data/headers_only.tsv: no valid data found',
80+
'cluster_data/no_content.tsv: no header line found',
81+
'cluster_data/comment_only.tsv: no header line found',
82+
])
83+
with self.assertRaisesRegex(RuntimeError, err_str):
84+
parser.load_cluster_data()
7085

7186
def test_load_missing_files(self):
7287
""" test loading when files cannot be found """
@@ -77,37 +92,61 @@ def test_load_missing_files(self):
7792
with self.assertRaisesRegex(RuntimeError, err_str):
7893
self.init_parser_with_path(RES_ROOT_DATA_PATH)
7994

80-
def test_load_invalid_types(self):
95+
def test_load_invalid_edges(self):
8196
""" test file format errors """
8297

8398
# path: test/djornl/invalid_types
8499
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
85100
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
86101

87-
# invalid edge type
88-
edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
102+
# invalid edge type, invalid scores
103+
edge_err_msg = "\n".join([
104+
r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas",
105+
r"edges.tsv line 7: '2.' does not match .*?",
106+
r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas",
107+
r"edges.tsv line 10: 'score!' does not match .*?"
108+
])
89109
with self.assertRaisesRegex(RuntimeError, edge_err_msg):
90110
parser.load_edges()
91111

112+
def test_load_invalid_nodes(self):
113+
""" test file format errors """
114+
115+
# path: test/djornl/invalid_types
116+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
117+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
118+
92119
# invalid node type
93-
node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
120+
node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
94121
with self.assertRaisesRegex(RuntimeError, node_err_msg):
95122
parser.load_node_metadata()
96123

124+
def test_load_invalid_clusters(self):
125+
""" test file format errors """
126+
127+
# path: test/djornl/invalid_types
128+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
129+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
130+
131+
# invalid cluster ID
132+
cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
133+
with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
134+
parser.load_cluster_data()
135+
97136
def test_load_col_count_errors(self):
98137
""" test files with invalid numbers of columns """
99138

100139
# path: test/djornl/col_count_errors
101140
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
102141
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
103142

104-
# invalid edge type
105-
edge_err_msg = 'line 6: expected 5 cols, found 3'
143+
# not enough cols
144+
edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3'
106145
with self.assertRaisesRegex(RuntimeError, edge_err_msg):
107146
parser.load_edges()
108147

109-
# invalid node type
110-
node_err_msg = 'line 3: expected 20 cols, found 22'
148+
# too many cols
149+
node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
111150
with self.assertRaisesRegex(RuntimeError, node_err_msg):
112151
parser.load_node_metadata()
113152

@@ -144,10 +183,45 @@ def test_load_valid_cluster_data(self):
144183

145184
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
146185
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
186+
147187
cluster_data = parser.load_cluster_data()
148188
self.assertEqual(
149189
cluster_data,
150190
self.json_data["load_cluster_data"]
151191
)
152192

153-
parser.check_data_delta()
193+
def test_duplicate_edge_data(self):
194+
""" test files with duplicate edge data, which should throw an error """
195+
196+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
197+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
198+
199+
err_msg = "\n".join([
200+
"hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__high-throughput-ppi",
201+
"hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__pairwise-gene-coexpression"
202+
])
203+
with self.assertRaisesRegex(RuntimeError, err_msg):
204+
parser.load_edges()
205+
206+
def test_duplicate_node_data(self):
207+
""" test files with duplicate node data, which should throw an error """
208+
209+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
210+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
211+
212+
err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
213+
with self.assertRaisesRegex(RuntimeError, err_msg):
214+
parser.load_node_metadata()
215+
216+
def test_duplicate_cluster_data(self):
217+
""" test files with duplicate cluster data, which should be seamlessly merged """
218+
219+
# path: test/djornl/duplicate_data
220+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
221+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
222+
223+
cluster_data = parser.load_cluster_data()
224+
self.assertEqual(
225+
cluster_data,
226+
self.json_data["load_cluster_data"]
227+
)

relation_engine_server/utils/bulk_import.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import hashlib
77

88
from relation_engine_server.utils.json_validation import get_schema_validator
9-
from relation_engine_server.utils import spec_loader
9+
from relation_engine_server.utils.spec_loader import get_collection
1010
from relation_engine_server.utils.arango_client import import_from_file
1111

1212

@@ -16,8 +16,8 @@ def bulk_import(query_params):
1616
schema, then write them into a temporary file that can be passed into the
1717
arango client.
1818
"""
19-
schema = spec_loader.get_collection(query_params['collection'])
20-
validator = get_schema_validator(schema=schema['schema'])
19+
schema_file = get_collection(query_params['collection'], path_only=True)
20+
validator = get_schema_validator(schema_file=schema_file, validate_at='/schema')
2121
# We can't use a context manager here
2222
# We need to close the file to have the file contents readable
2323
# and we need to prevent deletion of the temp file on close (default behavior of tempfiles)

spec/collections/djornl/djornl_edge.yaml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ schema:
1414
description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
1515
type: object
1616
required: [score, edge_type, _from, _to, _key]
17+
additionalProperties: false
1718
properties:
1819
_key:
1920
type: string
@@ -32,21 +33,21 @@ schema:
3233
title: Edge Type
3334
type: string
3435
oneOf:
35-
- const: domain_co_occur
36+
- const: domain-co-occurrence
3637
title: AraNetv2-DC_domain-co-occurrence
3738
description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
38-
- const: gene_coexpr
39+
- const: pairwise-gene-coexpression
3940
title: AraNetv2-CX_pairwise-gene-coexpression
4041
description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
4142
calculated from Pearson correlation coefficients to normalize the data
4243
for comparison across studies and different types of data layers (Lee et
4344
al, 2015).
44-
- const: pheno_assn
45+
- const: phenotype_associations
4546
title: AraGWAS-Phenotype_Associations
4647
description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
47-
- const: ppi_hithru
48+
- const: high-throughput-ppi
4849
title: AraNetv2-HT_high-throughput-ppi
4950
description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
50-
- const: ppi_liter
51+
- const: lit-curated-ppi
5152
title: AraNetv2-LC_lit-curated-ppi
5253
description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).

spec/collections/djornl/djornl_node.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ schema:
1212
description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
1313
type: object
1414
required: [_key]
15+
additionalProperties: false
1516
properties:
1617
_key:
1718
type: string
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_cluster
3+
title: Cluster data
4+
description: Cluster ID to node ID mappings
5+
type: object
6+
required: [cluster_id, node_ids]
7+
additionalProperties: false
8+
properties:
9+
cluster_id:
10+
type: string
11+
format: regex
12+
pattern: "^Cluster\\d+"
13+
# $ref: definitions.yaml#definitions/cluster_id
14+
node_ids:
15+
type: string
16+
# type: array
17+
# title: Node IDs
18+
# sep: ","
19+
# items:
20+
# $ref: definitions.yaml#definitions/djornl_node/_key
21+
# examples: ["AT1G01010,AT1G01020,AT1G01030", "AT1G01040,AT1G01050"]

spec/datasets/djornl/csv_edge.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_edge
3+
title: Arabidopsis gene-gene or gene-phenotype edge
4+
description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
5+
type: object
6+
required: [node1, node2, edge, layer_descrip]
7+
properties:
8+
node1:
9+
$ref: definitions.yaml#definitions/djornl_edge/_from
10+
node2:
11+
$ref: definitions.yaml#definitions/djornl_edge/_to
12+
edge:
13+
type: string
14+
format: regex
15+
pattern: "^\\d*(\\.\\d+)?$"
16+
layer_descrip:
17+
type: string
18+
oneOf:
19+
- const: AraNetv2-DC_domain-co-occurrence
20+
- const: AraNetv2-CX_pairwise-gene-coexpression
21+
- const: AraGWAS-Phenotype_Associations
22+
- const: AraNetv2-HT_high-throughput-ppi
23+
- const: AraNetv2-LC_lit-curated-ppi

spec/datasets/djornl/csv_node.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_node
3+
title: CSV node file syntax
4+
description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
5+
type: object
6+
required: [node_id, node_type]
7+
additionalProperties: false
8+
properties:
9+
node_id:
10+
$ref: definitions.yaml#definitions/djornl_node/_key
11+
node_type:
12+
$ref: definitions.yaml#definitions/djornl_node/node_type
13+
clusters:
14+
$ref: definitions.yaml#definitions/djornl_node/clusters
15+
transcript:
16+
$ref: definitions.yaml#definitions/djornl_node/transcript
17+
gene_symbol:
18+
$ref: definitions.yaml#definitions/djornl_node/gene_symbol
19+
gene_full_name:
20+
$ref: definitions.yaml#definitions/djornl_node/gene_full_name
21+
gene_model_type:
22+
$ref: definitions.yaml#definitions/djornl_node/gene_model_type
23+
tair_computational_description:
24+
$ref: definitions.yaml#definitions/djornl_node/tair_computational_description
25+
tair_curator_summary:
26+
$ref: definitions.yaml#definitions/djornl_node/tair_curator_summary
27+
tair_short_description:
28+
$ref: definitions.yaml#definitions/djornl_node/tair_short_description
29+
go_terms:
30+
type: string
31+
format: regex
32+
pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$"
33+
# $ref: definitions.yaml#definitions/djornl_node/go_terms
34+
go_description:
35+
$ref: definitions.yaml#definitions/djornl_node/go_description
36+
mapman_bin:
37+
$ref: definitions.yaml#definitions/djornl_node/mapman_bin
38+
mapman_name:
39+
$ref: definitions.yaml#definitions/djornl_node/mapman_name
40+
mapman_description:
41+
$ref: definitions.yaml#definitions/djornl_node/mapman_description
42+
pheno_aragwas_id:
43+
$ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id
44+
pheno_description:
45+
$ref: definitions.yaml#definitions/djornl_node/pheno_description
46+
pheno_pto_name:
47+
$ref: definitions.yaml#definitions/djornl_node/pheno_pto_name
48+
pheno_pto_description:
49+
$ref: definitions.yaml#definitions/djornl_node/pheno_pto_description
50+
pheno_ref:
51+
$ref: definitions.yaml#definitions/djornl_node/pheno_ref
52+
user_notes:
53+
$ref: definitions.yaml#definitions/djornl_node/user_notes

0 commit comments

Comments
 (0)