Skip to content

Commit 541dd4e

Browse files
committed
Refactor parsing method to be more generic
Refactor spec files to use a definitions file. Add tests for duplicated data.
1 parent b2357f1 commit 541dd4e

42 files changed

Lines changed: 856 additions & 318 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

importers/djornl/parser.py

Lines changed: 218 additions & 127 deletions
Large diffs are not rendered by default.

importers/test/test_djornl_parser.py

Lines changed: 94 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,24 @@ def test_load_empty_files(self):
6464
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
6565
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
6666

67-
self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
68-
self.assertEqual(parser.load_node_metadata(), {"nodes": []})
69-
self.assertEqual(parser.load_cluster_data(), {"nodes": []})
67+
# header only, no content
68+
err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
69+
with self.assertRaisesRegex(RuntimeError, err_str):
70+
parser.load_node_metadata()
71+
72+
# comments only
73+
err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
74+
with self.assertRaisesRegex(RuntimeError, err_str):
75+
parser.load_edges()
76+
77+
# mix of problems
78+
err_str = "\n".join([
79+
'cluster_data/headers_only.tsv: no valid data found',
80+
'cluster_data/no_content.tsv: no header line found',
81+
'cluster_data/comment_only.tsv: no header line found',
82+
])
83+
with self.assertRaisesRegex(RuntimeError, err_str):
84+
parser.load_cluster_data()
7085

7186
def test_load_missing_files(self):
7287
""" test loading when files cannot be found """
@@ -77,37 +92,61 @@ def test_load_missing_files(self):
7792
with self.assertRaisesRegex(RuntimeError, err_str):
7893
self.init_parser_with_path(RES_ROOT_DATA_PATH)
7994

80-
def test_load_invalid_types(self):
95+
def test_load_invalid_edges(self):
8196
""" test file format errors """
8297

8398
# path: test/djornl/invalid_types
8499
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
85100
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
86101

87-
# invalid edge type
88-
edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
102+
# invalid edge type, invalid scores
103+
edge_err_msg = "\n".join([
104+
r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas",
105+
r"edges.tsv line 7: '2.' does not match .*?",
106+
r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas",
107+
r"edges.tsv line 10: 'score!' does not match .*?"
108+
])
89109
with self.assertRaisesRegex(RuntimeError, edge_err_msg):
90110
parser.load_edges()
91111

112+
def test_load_invalid_nodes(self):
113+
""" test file format errors """
114+
115+
# path: test/djornl/invalid_types
116+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
117+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
118+
92119
# invalid node type
93-
node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
120+
node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
94121
with self.assertRaisesRegex(RuntimeError, node_err_msg):
95122
parser.load_node_metadata()
96123

124+
def test_load_invalid_clusters(self):
125+
""" test file format errors """
126+
127+
# path: test/djornl/invalid_types
128+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
129+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
130+
131+
# invalid cluster ID
132+
cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
133+
with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
134+
parser.load_cluster_data()
135+
97136
def test_load_col_count_errors(self):
98137
""" test files with invalid numbers of columns """
99138

100139
# path: test/djornl/col_count_errors
101140
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
102141
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
103142

104-
# invalid edge type
105-
edge_err_msg = 'line 6: expected 5 cols, found 3'
143+
# not enough cols
144+
edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3'
106145
with self.assertRaisesRegex(RuntimeError, edge_err_msg):
107146
parser.load_edges()
108147

109-
# invalid node type
110-
node_err_msg = 'line 3: expected 20 cols, found 22'
148+
# too many cols
149+
node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
111150
with self.assertRaisesRegex(RuntimeError, node_err_msg):
112151
parser.load_node_metadata()
113152

@@ -144,10 +183,53 @@ def test_load_valid_cluster_data(self):
144183

145184
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
146185
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
186+
147187
cluster_data = parser.load_cluster_data()
148188
self.assertEqual(
149189
cluster_data,
150190
self.json_data["load_cluster_data"]
151191
)
152192

153-
parser.check_data_delta()
193+
def test_duplicate_edge_data(self):
194+
""" test files with duplicate edge data, which should throw an error """
195+
196+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
197+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
198+
199+
err_msg = "\n".join([
200+
"hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__AraNetv2-HT_.*?",
201+
"hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__AraNetv2-CX_.*?"
202+
])
203+
with self.assertRaisesRegex(RuntimeError, err_msg):
204+
parser.load_edges()
205+
206+
def test_duplicate_node_data(self):
207+
""" test files with duplicate node data, which should throw an error """
208+
209+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
210+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
211+
212+
err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
213+
with self.assertRaisesRegex(RuntimeError, err_msg):
214+
parser.load_node_metadata()
215+
216+
def test_duplicate_cluster_data(self):
217+
""" test files with duplicate cluster data, which should be seamlessly merged """
218+
219+
# path: test/djornl/duplicate_data
220+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
221+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
222+
223+
cluster_data = parser.load_cluster_data()
224+
self.assertEqual(
225+
cluster_data,
226+
self.json_data["load_cluster_data"]
227+
)
228+
229+
def test_the_full_shebang(self):
230+
231+
RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
232+
parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
233+
234+
parser.load_data()
235+
self.assertEqual(True, parser.load_data())

relation_engine_server/utils/bulk_import.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import hashlib
77

88
from relation_engine_server.utils.json_validation import get_schema_validator
9-
from relation_engine_server.utils import spec_loader
9+
from relation_engine_server.utils.spec_loader import get_collection
1010
from relation_engine_server.utils.arango_client import import_from_file
1111

1212

@@ -16,8 +16,8 @@ def bulk_import(query_params):
1616
schema, then write them into a temporary file that can be passed into the
1717
arango client.
1818
"""
19-
schema = spec_loader.get_collection(query_params['collection'])
20-
validator = get_schema_validator(schema=schema['schema'])
19+
schema_file = get_collection(query_params['collection'], path_only=True)
20+
validator = get_schema_validator(schema_file=schema_file, validate_at='/schema')
2121
# We can't use a context manager here
2222
# We need to close the file to have the file contents readable
2323
# and we need to prevent deletion of the temp file on close (default behavior of tempfiles)

spec/collections/djornl/djornl_edge.yaml

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,39 +14,15 @@ schema:
1414
description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
1515
type: object
1616
required: [score, edge_type, _from, _to, _key]
17+
additionalProperties: false
1718
properties:
1819
_key:
19-
type: string
20-
title: Key
20+
$ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_key
2121
_from:
22-
type: string
23-
title: Gene ID
22+
$ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_from
2423
_to:
25-
type: string
26-
title: Gene or Phenotype ID
24+
$ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_to
2725
score:
28-
title: Edge Score (Weight)
29-
# (float)
30-
type: number
26+
$ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/score
3127
edge_type:
32-
title: Edge Type
33-
type: string
34-
oneOf:
35-
- const: domain_co_occur
36-
title: AraNetv2-DC_domain-co-occurrence
37-
description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
38-
- const: gene_coexpr
39-
title: AraNetv2-CX_pairwise-gene-coexpression
40-
description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
41-
calculated from Pearson correlation coefficients to normalize the data
42-
for comparison across studies and different types of data layers (Lee et
43-
al, 2015).
44-
- const: pheno_assn
45-
title: AraGWAS-Phenotype_Associations
46-
description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
47-
- const: ppi_hithru
48-
title: AraNetv2-HT_high-throughput-ppi
49-
description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
50-
- const: ppi_liter
51-
title: AraNetv2-LC_lit-curated-ppi
52-
description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
28+
$ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/edge_type

spec/collections/djornl/djornl_node.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,16 @@ type: vertex
33
delta: false
44

55
indexes:
6-
- type: hash
7-
fields: ["clusters[*]"]
6+
- type: hash
7+
fields: ["clusters[*]"]
88

99
schema:
1010
"$schema": http://json-schema.org/draft-07/schema#
1111
title: Gene and Phenotype Vertices
1212
description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
1313
type: object
1414
required: [_key]
15+
additionalProperties: false
1516
properties:
1617
_key:
1718
type: string
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_cluster
3+
title: Cluster data
4+
description: Cluster ID to node ID mappings
5+
type: object
6+
required: [cluster_id, node_ids]
7+
additionalProperties: false
8+
properties:
9+
cluster_id:
10+
type: string
11+
format: regex
12+
pattern: "^Cluster\\d+"
13+
# pre-transform node_ids
14+
node_ids:
15+
type: string

spec/datasets/djornl/csv_edge.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_edge
3+
title: Arabidopsis gene-gene or gene-phenotype edge
4+
description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
5+
type: object
6+
required: [node1, node2, edge, layer_descrip]
7+
properties:
8+
node1:
9+
$ref: definitions.yaml#definitions/djornl_edge/_from
10+
node2:
11+
$ref: definitions.yaml#definitions/djornl_edge/_to
12+
edge:
13+
type: string
14+
format: regex
15+
pattern: "^\\d*(\\.\\d+)?$"
16+
layer_descrip:
17+
type: string
18+
oneOf:
19+
- const: AraNetv2-DC_domain-co-occurrence
20+
- const: AraNetv2-CX_pairwise-gene-coexpression
21+
- const: AraGWAS-Phenotype_Associations
22+
- const: AraNetv2-HT_high-throughput-ppi
23+
- const: AraNetv2-LC_lit-curated-ppi

spec/datasets/djornl/csv_node.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"$schema": http://json-schema.org/draft-07/schema#
2+
name: csv_node
3+
title: CSV node file syntax
4+
description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
5+
type: object
6+
required: [node_id, node_type]
7+
additionalProperties: false
8+
properties:
9+
node_id:
10+
$ref: definitions.yaml#definitions/djornl_node/_key
11+
node_type:
12+
$ref: definitions.yaml#definitions/djornl_node/node_type
13+
clusters:
14+
$ref: definitions.yaml#definitions/djornl_node/clusters
15+
transcript:
16+
$ref: definitions.yaml#definitions/djornl_node/transcript
17+
gene_symbol:
18+
$ref: definitions.yaml#definitions/djornl_node/gene_symbol
19+
gene_full_name:
20+
$ref: definitions.yaml#definitions/djornl_node/gene_full_name
21+
gene_model_type:
22+
$ref: definitions.yaml#definitions/djornl_node/gene_model_type
23+
tair_computational_description:
24+
$ref: definitions.yaml#definitions/djornl_node/tair_computational_description
25+
tair_curator_summary:
26+
$ref: definitions.yaml#definitions/djornl_node/tair_curator_summary
27+
tair_short_description:
28+
$ref: definitions.yaml#definitions/djornl_node/tair_short_description
29+
go_terms:
30+
type: string
31+
format: regex
32+
pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$"
33+
go_description:
34+
$ref: definitions.yaml#definitions/djornl_node/go_description
35+
mapman_bin:
36+
$ref: definitions.yaml#definitions/djornl_node/mapman_bin
37+
mapman_name:
38+
$ref: definitions.yaml#definitions/djornl_node/mapman_name
39+
mapman_description:
40+
$ref: definitions.yaml#definitions/djornl_node/mapman_description
41+
pheno_aragwas_id:
42+
$ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id
43+
pheno_description:
44+
$ref: definitions.yaml#definitions/djornl_node/pheno_description
45+
pheno_pto_name:
46+
$ref: definitions.yaml#definitions/djornl_node/pheno_pto_name
47+
pheno_pto_description:
48+
$ref: definitions.yaml#definitions/djornl_node/pheno_pto_description
49+
pheno_ref:
50+
$ref: definitions.yaml#definitions/djornl_node/pheno_ref
51+
user_notes:
52+
$ref: definitions.yaml#definitions/djornl_node/user_notes

0 commit comments

Comments
 (0)