Skip to content

Commit aee3307

Browse files
committed
lib: update GWAS indexer according to biodata changes, and add JUnit test, #TASK-8139
1 parent 0c40ce6 commit aee3307

2 files changed

Lines changed: 141 additions & 16 deletions

File tree

cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -263,19 +263,19 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade
263263
GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores();
264264
if (StringUtils.isNotEmpty(values[27])) {
265265
try {
266-
scores.setPValue(Double.parseDouble(values[27]));
266+
scores.setPvalue(Double.parseDouble(values[27]));
267267
} catch (NumberFormatException e) {
268268
logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]);
269269
}
270270
}
271271
if (StringUtils.isNotEmpty(values[28])) {
272272
try {
273-
scores.setPValueMlog(Double.parseDouble(values[28]));
273+
scores.setPvalueMlog(Double.parseDouble(values[28]));
274274
} catch (NumberFormatException e) {
275275
logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]);
276276
}
277277
}
278-
scores.setPValueText(values[29]);
278+
scores.setPvalueText(values[29]);
279279
if (StringUtils.isNotEmpty(values[30])) {
280280
try {
281281
scores.setOrBeta(Double.parseDouble(values[30]));
@@ -463,8 +463,8 @@ private void checkAndAddGwasAssociation(String key, GwasAssociation newGwas, Map
463463
GwasAssociationStudyTraitScores newScores = newTrait.getScores().get(0);
464464
for (GwasAssociationStudyTraitScores scores : currTrait.getScores()) {
465465
if (scores.getOrBeta() == newScores.getOrBeta()
466-
&& scores.getPValueMlog() == newScores.getPValueMlog()
467-
&& scores.getPValue() == newScores.getPValue()) {
466+
&& scores.getPvalueMlog() == newScores.getPvalueMlog()
467+
&& scores.getPvalue() == newScores.getPvalue()) {
468468
currScores = scores;
469469
break;
470470
}

cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java

Lines changed: 136 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,24 @@
2121
import com.fasterxml.jackson.databind.ObjectMapper;
2222
import com.fasterxml.jackson.databind.ObjectReader;
2323
import org.hamcrest.CoreMatchers;
24+
import org.junit.jupiter.api.Assertions;
25+
import org.junit.jupiter.api.Assumptions;
2426
import org.junit.jupiter.api.Disabled;
2527
import org.junit.jupiter.api.Test;
2628
import org.opencb.biodata.models.variant.Variant;
2729
import org.opencb.biodata.models.variant.avro.*;
2830
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
2931
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
3032
import org.opencb.commons.utils.FileUtils;
33+
import org.rocksdb.Options;
34+
import org.rocksdb.RocksDB;
35+
import org.rocksdb.RocksDBException;
36+
import org.rocksdb.RocksIterator;
3137

3238
import java.io.BufferedReader;
3339
import java.io.IOException;
3440
import java.net.URISyntaxException;
41+
import java.nio.file.Files;
3542
import java.nio.file.Path;
3643
import java.nio.file.Paths;
3744
import java.util.*;
@@ -586,6 +593,90 @@ public void parse() throws Exception {
586593

587594
}
588595

596+
@Test
597+
public void testGwasIndexer() throws RocksDBException, IOException {
598+
Path gwasDataDir = Paths.get("/opt/gwas-data/");
599+
Assumptions.assumeTrue(Files.exists(gwasDataDir));
600+
601+
Path gwasFile = gwasDataDir.resolve("gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv");
602+
Path dbSnpTabixFile = gwasDataDir.resolve("All.vcf.gz");
603+
Path genomeSequenceFilePath = gwasDataDir.resolve("Homo_sapiens.GRCh38.fa");
604+
String assembly = "grch38";
605+
606+
Path outputDir = Paths.get("/tmp");
607+
Path rocksDbDir = outputDir.resolve("integration.idx");
608+
Object[] dbConnection = getDBConnection(rocksDbDir.toAbsolutePath().toString(), true);
609+
RocksDB rdb = (RocksDB) dbConnection[0];
610+
611+
GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbSnpTabixFile, genomeSequenceFilePath, assembly, rdb);
612+
gwasIndexer.index();
613+
614+
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(outputDir, CLINICAL_VARIANT_DATA, true);
615+
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
616+
// named "iterator"
617+
RocksIterator rocksIterator = rdb.newIterator();
618+
619+
ObjectMapper mapper = new ObjectMapper();
620+
System.out.println("Reading from RocksDB index and serializing to " + serializer.getOutdir().resolve(serializer.getFileName()));
621+
int counter = 0;
622+
for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) {
623+
Variant variant = parseVariantFromVariantId(new String(rocksIterator.key()));
624+
if (variant != null) {
625+
VariantAnnotation variantAnnotation = mapper.readValue(rocksIterator.value(), VariantAnnotation.class);
626+
variant.setAnnotation(variantAnnotation);
627+
serializer.serialize(variant);
628+
counter++;
629+
if (counter % 10000 == 0) {
630+
System.out.printf(counter + " written");
631+
}
632+
}
633+
}
634+
serializer.close();
635+
System.out.println("Done.");
636+
serializer.close();
637+
638+
Path clinicalVariantFile = outputDir.resolve(CLINICAL_VARIANT_DATA + JSON_GZ_EXTENSION);
639+
Assertions.assertTrue(Files.exists(clinicalVariantFile));
640+
641+
// Read serialized variants and check some of them
642+
List<Variant> variantList = loadSerializedVariants(clinicalVariantFile.toAbsolutePath().toString());
643+
Assertions.assertFalse(variantList.isEmpty());
644+
Assertions.assertEquals(93, variantList.size());
645+
boolean found = false;
646+
for (Variant variant : variantList) {
647+
assertNotNull(variant.getAnnotation().getGwas());
648+
Assertions.assertEquals("EBI GWAS catalog", variant.getAnnotation().getGwas().get(0).getSource(), "Source");
649+
if (variant.getChromosome().equals("11") && variant.getStart().equals(27658369) && variant.getReference().equals("C")
650+
&& variant.getAlternate().equals("T")) {
651+
found = true;
652+
Assertions.assertEquals("rs6265", variant.getAnnotation().getGwas().get(0).getSnpId());
653+
Assertions.assertEquals(3.0E-10, variant.getAnnotation().getGwas().get(0).getStudies().get(0).getTraits().get(0).getScores().get(0).getPvalue());
654+
Assertions.assertEquals(9.522878745280337, variant.getAnnotation().getGwas().get(0).getStudies().get(0).getTraits().get(0).getScores().get(0).getPvalueMlog());
655+
}
656+
}
657+
Assertions.assertTrue(found, "Expected GWAS variant not found in serialized variants.");
658+
659+
// Clean and delete directories/files
660+
rdb.close();
661+
org.apache.commons.io.FileUtils.deleteDirectory(rocksDbDir.toFile());
662+
Files.deleteIfExists(clinicalVariantFile);
663+
}
664+
665+
private Variant parseVariantFromVariantId(String variantId) {
666+
try {
667+
String[] parts = variantId.split(":", -1); // -1 to include empty fields
668+
if (parts[1].contains("-")) {
669+
String[] pos = parts[1].split("-");
670+
return new Variant(parts[0].trim(), Integer.parseInt(pos[0].trim()), Integer.parseInt(pos[1].trim()), parts[2], parts[3]);
671+
} else {
672+
return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]);
673+
}
674+
} catch (Exception e) {
675+
System.out.printf("{}. Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId);
676+
return null;
677+
}
678+
}
679+
589680
private void cleanUp() throws URISyntaxException, IOException {
590681
// Clean up temporary files/directories/indexes
591682
org.apache.commons.io.FileUtils.deleteDirectory(Paths.get("/tmp/clinicalVariant1/").toFile());
@@ -726,17 +817,17 @@ private List<Variant> loadSerializedVariants(String fileName) {
726817
//
727818
// EvidenceEntry entry = buildEvidenceEntry(info);
728819
// System.out.println(variant.toStringSimple() + " : " + entry.toString());
729-
//// if (variant != null) {
730-
//// boolean success = updateRocksDB(variant);
731-
//// // updateRocksDB may fail (false) if normalisation process fails
732-
//// if (success) {
733-
//// numberIndexedRecords++;
734-
//// }
735-
//// }
736-
//// totalNumberRecords++;
737-
//// if (totalNumberRecords % 1000 == 0) {
738-
//// logger.info("{} records parsed", totalNumberRecords);
739-
//// }
820+
//// if (variant != null) {
821+
//// boolean success = updateRocksDB(variant);
822+
//// // updateRocksDB may fail (false) if normalisation process fails
823+
//// if (success) {
824+
//// numberIndexedRecords++;
825+
//// }
826+
//// }
827+
//// totalNumberRecords++;
828+
//// if (totalNumberRecords % 1000 == 0) {
829+
//// logger.info("{} records parsed", totalNumberRecords);
830+
//// }
740831
// }
741832
// }
742833
// }
@@ -811,4 +902,38 @@ public void testVariant() {
811902
System.out.println(v.toStringSimple());
812903
}
813904

905+
private Object[] getDBConnection(String dbLocation, boolean forceCreate) {
906+
boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation));
907+
// a static method that loads the RocksDB C++ library.
908+
RocksDB.loadLibrary();
909+
// the Options class contains a set of configurable DB options
910+
// that determines the behavior of a database.
911+
Options options = new Options().setCreateIfMissing(true);
912+
913+
// options.setMaxBackgroundCompactions(4);
914+
// options.setMaxBackgroundFlushes(1);
915+
// options.setCompressionType(CompressionType.NO_COMPRESSION);
916+
// options.setMaxOpenFiles(-1);
917+
// options.setIncreaseParallelism(4);
918+
// options.setCompactionStyle(CompactionStyle.LEVEL);
919+
// options.setLevelCompactionDynamicLevelBytes(true);
920+
921+
RocksDB db = null;
922+
try {
923+
// a factory method that returns a RocksDB instance
924+
if (indexingNeeded) {
925+
db = RocksDB.open(options, dbLocation);
926+
} else {
927+
db = RocksDB.openReadOnly(options, dbLocation);
928+
}
929+
// do something
930+
} catch (RocksDBException e) {
931+
// do some error handling
932+
e.printStackTrace();
933+
System.exit(1);
934+
}
935+
936+
return new Object[]{db, options, dbLocation, indexingNeeded};
937+
938+
}
814939
}

0 commit comments

Comments
 (0)