Skip to content

Commit 0c40ce6

Browse files
authored
Merge pull request #696 from opencb/TASK-5564
TASK-5564 - Update data sources for CellBase 6.x
2 parents a9ec915 + 13dc96b commit 0c40ce6

212 files changed

Lines changed: 14893 additions & 9114 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/pull-request-approved.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,17 @@ on:
66
types: [ submitted ]
77

88
jobs:
9-
test:
9+
10+
test-xetabase:
11+
if: github.event.review.state == 'approved'
12+
name: "Run all tests before merging"
13+
uses: opencb/java-common-libs/.github/workflows/test-xetabase-workflow.yml@develop
14+
with:
15+
branch: develop
16+
task: ${{ github.event.pull_request.head.ref }}
17+
secrets: inherit
18+
19+
test-cellbase:
1020
uses: ./.github/workflows/test-analysis.yml
1121
secrets: inherit
1222
with:

.github/workflows/task.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,17 @@ jobs:
1313
with:
1414
upload_artifact: true
1515

16+
deploy-maven:
17+
uses: opencb/java-common-libs/.github/workflows/deploy-maven-repository-workflow.yml@develop
18+
needs: test
19+
with:
20+
maven_opts: -Dcheckstyle.skip -DCELLBASE.WAR.NAME=cellbase
21+
cache_key: ${{ needs.test.outputs.cache_key }}
22+
secrets: inherit
23+
1624
deploy-docker:
1725
uses: opencb/java-common-libs/.github/workflows/deploy-docker-hub-workflow.yml@develop
1826
needs: test
1927
with:
20-
cli: python3 ./build/cloud/docker/docker-build.py push --images base --tag ${{ github.ref_name }}
28+
cli: python3 ./build/cloud/docker/docker-build.py push --images base,builder --tag ${{ github.ref_name }}
2129
secrets: inherit

.github/workflows/test-analysis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
uses: opencb/java-common-libs/.github/workflows/build-java-app-workflow.yml@develop
2222
with:
2323
needs_hadoop_preparation: false
24-
maven_opts: -Dcheckstyle.skip
24+
maven_opts: -Dcheckstyle.skip -DCELLBASE.WAR.NAME=cellbase
2525
upload_artifact: ${{ inputs.upload_artifact }}
2626
dependency_repos: "java-common-libs,biodata"
2727
secrets: inherit

cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ LABEL org.label-schema.vendor="OpenCB" \
1111
## We need to be root to install dependencies
1212
USER root
1313
RUN apt-get update -y && \
14-
apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl && \
14+
apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl libxml-simple-perl liblog-log4perl-perl libxml-parser-perl libxml-dom-perl && \
1515
mkdir /opt/ensembl && chown cellbase:cellbase /opt/ensembl && \
1616
rm -rf /var/lib/apt/lists/*
1717

@@ -26,6 +26,10 @@ RUN cd /opt/ensembl && \
2626
git clone https://github.com/Ensembl/ensembl-variation.git && \
2727
git clone https://github.com/Ensembl/ensembl-funcgen.git && \
2828
git clone https://github.com/Ensembl/ensembl-compara.git && \
29-
git clone https://github.com/Ensembl/ensembl-io.git
29+
git clone https://github.com/Ensembl/ensembl-io.git && \
30+
git clone --branch cvs/release-0_7 https://github.com/biomart/biomart-perl
3031

31-
ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
32+
## Give write permissions so that the script ensembl_canonical.pl can create sub-folders for caching purposes
33+
RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/
34+
35+
ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib

cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -134,16 +134,13 @@ our $ENSEMBL_GENOMES_PORT = "4157";
134134
our $ENSEMBL_GENOMES_USER = "anonymous";
135135

136136
## Vertebrates
137-
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
138-
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
139-
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
140-
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
141-
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
142-
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
143-
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
144-
our $MUS_MUSCULUS_CORE = "mus_musculus_core_78_38";
145-
our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_78_38";
146-
our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_78_38";
137+
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_114_38";
138+
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_114_38";
139+
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_114_38";
140+
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_114_38";
141+
our $MUS_MUSCULUS_CORE = "mus_musculus_core_114_39";
142+
our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_114_39";
143+
our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_114_39";
147144
our $RATTUS_NORVEGICUS_CORE = "rattus_norvegicus_core_78_5";
148145
our $RATTUS_NORVEGICUS_VARIATION = "rattus_norvegicus_variation_78_5";
149146
our $RATTUS_NORVEGICUS_FUNCTIONAL = "rattus_norvegicus_funcgen_78_5";
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env perl

####################################################################
## ensembl_canonical.pl
##
## Queries a BioMart registry for the "<species>_gene_ensembl" dataset
## and writes the attributes ensembl_gene_id, ensembl_transcript_id and
## transcript_is_canonical, formatted as TSV, to
## <outdir>/ensembl_canonical.txt.
##
## Options:
##   --species  BioMart species prefix of the dataset (default: hsapiens)
##   --outdir   directory where ensembl_canonical.txt is written (default: ./)
##
## The script dies if the output file cannot be opened or closed, or if
## STDOUT cannot be redirected or restored.
####################################################################

use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use JSON;
use DB_CONFIG;

use BioMart::Initializer;
use BioMart::Query;
use BioMart::QueryRunner;

## Default values
my $species = 'hsapiens';
my $outdir = "./";

## Parsing command line
GetOptions ('species=s' => \$species, 'outdir=s' => \$outdir);

## Registry file describing the BioMart server to query
my $confFile = "/opt/cellbase/scripts/ensembl-scripts/martURLLocation.xml";

# NB: change action to 'clean' if you wish to start a fresh configuration
# and to 'cached' if you want to skip configuration step on subsequent runs from the same registry
my $action = 'clean';
my $initializer = BioMart::Initializer->new('registryFile' => $confFile, 'action' => $action);
my $registry = $initializer->getRegistry;

## Build the query: one row per gene/transcript pair plus the canonical flag
my $query = BioMart::Query->new('registry' => $registry, 'virtualSchemaName' => 'default');
$query->setDataset($species."_gene_ensembl");
$query->addAttribute("ensembl_gene_id");
$query->addAttribute("ensembl_transcript_id");
$query->addAttribute("transcript_is_canonical");
$query->formatter("TSV");

## Open the output file for writing
my $canonical_file = "$outdir/ensembl_canonical.txt";
open(my $fh, '>', $canonical_file) or die "Cannot open ensembl_canonical.txt file ($canonical_file): $!";

## Save the original STDOUT by duplicating its file descriptor. Copying the
## glob (my $saved = *STDOUT) would only alias the same handle, so after the
## redirection below there would be nothing left to restore from.
open(my $original_stdout, '>&', \*STDOUT) or die "Can't duplicate STDOUT: $!";

## printResults() writes to STDOUT, so point STDOUT at the output file
open(STDOUT, '>&', $fh) or die "Can't redirect STDOUT: $!";

my $query_runner = BioMart::QueryRunner->new();

# to obtain unique rows only
$query_runner->uniqueRowsOnly(1);
$query_runner->execute($query);

# Header and footer are intentionally not printed: only the result rows are stored
$query_runner->printResults();

## Restore the original STDOUT (re-opening STDOUT also flushes what was buffered for the file)
open(STDOUT, '>&', $original_stdout) or die "Can't restore STDOUT: $!";
close($original_stdout) or die "Failed to close saved STDOUT: $!";

## Close the output file; write errors surface here
close($fh) or die "Failed to close file: $!";

cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
####################################################################
1717
## Parsing command line options ####################################
1818
####################################################################
19-
# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ...
19+
##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp
20+
21+
# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ...
2022

2123
## Parsing command line
2224
GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo,
@@ -50,8 +52,8 @@
5052

5153
if ($phylo eq "" || $phylo eq "vertebrate") {
5254
print ("In vertebrates section\n");
53-
if ($species eq "Homo sapiens" && $assembly eq "GRCh38") {
54-
print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
55+
if ($species eq "Homo sapiens" || $species eq "Mus musculus") {
56+
print ($species." selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
5557
Bio::EnsEMBL::Registry->load_registry_from_db(
5658
-host => $ENSEMBL_HOST,
5759
-user => $ENSEMBL_USER,

cellbase-app/app/scripts/ensembl-scripts/genome_info.pl

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
####################################################################
1818
## Parsing command line options ####################################
1919
####################################################################
20-
# USAGE: ./genome_info.pl --species "Homo sapiens" --outfile ../../appl_db/ird_v1/hsa ...
20+
##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --assembly GRCm39 --outfile /tmp
21+
22+
# USAGE: ./genome_info.pl --species "Homo sapiens" --assembly GRCh38 --outfile ../../appl_db/ird_v1/hsa ...
2123

2224
## Parsing command line
2325
GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'o|outfile=s' => \$outfile, 'phylo=s' => \$phylo,
@@ -29,7 +31,6 @@
2931

3032
if ($outfile eq "") {
3133
$outfile = "/ensembl-data/genome_info.json";
32-
# $outfile = "/ensembl-data/$species.json";
3334
}
3435

3536
####################################################################
@@ -42,17 +43,13 @@
4243
# Bio::EnsEMBL::Registry->load_all("$ENSEMBL_REGISTRY");
4344
if($phylo eq "" || $phylo eq "vertebrate") {
4445
print ("In vertebrates section\n");
45-
if ($species eq "Homo sapiens" && $assembly eq "GRCh38") {
46-
print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
47-
Bio::EnsEMBL::Registry->load_registry_from_db(
48-
-host => $ENSEMBL_HOST,
49-
-user => $ENSEMBL_USER,
50-
-port => $ENSEMBL_PORT,
51-
-verbose => $verbose
52-
);
53-
} else {
54-
print ("Human selected, assembly ".$assembly." no supported\n");
55-
}
46+
print ("Species: ".$species.", assembly ".$assembly.", connecting to: ".$ENSEMBL_HOST.":".$ENSEMBL_PORT."\n");
47+
Bio::EnsEMBL::Registry->load_registry_from_db(
48+
-host => $ENSEMBL_HOST,
49+
-user => $ENSEMBL_USER,
50+
-port => $ENSEMBL_PORT,
51+
-verbose => $verbose
52+
);
5653
} else {
5754
print ("In no-vertebrates section\n");
5855
Bio::EnsEMBL::Registry->load_registry_from_db(
@@ -64,7 +61,6 @@
6461

6562
my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Slice");
6663
my $karyotype_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "KaryotypeBand");
67-
# my $gene_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Gene");
6864
####################################################################
6965

7066
my %info_stats = ();
@@ -81,12 +77,10 @@
8177
$chromosome{'start'} = int($chrom->start());
8278
$chromosome{'end'} = int($chrom->end());
8379
$chromosome{'size'} = int($chrom->seq_region_length());
84-
# $chromosome{'numberGenes'} = scalar @{$chrom->get_all_Genes()};
8580
$chromosome{'isCircular'} = $chrom->is_circular();
8681

8782
my @cytobands = ();
8883
foreach my $cyto(@{$karyotype_adaptor->fetch_all_by_chr_name($chrom->seq_region_name)}) {
89-
# print $cytoband->name."\n";
9084
my %cytoband = ();
9185
$cytoband{'name'} = $cyto->name();
9286
$cytoband{'start'} = int($cyto->start());
@@ -96,7 +90,7 @@
9690
push(@cytobands, \%cytoband);
9791
}
9892

99-
## check if any cytoband has been added
93+
## Check if any cytoband has been added
10094
## If not, a single cytoband covering the whole chromosome is added.
10195
if(@cytobands == 0) {
10296
my %cytoband = ();
@@ -110,7 +104,6 @@
110104
$chromosome{'cytobands'} = \@cytobands;
111105

112106
push(@chromosomes, \%chromosome);
113-
# push(@chrom_ids, $chrom->seq_region_name);
114107
}
115108
$info_stats{'chromosomes'} = \@chromosomes;
116109

@@ -124,7 +117,6 @@
124117
$supercontig{'start'} = int($supercon->start());
125118
$supercontig{'end'} = int($supercon->end());
126119
$supercontig{'size'} = int($supercon->seq_region_length());
127-
# $supercontig{'numberGenes'} = scalar @{$supercon->get_all_Genes()};
128120
$supercontig{'isCircular'} = $supercon->is_circular();
129121

130122
## A single cytoband covering the whole chromosome is added.
@@ -151,7 +143,7 @@
151143

152144
sub print_parameters {
153145
print "Parameters: ";
154-
print "species: $species, outfile: $outfile, ";
146+
print "species: $species, assembly: $assembly, outfile: $outfile, ";
155147
print "ensembl-registry: $ENSEMBL_REGISTRY, ";
156148
print "ensembl-host: $ENSEMBL_HOST, ensembl-port: $ENSEMBL_PORT, ";
157149
print "ensembl-user: $ENSEMBL_USER, verbose: $verbose, help: $help";
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<!--
2+
~ Copyright 2015-2020 OpenCB
3+
~
4+
~ Licensed under the Apache License, Version 2.0 (the "License");
5+
~ you may not use this file except in compliance with the License.
6+
~ You may obtain a copy of the License at
7+
~
8+
~ http://www.apache.org/licenses/LICENSE-2.0
9+
~
10+
~ Unless required by applicable law or agreed to in writing, software
11+
~ distributed under the License is distributed on an "AS IS" BASIS,
12+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
~ See the License for the specific language governing permissions and
14+
~ limitations under the License.
15+
-->
16+
17+
<MartRegistry>
18+
<MartURLLocation database="ensembl_mart_111" default="1" displayName="Ensembl Genes 111" host="www.ensembl.org" includeDatasets="" martUser="" name="ENSEMBL_MART_ENSEMBL" path="/biomart/martservice" port="80" serverVirtualSchema="default" visible="1" />
19+
</MartRegistry>

0 commit comments

Comments
 (0)