Skip to content

Commit 445210c

Browse files
karya0gc00
authored and committed
Simplified mtcp-restart plugin interface.
This uses the newly proposed environment variable based mechanism.
1 parent 6ac5406 commit 445210c

3 files changed

Lines changed: 73 additions & 59 deletions

File tree

restart_plugin/dmtcp_restart_plugin.cpp

Lines changed: 10 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include "workerstate.h"
22
#include "dmtcp_restart.h"
33
#include "jassert.h"
4+
#include "jconvert.h"
45
#include "jfilesystem.h"
56
#include "util.h"
67

@@ -39,30 +40,21 @@ void dmtcp_restart_plugin(const string &restartDir,
3940
// Also, create the DMTCP shared-memory area.
4041
t->initialize();
4142

42-
vector<char *> mtcpArgs = getMtcpArgs();
43-
mtcpArgs.push_back((char *)"--mpi");
44-
45-
const map<string, string> &kvmap = t->getKeyValueMap();
46-
47-
mtcpArgs.push_back((char*) "--minLibsStart");
48-
mtcpArgs.push_back((char*) kvmap.at("MANA_MinLibsStart").c_str());
49-
50-
mtcpArgs.push_back((char*) "--maxLibsEnd");
51-
mtcpArgs.push_back((char*) kvmap.at("MANA_MaxLibsEnd").c_str());
52-
53-
mtcpArgs.push_back((char*) "--minHighMemStart");
54-
mtcpArgs.push_back((char*) kvmap.at("MANA_MinHighMemStart").c_str());
43+
publishKeyValueMapToMtcpEnvironment(t);
5544

5645
if (!restartDir.empty()) {
57-
mtcpArgs.push_back((char *)"--restartdir");
58-
mtcpArgs.push_back((char *)restartDir.c_str());
46+
setenv("MANA_RestartDir", restartDir.c_str(), 1);
5947
}
6048

61-
for (const string &image : ckptImages) {
62-
mtcpArgs.push_back((char*) image.c_str());
49+
for (size_t i = 0; i < ckptImages.size(); i++) {
50+
string key = "MANA_CkptImage_Rank_" + jalib::XToString(i);
51+
setenv(key.c_str(), ckptImages[i].c_str(), 1);
6352
}
6453

54+
vector<char *> mtcpArgs = getMtcpArgs();
55+
mtcpArgs.push_back((char *)"--mpi");
56+
6557
mtcpArgs.push_back(NULL);
6658
execvp(mtcpArgs[0], &mtcpArgs[0]);
6759
JASSERT(false)(mtcpArgs[0]).Text("execvp failed!");
68-
}
60+
}

restart_plugin/mtcp_restart_plugin.c

Lines changed: 61 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@
3636
// Remove this FIXME comment.
3737
// Create a new MANA PR from this, and the 'struct PluginInfo' field..
3838

39-
#define HAS_MAP_FIXED_NOREPLACE LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
39+
#define HAS_MAP_FIXED_NOREPLACE LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
4040

4141
// Using both methods to skip unmapping lower-half mmap'ed regions
4242
// In theory, just one of these techniques should suffice.
@@ -48,13 +48,19 @@
4848

4949
NO_OPTIMIZE
5050
char*
51-
getCkptImageByRank(int rank, char **argv)
51+
getCkptImageByRank(int rank, char **environ)
5252
{
53-
char *fname = NULL;
54-
if (rank >= 0) {
55-
fname = argv[rank];
53+
if (rank < 0) {
54+
return NULL;
5655
}
57-
return fname;
56+
57+
char *fname = NULL;
58+
char envKey[64] = {0};
59+
char rankStr[20] = {0};
60+
mtcp_itoa(rankStr, rank);
61+
mtcp_strcpy(envKey, "MANA_CkptImage_Rank_");
62+
mtcp_strncat(envKey, rankStr, mtcp_strlen(rankStr));
63+
return mtcp_getenv(envKey, environ);
5864
}
5965

6066
static inline int
@@ -191,17 +197,17 @@ int my_memcmp(const void *buffer1, const void *buffer2, size_t len) {
191197

192198
// FIXME: Many style rules broken. Code never reviewed by skilled programmer.
193199
int getCkptImageByDir(RestoreInfo *rinfo, char *buffer, size_t buflen, int rank) {
194-
if(!rinfo->restartDir) {
200+
if(!rinfo->pluginInfo.restartDir) {
195201
MTCP_PRINTF("***ERROR No restart directory found - cannot find checkpoint image by directory!");
196202
return -1;
197203
}
198204

199-
size_t len = mtcp_strlen(rinfo->restartDir);
205+
size_t len = mtcp_strlen(rinfo->pluginInfo.restartDir);
200206
if(len >= buflen){
201207
MTCP_PRINTF("***ERROR Restart directory would overflow given buffer!");
202208
return -1;
203209
}
204-
mtcp_strcpy(buffer, rinfo->restartDir); // start with directory
210+
mtcp_strcpy(buffer, rinfo->pluginInfo.restartDir); // start with directory
205211

206212
// ensure directory ends with /
207213
if(buffer[len - 1] != '/') {
@@ -437,6 +443,36 @@ bool is_overlap(char *start1, char *end1, char *start2, char *end2) {
437443
return end1 >= start2 || end2 >= start1;
438444
}
439445

446+
void populate_plugin_info(RestoreInfo *rinfo)
447+
{
448+
// FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should
449+
// directly write to rinfo->pluginInfo, and we won't
450+
// need to do this extra copy, here.
451+
// Copy the upper-half info to rinfo->pluginInfo
452+
453+
const char *minLibsStartStr = mtcp_getenv("MANA_MinLibsStart", rinfo->environ);
454+
if (minLibsStartStr != NULL) {
455+
rinfo->pluginInfo.minLibsStart = (char*) mtcp_strtoll(minLibsStartStr);
456+
}
457+
458+
const char *maxLibsEndStr = mtcp_getenv("MANA_MaxLibsEnd", rinfo->environ);
459+
if (maxLibsEndStr != NULL) {
460+
rinfo->pluginInfo.maxLibsEnd = (char*) mtcp_strtoll(maxLibsEndStr);
461+
}
462+
463+
const char *minHighMemStartStr = mtcp_getenv("MANA_MinHighMemStart", rinfo->environ);
464+
if (minHighMemStartStr != NULL) {
465+
rinfo->pluginInfo.minHighMemStart = (char*) mtcp_strtoll(minHighMemStartStr);
466+
}
467+
468+
const char *maxHighMemEndStr = mtcp_getenv("MANA_MaxHighMemEnd", rinfo->environ);
469+
if (maxHighMemEndStr != NULL) {
470+
rinfo->pluginInfo.maxHighMemEnd = (char*) mtcp_strtoll(maxHighMemEndStr);
471+
}
472+
473+
rinfo->pluginInfo.restartDir = mtcp_getenv("MANA_RestartDir", rinfo->environ);
474+
}
475+
440476
#ifdef SINGLE_CART_REORDER
441477
int
442478
load_cartesian_properties(char *filename, CartesianProperties *cp)
@@ -513,6 +549,8 @@ get_rank_corresponding_to_coordinates(int comm_old_size, int ndims, int *coords)
513549
void
514550
mtcp_plugin_hook(RestoreInfo *rinfo)
515551
{
552+
populate_plugin_info(rinfo);
553+
516554
remap_vdso_and_vvar_regions(rinfo);
517555
mysetauxval(rinfo->environ, AT_SYSINFO_EHDR,
518556
(unsigned long int) rinfo->currentVdsoStart);
@@ -535,16 +573,6 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
535573
// when calling mtcp:restorememoryareas().
536574
rinfo->pluginInfo.lh_info_addr = lh_info_addr;
537575

538-
// FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should
539-
// directly write to rinfo->pluginInfo, and we won't
540-
// need to do this extra copy, here.
541-
// Copy the upper-half info to rinfo->pluginInfo
542-
rinfo->pluginInfo.minLibsStart = rinfo->minLibsStart;
543-
rinfo->pluginInfo.maxLibsEnd = rinfo->maxLibsEnd;
544-
rinfo->pluginInfo.minHighMemStart = rinfo->minHighMemStart;
545-
rinfo->pluginInfo.maxHighMemEnd = rinfo->maxHighMemEnd;
546-
rinfo->pluginInfo.restartDir = rinfo->restartDir;
547-
548576
// Reserve first 500 file descriptors for the Upper-half
549577
int reserved_fds[500];
550578
int total_reserved_fds;
@@ -560,18 +588,18 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
560588
*/
561589
{
562590
// minLibsStart was chosen with extra space below, for future libs, mmap.
563-
start1 = rinfo->minLibsStart; // first lib of upper half
591+
start1 = rinfo->pluginInfo.minLibsStart; // first lib of upper half
564592
// Either lh_info_addr->memRange is a region between 1 GB and 2 GB below
565593
// the end of stack in the lower half; or else it is at an unusual
566594
// address for which we hope there is no address conflict.
567595
// The latter holds if USE_LH_FIXED_ADDRESS was defined in
568596
// mtcp_split_process.c, in both restart_plugin and mpi-proxy-split dirs.
569-
end1 = rinfo->maxLibsEnd;
597+
end1 = rinfo->pluginInfo.maxLibsEnd;
570598

571599
// Reserve 8MB above min high memory region. That should include space for
572600
// stack, argv, env, auxvec.
573-
start2 = rinfo->minHighMemStart - 1 * GB; // Allow for stack to grow
574-
end2 = rinfo->minHighMemStart + 8 * MB;
601+
start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow
602+
end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB;
575603
// Ignore region start2:end2 if it is overlapped with region start1:end1
576604
if (is_overlap(start1, end1, start2, end2)) {
577605
if (end1 < end2) { end1 = end2; }
@@ -683,7 +711,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
683711
char *filename = "./ckpt_rank_0/cartesian.info";
684712

685713
char full_filename[PATH_MAX];
686-
set_header_filepath(full_filename, rinfo->restartDir);
714+
set_header_filepath(full_filename, rinfo->pluginInfo.restartDir);
687715
ManaHeader m_header;
688716
MTCP_ASSERT(load_mana_header(full_filename, &m_header) == 0);
689717

@@ -732,7 +760,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
732760
ckpt_image_rank_to_be_restored) == -1) {
733761
mtcp_strncpy(
734762
rinfo->ckptImage,
735-
getCkptImageByRank(ckpt_image_rank_to_be_restored, rinfo->argv),
763+
getCkptImageByRank(ckpt_image_rank_to_be_restored, rinfo->environ),
736764
PATH_MAX);
737765
}
738766

@@ -746,6 +774,8 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
746774
void
747775
mtcp_plugin_hook(RestoreInfo *rinfo)
748776
{
777+
populate_plugin_info(rinfo);
778+
749779
remap_vdso_and_vvar_regions(rinfo);
750780
mysetauxval(rinfo->environ, AT_SYSINFO_EHDR,
751781
(unsigned long int) rinfo->currentVdsoStart);
@@ -768,15 +798,6 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
768798
// when calling mtcp:restorememoryareas().
769799
rinfo->pluginInfo.lh_info_addr = lh_info_addr;
770800

771-
// FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should
772-
// directly write to rinfo->pluginInfo, and we won't
773-
// need to do this extra copy, here.
774-
// Copy the upper-half info to rinfo->pluginInfo
775-
rinfo->pluginInfo.minLibsStart = rinfo->minLibsStart;
776-
rinfo->pluginInfo.maxLibsEnd = rinfo->maxLibsEnd;
777-
rinfo->pluginInfo.minHighMemStart = rinfo->minHighMemStart;
778-
rinfo->pluginInfo.restartDir = rinfo->restartDir;
779-
780801
// Reserve first 500 file descriptors for the Upper-half
781802
int reserved_fds[500];
782803
int total_reserved_fds;
@@ -792,18 +813,18 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
792813
*/
793814
{
794815
// minLibsStart was chosen with extra space below, for future libs, mmap.
795-
start1 = rinfo->minLibsStart; // first lib of upper half
816+
start1 = rinfo->pluginInfo.minLibsStart; // first lib of upper half
796817
// Either lh_info_addr->memRange is a region between 1 GB and 2 GB below
797818
// the end of stack in the lower half; or else it is at an unusual
798819
// address for which we hope there is no address conflict.
799820
// The latter holds if USE_LH_FIXED_ADDRESS was defined in
800821
// mtcp_split_process.c, in both restart_plugin and mpi-proxy-split dirs.
801-
end1 = rinfo->maxLibsEnd;
822+
end1 = rinfo->pluginInfo.maxLibsEnd;
802823

803824
// Reserve 8MB above min high memory region. That should include space for
804825
// stack, argv, env, auxvec.
805-
start2 = rinfo->minHighMemStart - 1 * GB; // Allow for stack to grow
806-
end2 = rinfo->minHighMemStart + 8 * MB;
826+
start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow
827+
end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB;
807828
// Ignore region start2:end2 if it is overlapped with region start1:end1
808829
if (is_overlap(start1, end1, start2, end2)) {
809830
if (end1 < end2) { end1 = end2; }
@@ -909,7 +930,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
909930
# endif
910931

911932
char full_filename[PATH_MAX];
912-
set_header_filepath(full_filename, rinfo->restartDir);
933+
set_header_filepath(full_filename, rinfo->pluginInfo.restartDir);
913934
ManaHeader m_header;
914935
MTCP_ASSERT(load_mana_header(full_filename, &m_header) == 0);
915936

@@ -926,7 +947,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo)
926947

927948
if (getCkptImageByDir(rinfo, rinfo->ckptImage, 512, rank) == -1) {
928949
mtcp_strncpy(rinfo->ckptImage,
929-
getCkptImageByRank(rank, rinfo->argv),
950+
getCkptImageByRank(rank, rinfo->environ),
930951
PATH_MAX);
931952
}
932953

restart_plugin/mtcp_split_process.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,8 @@ splitProcess(RestoreInfo *rinfo)
9595
// We then kill the child process.
9696
ret = read_lh_proxy_bits(rinfo, childpid, rinfo->argv[0]);
9797
// FIXME: We should use lh_info_addr, in place of rinfo->pluginInfo
98-
mtcp_memcpy(&rinfo->pluginInfo, lh_info_addr, sizeof(*lh_info_addr));
98+
rinfo->pluginInfo.lh_info_addr = lh_info_addr;
99+
99100
mtcp_sys_kill(childpid, SIGKILL);
100101
mtcp_sys_wait4(childpid, NULL, 0, NULL);
101102
}

0 commit comments

Comments
 (0)