📄 vprivcheckpoint.c
字号:
/*
  MPICH-V
  Copyright (C) 2002, 2003 Groupe Cluster et Grid, LRI, Universite de Paris Sud

  This file is part of MPICH-V.

  MPICH-V is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  MPICH-V is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with MPICH-V; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  $Id: vprivcheckpoint.c,v 1.5 2006/01/24 19:35:01 rodrigue Exp $
*/

#include "config.h"
#include "vprivcheckpoint.h"

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <wait.h>
#include <time.h>
#include <libcr.h>

#include "debug.h"
#include "mpid.h"
#include "mpiddev.h"
#include "mpimem.h"

extern void VPreCheckpoint(void);
extern void VPostCheckpoint(void);

/* Group and rank recorded by vprivcheckpoint_init(); used to build the
 * checkpoint file name. */
static int VGROUP;
static int VRANK;

/* Written by blcr_on_checkpoint() (registered with CR_SIGNAL_CONTEXT) and
 * polled by the checkpoint clone, hence volatile. */
static volatile int blcr_is_restart = 0;

/* Pid of the checkpoint clone currently tracked by on_child(); reset to 0 by
 * the SIGCHLD handler once that clone has been reaped, hence volatile. */
static volatile pid_t cpid;

/* SIGCHLD disposition that was in place before on_child() was installed. */
static struct sigaction soa;

/** Open the BLCR debug log with the given fopen() mode; NULL on failure. */
static FILE *blcr_debug_open(const char *mode)
{
    return fopen("/tmp/debug.log", mode);
}

/** fprintf() to the debug log; silently does nothing when the log is not open. */
static void blcr_debug_printf(FILE *f, const char *fmt, ...)
{
    va_list ap;

    if (f == NULL)
        return;
    va_start(ap, fmt);
    vfprintf(f, fmt, ap);
    va_end(ap);
}

/** Close the debug log if it was successfully opened. */
static void blcr_debug_close(FILE *f)
{
    if (f != NULL)
        fclose(f);
}

/**
 * SIGCHLD handler installed while a checkpoint clone is running.
 *
 * Reaps every terminated child without blocking. When the reaped child is the
 * checkpoint clone, cpid is cleared and the saved SIGCHLD disposition (soa) is
 * put back. For any other child, the saved handler (if it is a real function,
 * i.e. neither SIG_DFL, SIG_IGN nor NULL) is invoked with the arguments this
 * handler received; note that the child has already been reaped at that point.
 *
 * @param s   signal number
 * @param i   signal information, forwarded to the saved handler
 * @param ptr signal context, forwarded to the saved handler
 */
static void on_child(int s, siginfo_t *i, void *ptr)
{
    /* waitpid() sets errno (ECHILD) once no child is left; the interrupted
     * code must not observe that. */
    int saved_errno = errno;
    pid_t p;

    for (;;) {
        p = waitpid(-1, NULL, WNOHANG);
        if (p <= 0)
            break;

        if (p == cpid) {
            cpid = 0;
            sigaction(SIGCHLD, &soa, NULL);
        } else if (soa.sa_flags & SA_SIGINFO) {
            if ((soa.sa_sigaction != (void *)SIG_DFL) &&
                (soa.sa_sigaction != (void *)SIG_IGN) &&
                (soa.sa_sigaction != NULL))
                soa.sa_sigaction(s, i, ptr);
        } else {
            if ((soa.sa_handler != SIG_DFL) &&
                (soa.sa_handler != SIG_IGN) &&
                (soa.sa_handler != NULL))
                soa.sa_handler(s);
        }
    }

    errno = saved_errno;
}

/**
 * BLCR checkpoint callback (registered with CR_SIGNAL_CONTEXT).
 *
 * Calls cr_checkpoint(CR_CHECKPOINT_READY), stores its result in
 * blcr_is_restart and appends a trace to the debug log.
 *
 * NOTE(review): any non-zero result is treated as "restarted"; if
 * cr_checkpoint() reports failure with a negative value, that case is
 * indistinguishable from a restart here -- confirm against the BLCR API.
 * NOTE(review): stdio is used from what is presumably a signal context;
 * verify that this is acceptable for CR_SIGNAL_CONTEXT callbacks.
 *
 * @param arg unused
 * @return always 0
 */
static int blcr_on_checkpoint(void *arg)
{
    FILE *f_debug;

    (void)arg;

    blcr_is_restart = cr_checkpoint(CR_CHECKPOINT_READY);

    f_debug = blcr_debug_open("a");
    blcr_debug_printf(f_debug, "%d call of the callback: %p, %d\n",
                      (int)getpid(), (void *)&blcr_is_restart, blcr_is_restart);
    if (blcr_is_restart)
        blcr_debug_printf(f_debug, "We have been restarted callback\n");
    else
        blcr_debug_printf(f_debug, "We are continuing\n");
    blcr_debug_printf(f_debug, "out of callback : blcr_is_restart = %p, %d\n",
                      (void *)&blcr_is_restart, blcr_is_restart);
    blcr_debug_close(f_debug);

    return 0;
}

/**
 * Record the group/rank of this process and initialize the BLCR library.
 *
 * A cr_init() failure is reported through qerror().
 *
 * @param group group identifier, used in the checkpoint file name
 * @param rank  rank identifier, used in the checkpoint file name
 * @return 0
 */
int vprivcheckpoint_init(int group, int rank)
{
    VGROUP = group;
    VRANK = rank;

    printi("ckpt", "Initializing BLCR driver checkpoint lib for %d:%d\n", VGROUP, VRANK);

    if( cr_init() < 0 )
        qerror("unable to initialize BLCR in MPI application: %s", cr_strerror(errno));

    return 0;
}

/**
 * Body of the forked checkpoint clone.
 *
 * Initializes BLCR, runs VPreCheckpoint(), registers blcr_on_checkpoint() and
 * requests a checkpoint into @p filename. When execution continues after the
 * checkpoint was written, the clone polls cr_status() once per second until
 * BLCR is idle and then terminates with _exit(0). When execution resumes from
 * the checkpoint image (blcr_is_restart set by the callback), the function
 * runs VPostCheckpoint() and returns.
 *
 * @param filename checkpoint file to request from BLCR
 * @return 0, only in a process restarted from the checkpoint image
 */
static int blcr_checkpoint_clone(const char *filename)
{
    FILE *f_debug;
    struct timespec req;

    printi("ckpt", "Child process launched");

    /* let's go for checkpoint */
    printi("ckpt", "let s go for checkpoint lib calls");
    if( cr_init() < 0 ) {
        printe("cr_init failed: %s", cr_strerror(errno));
    } else {
        printi("cr_init", "forked process initiated");
    }

    VPreCheckpoint();

    /* finally we let BLCR proceed to the checkpoint */
    f_debug = blcr_debug_open("w");
    blcr_debug_printf(f_debug, "checkpoint by BLCR\n");
    blcr_debug_printf(f_debug, "call of blcr init done\n");
    cr_register_callback(blcr_on_checkpoint, NULL, CR_SIGNAL_CONTEXT);
    blcr_debug_printf(f_debug, "CKPT_FILENAME = %s\n", filename);
    blcr_debug_close(f_debug);

    cr_request_file(filename);

    f_debug = blcr_debug_open("a");
    blcr_debug_printf(f_debug,
                      "the lib blcr fonctions are called (blcr_is_restart = %p, %d)\n",
                      (void *)&blcr_is_restart, blcr_is_restart);

    /* The restart flag is re-tested on every iteration: if the image was
     * taken while this loop was running, the restarted process resumes here
     * and must not fall through to the _exit() below. */
    while (!blcr_is_restart && cr_status() != CR_STATE_IDLE) {
        blcr_debug_printf(f_debug, "Still checkpointing : status = %d\n", cr_status());
        req.tv_sec = 1;
        req.tv_nsec = 0;
        nanosleep(&req, NULL);
    }

    if (!blcr_is_restart) {
        blcr_debug_printf(f_debug, "This is not a restart: exiting\n");
        blcr_debug_close(f_debug);
        _exit(0);
    }

    blcr_debug_printf(f_debug, "this is a restart\n");
    blcr_is_restart = 0;
    blcr_debug_close(f_debug);

    VPostCheckpoint();
    printi("ckpt", "Restart completed");
    return 0;
}

/**
 * Perform actual checkpoint.
 *
 * Forks a clone of the process; the clone writes the checkpoint image through
 * BLCR and exits. A temporary SIGCHLD handler (on_child) detects the clone's
 * termination and then restores the previous SIGCHLD disposition.
 *
 * @param ckpt_wait non-zero to block until the clone has terminated, zero to
 *                  return while the checkpoint is still running
 * @return -1 on failure (file name too long, signal setup or fork failure),
 *         0 on success; also 0 in a process restarted from the image
 */
int vprivcheckpoint_performcheckpoint(int ckpt_wait)
{
    struct sigaction sa;
    struct sigaction previous;
    sigset_t chld_mask;
    sigset_t saved_mask;
    sigset_t wait_mask;
    pid_t outstanding;
    pid_t pid;
    char filename[256];
    int len;
    int installed;

    len = snprintf(filename, sizeof filename, TMPDIR"/%d:%d.ckpt.pipe", VGROUP, VRANK);
    if (len < 0 || (size_t)len >= sizeof filename) {
        printe("checkpoint file name too long");
        return -1;
    }

    printi("ckpt", "before signal treatment start");

    /* Keep SIGCHLD pending until cpid and soa are consistent: otherwise a
     * clone that dies early could be reaped before cpid holds its pid, and
     * the waiting loop below would never terminate. */
    sigemptyset(&chld_mask);
    sigaddset(&chld_mask, SIGCHLD);
    if (sigprocmask(SIG_BLOCK, &chld_mask, &saved_mask) == -1) {
        printe("sigprocmask SIGCHLD");
        return -1;
    }

    sa.sa_sigaction = on_child;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_SIGINFO;
    if (sigaction(SIGCHLD, &sa, &previous) == -1) {
        printe("sigaction SIGCHLD");
        sigprocmask(SIG_SETMASK, &saved_mask, NULL);
        return -1;
    }

    /* When an earlier, unwaited clone is still running, on_child is already
     * installed; soa must keep the application's disposition in that case. */
    installed = !((previous.sa_flags & SA_SIGINFO) && previous.sa_sigaction == on_child);
    if (installed)
        soa = previous;

    /* generate a clone of the process, close all opened socket,
       write checkpoint image and exit the clone */
    outstanding = cpid;
    pid = fork();
    cpid = pid;

    switch (pid) {
    case 0:
        /* The clone has no children to track: give it back the
         * application's SIGCHLD disposition and signal mask. */
        sigaction(SIGCHLD, &soa, NULL);
        sigprocmask(SIG_SETMASK, &saved_mask, NULL);
        return blcr_checkpoint_clone(filename);

    case -1:
        /* Undo the signal setup so the caller is left as it was. */
        cpid = outstanding;
        if (installed)
            sigaction(SIGCHLD, &soa, NULL);
        sigprocmask(SIG_SETMASK, &saved_mask, NULL);
        return -1;

    default:
        if (ckpt_wait) {
            printi("ckpt","Waiting for clone to checkpoint");
            /* sigsuspend() unblocks SIGCHLD and sleeps atomically, so the
             * handler cannot run between the test of cpid and the sleep. */
            wait_mask = saved_mask;
            sigdelset(&wait_mask, SIGCHLD);
            while (cpid != 0)
                sigsuspend(&wait_mask);
        }
        sigprocmask(SIG_SETMASK, &saved_mask, NULL);
        break;
    }

    printi("ckpt", "checkpoint %s", ckpt_wait?"complete":"running");
    return 0;
}

/**
 * Finalize the checkpoint driver. Only emits a trace message; no resource is
 * released here.
 *
 * @return 0
 */
int vprivcheckpoint_finalize(void)
{
    printi("ckpt", "checkpoint_finalize: pipe closed");
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -