📄 mpiexec.c
字号:
* interface. The global variables are declared static so that they'll * have the correct scope if we remove these routines from the mpiexec.c file. */struct groupentry { int groupsize; int num_in_barrier; char kvsname[MAXNAMELEN];};static struct groupentry grouptable[MAXGROUPS];static int nextnewgroup = 0;struct pair { char key[MAXKEYLEN]; char val[MAXVALLEN];};struct kvs { int active; char kvsname[MAXNAMELEN]; struct pair pairs[MAXPAIRS];};static struct kvs kvstable[MAXKVSS];static int kvsid;/* * The Forker implementation of the PMI routines have names * fPMI_xxx *//* * Perform any initialization. * Input * nprocs - Initial number of processes to create (size of initial group) * Output * kvsname is the initial kvs name (provide a string of size MAXNAMELEN. * Return value: groupid */int fPMI_Init( int nprocs, char kvsname[] ){ int i; for ( i = 0; i < MAXKVSS; i++ ) kvstable[i].active = 0; /* set up group */ grouptable[nextnewgroup].groupsize = nprocs; /* set up keyval space for this group */ fPMI_Allocate_kvs( &kvsid, kvsname ); return nextnewgroup;}/* kvsname is output */int fPMI_Allocate_kvs( int *kvsid, char kvsname[] ){ int i, j; for ( i = 0; i < MAXKVSS; i++ ) if ( !kvstable[i].active ) break; if ( i >= MAXKVSS ) { fprintf( stderr, "too many kvs's\n" ); return( -1 ); } else { kvstable[i].active = 1; for ( j = 0; j < MAXPAIRS; j++ ) { kvstable[i].pairs[j].key[0] = '\0'; kvstable[i].pairs[j].val[0] = '\0'; } snprintf( kvstable[i].kvsname, MAXNAMELEN, "kvs_%d", i ); strncpy( kvsname, kvstable[i].kvsname, MAXNAMELEN ); *kvsid = i; return( 0 ); }}int fPMI_Allocate_kvs_group( void ){ return nextnewgroup++;}/* * Handle an incoming "barrier" command */void fPMI_Handle_barrier( int idx ){ int i; grouptable[fdtable[idx].group].num_in_barrier++; if ( grouptable[fdtable[idx].group].num_in_barrier == grouptable[fdtable[idx].group].groupsize ) { for ( i = 0; i < MAXFDENTRIES; i++ ) { if ( fdtable[i].active && fdtable[i].group == fdtable[idx].group ) PMIU_writeline(fdtable[i].fd, "cmd=barrier_out\n" ); } grouptable[fdtable[idx].group].num_in_barrier = 0; }}/* * Handle an incoming "create_kvs" command */void fPMI_Handle_create_kvs( int idx ){ int kvsidx; char kvsname[MAXNAMELEN], outbuf[PMIU_MAXLINE]; fPMI_Allocate_kvs( &kvsidx, kvsname ); snprintf( outbuf, PMIU_MAXLINE, "cmd=newkvs kvsname=%s\n", kvsname ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* * Handle an incoming "destroy_kvs" command */void fPMI_Handle_destroy_kvs( int idx ){ int i, rc=0; char kvsname[MAXNAMELEN]; char message[PMIU_MAXLINE], outbuf[PMIU_MAXLINE]; PMIU_getval( "kvsname", kvsname, MAXNAMELEN ); for ( i = 0; i < MAXKVSS; i++ ) { if ( strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) == 0 ) { if ( kvstable[i].active ) { kvstable[i].active = 0; snprintf( message, PMIU_MAXLINE, "KVS_%s_successfully_destroyed", kvsname ); rc = 0; } else { snprintf( message, PMIU_MAXLINE, "KVS_%s_previously_destroyed", kvsname ); rc = -1; } break; } } if ( i == MAXKVSS ) { rc = -1; snprintf( message, PMIU_MAXLINE, "KVS %s not found", kvsname ); } snprintf( outbuf, PMIU_MAXLINE, "cmd=kvs_destroyed rc=%d msg=%s\n", rc, message ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* * Handle an incoming "put" command */void fPMI_Handle_put( int idx ){ int i, j, rc=0; char kvsname[MAXNAMELEN]; char message[PMIU_MAXLINE], outbuf[PMIU_MAXLINE]; char key[MAXKEYLEN]; PMIU_getval( "kvsname", kvsname, MAXNAMELEN ); for ( i = 0; i < MAXKVSS; i++ ) { if ( kvstable[i].active && strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) == 0 ) { /* should check here for duplicate key and raise error */ PMIU_getval( "key", key, MAXKEYLEN ); for ( j = 0; j < MAXPAIRS; j++ ) { if ( strncmp( kvstable[i].pairs[j].key, key, MAXKEYLEN ) == 0 ) { rc = -1; /* no duplicate keys allowed */ snprintf( message, PMIU_MAXLINE, "duplicate_key %s", key ); break; } else if ( strncmp( kvstable[i].pairs[j].key, "", MAXKEYLEN ) == 0 ) { PMIU_getval( "key", kvstable[i].pairs[j].key, MAXKEYLEN ); PMIU_getval( "value", kvstable[i].pairs[j].val, MAXVALLEN ); rc = 0; strncpy( message, "success", PMIU_MAXLINE ); break; } } if ( j == MAXPAIRS ) { rc = -1; snprintf( message, PMIU_MAXLINE, "no_room_in_kvs_%s", kvsname ); } } break; } if ( i == MAXKVSS ) { rc = -1; snprintf( message, PMIU_MAXLINE, "kvs_%s_not_found", kvsname ); } snprintf( outbuf, PMIU_MAXLINE, "cmd=put_result rc=%d msg=%s\n", rc, message ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* * Handle incoming "get" command */void fPMI_Handle_get( int idx ){ int i, j, rc=0; char kvsname[MAXNAMELEN]; char message[PMIU_MAXLINE], key[PMIU_MAXLINE], value[PMIU_MAXLINE]; char outbuf[PMIU_MAXLINE]; PMIU_getval( "kvsname", kvsname, MAXNAMELEN ); for ( i = 0; i < MAXKVSS; i++ ) { if ( kvstable[i].active && strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) == 0 ) { PMIU_getval( "key", key, PMIU_MAXLINE ); for ( j = 0; j < MAXPAIRS; j++ ) { if ( strncmp( kvstable[i].pairs[j].key, key, MAXKEYLEN ) == 0 ) { rc = 0; strncpy( message, "success", PMIU_MAXLINE ); strncpy( value, kvstable[i].pairs[j].val, PMIU_MAXLINE ); break; } } if ( j == MAXPAIRS ) { rc = -1; strncpy( value, "unknown", PMIU_MAXLINE ); snprintf( message, PMIU_MAXLINE, "key_%s_not_found", kvsname ); } } break; } if ( i == MAXKVSS ) { rc = -1; strncpy( value, "unknown", PMIU_MAXLINE ); snprintf( message, PMIU_MAXLINE, "kvs_%s_not_found", kvsname ); } snprintf( outbuf, PMIU_MAXLINE, "cmd=get_result rc=%d msg=%s value=%s\n", rc, message, value ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* Handle an incoming get_my_kvsname command */void fPMI_Handle_get_my_kvsname( int idx ){ char outbuf[PMIU_MAXLINE]; snprintf( outbuf, PMIU_MAXLINE, "cmd=my_kvsname kvsname=%s\n", fdtable[idx].kvsname ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* Handle an incoming "get_maxes" command */void fPMI_Handle_get_maxes( int idx ){ char outbuf[PMIU_MAXLINE]; snprintf( outbuf, PMIU_MAXLINE, "cmd=maxes kvsname_max=%d keylen_max=%d vallen_max=%d\n", MAXNAMELEN, MAXKEYLEN, MAXVALLEN ); PMIU_writeline( fdtable[idx].fd, outbuf );}/* * Handle incoming "getbyidx" command */void fPMI_Handle_getbyidx( int idx ){ int i, j; char kvsname[MAXNAMELEN], j_char[8], outbuf[PMIU_MAXLINE]; PMIU_getval( "kvsname", kvsname, MAXNAMELEN ); for ( i = 0; i < MAXKVSS; i++ ) { if ( kvstable[i].active && strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) == 0 ) { PMIU_getval( "idx", j_char, 8 ); j = atoi( j_char ); if ( ( j > MAXPAIRS ) || strncmp( kvstable[i].pairs[j].key, "", MAXKEYLEN ) == 0 ) { snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results rc=-1 " "reason=no_more_keyvals\n" ); } else { snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results " "rc=0 nextidx=%d key=%s val=%s\n", j + 1, kvstable[i].pairs[j].key, kvstable[i].pairs[j].val ); } } break; } if ( i == MAXKVSS ) { snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results rc=-1 " "reason=kvs_%s_not_found\n", kvsname ); } PMIU_writeline( fdtable[idx].fd, outbuf );}/* #undef HAVE_PTRACE */#if defined(HAVE_PTRACE) && defined(HAVE_PTRACE_CONT) #include <sys/ptrace.h>/* * Ptrace to control execution for handling failures. * Ptrace causes the process to stop on any signal (except SIGKILL). * fork(); * IGNORE = 0; * ptrace( PTRACE_TRACEME, IGNORE, IGNORE); * exec... * * The parent can use * ptrace( PTRACE_CONT, pid, 0 ); * to cause the process to continue. PTRACE_KILL to kill, * PTRACE_ATTACH and DETACH for processes not started with TRACEME. * * wait returns status: * WIFSTOPPED * WSTOPSIG * option value of WUNTRACED * * When using this option, it may be necessary to timeout the select * on PMI messages more often, perform a nonblocking wait on processes * and look for ones that are stopped. * * Functions to write: * CheckForStopped - Checks for a stopped process and executes * the requested routine (which probably executes a simple command) * * RunOnStopped - Runs a command on the stopped process * */#define MAX_COMMAND_LEN 1024char commandOnStopped[MAX_COMMAND_LEN];int onStopped = 0;/* Set the default command */void SetDefaultCommandOnStopped( void ){ char *p = getenv( "MPIEXEC_ONSIG" ); if (p) strcpy( commandOnStopped, p );}/* Eventually allow the polling interval to be set by environment/cmdline */int InitHandleStopped( void ) { return 1; } /* Make 10 for general use *//* Set the command to be used on stopped processes */void SetCommandOnStopped( const char cmd[] ){ strncpy( commandOnStopped, cmd, MAX_COMMAND_LEN ); onStopped = 1; /* Check for special cases */ if (strncmp( commandOnStopped, "traceback", 9 ) == 0) { /* FIXME: gdb only reads command from a file! */ strcpy( commandOnStopped, "gdb -batch -n -x gettb %e %p" ); } else if (strncmp( commandOnStopped, "gdb", 3 ) == 0) { strcpy( commandOnStopped, "xterm -e \"gdb %e %p\"" ); }}/* * Run the specified command on the given pid. The following sequences * are handled specially: * %e - replace with the name of the executable * %p - replace with the pid of the process * e.g., the command * xterm -e "gdb %e %p" & * runs an xterm that runs gdb on the stopped process, in the background. */void RunOnStopped( const char execname[], pid_t pid ) { char c; char fullcommand[MAX_COMMAND_LEN+1]; char *pout, *pin; /* Form the full command from the command string */ pout = fullcommand; pin = commandOnStopped; while ((c = *pin++) != 0 && (pout - fullcommand) < MAX_COMMAND_LEN) { if (c == '%') { if (*pin == 'e') { char *eptr = execname; pin++; /* Replace with the executable name */ while (*eptr && (pout - fullcommand) < MAX_COMMAND_LEN) { *pout++ = *eptr++; } } else if (*pin == 'p') { char pidchars[12], *pptr = pidchars; pin++; /* Replace with the pid */ sprintf( pidchars, "%d", (int)pid ); while (*pptr && (pout - fullcommand) < MAX_COMMAND_LEN) { *pout++ = *pptr++; } } else { *pout++ = c; *pout++ = *pin++; } } else { *pout++ = c; } } if (pout - fullcommand >= MAX_COMMAND_LEN) { /* Error - command is too long */ return; } /* Add trailing null */ *pout = 0; /* Run this command string in the background and orphaned, but with stdout still directed to us */ /* FIXME: system isn't robust enough for what we want */ printf( "Running %s\n", fullcommand ); /* We need to detach before we can run a command that itself wishes to use ptrace. There isn't a good way to do this, but we try using PTRACE_DETACH. What we do use is SIGTSTP, which will often leave the process stopped so that the next command can find it. */ ptrace( PTRACE_DETACH, pid, 0, SIGTSTP ); system( fullcommand ); /* We could re-attach the process here. If we don't, we can no longer wait on the process. Instead, we might reattach but turn off the handling of events. */ /* ptrace( PTRACE_ATTACH, pid, 0, 0 ); */}/* See if we want to set ptrace for this process. Putting this into a routine allows us to have more complex criteria */void CheckIfTraced( void ){ int rc; if (onStopped) { rc = ptrace( PTRACE_TRACEME, 0, 0, 0 ); if (rc < 0) { perror( "Error from ptrace(PTRACE_TRACEME):" ); } }}void CheckForStopped( const char execname[] ){ pid_t pid; int sig; int client_stat; /* ? WUNTRACED */ while (1) { pid = waitpid( -1, &client_stat, WNOHANG ); if (!pid) return; /* no stopped process */ if (WIFSTOPPED(client_stat)) { sig = WSTOPSIG(client_stat); if (sig == SIGTRAP) { /* Ignore these signals */ ptrace( PTRACE_CONT, pid, 0, 0 ); } else if (onStopped) { /*printf( "Signal is %d %s\n", sig, strsignal(sig) );*/ /* FIXME: Find this pid in the list of processes; get the executable name */ RunOnStopped( execname, pid ); } } else { /* Handle a process exit */ /* FIXME: look up pid and see if the process has finalized */ HandleWaitStatus( pid, client_stat, NORMAL, 0 ); num_exited++; } }}void KillTracedProcesses( void ){ int i; pid_t pid; for (i=0; i<=maxfdentryInUse; i++) { if (fdtable[i].active) { pid = fdtable[i].pid; if (pid > 0) { ptrace( PTRACE_KILL, pid, 0, 0 ); } } }}#else/* Dummy routines if ptrace is not available */int InitHandleStopped( void ) { return 0; }void SetDefaultCommandOnStopped( void ) {}void CheckIfTraced( void ) {}void CheckForStopped( const char cmd[] ) {}void SetCommandOnStopped( const char cmd[] ) {}void KillTracedProcesses( void ){}#endif/* * We should set up error messages so that they are of two flavors: * developer and user. E.g., the developer message might * include perror output and a terse message such as * "fork failed" while the user message might be more like * "Unable to create processes; check the total number of processes" */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -