📄 orterun.c
字号:
} /* Wait for the app to complete */ if (wait_for_job_completion) { OPAL_THREAD_LOCK(&orterun_globals.lock); while (!orterun_globals.exit) { opal_condition_wait(&orterun_globals.cond, &orterun_globals.lock); } /* check to see if the job was aborted */ if (ORTE_JOBID_INVALID != jobid && ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) { if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); } /* define the exit state as abnormal by default */ exit_state = ORTE_JOB_STATE_ABORTED; } if (ORTE_JOB_STATE_TERMINATED != exit_state) { /* abnormal termination of some kind */ dump_aborted_procs(jobid); /* If we showed more abort messages than were allowed, show a followup message here */ if (num_aborted > max_display_aborted) { i = num_aborted - max_display_aborted; printf("%d additional process%s aborted (not shown)\n", i, ((i > 1) ? "es" : "")); } if (num_killed > 0) { printf("%d process%s killed (possibly by Open MPI)\n", num_killed, ((num_killed > 1) ? "es" : "")); } } /* Make sure we propagate the exit code */ if (WIFEXITED(orterun_globals.exit_status)) { rc = WEXITSTATUS(orterun_globals.exit_status); } else { /* If a process was killed by a signal, then make the * exit code of orterun be "signo + 128" so that "prog" * and "orterun prog" will both set the same status * value for the shell */ rc = WTERMSIG(orterun_globals.exit_status) + 128; } /* the job is complete - now tell the orteds that it is * okay to finalize and exit, we are done with them * be sure to include any descendants so nothing is * left hanging */ if (ORTE_JOBID_INVALID != jobid) { OBJ_CONSTRUCT(&attributes, opal_list_t); orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) { opal_show_help("help-orterun.txt", "orterun:daemon-die", true, orterun_basename, ORTE_ERROR_NAME(ret)); } while (NULL != (item = opal_list_remove_first(&attributes))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&attributes); } OPAL_THREAD_UNLOCK(&orterun_globals.lock); /* If we were forcibly killed, print a warning that the user may still have some manual cleanup to do. */ if (ORTE_JOBID_INVALID == jobid) { opal_show_help("help-orterun.txt", "orterun:abnormal-exit", true, orterun_basename, orterun_basename); } } }DONE: for (i = 0; i < num_apps; ++i) { OBJ_RELEASE(apps[i]); } free(apps); OBJ_RELEASE(apps_pa); orte_finalize(); free(orterun_basename); return rc;}/* * On abnormal termination - dump the * exit status of the aborted procs. */static void dump_aborted_procs(orte_jobid_t jobid){ char *segment; orte_gpr_value_t** values = NULL; orte_std_cntr_t i, k, num_values = 0; int rc; int32_t exit_status = 0; bool exit_status_set; char *keys[] = { ORTE_PROC_NAME_KEY, ORTE_PROC_LOCAL_PID_KEY, ORTE_PROC_RANK_KEY, ORTE_PROC_EXIT_CODE_KEY, ORTE_NODE_NAME_KEY, NULL }; OPAL_TRACE_ARG1(1, jobid); /* query the job segment on the registry */ if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) { ORTE_ERROR_LOG(rc); return; } rc = orte_gpr.get( ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR, segment, NULL, keys, &num_values, &values ); if(rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); free(segment); return; } for (i = 0; i < num_values; i++) { orte_gpr_value_t* value = values[i]; orte_process_name_t name, *nptr; pid_t pid = 0, *pidptr; orte_std_cntr_t rank = 0, *sptr; bool rank_found=false; char* node_name = NULL; orte_exit_code_t *ecptr; exit_status = 0; exit_status_set = false; for(k=0; k < value->cnt; k++) { orte_gpr_keyval_t* keyval = value->keyvals[k]; if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nptr, keyval->value, ORTE_NAME))) { ORTE_ERROR_LOG(rc); continue; } name = *nptr; continue; } if(strcmp(keyval->key, ORTE_PROC_LOCAL_PID_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pidptr, keyval->value, ORTE_PID))) { ORTE_ERROR_LOG(rc); continue; } pid = *pidptr; continue; } if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); continue; } rank_found = true; rank = *sptr; continue; } if(strcmp(keyval->key, ORTE_PROC_EXIT_CODE_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&ecptr, keyval->value, ORTE_EXIT_CODE))) { ORTE_ERROR_LOG(rc); continue; } exit_status = *ecptr; exit_status_set = true; continue; } if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { node_name = (char*)(keyval->value->data); continue; } } if (rank_found) { if (WIFSIGNALED(exit_status)) { if (9 == WTERMSIG(exit_status)) { ++num_killed; } else { if (num_aborted < max_display_aborted) {#ifdef HAVE_STRSIGNAL if (NULL != strsignal(WTERMSIG(exit_status))) { opal_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", false, orterun_basename, (unsigned long)rank, (unsigned long)pid, node_name, WTERMSIG(exit_status), strsignal(WTERMSIG(exit_status))); } else {#endif opal_show_help("help-orterun.txt", "orterun:proc-aborted", false, orterun_basename, (unsigned long)rank, (unsigned long)pid, node_name, WTERMSIG(exit_status));#ifdef HAVE_STRSIGNAL }#endif } ++num_aborted; } } } /* If we haven't done so already, hold the exit_status so we can return it when exiting. Specifically, keep the first non-zero entry. If they all return zero, we'll return zero. We already have the globals.lock (from job_state_callback), so don't try to get it again. */ if (0 == orterun_globals.exit_status && exit_status_set) { orterun_globals.exit_status = exit_status; } OBJ_RELEASE(value); } if (NULL != values) { free(values); } free(segment);}/* * signal main thread when application completes */static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state){ OPAL_TRACE_ARG2(1, jobid, state); OPAL_THREAD_LOCK(&orterun_globals.lock); /* Note that there's only three states that we're interested in here: TERMINATED: which means that all the processes in the job have completed (normally and/or abnormally). AT_STG1: which means that everyone has hit stage gate 1, so we can do the parallel debugger startup stuff. Remember that the rmgr itself will also be called for the ABORTED state and call the pls.terminate_job, which will result in killing all the other processes. */ if (orte_debug_flag) { opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n", jobid, state); } switch(state) { case ORTE_PROC_STATE_TERMINATED: orterun_globals.exit_status = 0; /* set the exit status to indicate normal termination */ orterun_globals.exit = true; opal_condition_signal(&orterun_globals.cond); break; case ORTE_PROC_STATE_AT_STG1: orte_totalview_init_after_spawn(jobid); break; default: opal_output(0, "orterun: job state callback in unexpected state - jobid %lu, state 0x%04x\n", jobid, state); break; } OPAL_THREAD_UNLOCK(&orterun_globals.lock);}/* * Fail-safe in the event the job hangs and doesn't * cleanup correctly. */static void exit_callback(int fd, short event, void *arg){ OPAL_TRACE(1); /* Remove the TERM and INT signal handlers */ opal_signal_del(&term_handler); opal_signal_del(&int_handler);#ifndef __WINDOWS__ /** Remove the USR signal handlers */ opal_signal_del(&sigusr1_handler); opal_signal_del(&sigusr2_handler);#endif /* __WINDOWS__ */ /* Trigger the normal exit conditions */ orterun_globals.exit = true; orterun_globals.exit_status = 1; opal_condition_signal(&orterun_globals.cond);}/* * Attempt to terminate the job and wait for callback indicating * the job has been aborted. */typedef enum { ABORT_SIGNAL_FIRST, ABORT_SIGNAL_PROCESSING, ABORT_SIGNAL_WARNED, ABORT_SIGNAL_DONE} abort_signal_state_t;static void abort_signal_callback(int fd, short flags, void *arg){ int ret; opal_event_t* event; opal_list_t attrs; opal_list_item_t *item; static abort_signal_state_t state=ABORT_SIGNAL_FIRST; static struct timeval invoked, now; double a, b; OPAL_TRACE(1); /* If this whole process has already completed, then bail */ switch (state) { case ABORT_SIGNAL_FIRST: /* This is the first time through */ state = ABORT_SIGNAL_PROCESSING; break; case ABORT_SIGNAL_WARNED: gettimeofday(&now, NULL); a = invoked.tv_sec * 1000000 + invoked.tv_usec; b = now.tv_sec * 1000000 + invoked.tv_usec; if (b - a <= 1000000) { if (!orterun_globals.quiet){ fprintf(stderr, "%s: forcibly killing job...\n", orterun_basename); } /* tell the pls to cancel the terminate request - * obviously, something is wrong at this point */ if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) { ORTE_ERROR_LOG(ret); } /* We are in an event handler; exit_callback() will delete the handler that is currently running (which is a Bad Thing), so we can't call it directly. Instead, we have to exit this handler and setup to call exit_handler() after this. */ if (NULL != (event = (opal_event_t*) malloc(sizeof(opal_event_t)))) { opal_evtimer_set(event, exit_callback, NULL); now.tv_sec = 0; now.tv_usec = 0; opal_evtimer_add(event, &now); state = ABORT_SIGNAL_DONE; } return; } /* Otherwise fall through to PROCESSING and warn again */ case ABORT_SIGNAL_PROCESSING: opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", true, orterun_basename, orterun_basename, orterun_basename); gettimeofday(&invoked, NULL); state = ABORT_SIGNAL_WARNED; return; case ABORT_SIGNAL_DONE: /* Nothing to do -- return */ return; } if (!orterun_globals.quiet){ fprintf(stderr, "%s: killing job...\n\n", orterun_basename); } /* terminate the job - this will also wakeup orterun so * it can kill all the orteds. Be sure to kill all the job's * descendants, if any, so nothing is left hanging */ if (jobid != ORTE_JOBID_INVALID) { OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs); while (NULL != (item = opal_list_remove_first(&attrs))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&attrs); if (ORTE_SUCCESS != ret) { /* If we failed the terminate_job() above, then the condition variable in the main loop in orterun won't wake up. So signal it. */ if (NULL != (event = (opal_event_t*) malloc(sizeof(opal_event_t)))) { opal_evtimer_set(event, exit_callback, NULL); now.tv_sec = 0; now.tv_usec = 0; opal_evtimer_add(event, &now); } else { /* We really don't want to do this, but everything else has failed... */ orterun_globals.exit = true; orterun_globals.exit_status = 1; opal_condition_signal(&orterun_globals.cond); } jobid = ORTE_JOBID_INVALID; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -