📄 odls_default_module.c
字号:
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_NODE_NAME_KEY, ORTE_STRING, node->nodename))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(ndat); OBJ_RELEASE(value); return rc; } if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, value))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(ndat); OBJ_RELEASE(values[0]); return rc; } ndat->cnt += 1; } } *data = ndat; return ORTE_SUCCESS;}static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_status){ time_t end; pid_t ret;#if !defined(HAVE_SCHED_YIELD) struct timeval t; fd_set bogus;#endif end = time(NULL) + timeout; do { ret = waitpid(pid, exit_status, WNOHANG); if (pid == ret) { /* It died -- return success */ return true; } else if (-1 == ret && ECHILD == errno) { /* The pid no longer exists, so we'll call this "good enough for government work" */ return true; }#if defined(HAVE_SCHED_YIELD) sched_yield();#else /* Bogus delay for 1 usec */ t.tv_sec = 0; t.tv_usec = 1; FD_ZERO(&bogus); FD_SET(0, &bogus); select(1, &bogus, NULL, NULL, &t);#endif } while (time(NULL) < end); /* The child didn't die, so return false */ return false;}int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state){ orte_odls_child_t *child; opal_list_item_t *item; int rc, exit_status; opal_list_t procs_killed; orte_namelist_t *proc; OBJ_CONSTRUCT(&procs_killed, opal_list_t); opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job); /* since we are going to be working with the global list of * children, we need to protect that list from modification * by other threads */ OPAL_THREAD_LOCK(&orte_odls_default.mutex); for (item = opal_list_get_first(&orte_odls_default.children); item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process 
[%ld,%ld,%ld]", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name)); /* is this process alive? if not, then nothing for us * to do to it */ if (!child->alive) { opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child is not alive", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME)); continue; } /* do we have a child from the specified job? Because the * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { continue; } /* de-register the SIGCHILD callback for this pid */ orte_wait_cb_cancel(child->pid); /* Send a sigterm to the process. If we get ESRCH back, that means the process is already dead, so just move on. */ if (0 != kill(child->pid, SIGTERM) && ESRCH != errno) { int err = errno; opal_show_help("help-odls-default.txt", "odls-default:could-not-send-kill", true, orte_system_info.nodename, child->pid, err); goto MOVEON; } /* The kill succeeded. Wait up to timeout_before_sigkill seconds to see if it died. 
*/ if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) { /* try killing it again */ kill(child->pid, SIGKILL); /* Double check that it actually died this time */ if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) { opal_show_help("help-odls-default.txt", "odls-default:could-not-kill", true, orte_system_info.nodename, child->pid); } } MOVEON: /* set the process to "not alive" */ child->alive = false; /* add this proc to the local list */ proc = OBJ_NEW(orte_namelist_t); if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(proc->name), child->name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); return rc; } opal_list_append(&procs_killed, &proc->item); } /* we are done with the global list, so we can now release * any waiting threads - this also allows any callbacks to work */ opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); /* deconstruct the local list and update the process states on the registry, if indicated */ while (NULL != (item = opal_list_remove_first(&procs_killed))) { proc = (orte_namelist_t*)item; if (set_state) { if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(proc->name, ORTE_PROC_STATE_TERMINATED, exit_status))) { ORTE_ERROR_LOG(rc); /* don't exit out even if this didn't work - we still might need to kill more * processes, so just keep trucking */ } } OBJ_RELEASE(proc); } OBJ_DESTRUCT(&procs_killed); return ORTE_SUCCESS;}/* * Wait for a callback indicating the child has completed. 
*/
/**
 * Callback invoked when a local child process has terminated.
 *
 * Finds the live child with the given pid in orte_odls_default.children,
 * decides whether it terminated normally or abnormally, marks it "not
 * alive", finalizes its session directory and reports the resulting state
 * through orte_smr.set_proc_state.
 *
 * A process counts as aborted when it was not a normal exit
 * (WIFEXITED(status) is false) or when an "abort" file exists in its
 * session directory.
 *
 * @param pid     Process id of the child that terminated.
 * @param status  Wait status of the child; tested with WIFEXITED and
 *                forwarded to set_proc_state.
 * @param cbdata  Not used by this function.
 */
static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata)
{
    orte_odls_child_t *child;
    opal_list_item_t *item;
    bool aborted;
    char *job, *vpid, *abort_file;
    struct stat buf;
    int rc;

    opal_output(orte_odls_globals.output, "odls: child process terminated");

    /* since we are going to be working with the global list of
     * children, we need to protect that list from modification
     * by other threads. This will also be used to protect us
     * from race conditions on any abort situation
     */
    OPAL_THREAD_LOCK(&orte_odls_default.mutex);

    /* find this child */
    for (item = opal_list_get_first(&orte_odls_default.children);
         item != opal_list_get_end(&orte_odls_default.children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (child->alive && pid == child->pid) { /* found it */
            goto GOTCHILD;
        }
    }

    /* get here if we didn't find the child, or if the specified child is already
     * dead. If the latter, then we have a problem as it means we are detecting
     * it exiting multiple times
     */
    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
    return;

GOTCHILD:
    orte_iof.iof_flush();

    /* determine the state of this process */
    aborted = false;
    if (WIFEXITED(status)) {
        /* even though the process exited "normally", it is quite
         * possible that this happened via an orte_abort call - in
         * which case, we need to indicate this was an "abnormal"
         * termination. See the note in "orte_abort.c" for
         * an explanation of this process.
         *
         * For our purposes here, we need to check for the existence
         * of an "abort" file in this process' session directory. If
         * we find it, then we know that this was an abnormal termination.
         */
        if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, child->name->jobid))) {
            ORTE_ERROR_LOG(rc);
            goto MOVEON;
        }
        if (ORTE_SUCCESS != (rc = orte_ns.convert_vpid_to_string(&vpid, child->name->vpid))) {
            ORTE_ERROR_LOG(rc);
            free(job);
            goto MOVEON;
        }
        abort_file = opal_os_path(false, orte_process_info.universe_session_dir,
                                  job, vpid, "abort", NULL );
        free(job);
        free(vpid);

        /* The NULL test keeps a failed path construction away from stat();
         * it is then treated like a missing abort file.
         * NOTE(review): assumes opal_os_path can return NULL - confirm. */
        if (NULL != abort_file && 0 == stat(abort_file, &buf)) {
            /* the abort file must exist - there is nothing in it we need. Its
             * mere existence indicates that an abnormal termination occurred
             */
            opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
                        ORTE_NAME_ARGS(child->name));
            aborted = true;
        } else {
            opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
                        ORTE_NAME_ARGS(child->name));
        }
        /* the path is released on both outcomes, not only when the file exists */
        free(abort_file);
    } else {
        /* the process was terminated with a signal! That's definitely
         * abnormal, so indicate that condition
         */
        opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
                    ORTE_NAME_ARGS(child->name));
        aborted = true;
    }

MOVEON:
    /* set this proc to "not alive" */
    child->alive = false;

    /* Clean up the session directory as if we were the process
     * itself.  This covers the case where the process died abnormally
     * and didn't cleanup its own session directory.
     */
    orte_session_dir_finalize(child->name);

    /* set the proc state in the child structure */
    if (aborted) {
        child->state = ORTE_PROC_STATE_ABORTED;
    } else {
        child->state = ORTE_PROC_STATE_TERMINATED;
    }

    /* Need to unlock before we call set_proc_state as this is going to generate
     * a trigger that will eventually callback to us
     */
    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);

    if (aborted) {
        rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, status);
    } else {
        rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_TERMINATED, status);
    }

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

/** * Fork/exec the specified processes */static int odls_default_fork_local_proc( orte_app_context_t* context, orte_odls_child_t *child, orte_vpid_t vpid_start, orte_vpid_t vpid_range, bool want_processor, size_t processor, bool oversubscribed, char **base_environ){ pid_t pid; orte_iof_base_io_conf_t opts; int rc; sigset_t sigs; int i = 0, p[2]; /* should pull this information from MPIRUN instead of going with default */ opts.usepty = OMPI_ENABLE_PTY_SUPPORT; /* BWB - Fix post beta. Should setup stdin in orterun and make part of the app_context */ if (child->name->vpid == 0) { opts.connect_stdin = true; } else { opts.connect_stdin = false; } rc = orte_iof_base_setup_prefork(&opts); if (ORTE_SUCCESS != rc) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -