⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 odls_default_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 4 页
字号:
            if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]),                                                            ORTE_NODE_NAME_KEY,                                                            ORTE_STRING, node->nodename))) {                ORTE_ERROR_LOG(rc);                OBJ_RELEASE(ndat);                OBJ_RELEASE(value);                return rc;            }                      if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, value))) {                ORTE_ERROR_LOG(rc);                OBJ_RELEASE(ndat);                OBJ_RELEASE(values[0]);                return rc;            }            ndat->cnt += 1;        }    }        *data = ndat;    return ORTE_SUCCESS;}static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_status){    time_t end;    pid_t ret;#if !defined(HAVE_SCHED_YIELD)    struct timeval t;    fd_set bogus;#endif            end = time(NULL) + timeout;    do {        ret = waitpid(pid, exit_status, WNOHANG);        if (pid == ret) {            /* It died -- return success */            return true;        } else if (-1 == ret && ECHILD == errno) {            /* The pid no longer exists, so we'll call this "good               enough for government work" */            return true;        }#if defined(HAVE_SCHED_YIELD)        sched_yield();#else        /* Bogus delay for 1 usec */        t.tv_sec = 0;        t.tv_usec = 1;        FD_ZERO(&bogus);        FD_SET(0, &bogus);        select(1, &bogus, NULL, NULL, &t);#endif            } while (time(NULL) < end);    /* The child didn't die, so return false */    return false;}int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state){    orte_odls_child_t *child;    opal_list_item_t *item;    int rc, exit_status;    opal_list_t procs_killed;    orte_namelist_t *proc;    OBJ_CONSTRUCT(&procs_killed, opal_list_t);        opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",                    ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);    /* since we are going to be working with the global list of     * children, we need to protect that list from modification     * by other threads     */    OPAL_THREAD_LOCK(&orte_odls_default.mutex);        for (item = opal_list_get_first(&orte_odls_default.children);         item != opal_list_get_end(&orte_odls_default.children);         item = opal_list_get_next(item)) {        child = (orte_odls_child_t*)item;                opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",                    ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));        /* is this process alive? if not, then nothing for us         * to do to it         */        if (!child->alive) {            opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child is not alive",                    ORTE_NAME_ARGS(ORTE_PROC_MY_NAME));            continue;        }                /* do we have a child from the specified job? Because the        *  job could be given as a WILDCARD value, we must use        *  the dss.compare function to check for equality.        */        if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {            continue;        }                /* de-register the SIGCHILD callback for this pid */        orte_wait_cb_cancel(child->pid);        /* Send a sigterm to the process.  If we get ESRCH back, that           means the process is already dead, so just move on. */        if (0 != kill(child->pid, SIGTERM) && ESRCH != errno) {            int err = errno;            opal_show_help("help-odls-default.txt",                           "odls-default:could-not-send-kill",                           true, orte_system_info.nodename, child->pid, err);            goto MOVEON;        }        /* The kill succeeded.  Wait up to timeout_before_sigkill           seconds to see if it died. */        if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {            /* try killing it again */            kill(child->pid, SIGKILL);            /* Double check that it actually died this time */            if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {                opal_show_help("help-odls-default.txt",                               "odls-default:could-not-kill",                               true, orte_system_info.nodename, child->pid);            }        }        MOVEON:        /* set the process to "not alive" */        child->alive = false;                /* add this proc to the local list */        proc = OBJ_NEW(orte_namelist_t);        if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(proc->name), child->name, ORTE_NAME))) {            ORTE_ERROR_LOG(rc);            opal_condition_signal(&orte_odls_default.cond);            OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);            return rc;        }        opal_list_append(&procs_killed, &proc->item);    }        /* we are done with the global list, so we can now release     * any waiting threads - this also allows any callbacks to work     */    opal_condition_signal(&orte_odls_default.cond);    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);            /* deconstruct the local list and update the process states on the registry, if indicated */    while (NULL != (item = opal_list_remove_first(&procs_killed))) {        proc = (orte_namelist_t*)item;        if (set_state) {            if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(proc->name, ORTE_PROC_STATE_TERMINATED, exit_status))) {                ORTE_ERROR_LOG(rc);                /* don't exit out even if this didn't work - we still might need to kill more                 * processes, so just keep trucking                 */            }        }        OBJ_RELEASE(proc);    }        OBJ_DESTRUCT(&procs_killed);    return ORTE_SUCCESS;}/* *  Wait for a callback indicating the child has completed. */static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata){    orte_odls_child_t *child;    opal_list_item_t *item;    bool aborted;    char *job, *vpid, *abort_file;    struct stat buf;    int rc;    opal_output(orte_odls_globals.output, "odls: child process terminated");        /* since we are going to be working with the global list of     * children, we need to protect that list from modification     * by other threads. This will also be used to protect us     * from race conditions on any abort situation     */    OPAL_THREAD_LOCK(&orte_odls_default.mutex);     /* find this child */    for (item = opal_list_get_first(&orte_odls_default.children);         item != opal_list_get_end(&orte_odls_default.children);         item = opal_list_get_next(item)) {        child = (orte_odls_child_t*)item;        if (child->alive && pid == child->pid) { /* found it */            goto GOTCHILD;        }    }    /* get here if we didn't find the child, or if the specified child is already    * dead. If the latter, then we have a problem as it means we are detecting    * it exiting multiple times    */    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);    opal_condition_signal(&orte_odls_default.cond);    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);    return;GOTCHILD:    orte_iof.iof_flush();    /* determine the state of this process */    aborted = false;    if(WIFEXITED(status)) {        /* even though the process exited "normally", it is quite         * possible that this happened via an orte_abort call - in         * which case, we need to indicate this was an "abnormal"         * termination. See the note in "orte_abort.c" for         * an explanation of this process.         *         * For our purposes here, we need to check for the existence         * of an "abort" file in this process' session directory. If         * we find it, then we know that this was an abnormal termination.         */        if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, child->name->jobid))) {            ORTE_ERROR_LOG(rc);            goto MOVEON;        }        if (ORTE_SUCCESS != (rc = orte_ns.convert_vpid_to_string(&vpid, child->name->vpid))) {            ORTE_ERROR_LOG(rc);            free(job);            goto MOVEON;        }        abort_file = opal_os_path(false, orte_process_info.universe_session_dir,                                  job, vpid, "abort", NULL );        free(job);        free(vpid);                if (0 == stat(abort_file, &buf)) {            /* the abort file must exist - there is nothing in it we need. It's             * meer existence indicates that an abnormal termination occurred             */            opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",                        ORTE_NAME_ARGS(child->name));            aborted = true;            free(abort_file);        } else {            opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",                        ORTE_NAME_ARGS(child->name));        }    } else {        /* the process was terminated with a signal! That's definitely         * abnormal, so indicate that condition         */        opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",                    ORTE_NAME_ARGS(child->name));        aborted = true;    }MOVEON:    /* set this proc to "not alive" */    child->alive = false;    /* Clean up the session directory as if we were the process     * itself.  This covers the case where the process died abnormally     * and didn't cleanup its own session directory.     */    orte_session_dir_finalize(child->name);    /* set the proc state in the child structure */    if (aborted) {        child->state = ORTE_PROC_STATE_ABORTED;    } else {        child->state = ORTE_PROC_STATE_TERMINATED;    }    /* Need to unlock before we call set_proc_state as this is going to generate     * a trigger that will eventually callback to us     */    opal_condition_signal(&orte_odls_default.cond);    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);    if (aborted) {        rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, status);            } else {        rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_TERMINATED, status);    }    if (ORTE_SUCCESS != rc) {        ORTE_ERROR_LOG(rc);    }}/** *  Fork/exec the specified processes */static int odls_default_fork_local_proc(    orte_app_context_t* context,    orte_odls_child_t *child,    orte_vpid_t vpid_start,    orte_vpid_t vpid_range,    bool want_processor,    size_t processor,    bool oversubscribed,    char **base_environ){    pid_t pid;    orte_iof_base_io_conf_t opts;    int rc;    sigset_t sigs;    int i = 0, p[2];    /* should pull this information from MPIRUN instead of going with       default */    opts.usepty = OMPI_ENABLE_PTY_SUPPORT;    /* BWB - Fix post beta.  Should setup stdin in orterun and       make part of the app_context */    if (child->name->vpid == 0) {        opts.connect_stdin = true;    } else {        opts.connect_stdin = false;    }    rc = orte_iof_base_setup_prefork(&opts);    if (ORTE_SUCCESS != rc) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -