
📄 pls_rsh_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 Page 1 of 4
                                               ORTE_PROC_STATE_ABORTED, status);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
            }
        }
        OBJ_RELEASE(node);

cleanup:
        /* tell the user something went wrong */
        opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
                    info->nodename);
        opal_output(0, "ERROR: There may be more information available from");
        opal_output(0, "ERROR: the remote shell (see above).");

        if (WIFEXITED(status)) {
            opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.",
                        WEXITSTATUS(status));
        } else if (WIFSIGNALED(status)) {
#ifdef WCOREDUMP
            if (WCOREDUMP(status)) {
                opal_output(0, "The daemon received a signal %d (with core).",
                            WTERMSIG(status));
            } else {
                opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
            }
#else
            opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
#endif /* WCOREDUMP */
        } else {
            opal_output(0, "No extra status information is available: %d.", status);
        }

        OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
        /* tell the system that this daemon is gone */
        if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
            ORTE_ERROR_LOG(rc);
        }

        /* remove the daemon from our local list */
        opal_list_remove_item(&active_daemons, &info->super);
        OBJ_RELEASE(info);
        OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
    } /* if abnormal exit */

    /* release any waiting threads */
    OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);

    /* first check timing request */
    if (mca_pls_rsh_component.timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "pls_rsh: could not obtain stop time");
        } else {
            deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
                     (launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
            avgtime = avgtime + deltat;
            if (deltat < mintime) {
                mintime = deltat;
                miniter = (unsigned long)info->name->vpid;
            }
            if (deltat > maxtime) {
                maxtime = deltat;
                maxiter = (unsigned long)info->name->vpid;
            }
        }
    }

    if (mca_pls_rsh_component.num_children-- >=
        mca_pls_rsh_component.num_concurrent ||
        mca_pls_rsh_component.num_children == 0) {
        opal_condition_signal(&mca_pls_rsh_component.cond);
    }

    if (mca_pls_rsh_component.timing && mca_pls_rsh_component.num_children == 0) {
        if (0 != gettimeofday(&joblaunchstop, NULL)) {
            opal_output(0, "pls_rsh: could not obtain job launch stop time");
        } else {
            deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
                     (joblaunchstop.tv_usec - joblaunchstart.tv_usec);
            opal_output(0, "pls_rsh: total time to launch job is %lu usec", deltat);
            if (mintime < 999999999) {
                /* had at least one non-local node */
                avgtime = avgtime/opal_list_get_size(&active_daemons);
                opal_output(0, "pls_rsh: average time to launch one daemon %f usec", avgtime);
                opal_output(0, "pls_rsh: min time to launch a daemon was %lu usec for iter %lu", mintime, miniter);
                opal_output(0, "pls_rsh: max time to launch a daemon was %lu usec for iter %lu", maxtime, maxiter);
            } else {
                opal_output(0, "No nonlocal launches to report for timing info");
            }
        }
        free(launchstart);
    }

    OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
}

/**
 * Launch a daemon (bootproxy) on each node. The daemon will be responsible
 * for launching the application.
 */
int orte_pls_rsh_launch(orte_jobid_t jobid)
{
    orte_job_map_t *map;
    opal_list_item_t *n_item;
    orte_mapped_node_t *rmaps_node;
    orte_std_cntr_t num_nodes;
    orte_vpid_t vpid;
    int node_name_index1;
    int node_name_index2;
    int proc_name_index;
    int local_exec_index, local_exec_index_end;
    char *jobid_string = NULL;
    char *uri, *param;
    char **argv = NULL, **tmp;
    char *prefix_dir;
    int argc;
    int rc;
    sigset_t sigs;
    struct passwd *p;
    bool remote_sh = false, remote_csh = false;
    bool local_sh = false, local_csh = false;
    char *lib_base = NULL, *bin_base = NULL;
    orte_pls_daemon_info_t *dmn;
    orte_pls_rsh_shell_t shell;

    if (mca_pls_rsh_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
            opal_output(0, "pls_rsh: could not obtain start time");
            joblaunchstart.tv_sec = 0;
            joblaunchstart.tv_usec = 0;
        }
    }

    /* setup a list that will contain the info for all the daemons
     * so we can store it on the registry when done and use it
     * locally to track their state
     */
    OBJ_CONSTRUCT(&active_daemons, opal_list_t);

    /* Get the map for this job
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know the nodes we are launching on
     * All other mapping responsibilities fall to orted in the fork PLS
     */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&active_daemons);
        return rc;
    }

    /* if the user requested that we re-use daemons,
     * launch the procs on any existing, re-usable daemons
     */
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(map);
            OBJ_DESTRUCT(&active_daemons);
            return rc;
        }
    }

    num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
    if (0 == num_nodes) {
        /* nothing left to do - just return */
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_SUCCESS;
    }

    if (mca_pls_rsh_component.debug_daemons &&
        mca_pls_rsh_component.num_concurrent < num_nodes) {
        /* we can't run in this situation, so pretty print the error
         * and exit
         */
        opal_show_help("help-pls-rsh.txt", "deadlock-params",
                       true, mca_pls_rsh_component.num_concurrent, num_nodes);
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_ERR_FATAL;
    }

    /*
     * After a discussion between Ralph & Jeff, we concluded that we
     * really are handling the prefix dir option incorrectly. It currently
     * is associated with an app_context, yet it really refers to the
     * location where OpenRTE/Open MPI is installed on a NODE. Fixing
     * this right now would involve significant change to orterun as well
     * as elsewhere, so we will intentionally leave this incorrect at this
     * point. The error, however, is identical to that seen in all prior
     * releases of OpenRTE/Open MPI, so our behavior is no worse than before.
     *
     * A note to fix this, along with ideas on how to do so, has been filed
     * on the project's Trac system under "feature enhancement".
     *
     * For now, default to the prefix_dir provided in the first app_context.
     * Since there always MUST be at least one app_context, we are safe in
     * doing this.
     */
    prefix_dir = map->apps[0]->prefix_dir;

    /*
     * Allocate a range of vpids for the daemons.
     */
    if (num_nodes == 0) {
        return ORTE_ERR_BAD_PARAM;
    }
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    /* setup the orted triggers for passing their launch info */
    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* need integer value for command line parameter */
    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* What is our local shell? */
    shell = ORTE_PLS_RSH_SHELL_UNKNOWN;
    p = getpwuid(getuid());
    if (NULL != p) {
        param = p->pw_shell;
        shell = find_shell(p->pw_shell);
    }

    /* If we didn't find it in getpwuid(), try looking at the $SHELL
       environment variable (see
       https://svn.open-mpi.org/trac/ompi/ticket/1060) */
    if (ORTE_PLS_RSH_SHELL_UNKNOWN == shell &&
        NULL != (param = getenv("SHELL"))) {
        shell = find_shell(param);
    }

    switch (shell) {
    case ORTE_PLS_RSH_SHELL_SH:  /* fall through */
    case ORTE_PLS_RSH_SHELL_KSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_BASH: local_sh = true; break;
    case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_CSH:  local_csh = true; break;
    default:
        opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n",
                    (NULL != param) ? param : "unknown");
        remote_sh = true;
        break;
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: local csh: %d, local sh: %d\n",
                    local_csh, local_sh);
    }

    /* What is our remote shell? */
    if (mca_pls_rsh_component.assume_same_shell) {
        remote_sh = local_sh;
        remote_csh = local_csh;
        if (mca_pls_rsh_component.debug) {
            opal_output(0, "pls:rsh: assuming same remote shell as local shell");
        }
    } else {
        orte_pls_rsh_shell_t shell;

        rmaps_node = (orte_mapped_node_t*)opal_list_get_first(&map->nodes);
        rc = orte_pls_rsh_probe(rmaps_node, &shell);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        switch (shell) {
        case ORTE_PLS_RSH_SHELL_SH:  /* fall through */
        case ORTE_PLS_RSH_SHELL_KSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_BASH: remote_sh = true; break;
        case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_CSH:  remote_csh = true; break;
        default:
            opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
            remote_sh = true;
        }
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: remote csh: %d, remote sh: %d\n",
                    remote_csh, remote_sh);
    }

    /*
     * Build argv array
     */
    argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
    argc = mca_pls_rsh_component.agent_argc;
    node_name_index1 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* Do we need to source .profile on the remote side? */
    if (!(remote_csh || remote_sh)) {
        int i;
        tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
        if (NULL == tmp) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        for (i = 0; NULL != tmp[i]; ++i) {
            opal_argv_append(&argc, &argv, tmp[i]);
        }
        opal_argv_free(tmp);
    }

    /* add the daemon command (as specified by user) */
    local_exec_index = argc;
    opal_argv_append(&argc, &argv, mca_pls_rsh_component.orted);

    /* check for debug flags */
    orte_pls_base_mca_argv(&argc, &argv);

    opal_argv_append(&argc, &argv, "--bootproxy");
    opal_argv_append(&argc, &argv, jobid_string);
    opal_argv_append(&argc, &argv, "--name");
    proc_name_index = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* tell the daemon how many procs are in the daemon's job */
    opal_argv_append(&argc, &argv, "--num_procs");
    asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
    opal_argv_append(&argc, &argv, param);
    free(param);

    /* tell the daemon the starting vpid of the daemon's job */
    opal_argv_append(&argc, &argv, "--vpid_start");
    opal_argv_append(&argc, &argv, "0");

    opal_argv_append(&argc, &argv, "--nodename");
    node_name_index2 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* pass along the universe name and location info */
    opal_argv_append(&argc, &argv, "--universe");
    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
             orte_universe_info.host, orte_universe_info.name);
    opal_argv_append(&argc, &argv, param);
    free(param);
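
The "<template>" entries appended above are positional placeholders: the module records their argv indices (node_name_index1, proc_name_index, node_name_index2) so that the per-node launch loop, which appears on the later pages of this listing, can swap in the real node name and process name before exec'ing the rsh/ssh agent. Below is a minimal standalone sketch of that placeholder-substitution pattern, not part of the original file; the node names and the fixed argv are made up for illustration, and no ORTE types or launch calls are used.

/* Standalone sketch (assumed example, not ORTE code): record the argv index
 * of a "<template>" slot, then replace it per node before launching. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    /* argv with one recorded placeholder slot, analogous to node_name_index1 */
    char *argv[] = { strdup("ssh"), strdup("<template>"),
                     strdup("orted"), NULL };
    int node_name_index = 1;                    /* index saved at append time */

    const char *nodes[] = { "node01", "node02" };   /* hypothetical node names */
    for (size_t i = 0; i < 2; ++i) {
        /* free the old slot contents and install this node's name */
        free(argv[node_name_index]);
        argv[node_name_index] = strdup(nodes[i]);
        /* a real launcher would fork/exec here; we just print the command */
        printf("would exec: %s %s %s\n", argv[0], argv[1], argv[2]);
    }

    for (int j = 0; NULL != argv[j]; ++j) {
        free(argv[j]);
    }
    return 0;
}

The same free-then-strdup replacement is applied to the process-name and nodename slots in the real code, which is why the indices are captured at the moment each placeholder is appended.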
