⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pls_bproc.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 4 页
字号:
 * Sets up the passed environment for processes launched by the bproc launcher. * @param env a pointer to the environment to setup */static void orte_pls_bproc_setup_env(char *** env){    char ** merged;    char * var;    char * param;    int rc;    int num_env;    OPAL_TRACE(1);        num_env = opal_argv_count(*env);    /* append mca parameters to our environment */    if(ORTE_SUCCESS != (rc = mca_base_param_build_env(env, &num_env, false))) {        ORTE_ERROR_LOG(rc);    }    /* ns replica contact info */    if(NULL == orte_process_info.ns_replica) {        orte_dss.copy((void**)&orte_process_info.ns_replica, orte_process_info.my_name, ORTE_NAME);        orte_process_info.ns_replica_uri = orte_rml.get_uri();    }    var = mca_base_param_environ_variable("ns","replica","uri");    opal_setenv(var,orte_process_info.ns_replica_uri, true, env);    free(var);    /* make sure the username used to create the bproc directory is the same on     * the backend as the frontend */    var = mca_base_param_environ_variable("pls","bproc","username");    opal_setenv(var, orte_system_info.user, true, env);    free(var);    /* gpr replica contact info */    if(NULL == orte_process_info.gpr_replica) {        orte_dss.copy((void**)&orte_process_info.gpr_replica, orte_process_info.my_name, ORTE_NAME);        orte_process_info.gpr_replica_uri = orte_rml.get_uri();    }    var = mca_base_param_environ_variable("gpr","replica","uri");    opal_setenv(var,orte_process_info.gpr_replica_uri, true, env);    free(var);    /* universe directory - needs to match orted */    var = mca_base_param_environ_variable("universe", NULL, NULL);    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,                orte_universe_info.host, orte_universe_info.name);    opal_setenv(var, param, true, env);    free(param);    free(var);    /* merge in environment - merge ensures we don't overwrite anything we just set */    merged = opal_environ_merge(*env, environ);    opal_argv_free(*env);    *env = merged;        /* make sure hostname doesn't get pushed to backend node */    opal_unsetenv("HOSTNAME", env);        /* make sure the frontend hostname does not get pushed out to the backend */    var = mca_base_param_environ_variable("orte", "base", "nodename");    opal_unsetenv(var, env);    free(var);    }/** * Launches the daemons * @param cellid         the cellid of the job * @param envp           a pointer to the environment to use for the daemons * @param node_arrays    an array that holds the node arrays for each app context * @param node_array_lens an array of lengths of the node arrays * @param num_contexts   the number of application contexts * @param num_procs      the numer of processes in the job * @param global_vpid_start the starting vpid for the user's processes * @param jobid          the jobid for the user processes * @retval ORTE_SUCCESS * @retval error */static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {    int * daemon_list = NULL;    int num_daemons = 0;    int rc, i;    int * pids = NULL;    int argc;    char ** argv = NULL;    char * param;    char * var;    int stride;    char * orted_path;    orte_vpid_t daemon_vpid_start;    orte_std_cntr_t idx;    struct stat buf;    opal_list_t daemons;    orte_pls_daemon_info_t *dmn;    opal_list_item_t *item;    struct timeval joblaunchstart, launchstart, launchstop;    OPAL_TRACE(1);        if (orte_pls_base.timing) {        if (0 != gettimeofday(&joblaunchstart, NULL)) {            opal_output(0, "pls_bproc: could not obtain start time");        }    }        /* indicate that the daemons have not completely launched yet */    daemons_launched = false;        /* setup a list that will contain the info for all the daemons     * so we can store it on the registry when done     */    OBJ_CONSTRUCT(&daemons, opal_list_t);    /* get the number of nodes in this job and allocate an array for     * their names so we can pass that to bproc - populate the list     * with the node names     */    num_daemons = map->num_nodes;    if (0 == num_daemons) {        /* nothing to do */        OBJ_DESTRUCT(&daemons);        return ORTE_SUCCESS;    }        if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);        goto cleanup;    }    i = 0;    for (item = opal_list_get_first(&map->nodes);         item != opal_list_get_end(&map->nodes);         item = opal_list_get_next(item)) {        orte_mapped_node_t *node = (orte_mapped_node_t*)item;        daemon_list[i++] = atoi(node->nodename);    }    /* allocate storage for bproc to return the daemon pids */    if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) {        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);        goto cleanup;    }    /* allocate a range of vpids for the daemons */    rc = orte_ns.reserve_range(0, num_daemons, &daemon_vpid_start);    if(ORTE_SUCCESS != rc) {        ORTE_ERROR_LOG(rc);        goto cleanup;    }    /* setup the orted triggers for passing their launch info */    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(map->job, num_daemons, NULL, NULL))) {        ORTE_ERROR_LOG(rc);        goto cleanup;    }        /* setup the daemon environment */    orte_pls_bproc_setup_env(envp);    /* direct the daemons to drop contact files so the local procs     * can learn how to contact them - this is used for routing     * OOB messaging     */    var = mca_base_param_environ_variable("odls","base","drop_contact_file");    opal_setenv(var,"1", true, envp);    free(var);    /* daemons calculate their process name using a "stride" of one, so     * push that value into their environment */    stride = 1;    asprintf(&param, "%ld", (long)stride);    var = mca_base_param_environ_variable("pls", "bproc", "stride");    opal_setenv(var, param, true, envp);    free(param);    free(var);    /* set up the base environment so the daemons can get their names once launched */    rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, daemon_vpid_start,                               0, num_daemons, envp);    if(ORTE_SUCCESS != rc) {        ORTE_ERROR_LOG(rc);        goto cleanup;    }    argc = 0;    opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);    /* check for debug flags */#if 0    if (mca_pls_bproc_component.debug) {         opal_argv_append(&argc, &argv, "--debug");         opal_argv_append(&argc, &argv, "--debug-daemons");    }#endif     opal_argv_append(&argc, &argv, "--bootproxy");    orte_ns.convert_jobid_to_string(&param, map->job);    opal_argv_append(&argc, &argv, param);    free(param);    /* pass along the universe name and location info */    opal_argv_append(&argc, &argv, "--universe");    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,                orte_universe_info.host, orte_universe_info.name);    opal_argv_append(&argc, &argv, param);    free(param);    /* tell orted not to demonize itself */    opal_argv_append(&argc, &argv, "--no-daemonize");    /* find orted */    if(0 == stat(mca_pls_bproc_component.orted, &buf)) {        orted_path = strdup(mca_pls_bproc_component.orted);    } else {        orted_path = opal_path_findv(mca_pls_bproc_component.orted, 0, environ, NULL);        if(NULL == orted_path) {            orted_path = opal_os_path( false, opal_install_dirs.bindir, mca_pls_bproc_component.orted, NULL );            if( (NULL != orted_path) || (0 != stat(orted_path, &buf)) ) {                char *path = getenv("PATH");                if (NULL == path) {                    path = ("PATH is empty!");                }                opal_show_help("help-pls-bproc.txt", "no-orted", true,                               mca_pls_bproc_component.orted,                               mca_pls_bproc_component.orted, path, opal_install_dirs.bindir);                rc = ORTE_ERROR;                ORTE_ERROR_LOG(rc);                goto cleanup;            }        }    }    if(0 < mca_pls_bproc_component.debug) {        opal_output(0, "PLS_BPROC DEBUG: launching %d daemons. cmd: %s ",                    num_daemons, orted_path);    }    /* launch the daemons */    if (orte_pls_base.timing) {        if (0 != gettimeofday(&launchstart, NULL)) {            opal_output(0, "pls_bproc: could not obtain start time");        }    }    if (mca_pls_bproc_component.do_not_launch) {        for (i=0; i < num_daemons; i++) pids[i] = i+1;        rc = num_daemons;    } else {        rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);    }        if (orte_pls_base.timing) {        if (0 != gettimeofday(&launchstop, NULL)) {             opal_output(0, "pls_bproc: could not obtain stop time");         } else {             opal_output(0, "pls_bproc: daemon launch time is %ld usec",                         (launchstop.tv_sec - launchstart.tv_sec)*1000000 +                          (launchstop.tv_usec - launchstart.tv_usec));         }    }        if(rc != num_daemons) {        opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,                       num_daemons, rc, orted_path);        rc = ORTE_ERROR;        goto cleanup;    }        if(0 < mca_pls_bproc_component.debug) {        opal_output(0, "PLS_BPROC DEBUG: %d daemons launched. First pid: %d\n",                    rc, *pids);    }    for(i = 0; i < num_daemons; i++) {        if(0 >= pids[i]) {            opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,                           daemon_list[i], pids[i], errno, orted_path);            rc = ORTE_ERROR;            ORTE_ERROR_LOG(rc);            goto cleanup;        } else {            if (0 > asprintf(&param, "%d", daemon_list[i])) {                rc = ORTE_ERR_OUT_OF_RESOURCE;                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);                goto cleanup;            }            rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);            if(ORTE_SUCCESS != rc) {                ORTE_ERROR_LOG(rc);                goto cleanup;            }            dmn = OBJ_NEW(orte_pls_daemon_info_t);            rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,                                             daemon_vpid_start + i);            if(ORTE_SUCCESS != rc) {                ORTE_ERROR_LOG(rc);                goto cleanup;            }            dmn->cell = dmn->name->cellid;            dmn->nodename = strdup(param);            dmn->active_job = map->job;            opal_list_append(&daemons, &dmn->super);                        free(param);        }    }        /* store the daemon info */    if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {        ORTE_ERROR_LOG(rc);    }    /* setup the callbacks - this needs to be done *after* we store the     * daemon info so that short-lived apps don't cause mpirun to     * try and terminate the orteds before we record them     */    if (!mca_pls_bproc_component.do_not_launch) {        for (i=0; i < num_daemons; i++) {            rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,                              &daemon_list[i]);            if(ORTE_SUCCESS != rc) {                ORTE_ERROR_LOG(rc);                goto cleanup;            }        }            /* wait for communication back from the daemons, which indicates they have         * sucessfully set up the pty/pipes and IO forwarding which the user apps         * will use  */        for(i = 0; i < num_daemons; i++) {            orte_buffer_t ack;            int src[4];            OBJ_CONSTRUCT(&ack, orte_buffer_t);            rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &ack, ORTE_RML_TAG_BPROC);            if(0 > rc) {                ORTE_ERROR_LOG(rc);                OBJ_DESTRUCT(&ack);                goto cleanup;            }            idx = 4;            rc = orte_dss.unpack(&ack, &src, &idx, ORTE_INT);            if(ORTE_SUCCESS != rc) {                ORTE_ERROR_LOG(rc);            }            OBJ_DESTRUCT(&ack);                        if(-1 == src[0]) {                /* one of the daemons has failed to properly launch. The error is sent                * by orte_pls_bproc_waitpid_daemon_cb  */                if(-1 == src[1]) { /* did not die on a signal */                    opal_show_help("help-pls-bproc.txt", "daemon-died-no-signal", true,                                   src[2], src[3]);                } else { /* died on a signal */                    opal_show_help("help-pls-bproc.txt", "daemon-died-signal", true,                                   src[2], src[3], src[1]);                }                rc = ORTE_ERROR;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -