⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pls_slurm_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 2 页
字号:
       corresponding app_context.  If there are multiple,       different prefix's in the app context, complain (i.e., only       allow one --prefix option for the entire slurm run -- we       don't support different --prefix'es for different nodes in       the SLURM pls) */    cur_prefix = NULL;    for (i=0; i < map->num_apps; i++) {        char * app_prefix_dir = map->apps[i]->prefix_dir;         /* Check for already set cur_prefix_dir -- if different,           complain */        if (NULL != app_prefix_dir) {            if (NULL != cur_prefix &&                0 != strcmp (cur_prefix, app_prefix_dir)) {                opal_show_help("help-pls-slurm.txt", "multiple-prefixes",                               true, cur_prefix, app_prefix_dir);                return ORTE_ERR_FATAL;            }            /* If not yet set, copy it; iff set, then it's the               same anyway */            if (NULL == cur_prefix) {                cur_prefix = strdup(app_prefix_dir);                if (mca_pls_slurm_component.debug) {                    opal_output (0, "pls:slurm: Set prefix:%s",                                 cur_prefix);                }            }        }    }    /* setup the daemon info for each node */    vpid = start_vpid;    for (item = opal_list_get_first(&map->nodes);         item != opal_list_get_end(&map->nodes);         item = opal_list_get_next(item)) {        orte_mapped_node_t* node = (orte_mapped_node_t*)item;                /* record the daemons info for this node */        dmn = OBJ_NEW(orte_pls_daemon_info_t);        dmn->active_job = jobid;        dmn->cell = node->cell;        dmn->nodename = strdup(node->nodename);        if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(dmn->name), dmn->cell, 0, vpid))) {            ORTE_ERROR_LOG(rc);            goto cleanup;        }        opal_list_append(&daemons, &dmn->super);        vpid++;    }    /* store the daemon info on the registry */    if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {        ORTE_ERROR_LOG(rc);    }        /* setup environment */    env = opal_argv_copy(environ);    var = mca_base_param_environ_variable("seed", NULL, NULL);    opal_setenv(var, "0", true, &env);    free(var);    var = mca_base_param_environ_variable("orte", "slurm", "nodelist");    opal_setenv(var, nodelist_flat, true, &env);    free(nodelist_flat);    free(var);    if (mca_pls_slurm_component.timing) {        if (0 != gettimeofday(&launchstart, NULL)) {            opal_output(0, "pls_slurm: could not obtain start time");        }            }        /* exec the daemon */    rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);        if (mca_pls_slurm_component.timing) {        if (0 != gettimeofday(&launchstop, NULL)) {             opal_output(0, "pls_slurm: could not obtain stop time");         } else {             opal_output(0, "pls_slurm: daemon block launch time is %ld usec",                         (launchstop.tv_sec - launchstart.tv_sec)*1000000 +                          (launchstop.tv_usec - launchstart.tv_usec));             opal_output(0, "pls_slurm: total job launch time is %ld usec",                         (launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +                          (launchstop.tv_usec - joblaunchstart.tv_usec));         }    }    if (ORTE_SUCCESS != rc) {        opal_output(0, "pls:slurm: start_procs returned error %d", rc);        goto cleanup;    }    /* JMS: short we stash the srun pid in the gpr somewhere for cleanup? */    /* JMS: how do we catch when srun dies? */cleanup:    OBJ_RELEASE(map);    opal_argv_free(argv);    opal_argv_free(env);    if(NULL != jobid_string) {        free(jobid_string);    }        while (NULL != (item = opal_list_remove_first(&daemons))) {        OBJ_RELEASE(item);    }    OBJ_DESTRUCT(&daemons);        return rc;}static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs){    int rc;    opal_list_t daemons;    opal_list_item_t *item;        /* construct the list of active daemons on this job */    OBJ_CONSTRUCT(&daemons, opal_list_t);    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {        ORTE_ERROR_LOG(rc);        goto CLEANUP;    }        /* order them to kill their local procs for this job */    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {        ORTE_ERROR_LOG(rc);        goto CLEANUP;    }    CLEANUP:    while (NULL != (item = opal_list_remove_first(&daemons))) {        OBJ_RELEASE(item);    }    OBJ_DESTRUCT(&daemons);    return rc;}/*** Terminate the orteds for a given job */static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs){    int rc;    opal_list_t daemons;    opal_list_item_t *item;        /* construct the list of active daemons on this job */    OBJ_CONSTRUCT(&daemons, opal_list_t);    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {        ORTE_ERROR_LOG(rc);        goto CLEANUP;    }        /* order them to go away */    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {        ORTE_ERROR_LOG(rc);    }    CLEANUP:    while (NULL != (item = opal_list_remove_first(&daemons))) {        OBJ_RELEASE(item);    }    OBJ_DESTRUCT(&daemons);    return rc;}/* * The way we've used SLURM, we can't kill individual processes -- * we'll kill the entire job */static int pls_slurm_terminate_proc(const orte_process_name_t *name){    opal_output(0, "pls:slurm:terminate_proc: not supported");    return ORTE_ERR_NOT_SUPPORTED;}/** * Signal all the processes in the child srun by sending the signal directly to it */static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs){    if (0 != srun_pid) {        kill(srun_pid, (int)signal);   }    return ORTE_SUCCESS;}/* * Signal a specific process */static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal){    opal_output(0, "pls:slurm:signal_proc: not supported");    return ORTE_ERR_NOT_SUPPORTED;}/** * Cancel an operation involving comm to an orted */int pls_slurm_cancel_operation(void){    int rc;    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {        ORTE_ERROR_LOG(rc);    }        return rc;}static int pls_slurm_finalize(void){    int rc;    /* cleanup any pending recvs */    if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {        ORTE_ERROR_LOG(rc);    }        return ORTE_SUCCESS;}static int pls_slurm_start_proc(int argc, char **argv, char **env,                                char *prefix){    int fd, id, debug_daemons;    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);    if (NULL == exec_argv) {        return ORTE_ERR_NOT_FOUND;    }    srun_pid = fork();    if (-1 == srun_pid) {        opal_output(0, "pls:slurm:start_proc: fork failed");        return ORTE_ERR_IN_ERRNO;    } else if (0 == srun_pid) {        char *bin_base = NULL, *lib_base = NULL;        /* Figure out the basenames for the libdir and bindir.  There           is a lengthy comment about this in pls_rsh_module.c           explaining all the rationale for how / why we're doing           this. */        lib_base = opal_basename(opal_install_dirs.libdir);        bin_base = opal_basename(opal_install_dirs.bindir);        /* If we have a prefix, then modify the PATH and           LD_LIBRARY_PATH environment variables.  */        if (NULL != prefix) {            char *oldenv, *newenv;            /* Reset PATH */            oldenv = getenv("PATH");            if (NULL != oldenv) {                asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);            } else {                asprintf(&newenv, "%s/%s", prefix, bin_base);            }            opal_setenv("PATH", newenv, true, &env);            if (mca_pls_slurm_component.debug) {                opal_output(0, "pls:slurm: reset PATH: %s", newenv);            }            free(newenv);            /* Reset LD_LIBRARY_PATH */            oldenv = getenv("LD_LIBRARY_PATH");            if (NULL != oldenv) {                asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);            } else {                asprintf(&newenv, "%s/%s", prefix, lib_base);            }            opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);            if (mca_pls_slurm_component.debug) {                opal_output(0, "pls:slurm: reset LD_LIBRARY_PATH: %s",                            newenv);            }            free(newenv);        }        fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);        if(fd > 0) {            dup2(fd, 0);        }        /* When not in debug mode and --debug-daemons was not passed,         * tie stdout/stderr to dev null so we don't see messages from orted */        id = mca_base_param_find("orte", "debug", "daemons");        if(id < 0) {            id = mca_base_param_register_int("orte", "debug", "daemons", NULL, 0);        }        mca_base_param_lookup_int(id, &debug_daemons);        if (0 == mca_pls_slurm_component.debug && 0 == debug_daemons) {            if (fd >= 0) {                if (fd != 1) {                    dup2(fd,1);                }                if (fd != 2) {                    dup2(fd,2);                }            }        }        if (fd > 2) {            close(fd);        }        /* get the srun process out of orterun's process group so that           signals sent from the shell (like those resulting from           cntl-c) don't get sent to srun */        setpgid(0, 0);        execve(exec_argv, argv, env);        opal_output(0, "pls:slurm:start_proc: exec failed");        /* don't return - need to exit - returning would be bad -           we're not in the calling process anymore */        exit(1);    }    free(exec_argv);    /* just in case, make sure that the srun process is not in our       process group any more.  Stevens says always do this on both       sides of the fork... */    setpgid(srun_pid, srun_pid);    return ORTE_SUCCESS;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -