📄 pls_slurm_module.c
字号:
corresponding app_context. If there are multiple, different prefix's in the app context, complain (i.e., only allow one --prefix option for the entire slurm run -- we don't support different --prefix'es for different nodes in the SLURM pls) */ cur_prefix = NULL; for (i=0; i < map->num_apps; i++) { char * app_prefix_dir = map->apps[i]->prefix_dir; /* Check for already set cur_prefix_dir -- if different, complain */ if (NULL != app_prefix_dir) { if (NULL != cur_prefix && 0 != strcmp (cur_prefix, app_prefix_dir)) { opal_show_help("help-pls-slurm.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); return ORTE_ERR_FATAL; } /* If not yet set, copy it; iff set, then it's the same anyway */ if (NULL == cur_prefix) { cur_prefix = strdup(app_prefix_dir); if (mca_pls_slurm_component.debug) { opal_output (0, "pls:slurm: Set prefix:%s", cur_prefix); } } } } /* setup the daemon info for each node */ vpid = start_vpid; for (item = opal_list_get_first(&map->nodes); item != opal_list_get_end(&map->nodes); item = opal_list_get_next(item)) { orte_mapped_node_t* node = (orte_mapped_node_t*)item; /* record the daemons info for this node */ dmn = OBJ_NEW(orte_pls_daemon_info_t); dmn->active_job = jobid; dmn->cell = node->cell; dmn->nodename = strdup(node->nodename); if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(dmn->name), dmn->cell, 0, vpid))) { ORTE_ERROR_LOG(rc); goto cleanup; } opal_list_append(&daemons, &dmn->super); vpid++; } /* store the daemon info on the registry */ if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) { ORTE_ERROR_LOG(rc); } /* setup environment */ env = opal_argv_copy(environ); var = mca_base_param_environ_variable("seed", NULL, NULL); opal_setenv(var, "0", true, &env); free(var); var = mca_base_param_environ_variable("orte", "slurm", "nodelist"); opal_setenv(var, nodelist_flat, true, &env); free(nodelist_flat); free(var); if (mca_pls_slurm_component.timing) { if (0 != gettimeofday(&launchstart, NULL)) { opal_output(0, 
"pls_slurm: could not obtain start time"); } } /* exec the daemon */ rc = pls_slurm_start_proc(argc, argv, env, cur_prefix); if (mca_pls_slurm_component.timing) { if (0 != gettimeofday(&launchstop, NULL)) { opal_output(0, "pls_slurm: could not obtain stop time"); } else { opal_output(0, "pls_slurm: daemon block launch time is %ld usec", (launchstop.tv_sec - launchstart.tv_sec)*1000000 + (launchstop.tv_usec - launchstart.tv_usec)); opal_output(0, "pls_slurm: total job launch time is %ld usec", (launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + (launchstop.tv_usec - joblaunchstart.tv_usec)); } } if (ORTE_SUCCESS != rc) { opal_output(0, "pls:slurm: start_procs returned error %d", rc); goto cleanup; } /* JMS: short we stash the srun pid in the gpr somewhere for cleanup? */ /* JMS: how do we catch when srun dies? */cleanup: OBJ_RELEASE(map); opal_argv_free(argv); opal_argv_free(env); if(NULL != jobid_string) { free(jobid_string); } while (NULL != (item = opal_list_remove_first(&daemons))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&daemons); return rc;}


/**
 * Terminate the application processes of a job.
 *
 * Builds the list of daemons that are active for the job and orders
 * them to kill their local processes belonging to it.
 *
 * @param jobid    job whose processes are to be killed
 * @param timeout  passed through to orte_pls_base_orted_kill_local_procs()
 * @param attrs    passed through to orte_pls_base_get_active_daemons()
 *
 * @return ORTE_SUCCESS, or the error code of the first base call that
 *         failed (the error is also logged via ORTE_ERROR_LOG).
 */
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    int rc;
    opal_list_t daemons;
    opal_list_item_t *item;

    /* construct the list of active daemons on this job */
    OBJ_CONSTRUCT(&daemons, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* order them to kill their local procs for this job */
    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
        ORTE_ERROR_LOG(rc);
    }

CLEANUP:
    /* release every list entry before tearing down the list itself */
    while (NULL != (item = opal_list_remove_first(&daemons))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&daemons);
    return rc;
}


/**
 * Terminate the orteds for a given job.
 *
 * Builds the list of daemons that are active for the job and orders
 * them to exit.
 *
 * @param jobid    job whose daemons are to be shut down
 * @param timeout  passed through to orte_pls_base_orted_exit()
 * @param attrs    passed through to orte_pls_base_get_active_daemons()
 *
 * @return ORTE_SUCCESS, or the error code of the first base call that
 *         failed (the error is also logged via ORTE_ERROR_LOG).
 */
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    int rc;
    opal_list_t daemons;
    opal_list_item_t *item;

    /* construct the list of active daemons on this job */
    OBJ_CONSTRUCT(&daemons, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* order them to go away */
    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
        ORTE_ERROR_LOG(rc);
    }

CLEANUP:
    /* release every list entry before tearing down the list itself */
    while (NULL != (item = opal_list_remove_first(&daemons))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&daemons);
    return rc;
}


/*
 * The way we've used SLURM, we can't kill individual processes --
 * we'll kill the entire job.
 *
 * @param name  ignored
 * @return always ORTE_ERR_NOT_SUPPORTED
 */
static int pls_slurm_terminate_proc(const orte_process_name_t *name)
{
    opal_output(0, "pls:slurm:terminate_proc: not supported");
    return ORTE_ERR_NOT_SUPPORTED;
}


/**
 * Signal all the processes in the child srun by sending the signal
 * directly to it.
 *
 * @param jobid   ignored; the signal always goes to the recorded srun_pid
 * @param signal  signal number handed to kill()
 * @param attrs   ignored
 *
 * @return always ORTE_SUCCESS; delivery is best-effort and the result of
 *         kill() is not reported.
 */
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
{
    /* Only signal a real child.  srun_pid is 0 when nothing has been
       launched and fork() reports failure as -1; kill(-1, sig) would
       signal every process this user is allowed to signal, so any
       non-positive value must be rejected, not just 0. */
    if (srun_pid > 0) {
        kill(srun_pid, (int)signal);
    }
    return ORTE_SUCCESS;
}


/*
 * Signal a specific process -- not supported by this component.
 *
 * @param name    ignored
 * @param signal  ignored
 * @return always ORTE_ERR_NOT_SUPPORTED
 */
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal)
{
    opal_output(0, "pls:slurm:signal_proc: not supported");
    return ORTE_ERR_NOT_SUPPORTED;
}


/**
 * Cancel an operation involving comm to an orted.
 *
 * @return the result of orte_pls_base_orted_cancel_operation(); a
 *         failure is also logged via ORTE_ERROR_LOG.
 */
int pls_slurm_cancel_operation(void)
{
    int rc;

    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}


/**
 * Shut the component down.
 *
 * @return always ORTE_SUCCESS.  A failure of orte_pls_base_comm_stop()
 *         is logged but deliberately not propagated.
 */
static int pls_slurm_finalize(void)
{
    int rc;

    /* cleanup any pending recvs */
    if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
        ORTE_ERROR_LOG(rc);
    }
    return ORTE_SUCCESS;
}


/*
 * Prepend "<prefix>/<subdir>" to the colon-separated list held in the
 * environment variable `name` and store the result in *env.  The current
 * value is read from this process's own environment via getenv(), not
 * from *env.
 *
 * @return 0 on success, -1 if subdir is NULL or the new value could not
 *         be allocated (in which case *env is left untouched).
 */
static int pls_slurm_prepend_prefix_dir(const char *name, const char *prefix,
                                        const char *subdir, char ***env)
{
    char *newenv = NULL;
    const char *oldenv;
    int len;

    if (NULL == subdir) {
        return -1;
    }

    oldenv = getenv(name);
    if (NULL != oldenv) {
        len = asprintf(&newenv, "%s/%s:%s", prefix, subdir, oldenv);
    } else {
        len = asprintf(&newenv, "%s/%s", prefix, subdir);
    }
    /* on failure asprintf leaves newenv indeterminate -- do not use it */
    if (len < 0) {
        return -1;
    }

    opal_setenv(name, newenv, true, env);
    if (mca_pls_slurm_component.debug) {
        opal_output(0, "pls:slurm: reset %s: %s", name, newenv);
    }
    free(newenv);
    return 0;
}


/**
 * Fork and exec the launcher command described by argv.
 *
 * The child optionally prepends the prefix's bin and lib directories to
 * PATH and LD_LIBRARY_PATH, redirects stdin (and, unless debugging is
 * enabled, stdout/stderr) to /dev/null, moves itself into its own
 * process group and execs the command.  The parent records the child's
 * pid in srun_pid.
 *
 * @param argc    unused
 * @param argv    argument vector; argv[0] is looked up with
 *                opal_path_findv() using env
 * @param env     environment passed to execve()
 * @param prefix  installation prefix, or NULL to leave PATH and
 *                LD_LIBRARY_PATH alone
 *
 * @return ORTE_SUCCESS once the child has been forked,
 *         ORTE_ERR_NOT_FOUND if argv[0] cannot be located,
 *         ORTE_ERR_IN_ERRNO if fork() fails.
 */
static int pls_slurm_start_proc(int argc, char **argv, char **env, char *prefix)
{
    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);

    if (NULL == exec_argv) {
        return ORTE_ERR_NOT_FOUND;
    }

    srun_pid = fork();
    if (-1 == srun_pid) {
        opal_output(0, "pls:slurm:start_proc: fork failed");
        /* There is no child: do not leave -1 behind where a later
           kill(srun_pid, ...) could pick it up, and do not leak the
           resolved path. */
        srun_pid = 0;
        free(exec_argv);
        return ORTE_ERR_IN_ERRNO;
    }

    if (0 == srun_pid) {
        /* ---- child ---- */
        int fd, id;
        int debug_daemons = 0;  /* stays 0 if the parameter lookup fails */

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables. */
        if (NULL != prefix) {
            /* Figure out the basenames for the libdir and bindir.
               There is a lengthy comment about this in
               pls_rsh_module.c explaining all the rationale for how /
               why we're doing this. */
            char *lib_base = opal_basename(opal_install_dirs.libdir);
            char *bin_base = opal_basename(opal_install_dirs.bindir);

            if (0 != pls_slurm_prepend_prefix_dir("PATH", prefix, bin_base, &env) ||
                0 != pls_slurm_prepend_prefix_dir("LD_LIBRARY_PATH", prefix, lib_base, &env)) {
                opal_output(0, "pls:slurm:start_proc: could not apply prefix %s to the environment", prefix);
                exit(1);
            }
        }

        /* NOTE(review): stdin is pointed at a write-only descriptor, so
           reads from it will fail rather than return EOF -- kept as in
           the original; confirm this is intended. */
        fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
        if (fd > 0) {
            /* fd == 0 means /dev/null already landed on stdin */
            dup2(fd, 0);
        }

        /* When not in debug mode and --debug-daemons was not passed,
         * tie stdout/stderr to dev null so we don't see messages from orted */
        id = mca_base_param_find("orte", "debug", "daemons");
        if (id < 0) {
            id = mca_base_param_register_int("orte", "debug", "daemons", NULL, 0);
        }
        mca_base_param_lookup_int(id, &debug_daemons);
        if (0 == mca_pls_slurm_component.debug && 0 == debug_daemons) {
            if (fd >= 0) {
                if (fd != 1) {
                    dup2(fd, 1);
                }
                if (fd != 2) {
                    dup2(fd, 2);
                }
            }
        }

        /* only close the descriptor if it is not one of stdin/out/err */
        if (fd > 2) {
            close(fd);
        }

        /* get the srun process out of orterun's process group so that
           signals sent from the shell (like those resulting from
           cntl-c) don't get sent to srun */
        setpgid(0, 0);

        execve(exec_argv, argv, env);

        opal_output(0, "pls:slurm:start_proc: exec failed");
        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
    }

    /* ---- parent ---- */
    free(exec_argv);

    /* just in case, make sure that the srun process is not in our
       process group any more.  Stevens says always do this on both
       sides of the fork... */
    setpgid(srun_pid, srun_pid);

    return ORTE_SUCCESS;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -