📄 pls_bproc.c
字号:
* Sets up the passed environment for processes launched by the bproc launcher. * @param env a pointer to the environment to setup */static void orte_pls_bproc_setup_env(char *** env){ char ** merged; char * var; char * param; int rc; int num_env; OPAL_TRACE(1); num_env = opal_argv_count(*env); /* append mca parameters to our environment */ if(ORTE_SUCCESS != (rc = mca_base_param_build_env(env, &num_env, false))) { ORTE_ERROR_LOG(rc); } /* ns replica contact info */ if(NULL == orte_process_info.ns_replica) { orte_dss.copy((void**)&orte_process_info.ns_replica, orte_process_info.my_name, ORTE_NAME); orte_process_info.ns_replica_uri = orte_rml.get_uri(); } var = mca_base_param_environ_variable("ns","replica","uri"); opal_setenv(var,orte_process_info.ns_replica_uri, true, env); free(var); /* make sure the username used to create the bproc directory is the same on * the backend as the frontend */ var = mca_base_param_environ_variable("pls","bproc","username"); opal_setenv(var, orte_system_info.user, true, env); free(var); /* gpr replica contact info */ if(NULL == orte_process_info.gpr_replica) { orte_dss.copy((void**)&orte_process_info.gpr_replica, orte_process_info.my_name, ORTE_NAME); orte_process_info.gpr_replica_uri = orte_rml.get_uri(); } var = mca_base_param_environ_variable("gpr","replica","uri"); opal_setenv(var,orte_process_info.gpr_replica_uri, true, env); free(var); /* universe directory - needs to match orted */ var = mca_base_param_environ_variable("universe", NULL, NULL); asprintf(¶m, "%s@%s:%s", orte_universe_info.uid, orte_universe_info.host, orte_universe_info.name); opal_setenv(var, param, true, env); free(param); free(var); /* merge in environment - merge ensures we don't overwrite anything we just set */ merged = opal_environ_merge(*env, environ); opal_argv_free(*env); *env = merged; /* make sure hostname doesn't get pushed to backend node */ opal_unsetenv("HOSTNAME", env); /* make sure the frontend hostname does not get pushed out to the backend */ var = mca_base_param_environ_variable("orte", "base", "nodename"); opal_unsetenv(var, env); free(var); }/** * Launches the daemons * @param cellid the cellid of the job * @param envp a pointer to the environment to use for the daemons * @param node_arrays an array that holds the node arrays for each app context * @param node_array_lens an array of lengths of the node arrays * @param num_contexts the number of application contexts * @param num_procs the numer of processes in the job * @param global_vpid_start the starting vpid for the user's processes * @param jobid the jobid for the user processes * @retval ORTE_SUCCESS * @retval error */static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) { int * daemon_list = NULL; int num_daemons = 0; int rc, i; int * pids = NULL; int argc; char ** argv = NULL; char * param; char * var; int stride; char * orted_path; orte_vpid_t daemon_vpid_start; orte_std_cntr_t idx; struct stat buf; opal_list_t daemons; orte_pls_daemon_info_t *dmn; opal_list_item_t *item; struct timeval joblaunchstart, launchstart, launchstop; OPAL_TRACE(1); if (orte_pls_base.timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { opal_output(0, "pls_bproc: could not obtain start time"); } } /* indicate that the daemons have not completely launched yet */ daemons_launched = false; /* setup a list that will contain the info for all the daemons * so we can store it on the registry when done */ OBJ_CONSTRUCT(&daemons, opal_list_t); /* get the number of nodes in this job and allocate an array for * their names so we can pass that to bproc - populate the list * with the node names */ num_daemons = map->num_nodes; if (0 == num_daemons) { /* nothing to do */ OBJ_DESTRUCT(&daemons); return ORTE_SUCCESS; } if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); goto cleanup; } i = 0; for (item = opal_list_get_first(&map->nodes); item != opal_list_get_end(&map->nodes); item = opal_list_get_next(item)) { orte_mapped_node_t *node = (orte_mapped_node_t*)item; daemon_list[i++] = atoi(node->nodename); } /* allocate storage for bproc to return the daemon pids */ if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); goto cleanup; } /* allocate a range of vpids for the daemons */ rc = orte_ns.reserve_range(0, num_daemons, &daemon_vpid_start); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } /* setup the orted triggers for passing their launch info */ if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(map->job, num_daemons, NULL, NULL))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* setup the daemon environment */ orte_pls_bproc_setup_env(envp); /* direct the daemons to drop contact files so the local procs * can learn how to contact them - this is used for routing * OOB messaging */ var = mca_base_param_environ_variable("odls","base","drop_contact_file"); opal_setenv(var,"1", true, envp); free(var); /* daemons calculate their process name using a "stride" of one, so * push that value into their environment */ stride = 1; asprintf(¶m, "%ld", (long)stride); var = mca_base_param_environ_variable("pls", "bproc", "stride"); opal_setenv(var, param, true, envp); free(param); free(var); /* set up the base environment so the daemons can get their names once launched */ rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, daemon_vpid_start, 0, num_daemons, envp); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } argc = 0; opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted); /* check for debug flags */#if 0 if (mca_pls_bproc_component.debug) { opal_argv_append(&argc, &argv, "--debug"); opal_argv_append(&argc, &argv, "--debug-daemons"); }#endif opal_argv_append(&argc, &argv, "--bootproxy"); orte_ns.convert_jobid_to_string(¶m, map->job); opal_argv_append(&argc, &argv, param); free(param); /* pass along the universe name and location info */ opal_argv_append(&argc, &argv, "--universe"); asprintf(¶m, "%s@%s:%s", orte_universe_info.uid, orte_universe_info.host, orte_universe_info.name); opal_argv_append(&argc, &argv, param); free(param); /* tell orted not to demonize itself */ opal_argv_append(&argc, &argv, "--no-daemonize"); /* find orted */ if(0 == stat(mca_pls_bproc_component.orted, &buf)) { orted_path = strdup(mca_pls_bproc_component.orted); } else { orted_path = opal_path_findv(mca_pls_bproc_component.orted, 0, environ, NULL); if(NULL == orted_path) { orted_path = opal_os_path( false, opal_install_dirs.bindir, mca_pls_bproc_component.orted, NULL ); if( (NULL != orted_path) || (0 != stat(orted_path, &buf)) ) { char *path = getenv("PATH"); if (NULL == path) { path = ("PATH is empty!"); } opal_show_help("help-pls-bproc.txt", "no-orted", true, mca_pls_bproc_component.orted, mca_pls_bproc_component.orted, path, opal_install_dirs.bindir); rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); goto cleanup; } } } if(0 < mca_pls_bproc_component.debug) { opal_output(0, "PLS_BPROC DEBUG: launching %d daemons. cmd: %s ", num_daemons, orted_path); } /* launch the daemons */ if (orte_pls_base.timing) { if (0 != gettimeofday(&launchstart, NULL)) { opal_output(0, "pls_bproc: could not obtain start time"); } } if (mca_pls_bproc_component.do_not_launch) { for (i=0; i < num_daemons; i++) pids[i] = i+1; rc = num_daemons; } else { rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp); } if (orte_pls_base.timing) { if (0 != gettimeofday(&launchstop, NULL)) { opal_output(0, "pls_bproc: could not obtain stop time"); } else { opal_output(0, "pls_bproc: daemon launch time is %ld usec", (launchstop.tv_sec - launchstart.tv_sec)*1000000 + (launchstop.tv_usec - launchstart.tv_usec)); } } if(rc != num_daemons) { opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true, num_daemons, rc, orted_path); rc = ORTE_ERROR; goto cleanup; } if(0 < mca_pls_bproc_component.debug) { opal_output(0, "PLS_BPROC DEBUG: %d daemons launched. First pid: %d\n", rc, *pids); } for(i = 0; i < num_daemons; i++) { if(0 >= pids[i]) { opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true, daemon_list[i], pids[i], errno, orted_path); rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); goto cleanup; } else { if (0 > asprintf(¶m, "%d", daemon_list[i])) { rc = ORTE_ERR_OUT_OF_RESOURCE; ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); goto cleanup; } rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } dmn = OBJ_NEW(orte_pls_daemon_info_t); rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0, daemon_vpid_start + i); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } dmn->cell = dmn->name->cellid; dmn->nodename = strdup(param); dmn->active_job = map->job; opal_list_append(&daemons, &dmn->super); free(param); } } /* store the daemon info */ if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) { ORTE_ERROR_LOG(rc); } /* setup the callbacks - this needs to be done *after* we store the * daemon info so that short-lived apps don't cause mpirun to * try and terminate the orteds before we record them */ if (!mca_pls_bproc_component.do_not_launch) { for (i=0; i < num_daemons; i++) { rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb, &daemon_list[i]); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } } /* wait for communication back from the daemons, which indicates they have * sucessfully set up the pty/pipes and IO forwarding which the user apps * will use */ for(i = 0; i < num_daemons; i++) { orte_buffer_t ack; int src[4]; OBJ_CONSTRUCT(&ack, orte_buffer_t); rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &ack, ORTE_RML_TAG_BPROC); if(0 > rc) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&ack); goto cleanup; } idx = 4; rc = orte_dss.unpack(&ack, &src, &idx, ORTE_INT); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); } OBJ_DESTRUCT(&ack); if(-1 == src[0]) { /* one of the daemons has failed to properly launch. The error is sent * by orte_pls_bproc_waitpid_daemon_cb */ if(-1 == src[1]) { /* did not die on a signal */ opal_show_help("help-pls-bproc.txt", "daemon-died-no-signal", true, src[2], src[3]); } else { /* died on a signal */ opal_show_help("help-pls-bproc.txt", "daemon-died-signal", true, src[2], src[3], src[1]); } rc = ORTE_ERROR;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -