📄 pls_rsh_module.c
                                         ORTE_PROC_STATE_ABORTED, status);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
            }
        }
        OBJ_RELEASE(node);

cleanup:
        /* tell the user something went wrong */
        opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
                    info->nodename);
        opal_output(0, "ERROR: There may be more information available from");
        opal_output(0, "ERROR: the remote shell (see above).");

        if (WIFEXITED(status)) {
            opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.",
                        WEXITSTATUS(status));
        } else if (WIFSIGNALED(status)) {
#ifdef WCOREDUMP
            if (WCOREDUMP(status)) {
                opal_output(0, "The daemon received a signal %d (with core).",
                            WTERMSIG(status));
            } else {
                opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
            }
#else
            opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
#endif /* WCOREDUMP */
        } else {
            opal_output(0, "No extra status information is available: %d.", status);
        }

        OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
        /* tell the system that this daemon is gone */
        if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
            ORTE_ERROR_LOG(rc);
        }

        /* remove the daemon from our local list */
        opal_list_remove_item(&active_daemons, &info->super);
        OBJ_RELEASE(info);
        OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
    }   /* if abnormal exit */

    /* release any waiting threads */
    OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);

    /* first check timing request */
    if (mca_pls_rsh_component.timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "pls_rsh: could not obtain stop time");
        } else {
            deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
                     (launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
            avgtime = avgtime + deltat;
            if (deltat < mintime) {
                mintime = deltat;
                miniter = (unsigned long)info->name->vpid;
            }
            if (deltat > maxtime) {
                maxtime = deltat;
                maxiter = (unsigned long)info->name->vpid;
            }
        }
    }

    if (mca_pls_rsh_component.num_children-- >=
        mca_pls_rsh_component.num_concurrent ||
        mca_pls_rsh_component.num_children == 0) {
        opal_condition_signal(&mca_pls_rsh_component.cond);
    }

    if (mca_pls_rsh_component.timing && mca_pls_rsh_component.num_children == 0) {
        if (0 != gettimeofday(&joblaunchstop, NULL)) {
            opal_output(0, "pls_rsh: could not obtain job launch stop time");
        } else {
            deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
                     (joblaunchstop.tv_usec - joblaunchstart.tv_usec);
            opal_output(0, "pls_rsh: total time to launch job is %lu usec", deltat);
            if (mintime < 999999999) {
                /* had at least one non-local node */
                avgtime = avgtime/opal_list_get_size(&active_daemons);
                opal_output(0, "pls_rsh: average time to launch one daemon %f usec", avgtime);
                opal_output(0, "pls_rsh: min time to launch a daemon was %lu usec for iter %lu",
                            mintime, miniter);
                opal_output(0, "pls_rsh: max time to launch a daemon was %lu usec for iter %lu",
                            maxtime, maxiter);
            } else {
                opal_output(0, "No nonlocal launches to report for timing info");
            }
        }
        free(launchstart);
    }

    OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
}

/**
 * Launch a daemon (bootproxy) on each node. The daemon will be responsible
 * for launching the application.
 */
int orte_pls_rsh_launch(orte_jobid_t jobid)
{
    orte_job_map_t *map;
    opal_list_item_t *n_item;
    orte_mapped_node_t *rmaps_node;
    orte_std_cntr_t num_nodes;
    orte_vpid_t vpid;
    int node_name_index1;
    int node_name_index2;
    int proc_name_index;
    int local_exec_index, local_exec_index_end;
    char *jobid_string = NULL;
    char *uri, *param;
    char **argv = NULL, **tmp;
    char *prefix_dir;
    int argc;
    int rc;
    sigset_t sigs;
    struct passwd *p;
    bool remote_sh = false, remote_csh = false;
    bool local_sh = false, local_csh = false;
    char *lib_base = NULL, *bin_base = NULL;
    orte_pls_daemon_info_t *dmn;
    orte_pls_rsh_shell_t shell;

    if (mca_pls_rsh_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
            opal_output(0, "pls_rsh: could not obtain start time");
            joblaunchstart.tv_sec = 0;
            joblaunchstart.tv_usec = 0;
        }
    }

    /* setup a list that will contain the info for all the daemons
     * so we can store it on the registry when done and use it
     * locally to track their state
     */
    OBJ_CONSTRUCT(&active_daemons, opal_list_t);

    /* Get the map for this job
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know the nodes we are launching on
     * All other mapping responsibilities fall to orted in the fork PLS
     */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&active_daemons);
        return rc;
    }

    /* if the user requested that we re-use daemons,
     * launch the procs on any existing, re-usable daemons
     */
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(map);
            OBJ_DESTRUCT(&active_daemons);
            return rc;
        }
    }

    num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
    if (0 == num_nodes) {
        /* nothing left to do - just return */
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_SUCCESS;
    }

    if (mca_pls_rsh_component.debug_daemons &&
        mca_pls_rsh_component.num_concurrent < num_nodes) {
        /* we can't run in this situation, so pretty print the error
         * and exit
         */
        opal_show_help("help-pls-rsh.txt", "deadlock-params",
                       true, mca_pls_rsh_component.num_concurrent, num_nodes);
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_ERR_FATAL;
    }

    /*
     * After a discussion between Ralph & Jeff, we concluded that we
     * really are handling the prefix dir option incorrectly. It currently
     * is associated with an app_context, yet it really refers to the
     * location where OpenRTE/Open MPI is installed on a NODE. Fixing
     * this right now would involve significant change to orterun as well
     * as elsewhere, so we will intentionally leave this incorrect at this
     * point. The error, however, is identical to that seen in all prior
     * releases of OpenRTE/Open MPI, so our behavior is no worse than before.
     *
     * A note to fix this, along with ideas on how to do so, has been filed
     * on the project's Trac system under "feature enhancement".
     *
     * For now, default to the prefix_dir provided in the first app_context.
     * Since there always MUST be at least one app_context, we are safe in
     * doing this.
     */
    prefix_dir = map->apps[0]->prefix_dir;

    /*
     * Allocate a range of vpids for the daemons.
     */
    if (num_nodes == 0) {
        return ORTE_ERR_BAD_PARAM;
    }
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    /* setup the orted triggers for passing their launch info */
    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* need integer value for command line parameter */
    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* What is our local shell? */
    shell = ORTE_PLS_RSH_SHELL_UNKNOWN;
    p = getpwuid(getuid());
    if (NULL != p) {
        param = p->pw_shell;
        shell = find_shell(p->pw_shell);
    }

    /* If we didn't find it in getpwuid(), try looking at the $SHELL
       environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) */
    if (ORTE_PLS_RSH_SHELL_UNKNOWN == shell &&
        NULL != (param = getenv("SHELL"))) {
        shell = find_shell(param);
    }

    switch (shell) {
    case ORTE_PLS_RSH_SHELL_SH:     /* fall through */
    case ORTE_PLS_RSH_SHELL_KSH:    /* fall through */
    case ORTE_PLS_RSH_SHELL_ZSH:    /* fall through */
    case ORTE_PLS_RSH_SHELL_BASH:
        local_sh = true;
        break;
    case ORTE_PLS_RSH_SHELL_TCSH:   /* fall through */
    case ORTE_PLS_RSH_SHELL_CSH:
        local_csh = true;
        break;
    default:
        opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n",
                    (NULL != param) ? param : "unknown");
        remote_sh = true;
        break;
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: local csh: %d, local sh: %d\n",
                    local_csh, local_sh);
    }

    /* What is our remote shell? */
    if (mca_pls_rsh_component.assume_same_shell) {
        remote_sh = local_sh;
        remote_csh = local_csh;
        if (mca_pls_rsh_component.debug) {
            opal_output(0, "pls:rsh: assuming same remote shell as local shell");
        }
    } else {
        orte_pls_rsh_shell_t shell;
        rmaps_node = (orte_mapped_node_t*)opal_list_get_first(&map->nodes);
        rc = orte_pls_rsh_probe(rmaps_node, &shell);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        switch (shell) {
        case ORTE_PLS_RSH_SHELL_SH:     /* fall through */
        case ORTE_PLS_RSH_SHELL_KSH:    /* fall through */
        case ORTE_PLS_RSH_SHELL_ZSH:    /* fall through */
        case ORTE_PLS_RSH_SHELL_BASH:
            remote_sh = true;
            break;
        case ORTE_PLS_RSH_SHELL_TCSH:   /* fall through */
        case ORTE_PLS_RSH_SHELL_CSH:
            remote_csh = true;
            break;
        default:
            opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
            remote_sh = true;
        }
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: remote csh: %d, remote sh: %d\n",
                    remote_csh, remote_sh);
    }

    /*
     * Build argv array
     */
    argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
    argc = mca_pls_rsh_component.agent_argc;
    node_name_index1 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* Do we need to source .profile on the remote side? */
    if (!(remote_csh || remote_sh)) {
        int i;
        tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
        if (NULL == tmp) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        for (i = 0; NULL != tmp[i]; ++i) {
            opal_argv_append(&argc, &argv, tmp[i]);
        }
        opal_argv_free(tmp);
    }

    /* add the daemon command (as specified by user) */
    local_exec_index = argc;
    opal_argv_append(&argc, &argv, mca_pls_rsh_component.orted);

    /* check for debug flags */
    orte_pls_base_mca_argv(&argc, &argv);

    opal_argv_append(&argc, &argv, "--bootproxy");
    opal_argv_append(&argc, &argv, jobid_string);
    opal_argv_append(&argc, &argv, "--name");
    proc_name_index = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* tell the daemon how many procs are in the daemon's job */
    opal_argv_append(&argc, &argv, "--num_procs");
    asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
    opal_argv_append(&argc, &argv, param);
    free(param);

    /* tell the daemon the starting vpid of the daemon's job */
    opal_argv_append(&argc, &argv, "--vpid_start");
    opal_argv_append(&argc, &argv, "0");

    opal_argv_append(&argc, &argv, "--nodename");
    node_name_index2 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* pass along the universe name and location info */
    opal_argv_append(&argc, &argv, "--universe");
    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
             orte_universe_info.host, orte_universe_info.name);
    opal_argv_append(&argc, &argv, param);
    free(param);