📄 pls_rsh_module.c
字号:
/* setup ns contact info */ opal_argv_append(&argc, &argv, "--nsreplica"); if (NULL != orte_process_info.ns_replica_uri) { uri = strdup(orte_process_info.ns_replica_uri); } else { uri = orte_rml.get_uri(); } asprintf(&param, "\"%s\"", uri); opal_argv_append(&argc, &argv, param); free(uri); free(param); /* setup gpr contact info */ opal_argv_append(&argc, &argv, "--gprreplica"); if (NULL != orte_process_info.gpr_replica_uri) { uri = strdup(orte_process_info.gpr_replica_uri); } else { uri = orte_rml.get_uri(); } asprintf(&param, "\"%s\"", uri); opal_argv_append(&argc, &argv, param); free(uri); free(param); local_exec_index_end = argc; if (!(remote_csh || remote_sh)) { opal_argv_append(&argc, &argv, ")"); } if (mca_pls_rsh_component.debug) { param = opal_argv_join(argv, ' '); if (NULL != param) { opal_output(0, "pls:rsh: final template argv:"); opal_output(0, "pls:rsh: %s", param); free(param); } } /* Figure out the basenames for the libdir and bindir. This requires some explanation: - Use opal_install_dirs.libdir and opal_install_dirs.bindir instead of -D'ing some macros in this directory's Makefile.am because it makes all the dependencies work out correctly. These are defined in opal/install_dirs.h. - After a discussion on the devel-core mailing list, the developers decided that we should use the local directory basenames as the basis for the prefix on the remote node. This does not handle a few notable cases (e.g., if the libdir/bindir is not simply a subdir under the prefix, if the libdir/bindir basename is not the same on the remote node as it is here in the local node, etc.), but we decided that --prefix was meant to handle "the common case". If you need something more complex than this, a) edit your shell startup files to set PATH/LD_LIBRARY_PATH properly on the remote node, or b) use some new/to-be-defined options that explicitly allow setting the bindir/libdir on the remote node. 
We decided to implement these options (e.g., --remote-bindir and --remote-libdir) to orterun when it actually becomes a problem for someone (vs. a hypothetical situation). Hence, for now, we simply take the basename of this install's libdir and bindir and use it to append this install's prefix and use that on the remote node. */ lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); /* * Iterate through each of the nodes */ if (mca_pls_rsh_component.timing) { /* allocate space to track the start times */ launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval)); } for(n_item = opal_list_get_first(&map->nodes); n_item != opal_list_get_end(&map->nodes); n_item = opal_list_get_next(n_item)) { orte_process_name_t* name; pid_t pid; char *exec_path; char **exec_argv; rmaps_node = (orte_mapped_node_t*)n_item; if (mca_pls_rsh_component.timing) { if (0 != gettimeofday(&launchstart[vpid], NULL)) { opal_output(0, "pls_rsh: could not obtain start time"); } } /* new daemon - setup to record its info */ dmn = OBJ_NEW(orte_pls_daemon_info_t); dmn->active_job = jobid; opal_list_append(&active_daemons, &dmn->super); /* setup node name */ free(argv[node_name_index1]); if (NULL != rmaps_node->username && 0 != strlen (rmaps_node->username)) { asprintf (&argv[node_name_index1], "%s@%s", rmaps_node->username, rmaps_node->nodename); } else { argv[node_name_index1] = strdup(rmaps_node->nodename); } free(argv[node_name_index2]); argv[node_name_index2] = strdup(rmaps_node->nodename); /* save it in the daemon info */ dmn->nodename = strdup(rmaps_node->nodename); /* initialize daemons process name */ rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } /* save it in the daemon info */ dmn->cell = rmaps_node->cell; if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* fork a child 
to exec the rsh/ssh session */ /* set the process state to "launched" */ if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) { ORTE_ERROR_LOG(rc); goto cleanup; } pid = fork(); if (pid < 0) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto cleanup; } /* child */ if (pid == 0) { char* name_string; char** env; char* var; long fd, fdmax = sysconf(_SC_OPEN_MAX); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: launching on node %s\n", rmaps_node->nodename); } /* We don't need to sense an oversubscribed condition and set the sched_yield * for the node as we are only launching the daemons at this time. The daemons * are now smart enough to set the oversubscribed condition themselves when * they launch the local procs. */ /* Is this a local launch? * * Not all node names may be resolvable (if we found * localhost in the hostfile, for example). So first * check trivial case of node_name being same as the * current nodename, which must be local. If that doesn't * match, check using ifislocal(). 
*/ if (!mca_pls_rsh_component.force_rsh && (0 == strcmp(rmaps_node->nodename, orte_system_info.nodename) || opal_ifislocal(rmaps_node->nodename))) { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: %s is a LOCAL node\n", rmaps_node->nodename); } if (mca_pls_rsh_component.timing) { /* since this is a local launch, the daemon will never reach * the waitpid callback - so set the start value to * something nonsensical */ launchstart[vpid].tv_sec = 0; launchstart[vpid].tv_usec = 0; } exec_path = opal_path_findv(argv[local_exec_index], 0, environ, NULL); if (NULL == exec_path && NULL == prefix_dir) { rc = orte_pls_rsh_fill_exec_path (&exec_path); if (ORTE_SUCCESS != rc) { exit(-1); /* the forked process MUST exit */ } } else { if (NULL != prefix_dir) { exec_path = opal_os_path( false, prefix_dir, bin_base, "orted", NULL ); } /* If we yet did not fill up the execpath, do so now */ if (NULL == exec_path) { rc = orte_pls_rsh_fill_exec_path (&exec_path); if (ORTE_SUCCESS != rc) { exit(-1); /* the forked process MUST exit */ } } } /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH environment variables. We're already in the child process, so it's ok to modify environ. 
*/ if (NULL != prefix_dir) { char *oldenv, *newenv; /* Reset PATH */ newenv = opal_os_path( false, prefix_dir, bin_base, NULL ); oldenv = getenv("PATH"); if (NULL != oldenv) { char *temp; asprintf(&temp, "%s:%s", newenv, oldenv ); free( newenv ); newenv = temp; } opal_setenv("PATH", newenv, true, &environ); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: reset PATH: %s", newenv); } free(newenv); /* Reset LD_LIBRARY_PATH */ newenv = opal_os_path( false, prefix_dir, lib_base, NULL ); oldenv = getenv("LD_LIBRARY_PATH"); if (NULL != oldenv) { char* temp; asprintf(&temp, "%s:%s", newenv, oldenv); free(newenv); newenv = temp; } opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: reset LD_LIBRARY_PATH: %s", newenv); } free(newenv); } /* Since this is a local execution, we need to potentially whack the final ")" in the argv (if sh/csh conditionals, from above). Note that we're modifying the argv[] in the child process, so there's no need to save this and restore it afterward -- the parent's argv[] is unmodified. */ if (NULL != argv[local_exec_index_end]) { free(argv[local_exec_index_end]); argv[local_exec_index_end] = NULL; } /* tell the daemon to setup its own process session/group */ opal_argv_append(&argc, &argv, "--set-sid"); exec_argv = &argv[local_exec_index]; /* Finally, chdir($HOME) because we're making the assumption that this is what will happen on remote nodes (via rsh/ssh). This allows a user to specify a path that is relative to $HOME for both the cwd and argv[0] and it will work on all nodes -- including the local host. Otherwise, it would work on remote nodes and not the local node. If the user does not start in $HOME on the remote nodes... well... let's hope they start in $HOME. :-) */ var = getenv("HOME"); if (NULL != var) { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: changing to directory %s", var); } /* Ignore errors -- what are we going to do? 
(and we ignore errors on the remote nodes in the fork pls, so this is consistent) */ chdir(var); } } else { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: %s is a REMOTE node\n", rmaps_node->nodename); } exec_argv = argv; exec_path = strdup(mca_pls_rsh_component.agent_path); if (NULL != prefix_dir) { char *opal_prefix = getenv("OPAL_PREFIX"); if (remote_sh) { asprintf (&argv[local_exec_index], "%s%s%s PATH=%s/%s:$PATH ; export PATH ; " "LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; " "%s/%s/%s", (opal_prefix != NULL ? "OPAL_PREFIX=" : ""), (opal_prefix != NULL ? opal_prefix : ""), (opal_prefix != NULL ? " ;" : ""), prefix_dir, bin_base, prefix_dir, lib_base, prefix_dir, bin_base, mca_pls_rsh_component.orted); } if (remote_csh) { /* [t]csh is a bit more challenging -- we have to check whether LD_LIBRARY_PATH is already set before we try to set it. Must be very careful about obeying [t]csh's order of evaluation and not using a variable before it is defined. See this thread for more details: http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ asprintf (&argv[local_exec_index], "%s%s%s set path = ( %s/%s $path ) ; " "if ( $?LD_LIBRARY_PATH == 1 ) " "set OMPI_have_llp ; " "if ( $?LD_LIBRARY_PATH == 0 ) " "setenv LD_LIBRARY_PATH %s/%s ; " "if ( $?OMPI_have_llp == 1 ) " "setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; " "%s/%s/%s", (opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""), (opal_prefix != NULL ? opal_prefix : ""), (opal_prefix != NULL ? 
" ;" : ""), prefix_dir, bin_base, prefix_dir, lib_base, prefix_dir, lib_base, prefix_dir, bin_base, mca_pls_rsh_component.orted); } } } /* setup process name */ rc = orte_ns.get_proc_name_string(&name_string, name); if (ORTE_SUCCESS != rc) { opal_output(0, "orte_pls_rsh: unable to create process name"); exit(-1); } free(argv[proc_name_index]); argv[proc_name_index] = strdup(name_string); if (!mca_pls_rsh_component.debug) { /* setup stdin */ int fd = open("/dev/null", O_RDWR); dup2(fd, 0); close(fd);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -