📄 pls_rsh_module.c
字号:
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 *
 * These symbols are in a file by themselves to provide nice linker
 * semantics.  Since linkers generally pull in symbols by object
 * files, keeping these symbols as the only symbols in this file
 * prevents utility programs such as "ompi_info" from having to import
 * entire components just to query their version and parameters.
 */

#include "orte_config.h"
#include "orte/orte_constants.h"

#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <fcntl.h>
#include <signal.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif

#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/event/event.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "opal/util/basename.h"

#include "orte/util/sys_info.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
#include "orte/dss/dss.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/ras_types.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "orte/mca/pls/rsh/pls_rsh.h"

#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid);
#endif

/*
 * Function table exported by the rsh launcher.  The launch entry is the
 * threaded variant only when all three thread-related build macros are set.
 */
orte_pls_base_module_t orte_pls_rsh_module = {
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
    orte_pls_rsh_launch_threaded,
#else
    orte_pls_rsh_launch,
#endif
    orte_pls_rsh_terminate_job,
    orte_pls_rsh_terminate_orteds,
    orte_pls_rsh_terminate_proc,
    orte_pls_rsh_signal_job,
    orte_pls_rsh_signal_proc,
    orte_pls_rsh_cancel_operation,
    orte_pls_rsh_finalize
};

/* Shells recognised by the probe; values index orte_pls_rsh_shell_name[]. */
typedef enum {
    ORTE_PLS_RSH_SHELL_BASH = 0,
    ORTE_PLS_RSH_SHELL_ZSH,
    ORTE_PLS_RSH_SHELL_TCSH,
    ORTE_PLS_RSH_SHELL_CSH,
    ORTE_PLS_RSH_SHELL_KSH,
    ORTE_PLS_RSH_SHELL_SH,
    ORTE_PLS_RSH_SHELL_UNKNOWN
} orte_pls_rsh_shell_t;

/* These strings *must* follow the same order as the enum
   ORTE_PLS_RSH_SHELL_* */
static const char * orte_pls_rsh_shell_name[] = {
    "bash",
    "zsh",
    "tcsh",       /* listed before csh; the probe below compares with an
                     exact strcmp, so only the enum order matters */
    "csh",
    "ksh",
    "sh",
    "unknown"
};

/*
 * Local functions
 */
static void set_handler_default(int sig);
static orte_pls_rsh_shell_t find_shell(char *shell);

/* local global storage of timing variables */
static unsigned long mintime=999999999, miniter, maxtime=0, maxiter;
static float avgtime=0.0;
static struct timeval *launchstart;
static struct timeval joblaunchstart, joblaunchstop;

/* local global storage of the list of active daemons */
static opal_list_t active_daemons;

/**
 * Release the resources held by the parent side of a shell probe.
 *
 * Closes each descriptor that is >= 0 and frees the argv copy.  errno is
 * saved and restored so that a caller about to return ORTE_ERR_IN_ERRNO
 * does not have its error code overwritten by the cleanup calls.
 */
static void orte_pls_rsh_probe_release(char **argv, int rfd, int wfd)
{
    int saved_errno = errno;

    if (rfd >= 0) {
        close(rfd);
    }
    if (wfd >= 0) {
        close(wfd);
    }
    opal_argv_free(argv);
    errno = saved_errno;
}

/**
 * Read from fd until end-of-file, keeping the output in buf.
 *
 * At most bufsize-1 bytes are stored and buf is always NUL-terminated;
 * anything beyond that is still read (so the pipe is drained to EOF) but
 * dropped.  bufsize must be at least 1.
 *
 * @return ORTE_SUCCESS on EOF, ORTE_ERR_IN_ERRNO if read() fails with
 *         anything other than EINTR.
 */
static int orte_pls_rsh_probe_drain(int fd, char *buf, size_t bufsize)
{
    char chunk[256];
    size_t used = 0;
    ssize_t ret;

    buf[0] = '\0';
    for (;;) {
        ret = read(fd, chunk, sizeof(chunk));
        if (ret < 0) {
            if (EINTR == errno) {
                continue;
            }
            return ORTE_ERR_IN_ERRNO;
        }
        if (0 == ret) {
            return ORTE_SUCCESS;
        }
        if (used < bufsize - 1) {
            size_t room = bufsize - 1 - used;
            size_t n = ((size_t) ret < room) ? (size_t) ret : room;

            memcpy(buf + used, chunk, n);
            used += n;
            buf[used] = '\0';
        }
    }
}

/**
 * Check the Shell variable on the specified node
 *
 * Runs the configured agent (mca_pls_rsh_component.agent_argv) with the
 * node name and the command "echo $SHELL", captures the child's stdout
 * through a pipe and matches the last path component of the output
 * against orte_pls_rsh_shell_name[].
 *
 * @param node   node to probe; only node->nodename is read here
 * @param shell  (OUT) matched shell, or ORTE_PLS_RSH_SHELL_UNKNOWN
 *
 * @return ORTE_SUCCESS if the output was read to EOF (even when no known
 *         shell matched); ORTE_ERR_IN_ERRNO if pipe/fork/close/select/read
 *         failed; ORTE_ERR_FATAL if select flagged an exceptional
 *         condition on the pipe.
 *
 * NOTE(review): the forked child is not waited for in this function;
 * presumably it is reaped elsewhere -- confirm.
 */
static int orte_pls_rsh_probe(orte_mapped_node_t *node, orte_pls_rsh_shell_t *shell)
{
    char ** argv;
    int argc, rc, nfds, i;
    int fd[2];
    pid_t pid;
    fd_set readset;
    fd_set errset;
    char outbuf[4096];
    char *sh_name;

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: going to check SHELL variable on node %s\n",
                    node->nodename);
    }
    *shell = ORTE_PLS_RSH_SHELL_UNKNOWN;

    /*
     * Build argv array
     */
    argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
    argc = mca_pls_rsh_component.agent_argc;
    opal_argv_append(&argc, &argv, node->nodename);
    opal_argv_append(&argc, &argv, "echo $SHELL");

    if (pipe(fd)) {
        opal_output(0, "pls:rsh: pipe failed with errno=%d\n", errno);
        orte_pls_rsh_probe_release(argv, -1, -1);
        return ORTE_ERR_IN_ERRNO;
    }
    if ((pid = fork()) < 0) {
        opal_output(0, "pls:rsh: fork failed with errno=%d\n", errno);
        orte_pls_rsh_probe_release(argv, fd[0], fd[1]);
        return ORTE_ERR_IN_ERRNO;
    }
    else if (pid == 0) {          /* child */
        if (dup2(fd[1], 1) < 0) {
            opal_output(0, "pls:rsh: dup2 failed with errno=%d\n", errno);
            exit(01);
        }
        execvp(argv[0], argv);
        exit(errno);
    }
    if (close(fd[1])) {
        opal_output(0, "pls:rsh: close failed with errno=%d\n", errno);
        /* the write end is in an unspecified state after a failed close,
           so only the read end is released here */
        orte_pls_rsh_probe_release(argv, fd[0], -1);
        return ORTE_ERR_IN_ERRNO;
    }

    /* Monitor stdout */
    FD_ZERO(&readset);
    nfds = fd[0]+1;

    memset (outbuf, 0, sizeof (outbuf));
    rc = ORTE_SUCCESS;
    while (ORTE_SUCCESS == rc) {
        int err;

        FD_SET (fd[0], &readset);
        errset = readset;
        err = select(nfds, &readset, NULL, &errset, NULL);
        if (err == -1) {
            if (errno == EINTR) {
                continue;
            }
            rc = ORTE_ERR_IN_ERRNO;
            break;
        }
        if (FD_ISSET(fd[0], &errset) != 0) {
            rc = ORTE_ERR_FATAL;
        }
        /* In case we have something valid to read from the child's stdout */
        if (FD_ISSET(fd[0], &readset) != 0) {
            int drain_rc = orte_pls_rsh_probe_drain(fd[0], outbuf, sizeof(outbuf));

            if (ORTE_SUCCESS != drain_rc) {
                rc = drain_rc;
            }
            /* After reading complete string (aka read returns 0), we just break */
            break;
        }
    }

    /* Done with the pipe and with our copy of the agent argv */
    orte_pls_rsh_probe_release(argv, fd[0], -1);

    /* Search for the substring of known shell-names */
    sh_name = strrchr(outbuf, '/');
    if ( sh_name != NULL ) {
        size_t len;

        sh_name++; /* skip '/' */
        /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore
         * we have to remove the "\n" */
        len = strlen(sh_name);
        if ( len > 0 && sh_name[len-1] == '\n' ) {
            sh_name[len-1] = '\0';
        }
        for (i = 0; i < (int)(sizeof (orte_pls_rsh_shell_name)/
                              sizeof(orte_pls_rsh_shell_name[0])); i++) {
            if ( 0 == strcmp(sh_name, orte_pls_rsh_shell_name[i]) ) {
                *shell = (orte_pls_rsh_shell_t) i;
                break;
            }
        }
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: node:%s has SHELL: %s\n",
                    node->nodename, orte_pls_rsh_shell_name[*shell]);
    }
    return rc;
}

/**
 * Fill the exec_path variable with the directory to the orted
 *
 * Builds "<bindir>/orted" from opal_install_dirs.bindir and checks with
 * stat() that it exists.
 *
 * @param exec_path  (OUT) newly allocated path string.  Set to NULL when
 *                   the allocation itself fails; left pointing at the
 *                   allocated string when the file is not found.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the path could not be
 *         allocated, or ORTE_ERR_NOT_FOUND (after showing the
 *         "no-local-orted" help message) if stat() fails.
 */
static int orte_pls_rsh_fill_exec_path ( char ** exec_path)
{
    struct stat buf;

    if (asprintf(exec_path, "%s/orted", opal_install_dirs.bindir) < 0) {
        /* the pointer is indeterminate after a failed asprintf; do not
           hand it to stat() */
        *exec_path = NULL;
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (0 != stat(*exec_path, &buf)) {
        char *path = getenv("PATH");
        if (NULL == path) {
            path = ("PATH is empty!");
        }
        opal_show_help("help-pls-rsh.txt", "no-local-orted",
                       true, path, opal_install_dirs.bindir);
        return ORTE_ERR_NOT_FOUND;
    }
    return ORTE_SUCCESS;
}

/**
 * Callback on daemon exit.
 */
static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
    orte_pls_daemon_info_t *info = (orte_pls_daemon_info_t*) cbdata;
    orte_mapped_node_t *node;
    orte_mapped_proc_t *proc;
    opal_list_item_t *item;
    int rc;
    unsigned long deltat;
    struct timeval launchstop;

    /* if ssh exited abnormally, set the child processes to aborted
       and print something useful to the user.  The usual reasons for
       ssh to exit abnormally all are a pretty good indication that
       the child processes aren't going to start up properly.

       This should somehow be pushed up to the calling level, but we
       don't really have a way to do that just yet.
    */
    if (! WIFEXITED(status) || !
WEXITSTATUS(status) == 0) { /* get the mapping for our node so we can cancel the right things */ rc = orte_rmaps.get_node_map(&node, info->cell, info->nodename, info->active_job); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } /* set state of all processes associated with the daemon as terminated */ for(item = opal_list_get_first(&node->procs); item != opal_list_get_end(&node->procs); item = opal_list_get_next(item)) { proc = (orte_mapped_proc_t*) item; /* Clean up the session directory as if we were the process itself. This covers the case where the process died abnormally and didn't cleanup its own session directory. */ orte_session_dir_finalize(&(proc->name)); rc = orte_smr.set_proc_state(&(proc->name),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -