⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pls_rsh_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 4 页
字号:
/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana *                         University Research and Technology *                         Corporation.  All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University *                         of Tennessee Research Foundation.  All rights *                         reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, *                         University of Stuttgart.  All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. *                         All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved. * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights *                         reserved.  * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ * * These symbols are in a file by themselves to provide nice linker * semantics.  Since linkers generally pull in symbols by object * files, keeping these symbols as the only symbols in this file * prevents utility programs such as "ompi_info" from having to import * entire components just to query their version and parameters. */#include "orte_config.h"#include "orte/orte_constants.h"#include <stdlib.h>#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#include <errno.h>#include <string.h>#ifdef HAVE_STRINGS_H#include <strings.h>#endif#ifdef HAVE_SYS_SELECT_H#include <sys/select.h>#endif#ifdef HAVE_SYS_TIME_H#include <sys/time.h>#endif#ifdef HAVE_SYS_TYPES_H#include <sys/types.h>#endif#ifdef HAVE_SYS_STAT_H#include <sys/stat.h>#endif#ifdef HAVE_SYS_WAIT_H#include <sys/wait.h>#endif#include <fcntl.h>#include <signal.h>#ifdef HAVE_PWD_H#include <pwd.h>#endif#include "opal/mca/installdirs/installdirs.h"#include "opal/mca/base/mca_base_param.h"#include "opal/util/if.h"#include "opal/util/os_path.h"#include "opal/util/path.h"#include "opal/event/event.h"#include "opal/util/show_help.h"#include "opal/util/argv.h"#include "opal/util/opal_environ.h"#include "opal/util/output.h"#include "opal/util/trace.h"#include "opal/util/basename.h"#include "orte/util/sys_info.h"#include "orte/util/univ_info.h"#include "orte/util/session_dir.h"#include "orte/runtime/orte_wait.h"#include "orte/dss/dss.h"#include "orte/mca/ns/ns.h"#include "orte/mca/rml/rml.h"#include "orte/mca/gpr/gpr.h"#include "orte/mca/errmgr/errmgr.h"#include "orte/mca/ras/ras_types.h"#include "orte/mca/rmaps/rmaps.h"#include "orte/mca/smr/smr.h"#include "orte/mca/pls/pls.h"#include "orte/mca/pls/base/base.h"#include "orte/mca/pls/base/pls_private.h"#include "orte/mca/pls/rsh/pls_rsh.h"#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADSstatic int orte_pls_rsh_launch_threaded(orte_jobid_t jobid);#endiforte_pls_base_module_t orte_pls_rsh_module = {#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS    orte_pls_rsh_launch_threaded,#else    orte_pls_rsh_launch,#endif    orte_pls_rsh_terminate_job,    orte_pls_rsh_terminate_orteds,    orte_pls_rsh_terminate_proc,    orte_pls_rsh_signal_job,    orte_pls_rsh_signal_proc,    orte_pls_rsh_cancel_operation,    orte_pls_rsh_finalize};typedef enum {    ORTE_PLS_RSH_SHELL_BASH = 0,    ORTE_PLS_RSH_SHELL_ZSH,    ORTE_PLS_RSH_SHELL_TCSH,    ORTE_PLS_RSH_SHELL_CSH,    ORTE_PLS_RSH_SHELL_KSH,    ORTE_PLS_RSH_SHELL_SH,    ORTE_PLS_RSH_SHELL_UNKNOWN} orte_pls_rsh_shell_t;/* These strings *must* follow the same order as the enum   ORTE_PLS_RSH_SHELL_* */static const char * orte_pls_rsh_shell_name[] = {    "bash",    "zsh",    "tcsh",       /* tcsh has to be first otherwise strstr finds csh */    "csh",    "ksh",    "sh",    "unknown"};/* * Local functions */static void set_handler_default(int sig);static orte_pls_rsh_shell_t find_shell(char *shell);/* local global storage of timing variables */static unsigned long  mintime=999999999, miniter, maxtime=0, maxiter;static float avgtime=0.0;static struct timeval *launchstart;   static struct timeval joblaunchstart, joblaunchstop;/* local global storage of the list of active daemons */static opal_list_t active_daemons;/** * Check the Shell variable on the specified node */static int orte_pls_rsh_probe(orte_mapped_node_t *node,                               orte_pls_rsh_shell_t *shell){    char ** argv;    int argc, rc, nfds, i;    int fd[2];    pid_t pid;    fd_set readset;    fd_set errset;    char outbuf[4096];    if (mca_pls_rsh_component.debug) {        opal_output(0, "pls:rsh: going to check SHELL variable on node %s\n",                    node->nodename);    }    *shell = ORTE_PLS_RSH_SHELL_UNKNOWN;    /*     * Build argv array     */    argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);    argc = mca_pls_rsh_component.agent_argc;    opal_argv_append(&argc, &argv, node->nodename);    opal_argv_append(&argc, &argv, "echo $SHELL");    if (pipe(fd)) {        opal_output(0, "pls:rsh: pipe failed with errno=%d\n", errno);        return ORTE_ERR_IN_ERRNO;    }    if ((pid = fork()) < 0) {        opal_output(0, "pls:rsh: fork failed with errno=%d\n", errno);        return ORTE_ERR_IN_ERRNO;    }    else if (pid == 0) {          /* child */        if (dup2(fd[1], 1) < 0) {            opal_output(0, "pls:rsh: dup2 failed with errno=%d\n", errno);            exit(01);        }        execvp(argv[0], argv);        exit(errno);    }    if (close(fd[1])) {        opal_output(0, "pls:rsh: close failed with errno=%d\n", errno);        return ORTE_ERR_IN_ERRNO;    }    /* Monitor stdout */    FD_ZERO(&readset);    nfds = fd[0]+1;    memset (outbuf, 0, sizeof (outbuf));    rc = ORTE_SUCCESS;;    while (ORTE_SUCCESS == rc) {        int err;        FD_SET (fd[0], &readset);        errset = readset;        err = select(nfds, &readset, NULL, &errset, NULL);        if (err == -1) {            if (errno == EINTR)                continue;            else {                rc = ORTE_ERR_IN_ERRNO;                break;            }        }        if (FD_ISSET(fd[0], &errset) != 0)            rc = ORTE_ERR_FATAL;        /* In case we have something valid to read on stdin */        if (FD_ISSET(fd[0], &readset) != 0) {            ssize_t ret = 1;            char temp[4096];            char * ptr = outbuf;            ssize_t outbufsize = sizeof(outbuf);            memset (temp, 0, sizeof(temp));            while (ret != 0) {                ret = read (fd[0], temp, 256);                if (ret < 0) {                    if (errno == EINTR)                        continue;                    else {                        rc = ORTE_ERR_IN_ERRNO;                        break;                    }                }                else {                    if (outbufsize > 0) {                        memcpy (ptr, temp, (ret > outbufsize) ? outbufsize : ret);                        outbufsize -= ret;                        ptr += ret;                        if (outbufsize > 0)                            *ptr = '\0';                    }                }            }            /* After reading complete string (aka read returns 0), we just break */            break;        }    }    /* Search for the substring of known shell-names */    for (i = 0; i < (int)(sizeof (orte_pls_rsh_shell_name)/                          sizeof(orte_pls_rsh_shell_name[0])); i++) {        char *sh_name = NULL;        sh_name = rindex(outbuf, '/');        if ( sh_name != NULL ) {            sh_name++; /* skip '/' */                        /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore             * we have to remove the "\n" */            if ( sh_name[strlen(sh_name)-1] == '\n' ) {                sh_name[strlen(sh_name)-1] = '\0';            }            if ( 0 == strcmp(sh_name, orte_pls_rsh_shell_name[i]) ) {                *shell = i;                break;            }        }    }    if (mca_pls_rsh_component.debug) {        opal_output(0, "pls:rsh: node:%s has SHELL: %s\n",                    node->nodename, orte_pls_rsh_shell_name[*shell]);    }    return rc;}/** * Fill the exec_path variable with the directory to the orted */static int orte_pls_rsh_fill_exec_path ( char ** exec_path){    struct stat buf;    asprintf(exec_path, "%s/orted", opal_install_dirs.bindir);    if (0 != stat(*exec_path, &buf)) {        char *path = getenv("PATH");        if (NULL == path) {            path = ("PATH is empty!");        }        opal_show_help("help-pls-rsh.txt", "no-local-orted",                        true, path, opal_install_dirs.bindir);        return ORTE_ERR_NOT_FOUND;    }   return ORTE_SUCCESS;}/** * Callback on daemon exit. */static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata){    orte_pls_daemon_info_t *info = (orte_pls_daemon_info_t*) cbdata;    orte_mapped_node_t *node;    orte_mapped_proc_t *proc;    opal_list_item_t *item;    int rc;    unsigned long deltat;    struct timeval launchstop;    /* if ssh exited abnormally, set the child processes to aborted       and print something useful to the user.  The usual reasons for       ssh to exit abnormally all are a pretty good indication that       the child processes aren't going to start up properly.       This should somehow be pushed up to the calling level, but we       don't really have a way to do that just yet.    */    if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {        /* get the mapping for our node so we can cancel the right things */        rc = orte_rmaps.get_node_map(&node, info->cell,                                     info->nodename, info->active_job);        if (ORTE_SUCCESS != rc) {            ORTE_ERROR_LOG(rc);            goto cleanup;        }        /* set state of all processes associated with the daemon as           terminated */        for(item =  opal_list_get_first(&node->procs);            item != opal_list_get_end(&node->procs);            item =  opal_list_get_next(item)) {            proc = (orte_mapped_proc_t*) item;                /* Clean up the session directory as if we were the                   process itself.  This covers the case where the                   process died abnormally and didn't cleanup its own                   session directory. */                orte_session_dir_finalize(&(proc->name));                rc = orte_smr.set_proc_state(&(proc->name),

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -