⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pls_poe_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising …)
💻 C
📖 第 1 页 / 共 2 页
字号:
/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana *                         University Research and Technology *                         Corporation.  All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University *                         of Tennessee Research Foundation.  All rights *                         reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, *                         University of Stuttgart.  All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. *                         All rights reserved. * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights *                         reserved.  * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ * * These symbols are in a file by themselves to provide nice linker * semantics.  Since linkers generally pull in symbols by object * files, keeping these symbols as the only symbols in this file * prevents utility programs such as "ompi_info" from having to import * entire components just to query their version and parameters. 
*/#include "orte_config.h"#include "orte/orte_constants.h"#include <fcntl.h>#include <errno.h>#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_SYS_TIME_H#include <sys/time.h>#endif#include "opal/mca/base/mca_base_param.h"#include "opal/util/argv.h"#include "opal/util/opal_environ.h"#include "orte/mca/errmgr/errmgr.h"#include "orte/mca/gpr/gpr.h"#include "orte/mca/ns/ns.h"#include "orte/mca/rmaps/rmaps.h"#include "orte/mca/rmgr/rmgr.h"#include "orte/mca/rml/rml.h"#include "orte/mca/smr/smr.h"#include "orte/util/univ_info.h"#include "orte/util/session_dir.h"#include "orte/runtime/orte_wait.h"/* remove for ORTE 2.0 */#include "orte/mca/sds/base/base.h"#include "orte/mca/pls/pls.h"#include "orte/mca/pls/poe/pls_poe.h"/* * Local functions */static int pls_poe_launch_job(orte_jobid_t jobid);static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);static int pls_poe_terminate_proc(const orte_process_name_t *name);static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);static int pls_poe_finalize(void);static int pls_poe_cancel_operation(void);orte_pls_base_module_t orte_pls_poe_module = {    pls_poe_launch_job,    pls_poe_terminate_job,    pls_poe_terminate_orteds,    pls_poe_terminate_proc,    pls_poe_signal_job,    pls_poe_signal_proc,    pls_poe_cancel_operation,    pls_poe_finalize};/**poe_set_handler_default - set signal handler to default@param sig signal [IN]*/static void poe_set_handler_default(int sig){    struct sigaction act;    act.sa_handler = SIG_DFL;    act.sa_flags = 0;    sigemptyset(&act.sa_mask);    sigaction(sig, &act, (struct sigaction *)0);}/**poe_argv_append_int - append integer variable to argument variable@param argc argument count [OUT]@param argv argument variable [OUT]@param varname 
variable name [IN]@param min minimum value [IN]@param argname argument name [IN]*/static inline int poe_argv_append_int(int *argc, char ***argv, int varname, int min, char *argname){    char *tmp_string;    if(varname >= min) {        opal_argv_append(argc, argv, argname);        asprintf(&tmp_string, "%d", varname);        opal_argv_append(argc, argv, tmp_string);        free(tmp_string);    } else {        return ORTE_ERR_BAD_PARAM;    }    return ORTE_SUCCESS;}/**@warning - THIS FUNCTION IS NOT USED. IT WILL BE USED WHEN FAULT-TOLERANCE FEATURE IS NEEDED*/#ifdef __FOR_LATERint pls_poe_launch_interactive_orted(orte_jobid_t jobid){    opal_list_t nodes, mapping_list;    opal_list_item_t* item;    orte_std_cntr_t num_nodes;    orte_vpid_t vpid;    int node_name_index1;    int node_name_index2;    int proc_name_index;    char *tmp_string;    char *uri, *param;    char* name_string;    char** argv;    int argc;    int pid;    int rc;    int i;    int status;    FILE *hfp, *cfp;    /* Query the list of nodes allocated and mapped to this job.     * We need the entire mapping for a couple of reasons:     *  - need the prefix to start with.     *  - need to know if we are launching on a subset of the allocated nodes     * All other mapping responsibilities fall to orted in the fork PLS     */    if((mca_pls_poe_component.hostfile=tempnam(NULL,NULL))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;    if((mca_pls_poe_component.cmdfile=tempnam(NULL,NULL))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;    if((hfp=fopen(mca_pls_poe_component.hostfile,"w"))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;    if((cfp=fopen(mca_pls_poe_component.cmdfile,"w"))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;    OBJ_CONSTRUCT(&nodes, opal_list_t);    OBJ_CONSTRUCT(&mapping_list, opal_list_t);    rc = orte_rmaps_base_mapped_node_query(&mapping_list, &nodes, jobid);    if(ORTE_SUCCESS != rc) {        goto cleanup;    }    /*     * Allocate a range of vpids for the daemons.     
*/    num_nodes = opal_list_get_size(&nodes);    if(num_nodes == 0) {        return ORTE_ERR_BAD_PARAM;    }    rc = orte_ns.reserve_range(0, num_nodes, &vpid);    if(ORTE_SUCCESS != rc) {        goto cleanup;    }    /* application */    argv = opal_argv_copy(opal_argv_split(mca_pls_poe_component.orted, ' '));    argc = opal_argv_count(argv);    if (mca_pls_poe_component.debug) {         opal_argv_append(&argc, &argv, "--debug");    }    opal_argv_append(&argc, &argv, "--debug-daemons");    opal_argv_append(&argc, &argv, "--no-daemonize");    opal_argv_append(&argc, &argv, "--bootproxy");    /* need integer value for command line parameter - NOT hex */    asprintf(&tmp_string, "%lu", (unsigned long)jobid);    opal_argv_append(&argc, &argv, tmp_string);    free(tmp_string);    opal_argv_append(&argc, &argv, "--name");    proc_name_index = argc;    opal_argv_append(&argc, &argv, "");    opal_argv_append(&argc, &argv, "--nodename");    node_name_index2 = argc;    opal_argv_append(&argc, &argv, "");    /* pass along the universe name and location info */    opal_argv_append(&argc, &argv, "--universe");    asprintf(&tmp_string, "%s@%s:%s", orte_universe_info.uid,    orte_universe_info.host, orte_universe_info.name);    opal_argv_append(&argc, &argv, tmp_string);    free(tmp_string);    /* setup ns contact info */    opal_argv_append(&argc, &argv, "--nsreplica");    if(NULL != orte_process_info.ns_replica_uri) {        uri = strdup(orte_process_info.ns_replica_uri);    } else {        uri = orte_rml.get_uri();    }    asprintf(&param, "\"%s\"", uri);    opal_argv_append(&argc, &argv, param);    free(uri);    /* setup gpr contact info */    opal_argv_append(&argc, &argv, "--gprreplica");    if(NULL != orte_process_info.gpr_replica_uri) {        uri = strdup(orte_process_info.gpr_replica_uri);    } else {        uri = orte_rml.get_uri();    }    asprintf(&param, "\"%s\"", uri);    opal_argv_append(&argc, &argv, param);    free(uri);    /*     * Iterate through each of the 
nodes and spin     * up a daemon.     */    for(item =  opal_list_get_first(&nodes);        item != opal_list_get_end(&nodes);        item =  opal_list_get_next(item)) {        orte_ras_node_t* node = (orte_ras_node_t*)item;        orte_process_name_t* name;        pid_t pid;        /* setup node name */        argv[node_name_index2] = node->node_name;        fprintf(hfp,"%s\n",node->node_name);        /* initialize daemons process name */        rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);        if(ORTE_SUCCESS != rc) {            ORTE_ERROR_LOG(rc);            goto cleanup;        }        /* setup process name */        rc = orte_ns.get_proc_name_string(&name_string, name);        if(ORTE_SUCCESS != rc) {            opal_output(0, "orte_pls_poe: unable to create process name");            return rc;        }        argv[proc_name_index] = name_string;        for(i=0;i<argc;i++) {           fprintf(cfp,"%s ",argv[i]);        }        fprintf(cfp,"\n");        if (mca_pls_poe_component.verbose) {           opal_output(0, "%s:cmdfile %s\n", __FUNCTION__, opal_argv_join(argv, ' '));        }        vpid++;        free(name);    }    fclose(cfp);    fclose(hfp);    argv = opal_argv_copy(mca_pls_poe_component.argv);    argc = mca_pls_poe_component.argc;    opal_argv_append(&argc, &argv, "-hostfile");    opal_argv_append(&argc, &argv, mca_pls_poe_component.hostfile);    opal_argv_append(&argc, &argv, "-cmdfile");    opal_argv_append(&argc, &argv, mca_pls_poe_component.cmdfile);    opal_argv_append(&argc, &argv, "-procs");    asprintf(&tmp_string, "%d", num_nodes);    opal_argv_append(&argc, &argv, tmp_string);    free(tmp_string);    opal_argv_append(&argc, &argv, "-pgmmodel");    opal_argv_append(&argc, &argv, "mpmd");    opal_argv_append(&argc, &argv, "-resd");    opal_argv_append(&argc, &argv, "no");    opal_argv_append(&argc, &argv, "-labelio");    opal_argv_append(&argc, &argv, "yes");    opal_argv_append(&argc, &argv, "-infolevel");    
opal_argv_append(&argc, &argv, "6");    opal_argv_append(&argc, &argv, "-stdoutmode");    opal_argv_append(&argc, &argv, "ordered");    rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retry, 0, "-retry");    if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }    rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retrycount, 0, "-retrycount");    if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }    if (mca_pls_poe_component.verbose) {       opal_output(0, "%s:cmdline %s\n", __FUNCTION__, opal_argv_join(argv, ' '));    }    pid = fork();    if(pid < 0) {        rc = ORTE_ERR_OUT_OF_RESOURCE;        goto cleanup;    }    /* child */    if(pid == 0) {       execv(mca_pls_poe_component.path, argv);       opal_output(0, "orte_pls_poe: execv failed with errno=%d\n", errno);       exit(-1);    }cleanup:    while(NULL != (item = opal_list_remove_first(&nodes))) {        OBJ_RELEASE(item);    }    OBJ_DESTRUCT(&nodes);    while(NULL != (item = opal_list_remove_first(&mapping_list))) {        OBJ_RELEASE(item);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -