⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pls_slurm_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 2 页
字号:
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 *
 * These symbols are in a file by themselves to provide nice linker
 * semantics.  Since linkers generally pull in symbols by object
 * files, keeping these symbols as the only symbols in this file
 * prevents utility programs such as "ompi_info" from having to import
 * entire components just to query their version and parameters.
 */

/* ORTE configuration and core type/constant definitions. */
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"

/*
 * System headers.  Most are guarded by HAVE_* feature macros.
 * Note that <sys/types.h> is included twice: once unconditionally here
 * and once more under HAVE_SYS_TYPES_H below.
 */
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif

/* OPAL utility headers (argv handling, output, environment, paths, ...). */
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h"

/* ORTE runtime and framework headers used by this module. */
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"

/* This component's own header. */
#include "pls_slurm.h"

/*
 * Local functions
 *
 * Forward declarations of the file-local (static) functions.  All of
 * them except pls_slurm_start_proc are referenced by the
 * orte_pls_slurm_module table below; pls_slurm_start_proc is declared
 * here but is not an entry in that table.
 */
static int pls_slurm_launch_job(orte_jobid_t jobid);
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
static int pls_slurm_finalize(void);
static int pls_slurm_cancel_operation(void);
static int pls_slurm_start_proc(int argc, char **argv, char **env,
                                char *prefix);

/*
 * Global variable
 *
 * Function table exported by this module.  It is initialized
 * positionally, so the order of the entries must match the field order
 * of orte_pls_base_module_1_3_0_t (declared outside this file --
 * NOTE(review): confirm against orte/mca/pls/pls.h when reordering).
 * Note that pls_slurm_cancel_operation precedes pls_slurm_finalize
 * here, which is the reverse of the order of their prototypes above.
 */
orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
    pls_slurm_launch_job,
    pls_slurm_terminate_job,
    pls_slurm_terminate_orteds,
    pls_slurm_terminate_proc,
    pls_slurm_signal_job,
    pls_slurm_signal_proc,
    pls_slurm_cancel_operation,
pls_slurm_finalize};/* * Local variable */static pid_t srun_pid = 0;static int pls_slurm_launch_job(orte_jobid_t jobid){    orte_job_map_t *map;    opal_list_item_t *item;    size_t num_nodes;    orte_vpid_t vpid;    orte_vpid_t start_vpid;    char *jobid_string = NULL;    char *uri, *param;    char **argv;    int argc;    int rc;    char *tmp;    char** env = NULL;    char* var;    char *nodelist_flat;    char **nodelist_argv;    int nodelist_argc;    orte_process_name_t* name;    char *name_string;    char **custom_strings;    int num_args, i;    char *cur_prefix;    opal_list_t daemons;    orte_pls_daemon_info_t *dmn;    struct timeval joblaunchstart, launchstart, launchstop;    if (mca_pls_slurm_component.timing) {        if (0 != gettimeofday(&joblaunchstart, NULL)) {            opal_output(0, "pls_slurm: could not obtain job start time");        }            }        /* setup a list that will contain the info for all the daemons     * so we can store it on the registry when done     */    OBJ_CONSTRUCT(&daemons, opal_list_t);        /* Query the map for this job.     * We need the entire mapping for a couple of reasons:     *  - need the prefix to start with.     *  - need to know if we are launching on a subset of the allocated nodes     * All other mapping responsibilities fall to orted in the fork PLS     */    rc = orte_rmaps.get_job_map(&map, jobid);    if (ORTE_SUCCESS != rc) {        ORTE_ERROR_LOG(rc);        OBJ_DESTRUCT(&daemons);        return rc;    }    /* if the user requested that we re-use daemons,     * launch the procs on any existing, re-usable daemons     */    if (orte_pls_base.reuse_daemons) {        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {            ORTE_ERROR_LOG(rc);            OBJ_RELEASE(map);            OBJ_DESTRUCT(&daemons);            return rc;        }    }        /*     * Allocate a range of vpids for the daemons.     
*/    num_nodes = opal_list_get_size(&map->nodes);    if (num_nodes == 0) {        /* nothing further to do - job must have been launched         * on existing daemons, so we can just return         */        OBJ_RELEASE(map);        OBJ_DESTRUCT(&daemons);        return ORTE_SUCCESS;    }    rc = orte_ns.reserve_range(0, num_nodes, &vpid);    if (ORTE_SUCCESS != rc) {        goto cleanup;    }    start_vpid = vpid;     /* setup the orted triggers for passing their launch info */    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {        ORTE_ERROR_LOG(rc);        goto cleanup;    }        /* need integer value for command line parameter */    asprintf(&jobid_string, "%lu", (unsigned long) jobid);    /*     * start building argv array     */    argv = NULL;    argc = 0;    /*     * SLURM srun OPTIONS     */    /* add the srun command */    opal_argv_append(&argc, &argv, "srun");    /* Append user defined arguments to srun */    if ( NULL != mca_pls_slurm_component.custom_args ) {        custom_strings = opal_argv_split(mca_pls_slurm_component.custom_args, ' ');        num_args       = opal_argv_count(custom_strings);        for (i = 0; i < num_args; ++i) {            opal_argv_append(&argc, &argv, custom_strings[i]);        }        opal_argv_free(custom_strings);    }    asprintf(&tmp, "--nodes=%lu", (unsigned long) num_nodes);    opal_argv_append(&argc, &argv, tmp);    free(tmp);    asprintf(&tmp, "--ntasks=%lu", (unsigned long) num_nodes);    opal_argv_append(&argc, &argv, tmp);    free(tmp);    /* create nodelist */    nodelist_argv = NULL;    nodelist_argc = 0;    for (item =  opal_list_get_first(&map->nodes);         item != opal_list_get_end(&map->nodes);         item =  opal_list_get_next(item)) {        orte_mapped_node_t* node = (orte_mapped_node_t*)item;        opal_argv_append(&nodelist_argc, &nodelist_argv, node->nodename);    }    nodelist_flat = opal_argv_join(nodelist_argv, ',');    
opal_argv_free(nodelist_argv);    asprintf(&tmp, "--nodelist=%s", nodelist_flat);    opal_argv_append(&argc, &argv, tmp);    free(tmp);    /*     * ORTED OPTIONS     */    /* add the daemon command (as specified by user) */    opal_argv_append(&argc, &argv, mca_pls_slurm_component.orted);    opal_argv_append(&argc, &argv, "--no-daemonize");    /* check for debug flags */    orte_pls_base_mca_argv(&argc, &argv);    /* proxy information */    opal_argv_append(&argc, &argv, "--bootproxy");    opal_argv_append(&argc, &argv, jobid_string);    /* force orted to use the slurm sds */    opal_argv_append(&argc, &argv, "--ns-nds");    opal_argv_append(&argc, &argv, "slurm");    /* set orte process name to be the base of the name list for the daemons */    rc = orte_ns.create_process_name(&name,                                     orte_process_info.my_name->cellid,                                     0, vpid);    if (ORTE_SUCCESS != rc) {        ORTE_ERROR_LOG(rc);        goto cleanup;    }    rc = orte_ns.get_proc_name_string(&name_string, name);    if (ORTE_SUCCESS != rc) {        opal_output(0, "orte_pls_rsh: unable to create process name");        goto cleanup;    }    free(name);    opal_argv_append(&argc, &argv, "--name");    opal_argv_append(&argc, &argv, name_string);    free(name_string);    /* tell the daemon how many procs are in the daemon's job */    opal_argv_append(&argc, &argv, "--num_procs");    asprintf(&param, "%lu", (unsigned long) num_nodes);    opal_argv_append(&argc, &argv, param);    free(param);    /* tell the daemon the starting vpid of the daemon's job */    opal_argv_append(&argc, &argv, "--vpid_start");    asprintf(&param, "%lu", (unsigned long) 0);    opal_argv_append(&argc, &argv, param);    free(param);    /* pass along the universe name and location info */    opal_argv_append(&argc, &argv, "--universe");    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,             orte_universe_info.host, orte_universe_info.name);    
opal_argv_append(&argc, &argv, param);    free(param);    /* setup ns contact info */    opal_argv_append(&argc, &argv, "--nsreplica");    if (NULL != orte_process_info.ns_replica_uri) {        uri = strdup(orte_process_info.ns_replica_uri);    } else {        uri = orte_rml.get_uri();    }    asprintf(&param, "\"%s\"", uri);    opal_argv_append(&argc, &argv, param);    free(uri);    free(param);    /* setup gpr contact info */    opal_argv_append(&argc, &argv, "--gprreplica");    if (NULL != orte_process_info.gpr_replica_uri) {        uri = strdup(orte_process_info.gpr_replica_uri);    } else {        uri = orte_rml.get_uri();    }    asprintf(&param, "\"%s\"", uri);    opal_argv_append(&argc, &argv, param);    free(uri);    free(param);    if (mca_pls_slurm_component.debug) {        param = opal_argv_join(argv, ' ');        if (NULL != param) {            opal_output(0, "pls:slurm: final top-level argv:");            opal_output(0, "pls:slurm:     %s", param);            free(param);        }    }    /* Copy the prefix-directory specified in the

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -