⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 orterun.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 4 页
字号:
/* -*- C -*-
 *
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"

#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <errno.h>
#include <signal.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif  /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif  /* HAVE_SYS_WAIT_H */
#ifdef HAVE_LIBGEN_H
#include <libgen.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#include "opal/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/threads/condition.h"
#include "opal/util/argv.h"
#include "opal/util/basename.h"
#include "opal/util/cmd_line.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/trace.h"
#include "opal/version.h"
#include "opal/runtime/opal.h"

#include "orte/orte_constants.h"
#include "orte/class/orte_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/schema/schema.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/runtime/orte_wait.h"

#include "orterun.h"
#include "totalview.h"

/*
 * File-local state (all "static": visible only inside this
 * translation unit)
 */
static struct opal_event term_handler;
static struct opal_event int_handler;
#ifndef __WINDOWS__
static struct opal_event sigusr1_handler;
static struct opal_event sigusr2_handler;
#endif  /* __WINDOWS__ */
static orte_jobid_t jobid = ORTE_JOBID_INVALID;
static orte_pointer_array_t *apps_pa;
static bool wait_for_job_completion = true;
static char *orterun_basename = NULL;
static int max_display_aborted = 1;
static int num_aborted = 0;
static int num_killed = 0;
static char **global_mca_env = NULL;
static bool have_zero_np = false;
static orte_std_cntr_t total_num_apps = 0;
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;

/*
 * Globals (external linkage)
 */
struct globals_t orterun_globals;
bool globals_init = false;

/*
 * Table of command line options understood by orterun.  The table is
 * terminated by an all-NULL / zero entry.
 *
 * NOTE(review): judging from the entries below, each row appears to
 * be { three MCA parameter name parts, short (single character)
 * name, single-dash name, long name, number of parameters,
 * destination variable, value type, help text } -- confirm against
 * the opal_cmd_line_init_t definition in opal/util/cmd_line.h.
 */
opal_cmd_line_init_t cmd_line_init[] = {
    /* Various "obvious" options */
    { NULL, NULL, NULL, 'h', NULL, "help", 0,
      &orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },
    { NULL, NULL, NULL, 'V', NULL, "version", 0,
      &orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
      "Print version and exit" },
    { NULL, NULL, NULL, 'v', NULL, "verbose", 0,
      &orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be verbose" },
    { NULL, NULL, NULL, 'q', NULL, "quiet", 0,
      &orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
      "Suppress helpful messages" },

    /* Use an appfile */
    { NULL, NULL, NULL, '\0', NULL, "app", 1,
      &orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING,
      "Provide an appfile; ignore all other command line options" },

    /* Number of processes; -c, -n, --n, -np, and --np are all
       synonyms */
    { NULL, NULL, NULL, 'c', "np", "np", 1,
      &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },
    { NULL, NULL, NULL, '\0', "n", "n", 1,
      &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },

    /* Set a hostfile */
    { "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Provide a hostfile" },
    { "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Provide a hostfile" },

    /* Don't wait for the process to finish before exiting */
#if 0
    { NULL, NULL, NULL, '\0', "nw", "nw", 0,
      &orterun_globals.no_wait_for_job_completion, OPAL_CMD_LINE_TYPE_BOOL,
      "Launch the processes and do not wait for their completion (i.e., let orterun complete as soon a successful launch occurs)" },
#endif

    /* Set the max number of aborted processes to show */
    { NULL, NULL, NULL, '\0', "aborted", "aborted", 1,
      &max_display_aborted, OPAL_CMD_LINE_TYPE_INT,
      "The maximum number of aborted processes to display" },

    /* Export environment variables; potentially used multiple times,
       so it does not make sense to set into a variable */
    { NULL, NULL, NULL, 'x', NULL, NULL, 1,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },

    /* Specific mapping (C, cX, N, nX) */
#if 0
    /* JJH --map is not currently implemented so don't advertise it until it is */
    { NULL, NULL, NULL, '\0', NULL, "map", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Mapping of processes to nodes / CPUs" },
#endif
    { NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
      &orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
      "Whether to allocate/map processes round-robin by node" },
    { NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
      &orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
      "Whether to allocate/map processes round-robin by slot (the default)" },
    { "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
    { "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
      NULL, OPAL_CMD_LINE_TYPE_INT,
      "Launch n processes per node on all allocated nodes" },
    { "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Nodes are not to be oversubscribed, even if the system supports such operation"},
    { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Display the process map just before launch"},

    /* mpiexec-like arguments */
    { NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
      &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Set the working directory of the started processes" },
    { NULL, NULL, NULL, '\0', "wd", "wd", 1,
      &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Synonym for --wdir" },
    { NULL, NULL, NULL, '\0', "path", "path", 1,
      &orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING,
      "PATH to be used to look for executables to start processes" },

    /* These arguments can be specified multiple times */
#if 0
    /* JMS: Removed because it's not really implemented */
    { NULL, NULL, NULL, '\0', "arch", "arch", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Architecture to start processes on" },
#endif
    { NULL, NULL, NULL, 'H', "host", "host", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "List of hosts to invoke processes on" },

    /* OSC mpiexec-like arguments */
    { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Do not run any MPI applications on the local node" },

    /* User-level debugger arguments */
    { NULL, NULL, NULL, '\0', "tv", "tv", 0,
      &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
      "Deprecated backwards compatibility flag; synonym for \"--debug\"" },
    { NULL, NULL, NULL, '\0', "debug", "debug", 0,
      &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
      "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" },
    { "orte", "base", "user_debugger", '\0', "debugger", "debugger", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Sequence of debuggers to search for when \"--debug\" is used" },

    /* OpenRTE arguments */
    { "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable debugging of OpenRTE" },

    /* NOTE(review): this option takes 0 parameters but is typed INT,
       while every other 0-parameter flag in this table (except
       "noprefix") is typed BOOL -- confirm whether that is
       intentional. */
    { "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
      NULL, OPAL_CMD_LINE_TYPE_INT,
      "Enable debugging of any OpenRTE daemons used by this application" },

    { "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable debugging of any OpenRTE daemons used by this application, storing output in files" },

    { "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Do not detach OpenRTE daemons used by this application" },

    { "universe", NULL, NULL, '\0', NULL, "universe", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Set the universe name as username@hostname:universe_name for this application" },

    { NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
      &orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
      "Set the root for the session directory tree for orterun ONLY" },

    { NULL, NULL, NULL, '\0', NULL, "do-not-launch", 0,
      &orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
      "Perform all necessary operations to prepare to launch the application, but do not actually launch it" },

    { "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "If set, reuse daemons to launch dynamically spawned processes"},

    { NULL, NULL, NULL, '\0', NULL, "prefix", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Prefix where Open MPI is installed on remote nodes" },
    /* NOTE(review): 0 parameters but typed STRING rather than BOOL --
       confirm whether that is intentional. */
    { NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Disable automatic --prefix behavior" },

    /* End of list */
    { NULL, NULL, NULL, '\0', NULL, NULL, 0,
      NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};

/*
 * Local functions
 */
static void exit_callback(int fd, short event, void *arg);
static void abort_signal_callback(int fd, short event, void *arg);
static void signal_forward_callback(int fd, short event, void *arg);
static int create_app(int argc, char* argv[], orte_app_context_t **app,
                      bool *made_app, char ***app_env);
static int init_globals(void);
static int parse_globals(int argc, char* argv[]);
static int parse_locals(int argc, char* argv[]);
static int parse_appfile(char *filename, char ***env);
static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state);
static void dump_aborted_procs(orte_jobid_t jobid);


/*
 * Entry point of orterun.  The part of the function visible here
 * parses the command line, collects the application contexts into an
 * array, initializes ORTE, installs the signal handlers and asks the
 * resource manager to spawn the job.
 */
int orterun(int argc, char *argv[])
{
    orte_app_context_t **apps;
    int rc, ret, i, num_apps, array_size;
    orte_proc_state_t cb_states;
    orte_job_state_t exit_state;
    opal_list_t attributes;
    opal_list_item_t *item;
    uint8_t flow;

    /* Need to initialize OPAL so that install_dirs are filled in */
    opal_init_util();

    /* Setup MCA params */
    mca_base_param_init();
    orte_register_params(false);

    /* find our basename (the name of the executable) so that we can
       use it in pretty-print error messages */
    orterun_basename = opal_basename(argv[0]);

    /* Check for some "global" command line params */
    /* NOTE(review): the int return values of parse_globals() and
       parse_locals() are not checked here -- confirm that both report
       fatal errors on their own. */
    parse_globals(argc, argv);

    /* If we're still here, parse each app */
    parse_locals(argc, argv);

    /* Convert the list of apps to an array of orte_app_context_t
       pointers */
    array_size = orte_pointer_array_get_size(apps_pa);
    apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t *) * array_size);
    if (NULL == apps) {
        opal_show_help("help-orterun.txt", "orterun:call-failed",
                       true, orterun_basename, "system", "malloc returned NULL", errno);
        exit(1);
    }
    /* Compact the pointer array: only non-NULL items are kept */
    num_apps = 0;
    for (i = 0; i < array_size; ++i) {
        apps[num_apps] = (orte_app_context_t *)
            orte_pointer_array_get_item(apps_pa, i);
        if (NULL != apps[num_apps]) {
            num_apps++;
        }
    }
    if (0 == num_apps) {
        /* This should never happen -- this case should be caught in
           create_app(), but let's just double check... */
        opal_show_help("help-orterun.txt", "orterun:nothing-to-do",
                       true, orterun_basename);
        exit(1);
    }

    /* Initialize our Open RTE environment */
    /* Set the flag telling orte_init that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require
     */
    if (ORTE_SUCCESS != (rc = orte_init(true))) {
        opal_show_help("help-orterun.txt", "orterun:init-failure", true,
                       "orte_init()", rc);
        return rc;
    }

    /* pre-condition any network transports that require it */
    if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) {
        ORTE_ERROR_LOG(rc);
        opal_show_help("help-orterun.txt", "orterun:precondition", false,
                       orterun_basename, NULL, NULL, rc);
        return rc;
    }

    /* Prep to start the application */
    /* construct the list of attributes */
    OBJ_CONSTRUCT(&attributes, opal_list_t);

    if (orterun_globals.do_not_launch) {
        flow = ORTE_RMGR_SETUP | ORTE_RMGR_RES_DISC | ORTE_RMGR_ALLOC | ORTE_RMGR_MAP | ORTE_RMGR_SETUP_TRIGS;
        orte_rmgr.add_attribute(&attributes, ORTE_RMGR_SPAWN_FLOW, ORTE_UINT8, &flow, ORTE_RMGR_ATTR_OVERRIDE);
    }

    /** setup callbacks for abort signals */
    opal_signal_set(&term_handler, SIGTERM,
                    abort_signal_callback, &term_handler);
    opal_signal_add(&term_handler, NULL);
    opal_signal_set(&int_handler, SIGINT,
                    abort_signal_callback, &int_handler);
    opal_signal_add(&int_handler, NULL);

#ifndef __WINDOWS__
    /** setup callbacks for signals we should forward */
    opal_signal_set(&sigusr1_handler, SIGUSR1,
                    signal_forward_callback, &sigusr1_handler);
    opal_signal_add(&sigusr1_handler, NULL);
    opal_signal_set(&sigusr2_handler, SIGUSR2,
                    signal_forward_callback, &sigusr2_handler);
    opal_signal_add(&sigusr2_handler, NULL);
#endif  /* __WINDOWS__ */

    orte_totalview_init_before_spawn();

    /* Spawn the job */
    cb_states = ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_AT_STG1;
    rc = orte_rmgr.spawn_job(apps, num_apps, &jobid, 0, NULL, job_state_callback, cb_states, &attributes);
    while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
    OBJ_DESTRUCT(&attributes);

    if (ORTE_SUCCESS != rc) {
        /* JMS show_help unless it is ERR_SILENT */
        if (ORTE_ERR_SILENT != rc) {
            opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
        }
    } else {
        if (orterun_globals.do_not_launch) {
            /* we are done! */
            goto DONE;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -