📄 orterun.c
字号:
/* -*- C -*- * * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */#include "orte_config.h"#include <stdio.h>#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_SYS_PARAM_H#include <sys/param.h>#endif#include <errno.h>#include <signal.h>#include <ctype.h>#ifdef HAVE_SYS_TYPES_H#include <sys/types.h>#endif /* HAVE_SYS_TYPES_H */#ifdef HAVE_SYS_WAIT_H#include <sys/wait.h>#endif /* HAVE_SYS_WAIT_H */#ifdef HAVE_LIBGEN_H#include <libgen.h>#endif#ifdef HAVE_SYS_TIME_H#include <sys/time.h>#endif#include "opal/event/event.h"#include "opal/mca/installdirs/installdirs.h"#include "opal/mca/base/base.h"#include "opal/threads/condition.h"#include "opal/util/argv.h"#include "opal/util/basename.h"#include "opal/util/cmd_line.h"#include "opal/util/opal_environ.h"#include "opal/util/output.h"#include "opal/util/show_help.h"#include "opal/util/trace.h"#include "opal/version.h"#include "opal/runtime/opal.h"#include "orte/orte_constants.h"#include "orte/class/orte_pointer_array.h"#include "orte/util/proc_info.h"#include "orte/util/sys_info.h"#include "orte/util/universe_setup_file_io.h"#include "orte/util/pre_condition_transports.h"#include "orte/mca/ns/ns.h"#include "orte/mca/gpr/gpr.h"#include "orte/mca/pls/pls.h"#include "orte/mca/rmaps/rmaps_types.h"#include "orte/mca/rmgr/rmgr.h"#include "orte/mca/schema/schema.h"#include "orte/mca/smr/smr.h"#include "orte/mca/errmgr/errmgr.h"#include "orte/runtime/runtime.h"#include "orte/runtime/params.h"#include "orte/runtime/orte_wait.h"#include "orterun.h"#include "totalview.h"/* * Globals */static struct opal_event term_handler;static struct opal_event int_handler;#ifndef __WINDOWS__static struct opal_event sigusr1_handler;static struct opal_event sigusr2_handler;#endif /* __WINDOWS__ */static orte_jobid_t jobid = ORTE_JOBID_INVALID;static orte_pointer_array_t *apps_pa;static bool wait_for_job_completion = true;static char *orterun_basename = NULL;static int max_display_aborted = 1;static int num_aborted = 0;static int num_killed = 0;static char **global_mca_env = NULL;static bool have_zero_np = false;static orte_std_cntr_t total_num_apps = 0;static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;/* * Globals */struct globals_t orterun_globals;bool globals_init = false;opal_cmd_line_init_t cmd_line_init[] = { /* Various "obvious" options */ { NULL, NULL, NULL, 'h', NULL, "help", 0, &orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL, "This help message" }, { NULL, NULL, NULL, 'V', NULL, "version", 0, &orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL, "Print version and exit" }, { NULL, NULL, NULL, 'v', NULL, "verbose", 0, &orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, "Be verbose" }, { NULL, NULL, NULL, 'q', NULL, "quiet", 0, &orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL, "Suppress helpful messages" }, /* Use an appfile */ { NULL, NULL, NULL, '\0', NULL, "app", 1, &orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING, "Provide an appfile; ignore all other command line options" }, /* Number of processes; -c, -n, --n, -np, and --np are all synonyms */ { NULL, NULL, NULL, 'c', "np", "np", 1, &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT, "Number of processes to run" }, { NULL, NULL, NULL, '\0', "n", "n", 1, &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT, "Number of processes to run" }, /* Set a hostfile */ { "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Provide a hostfile" }, { "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Provide a hostfile" }, /* Don't wait for the process to finish before exiting */#if 0 { NULL, NULL, NULL, '\0', "nw", "nw", 0, &orterun_globals.no_wait_for_job_completion, OPAL_CMD_LINE_TYPE_BOOL, "Launch the processes and do not wait for their completion (i.e., let orterun complete as soon a successful launch occurs)" },#endif /* Set the max number of aborted processes to show */ { NULL, NULL, NULL, '\0', "aborted", "aborted", 1, &max_display_aborted, OPAL_CMD_LINE_TYPE_INT, "The maximum number of aborted processes to display" }, /* Export environment variables; potentially used multiple times, so it does not make sense to set into a variable */ { NULL, NULL, NULL, 'x', NULL, NULL, 1, NULL, OPAL_CMD_LINE_TYPE_NULL, "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, /* Specific mapping (C, cX, N, nX) */#if 0 /* JJH --map is not currently implemented so don't advertise it until it is */ { NULL, NULL, NULL, '\0', NULL, "map", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Mapping of processes to nodes / CPUs" },#endif { NULL, NULL, NULL, '\0', "bynode", "bynode", 0, &orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL, "Whether to allocate/map processes round-robin by node" }, { NULL, NULL, NULL, '\0', "byslot", "byslot", 0, &orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL, "Whether to allocate/map processes round-robin by slot (the default)" }, { "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" }, { "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1, NULL, OPAL_CMD_LINE_TYPE_INT, "Launch n processes per node on all allocated nodes" }, { "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Nodes are not to be oversubscribed, even if the system supports such operation"}, { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Display the process map just before launch"}, /* mpiexec-like arguments */ { NULL, NULL, NULL, '\0', "wdir", "wdir", 1, &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING, "Set the working directory of the started processes" }, { NULL, NULL, NULL, '\0', "wd", "wd", 1, &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING, "Synonym for --wdir" }, { NULL, NULL, NULL, '\0', "path", "path", 1, &orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING, "PATH to be used to look for executables to start processes" }, /* These arguments can be specified multiple times */#if 0 /* JMS: Removed because it's not really implemented */ { NULL, NULL, NULL, '\0', "arch", "arch", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Architecture to start processes on" },#endif { NULL, NULL, NULL, 'H', "host", "host", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "List of hosts to invoke processes on" }, /* OSC mpiexec-like arguments */ { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Do not run any MPI applications on the local node" }, /* User-level debugger arguments */ { NULL, NULL, NULL, '\0', "tv", "tv", 0, &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL, "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, { NULL, NULL, NULL, '\0', "debug", "debug", 0, &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL, "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, { "orte", "base", "user_debugger", '\0', "debugger", "debugger", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Sequence of debuggers to search for when \"--debug\" is used" }, /* OpenRTE arguments */ { "orte", "debug", NULL, 'd', NULL, "debug-devel", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Enable debugging of OpenRTE" }, { "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_INT, "Enable debugging of any OpenRTE daemons used by this application" }, { "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, { "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Do not detach OpenRTE daemons used by this application" }, { "universe", NULL, NULL, '\0', NULL, "universe", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Set the universe name as username@hostname:universe_name for this application" }, { NULL, NULL, NULL, '\0', NULL, "tmpdir", 1, &orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING, "Set the root for the session directory tree for orterun ONLY" }, { NULL, NULL, NULL, '\0', NULL, "do-not-launch", 0, &orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL, "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, { "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "If set, reuse daemons to launch dynamically spawned processes"}, { NULL, NULL, NULL, '\0', NULL, "prefix", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Prefix where Open MPI is installed on remote nodes" }, { NULL, NULL, NULL, '\0', NULL, "noprefix", 0, NULL, OPAL_CMD_LINE_TYPE_STRING, "Disable automatic --prefix behavior" }, /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }};/* * Local functions */static void exit_callback(int fd, short event, void *arg);static void abort_signal_callback(int fd, short event, void *arg);static void signal_forward_callback(int fd, short event, void *arg);static int create_app(int argc, char* argv[], orte_app_context_t **app, bool *made_app, char ***app_env);static int init_globals(void);static int parse_globals(int argc, char* argv[]);static int parse_locals(int argc, char* argv[]);static int parse_appfile(char *filename, char ***env);static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state);static void dump_aborted_procs(orte_jobid_t jobid);int orterun(int argc, char *argv[]){ orte_app_context_t **apps; int rc, ret, i, num_apps, array_size; orte_proc_state_t cb_states; orte_job_state_t exit_state; opal_list_t attributes; opal_list_item_t *item; uint8_t flow; /* Need to initialize OPAL so that install_dirs are filled in */ opal_init_util(); /* Setup MCA params */ mca_base_param_init(); orte_register_params(false); /* find our basename (the name of the executable) so that we can use it in pretty-print error messages */ orterun_basename = opal_basename(argv[0]); /* Check for some "global" command line params */ parse_globals(argc, argv); /* If we're still here, parse each app */ parse_locals(argc, argv); /* Convert the list of apps to an array of orte_app_context_t pointers */ array_size = orte_pointer_array_get_size(apps_pa); apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t *) * array_size); if (NULL == apps) { opal_show_help("help-orterun.txt", "orterun:call-failed", true, orterun_basename, "system", "malloc returned NULL", errno); exit(1); } num_apps = 0; for (i = 0; i < array_size; ++i) { apps[num_apps] = (orte_app_context_t *) orte_pointer_array_get_item(apps_pa, i); if (NULL != apps[num_apps]) { num_apps++; } } if (0 == num_apps) { /* This should never happen -- this case should be caught in create_app(), but let's just double check... */ opal_show_help("help-orterun.txt", "orterun:nothing-to-do", true, orterun_basename); exit(1); } /* Intialize our Open RTE environment */ /* Set the flag telling orte_init that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would * require */ if (ORTE_SUCCESS != (rc = orte_init(true))) { opal_show_help("help-orterun.txt", "orterun:init-failure", true, "orte_init()", rc); return rc; } /* pre-condition any network transports that require it */ if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) { ORTE_ERROR_LOG(rc); opal_show_help("help-orterun.txt", "orterun:precondition", false, orterun_basename, NULL, NULL, rc); return rc; } /* Prep to start the application */ /* construct the list of attributes */ OBJ_CONSTRUCT(&attributes, opal_list_t); if (orterun_globals.do_not_launch) { flow = ORTE_RMGR_SETUP | ORTE_RMGR_RES_DISC | ORTE_RMGR_ALLOC | ORTE_RMGR_MAP | ORTE_RMGR_SETUP_TRIGS; orte_rmgr.add_attribute(&attributes, ORTE_RMGR_SPAWN_FLOW, ORTE_UINT8, &flow, ORTE_RMGR_ATTR_OVERRIDE); } /** setup callbacks for abort signals */ opal_signal_set(&term_handler, SIGTERM, abort_signal_callback, &term_handler); opal_signal_add(&term_handler, NULL); opal_signal_set(&int_handler, SIGINT, abort_signal_callback, &int_handler); opal_signal_add(&int_handler, NULL);#ifndef __WINDOWS__ /** setup callbacks for signals we should foward */ opal_signal_set(&sigusr1_handler, SIGUSR1, signal_forward_callback, &sigusr1_handler); opal_signal_add(&sigusr1_handler, NULL); opal_signal_set(&sigusr2_handler, SIGUSR2, signal_forward_callback, &sigusr2_handler); opal_signal_add(&sigusr2_handler, NULL);#endif /* __WINDOWS__ */ orte_totalview_init_before_spawn(); /* Spawn the job */ cb_states = ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_AT_STG1; rc = orte_rmgr.spawn_job(apps, num_apps, &jobid, 0, NULL, job_state_callback, cb_states, &attributes); while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attributes); if (ORTE_SUCCESS != rc) { /* JMS show_help unless it is ERR_SILENT */ if (ORTE_ERR_SILENT != rc) { opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc); } } else { if (orterun_globals.do_not_launch) { /* we are done! */ goto DONE;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -