⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 odls_default_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising …)
💻 C
📖 第 1 页 / 共 4 页
字号:
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);        return ORTE_ERR_OUT_OF_RESOURCE;    }    /* A pipe is used to communicate between the parent and child to       indicate whether the exec ultiimately succeeded or failed.  The       child sets the pipe to be close-on-exec; the child only ever       writes anything to the pipe if there is an error (e.g.,       executable not found, exec() fails, etc.).  The parent does a       blocking read on the pipe; if the pipe closed with no data,       then the exec() succeeded.  If the parent reads something from       the pipe, then the child was letting us know that it failed. */    if (pipe(p) < 0) {        ORTE_ERROR_LOG(ORTE_ERR_IN_ERRNO);        return ORTE_ERR_IN_ERRNO;    }    /* Fork off the child */    pid = fork();    if(pid < 0) {        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);        return ORTE_ERR_OUT_OF_RESOURCE;    }    if (pid == 0) {        char *param, *param2;        char *uri;        char **environ_copy;        long fd, fdmax = sysconf(_SC_OPEN_MAX);        /* Setup the pipe to be close-on-exec */        close(p[0]);        fcntl(p[1], F_SETFD, FD_CLOEXEC);        /* setup stdout/stderr so that any error messages that we may           print out will get displayed back at orterun */        orte_iof_base_setup_child(&opts);        /* Try to change to the context cwd and check that the app           exists and is executable The resource manager functions will           take care of outputting a pretty error message, if required         */        if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {           /* Tell the parent that Badness happened */            write(p[1], &i, sizeof(int));            exit(1);        }        if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {            /* Tell the parent that Badness happened */            write(p[1], &i, sizeof(int));            exit(1);        }        /* setup base environment: copy the current environ and merge  
         in the app context environ */        if (NULL != context->env) {            environ_copy = opal_environ_merge(base_environ, context->env);        } else {            environ_copy = opal_argv_copy(base_environ);        }        /* special case handling for --prefix: this is somewhat icky,           but at least some users do this.  :-\ It is possible that           when using --prefix, the user will also "-x PATH" and/or           "-x LD_LIBRARY_PATH", which would therefore clobber the           work that was done in the prior pls to ensure that we have           the prefix at the beginning of the PATH and           LD_LIBRARY_PATH.  So examine the context->env and see if we           find PATH or LD_LIBRARY_PATH.  If found, that means the           prior work was clobbered, and we need to re-prefix those           variables. */        for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {            char *newenv;            /* Reset PATH */            if (0 == strncmp("PATH=", context->env[i], 5)) {                asprintf(&newenv, "%s/bin:%s",                         context->prefix_dir, context->env[i] + 5);                opal_setenv("PATH", newenv, true, &environ_copy);                free(newenv);            }            /* Reset LD_LIBRARY_PATH */            else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {                asprintf(&newenv, "%s/lib:%s",                         context->prefix_dir, context->env[i] + 16);                opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ_copy);                free(newenv);            }        }        param = mca_base_param_environ_variable("rmgr","bootproxy","jobid");        opal_unsetenv(param, &environ_copy);        free(param);        /* setup yield schedule and processor affinity         * We default here to always setting the affinity processor if we want         * it. 
The processor affinity system then determines         * if processor affinity is enabled/requested - if so, it then uses         * this value to select the process to which the proc is "assigned".         * Otherwise, the paffinity subsystem just ignores this value anyway         */        if (oversubscribed) {            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");            opal_setenv(param, "1", false, &environ_copy);       } else {            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");            opal_setenv(param, "0", false, &environ_copy);        }        free(param);                if (want_processor) {            param = mca_base_param_environ_variable("mpi", NULL,                                                    "paffinity_processor");            asprintf(&param2, "%lu", (unsigned long) processor);            opal_setenv(param, param2, false, &environ_copy);            free(param);            free(param2);        } else {            param = mca_base_param_environ_variable("mpi", NULL,                                                    "paffinity_processor");            opal_unsetenv(param, &environ_copy);            free(param);        }                /* setup universe info */        if (NULL != orte_universe_info.name) {            param = mca_base_param_environ_variable("universe", NULL, NULL);            asprintf(&uri, "%s@%s:%s", orte_universe_info.uid,                                       orte_universe_info.host,                                       orte_universe_info.name);            opal_setenv(param, uri, true, &environ_copy);            free(param);            free(uri);        }        /* setup ns contact info */        if(NULL != orte_process_info.ns_replica_uri) {            uri = strdup(orte_process_info.ns_replica_uri);        } else {            uri = orte_rml.get_uri();        }        param = mca_base_param_environ_variable("ns","replica","uri");        opal_setenv(param, 
uri, true, &environ_copy);        free(param);        free(uri);        /* setup gpr contact info */        if(NULL != orte_process_info.gpr_replica_uri) {            uri = strdup(orte_process_info.gpr_replica_uri);        } else {            uri = orte_rml.get_uri();        }        param = mca_base_param_environ_variable("gpr","replica","uri");        opal_setenv(param, uri, true, &environ_copy);        free(param);        free(uri);                /* set the app_context number into the environment */        param = mca_base_param_environ_variable("orte","app","num");        asprintf(&param2, "%ld", (long)child->app_idx);        opal_setenv(param, param2, true, &environ_copy);        free(param);        free(param2);                /* use same nodename as the starting daemon (us) */        param = mca_base_param_environ_variable("orte", "base", "nodename");        opal_setenv(param, orte_system_info.nodename, true, &environ_copy);        free(param);        /* push name into environment */        orte_ns_nds_env_put(child->name, vpid_start, vpid_range,                            &environ_copy);        /* close all file descriptors w/ exception of stdin/stdout/stderr */        for(fd=3; fd<fdmax; fd++)            close(fd);        if (context->argv == NULL) {            context->argv = malloc(sizeof(char*)*2);            context->argv[0] = strdup(context->app);            context->argv[1] = NULL;        }        /* Set signal handlers back to the default.  Do this close to           the exev() because the event library may (and likely will)           reset them.  If we don't do this, the event library may           have left some set that, at least on some OS's, don't get           reset via fork() or exec().  Hence, the launched process           could be unkillable (for example). 
*/        set_handler_default(SIGTERM);        set_handler_default(SIGINT);        set_handler_default(SIGHUP);        set_handler_default(SIGPIPE);        set_handler_default(SIGCHLD);        /* Unblock all signals, for many of the same reasons that we           set the default handlers, above.  This is noticable on           Linux where the event library blocks SIGTERM, but we don't           want that blocked by the launched process. */        sigprocmask(0, 0, &sigs);        sigprocmask(SIG_UNBLOCK, &sigs, 0);        /* Exec the new executable */        execve(context->app, context->argv, environ_copy);        opal_show_help("help-odls-default.txt", "orte-odls-default:execv-error",                       true, context->app, strerror(errno));        exit(1);    } else {        /* connect endpoints IOF */        rc = orte_iof_base_setup_parent(child->name, &opts);        if(ORTE_SUCCESS != rc) {            ORTE_ERROR_LOG(rc);            return rc;        }        /* Wait to read something from the pipe or close */        close(p[1]);        while (1) {            rc = read(p[0], &i, sizeof(int));            if (rc < 0) {                /* Signal interrupts are ok */                if (errno == EINTR) {                    continue;                }                /* Other errno's are bad */                return ORTE_ERR_IN_ERRNO;                break;            } else if (0 == rc) {                /* Child was successful in exec'ing! */                break;            } else {                /* Doh -- child failed.                   Report the ORTE rc from child to let the calling function                   know about the failure.  The actual exit status of child proc                   cannot be found here. The calling func need to report the                   failure to launch this process through the SMR or else                   everyone else will hang.                
*/                return i;            }        }        /* set the proc state to LAUNCHED and save the pid */        child->state = ORTE_PROC_STATE_LAUNCHED;        child->pid = pid;        child->alive = true;    }        return ORTE_SUCCESS;}/** * Launch all processes allocated to the current node. */int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ){    int rc;    orte_std_cntr_t i, j, kv, kv2, *sptr;    orte_gpr_value_t *value, **values;    orte_gpr_keyval_t *kval;    orte_app_context_t *app;    orte_jobid_t job;    orte_vpid_t *vptr, start, range;    char *node_name;    opal_list_t app_context_list;    orte_odls_child_t *child;    odls_default_app_context_t *app_item;    int num_processors;    bool oversubscribed=false, want_processor, *bptr, override_oversubscribed=false;    opal_list_item_t *item, *item2;    /* parse the returned data to create the required structures     * for a fork launch. Since the data will contain information     * on procs for ALL nodes, we first have to find the value     * struct that contains info for our node.     */    /* first, retrieve the job number we are to launch from the     * returned data - we can extract the jobid directly from the     * subscription name we created     */    if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, data->target))) {        ORTE_ERROR_LOG(rc);        return rc;    }        opal_output(orte_odls_globals.output, "odls: setting up launch for job %ld", (long)job);        /* We need to create a list of the app_contexts     * so we can know what to launch - the process info only gives     * us an index into the app_context array, not the app_context     * info itself.     
*/        OBJ_CONSTRUCT(&app_context_list, opal_list_t);        /* set the default values to INVALID */    start = ORTE_VPID_INVALID;    range = ORTE_VPID_INVALID;        values = (orte_gpr_value_t**)(data->values)->addr;    for (j=0, i=0; i < data->cnt && j < (data->values)->size; j++) {  /* loop through all returned values */        if (NULL != values[j]) {            i++;            value = values[j];                        if (NULL != value->tokens) {               /* this came from the globals container, so it must contain                * the app_context(s), vpid_start, and vpid_range entries. Only one                * value object should ever come from that container                */                for (kv=0; kv < value->cnt; kv++) {                    kval = value->keyvals[kv];                    if (strcmp(kval->key, ORTE_JOB_VPID_START_KEY) == 0) {                        /* this can only occur once, so just store it */                        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {                            ORTE_ERROR_LOG(rc);                            return rc;                        }                        start = *vptr;                        continue;                    }                    if (strcmp(kval->key, ORTE_JOB_VPID_RANGE_KEY) == 0) {                        /* this can only occur once, so just store it */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -