⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 orted.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising …)
💻 C
📖 第 1 页 / 共 3 页
字号:
                           "OMPI_MCA_ns_nds", "env", ret);            return ret;        }        if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_name",                                  orted_globals.name, true, &environ))) {            opal_show_help("help-orted.txt", "orted:environ", false,                           "OMPI_MCA_ns_nds_name", orted_globals.name, ret);            return ret;        }        /* the following values are meaningless to the daemon, but may have         * been passed in anyway. we set them here because the nds_env component         * requires that they be set         */        if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_vpid_start",                                  orted_globals.vpid_start, true, &environ))) {            opal_show_help("help-orted.txt", "orted:environ", false,                           "OMPI_MCA_ns_nds_vpid_start", orted_globals.vpid_start, ret);            return ret;        }        if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_num_procs",                                  orted_globals.num_procs, true, &environ))) {            opal_show_help("help-orted.txt", "orted:environ", false,                           "OMPI_MCA_ns_nds_num_procs", orted_globals.num_procs, ret);            return ret;        }    }    if (orted_globals.ns_nds) {        if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds",                                               orted_globals.ns_nds, true, &environ))) {            opal_show_help("help-orted.txt", "orted:environ", false,                           "OMPI_MCA_ns_nds", "env", ret);            return ret;        }    }    /* turn on debug if debug_file is requested so output will be generated */    if (orted_globals.debug_daemons_file) {        orted_globals.debug_daemons = true;    }    /* detach from controlling terminal     * otherwise, remain attached so output can get to us     */    if(orted_globals.debug == false &&       orted_globals.debug_daemons == false 
&&       orted_globals.no_daemonize == false) {        opal_daemon_init(NULL);    }    /* Intialize the Open RTE */    /* Set the flag telling orte_init that I am NOT a     * singleton, but am "infrastructure" - prevents setting     * up incorrect infrastructure that only a singleton would     * require     */    if (ORTE_SUCCESS != (ret = orte_init(true))) {        opal_show_help("help-orted.txt", "orted:init-failure", false,                       "orte_init()", ret);        return ret;    }    /* Set signal handlers to catch kill signals so we can properly clean up     * after ourselves.      */    opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL,                   signal_callback, NULL);    opal_event_add(&term_handler, NULL);    opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL,                   signal_callback, NULL);    opal_event_add(&int_handler, NULL);    /* if requested, report my uri to the indicated pipe */    if (orted_globals.uri_pipe > 0) {        write(orted_globals.uri_pipe, orte_universe_info.seed_uri,                    strlen(orte_universe_info.seed_uri)+1); /* need to add 1 to get the NULL */        close(orted_globals.uri_pipe);    }    /* setup stdout/stderr */    if (orted_globals.debug_daemons_file) {        /* if we are debugging to a file, then send stdout/stderr to         * the orted log file         */        /* get my jobid */        if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobidstring,                                        orte_process_info.my_name))) {            ORTE_ERROR_LOG(ret);            return ret;        }        /* define a log file name in the session directory */        sprintf(log_file, "output-orted-%s-%s.log",                jobidstring, orte_system_info.nodename);        log_path = opal_os_path(false,                                orte_process_info.tmpdir_base,                                orte_process_info.top_session_dir,                                log_file,                               
 NULL);        fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);        if (fd < 0) {            /* couldn't open the file for some reason, so             * just connect everything to /dev/null             */             fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);        } else {            dup2(fd, STDOUT_FILENO);            dup2(fd, STDERR_FILENO);            if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {               close(fd);            }        }    }    /* output a message indicating we are alive, our name, and our pid     * for debugging purposes     */    if (orted_globals.debug_daemons) {        fprintf(stderr, "Daemon [%ld,%ld,%ld] checking in as pid %ld on host %s\n",                ORTE_NAME_ARGS(orte_process_info.my_name), (long)orte_process_info.pid,                orte_system_info.nodename);    }    /* setup the thread lock and condition variables */    OBJ_CONSTRUCT(&orted_globals.mutex, opal_mutex_t);    OBJ_CONSTRUCT(&orted_globals.condition, opal_condition_t);    /* register the daemon main receive functions */    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL);    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {        ORTE_ERROR_LOG(ret);        return ret;    }    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {        ORTE_ERROR_LOG(ret);        return ret;    }    /* check to see if I'm a bootproxy */    if (orted_globals.bootproxy) { /* perform bootproxy-specific things */        if (orted_globals.mpi_call_yield > 0) {            char *var;            var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");            opal_setenv(var, "1", true, &environ);        }        /* attach a subscription to the orted standard trigger so I can get         * information on the processes I am to 
locally launch as soon as all         * the orteds for this job are started.         *         * Once the registry gets to 2.0, we will be able to setup the         * subscription so we only get our own launch info back. In the interim,         * we setup the subscription so that ALL launch info for this job         * is returned. We will then have to parse that message to get our         * own local launch info.         *         * Since we have chosen this approach, we can take advantage of the         * fact that the callback function will directly receive this data.         * By setting up that callback function to actually perform the launch         * based on the received data, all we have to do here is go into our         * conditioned wait until the job completes!         *         * Sometimes, life can be good! :-)         */        /** put all this registry stuff in a compound command to limit communications */        if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) {            ORTE_ERROR_LOG(ret);            return ret;        }        /* let the local launcher setup a subscription for its required data. 
We         * pass the local_cb_launcher function so that this gets called back - this         * allows us to wakeup the orted so it can exit cleanly if the callback         * generates an error         */        if (ORTE_SUCCESS != (ret = orte_odls.subscribe_launch_data(orted_globals.bootproxy, orted_local_cb_launcher))) {            ORTE_ERROR_LOG(ret);            return ret;        }        /* get the job segment name */        if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) {            ORTE_ERROR_LOG(ret);            return ret;        }       /** increment the orted stage gate counter */        if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND,                                                         segment, 1, 1))) {            ORTE_ERROR_LOG(ret);            return ret;        }        free(segment); /* done with this now */        value->tokens[0] = strdup(ORTE_JOB_GLOBALS);        if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTED_LAUNCH_STAGE_GATE_CNTR, ORTE_UNDEF, NULL))) {            ORTE_ERROR_LOG(ret);            return ret;        }        /* do the increment */        if (ORTE_SUCCESS != (ret = orte_gpr.increment_value(value))) {            ORTE_ERROR_LOG(ret);            return ret;        }        OBJ_RELEASE(value);  /* done with this now */        /** send the compound command */        if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) {            ORTE_ERROR_LOG(ret);            return ret;        }        /* setup and enter the event monitor to wait for a wakeup call */        OPAL_THREAD_LOCK(&orted_globals.mutex);        while (false == orted_globals.exit_condition) {            opal_condition_wait(&orted_globals.condition, &orted_globals.mutex);        }        OPAL_THREAD_UNLOCK(&orted_globals.mutex);        /* make sure our local procs are dead - but don't update their state         * on the HNP as this may be 
redundant         */        orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);        /* cleanup their session directory */        orte_session_dir_cleanup(orted_globals.bootproxy);        /* send an ack - we are as close to done as we can be while         * still able to communicate         */        OBJ_CONSTRUCT(&answer, orte_buffer_t);        if (0 > orte_rml.send_buffer(ORTE_PROC_MY_HNP, &answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);        }        OBJ_DESTRUCT(&answer);        /* Finalize and clean up ourselves */        if (ORTE_SUCCESS != (ret = orte_finalize())) {            ORTE_ERROR_LOG(ret);        }        exit(ret);    }    /*     *  Set my process status to "running". Note that this must be done     *  after the rte init is completed.     */    if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,                                                     ORTE_PROC_STATE_RUNNING, 0))) {        ORTE_ERROR_LOG(ret);        return ret;    }    if (orted_globals.debug_daemons) {        opal_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));    }   /* go through the universe fields and see what else I need to do     * - could be setup a virtual machine, spawn a console, etc.     
*/    if (orted_globals.debug_daemons) {        opal_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name));    }     /* setup and enter the event monitor */    OPAL_THREAD_LOCK(&orted_globals.mutex);    while (false == orted_globals.exit_condition) {        opal_condition_wait(&orted_globals.condition, &orted_globals.mutex);    }    OPAL_THREAD_UNLOCK(&orted_globals.mutex);    if (orted_globals.debug_daemons) {       opal_output(0, "[%lu,%lu,%lu] orted: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name));    }    /* cleanup */    if (NULL != log_path) {        unlink(log_path);    }    /* finalize the system */    orte_finalize();    if (orted_globals.debug_daemons) {       opal_output(0, "[%lu,%lu,%lu] orted: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name));    }    exit(0);}/* this function receives the trigger callback from the orted launch stage gate * and passes it to the orted local launcher for processing. We do this intermediate * step so that we can get an error code if anything went wrong and, if so, wakeup the * orted so we can gracefully die */static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag){

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -