📄 orted.c
"OMPI_MCA_ns_nds", "env", ret); return ret; } if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_name", orted_globals.name, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_name", orted_globals.name, ret); return ret; } /* the following values are meaningless to the daemon, but may have * been passed in anyway. we set them here because the nds_env component * requires that they be set */ if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_vpid_start", orted_globals.vpid_start, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_vpid_start", orted_globals.vpid_start, ret); return ret; } if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_num_procs", orted_globals.num_procs, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_num_procs", orted_globals.num_procs, ret); return ret; } } if (orted_globals.ns_nds) { if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds", orted_globals.ns_nds, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds", "env", ret); return ret; } } /* turn on debug if debug_file is requested so output will be generated */ if (orted_globals.debug_daemons_file) { orted_globals.debug_daemons = true; } /* detach from controlling terminal * otherwise, remain attached so output can get to us */ if(orted_globals.debug == false && orted_globals.debug_daemons == false && orted_globals.no_daemonize == false) { opal_daemon_init(NULL); } /* Intialize the Open RTE */ /* Set the flag telling orte_init that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would * require */ if (ORTE_SUCCESS != (ret = orte_init(true))) { opal_show_help("help-orted.txt", "orted:init-failure", false, "orte_init()", ret); return ret; } /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. 
*/ opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, signal_callback, NULL); opal_event_add(&term_handler, NULL); opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL, signal_callback, NULL); opal_event_add(&int_handler, NULL); /* if requested, report my uri to the indicated pipe */ if (orted_globals.uri_pipe > 0) { write(orted_globals.uri_pipe, orte_universe_info.seed_uri, strlen(orte_universe_info.seed_uri)+1); /* need to add 1 to get the NULL */ close(orted_globals.uri_pipe); } /* setup stdout/stderr */ if (orted_globals.debug_daemons_file) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobidstring, orte_process_info.my_name))) { ORTE_ERROR_LOG(ret); return ret; } /* define a log file name in the session directory */ sprintf(log_file, "output-orted-%s-%s.log", jobidstring, orte_system_info.nodename); log_path = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, log_file, NULL); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so * just connect everything to /dev/null */ fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); } else { dup2(fd, STDOUT_FILENO); dup2(fd, STDERR_FILENO); if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { close(fd); } } } /* output a message indicating we are alive, our name, and our pid * for debugging purposes */ if (orted_globals.debug_daemons) { fprintf(stderr, "Daemon [%ld,%ld,%ld] checking in as pid %ld on host %s\n", ORTE_NAME_ARGS(orte_process_info.my_name), (long)orte_process_info.pid, orte_system_info.nodename); } /* setup the thread lock and condition variables */ OBJ_CONSTRUCT(&orted_globals.mutex, opal_mutex_t); OBJ_CONSTRUCT(&orted_globals.condition, opal_condition_t); /* register the daemon main receive functions */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; } ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; } /* check to see if I'm a bootproxy */ if (orted_globals.bootproxy) { /* perform bootproxy-specific things */ if (orted_globals.mpi_call_yield > 0) { char *var; var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle"); opal_setenv(var, "1", true, &environ); } /* attach a subscription to the orted standard trigger so I can get * information on the processes I am to locally launch as soon as all * the orteds for this job are started. * * Once the registry gets to 2.0, we will be able to setup the * subscription so we only get our own launch info back. In the interim, * we setup the subscription so that ALL launch info for this job * is returned. We will then have to parse that message to get our * own local launch info. * * Since we have chosen this approach, we can take advantage of the * fact that the callback function will directly receive this data. * By setting up that callback function to actually perform the launch * based on the received data, all we have to do here is go into our * conditioned wait until the job completes! * * Sometimes, life can be good! 
:-) */ /** put all this registry stuff in a compound command to limit communications */ if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) { ORTE_ERROR_LOG(ret); return ret; } /* let the local launcher setup a subscription for its required data. We * pass the local_cb_launcher function so that this gets called back - this * allows us to wakeup the orted so it can exit cleanly if the callback * generates an error */ if (ORTE_SUCCESS != (ret = orte_odls.subscribe_launch_data(orted_globals.bootproxy, orted_local_cb_launcher))) { ORTE_ERROR_LOG(ret); return ret; } /* get the job segment name */ if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) { ORTE_ERROR_LOG(ret); return ret; } /** increment the orted stage gate counter */ if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND, segment, 1, 1))) { ORTE_ERROR_LOG(ret); return ret; } free(segment); /* done with this now */ value->tokens[0] = strdup(ORTE_JOB_GLOBALS); if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTED_LAUNCH_STAGE_GATE_CNTR, ORTE_UNDEF, NULL))) { ORTE_ERROR_LOG(ret); return ret; } /* do the increment */ if (ORTE_SUCCESS != (ret = orte_gpr.increment_value(value))) { ORTE_ERROR_LOG(ret); return ret; } OBJ_RELEASE(value); /* done with this now */ /** send the compound command */ if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) { ORTE_ERROR_LOG(ret); return ret; } /* setup and enter the event monitor to wait for a wakeup call */ OPAL_THREAD_LOCK(&orted_globals.mutex); while (false == orted_globals.exit_condition) { opal_condition_wait(&orted_globals.condition, &orted_globals.mutex); } OPAL_THREAD_UNLOCK(&orted_globals.mutex); /* make sure our local procs are dead - but don't update their state * on the HNP as this may be redundant */ orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); /* cleanup their session directory */ orte_session_dir_cleanup(orted_globals.bootproxy); /* send an ack - we are as close to done as we can be while * still able to communicate */ OBJ_CONSTRUCT(&answer, orte_buffer_t); if (0 > orte_rml.send_buffer(ORTE_PROC_MY_HNP, &answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); } OBJ_DESTRUCT(&answer); /* Finalize and clean up ourselves */ if (ORTE_SUCCESS != (ret = orte_finalize())) { ORTE_ERROR_LOG(ret); } exit(ret); } /* * Set my process status to "running". Note that this must be done * after the rte init is completed. */ if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name, ORTE_PROC_STATE_RUNNING, 0))) { ORTE_ERROR_LOG(ret); return ret; } if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* go through the universe fields and see what else I need to do * - could be setup a virtual machine, spawn a console, etc. 
*/ if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* setup and enter the event monitor */ OPAL_THREAD_LOCK(&orted_globals.mutex); while (false == orted_globals.exit_condition) { opal_condition_wait(&orted_globals.condition, &orted_globals.mutex); } OPAL_THREAD_UNLOCK(&orted_globals.mutex); if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] orted: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* cleanup */ if (NULL != log_path) { unlink(log_path); } /* finalize the system */ orte_finalize(); if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] orted: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name)); } exit(0);}/* this function receives the trigger callback from the orted launch stage gate * and passes it to the orted local launcher for processing. We do this intermediate * step so that we can get an error code if anything went wrong and, if so, wakeup the * orted so we can gracefully die */static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag){
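Both "event monitor" loops above follow the same shape: take the lock, wait on a condition variable until exit_condition becomes true, then release the lock and shut down, with some callback (a signal handler or the launch callback) flipping the flag and signalling the condition. Below is a minimal standalone sketch of that pattern using plain POSIX threads rather than the OPAL wrappers; the names (wakeup_thread, exit_condition) are illustrative only and not part of the orted source.

/* Sketch of the conditioned-wait/wakeup pattern, assuming plain pthreads.
 * Compile with: cc -pthread wait_sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool exit_condition = false;   /* plays the role of orted_globals.exit_condition */

/* stands in for signal_callback() or the launch callback waking the daemon */
static void *wakeup_thread(void *arg)
{
    sleep(1);                          /* pretend a signal or trigger arrived */
    pthread_mutex_lock(&lock);
    exit_condition = true;             /* set the flag under the lock ... */
    pthread_cond_signal(&cond);        /* ... then wake the waiter */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, wakeup_thread, NULL);

    /* mirrors the OPAL_THREAD_LOCK / opal_condition_wait loop: always
     * re-check the predicate after waking, since waits can be spurious */
    pthread_mutex_lock(&lock);
    while (!exit_condition) {
        pthread_cond_wait(&cond, &lock);
    }
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    printf("woken up - shutting down\n");
    return 0;
}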
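The debug-to-file branch earlier in main() redirects the daemon's stdout/stderr to a log file with open() and dup2(). The following is a minimal standalone sketch of that POSIX idiom; redirect_output and the log path are made up for illustration, and unlike the listing above this version also falls back to dup2()-ing /dev/null onto the standard streams when the log file cannot be opened.

/* Sketch of stdout/stderr redirection via open()/dup2(), under the
 * assumption that a plain file path is acceptable as the log target.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int redirect_output(const char *log_path)   /* hypothetical helper */
{
    int fd = open(log_path, O_RDWR | O_CREAT | O_TRUNC, 0640);
    if (fd < 0) {
        /* couldn't open the log - route output to /dev/null instead */
        fd = open("/dev/null", O_RDWR);
        if (fd < 0) {
            return -1;
        }
    }
    dup2(fd, STDOUT_FILENO);
    dup2(fd, STDERR_FILENO);
    if (fd != STDOUT_FILENO && fd != STDERR_FILENO) {
        close(fd);                                  /* the dup'd descriptors remain open */
    }
    return 0;
}

int main(void)
{
    if (0 != redirect_output("/tmp/output-orted-example.log")) {
        return 1;
    }
    printf("this line goes to the log file, not the terminal\n");
    return 0;
}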