📄 odls_default_module.c
字号:
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } range = *vptr; continue; } if (strcmp(kval->key, ORTE_JOB_APP_CONTEXT_KEY) == 0) { /* this can occur multiple times since we allow multiple * app_contexts on the orterun command line. Add them * to the list */ if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&app, kval->value, ORTE_APP_CONTEXT))) { ORTE_ERROR_LOG(rc); return rc; } app_item = OBJ_NEW(odls_default_app_context_t); if (NULL == app_item) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } app_item->app_context = app; opal_list_append(&app_context_list, &app_item->super); kval->value->data = NULL; /* protect the data storage from later release */ } if (strcmp(kval->key, ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY) == 0) { /* this can only occur once, so just store it */ if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) { ORTE_ERROR_LOG(rc); return rc; } override_oversubscribed = *bptr; continue; } } /* end for loop to process global data */ } else { /* this must have come from one of the process containers, so it must * contain data for a proc structure - see if it * belongs to this node */ for (kv=0; kv < value->cnt; kv++) { kval = value->keyvals[kv]; if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) { /* Most C-compilers will bark if we try to directly compare the string in the * kval data area against a regular string, so we need to "get" the data * so we can access it */ if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) { ORTE_ERROR_LOG(rc); return rc; } /* if this is our node...must also protect against a zero-length string */ if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) { /* ...harvest the info into a new child structure */ child = OBJ_NEW(orte_odls_child_t); for (kv2 = 0; kv2 < value->cnt; kv2++) { kval = value->keyvals[kv2]; if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) { /* copy the name into the child object */ if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } continue; } if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; } child->app_idx = *sptr; /* save the index into the app_context objects */ continue; } if(strcmp(kval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) { ORTE_ERROR_LOG(rc); return rc; } oversubscribed = *bptr; continue; } } /* kv2 */ /* protect operation on the global list of children */ OPAL_THREAD_LOCK(&orte_odls_default.mutex); opal_list_append(&orte_odls_default.children, &child->super); opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); } } } /* for kv */ } } /* for j */ } /* setup for processor affinity. If there are enough physical processors on this node, then * we indicate which processor each process should be assigned to, IFF the user has requested * processor affinity be used - the paffinity subsystem will make that final determination. All * we do here is indicate that we should do the definitions just in case paffinity is active */ if (ORTE_SUCCESS != opal_paffinity_base_get_num_processors(&num_processors)) { /* if we cannot find the number of local processors, then default to conservative * settings */ want_processor = false; /* default to not being a hog */ /* leave oversubscribed alone */ opal_output(orte_odls_globals.output, "odls: could not get number of processors - using conservative settings"); } else { /* only do this if we can actually get info on the number of processors */ if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) { want_processor = false; } else { want_processor = true; } /* now let's deal with the oversubscribed flag - and the use-case where a hostfile or some * other non-guaranteed-accurate method was used to inform us about our allocation. Since * the information on the number of slots on this node could have been incorrect, we need * to check it against the local number of processors to ensure we don't overload them */ if (override_oversubscribed) { opal_output(orte_odls_globals.output, "odls: overriding oversubscription"); if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) { /* if the #procs > #processors, declare us oversubscribed regardless * of what the mapper claimed - the user may have told us something * incorrect */ oversubscribed = true; } else { /* likewise, if there are more processors here than we were told, * declare us to not be oversubscribed so we can be aggressive. This * covers the case where the user didn't tell us anything about the * number of available slots, so we defaulted to a value of 1 */ oversubscribed = false; } } } opal_output(orte_odls_globals.output, "odls: oversubscribed set to %s want_processor set to %s", oversubscribed ? "true" : "false", want_processor ? "true" : "false"); /* okay, now let's launch our local procs using a fork/exec */ i = 0; /* protect operations involving the global list of children */ OPAL_THREAD_LOCK(&orte_odls_default.mutex); for (item = opal_list_get_first(&orte_odls_default.children); item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; /* is this child already alive? This can happen if * we are asked to launch additional processes. * If it has been launched, then do nothing */ if (child->alive) { continue; } /* do we have a child from the specified job. Because the * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { continue; } opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]", ORTE_NAME_ARGS(child->name)); /* find the indicated app_context in the list */ for (item2 = opal_list_get_first(&app_context_list); item2 != opal_list_get_end(&app_context_list); item2 = opal_list_get_next(item2)) { app_item = (odls_default_app_context_t*)item2; if (child->app_idx == app_item->app_context->idx) { app = app_item->app_context; goto DOFORK; } } /* get here if we couldn't find the app_context */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); return ORTE_ERR_NOT_FOUND; DOFORK: /* must unlock prior to fork to keep things clean in the * event library */ opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start, range, want_processor, i, oversubscribed, base_environ))) { ORTE_ERROR_LOG(rc); orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0); opal_condition_signal(&orte_odls_default.cond); return rc; } /* reaquire lock so we don't double unlock... */ OPAL_THREAD_LOCK(&orte_odls_default.mutex); i++; } /* report the proc info and state in the registry */ if (ORTE_SUCCESS != (rc = orte_odls_base_report_spawn(&orte_odls_default.children))) { ORTE_ERROR_LOG(rc); return rc; } /* setup the waitpids on the children */ for (item = opal_list_get_first(&orte_odls_default.children); item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (ORTE_PROC_STATE_LAUNCHED == child->state) { orte_wait_cb(child->pid, odls_default_wait_local_proc, NULL); child->state = ORTE_PROC_STATE_RUNNING; } } /* cleanup */ while (NULL != (item = opal_list_remove_first(&app_context_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&app_context_list); opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); return rc;}/** * Pass a signal to my local procs */static int send_signal(pid_t pid, int signal){ int rc = ORTE_SUCCESS; if (kill(pid, signal) != 0) { switch(errno) { case EINVAL: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); rc = ORTE_ERR_BAD_PARAM; break; case ESRCH: ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; break; case EPERM: ORTE_ERROR_LOG(ORTE_ERR_PERM); rc = ORTE_ERR_PERM; break; default: ORTE_ERROR_LOG(ORTE_ERROR); rc = ORTE_ERROR; } } return rc;}int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal){ int rc; opal_list_item_t *item; orte_odls_child_t *child; /* protect operations involving the global list of children */ OPAL_THREAD_LOCK(&orte_odls_default.mutex); /* if procs is NULL, then we want to signal all * of the local procs, so just do that case */ if (NULL == proc) { rc = ORTE_SUCCESS; /* pre-set this as an empty list causes us to drop to bottom */ for (item = opal_list_get_first(&orte_odls_default.children); item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } } opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); return rc; } /* we want it sent to some specified process, so find it */ for (item = opal_list_get_first(&orte_odls_default.children); item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (ORTE_EQUAL == orte_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) { /* unlock before signaling as this may generate a callback */ opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } return rc; } } /* only way to get here is if we couldn't find the specified proc. * report that as an error and return it */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); opal_condition_signal(&orte_odls_default.cond); OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); return ORTE_ERR_NOT_FOUND;}static void set_handler_default(int sig){ struct sigaction act; act.sa_handler = SIG_DFL; act.sa_flags = 0; sigemptyset(&act.sa_mask); sigaction(sig, &act, (struct sigaction *)0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -