
odls_default_module.c

MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
Language: C
                        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
                            ORTE_ERROR_LOG(rc);
                            return rc;
                        }
                        range = *vptr;
                        continue;
                    }
                    if (strcmp(kval->key, ORTE_JOB_APP_CONTEXT_KEY) == 0) {
                        /* this can occur multiple times since we allow multiple
                         * app_contexts on the orterun command line. Add them
                         * to the list
                         */
                        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&app, kval->value, ORTE_APP_CONTEXT))) {
                            ORTE_ERROR_LOG(rc);
                            return rc;
                        }
                        app_item = OBJ_NEW(odls_default_app_context_t);
                        if (NULL == app_item) {
                            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                            return ORTE_ERR_OUT_OF_RESOURCE;
                        }
                        app_item->app_context = app;
                        opal_list_append(&app_context_list, &app_item->super);
                        kval->value->data = NULL;  /* protect the data storage from later release */
                    }
                    if (strcmp(kval->key, ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY) == 0) {
                        /* this can only occur once, so just store it */
                        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
                            ORTE_ERROR_LOG(rc);
                            return rc;
                        }
                        override_oversubscribed = *bptr;
                        continue;
                    }
                } /* end for loop to process global data */
            } else {
                /* this must have come from one of the process containers, so it must
                 * contain data for a proc structure - see if it
                 * belongs to this node
                 */
                for (kv=0; kv < value->cnt; kv++) {
                    kval = value->keyvals[kv];
                    if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) {
                        /* Most C-compilers will bark if we try to directly compare the string in the
                         * kval data area against a regular string, so we need to "get" the data
                         * so we can access it */
                        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
                            ORTE_ERROR_LOG(rc);
                            return rc;
                        }
                        /* if this is our node...must also protect against a zero-length string  */
                        if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
                            /* ...harvest the info into a new child structure */
                            child = OBJ_NEW(orte_odls_child_t);
                            for (kv2 = 0; kv2 < value->cnt; kv2++) {
                                kval = value->keyvals[kv2];
                                if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) {
                                    /* copy the name into the child object */
                                    if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
                                        ORTE_ERROR_LOG(rc);
                                        return rc;
                                    }
                                    continue;
                                }
                                if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
                                    if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
                                        ORTE_ERROR_LOG(rc);
                                        return rc;
                                    }
                                    child->app_idx = *sptr;  /* save the index into the app_context objects */
                                    continue;
                                }
                                if(strcmp(kval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) {
                                    if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
                                        ORTE_ERROR_LOG(rc);
                                        return rc;
                                    }
                                    oversubscribed = *bptr;
                                    continue;
                                }
                            } /* kv2 */
                            /* protect operation on the global list of children */
                            OPAL_THREAD_LOCK(&orte_odls_default.mutex);
                            opal_list_append(&orte_odls_default.children, &child->super);
                            opal_condition_signal(&orte_odls_default.cond);
                            OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
                        }
                    }
                } /* for kv */
            }
        } /* for j */
    }

    /* setup for processor affinity. If there are enough physical processors on this node, then
     * we indicate which processor each process should be assigned to, IFF the user has requested
     * processor affinity be used - the paffinity subsystem will make that final determination. All
     * we do here is indicate that we should do the definitions just in case paffinity is active
     */
    if (ORTE_SUCCESS != opal_paffinity_base_get_num_processors(&num_processors)) {
        /* if we cannot find the number of local processors, then default to conservative
         * settings
         */
        want_processor = false;  /* default to not being a hog */
        /* leave oversubscribed alone */
        opal_output(orte_odls_globals.output,
                    "odls: could not get number of processors - using conservative settings");
    } else {
        /* only do this if we can actually get info on the number of processors */
        if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
            want_processor = false;
        } else {
            want_processor = true;
        }

        /* now let's deal with the oversubscribed flag - and the use-case where a hostfile or some
         * other non-guaranteed-accurate method was used to inform us about our allocation. Since
         * the information on the number of slots on this node could have been incorrect, we need
         * to check it against the local number of processors to ensure we don't overload them
         */
        if (override_oversubscribed) {
            opal_output(orte_odls_globals.output, "odls: overriding oversubscription");
            if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
                /* if the #procs > #processors, declare us oversubscribed regardless
                 * of what the mapper claimed - the user may have told us something
                 * incorrect
                 */
                oversubscribed = true;
            } else {
                /* likewise, if there are more processors here than we were told,
                 * declare us to not be oversubscribed so we can be aggressive. This
                 * covers the case where the user didn't tell us anything about the
                 * number of available slots, so we defaulted to a value of 1
                 */
                oversubscribed = false;
            }
        }
    }
    opal_output(orte_odls_globals.output, "odls: oversubscribed set to %s want_processor set to %s",
                oversubscribed ? "true" : "false", want_processor ? "true" : "false");

    /* okay, now let's launch our local procs using a fork/exec */
    i = 0;
    /* protect operations involving the global list of children */
    OPAL_THREAD_LOCK(&orte_odls_default.mutex);
    for (item = opal_list_get_first(&orte_odls_default.children);
         item != opal_list_get_end(&orte_odls_default.children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* is this child already alive? This can happen if
         * we are asked to launch additional processes.
         * If it has been launched, then do nothing
         */
        if (child->alive) {
            continue;
        }

        /* do we have a child from the specified job. Because the
         *  job could be given as a WILDCARD value, we must use
         *  the dss.compare function to check for equality.
         */
        if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
            continue;
        }

        opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
                    ORTE_NAME_ARGS(child->name));

        /* find the indicated app_context in the list */
        for (item2 = opal_list_get_first(&app_context_list);
             item2 != opal_list_get_end(&app_context_list);
             item2 = opal_list_get_next(item2)) {
            app_item = (odls_default_app_context_t*)item2;
            if (child->app_idx == app_item->app_context->idx) {
                app = app_item->app_context;
                goto DOFORK;
            }
        }
        /* get here if we couldn't find the app_context */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        opal_condition_signal(&orte_odls_default.cond);
        OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
        return ORTE_ERR_NOT_FOUND;

DOFORK:
        /* must unlock prior to fork to keep things clean in the
         * event library
         */
        opal_condition_signal(&orte_odls_default.cond);
        OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);

        if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start,
                                                               range, want_processor,
                                                               i, oversubscribed,
                                                               base_environ))) {
            ORTE_ERROR_LOG(rc);
            orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0);
            opal_condition_signal(&orte_odls_default.cond);
            return rc;
        }
        /* reacquire lock so we don't double unlock...
         */
        OPAL_THREAD_LOCK(&orte_odls_default.mutex);
        i++;
    }

    /* report the proc info and state in the registry */
    if (ORTE_SUCCESS != (rc = orte_odls_base_report_spawn(&orte_odls_default.children))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* setup the waitpids on the children */
    for (item = opal_list_get_first(&orte_odls_default.children);
         item != opal_list_get_end(&orte_odls_default.children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        if (ORTE_PROC_STATE_LAUNCHED == child->state) {
            orte_wait_cb(child->pid, odls_default_wait_local_proc, NULL);
            child->state = ORTE_PROC_STATE_RUNNING;
        }
    }

    /* cleanup */
    while (NULL != (item = opal_list_remove_first(&app_context_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&app_context_list);

    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
    return rc;
}

/**
 *  Pass a signal to my local procs
 */
static int send_signal(pid_t pid, int signal)
{
    int rc = ORTE_SUCCESS;

    if (kill(pid, signal) != 0) {
        switch(errno) {
            case EINVAL:
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                rc = ORTE_ERR_BAD_PARAM;
                break;
            case ESRCH:
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                break;
            case EPERM:
                ORTE_ERROR_LOG(ORTE_ERR_PERM);
                rc = ORTE_ERR_PERM;
                break;
            default:
                ORTE_ERROR_LOG(ORTE_ERROR);
                rc = ORTE_ERROR;
        }
    }

    return rc;
}

int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
{
    int rc;
    opal_list_item_t *item;
    orte_odls_child_t *child;

    /* protect operations involving the global list of children */
    OPAL_THREAD_LOCK(&orte_odls_default.mutex);

    /* if procs is NULL, then we want to signal all
     * of the local procs, so just do that case
     */
    if (NULL == proc) {
        rc = ORTE_SUCCESS;  /* pre-set this as an empty list causes us to drop to bottom */
        for (item = opal_list_get_first(&orte_odls_default.children);
             item != opal_list_get_end(&orte_odls_default.children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        opal_condition_signal(&orte_odls_default.cond);
        OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
        return rc;
    }

    /* we want it sent to some specified process, so find it */
    for (item = opal_list_get_first(&orte_odls_default.children);
         item != opal_list_get_end(&orte_odls_default.children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (ORTE_EQUAL == orte_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
            /* unlock before signaling as this may generate a callback */
            opal_condition_signal(&orte_odls_default.cond);
            OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
            if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
    }

    /* only way to get here is if we couldn't find the specified proc.
     * report that as an error and return it
     */
    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
    return ORTE_ERR_NOT_FOUND;
}

static void set_handler_default(int sig)
{
    struct sigaction act;

    act.sa_handler = SIG_DFL;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);
    sigaction(sig, &act, (struct sigaction *)0);
}
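Note: the send_signal() helper above follows a common POSIX pattern - call kill(2) and translate the resulting errno value into the caller's error-code space. Below is a minimal, self-contained sketch of that pattern outside of ORTE; the try_signal() function and the SIG_RC_* constants are hypothetical names used only for illustration and are not part of the Open MPI / ORTE API.

/* Minimal sketch of the kill()/errno translation pattern used by
 * send_signal() above. try_signal() and SIG_RC_* are illustrative
 * placeholders, not ORTE symbols. */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

enum sig_rc { SIG_RC_OK, SIG_RC_BAD_PARAM, SIG_RC_NOT_FOUND, SIG_RC_PERM, SIG_RC_ERROR };

static enum sig_rc try_signal(pid_t pid, int sig)
{
    if (0 == kill(pid, sig)) {
        return SIG_RC_OK;
    }
    switch (errno) {
        case EINVAL: return SIG_RC_BAD_PARAM;  /* invalid signal number */
        case ESRCH:  return SIG_RC_NOT_FOUND;  /* no such process */
        case EPERM:  return SIG_RC_PERM;       /* not permitted to signal it */
        default:     return SIG_RC_ERROR;      /* anything else */
    }
}

int main(void)
{
    /* signal 0 performs error checking only - no signal is actually delivered */
    enum sig_rc rc = try_signal(getpid(), 0);
    printf("probe of own pid returned %d\n", (int)rc);
    return 0;
}

As in send_signal(), the caller sees only the translated return code, so the errno-to-code mapping stays in one place.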
