⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ras_loadleveler_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 2 页
字号:
    /* Specify that this is a QUERY_STEPID type of query. */    rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);    if(0 != rc) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_set request failed: error "                    "%d!", rc);        return ORTE_ERROR;    }    /* Get a Job object from LoadL_schedd that contains the relevant job step */    job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count,                       &err_code);    if(NULL == job) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_set request failed: error "                    "%d!", rc);        return ORTE_ERROR;    }    if (obj_count != 1) {  /* Only 1 Job object is expected. */        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_objs: expected one job "                     "to match, got %d!", obj_count);        return ORTE_ERROR;    }    if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",                    rc);        return ORTE_ERROR;    }    if (job_step_count != 1) { /* Only 1 Job Step object is expected. */        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_objs: expected one job "                    "step to match, got %d!", obj_count);        return ORTE_ERROR;    }    step = NULL;    if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",                    rc);        return ORTE_ERROR;    }    if(NULL == step) {        opal_output(orte_ras_base.ras_output,                    "ll_get_data() Error: Unable to obtain Job Step "                    "information.\n");        return ORTE_ERROR;    }    step_mode = -1;    if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_data: failure on "                    "LL_StepParallelMode. RC= %d!", rc);        return ORTE_ERROR;    }        /* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */    if ((step_mode != 0) && (step_mode != 1)) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: We support only Serial and "                    "Parallel LoadLeveler job types. PVM, NQS, and Blue Gene"                    "jobs are not supported by the LoadLeveler RAS!");        return ORTE_ERROR;    }                            if(step_mode == 0) { /* serial job */        node = NULL;        if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {            opal_output(orte_ras_base.ras_output,                        "ras:loadleveler:allocate: ll_get_data: failure on "                        "LL_StepGetFirstNode. RC= %d!", rc);            return ORTE_ERROR;        }        task = NULL;        if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {            opal_output(orte_ras_base.ras_output,                        "ras:loadleveler:allocate: ll_get_data: failure on "                        "LL_NodeGetFirstTask. RC= %d!", rc);            return ORTE_ERROR;        }        task_instance = NULL;        rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance);        if(0 != rc) {            opal_output(orte_ras_base.ras_output,                        "ras:loadleveler:allocate: ll_get_data: failure on "                        "LL_TaskGetFirstInstance. RC= %d!", rc);            return ORTE_ERROR;        }        task_machine_name = NULL;        if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName,                                   &task_machine_name))) {            opal_output(orte_ras_base.ras_output,                        "ras:loadleveler:allocate: ll_get_data: failure on "                         "LL_TaskInstanceMachineName. RC= %d!", rc);            return ORTE_ERROR;        }        opal_argv_append(num_hosts, hostlist, task_machine_name);    } else { /* parallel job */        node = NULL;        if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {            opal_output(orte_ras_base.ras_output,                        "ras:loadleveler:allocate: ll_get_data: failure on "                        "LL_StepGetFirstNode. RC= %d!", rc);            return ORTE_ERROR;        }            while(NULL != node) {     /* Loop through the "Node" objects. */            task = NULL;            if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {                opal_output(orte_ras_base.ras_output,                            "ras:loadleveler:allocate: ll_get_data: failure on "                            "LL_NodeGetFirstTask. RC= %d!", rc);                return ORTE_ERROR;            }                    while(task) {  /* Loop through the "Task" objects. */                ll_master_task = 0;                rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task);                if(0 != rc) {                    opal_output(orte_ras_base.ras_output,                                "ras:loadleveler:allocate: ll_get_data: failure"                                 "  on LL_TaskIsMaster. RC= %d!", rc);                    return ORTE_ERROR;                }                            /* The "master task" Task object is a LoadLeveler abstraction                  * and is not relevant here. Look at only Task objects that                  * are not "master".*/                if (!ll_master_task) {                    task_instance = NULL;                    if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance,                                              &task_instance))) {                        opal_output(orte_ras_base.ras_output,                                    "ras:loadleveler:allocate: ll_get_data: "                                    "failure on LL_TaskGetFirstTaskInstance. "                                    " RC= %d!", rc);                        return ORTE_ERROR;                    }                                    /* Loop through the "Task Instance" objects. */                    while (task_instance) {                        task_machine_name = NULL;                        rc = ll_get_data(task_instance,                                          LL_TaskInstanceMachineName,                                          &task_machine_name);                        if(0 != rc) {                            opal_output(orte_ras_base.ras_output,                                        "ras:loadleveler:allocate: ll_get_data:"                                        " failure on LL_TaskInstanceMachineName"                                        "RC= %d!", rc);                            return ORTE_ERROR;                        }                        opal_argv_append(num_hosts, hostlist, task_machine_name);                        task_instance = NULL;                        rc = ll_get_data(task, LL_TaskGetNextTaskInstance,                                          &task_instance);                        if(0 != rc) {                            opal_output(orte_ras_base.ras_output,                                        "ras:loadleveler:allocate: ll_get_data:"                                        " failure on LL_TaskGetNextInstance. "                                        "RC= %d!", rc);                            return ORTE_ERROR;                        }                    }                }                task = NULL;                if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {                    opal_output(orte_ras_base.ras_output,                                "ras:loadleveler:allocate: ll_get_data: "                                 "failure on LL_NodeGetNextTask. RC= %d!", rc);                    return ORTE_ERROR;                }            }            node = NULL;            if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {                opal_output(orte_ras_base.ras_output,                            "ras:loadleveler:allocate: ll_get_data: failure "                            "on LL_StepGetNextNode. RC= %d!", rc);                return ORTE_ERROR;            }        }    }    ll_free_objs(queryObject);    ll_deallocate(queryObject);    return ORTE_SUCCESS;}#if 0/* For now, we do not get the node architectures from LoadLeveler. It is slow, * and we don't even use the value. *//* * get the machine arch from LoadLeveler * Will return NULL on error or a arch string that needs to be freed * (some code from the IBM documentation, licensed as above) */static char* orte_ras_loadleveler_get_host_arch(char * hostname) {    LL_element *queryObject, *machine;     int rc, obj_count, err_code;    char * hostlist[2];    char * arch;      /* Initialize the query: Machine query */    queryObject = ll_query(MACHINES);    if(NULL == queryObject) {        return NULL;    }       /* Set query parameters: query specific machines by name */     hostlist[0] = hostname;    hostlist[1] = NULL;        rc = ll_set_request(queryObject, QUERY_HOST, hostlist, ALL_DATA);     if(0 != rc) {         return NULL;    }       /* Get the machine objects from the LoadL_negotiator (central manager) daemon */     machine = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);     if(NULL == machine || 1 != obj_count) {        return NULL;    }      /* Process the machine object */    rc = ll_get_data(machine, LL_MachineArchitecture, &arch);     if(0 != rc) {         return NULL;    }       /* Free objects obtained from Negotiator */     ll_free_objs(queryObject);      /* Free query element */     ll_deallocate(queryObject);         return arch;}#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -