⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ras_loadleveler_module.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 2 页
字号:
/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana *                         University Research and Technology *                         Corporation.  All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University *                         of Tennessee Research Foundation.  All rights *                         reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,  *                         University of Stuttgart.  All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. *                         All rights reserved. * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved. * $COPYRIGHT$ *  * Additional copyrights may follow *  * $HEADER$ *//* Much of the code in this file is taken from the file ll_get_machine_list.c,  * which is provided by IBM as part of their sample programs for LoadLeveler * in the samples/llmpich directory. The documentation has the following license: * COPYRIGHT LICENSE: * This information contains sample application programs in source language, which  * illustrate programming techniques on various operating platforms. You may copy, * modify, and distribute these sample programs in any form without payment to  * IBM, for the purposes of developing, using, marketing or distributing  * application programs conforming to the application programming interface for  * the operating platform for which the sample programs are written. These  * examples have not been thoroughly tested under all conditions. IBM,  * therefore, cannot guarantee or imply reliability, serviceability, or * function of these programs. */ #include "orte_config.h"#include "orte/orte_constants.h"#include "orte/orte_types.h"#include <errno.h>#include <unistd.h>#include <string.h>#include <llapi.h>#include "opal/util/argv.h"#include "opal/util/output.h"#include "orte/dss/dss.h"#include "orte/mca/gpr/gpr.h"#include "orte/mca/rmgr/rmgr.h"#include "orte/mca/errmgr/errmgr.h"#include "orte/mca/ras/base/ras_private.h"#include "ras_loadleveler.h"/* * Local functions */static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attributes);static int orte_ras_loadleveler_deallocate(orte_jobid_t jobid);static int orte_ras_loadleveler_finalize(void);static int orte_ras_loadleveler_get_hostlist(int * num_hosts, char*** hostlist);/* * Global variable */orte_ras_base_module_t orte_ras_loadleveler_module = {    orte_ras_loadleveler_allocate,    orte_ras_base_node_insert,    orte_ras_base_node_query,    orte_ras_base_node_query_alloc,    orte_ras_base_node_lookup,    orte_ras_loadleveler_deallocate,    orte_ras_loadleveler_finalize};/* * Discover available (pre-allocated) nodes.  Allocate the * requested number of nodes/process slots to the job. */static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attributes){    int i, ret;    opal_list_t nodes_list;    opal_list_item_t* item;    orte_ras_node_t* node;    char ** hostlist = NULL;    int num_hosts = 0;    OBJ_CONSTRUCT(&nodes_list, opal_list_t);    ret = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);    if(ORTE_SUCCESS != ret) {        goto cleanup;    }    for (i = 0; i < num_hosts; i++) {        /* check for duplicated nodes */        for (item = opal_list_get_first(&nodes_list);              opal_list_get_end(&nodes_list) != item;             item = opal_list_get_next(item)) {             node = (orte_ras_node_t*) item;             if (0 == strcmp(node->node_name, hostlist[i])) {                ++node->node_slots;                break;            }        }             if(opal_list_get_end(&nodes_list) == item) {            /* we did not find a duplicate, so add a new item to the list */            node = OBJ_NEW(orte_ras_node_t);            if (NULL == node) {                ret = ORTE_ERR_OUT_OF_RESOURCE;                goto cleanup;            }            node->node_name = strdup(hostlist[i]);            node->node_arch = NULL;            node->node_state = ORTE_NODE_STATE_UP;            node->node_cellid = 0;            node->node_slots_inuse = 0;            node->node_slots_max = 0;            node->node_slots = 1;            opal_list_append(&nodes_list, &node->super);        }       }           ret = orte_ras_base_node_insert(&nodes_list);    ret = orte_ras_base_allocate_nodes(jobid, &nodes_list);cleanup:                while (NULL != (item = opal_list_remove_first(&nodes_list))) {        OBJ_RELEASE(item);        }           OBJ_DESTRUCT(&nodes_list);    opal_argv_free(hostlist);                return ret;}/* * There's really nothing to do here */static int orte_ras_loadleveler_deallocate(orte_jobid_t jobid){    opal_output(orte_ras_base.ras_output,                 "ras:loadleveler:deallocate: success (nothing to do)");    return ORTE_SUCCESS;}/* * There's really nothing to do here */static int orte_ras_loadleveler_finalize(void){    opal_output(orte_ras_base.ras_output,                 "ras:loadleveler:finalize: success (nothing to do)");    return ORTE_SUCCESS;}/* * get the hostlist from LoadLeveler * *hostlist should either by NULL or a valid argv and *num_hosts * should be 0 or the number of elements in the hostlist argv */static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist){    LL_element *queryObject = NULL, *job = NULL, *step = NULL;    LL_element *node = NULL, *task = NULL, *task_instance = NULL;    int rc, obj_count, err_code, ll_master_task, job_step_count;    char *ll_step_id= NULL, *job_step_list[2], *task_machine_name = NULL;    char *schedd_host_name = NULL;    int  step_mode;     /* Get the step ID from LOADL_STEP_ID environment variable. */    if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: could not get LOADL_STEP_ID "                    "from environment!");        return ORTE_ERROR;    }    job_step_list[0] = ll_step_id;    job_step_list[1] = NULL;    /* STEP 1: Get Job object from Central Manager to find out the name of the      * Schedd daemon that handles this job. In a Multicluster environment we      * can not get the schedd name from the job step id. */    /* Initialize the LL API. Specify that query type is JOBS. */    if(NULL == (queryObject = ll_query(JOBS))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_query faild on JOBS!");        return ORTE_ERROR;    }    /* Specify that this is a QUERY_STEPID type of query. */    rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);    if(0 > rc) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_set request failed: error "                    "%d!", rc);        return ORTE_ERROR;    }    /* Get a Job object from LoadL_schedd that contains the relevant job step */    job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);    if(NULL == job) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_objs faild!");        return ORTE_ERROR;    }    if (obj_count != 1) {  /* Only 1 Job object is expected. */        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_objs: expected one job "                    "to match, got %d!", obj_count);        return ORTE_ERROR;    }    if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",                     rc);        return ORTE_ERROR;    }    if (schedd_host_name != NULL) {        job_step_list[0] = ll_step_id;        job_step_list[1] = NULL;    } else {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_get_objs() Error: Could not "                    "determine managing schedd for job %s.\n",                    job_step_list[0]);        return ORTE_ERROR;    }    ll_free_objs(queryObject);    ll_deallocate(queryObject);    /* STEP 2: Get Job object from Schedd that manages this job step.   */    /* Only schedd query gives us all the relevant task instance info.  */    /* Initialize the LL API. Specify that query type is JOBS. */    if(NULL == (queryObject = ll_query(JOBS))) {        opal_output(orte_ras_base.ras_output,                    "ras:loadleveler:allocate: ll_query faild on JOBS!");        return ORTE_ERROR;    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -