📄 ras_slurm_module.c
字号:
/* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */#include "orte_config.h"#include "orte/orte_constants.h"#include "orte/orte_types.h"#include <unistd.h>#include <string.h>#include <ctype.h>#include "opal/util/argv.h"#include "opal/util/output.h"#include "opal/util/show_help.h"#include "orte/dss/dss.h"#include "orte/mca/rmgr/rmgr.h"#include "orte/mca/errmgr/errmgr.h"#include "orte/mca/ras/base/ras_private.h"#include "ras_slurm.h"/* * Local functions */static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes);static int orte_ras_slurm_deallocate(orte_jobid_t jobid);static int orte_ras_slurm_finalize(void);static int orte_ras_slurm_discover(char *regexp, char* tasks_per_node, opal_list_t *nodelist);static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***nodelist);static int orte_ras_slurm_parse_range(char *base, char *range, char ***nodelist);/* * Global variable */orte_ras_base_module_t orte_ras_slurm_module = { orte_ras_slurm_allocate, orte_ras_base_node_insert, orte_ras_base_node_query, orte_ras_base_node_query_alloc, orte_ras_base_node_lookup, orte_ras_slurm_deallocate, orte_ras_slurm_finalize};/** * Discover available (pre-allocated) nodes. Allocate the * requested number of nodes/process slots to the job. * */static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes){ int ret; char *slurm_node_str, *regexp; char *tasks_per_node, *node_tasks; opal_list_t nodes; opal_list_item_t* item; slurm_node_str = getenv("SLURM_NODELIST"); if (NULL == slurm_node_str) { opal_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1, "SLURM_NODELIST"); return ORTE_ERR_NOT_FOUND; } regexp = strdup(slurm_node_str); tasks_per_node = getenv("SLURM_TASKS_PER_NODE"); if (NULL == tasks_per_node) { opal_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1, "SLURM_TASKS_PER_NODE"); return ORTE_ERR_NOT_FOUND; } node_tasks = strdup(tasks_per_node); if(NULL == regexp || NULL == node_tasks) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } OBJ_CONSTRUCT(&nodes, opal_list_t); ret = orte_ras_slurm_discover(regexp, node_tasks, &nodes); free(regexp); free(node_tasks); if (ORTE_SUCCESS != ret) { opal_output(orte_ras_base.ras_output, "ras:slurm:allocate: discover failed!"); return ret; } ret = orte_ras_base_allocate_nodes(jobid, &nodes); while (NULL != (item = opal_list_remove_first(&nodes))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&nodes); /* All done */ if (ORTE_SUCCESS == ret) { opal_output(orte_ras_base.ras_output, "ras:slurm:allocate: success"); } else { opal_output(orte_ras_base.ras_output, "ras:slurm:allocate: failure (base_allocate_nodes=%d)", ret); } return ret;}/* * There's really nothing to do here */static int orte_ras_slurm_deallocate(orte_jobid_t jobid){ opal_output(orte_ras_base.ras_output, "ras:slurm:deallocate: success (nothing to do)"); return ORTE_SUCCESS;}/* * There's really nothing to do here */static int orte_ras_slurm_finalize(void){ opal_output(orte_ras_base.ras_output, "ras:slurm:finalize: success (nothing to do)"); return ORTE_SUCCESS;}/** * Discover the available resources. * * In order to fully support slurm, we need to be able to handle * node regexp/task_per_node strings such as: * foo,bar 5,3 * foo 5 * foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16) * * @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST) * @param *tasks_per_node A tasks per node expression from SLURM * (i.e. SLURM_TASKS_PER_NODE) * @param *nodelist A list which has already been constucted to return * the found nodes in */static int orte_ras_slurm_discover(char *regexp, char *tasks_per_node, opal_list_t* nodelist){ int i, j, len, ret, count, reps, num_nodes; char *base, **names = NULL; char *begptr, *endptr, *orig; int *slots; bool found_range = false; bool more_to_come = false; orig = base = strdup(regexp); if (NULL == base) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } opal_output(orte_ras_base.ras_output, "ras:slurm:allocate:discover: checking nodelist: %s", regexp); do { /* Find the base */ len = strlen(base); for (i = 0; i <= len; ++i) { if (base[i] == '[') { /* we found a range. this gets dealt with below */ base[i] = '\0'; found_range = true; break; } if (base[i] == ',') { /* we found a singleton node, and there are more to come */ base[i] = '\0'; found_range = false; more_to_come = true; break; } if (base[i] == '\0') { /* we found a singleton node */ found_range = false; more_to_come = false; break; } } if(i == 0) { /* we found a special character at the beginning of the string */ opal_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp, tasks_per_node, "SLURM_NODELIST"); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); free(orig); return ORTE_ERR_BAD_PARAM; } if (found_range) { /* If we found a range, now find the end of the range */ for (j = i; j < len; ++j) { if (base[j] == ']') { base[j] = '\0'; break; } } if (j >= len) { /* we didn't find the end of the range */ opal_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp, tasks_per_node, "SLURM_NODELIST"); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); free(orig); return ORTE_ERR_BAD_PARAM; } ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names); if(ORTE_SUCCESS != ret) { opal_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp, tasks_per_node, "SLURM_NODELIST"); ORTE_ERROR_LOG(ret); free(orig); return ret; } if(base[j + 1] == ',') { more_to_come = true; base = &base[j + 2]; } else { more_to_come = false; } } else { /* If we didn't find a range, just add the node */ opal_output(orte_ras_base.ras_output, "ras:slurm:allocate:discover: found node %s", base); if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(&names, base))) { ORTE_ERROR_LOG(ret); free(orig); return ret; } /* set base equal to the (possible) next base to look at */ base = &base[i + 1]; } } while(more_to_come); free(orig);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -