📄 orte_setup_hnp.c
字号:
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Establish a Head Node Process on a cluster's front end
 */
#include "orte_config.h"

#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <fcntl.h>

#include "orte/orte_constants.h"

#include "opal/event/event.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/dss/dss.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rds/rds_types.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_setup_hnp.h"

/*
 * Local (file-scope) mutex and condition variable.
 * NOTE(review): their use is not visible in this part of the file;
 * presumably they synchronize with the receive/wait callbacks declared
 * further down - confirm against the rest of the file.
 */
static opal_mutex_t orte_setup_hnp_mutex;
static opal_condition_t orte_setup_hnp_condition;

/* Local return code */
static int orte_setup_hnp_rc;

/* Local uri storage */
static char
*orte_setup_hnp_orted_uri;static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata;/* * NON-BLOCKING RECEIVER */static void orte_setup_hnp_recv(int status, orte_process_name_t* sender, orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);/* * PID WAIT CALLBACK */static void orte_setup_hnp_wait(pid_t wpid, int status, void *data);/* * ORTE_SETUP_HNP */int orte_setup_hnp(char *target_cluster, char *headnode, char *username){ char **argv, *param, *uri, *uid, *hn=NULL; char *path, *name_string, *orteprobe; int argc, rc=ORTE_SUCCESS, id, intparam; pid_t pid; bool can_launch=false, on_gpr=false; orte_cellid_t cellid=ORTE_CELLID_MAX, *cptr; orte_jobid_t jobid; orte_vpid_t vpid; orte_std_cntr_t i, j, k, cnt=0; orte_gpr_value_t **values=NULL, *value; orte_gpr_keyval_t **keyvals; char *keys[4], *tokens[3], *cellname; struct timeval tv; struct timespec ts; bool infrastructure = true, *bptr, tf_flag; /* get the nodename for the headnode of the target cluster */ if (NULL == headnode) { /* not provided, so try to look it up */ tokens[0] = target_cluster; tokens[1] = NULL; keys[0] = ORTE_RDS_FE_NAME; keys[1] = ORTE_RDS_FE_SSH; keys[2] = ORTE_CELLID_KEY; keys[3] = NULL; if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR, ORTE_RESOURCE_SEGMENT, tokens, keys, &cnt, &values))) { ORTE_ERROR_LOG(rc); return rc; } if (0 == cnt || 0 == values[0]->cnt) { /* nothing found */ goto MOVEON; } on_gpr = true; /* need to decide what to do if more than value found. Some * clusters have more than one head node, so which one do * we choose? For now, just take the first one returned. 
*/ keyvals = values[0]->keyvals; for (i=0; i < values[0]->cnt; i++) { if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_NAME)) { hn = strdup((const char*)keyvals[i]->value->data); continue; } if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_SSH)) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyvals[i]->value, ORTE_BOOL))) { ORTE_ERROR_LOG(rc); return rc; } can_launch = *bptr; continue; } if (0 == strcmp(keyvals[i]->key, ORTE_CELLID_KEY)) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyvals[i]->value, ORTE_CELLID))) { ORTE_ERROR_LOG(rc); return rc; } cellid = *cptr; continue; } } goto MOVEON; } else { /* lookup the headnode's cellid */ hn = strdup(headnode); keys[0] = ORTE_RDS_FE_NAME; keys[1] = ORTE_RDS_FE_SSH; keys[2] = ORTE_CELLID_KEY; keys[3] = NULL; rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR, ORTE_RESOURCE_SEGMENT, NULL, keys, &cnt, &values); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } /* Nothing found */ if (0 == cnt || 0 == values[0]->cnt) { goto MOVEON; } on_gpr = true; for (i=0; i < cnt; i++) { keyvals = values[i]->keyvals; for (j=0; j < values[i]->cnt; j++) { if ((0 == strcmp(keyvals[j]->key, ORTE_RDS_FE_NAME)) && 0 == strcmp((const char*)keyvals[j]->value->data, headnode)) { /* okay, this is the right cell - now need to find * the ssh flag (if provided) and cellid */ for (k=0; k < values[i]->cnt; k++) { if (0 == strcmp(keyvals[k]->key, ORTE_RDS_FE_SSH)) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyvals[i]->value, ORTE_BOOL))) { ORTE_ERROR_LOG(rc); return rc; } can_launch = *bptr; continue; } if (0 == strcmp(keyvals[k]->key, ORTE_CELLID_KEY)) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyvals[i]->value, ORTE_CELLID))) { ORTE_ERROR_LOG(rc); return rc; } cellid = *cptr; continue; } } goto MOVEON; } } } }MOVEON: if (NULL != values) { for (i=0; i < cnt; i++) OBJ_RELEASE(values[i]); free(values); } if (!on_gpr && (NULL != target_cluster || NULL != headnode)) { /* if we couldn't find anything 
about this cell on the gpr, then * we need to put the required headnode data on the registry. We need * it to be there so other functions/processes can find it, if needed. * User must provide either a target_cluster name (which then must be * synonymous with the headnode name), a headnode name (on a named or * unnamed target_cluster), or both. */ /* get new cellid for this site/resource */ if (NULL != target_cluster) { cellname = strdup(target_cluster); } else { /* if the target_cluster was NULL, then headnode CAN'T be NULL * or else we wouldn't get here */ cellname = strdup(headnode); } /* can't know the site name, so it becomes "unknown" */ rc = orte_ns.create_cellid(&cellid, "unknown", cellname); if (ORTE_SUCCESS != rc ) { ORTE_ERROR_LOG(rc); free(cellname); return rc; } /* * Store the cell info on the resource segment of the registry */ if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR, ORTE_RESOURCE_SEGMENT, 4, 0))) { ORTE_ERROR_LOG(rc); return rc; } rc = orte_schema.get_node_tokens(&(value->tokens), &(value->num_tokens), cellid, cellname); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } /* Set Cell Name */ if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_RDS_NAME, ORTE_STRING, cellname))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } /* Set Cell ID */ if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_CELLID_KEY, ORTE_CELLID, &cellid))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } /* Set Front End Name */ if (NULL == headnode) { if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_RDS_FE_NAME, ORTE_STRING, cellname))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } } else { if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_RDS_FE_NAME, ORTE_STRING, headnode))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } } /* Asssume ability to ssh to front end 
node*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -