📄 ad_bgl_aggrs.c
字号:
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/**
 * \file ad_bgl_aggrs.c
 * \brief The externally used function from this file is declared in ad_bgl_aggrs.h
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *   Copyright (C) 1997 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

#include "adio.h"
#include "adio_cb_config_list.h"
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"

/*
 * aggrsInPset layout (filled on rank 0, then broadcast):
 *   aggrsInPset[0]     = number of PSETs found
 *   aggrsInPset[1 + i] = number of aggregators chosen in the i-th PSET
 * aggrsInPsetSize records the communicator size the buffer was sized for;
 * the buffer itself holds aggrsInPsetSize + 1 ints (see the allocation in
 * ADIOI_BGL_gen_agg_ranklist).
 */
int aggrsInPsetSize=0;
int *aggrsInPset=NULL;

/* forward declaration */
static void
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
                                        const ADIOI_BGL_ConfInfo_t *confInfo,
                                        ADIOI_BGL_ProcInfo_t *all_procInfo,
                                        int *aggrsInPset );

/*
 * Compute the aggregator-related parameters that are required in 2-phase
 * collective IO of ADIO.
 * The parameters are
 *      . the number of aggregators (proxies) : fd->hints->cb_nodes
 *      . the ranks of the aggregators :        fd->hints->ranklist
 * By computing these two parameters in a BGL-PSET-aware way, the default
 * 2-phase collective IO of ADIO can work more efficiently.
 *
 * This routine is collective over fd->comm (it calls MPI_Gather and
 * MPI_Bcast on that communicator).
 *
 * \param fd                 file whose hints receive cb_nodes / ranklist
 * \param n_aggrs_per_pset   forwarded unchanged to ADIOI_BGL_persInfo_init
 * \return always 0
 */
int
ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
{
    int r, s;
    ADIOI_BGL_ProcInfo_t *procInfo, *all_procInfo;
    ADIOI_BGL_ConfInfo_t *confInfo;

    MPI_Comm_size( fd->comm, &s );
    MPI_Comm_rank( fd->comm, &r );

    /* Collect individual BGL personality information */
    confInfo = ADIOI_BGL_ConfInfo_new ();
    procInfo = ADIOI_BGL_ProcInfo_new ();
    ADIOI_BGL_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset );

    /* Gather BGL personality information onto process 0.
     * The receive buffer is allocated (and later freed) on every rank,
     * although MPI_Gather only fills it on the root. */
    all_procInfo = ADIOI_BGL_ProcInfo_new_n (s);

    if (s > aggrsInPsetSize)
    {
        if (aggrsInPset) ADIOI_Free(aggrsInPset);
        /* One slot for the PSET count plus up to s per-PSET counts:
         * with one process per PSET the serial routine writes index s,
         * so s ints alone would overflow.
         * NOTE(review): assumes this is the only place that allocates
         * aggrsInPset -- confirm against other users of the global. */
        aggrsInPset = (int *) ADIOI_Malloc ((s + 1) * sizeof(int));
        aggrsInPsetSize = s;
    }

    MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
                (void *)all_procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
                0,
                fd->comm );

    /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
    if (r == 0) {
        ADIOI_BGL_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);
    }
    ADIOI_BGL_ProcInfo_free (all_procInfo);

    /* Send the info of IO proxy CN to all processes and keep the info in
     * fd->hints struct.  Declared in adio_cb_config_list.h */
    ADIOI_cb_bcast_rank_map(fd);

    /* Broadcast the BGL-GPFS related file domain info.
     * The table has aggrsInPset[0] + 1 valid entries, which is not the
     * same as cb_nodes: when every PSET contributes exactly one aggregator
     * cb_nodes equals the PSET count and the last entry would be dropped.
     * So send the PSET count first, then exactly that many counts. */
    MPI_Bcast( (void *)aggrsInPset, 1, MPI_INT, 0, fd->comm );
    MPI_Bcast( (void *)(aggrsInPset + 1), aggrsInPset[0], MPI_INT, 0, fd->comm );

    ADIOI_BGL_persInfo_free( confInfo, procInfo );
    return 0;
}

/*
 * Select the aggregators of one PSET and store their ranks in tmp_ranklist.
 * The purpose of abstracting out this routine is to make it easy for trying
 * different proxy-selection criteria.
 *
 * \param confInfo       supplies aggRatio, isVNM and cpuidSize
 * \param pset_procInfo  the nCN_in_pset process records of this PSET
 * \param nCN_in_pset    number of valid records in pset_procInfo
 * \param tmp_ranklist   output; receives the selected ranks contiguously
 * \return the number of ranks actually written to tmp_ranklist
 */
static int
ADIOI_BGL_select_agg_in_pset (const ADIOI_BGL_ConfInfo_t *confInfo,
                              ADIOI_BGL_ProcInfo_t *pset_procInfo,
                              int nCN_in_pset,
                              int *tmp_ranklist)
{
    /* first implementation, based on their rank order. */

    int i, j, k;

    /* The number of aggregators in the PSET is proportional to the CNs in the PSET */
    int nAggrs = nCN_in_pset * confInfo->aggRatio;
    if (nAggrs < ADIOI_BGL_NAGG_PSET_MIN) nAggrs = ADIOI_BGL_NAGG_PSET_MIN;

    /* Never ask for more aggregators than there are records in this PSET:
     * the minimum above (or a ratio above 1) could otherwise make the loops
     * below read past pset_procInfo[nCN_in_pset - 1]. */
    if (nAggrs > nCN_in_pset) nAggrs = nCN_in_pset;

    /* for not virtual-node-mode, pick aggregators in this PSET based on the
     * order of the global rank */
    if (!confInfo->isVNM)
    {
        for (i = 0; i < nAggrs; i++)
            tmp_ranklist[i] = pset_procInfo[i].rank;
        return nAggrs;
    }

    /* for virtual-node-mode, try to pick from CPU-0 first, then CPU-1,
     * then ... CPU-n */
    j = 0;
    for (k = 0; k < confInfo->cpuidSize && j < nAggrs; k++) {
        for (i = 0; i < nCN_in_pset && j < nAggrs; i++) {
            if (pset_procInfo[i].cpuid == k)
                tmp_ranklist[j++] = pset_procInfo[i].rank;
        }
    }

    /* Report what was really stored, so the caller never counts slots that
     * were left unwritten (possible if some cpuid is outside
     * [0, cpuidSize)). */
    return j;
}

/*
 * Pick IO aggregators based on the underlying PSET organization and store
 * the ranks of the proxy CNs in tmp_ranklist.
 * The first order of tmp_ranklist is : PSET number
 * The secondary order of the list is determined in
 * ADIOI_BGL_select_agg_in_pset() and thus adjustable.
 *
 * \param confInfo      supplies nProcs and virtualPsetSize
 * \param all_procInfo  confInfo->nProcs process records
 * \param aggrsInPset   output; [0] = number of PSETs, [1 + i] = number of
 *                      aggregators selected in the i-th PSET
 * \param tmp_ranklist  output; all selected ranks, PSET after PSET
 * \return total number of aggregators selected
 */
static int
ADIOI_BGL_compute_agg_ranklist_serial_do (const ADIOI_BGL_ConfInfo_t *confInfo,
                                          ADIOI_BGL_ProcInfo_t *all_procInfo,
                                          int *aggrsInPset,
                                          int *tmp_ranklist)
{
    int i, j;

    /* a list of the numbers of all the PSETS */
    int *psetNumList = (int *) ADIOI_Malloc ( confInfo->nProcs * sizeof(int) );

    /* sweep through all processes' records, collect the numbers of all the PSETS.
     * The reason for not doing MIN, MAX is that the owned PSETs may not have
     * contiguous numbers */
    int n_psets = 0;
    for (i = 0; i < confInfo->nProcs; i++) {

        ADIOI_BGL_ProcInfo_t *info_p = all_procInfo + i;

        int exist = 0;
        for (j = n_psets - 1; j >= 0; j--)
            if (info_p->psetNum == psetNumList[j]) { exist = 1; break; }

        if (!exist) {
            psetNumList [n_psets] = info_p->psetNum;
            n_psets ++;
        }
    }

    /* bucket sort:  put the CN nodes into ordered buckets, each of which
     * represents a PSET */

    /* bucket space for bucket sort: virtualPsetSize records per PSET */
    ADIOI_BGL_ProcInfo_t *sorted_procInfo =
        ADIOI_BGL_ProcInfo_new_n ( n_psets * confInfo->virtualPsetSize );
    int *PsetIdx = (int *) ADIOI_Malloc ( n_psets * sizeof(int) );
    AD_BGL_assert ( (PsetIdx != NULL) );

    /* initialize bucket pointer: PsetIdx[i] is the next free slot of bucket i */
    for (i = 0; i < n_psets; i++) {
        PsetIdx[i] = i * confInfo->virtualPsetSize;
    }

    /* sort */
    for (i = 0; i < confInfo->nProcs; i++) {
        int pset_id = all_procInfo[i].psetNum;

        for (j = n_psets - 1; j >= 0; j--) if (pset_id == psetNumList[j]) break;
        AD_BGL_assert ( (j >= 0) );     /* got to find a PSET bucket */

        /* a bucket only has room for virtualPsetSize records; writing more
         * would spill into the next bucket or past the allocation */
        AD_BGL_assert ( (PsetIdx[j] < (j + 1) * confInfo->virtualPsetSize) );

        sorted_procInfo[ PsetIdx[j] ++ ] = all_procInfo[i];
    }

    ADIOI_Free(psetNumList);

    /* select a number of CN aggregators from each Pset */
    int naggs = 0;
    for (i = 0; i < n_psets; i++) {

        /* the number of CN in this PSET -- may not be a full PSET */
        int nCN_in_pset = PsetIdx[i] - i * confInfo->virtualPsetSize;

        /* select aggregators and put them into tmp_ranklist contiguously. */
        int local_naggs = ADIOI_BGL_select_agg_in_pset( confInfo,
                              sorted_procInfo + i * confInfo->virtualPsetSize,
                              nCN_in_pset,
                              tmp_ranklist + naggs);
        aggrsInPset[i + 1] = local_naggs;

        naggs += local_naggs;
    }
    aggrsInPset[0] = n_psets;

    /* leave */
    ADIOI_Free ( PsetIdx );
    ADIOI_BGL_ProcInfo_free ( sorted_procInfo );
    return naggs;
}

/*
 * Compute the aggregators ranklist and put it into the fd->hints struct:
 * fd->hints->cb_nodes receives the aggregator count and fd->hints->ranklist
 * a freshly allocated copy of the ranks (any previous ranklist is freed).
 *
 * \param fd            file whose hints are updated
 * \param confInfo      configuration; nProcs sizes the scratch rank list
 * \param all_procInfo  confInfo->nProcs process records
 * \param aggrsInPset   output table, filled by
 *                      ADIOI_BGL_compute_agg_ranklist_serial_do()
 */
static void
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
                                        const ADIOI_BGL_ConfInfo_t *confInfo,
                                        ADIOI_BGL_ProcInfo_t *all_procInfo,
                                        int *aggrsInPset )
{
#   define DEBUG 0
#   if DEBUG
    int i;
#   endif
    int naggs;
    int *tmp_ranklist;

    /* compute the ranklist of IO aggregators and put into tmp_ranklist */
    tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));

#   if DEBUG
    for (i=0; i<confInfo->nProcs; i++)
        printf( "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
#   endif

    naggs = ADIOI_BGL_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, tmp_ranklist);

#   define VERIFY 0
#   if VERIFY
    printf( "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
            confInfo->PsetSize        ,
            confInfo->numPsets        ,
            confInfo->isVNM           ,
            confInfo->virtualPsetSize ,
            confInfo->nProcs          ,
            confInfo->nAggrs          ,
            confInfo->aggRatio        ,
            naggs );
#   endif

#   if DEBUG
    for (i=0; i<naggs; i++)
        printf( "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
#   endif

    /* copy the ranklist of IO aggregators to fd->hints */
    if (fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);

    fd->hints->cb_nodes = naggs;
    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
    memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );

    /* the scratch list is no longer needed */
    ADIOI_Free( tmp_ranklist );
    return;
}

/*
 * Compute a dynamic access range based file domain partition among I/O aggregators,
 * which align to the GPFS block size
 * Divide the I/O workload among "nprocs_for_coll" processes. This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
* Additional effort is to make sure that each I/O aggregator get * a file domain that aligns to the GPFS block size. So, there will * not be any false sharing of GPFS file blocks among multiple I/O nodes. */void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset *end_offsets, int nprocs, int nprocs_for_coll, ADIO_Offset *min_st_offset_ptr, ADIO_Offset **fd_start_ptr, ADIO_Offset **fd_end_ptr, ADIO_Offset *fd_size_ptr, void *fs_ptr){ ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size; int i, aggr; static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains"; __blksize_t blksize = 1048576; /* default to 1M */ if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */ blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;/* FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);*/ /* find the range of all the requests */ min_st_offset = st_offsets [0]; max_end_offset = end_offsets[0]; for (i=1; i<nprocs; i++) { min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]); max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]); } // printf( "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset ); /* determine the "file domain (FD)" of each process, i.e., the portion of the file that will be "owned" by each process */ ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1; ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize; ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset; ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize; ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1; int naggs = nprocs_for_coll; fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); *fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); fd_start = *fd_start_ptr; fd_end = 
*fd_end_ptr;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -