📄 #psymbfact.c#
字号:
/* * -- Parallel symbolic factorization routine (version 2.1) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 * INRIA France - January 2004 * Laura Grigori * November 1, 2007 * * Last update: Feb. 3, 2008 *//* * The function symbfact_dist implements the parallel symbolic factorization * algorithm described in the paper: * * Parallel Symbolic Factorization for Sparse LU with Static Pivoting, * Laura Grigori, James W. Demmel and Xiaoye S. Li, * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3. * *//* limits.h: the largest positive integer (INT_MAX) */#include <limits.h>#include <math.h>#include "superlu_ddefs.h"#include "psymbfact.h"/* * Internal protypes */static int_t *intMalloc_symbfact(int_t );static int_t *intCalloc_symbfact(int_t );static int_tinitParmsAndStats(psymbfact_stat_t *PS);static voidestimate_memUsage(int_t, int, mem_usage_t *, float *, float *, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);static voidsymbfact_free (int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *);static int_tdenseSep_symbfact (int , int_t, int, int, int, int_t *, int_t *, int, int, int, int_t, int_t, int_t *, int_t *, int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *, Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t * );static int_tdnsUpSeps_symbfact(int_t, int, int, int, int, int_t *, int_t *, int_t, Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *);static voidintraLvl_symbfact (SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, int, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *);static voidinitLvl_symbfact(int_t, int, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, int_t *, int_t, int_t);static voidcreateComm (int, int, MPI_Comm *, MPI_Comm *);static voiddomain_symbfact(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *);static floatallocPrune_domain(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);static floatallocPrune_lvl(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);static intsymbfact_alloc(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);static float symbfact_mapVtcs(int, int, int, SuperMatrix *, int_t *, int_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *);static void symbfact_distributeMatrix (int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *);static int_tinterLvl_symbfact(SuperMatrix *, int, int, int, int, int, int, int, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, MPI_Comm *);static floatcntsVtcs (int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *);/************************************************************************/float symbfact_dist/************************************************************************/( int nprocs_num, /* Input - no of processors */ int nprocs_symb, /* Input - no of processors for the symbolic factorization */ SuperMatrix *A, /* Input - distributed input matrix */ int_t *perm_c, /* Input - column permutation */ int_t *perm_r, /* Input - row permutation */ int_t *sizes, /* Input - sizes of each node in the separator tree */ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, global to local indexing information */ MPI_Comm *num_comm, /* Input - communicator for numerical factorization */ MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ mem_usage_t *symb_mem_usage ){/* * Purpose * ======= * symbfact_dist() performs symbolic factorization of matrix A suitable * for performing the supernodal Gaussian elimination with no pivoting (GEPP). * This routine computes the structure of one column of L and one row of U * at a time. It uses: * o distributed input matrix * o supernodes * o symmetric structure pruning * * * Arguments * ========= * * nprocs_num (input) int * Number of processors SuperLU_DIST is executed on, and the input * matrix is distributed on. * * nprocs_symb (input) int * Number of processors on which the symbolic factorization is * performed. It is equal to the number of independent domains * idenfied in the graph partitioning algorithm executed * previously and has to be a power of 2. It corresponds to * number of leaves in the separator tree. * * A (input) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The * number of the linear equations is A->nrow. Matrix A is * distributed in NRformat_loc format. * Matrix A is not yet permuted by perm_c. * * perm_c (input) int_t* * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int_t* * Row permutation vector of size A->nrow, which defines the * permutation matrix Pr; perm_r[i] = j means column i of A is * in position j in Pr*A. * * sizes (input) int_t* * Contains the number of vertices in each separator. * * fstVtxSep (input) int_t* * Contains first vertex for each separator. * * Pslu_freeable (output) Pslu_freeable_t* * Returns the local L and U structure, and global to local * information on the indexing of the vertices. Contains all * the information necessary for performing the data * distribution towards the numeric factorization. * * num_comm (input) MPI_Comm* * Communicator for numerical factorization * * symb_comm (input) MPI_Comm* * Communicator for symbolic factorization * * symb_mem_usage (input) mem_usage_t * * Statistics on memory usage. * * Return value * ============ * < 0, number of bytes allocated on return from the symbolic factorization. * > 0, number of bytes allocated when out of memory. * * Sketch of the algorithm * ======================= * * Distrbute the vertices on the processors using a subtree to * subcube algorithm. * * Redistribute the structure of the input matrix A according to the * subtree to subcube computed previously for the symbolic * factorization routine. This implies in particular a distribution * from nprocs_num processors to nprocs_symb processors. * * Perform symbolic factorization guided by the separator tree provided by * a graph partitioning algorithm. The symbolic factorization uses a * combined left-looking, right-looking approach. * */ NRformat_loc *Astore; int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep; int iinfo; /* return code */ int_t m, n; int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr; int_t ind_blk, nsuper, vtx, min_mn, nnzL, nnzU, szsn; float stat_loc[23], stat_glob[23], mem_glob[15]; Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */ vtcsInfo_symbfact_t VInfo; /* local information on number of blocks, number of vertices in a block etc */ matrix_symbfact_t AS; /* temporary storage for the input matrix after redistribution */ comm_symbfact_t CS; /* information on communication */ /* relaxation parameters (for future release) and statistics collected during the symbolic factorization */ psymbfact_stat_t PS; /* temp array of size n, used as a marker by the subroutines */ int_t *tempArray; int_t i, j, k; int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc; int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd; float totalMemLU, overestimMem; MPI_Comm *commLvls; /* maximum block size */ int_t maxSzBlk; float flinfo;#if ( PRNTlevel >= 1) float stat_msgs_l[10], stat_msgs_g[10]; #endif #if ( PROFlevel>=1 ) double t, t_symbFact[3], t_symbFact_loc[3]; double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9];#endif /* Initialization */ MPI_Comm_rank ((*num_comm), &iam);#if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter psymbfact()");#endif initParmsAndStats (&PS); if (nprocs_symb != 1) { if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) { fprintf (stderr, "Malloc fails for commLvls[]."); return (PS.allocMem); } PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm); } else { commLvls = NULL; } nlvls = (int) log2( (double)nprocs_num ) + 1;#if ( PROFlevel>=1 ) time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) * sizeof(double)); time_lvls = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double)); if (!time_lvls || !time_lvlsT) { fprintf (stderr, "Malloc fails for time_lvls[]."); return (PS.allocMem); } PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double);#endif VInfo.xlsub_nextLvl = 0; VInfo.xusub_nextLvl = 0; VInfo.maxSzBlk = sp_ienv_dist(3); maxSzBlk = VInfo.maxSzBlk; mark = EMPTY; nsuper_loc = 0; nextl = 0; nextu = 0; neltsZr = 0; neltsTotal = 0; m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN( m, n ); if (!(tempArray = intMalloc_symbfact(n))) { fprintf (stderr, "Malloc fails for tempArray[].\n"); return (PS.allocMem); } PS.allocMem += n * sizeof(int_t); #if ( PROFlevel>=1 ) t = SuperLU_timer_();#endif /* Distribute vertices on processors */ if ((flinfo = symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) return (flinfo); maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; /* Redistribute matrix A on processors following the distribution found in symbfact_mapVtcs. Store the redistributed A temporarily into AS */ symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb, A, perm_c, perm_r, &AS, Pslu_freeable, &VInfo, tempArray, num_comm); /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB PROCESSORS */ if ( iam < nprocs_symb ) { #if ( PROFlevel>=1 ) t_symbFact_loc[0] = SuperLU_timer_() - t; t = SuperLU_timer_(); t_symbFact_loc[1] = t;#endif /* Allocate storage common to the symbolic factor routines */ if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS)) return (PS.allocMem); /* Copy the redistributed input matrix AS at the end of the memory buffer allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub). Free the memory used to store the input matrix */ nnz_ainf_loc = VInfo.nnz_ainf_loc; nnz_asup_loc = VInfo.nnz_asup_loc; j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc; k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc; for (i = 0; i <= VInfo.nvtcs_loc; i++) { Llu_symbfact.xusub[i] = AS.x_asup[i] + j; Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k; } for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) Llu_symbfact.usub[j] = AS.ind_asup[i]; for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++) Llu_symbfact.lsub[k] = AS.ind_ainf[i]; SUPERLU_FREE( AS.x_ainf ); SUPERLU_FREE( AS.x_asup ); SUPERLU_FREE( AS.ind_ainf ); SUPERLU_FREE( AS.ind_asup ); if (nprocs_symb != 1) { createComm (iam, nprocs_symb, commLvls, symb_comm); #if ( PROFlevel>=1 ) t_symbFact_loc[2] = SuperLU_timer_();#endif if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) return (flinfo); #if ( PROFlevel>=1 ) t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2];#endif } /* set to EMPTY marker[] array */ for (i = 0; i < n; i++) tempArray[i] = EMPTY; szSep = nprocs_symb; iSep = 0; lvl = 0; while (szSep >= 1) { /* for each level in the separator tree */ npNode = nprocs_symb / szSep; fstP = 0; /* for each node in the level */ for (jSep = iSep; jSep < iSep + szSep; jSep++) { fstVtx = fstVtxSep[jSep]; lstVtx = fstVtx + sizes[jSep]; /* if this is the first level */ if (szSep == nprocs_symb) { /* compute symbolic factorization for my domain */ if (fstP == iam) { /* allocate storage for the pruned structures */#if ( PROFlevel>=1 ) t1 = SuperLU_timer_();#endif if ((flinfo = allocPrune_domain (fstVtx, lstVtx,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -