📄 pzsymbfact_distdata.c
字号:
/* * -- Parallel symbolic factorization auxialiary routine (version 2.2) -- * -- Distributes the data from parallel symbolic factorization * -- to numeric factorization * INRIA France - July 1, 2004 * Laura Grigori * * November 1, 2007 * Feburary 20, 2008 *//* limits.h: the largest positive integer (INT_MAX) */#include <limits.h>#include "superlu_zdefs.h"#include "psymbfact.h"static floatdist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, Glu_persist_t *Glu_persist, int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, gridinfo_t *grid )/* * Purpose * ======= * * Redistribute the symbolic structure of L and U from the distribution * used in the parallel symbolic factorization step to the distdibution * used in the parallel numeric factorization step. On exit, the L and U * structure for the 2D distribution used in the numeric factorization step is * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal * information is also computed and it is stored in Glu_persist->supno * and Glu_persist->xsup. * * This routine allocates memory for storing the structure of L and U * and the supernodes information. This represents the arrays: * p_xlsub, p_lsub, p_xusub, p_usub, * Glu_persist->supno, Glu_persist->xsup. * * This routine also deallocates memory allocated during symbolic * factorization routine. That is, the folloing arrays are freed: * Pslu_freeable->xlsub, Pslu_freeable->lsub, * Pslu_freeable->xusub, Pslu_freeable->usub, * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. * * Arguments * ========= * * n (Input) int_t * Order of the input matrix * Pslu_freeable (Input) Pslu_freeable_t * * Local L and U structure, * global to local indexing information. * * Glu_persist (Output) Glu_persist_t * * Stores on output the information on supernodes mapping. * * p_xlsub (Output) int_t ** * Pointer to structure of L distributed on a 2D grid * of processors, stored by columns. * * p_lsub (Output) int_t ** * Structure of L distributed on a 2D grid of processors, * stored by columns. * * p_xusub (Output) int_t ** * Pointer to structure of U distributed on a 2D grid * of processors, stored by rows. * * p_usub (Output) int_t ** * Structure of U distributed on a 2D grid of processors, * stored by rows. * * grid (Input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * < 0, number of bytes allocated on return from the dist_symbLU. * > 0, number of bytes allocated in this routine when out of memory. * (an approximation). */{ int iam, nprocs, pc, pr, p, np, p_diag; int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, *tmp_ptrToSend, *mem; int_t *nnzToRecv_l, *nnzToRecv_u; int_t *send_1, *send_2, nsend_1, nsend_2; int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; int_t nsupers, nsupers_i, nsupers_j; int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; int_t maxszsn, maxNvtcsPProc; int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; int_t *xsub_s, *sub_s, *xsub_n, *sub_n; int_t *globToLoc, nvtcs_loc; int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, RecvCnt_l, RecvCnt_u, ind_loc; int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; int_t nelts, isize; float memAux; /* Memory used during this routine and freed on return */ float memRet; /* Memory allocated and not freed on return */ int_t iword, dword; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam;#if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dist_symbLU()");#endif nprocs = (int) grid->nprow * grid->npcol; xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; globToLoc = Pslu_freeable->globToLoc; nvtcs_loc = Pslu_freeable->nvtcs_loc; xsup_beg_s = Pslu_freeable->xsup_beg_loc; xsup_end_s = Pslu_freeable->xsup_end_loc; supno_s = Pslu_freeable->supno_loc; rcv_luind = NULL; iword = sizeof(int_t); dword = sizeof(doublecomplex); memAux = 0.; memRet = 0.; mem = intCalloc_dist(12 * nprocs); if (!mem) return (ERROR_RET); memAux = (float) (12 * nprocs * sizeof(int_t)); nnzToRecv = mem; nnzToSend = nnzToRecv + 2*nprocs; nnzToSend_l = nnzToSend + 2 * nprocs; nnzToSend_u = nnzToSend_l + nprocs; send_1 = nnzToSend_u + nprocs; send_2 = send_1 + nprocs; tmp_ptrToSend = send_2 + nprocs; nnzToRecv_l = tmp_ptrToSend + nprocs; nnzToRecv_u = nnzToRecv_l + nprocs; ptrToSend = nnzToSend; ptrToRecv = nnzToSend + nprocs; nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); intBuf1 = nvtcs + nprocs; intBuf2 = nvtcs + 2 * nprocs; intBuf3 = nvtcs + 3 * nprocs; intBuf4 = nvtcs + 4 * nprocs; memAux += 5 * nprocs * sizeof(int); maxszsn = sp_ienv_dist(3); /* Allocate space for storing Glu_persist_n. */ if ( !(supno_n = intMalloc_dist(n+1)) ) { fprintf (stderr, "Malloc fails for supno_n[]."); return (memAux); } memRet += (float) ((n+1) * sizeof(int_t)); /* ------------------------------------------------------------ DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION ------------------------------------------------------------*/ if (nvtcs_loc > INT_MAX) ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); intNvtcs_loc = (int) nvtcs_loc; MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, 0, grid->comm); if (!iam) { /* set ptrToRecv to point to the beginning of the data for each processor */ for (k = 0, p = 0; p < nprocs; p++) { ptrToRecv[p] = k; k += nvtcs[p]; } } if (nprocs > 1) { temp = NULL; if (!iam ) { if ( !(temp = intMalloc_dist (n+1)) ) { fprintf (stderr, "Malloc fails for temp[]."); return (memAux + memRet); } memAux += (float) (n+1) * iword; }#if defined (_LONGINT) for (p=0; p<nprocs; p++) { if (ptrToRecv[p] > INT_MAX) ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); intBuf1[p] = (int) ptrToRecv[p]; }#else /* Default */ intBuf1 = ptrToRecv;#endif MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); } else temp = supno_s; if (!iam) { nsupers = 0; p = (int) OWNER( globToLoc[0] ); gb = temp[ptrToRecv[p]]; supno_n[0] = nsupers; ptrToRecv[p] ++; szsn = 1; for (j = 1; j < n; j ++) { if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { nsupers ++; p = (int) OWNER( globToLoc[j] ); gb = temp[ptrToRecv[p]]; szsn = 1; } else { szsn ++; } ptrToRecv[p] ++; supno_n[j] = nsupers; } nsupers++; if (nprocs > 1) { SUPERLU_FREE (temp); memAux -= (float) (n+1) * iword; } supno_n[n] = nsupers; } /* reset to 0 nnzToSend */ for (p = 0; p < 2 *nprocs; p++) nnzToSend[p] = 0; MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); nsupers = supno_n[n]; /* Allocate space for storing Glu_persist_n. */ if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { fprintf (stderr, "Malloc fails for xsup_n[]."); return (memAux + memRet); } memRet += (float) (nsupers+1) * iword; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF L and U. ------------------------------------------------------------*/ gb = EMPTY; for (i = 0; i < n; i++) { if (gb != supno_n[i]) { /* a new supernode starts */ gb = supno_n[i]; xsup_n[gb] = i; } } xsup_n[nsupers] = n; for (p = 0; p < nprocs; p++) { send_1[p] = FALSE; send_2[p] = FALSE; } for (gb_n = 0; gb_n < nsupers; gb_n ++) { i = xsup_n[gb_n]; if (iam == (int) OWNER( globToLoc[i] )) { pc = PCOL( gb_n, grid ); pr = PROW( gb_n, grid ); p_diag = PNUM( pr, pc, grid); i_loc = LOCAL_IND( globToLoc[i] ); gb_s = supno_s[i_loc]; fst_s = xsup_beg_s[gb_s]; lst_s = xsup_end_s[gb_s]; fst_s_l = LOCAL_IND( globToLoc[fst_s] ); for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { k = lsub_s[j]; if (k >= i) { gb = supno_n[k]; p = (int) PNUM( PROW(gb, grid), pc, grid ); nnzToSend[2*p] ++; send_1[p] = TRUE; } } for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { k = usub_s[j]; if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { gb = supno_n[k]; p = PNUM( pr, PCOL(gb, grid), grid); nnzToSend[2*p+1] ++; send_2[p] = TRUE; } } nsend_2 = 0; for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { nnzToSend[2*p+1] += 2; if (send_2[p]) nsend_2 ++; } for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) if (send_2[p] || p == p_diag) { if (p == p_diag && !send_2[p]) nnzToSend[2*p+1] += nsend_2; else nnzToSend[2*p+1] += nsend_2-1; send_2[p] = FALSE; } nsend_1 = 0; for (p = pc; p < nprocs; p += grid->npcol) { nnzToSend[2*p] += 2; if (send_1[p]) nsend_1 ++; } for (p = pc; p < nprocs; p += grid->npcol) if (send_1[p]) { nnzToSend[2*p] += nsend_1-1; send_1[p] = FALSE; } else nnzToSend[2*p] += nsend_1; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, grid->comm); nnz_loc_l = nnz_loc_u = 0; SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; for (p = 0; p < nprocs; p++) { if ( p != iam ) { SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; } else { nnz_loc_l += nnzToRecv[2*p]; nnz_loc_u += nnzToRecv[2*p+1]; nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; nnzToRecv_l[p] = nnzToRecv[2*p]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; } } /* Allocate space for storing the symbolic structure after redistribution. */ nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { fprintf (stderr, "Malloc fails for xlsub_n[]."); return (memAux + memRet); } memRet += (float) (nsupers_j+1) * iword; if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { fprintf (stderr, "Malloc fails for xusub_n[]."); return (memAux + memRet); } memRet += (float) (nsupers_i+1) * iword; /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { if (!(rcv_luind = intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { fprintf (stderr, "Malloc fails for rcv_luind[]."); return (memAux + memRet); } memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; } if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { fprintf (stderr, "Malloc fails for index[]."); return (memAux + memRet); } memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; } /* ------------------------------------------------------------------ LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF L and U. ------------------------------------------------------------------*/ sendL = TRUE; sendU = FALSE; while (sendL || sendU) { if (sendL) { xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -