📄 get_perm_c_parmetis.c
/*
 * -- Distributed symbolic factorization auxiliary routine (version 2.1) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
 * INRIA France - January 2004
 * Laura Grigori
 *
 * November 1, 2007
 */

/* limits.h: the largest positive integer (INT_MAX) */
#include <limits.h>
#include <math.h>
#include "superlu_ddefs.h"

/*
 * Internal prototypes
 */
static float
a_plus_at_CompRow_loc(int, int_t *, int, int_t *, int_t, int_t *, int_t *,
                      int, int_t *, int_t *, int_t **, int_t **, gridinfo_t *);

float
get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
                     int nprocs_i, int noDomains,
                     int_t **sizes, int_t **fstVtxSep,
                     gridinfo_t *grid, MPI_Comm *metis_comm)
/*
 * Purpose
 * =======
 *
 * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
 * graph partitioning algorithm to the symmetrized graph A+A'.  The
 * multilevel graph partitioning algorithm used is the
 * ParMETIS_V3_NodeND routine available in the parallel graph
 * partitioning package parMETIS.
 *
 * The number of independent sub-domains noDomains computed by this
 * algorithm has to be a power of 2.  Hence noDomains is the largest
 * power of 2 that is smaller than or equal to nprocs_i, where
 * nprocs_i = nprow * npcol is the number of processors used in
 * SuperLU_DIST.
 *
 * Arguments
 * =========
 *
 * A       (input) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol).  The number
 *         of linear equations is A->nrow.  Matrix A is distributed
 *         in NRformat_loc format.
 *
 * perm_r  (input) int_t*
 *         Row permutation vector of size A->nrow, which defines the
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in
 *         position j in Pr*A.
 *
 * perm_c  (output) int_t*
 *         Column permutation vector of size A->ncol, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is
 *         in position j in A*Pc.
 *
 * nprocs_i (input) int
 *         Number of processors the input matrix is distributed on, in a
 *         block row format.  It corresponds to the number of processors
 *         used in SuperLU_DIST.
 *
 * noDomains (input) int, must be a power of 2
 *         Number of independent domains to be computed by the graph
 *         partitioning algorithm.  ( noDomains <= nprocs_i )
 *
 * sizes   (output) int_t**, of size 2 * noDomains
 *         Returns a pointer to an array containing the number of nodes
 *         for each sub-domain and each separator.  Separators are stored
 *         from left to right.
 *         Memory for the array is allocated in this routine.
 *
 * fstVtxSep (output) int_t**, of size 2 * noDomains
 *         Returns a pointer to an array containing the first node of each
 *         sub-domain and each separator.
 *         Memory for the array is allocated in this routine.
 *
 * Return value
 * ============
 *   < 0, number of bytes allocated on return from the symbolic factorization.
 *   > 0, number of bytes allocated when out of memory.
 *
 */
{
    NRformat_loc *Astore;
    int   iam, p;
    int  *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
    int  *options, numflag;
    int_t m_loc, nnz_loc, fst_row;
    int_t m, n, bnz, i, j;
    int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
    int_t *b_rowptr, *b_colind;
    int_t *dist_order;
    int  *recvcnts, *displs;
    /* first row index on each processor when the matrix is distributed
       on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
    int_t *vtxdist_i, *vtxdist_o;
    int_t szSep, k, noNodes;
    float apat_mem_l; /* memory used during the computation of the graph of A+A' */
    float mem;        /* memory used during this routine */
    MPI_Status status;

    /* Initialization. */
    MPI_Comm_rank (grid->comm, &iam);
    n = A->ncol;
    m = A->nrow;
    if ( m != n ) ABORT("Matrix is not square");
    mem = 0.;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
#endif

    Astore = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */
    m_loc = Astore->m_loc;     /* number of rows local to this processor */
    fst_row = Astore->fst_row; /* global index of the first row */
    rowptr = Astore->rowptr;   /* pointer to rows and column indices */
    colind = Astore->colind;

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
                       noDomains);
#endif

    numflag = 0;

    /* determine first row on each processor */
    vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
    if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
    vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
    if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");

    MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t, grid->comm);
    vtxdist_i[nprocs_i] = m;

    if (noDomains == nprocs_i) {
        /* keep the same distribution of A */
        for (p = 0; p <= nprocs_i; p++)
            vtxdist_o[p] = vtxdist_i[p];
    } else {
        i = n / noDomains;
        j = n % noDomains;
        for (k = 0, p = 0; p < noDomains; p++) {
            vtxdist_o[p] = k;
            k += i;
            if (p < j)  k++;
        }
        /* The remaining non-participating processors get the same
           first-row-number as the last processor. */
        for (p = noDomains; p <= nprocs_i; p++)
            vtxdist_o[p] = k;
    }

#if ( DEBUGlevel>=2 )
    if (!iam)
        PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
#endif

    /* Compute distributed A + A' */
    if ((apat_mem_l = a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
                                            n, rowptr, colind, noDomains,
                                            vtxdist_o, &bnz, &b_rowptr,
                                            &b_colind, grid)) > 0)
        return (apat_mem_l);
    mem += -apat_mem_l;

    /* Initialize and allocate storage for parMetis. */
    (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
    if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
    l_sizes = *sizes;
    (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
    if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
    l_fstVtxSep = *fstVtxSep;
    m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];

    if ( iam < noDomains)
        /* dist_order_int is the perm returned by parMetis, distributed */
        if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int))))
            ABORT("SUPERLU_MALLOC fails for dist_order_int.");

    /* ParMETIS represents the column pointers and row indices of
     * the input matrix using integers.  When SuperLU_DIST uses
     * long int for the int_t type, several supplementary copies
     * need to be performed in order to call ParMETIS. */
#if defined (_LONGINT)
    l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
    if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");

    /* Allocate storage */
    if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
        ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
    for (i = 0; i <= m_loc; i++)
        b_rowptr_int[i] = b_rowptr[i];
    SUPERLU_FREE (b_rowptr);

    if ( bnz ) {
        if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
            ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
        for (i = 0; i < bnz; i++)
            b_colind_int[i] = b_colind[i];
        SUPERLU_FREE (b_colind);
    }

    if ( !(vtxdist_o_int = (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
        ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
    for (i = 0; i <= nprocs_i; i++)
        vtxdist_o_int[i] = vtxdist_o[i];
    SUPERLU_FREE (vtxdist_o);

#else  /* Default */

    vtxdist_o_int = vtxdist_o;
    b_rowptr_int = b_rowptr;
    b_colind_int = b_colind;
    l_sizes_int = l_sizes;

#endif

    if ( iam < noDomains) {
        options = (int *) SUPERLU_MALLOC(4 * sizeof(int));
        options[0] = 0;
        options[1] = 0;
        options[2] = 0;
        options[3] = 1;
        ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int,
                           &numflag, options, dist_order_int, l_sizes_int,
                           metis_comm);
    }

    if (bnz) SUPERLU_FREE (b_colind_int);
    if ( iam < noDomains) {
        SUPERLU_FREE (options);
    }
    SUPERLU_FREE (b_rowptr_int);

#if defined (_LONGINT)
    /* Copy data from dist_order_int to dist_order */
    if ( iam < noDomains) {
        /* dist_order is the perm returned by parMetis, distributed */
        if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
            ABORT("SUPERLU_MALLOC fails for dist_order.");
        for (i = 0; i < m_loc; i++)
            dist_order[i] = dist_order_int[i];
        SUPERLU_FREE(dist_order_int);

        for (i = 0; i < 2*noDomains; i++)
            l_sizes[i] = l_sizes_int[i];
        SUPERLU_FREE(l_sizes_int);
    }
#else
    dist_order = dist_order_int;
#endif

    /* Allgatherv dist_order to get perm_c */
    if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
        ABORT ("SUPERLU_MALLOC fails for displs.");
    if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
        ABORT ("SUPERLU_MALLOC fails for recvcnts.");

    for (i = 0; i < nprocs_i; i++)
        recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i];
    displs[0] = 0;
    for (i = 1; i < nprocs_i; i++)
        displs[i] = displs[i-1] + recvcnts[i-1];

    MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs,
                    mpi_int_t, grid->comm);

    if ( iam < noDomains) {
        SUPERLU_FREE (dist_order);
    }
    SUPERLU_FREE (vtxdist_i);
    SUPERLU_FREE (vtxdist_o_int);
    SUPERLU_FREE (recvcnts);
    SUPERLU_FREE (displs);

    /* send l_sizes to every processor p >= noDomains */
    if (!iam)
        for (p = noDomains; p < nprocs_i; p++)
            MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
    if (noDomains <= iam && iam < nprocs_i)
        MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm, &status);

    /* Determine the first node in each separator, store it in l_fstVtxSep */
    for (j = 0; j < 2 * noDomains; j++)
        l_fstVtxSep[j] = 0;
    l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
    szSep = noDomains;
    i = 0;
    while (szSep != 1) {
        for (j = i; j < i + szSep; j++) {
            l_fstVtxSep[j] += l_sizes[j];
        }
        for (j = i; j < i + szSep; j++) {
            k = i + szSep + (j-i) / 2;
            l_fstVtxSep[k] += l_fstVtxSep[j];
        }
        i += szSep;
        szSep = szSep / 2;
    }

    l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
    i = 2 * noDomains - 2;
    szSep = 1;
    while (i > 0) {
        for (j = i; j < i + szSep; j++) {
            k = (i - 2 * szSep) + (j-i) * 2 + 1;
            noNodes = l_fstVtxSep[k];
            l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
            l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k]
                               - noNodes - l_sizes[k-1];
        }
        szSep *= 2;
        i -= szSep;
    }

#if ( PRNTlevel>=2 )
    if (!iam ) {
        PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
        PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
    }
#endif

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
#endif

    return (-mem);
} /* get_perm_c_parmetis */

static float
a_plus_at_CompRow_loc
(
 int    iam,         /* Input - my processor number */
 int_t  *perm_r,     /* Input - row permutation vector Pr */
 int    nprocs_i,    /* Input - number of processors the input matrix
                        is distributed on */
 int_t  *vtxdist_i,  /* Input - index of first row on each processor of
                        the input matrix */
 int_t  n,           /* Input - number of columns in matrix A. */
 int_t  *rowptr,     /* Input - row pointers of size m_loc+1 for matrix A. */
 int_t  *colind,     /* Input - column indices of size nnz_loc for matrix A. */
 int    nprocs_o,    /* Input - number of processors the output matrix
                        is distributed on */
 int_t  *vtxdist_o,  /* Input - index of first row on each processor of
                        the output matrix */
 int_t  *p_bnz,      /* Output - on exit, returns the actual number of
                        local nonzeros in matrix A'+A. */
 int_t  **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
 int_t  **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
 gridinfo_t *grid    /* Input - grid of processors information */
 )
{
/*
 * Purpose
 * =======
 *
 * Form the structure of Pr*A + A'*Pr'.  A is an n-by-n matrix in
 * NRformat_loc format, represented by (rowptr, colind).  The output
 * B = Pr*A + A'*Pr' is in NRformat_loc format (symmetrically, also row
 * oriented), represented by (b_rowptr, b_colind).
 *
 * The input matrix A is distributed in block row format on nprocs_i
 * processors.  The output matrix B is distributed in block row format
 * on nprocs_o processors, where nprocs_o <= nprocs_i.  On output, the
 * matrix B has its rows permuted according to perm_r.
 *
 * Sketch of the algorithm
 * =======================
 *
 * Let iam be my process number.  Let fst_row, lst_row = m_loc +
 * fst_row be the first/last row stored on iam.
 *
 * Compute Pr' - the inverse row permutation, stored in iperm_r.
 *
 * Compute the transpose of the block row of Pr*A that iam owns:
 *   T[:, Pr(fst_row:lst_row)] = Pr' * A[:, fst_row:lst_row] * Pr'
 *
 * All-to-all communication such that every processor iam receives all
 * the blocks of the transpose matrix that it needs, that is
 *   T[fst_row:lst_row, :]
 *
 * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :]
 *
 * If Pr != I or nprocs_i != nprocs_o, then permute the rows of B (that
 * is, compute Pr*B) and redistribute from nprocs_i to nprocs_o
 * according to the block row distribution in vtxdist_i, vtxdist_o.
 */

    int_t i, j, k, col, num_nz, nprocs;
    int_t *tcolind_recv; /* temporary receive buffer */
    int_t *tcolind_send; /* temporary send buffer */
    int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv;
    int_t ind, ind_tmp, ind_rcv;
    int   redist_pra;    /* TRUE if Pr != I or nprocs_i != nprocs_o */
    int_t *marker, *iperm_r;
    int_t *sendCnts, *recvCnts;
    int_t *sdispls, *rdispls;
    int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t;
    int_t p, t_ind, nelts, ipcol;
    int_t m_loc, m_loc_o;     /* number of local rows */
    int_t fst_row, fst_row_o; /* index of first local row */
    int_t nnz_loc;            /* number of local nonzeros in matrix A */
    float apat_mem, apat_mem_max;
    int  *intBuf1, *intBuf2, *intBuf3, *intBuf4;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
#endif

    fst_row = vtxdist_i[iam];
    m_loc = vtxdist_i[iam+1] - vtxdist_i[iam];
    nnz_loc = rowptr[m_loc];
    redist_pra = FALSE;
    nprocs = SUPERLU_MAX(nprocs_i, nprocs_o);
    apat_mem_max = 0.;

    if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for marker[]");
    if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for iperm_r[]");
    if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for sendCnts[]");
    if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for recvCnts[]");
    if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for sdispls[]");
    if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
        ABORT("SUPERLU_MALLOC fails for rdispls[]");
    apat_mem = 2 * n + 4 * nprocs + 3;

#if defined (_LONGINT)
    intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int));
    intBuf2 = intBuf1 + nprocs;
    intBuf3 = intBuf1 + 2 * nprocs;
    intBuf4 = intBuf1 + 3 * nprocs;
    apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
#endif
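
The header comment above requires noDomains to be a power of 2 no larger than the number of processes holding the matrix. Below is a minimal caller sketch of that convention; the driver name call_parmetis_ordering and its local variables are assumptions for illustration only and do not appear in the file above.

/* Hypothetical caller sketch -- not part of get_perm_c_parmetis.c. */
static float
call_parmetis_ordering(SuperMatrix *A, int_t *perm_r, int_t *perm_c,
                       gridinfo_t *grid, MPI_Comm *metis_comm)
{
    int_t *sizes, *fstVtxSep;
    float  info;
    int    nprocs = grid->nprow * grid->npcol;  /* processes holding A */
    int    noDomains = 1;

    /* Largest power of 2 that does not exceed nprocs, per the header comment. */
    while (2 * noDomains <= nprocs) noDomains *= 2;

    info = get_perm_c_parmetis(A, perm_r, perm_c, nprocs, noDomains,
                               &sizes, &fstVtxSep, grid, metis_comm);
    if (info > 0.)
        return info;   /* out of memory inside the routine */

    /* perm_c, sizes and fstVtxSep would now feed the symbolic
       factorization; they are allocated inside get_perm_c_parmetis,
       so the caller releases them when done. */
    SUPERLU_FREE(sizes);
    SUPERLU_FREE(fstVtxSep);
    return info;       /* <= 0: negative of the bytes used internally */
}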