
get_perm_c_parmetis.c

SuperLU version 2.2: direct solution of large, sparse, nonsymmetric linear systems.
Language: C
Page 1 of 2 (the listing below is truncated at the page break)
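Before the listing, a rough sketch of how this routine is typically driven. The fragment below is an illustration based only on the signature and header comments in this file: the helper name order_with_parmetis, the communicator setup, and the error handling are my own assumptions, not SuperLU_DIST's actual driver code (in the library, the pdgssvx driver and its symbolic-factorization path play this role).

    /* Hypothetical driver fragment; illustrates the calling convention only. */
    #include "superlu_ddefs.h"

    void order_with_parmetis(SuperMatrix *A,   /* distributed, NRformat_loc */
                             int_t *perm_r,    /* row permutation, e.g. identity */
                             int_t *perm_c,    /* output: column permutation */
                             gridinfo_t *grid) /* SuperLU_DIST process grid */
    {
        int nprocs = grid->nprow * grid->npcol;
        int noDomains = 1;
        int iam;
        int_t *sizes, *fstVtxSep;  /* allocated inside get_perm_c_parmetis */
        MPI_Comm metis_comm;
        float mem;

        /* noDomains must be the largest power of 2 not exceeding nprocs. */
        while (2 * noDomains <= nprocs) noDomains *= 2;

        /* Assumption: only ranks < noDomains call ParMETIS, so give them
           their own communicator; the library's exact setup may differ. */
        MPI_Comm_rank(grid->comm, &iam);
        MPI_Comm_split(grid->comm, iam < noDomains ? 0 : MPI_UNDEFINED,
                       iam, &metis_comm);

        /* All ranks of the grid enter the routine; a positive return value
           signals an out-of-memory condition (see the header comment). */
        mem = get_perm_c_parmetis(A, perm_r, perm_c, nprocs, noDomains,
                                  &sizes, &fstVtxSep, grid, &metis_comm);
        if (mem > 0.) ABORT("get_perm_c_parmetis: out of memory");

        /* sizes/fstVtxSep describe the separator tree and would normally be
           handed on to the distributed symbolic factorization. */
        SUPERLU_FREE(sizes);
        SUPERLU_FREE(fstVtxSep);
    }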
/*
 * -- Distributed symbolic factorization auxiliary routine (version 2.1) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
 * INRIA France - January 2004
 * Laura Grigori
 *
 * November 1, 2007
 */

/* limits.h:  the largest positive integer (INT_MAX) */
#include <limits.h>
#include <math.h>
#include "superlu_ddefs.h"

/*
 * Internal prototypes
 */
static float
a_plus_at_CompRow_loc(int, int_t *, int, int_t *, int_t, int_t *, int_t *,
                      int, int_t *, int_t *, int_t **, int_t **, gridinfo_t *);

float
get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
                     int nprocs_i, int noDomains,
                     int_t **sizes, int_t **fstVtxSep,
                     gridinfo_t *grid, MPI_Comm *metis_comm)
/*
 * Purpose
 * =======
 *
 * GET_PERM_C_PARMETIS obtains a permutation matrix Pc by applying a
 * graph partitioning algorithm to the symmetrized graph A+A'.  The
 * multilevel graph partitioning algorithm used is the
 * ParMETIS_V3_NodeND routine available in the parallel graph
 * partitioning package ParMETIS.
 *
 * The number of independent sub-domains noDomains computed by this
 * algorithm has to be a power of 2.  Hence noDomains is the largest
 * power of 2 that is not larger than nprocs_i, where nprocs_i =
 * nprow * npcol is the number of processors used in SuperLU_DIST.
 *
 * Arguments
 * =========
 *
 * A       (input) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of linear equations is A->nrow.  Matrix A is distributed
 *         in NRformat_loc format.
 *
 * perm_r  (input) int_t*
 *         Row permutation vector of size A->nrow, which defines the
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in
 *         position j in Pr*A.
 *
 * perm_c  (output) int_t*
 *         Column permutation vector of size A->ncol, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is
 *         in position j in A*Pc.
 *
 * nprocs_i (input) int
 *         Number of processors the input matrix is distributed on in a
 *         block row format.  It corresponds to the number of processors
 *         used in SuperLU_DIST.
 *
 * noDomains (input) int, must be a power of 2
 *         Number of independent domains to be computed by the graph
 *         partitioning algorithm.  ( noDomains <= nprocs_i )
 *
 * sizes   (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing the number of nodes
 *         for each sub-domain and each separator.  Separators are stored
 *         from left to right.
 *         Memory for the array is allocated in this routine.
 *
 * fstVtxSep (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing the first node of each
 *         sub-domain and each separator.
 *         Memory for the array is allocated in this routine.
 *
 * Return value
 * ============
 *   < 0, number of bytes allocated on return from the symbolic factorization.
 *   > 0, number of bytes allocated when out of memory.
 *
 */
{
  NRformat_loc *Astore;
  int   iam, p;
  int   *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
  int   *options, numflag;
  int_t m_loc, nnz_loc, fst_row;
  int_t m, n, bnz, i, j;
  int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
  int_t *b_rowptr, *b_colind;
  int_t *dist_order;
  int   *recvcnts, *displs;
  /* first row index on each processor when the matrix is distributed
     on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
  int_t *vtxdist_i, *vtxdist_o;
  int_t szSep, k, noNodes;
  float apat_mem_l; /* memory used during the computation of the graph of A+A' */
  float mem;        /* memory used during this routine */
  MPI_Status status;

  /* Initialization. */
  MPI_Comm_rank (grid->comm, &iam);
  n = A->ncol;
  m = A->nrow;
  if ( m != n ) ABORT("Matrix is not square");
  mem = 0.;

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
#endif

  Astore = (NRformat_loc *) A->Store;
  nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */
  m_loc   = Astore->m_loc;   /* number of rows local to this processor */
  fst_row = Astore->fst_row; /* global index of the first row */
  rowptr  = Astore->rowptr;  /* pointer to rows and column indices */
  colind  = Astore->colind;

#if ( PRNTlevel>=1 )
  if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
                     noDomains);
#endif

  numflag = 0;
  /* determine first row on each processor */
  vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
  vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");

  MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
                 grid->comm);
  vtxdist_i[nprocs_i] = m;

  if (noDomains == nprocs_i) {
    /* keep the same distribution of A */
    for (p = 0; p <= nprocs_i; p++)
      vtxdist_o[p] = vtxdist_i[p];
  }
  else {
    i = n / noDomains;
    j = n % noDomains;
    for (k = 0, p = 0; p < noDomains; p++) {
      vtxdist_o[p] = k;
      k += i;
      if (p < j)  k++;
    }
    /* The remaining non-participating processors get the same
       first-row-number as the last processor. */
    for (p = noDomains; p <= nprocs_i; p++)
      vtxdist_o[p] = k;
  }

#if ( DEBUGlevel>=2 )
  if (!iam)
    PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
#endif

  /* Compute distributed A + A' */
  if ((apat_mem_l =
       a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
                             n, rowptr, colind, noDomains, vtxdist_o,
                             &bnz, &b_rowptr, &b_colind, grid)) > 0)
    return (apat_mem_l);
  mem += -apat_mem_l;

  /* Initialize and allocate storage for parMetis. */
  (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
  l_sizes = *sizes;
  (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
  l_fstVtxSep = *fstVtxSep;
  m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];

  if ( iam < noDomains)
    /* dist_order_int is the perm returned by parMetis, distributed */
    if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int))))
      ABORT("SUPERLU_MALLOC fails for dist_order_int.");

  /* ParMETIS represents the column pointers and row indices of
   * the input matrix using integers.  When SuperLU_DIST uses long int
   * for the int_t type, several supplementary copies need to be
   * performed in order to call ParMETIS.
   */
#if defined (_LONGINT)
  l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
  if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");

  /* Allocate storage */
  if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
  for (i = 0; i <= m_loc; i++)
    b_rowptr_int[i] = b_rowptr[i];
  SUPERLU_FREE (b_rowptr);

  if ( bnz ) {
    if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
      ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
    for (i = 0; i < bnz; i++)
      b_colind_int[i] = b_colind[i];
    SUPERLU_FREE (b_colind);
  }

  if ( !(vtxdist_o_int =
         (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
  for (i = 0; i <= nprocs_i; i++)
    vtxdist_o_int[i] = vtxdist_o[i];
  SUPERLU_FREE (vtxdist_o);

#else  /* Default */
  vtxdist_o_int = vtxdist_o;
  b_rowptr_int = b_rowptr; b_colind_int = b_colind;
  l_sizes_int = l_sizes;
#endif

  if ( iam < noDomains) {
    options = (int *) SUPERLU_MALLOC(4 * sizeof(int));
    options[0] = 0;
    options[1] = 0;
    options[2] = 0;
    options[3] = 1;
    ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int,
                       &numflag, options,
                       dist_order_int, l_sizes_int, metis_comm);
  }

  if (bnz)
    SUPERLU_FREE (b_colind_int);
  if ( iam < noDomains) {
    SUPERLU_FREE (options);
  }
  SUPERLU_FREE (b_rowptr_int);

#if defined (_LONGINT)
  /* Copy data from dist_order_int to dist_order */
  if ( iam < noDomains) {
    /* dist_order is the perm returned by parMetis, distributed */
    if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for dist_order.");
    for (i = 0; i < m_loc; i++)
      dist_order[i] = dist_order_int[i];
    SUPERLU_FREE(dist_order_int);

    for (i = 0; i < 2*noDomains; i++)
      l_sizes[i] = l_sizes_int[i];
    SUPERLU_FREE(l_sizes_int);
  }
#else
  dist_order = dist_order_int;
#endif

  /* Allgatherv dist_order to get perm_c */
  if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for displs.");
  if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for recvcnts.");

  for (i = 0; i < nprocs_i; i++)
    recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i];

  displs[0] = 0;
  for (i = 1; i < nprocs_i; i++)
    displs[i] = displs[i-1] + recvcnts[i-1];

  MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs,
                  mpi_int_t, grid->comm);

  if ( iam < noDomains) {
    SUPERLU_FREE (dist_order);
  }
  SUPERLU_FREE (vtxdist_i);
  SUPERLU_FREE (vtxdist_o_int);
  SUPERLU_FREE (recvcnts);
  SUPERLU_FREE (displs);

  /* send l_sizes to every processor p >= noDomains */
  if (!iam)
    for (p = noDomains; p < nprocs_i; p++)
      MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
  if (noDomains <= iam && iam < nprocs_i)
    MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
              &status);

  /* Determine the first node in each separator, store it in l_fstVtxSep */
  for (j = 0; j < 2 * noDomains; j++)
    l_fstVtxSep[j] = 0;
  l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
  szSep = noDomains;
  i = 0;
  while (szSep != 1) {
    for (j = i; j < i + szSep; j++) {
      l_fstVtxSep[j] += l_sizes[j];
    }
    for (j = i; j < i + szSep; j++) {
      k = i + szSep + (j-i) / 2;
      l_fstVtxSep[k] += l_fstVtxSep[j];
    }
    i += szSep;
    szSep = szSep / 2;
  }

  l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
  i = 2 * noDomains - 2;
  szSep = 1;
  while (i > 0) {
    for (j = i; j < i + szSep; j++) {
      k = (i - 2 * szSep) + (j-i) * 2 + 1;
      noNodes = l_fstVtxSep[k];
      l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
      l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] -
        noNodes - l_sizes[k-1];
    }
    szSep *= 2;
    i -= szSep;
  }

#if ( PRNTlevel>=2 )
  if (!iam ) {
    PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
    PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
  }
#endif

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
#endif

  return (-mem);
} /* get_perm_c_parmetis */
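/* Worked example (added for illustration; not part of the original file):
 * with noDomains = 2 and ParMETIS returning sizes = {3, 4, 2}, i.e. two
 * sub-domains of 3 and 4 vertices separated by 2 vertices, the up-sweep
 * above accumulates l_fstVtxSep = {3, 4, 9}; subtracting the root
 * separator size gives {3, 4, 7}; and the down-sweep produces
 * l_fstVtxSep = {0, 3, 7}.  In the nested-dissection ordering, domain 0
 * starts at vertex 0, domain 1 at vertex 3, and the separator at vertex 7.
 */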
static float
a_plus_at_CompRow_loc
(
 int   iam,          /* Input - my processor number */
 int_t *perm_r,      /* Input - row permutation vector Pr */
 int   nprocs_i,     /* Input - number of processors the input matrix
                        is distributed on */
 int_t *vtxdist_i,   /* Input - index of first row on each processor of
                        the input matrix */
 int_t n,            /* Input - number of columns in matrix A. */
 int_t *rowptr,      /* Input - row pointers of size m_loc+1 for matrix A. */
 int_t *colind,      /* Input - column indices of size nnz_loc for matrix A. */
 int   nprocs_o,     /* Input - number of processors the output matrix
                        is distributed on */
 int_t *vtxdist_o,   /* Input - index of first row on each processor of
                        the output matrix */
 int_t *p_bnz,       /* Output - on exit, returns the actual number of
                        local nonzeros in matrix A'+A. */
 int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
 int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
 gridinfo_t *grid    /* Input - grid of processors information */
 )
{
/*
 * Purpose
 * =======
 *
 * Form the structure of Pr*A + A'*Pr'.  A is an n-by-n matrix in
 * NRformat_loc format, represented by (rowptr, colind).  The output
 * B = Pr*A + A'*Pr' is in NRformat_loc format (symmetric, also row
 * oriented), represented by (b_rowptr, b_colind).
 *
 * The input matrix A is distributed in block row format on nprocs_i
 * processors.  The output matrix B is distributed in block row format
 * on nprocs_o processors, where nprocs_o <= nprocs_i.  On output, the
 * matrix B has its rows permuted according to perm_r.
 *
 * Sketch of the algorithm
 * =======================
 *
 * Let iam be my process number.  Let fst_row, lst_row = m_loc +
 * fst_row be the first/last row stored on iam.
 *
 * Compute Pr' - the inverse row permutation, stored in iperm_r.
 *
 * Compute the transpose of the block row of Pr*A that iam owns:
 *    T[:, Pr(fst_row:lst_row)] = Pr' * A[:, fst_row:lst_row] * Pr'
 *
 * All to all communication such that every processor iam receives all
 * the blocks of the transpose matrix that it needs, that is
 *           T[fst_row:lst_row, :]
 *
 * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :]
 *
 * If Pr != I or nprocs_i != nprocs_o, then permute the rows of B (that
 * is, compute Pr*B) and redistribute from nprocs_i to nprocs_o
 * processors according to the block row distributions in vtxdist_i,
 * vtxdist_o.
 */
  int_t i, j, k, col, num_nz, nprocs;
  int_t *tcolind_recv; /* temporary receive buffer */
  int_t *tcolind_send; /* temporary send buffer */
  int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv;
  int_t ind, ind_tmp, ind_rcv;
  int   redist_pra;    /* TRUE if Pr != I or nprocs_i != nprocs_o */
  int_t *marker, *iperm_r;
  int_t *sendCnts, *recvCnts;
  int_t *sdispls, *rdispls;
  int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t;
  int_t p, t_ind, nelts, ipcol;
  int_t m_loc, m_loc_o;      /* number of local rows */
  int_t fst_row, fst_row_o;  /* index of first local row */
  int_t nnz_loc;             /* number of local nonzeros in matrix A */
  float apat_mem, apat_mem_max;
  int   *intBuf1, *intBuf2, *intBuf3, *intBuf4;

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
#endif

  fst_row    = vtxdist_i[iam];
  m_loc      = vtxdist_i[iam+1] - vtxdist_i[iam];
  nnz_loc    = rowptr[m_loc];
  redist_pra = FALSE;
  nprocs     = SUPERLU_MAX(nprocs_i, nprocs_o);
  apat_mem_max = 0.;

  if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for marker[]");
  if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for iperm_r[]");
  if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for sendCnts[]");
  if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for recvCnts[]");
  if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for sdispls[]");
  if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for rdispls[]");
  apat_mem = 2 * n + 4 * nprocs + 3;

#if defined (_LONGINT)
  intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int));
  intBuf2 = intBuf1 + nprocs;
  intBuf3 = intBuf1 + 2 * nprocs;
  intBuf4 = intBuf1 + 3 * nprocs;
  apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
#endif
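The page breaks off here, partway into a_plus_at_CompRow_loc. To make the symmetrization step concrete, the following is a minimal serial analogue of what that routine computes: the structure of A + A' from a CSR structure. The function below is my own sketch under stated assumptions (the name, signature, and plain malloc error handling are mine, not SuperLU_DIST's); the real routine additionally applies the row permutation perm_r and redistributes block rows with all-to-all communication.

    /* Minimal serial sketch (illustration only) of forming the structure of
     * B = A + A' from the CSR structure (rowptr, colind) of an n-by-n A.
     * Allocates *b_rowptr (n+1 entries) and *b_colind (nnz(B) entries) with
     * malloc; returns nnz(B), or -1 on allocation failure. */
    #include <stdlib.h>

    int a_plus_at_structure(int n, const int *rowptr, const int *colind,
                            int **b_rowptr, int **b_colind)
    {
        int nnz = rowptr[n];
        int *t_rowptr = calloc((size_t)n + 1, sizeof(int)); /* A' row ptrs  */
        int *t_colind = malloc((size_t)nnz * sizeof(int));  /* A' col inds  */
        int *marker   = malloc((size_t)n * sizeof(int));    /* dedup per row */
        int *brp      = malloc(((size_t)n + 1) * sizeof(int));
        int i, j, col, bnz;

        if (!t_rowptr || !t_colind || !marker || !brp) return -1;

        /* Column counts of A are row counts of A'; prefix-sum into pointers. */
        for (i = 0; i < n; i++)
            for (j = rowptr[i]; j < rowptr[i + 1]; j++)
                t_rowptr[colind[j] + 1]++;
        for (i = 0; i < n; i++)
            t_rowptr[i + 1] += t_rowptr[i];

        /* Scatter entry (i, col) of A as entry (col, i) of A'. */
        for (i = 0; i < n; i++)
            for (j = rowptr[i]; j < rowptr[i + 1]; j++)
                t_colind[t_rowptr[colind[j]]++] = i;
        for (i = n; i > 0; i--)        /* undo the pointer advance above */
            t_rowptr[i] = t_rowptr[i - 1];
        t_rowptr[0] = 0;

        /* Count the union of row i of A and row i of A'. */
        for (i = 0; i < n; i++) marker[i] = -1;
        bnz = 0;
        for (i = 0; i < n; i++) {
            brp[i] = bnz;
            for (j = rowptr[i]; j < rowptr[i + 1]; j++)
                if (marker[colind[j]] != i) { marker[colind[j]] = i; bnz++; }
            for (j = t_rowptr[i]; j < t_rowptr[i + 1]; j++)
                if (marker[t_colind[j]] != i) { marker[t_colind[j]] = i; bnz++; }
        }
        brp[n] = bnz;

        /* Fill pass: same merge, now recording column indices.  Sentinels
         * n+i cannot clash with the sentinels i used while counting. */
        *b_colind = malloc((size_t)bnz * sizeof(int));
        if (!*b_colind) return -1;
        bnz = 0;
        for (i = 0; i < n; i++) {
            for (j = rowptr[i]; j < rowptr[i + 1]; j++) {
                col = colind[j];
                if (marker[col] != n + i) { marker[col] = n + i; (*b_colind)[bnz++] = col; }
            }
            for (j = t_rowptr[i]; j < t_rowptr[i + 1]; j++) {
                col = t_colind[j];
                if (marker[col] != n + i) { marker[col] = n + i; (*b_colind)[bnz++] = col; }
            }
        }
        *b_rowptr = brp;
        free(t_rowptr); free(t_colind); free(marker);
        return bnz;
    }

The marker array plays the same role as the marker the distributed routine declares above: it records, per output row, which columns have already been emitted, so duplicate entries contributed by both A and A' collapse to one.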
