📄 pzgsmv.c
/*
 * -- Distributed SuperLU routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * March 15, 2003
 *
 */

#include <math.h>
#include "superlu_zdefs.h"

void pzgsmv_init
(
 SuperMatrix *A,          /* Matrix A permuted by columns (input/output).
                             The type of A can be:
                             Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
 int_t *row_to_proc,      /* Input. Mapping between rows and processes. */
 gridinfo_t *grid,        /* Input */
 pzgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
 )
{
    NRformat_loc *Astore;
    int iam, p, procs;
    int *SendCounts, *RecvCounts;
    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
    int_t TotalIndSend, TotalValSend;
    int_t *colind, *rowptr;
    int_t *ind_tosend = NULL, *ind_torecv = NULL;
    int_t *ptr_ind_tosend, *ptr_ind_torecv;
    int_t *extern_start, *spa, *itemp;
    doublecomplex *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
    MPI_Request *send_req, *recv_req;
    MPI_Status status;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(grid->iam, "Enter pzgsmv_init()");
#endif

    /* ------------------------------------------------------------
       INITIALIZATION.
       ------------------------------------------------------------*/
    iam = grid->iam;
    procs = grid->nprow * grid->npcol;
    Astore = (NRformat_loc *) A->Store;
    m = A->nrow;
    n = A->ncol;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    colind = Astore->colind;
    rowptr = Astore->rowptr;
    nzval = Astore->nzval;
    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
        ABORT("Malloc fails for SendCounts[]");
    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
    RecvCounts = SendCounts + procs;
    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
        ABORT("Malloc fails for ptr_ind_tosend[]");
    ptr_ind_torecv = ptr_ind_tosend + procs + 1;
    if ( !(extern_start = intMalloc_dist(m_loc)) )
        ABORT("Malloc fails for extern_start[]");
    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];

    /* ------------------------------------------------------------
       COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
       THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
       LOCAL PART OF X.
       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
       ------------------------------------------------------------*/
    if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
        ABORT("Malloc fails for spa[]");
    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
        k = extern_start[i];
        for (j = rowptr[i]; j < rowptr[i+1]; ++j) { /* Each nonzero in row i */
            jcol = colind[j];
            p = row_to_proc[jcol];
            if ( p != iam ) {          /* External */
                if ( spa[jcol] == 0 ) { /* First time see this index */
                    ++SendCounts[p];
                    spa[jcol] = 1;
                }
            } else {  /* Swap to beginning the part of A corresponding
                         to the local part of X */
                l = colind[k];
                t = nzval[k];
                colind[k] = jcol;
                nzval[k] = nzval[j];
                colind[j] = l;
                nzval[j] = t;
                ++k;
            }
        }
        extern_start[i] = k;
    }

    /* ------------------------------------------------------------
       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
       ------------------------------------------------------------*/
    /* Build pointers to ind_tosend[]. */
    ptr_ind_tosend[0] = 0;
    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
        TotalIndSend += SendCounts[p]; /* Total to send. */
        ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
    }
#if 0
    ptr_ind_tosend[iam] = 0; /* Local part of X */
#endif
    if ( TotalIndSend ) {
        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
            ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
    }

    /* Build SPA to aid global to local translation. */
    for (i = 0; i < n; ++i) spa[i] = EMPTY;
    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
            jcol = colind[j];
            if ( spa[jcol] == EMPTY ) { /* First time see this index */
                p = row_to_proc[jcol];
                if ( p == iam ) { /* Local */
                    /*assert(jcol>=fst_row);*/
                    spa[jcol] = jcol - fst_row; /* Relative position in local X */
                } else {          /* External */
                    ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
                    spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
                    ++ptr_ind_tosend[p];
                }
            }
        }
    }

    /* ------------------------------------------------------------
       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
       ------------------------------------------------------------*/
    for (i = 0; i < m_loc; ++i) {
        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
            jcol = colind[j];
            colind[j] = spa[jcol];
        }
    }

    /* ------------------------------------------------------------
       COMMUNICATE THE EXTERNAL INDICES OF X.
       ------------------------------------------------------------*/
    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
                 grid->comm);

    /* Build pointers to ind_torecv[]. */
    ptr_ind_torecv[0] = 0;
    for (p = 0, TotalValSend = 0; p < procs; ++p) {
        TotalValSend += RecvCounts[p]; /* Total to receive. */
        ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
    }

    if ( TotalValSend ) {
        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
            ABORT("Malloc fails for ind_torecv[]");
    }

    if ( !(send_req = (MPI_Request *)
           SUPERLU_MALLOC(2*procs * sizeof(MPI_Request))) )
        ABORT("Malloc fails for send_req[].");
    recv_req = send_req + procs;
    for (p = 0; p < procs; ++p) {
        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
        if ( SendCounts[p] ) {
            MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
                      mpi_int_t, p, iam, grid->comm, &send_req[p]);
        }
        if ( RecvCounts[p] ) {
            MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
                      mpi_int_t, p, p, grid->comm, &recv_req[p]);
        }
    }
    for (p = 0; p < procs; ++p) {
        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
        if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
    }

    /* Allocate storage for the X values to be transferred. */
    if ( TotalIndSend &&
         !(val_torecv = doublecomplexMalloc_dist(TotalIndSend)) )
        ABORT("Malloc fails for val_torecv[].");
    if ( TotalValSend &&
         !(val_tosend = doublecomplexMalloc_dist(TotalValSend)) )
        ABORT("Malloc fails for val_tosend[].");

    gsmv_comm->extern_start = extern_start;
    gsmv_comm->ind_tosend = ind_tosend;
    gsmv_comm->ind_torecv = ind_torecv;
    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
    gsmv_comm->SendCounts = SendCounts;
    gsmv_comm->RecvCounts = RecvCounts;
    gsmv_comm->val_tosend = val_tosend;
    gsmv_comm->val_torecv = val_torecv;
    gsmv_comm->TotalIndSend = TotalIndSend;
    gsmv_comm->TotalValSend = TotalValSend;

    SUPERLU_FREE(spa);
    SUPERLU_FREE(send_req);

#if ( DEBUGlevel>=1 )
    PrintInt10("pzgsmv_init::rowptr", m_loc+1, rowptr);
    PrintInt10("pzgsmv_init::extern_start", m_loc, extern_start);
    CHECK_MALLOC(iam, "Exit pzgsmv_init()");
#endif

} /* PZGSMV_INIT */


/*
 * Performs sparse matrix-vector multiplication.
 */
void
pzgsmv
(
 int_t  abs,               /* Input. Do abs(A)*abs(x). */
 SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
                              The column indices are translated into the
                              relative positions in the gathered x-vector.
                              The type of A can be:
                              Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
 gridinfo_t *grid,         /* Input */
 pzgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
 doublecomplex x[],        /* Input. The distributed source vector */
 doublecomplex ax[]        /* Output. The distributed destination vector */
)
{
    NRformat_loc *Astore;
    int iam, procs;
    int_t i, j, p, m, m_loc, n, fst_row, jcol;
    int_t *colind, *rowptr;
    int *SendCounts, *RecvCounts;
    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
    int_t *extern_start, TotalValSend;
    doublecomplex *nzval, *val_tosend, *val_torecv;
    doublecomplex zero = {0.0, 0.0}, temp;
    double *ax_abs = (double *) ax;
    MPI_Request *send_req, *recv_req;
    MPI_Status status;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(grid->iam, "Enter pzgsmv()");
#endif

    /* ------------------------------------------------------------
       INITIALIZATION.
       ------------------------------------------------------------*/
    iam = grid->iam;
    procs = grid->nprow * grid->npcol;
    Astore = (NRformat_loc *) A_internal->Store;
    m = A_internal->nrow;
    n = A_internal->ncol;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    colind = Astore->colind;
    rowptr = Astore->rowptr;
    nzval = (doublecomplex *) Astore->nzval;
    extern_start = gsmv_comm->extern_start;
    ind_torecv = gsmv_comm->ind_torecv;
    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
    SendCounts = gsmv_comm->SendCounts;
    RecvCounts = gsmv_comm->RecvCounts;
    val_tosend = (doublecomplex *) gsmv_comm->val_tosend;
    val_torecv = (doublecomplex *) gsmv_comm->val_torecv;
    TotalValSend = gsmv_comm->TotalValSend;

    /* ------------------------------------------------------------
       COPY THE X VALUES INTO THE SEND BUFFER.
       ------------------------------------------------------------*/
    for (i = 0; i < TotalValSend; ++i) {
        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
        val_tosend[i] = x[j];
    }

    /* ------------------------------------------------------------
       COMMUNICATE THE X VALUES.
       ------------------------------------------------------------*/
    if ( !(send_req = (MPI_Request *)
           SUPERLU_MALLOC(2*procs * sizeof(MPI_Request))) )
        ABORT("Malloc fails for send_req[].");
    recv_req = send_req + procs;
    for (p = 0; p < procs; ++p) {
        if ( RecvCounts[p] ) {
            MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
                      SuperLU_MPI_DOUBLE_COMPLEX, p, iam,
                      grid->comm, &send_req[p]);
        }
        if ( SendCounts[p] ) {
            MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
                      SuperLU_MPI_DOUBLE_COMPLEX, p, p,
                      grid->comm, &recv_req[p]);
        }
    }
    for (p = 0; p < procs; ++p) {
        if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
        if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
    }

    /* ------------------------------------------------------------
       PERFORM THE ACTUAL MULTIPLICATION.
       ------------------------------------------------------------*/
    if ( abs ) { /* Perform abs(A)*abs(x) */
        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
            ax_abs[i] = 0.0;

            /* Multiply the local part. */
            for (j = rowptr[i]; j < extern_start[i]; ++j) {
                jcol = colind[j];
                ax_abs[i] += z_abs1(&nzval[j]) * z_abs1(&x[jcol]);
            }

            /* Multiply the external part. */
            for (; j < rowptr[i+1]; ++j) {
                jcol = colind[j];
                ax_abs[i] += z_abs1(&nzval[j]) * z_abs(&val_torecv[jcol]);
            }
        }
    } else {
        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
            ax[i] = zero;

            /* Multiply the local part. */
            for (j = rowptr[i]; j < extern_start[i]; ++j) {
                jcol = colind[j];
                zz_mult(&temp, &nzval[j], &x[jcol]);
                z_add(&ax[i], &ax[i], &temp);
            }

            /* Multiply the external part. */
            for (; j < rowptr[i+1]; ++j) {
                jcol = colind[j];
                zz_mult(&temp, &nzval[j], &val_torecv[jcol]);
                z_add(&ax[i], &ax[i], &temp);
            }
        }
    }

    SUPERLU_FREE(send_req);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pzgsmv()");
#endif

} /* PZGSMV */

void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm)
{
    int_t *it;
    doublecomplex *dt;
    SUPERLU_FREE(gsmv_comm->extern_start);
    if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it);
    if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it);
    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
    SUPERLU_FREE(gsmv_comm->SendCounts);
    if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt);
    if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt);
}
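The three routines above are meant to be used as a set: `pzgsmv_init` builds the communication structure once (and permutes the local column indices of A in place), `pzgsmv` can then be called repeatedly as long as the sparsity pattern is unchanged, and `pzgsmv_finalize` releases the buffers. The fragment below is a minimal, hypothetical usage sketch, not part of pzgsmv.c: the driver function name and the assumption that A, row_to_proc, grid, x, and ax have already been set up are mine, and only the four calls shown are taken from this file.

/*
 * Hypothetical usage sketch (assumed driver, not part of pzgsmv.c).
 * Assumes A is an already-created NR_loc SuperMatrix distributed over an
 * already-initialized process grid, row_to_proc[] maps each global index
 * to its owning process, and x/ax hold the local m_loc entries of the
 * source and destination vectors.
 */
#include "superlu_zdefs.h"

void spmv_driver_sketch(SuperMatrix *A, int_t *row_to_proc,
                        gridinfo_t *grid, doublecomplex *x,
                        doublecomplex *ax)
{
    pzgsmv_comm_t gsmv_comm;

    /* One-time setup: builds the send/receive lists for the off-process
       entries of x and rewrites A's column indices in place, so keep a
       copy of A if the original indices are still needed elsewhere. */
    pzgsmv_init(A, row_to_proc, grid, &gsmv_comm);

    /* ax := A * x on the local rows; reusable for different x vectors. */
    pzgsmv(0, A, grid, &gsmv_comm, x, ax);

    /* With abs != 0, the output buffer is reinterpreted as double* and
       filled with the m_loc values of |A|*|x|. */
    pzgsmv(1, A, grid, &gsmv_comm, x, ax);

    /* Release the communication buffers. */
    pzgsmv_finalize(&gsmv_comm);
}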