📄 pzgstrs1.c
/*
 * -- Distributed SuperLU routine (version 1.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * September 1, 1999
 *
 * Modified:
 *     February 7, 2001    use MPI_Isend/MPI_Irecv
 *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
 */

#include "superlu_zdefs.h"

#define ISEND_IRECV

/*
 * Function prototypes
 */
#ifdef _CRAY
fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
                   doublecomplex*, int*, doublecomplex*, int*);
fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*,
                   doublecomplex*, int*, doublecomplex*, int*,
                   doublecomplex*, doublecomplex*, int*);
_fcd ftcs1;
_fcd ftcs2;
_fcd ftcs3;
#endif
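/*
 * The indexing in this routine relies on SuperLU_DIST's 2D block-cyclic
 * layout: the diagonal block of supernode k lives on the process at grid
 * coordinate (k mod nprow, k mod npcol), and local block numbers are the
 * global block number divided by the corresponding grid dimension.  The
 * helper below is an illustrative sketch of that mapping only -- the real
 * macros (PROW, PCOL, LBi, LBj, PNUM) are defined in superlu_defs.h -- and
 * it is excluded from the build.
 */
#if 0
static int_t owner_of_diag_block(int_t k, gridinfo_t *grid)
{
    int_t prow = k % grid->nprow;     /* process row owning block row k     */
    int_t pcol = k % grid->npcol;     /* process column owning block col k  */
    return prow * grid->npcol + pcol; /* same mapping as PNUM(prow, pcol, grid) */
}
#endif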
void
pzgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
         doublecomplex *x, int nrhs, SuperLUStat_t *stat, int *info)
{
/*
 * Purpose
 * =======
 *
 * PZGSTRS1 solves a system of distributed linear equations
 *
 *                   op( sub(A) ) * X = sub( B )
 *
 * with a general N-by-N distributed matrix sub( A ) using the LU
 * factorization computed by PZGSTRF.
 *
 * Arguments
 * =========
 *
 * n      (input) int (global)
 *        The order of the system of linear equations.
 *
 * LUstruct (input) LUstruct_t*
 *        The distributed data structures to store L and U factors,
 *        and the permutation vectors.
 *        See superlu_ddefs.h for the definition of 'LUstruct_t' structure.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * x      (input/output) doublecomplex*
 *        On entry, the right-hand side matrix.
 *        On exit, the solution matrix if info = 0;
 *
 *        NOTE: the right-hand side matrix is already distributed on
 *        the diagonal processes.
 *
 * nrhs   (input) int (global)
 *        Number of right-hand sides.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the triangular solves;
 *        See SuperLUStat_t structure defined in util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 */
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    doublecomplex alpha = {1.0, 0.0};
    doublecomplex zero = {0.0, 0.0};
    doublecomplex *lsum;   /* Local running sum of the updates to B-components */
    doublecomplex *lusup, *dest;
    doublecomplex *recvbuf, *tempv;
    doublecomplex *rtemp;  /* Result of full matrix-vector multiply. */
    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    int_t  *Urbs, *Urbs1;  /* Number of row blocks in each block column of U. */
    Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */
    int_t  **Ucb_valptr;       /* Vertical linked list pointing to Unzval[] */
    int_t  iam, kcol, krow, mycol, myrow;
    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
    int_t  nb, nlb, nub, nsupers;
    int_t  *xsup, *lsub, *usub;
    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL) */
    int_t  Pc, Pr;
    int    knsupc, nsupr;
    int    ldalsum;   /* Number of lsum entries locally owned. */
    int    maxrecvsz, p, pi;
    int_t  **Lrowind_bc_ptr;
    doublecomplex **Lnzval_bc_ptr;
    MPI_Status status;
#ifdef ISEND_IRECV
    MPI_Request *send_req, recv_req;
#endif

    /*-- Counts used for L-solve --*/
    int_t  *fmod;         /* Modification count for L-solve. */
    int_t  **fsendx_plist = Llu->fsendx_plist;
    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
    int_t  *frecv;        /* Count of modifications to be recv'd from
                             processes in this row. */
    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
    int_t  nleaf = 0, nroot = 0;

    /*-- Counts used for U-solve --*/
    int_t  *bmod;         /* Modification count for U-solve. */
    int_t  **bsendx_plist = Llu->bsendx_plist;
    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
    int_t  *brecv;        /* Count of modifications to be recv'd from
                             processes in this row. */
    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
    double t;
#if ( DEBUGlevel>=2 )
    int_t Ublocks = 0;
#endif

    t = SuperLU_timer_();

    /* Test input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( nrhs < 0 ) *info = -8;
    if ( *info ) {
        pxerbla("PZGSTRS1", grid, -*info);
        return;
    }

    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
    Llu->SolveMsgSent = 0;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pzgstrs1()");
#endif

    /* Save the count to be altered so it can be used by
       subsequent call to PZGSTRS1. */
    if ( !(fmod = intMalloc_dist(nlb)) )
        ABORT("Calloc fails for fmod[].");
    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
    if ( !(frecv = intMalloc_dist(nlb)) )
        ABORT("Malloc fails for frecv[].");
    Llu->frecv = frecv;

#ifdef ISEND_IRECV
    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
        ABORT("Malloc fails for send_req[].");
#endif

#ifdef _CRAY
    ftcs1 = _cptofcd("L", strlen("L"));
    ftcs2 = _cptofcd("N", strlen("N"));
    ftcs3 = _cptofcd("U", strlen("U"));
#endif

    /* Compute ilsum[] and ldalsum for process column 0. */
    ilsum = Llu->ilsum;
    ldalsum = Llu->ldalsum;

    /* Allocate working storage. */
    knsupc = sp_ienv_dist(3);
    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs
                                           + nlb * LSUM_H)) )
        ABORT("Calloc fails for lsum[].");
    maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
        ABORT("Malloc fails for recvbuf[].");
    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
        ABORT("Malloc fails for rtemp[].");

    /*---------------------------------------------------
     * Forward solve Ly = b.
     *---------------------------------------------------*/

    /*
     * Prepend the block number in the header for lsum[].
     */
    for (k = 0; k < nsupers; ++k) {
        knsupc = SuperSize( k );
        krow = PROW( k, grid );
        if ( myrow == krow ) {
            lk = LBi( k, grid );   /* Local block number. */
            il = LSUM_BLK( lk );
            lsum[il - LSUM_H].r = k;
            lsum[il - LSUM_H].i = 0;
        }
    }

    /*
     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
     */
    {
        superlu_scope_t *scp = &grid->rscp;

        for (k = 0; k < nsupers; ++k) {
            krow = PROW( k, grid );
            if ( myrow == krow ) {
                lk = LBi( k, grid );    /* Local block number. */
                kcol = PCOL( k, grid ); /* Root process in this row scope. */
                if ( mycol != kcol && fmod[lk] )
                    i = 1;  /* Contribution from non-diagonal process. */
                else i = 0;
                MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
                            MPI_SUM, kcol, scp->comm );
                if ( mycol == kcol ) { /* Diagonal process. */
                    nfrecvmod += frecv[lk];
                    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
#if ( DEBUGlevel>=2 )
                    printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]);
                    assert( frecv[lk] < Pc );
#endif
                }
            }
        }
    }
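    /*
     * At this point every diagonal process knows, for each block row it
     * owns, how many LSUM contributions it still has to receive before
     * X[k] can be solved (frecv[lk], summed into nfrecvmod), and nleaf
     * counts the blocks that depend on no such contributions and can
     * therefore be solved immediately.
     */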
    /* ---------------------------------------------------------
       Solve the leaf nodes first by all the diagonal processes.
       --------------------------------------------------------- */

#if ( DEBUGlevel>=2 )
    printf("(%2d) nleaf %4d\n", iam, nleaf);
#endif
    for (k = 0; k < nsupers && nleaf; ++k) {
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );
        if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
            knsupc = SuperSize( k );
            lk = LBi( k, grid );
            if ( !frecv[lk] && !fmod[lk] ) {
                fmod[lk] = -1;  /* Do not solve X[k] in the future. */
                ii = X_BLK( lk );
                lk = LBj( k, grid ); /* Local block number, column-wise. */
                lsub = Lrowind_bc_ptr[lk];
                lusup = Lnzval_bc_ptr[lk];
                nsupr = lsub[1];
#ifdef _CRAY
                CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
                      lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
                ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
                ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                       lusup, &nsupr, &x[ii], &knsupc);
#endif
                /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
                --nleaf;
#if ( DEBUGlevel>=2 )
                printf("(%2d) Solve X[%2d]\n", iam, k);
#endif

                /*
                 * Send Xk to process column Pc[k].
                 */
                for (p = 0; p < Pr; ++p)
                    if ( fsendx_plist[lk][p] != EMPTY ) {
                        pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                        MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
                                   grid->comm, &send_req[Llu->SolveMsgSent++]);
#else
                        MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
                                  grid->comm );
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%2d) Sent X[%2.0f] to P %2d\n",
                               iam, x[ii-XK_H], pi);
#endif
                    }

                /*
                 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
                 */
                nb = lsub[0] - 1;
                lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
                luptr = knsupc; /* Skip diagonal block L(k,k). */

                zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
                           fmod, nb, lptr, luptr, xsup, grid, Llu,
                           send_req, stat);
            }
        } /* if diagonal process ... */
    } /* for k ... */
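    /*
     * What follows is a message-driven loop: every process keeps receiving
     * until it has seen all nfrecvx X[k] blocks (tag Xk) and all nfrecvmod
     * partial sums (tag LSUM).  An incoming X[k] triggers the local updates
     * lsum[i] -= L_i,k * X[k]; an incoming LSUM is added into x[], and once
     * the last expected contribution for a block has arrived, the owning
     * diagonal process solves X[k] and forwards it down its process column.
     */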
    /*
     * Compute the internal nodes asynchronously by all processes.
     */
#if ( DEBUGlevel>=2 )
    printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n",
           iam, nfrecvx, nfrecvmod, nleaf);
#endif

    while ( nfrecvx || nfrecvmod ) { /* While not finished. */

        /* Receive a message. */
#ifdef ISEND_IRECV
        /* -MPI- FATAL: Remote protocol queue full */
        MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
                   MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req );
        MPI_Wait( &recv_req, &status );
#else
        MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
#endif
        k = (*recvbuf).r;

#if ( DEBUGlevel>=2 )
        printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif

        switch ( status.MPI_TAG ) {
            case Xk:
                --nfrecvx;
                lk = LBj( k, grid ); /* Local block number, column-wise. */
                lsub = Lrowind_bc_ptr[lk];
                lusup = Lnzval_bc_ptr[lk];
                if ( lsub ) {
                    nb = lsub[0];
                    lptr = BC_HEADER;
                    luptr = 0;
                    knsupc = SuperSize( k );

                    /*
                     * Perform local block modifications: lsum[i] -= L_i,k * X[k]
                     */
                    zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
                               fmod, nb, lptr, luptr, xsup, grid, Llu,
                               send_req, stat);
                } /* if lsub */
                break;

            case LSUM:
                --nfrecvmod;
                lk = LBi( k, grid ); /* Local block number, row-wise. */
                ii = X_BLK( lk );
                knsupc = SuperSize( k );
                tempv = &recvbuf[LSUM_H];
                RHS_ITERATE(j)
                    for (i = 0; i < knsupc; ++i)
                        z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc],
                              &tempv[i + j*knsupc]);

                if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
                    fmod[lk] = -1; /* Do not solve X[k] in the future. */
                    lk = LBj( k, grid ); /* Local block number, column-wise. */
                    lsub = Lrowind_bc_ptr[lk];
                    lusup = Lnzval_bc_ptr[lk];
                    nsupr = lsub[1];
#ifdef _CRAY
                    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
                          lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
                    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                           lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
                    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                           lusup, &nsupr, &x[ii], &knsupc);
#endif
                    /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
#if ( DEBUGlevel>=2 )
                    printf("(%2d) Solve X[%2d]\n", iam, k);
#endif

                    /*
                     * Send Xk to process column Pc[k].
                     */
                    kcol = PCOL( k, grid );
                    for (p = 0; p < Pr; ++p)
                        if ( fsendx_plist[lk][p] != EMPTY ) {
                            pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                            MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
                                       grid->comm,
                                       &send_req[Llu->SolveMsgSent++] );
#else
                            MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
                                      grid->comm );
#endif
#if ( DEBUGlevel>=2 )
                            printf("(%2d) Sent X[%2.0f] to P %2d\n",
                                   iam, x[ii-XK_H], pi);
#endif
                        }

                    /*