pdgstrs_bglobal.c
/*
 * -- Distributed SuperLU routine (version 1.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * September 1, 1999
 *
 * Modified:
 *     February 7, 2001    use MPI_Isend/MPI_Irecv
 *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
 */

#include "superlu_ddefs.h"

#define ISEND_IRECV

/*
 * Function prototypes
 */
#ifdef _CRAY
fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
                   double*, int*, double*, int*);
fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*,
                   int*, double*, int*, double*, double*, int*);
_fcd ftcs1;
_fcd ftcs2;
_fcd ftcs3;
#endif

static void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
                               int_t [], double [], int_t, double []);
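/*
 * A minimal calling sketch (illustration only).  The grid routines are the
 * standard SuperLU_DIST ones referred to in the argument notes below; the
 * options/statistics setup and the exact pdgstrf argument list are omitted
 * here because they are not part of this file.
 *
 *     gridinfo_t grid;
 *     superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
 *     ...                     factor A with pdgstrf, which fills LUstruct
 *     pdgstrs_Bglobal(n, &LUstruct, &grid, B, ldb, nrhs, &stat, &info);
 *     ...
 *     superlu_gridexit(&grid);
 */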
void
pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
                double *B, int_t ldb, int nrhs,
                SuperLUStat_t *stat, int *info)
{
/*
 * Purpose
 * =======
 *
 * pdgstrs_Bglobal solves a system of distributed linear equations
 * A*X = B with a general N-by-N matrix A using the LU factorization
 * computed by pdgstrf.
 *
 * Arguments
 * =========
 *
 * n      (input) int (global)
 *        The order of the system of linear equations.
 *
 * LUstruct (input) LUstruct_t*
 *        The distributed data structures storing L and U factors.
 *        The L and U factors are obtained from pdgstrf for
 *        the possibly scaled and permuted matrix A.
 *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh. It contains the MPI communicator, the number
 *        of process rows (NPROW), the number of process columns (NPCOL),
 *        and my process rank. It is an input argument to all the
 *        parallel routines.
 *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * B      (input/output) double*
 *        On entry, the right-hand side matrix of the possibly equilibrated
 *        and row permuted system.
 *        On exit, the solution matrix of the possibly equilibrated
 *        and row permuted system if info = 0;
 *
 *        NOTE: Currently, the N-by-NRHS matrix B must reside on all
 *        processes when calling this routine.
 *
 * ldb    (input) int (global)
 *        Leading dimension of matrix B.
 *
 * nrhs   (input) int (global)
 *        Number of right-hand sides.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the triangular solves.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 */
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    double alpha = 1.0;
    double *lsum;   /* Local running sum of the updates to B-components */
    double *x;      /* X component at step k. */
    double *lusup, *dest;
    double *recvbuf, *tempv;
    double *rtemp;  /* Result of full matrix-vector multiply. */
    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    int_t  *Urbs, *Urbs1;      /* Number of row blocks in each block column of U. */
    Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */
    int_t  **Ucb_valptr;       /* Vertical linked list pointing to Unzval[] */
    int_t  iam, kcol, krow, mycol, myrow;
    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
    int_t  nb, nlb, nub, nsupers;
    int_t  *xsup, *lsub, *usub;
    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL) */
    int_t  Pc, Pr;
    int    knsupc, nsupr;
    int    ldalsum;   /* Number of lsum entries locally owned. */
    int    maxrecvsz, p, pi;
    int_t  **Lrowind_bc_ptr;
    double **Lnzval_bc_ptr;
    MPI_Status status;
#if defined (ISEND_IRECV) || defined (BSEND)
    MPI_Request *send_req, recv_req;
#endif

    /*-- Counts used for L-solve --*/
    int_t  *fmod;         /* Modification count for L-solve. */
    int_t  **fsendx_plist = Llu->fsendx_plist;
    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
    int_t  *frecv;        /* Count of modifications to be recv'd from
                             processes in this row. */
    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
    int_t  nleaf = 0, nroot = 0;

    /*-- Counts used for U-solve --*/
    int_t  *bmod;         /* Modification count for U-solve. */
    int_t  **bsendx_plist = Llu->bsendx_plist;
    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
    int_t  *brecv;        /* Count of modifications to be recv'd from
                             processes in this row. */
    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
    double t;
#if ( DEBUGlevel>=2 )
    int_t Ublocks = 0;
#endif

    t = SuperLU_timer_();

    /* Test input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( nrhs < 0 ) *info = -9;
    if ( *info ) {
        pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
        return;
    }

    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
    stat->ops[SOLVE] = 0.0;
    Llu->SolveMsgSent = 0;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
#endif

    /* Save the count to be altered so it can be used by
       subsequent call to PDGSTRS_BGLOBAL. */
    if ( !(fmod = intMalloc_dist(nlb)) )
        ABORT("Calloc fails for fmod[].");
    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
    if ( !(frecv = intMalloc_dist(nlb)) )
        ABORT("Malloc fails for frecv[].");
    Llu->frecv = frecv;

#if defined (ISEND_IRECV) || defined (BSEND)
    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
        ABORT("Malloc fails for send_req[].");
#endif

#ifdef _CRAY
    ftcs1 = _cptofcd("L", strlen("L"));
    ftcs2 = _cptofcd("N", strlen("N"));
    ftcs3 = _cptofcd("U", strlen("U"));
#endif

    /* Obtain ilsum[] and ldalsum for process column 0. */
    ilsum = Llu->ilsum;
    ldalsum = Llu->ldalsum;

    /* Allocate working storage. */
    knsupc = sp_ienv_dist(3);
    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) )
        ABORT("Calloc fails for lsum[].");
    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs + nlb * XK_H)) )
        ABORT("Malloc fails for x[].");
    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
        ABORT("Malloc fails for recvbuf[].");
    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
        ABORT("Malloc fails for rtemp[].");

    /*---------------------------------------------------
     * Forward solve Ly = b.
     *---------------------------------------------------*/
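    /*
     * The L-solve is driven by two counters per local block row lk:
     * fmod[lk] counts the local updates lsum <- lsum - L(i,k)*X[k] still to
     * be applied, and frecv[lk] (on the diagonal process) counts the partial
     * sums still expected from the other processes in the same process row.
     * Supernodes whose counters are already zero are the leaves and are
     * solved immediately; the remaining blocks are solved as Xk and LSUM
     * messages arrive in the asynchronous loop further below.
     */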
    /*
     * Copy B into X on the diagonal processes.
     */
    ii = 0;
    for (k = 0; k < nsupers; ++k) {
        knsupc = SuperSize( k );
        krow = PROW( k, grid );
        if ( myrow == krow ) {
            lk = LBi( k, grid );   /* Local block number. */
            il = LSUM_BLK( lk );
            lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
            kcol = PCOL( k, grid );
            if ( mycol == kcol ) { /* Diagonal process. */
                jj = X_BLK( lk );
                x[jj - XK_H] = k;  /* Block number prepended in the header. */
                RHS_ITERATE(j)
                    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
                        x[i + jj + j*knsupc] = B[i + ii + j*ldb];
            }
        }
        ii += knsupc;
    }

    /*
     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
     */
    {
        superlu_scope_t *scp = &grid->rscp;

        for (k = 0; k < nsupers; ++k) {
            krow = PROW( k, grid );
            if ( myrow == krow ) {
                lk = LBi( k, grid );    /* Local block number. */
                kcol = PCOL( k, grid ); /* Root process in this row scope. */
                if ( mycol != kcol && fmod[lk] )
                    i = 1;  /* Contribution from non-diagonal process. */
                else i = 0;
                MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
                            MPI_SUM, kcol, scp->comm );
                if ( mycol == kcol ) { /* Diagonal process. */
                    nfrecvmod += frecv[lk];
                    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
#if ( DEBUGlevel>=2 )
                    printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]);
                    assert( frecv[lk] < Pc );
#endif
                }
            }
        }
    }

    /* ---------------------------------------------------------
       Solve the leaf nodes first by all the diagonal processes.
       --------------------------------------------------------- */
#if ( DEBUGlevel>=2 )
    printf("(%2d) nleaf %4d\n", iam, nleaf);
#endif
    for (k = 0; k < nsupers && nleaf; ++k) {
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );
        if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
            knsupc = SuperSize( k );
            lk = LBi( k, grid );
            if ( frecv[lk]==0 && fmod[lk]==0 ) {
                fmod[lk] = -1;  /* Do not solve X[k] in the future. */
                ii = X_BLK( lk );
                lk = LBj( k, grid ); /* Local block number, column-wise. */
                lsub = Lrowind_bc_ptr[lk];
                lusup = Lnzval_bc_ptr[lk];
                nsupr = lsub[1];
#ifdef _CRAY
                STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
                      lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
                dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
                dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                       lusup, &nsupr, &x[ii], &knsupc);
#endif
                stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
                --nleaf;
#if ( DEBUGlevel>=2 )
                printf("(%2d) Solve X[%2d]\n", iam, k);
#endif

                /*
                 * Send Xk to process column Pc[k].
                 */
                for (p = 0; p < Pr; ++p) {
                    if ( fsendx_plist[lk][p] != EMPTY ) {
                        pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                        MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                   MPI_DOUBLE, pi, Xk, grid->comm,
                                   &send_req[Llu->SolveMsgSent++]);
#else
#ifdef BSEND
                        MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                   MPI_DOUBLE, pi, Xk, grid->comm );
#else
                        MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                  MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%2d) Sent X[%2.0f] to P %2d\n",
                               iam, x[ii-XK_H], pi);
#endif
                    }
                }

                /*
                 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
                 */
                nb = lsub[0] - 1;
                lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
                luptr = knsupc; /* Skip diagonal block L(k,k). */

                dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
                           fmod, nb, lptr, luptr, xsup, grid, Llu,
                           send_req, stat);
            }
        } /* if diagonal process ... */
    } /* for k ... */

    /* -----------------------------------------------------------
       Compute the internal nodes asynchronously by all processes.
       ----------------------------------------------------------- */
#if ( DEBUGlevel>=2 )
    printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n",
           iam, nfrecvx, nfrecvmod, nleaf);
#endif
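    /*
     * Message-driven phase.  Two kinds of messages circulate on grid->comm:
     *   tag Xk   -- a solved X[k] block; the receiver applies its local
     *               updates lsum[i] -= L_i,k * X[k] via dlsum_fmod();
     *   tag LSUM -- a partial sum for block k, accumulated into x[] on the
     *               diagonal process; once frecv[lk] and fmod[lk] are both
     *               zero, X[k] is solved and sent down the process column.
     * The loop terminates when nfrecvx and nfrecvmod both reach zero.
     */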
    while ( nfrecvx || nfrecvmod ) { /* While not finished. */

        /* Receive a message. */
#ifdef ISEND_IRECV
        /* -MPI- FATAL: Remote protocol queue full */
        MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
                   grid->comm, &recv_req );
        MPI_Wait( &recv_req, &status );
#else
        MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  grid->comm, &status );
#endif

        k = *recvbuf;

#if ( DEBUGlevel>=2 )
        printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif

        switch ( status.MPI_TAG ) {
          case Xk:
              --nfrecvx;
              lk = LBj( k, grid ); /* Local block number, column-wise. */
              lsub = Lrowind_bc_ptr[lk];
              lusup = Lnzval_bc_ptr[lk];
              if ( lsub ) {
                  nb = lsub[0];
                  lptr = BC_HEADER;
                  luptr = 0;
                  knsupc = SuperSize( k );

                  /*
                   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
                   */
                  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
                             fmod, nb, lptr, luptr, xsup, grid, Llu,
                             send_req, stat);
              } /* if lsub */

              break;

          case LSUM: /* Receiver must be a diagonal process */
              --nfrecvmod;
              lk = LBi( k, grid ); /* Local block number, row-wise. */
              ii = X_BLK( lk );
              knsupc = SuperSize( k );
              tempv = &recvbuf[LSUM_H];
              RHS_ITERATE(j)
                  for (i = 0; i < knsupc; ++i)
                      x[i + ii + j*knsupc] += tempv[i + j*knsupc];

              if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
                  fmod[lk] = -1; /* Do not solve X[k] in the future. */
                  lk = LBj( k, grid ); /* Local block number, column-wise. */
                  lsub = Lrowind_bc_ptr[lk];
                  lusup = Lnzval_bc_ptr[lk];
                  nsupr = lsub[1];
#ifdef _CRAY
                  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
                        lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
                  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                         lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
                  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
                         lusup, &nsupr, &x[ii], &knsupc);
#endif
                  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
#if ( DEBUGlevel>=2 )
                  printf("(%2d) Solve X[%2d]\n", iam, k);
#endif

                  /*
                   * Send Xk to process column Pc[k].
                   */
                  kcol = PCOL( k, grid );
                  for (p = 0; p < Pr; ++p) {
                      if ( fsendx_plist[lk][p] != EMPTY ) {
                          pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                          MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                     MPI_DOUBLE, pi, Xk, grid->comm,
                                     &send_req[Llu->SolveMsgSent++]);
#else
#ifdef BSEND
                          MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                     MPI_DOUBLE, pi, Xk, grid->comm );
#else
                          MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                    MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
                          printf("(%2d) Sent X[%2.0f] to P %2d\n",
                                 iam, x[ii-XK_H], pi);
#endif
                      }
                  }

                  /*
                   * Perform local block modifications.
                   */
                  nb = lsub[0] - 1;
                  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
                  luptr = knsupc; /* Skip diagonal block L(k,k). */

                  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
                             fmod, nb, lptr, luptr, xsup, grid, Llu,
                             send_req, stat);
              } /* if */

              break;

#if ( DEBUGlevel>=2 )
          default:
              printf("(%2d) Recv'd wrong message tag %4d\n",
                     iam, status.MPI_TAG);
              break;
#endif
        } /* switch */

    } /* while not finished ... */


#if ( PRNTlevel>=2 )
    t = SuperLU_timer_() - t;
    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);