⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pdgstrf_irecv.c.bak

📁 LU分解求解矩阵方程组的解
💻 BAK
📖 第 1 页 / 共 3 页
字号:
/* * -- Distributed SuperLU routine (version 1.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * September 1, 1999 * * Modified: *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv */#include <math.h>#include "superlu_ddefs.h"#if ( VAMPIR>=1 )#include <VT.h>#endif/* * Internal prototypes */static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);#ifdef _CRAYstatic void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);#elsestatic void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,		     LocalLU_t *, SuperLUStat_t *);#endif/*  * Sketch of the algorithm * ======================= * * The following relations hold: *     * A_kk = L_kk * U_kk *     * L_ik = Aik * U_kk^(-1) *     * U_kj = L_kk^(-1) * A_kj * *              ---------------------------------- *              |   |                            | *              ----|----------------------------- *              |   | \ U_kk|                    | *              |   |   \   |        U_kj        | *              |   |L_kk \ |         ||         | *              ----|-------|---------||---------- *              |   |       |         \/         | *              |   |       |                    | *              |   |       |                    | *              |   |       |                    | *              |   | L_ik ==>       A_ij        | *              |   |       |                    | *              |   |       |                    | *              |   |       |                    | *              ---------------------------------- * * Handle the first block of columns separately. *     * Factor diagonal and subdiagonal blocks and test for exact *       singularity. ( pdgstrf2(0), one column at a time ) *     * Compute block row of U *     * Update trailing matrix *  * Loop over the remaining blocks of columns. *   mycol = MYCOL( iam, grid ); *   myrow = MYROW( iam, grid ); *   N = nsupers; *   For (k = 1; k < N; ++k) { *       krow = PROW( k, grid ); *       kcol = PCOL( k, grid ); *       Pkk = PNUM( krow, kcol, grid ); * *     * Factor diagonal and subdiagonal blocks and test for exact *       singularity. *       if ( mycol == kcol ) { *           pdgstrf2(k), one column at a time  *       } * *     * Parallel triangular solve *       if ( iam == Pkk ) multicast L_k,k to this process row; *       if ( myrow == krow && mycol != kcol ) { *          Recv L_k,k from process Pkk; *          for (j = k+1; j < N; ++j)  *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 ) *                 U_k,j = L_k,k \ A_k,j; *       } * *     * Parallel rank-k update *       if ( myrow == krow ) multicast U_k,k+1:N to this process column; *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row; *       if ( myrow != krow ) { *          Pkj = PNUM( krow, mycol, grid ); *          Recv U_k,k+1:N from process Pkj; *       } *       if ( mycol != kcol ) { *          Pik = PNUM( myrow, kcol, grid ); *          Recv L_k+1:N,k from process Pik; *       } *       for (j = k+1; k < N; ++k) { *          for (i = k+1; i < N; ++i)  *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) *                   && L_i,k != 0 && U_k,j != 0 ) *                 A_i,j = A_i,j - L_i,k * U_k,j; *       } *  } * * * Remaining issues *   (1) Use local indices for L subscripts and SPA.  [DONE] * *//************************************************************************/int_t pdgstrf/************************************************************************/( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info )/*  * Purpose * ======= * *  PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= *  * options (input) superlu_options_t* *         The structure defines the input parameters to control *         how the LU decomposition will be performed. *         The following field should be defined: *         o ReplaceTinyPivot (yes_no_t) *           Specifies whether to replace the tiny diagonals by *           sqrt(epsilon)*norm(A) during LU factorization. * * m      (input) int *        Number of rows in the matrix. * * n      (input) int *        Number of columns in the matrix. * * anorm  (input) double *        The norm of the original matrix A, or the scaled A if *        equilibration was done. * * LUstruct (input/output) LUstruct_t* *         The data structures to store the distributed L and U factors. *         The following fields should be defined: * *         o Glu_persist (input) Glu_persist_t* *           Global data structure (xsup, supno) replicated on all processes, *           describing the supernode partition in the factored matrices *           L and U: *	       xsup[s] is the leading column of the s-th supernode, *             supno[i] is the supernode number to which column i belongs. * *         o Llu (input/output) LocalLU_t* *           The distributed data structures to store L and U factors. *           See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid   (input) gridinfo_t* *        The 2D process mesh. It contains the MPI communicator, the number *        of process rows (NPROW), the number of process columns (NPCOL), *        and my process rank. It is an input argument to all the *        parallel routines. *        Grid can be initialized by subroutine SUPERLU_GRIDINIT. *        See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat   (output) SuperLUStat_t* *        Record the statistics on runtime and floating-point operation count. *        See util.h for the definition of 'SuperLUStat_t'. * * info   (output) int* *        = 0: successful exit *        < 0: if info = -i, the i-th argument had an illegal value *        > 0: if info = i, U(i,i) is exactly zero. The factorization has *             been completed, but the factor U is exactly singular, *             and division by zero will occur if it is used to solve a *             system of equations. * */{#ifdef _CRAY    _fcd ftcs = _cptofcd("N", strlen("N"));    _fcd ftcs1 = _cptofcd("L", strlen("L"));    _fcd ftcs2 = _cptofcd("N", strlen("N"));    _fcd ftcs3 = _cptofcd("U", strlen("U"));#endif    double alpha = 1.0, beta = 0.0;    int_t *xsup;    int_t *lsub, *lsub1, *usub, *Usub_buf,          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */    double *lusup, *lusup1, *uval, *Uval_buf,           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,          nlb, nub, nsupc, rel, rukp;    int_t Pc, Pr;    int   iam, kcol, krow, mycol, myrow, pi, pj;    int   j, k, lk, nsupers;    int   nsupr, nbrow, segsize;    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:		      *     0 : transferred in Lsub_buf[]		      *     1 : transferred in Lval_buf[]		      *     2 : transferred in Usub_buf[] 		      *     3 : transferred in Uval_buf[]		      */    int_t  msg0, msg2;    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;    double **Unzval_br_ptr, **Lnzval_bc_ptr;    int_t  *index;    double *nzval;    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */    double *ucol;    int_t  *indirect;    double *tempv, *tempv2d;    int_t iinfo;    int_t *ToRecv, *ToSendD, **ToSendR;    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;    LocalLU_t *Llu = LUstruct->Llu;    superlu_scope_t *scp;    double s_eps, thresh;    double *tempU2d, *tempu;    int    full, ldt, ldu, lead_zero, ncols;    MPI_Request recv_req[4], *send_req;    MPI_Status status;#if ( DEBUGlevel>=2 )     int_t num_copy=0, num_update=0;#endif#if ( PRNTlevel==3 )    int_t  zero_msg = 0, total_msg = 0;#endif#if ( PROFlevel>=1 )    double t1, t2;    float msg_vol = 0, msg_cnt = 0;    int_t iword = sizeof(int_t), dword = sizeof(double);#endif    /* Test the input parameters. */    *info = 0;    if ( m < 0 ) *info = -2;    else if ( n < 0 ) *info = -3;    if ( *info ) {	pxerbla("pdgstrf", grid, -*info);	return (-1);    }    /* Quick return if possible. */    if ( m == 0 || n == 0 ) return 0;    /*     * Initialization.     */    iam = grid->iam;    Pc = grid->npcol;    Pr = grid->nprow;    myrow = MYROW( iam, grid );    mycol = MYCOL( iam, grid );    nsupers = Glu_persist->supno[n-1] + 1;    xsup = Glu_persist->xsup;    s_eps = slamch_("Epsilon");    thresh = s_eps * anorm;#if ( DEBUGlevel>=1 )    CHECK_MALLOC(iam, "Enter pdgstrf()");#endif    stat->ops[FACT] = 0.0;    if ( Pr*Pc > 1 ) {	i = Llu->bufmax[0];	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )	    ABORT("Malloc fails for Lsub_buf.");	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;	i = Llu->bufmax[1];	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )	    ABORT("Malloc fails for Lval_buf[].");	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;	if ( Llu->bufmax[2] != 0 ) 	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )		ABORT("Malloc fails for Usub_buf[].");	if ( Llu->bufmax[3] != 0 ) 	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )		ABORT("Malloc fails for Uval_buf[].");	if ( !(send_req =	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))	    ABORT("Malloc fails for send_req[].");    }    if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )	ABORT("Malloc fails for ujrow[].");#if ( PRNTlevel>=1 )    if ( !iam ) {	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",	       Llu->bufmax[0], Llu->bufmax[1], 	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);    }#endif    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];    Lval_buf_2[0] = Llu->Lval_buf_2[0];    Lval_buf_2[1] = Llu->Lval_buf_2[1];    Usub_buf = Llu->Usub_buf;    Uval_buf = Llu->Uval_buf;    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;    Unzval_br_ptr = Llu->Unzval_br_ptr;    ToRecv = Llu->ToRecv;    ToSendD = Llu->ToSendD;    ToSendR = Llu->ToSendR;    ldt = sp_ienv_dist(3); /* Size of maximum supernode */    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )	ABORT("Calloc fails for tempv2d[].");    tempU2d = tempv2d + ldt*ldt;    if ( !(indirect = intMalloc_dist(ldt)) )	ABORT("Malloc fails for indirect[].");    k = CEILING( nsupers, Pr ); /* Number of local block rows */    if ( !(iuip = intMalloc_dist(k)) )	ABORT("Malloc fails for iuip[].");    if ( !(ruip = intMalloc_dist(k)) )	ABORT("Malloc fails for ruip[].");#if ( VAMPIR>=1 )    VT_symdef(1, "Send-L", "Comm");    VT_symdef(2, "Recv-L", "Comm");    VT_symdef(3, "Send-U", "Comm");    VT_symdef(4, "Recv-U", "Comm");    VT_symdef(5, "TRF2", "Factor");    VT_symdef(100, "Factor", "Factor");    VT_begin(100);    VT_traceon();#endif    /* ---------------------------------------------------------------       Handle the first block column separately to start the pipeline.       --------------------------------------------------------------- */    if ( mycol == 0 ) {#if ( VAMPIR>=1 )	VT_begin(5);#endif	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);#if ( VAMPIR>=1 )	VT_end(5);#endif	scp = &grid->rscp; /* The scope of process row. */	/* Process column *kcol* multicasts numeric values of L(:,k) 	   to process rows. */	lsub = Lrowind_bc_ptr[0];	lusup = Lnzval_bc_ptr[0];	if ( lsub ) {	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;	    msgcnt[1] = lsub[1] * SuperSize( 0 );	} else {	    msgcnt[0] = msgcnt[1] = 0;	}		for (pj = 0; pj < Pc; ++pj) {	    if ( ToSendR[0][pj] != EMPTY ) {#if ( PROFlevel>=1 )		TIC(t1);#endif#if ( VAMPIR>=1 )		VT_begin(1);#endif		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,			  &send_req[pj] );		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,			  &send_req[pj+Pc] );#if ( DEBUGlevel>=2 )		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",		       iam, 0, msgcnt[0], msgcnt[1], pj);#endif#if ( VAMPIR>=1 )		VT_end(1);#endif#if ( PROFlevel>=1 )		TOC(t2, t1);		stat->utime[COMM] += t2;		msg_cnt += 2;		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;#endif	    }	} /* for pj ... */    } else { /* Post immediate receives. */	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */	    scp = &grid->rscp; /* The scope of process row. */	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,		      0, scp->comm, &recv_req[0] );	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,		      1, scp->comm, &recv_req[1] );#if ( DEBUGlevel>=2 )	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);#endif	}    } /* if mycol == 0 */    /* ------------------------------------------       MAIN LOOP: Loop through all block columns.       ------------------------------------------ */    for (k = 0; k < nsupers; ++k) {	knsupc = SuperSize( k );	krow = PROW( k, grid );	kcol = PCOL( k, grid );	if ( mycol == kcol ) {	    lk = LBj( k, grid ); /* Local block number. */	    for (pj = 0; pj < Pc; ++pj) {                /* Wait for Isend to complete before using lsub/lusup. */		if ( ToSendR[lk][pj] != EMPTY ) {		    MPI_Wait( &send_req[pj], &status );		    MPI_Wait( &send_req[pj+Pc], &status );		}	    }	    lsub = Lrowind_bc_ptr[lk];	    lusup = Lnzval_bc_ptr[lk];	} else {	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */		scp = &grid->rscp; /* The scope of process row. */#if ( PROFlevel>=1 )		TIC(t1);#endif#if ( VAMPIR>=1 )		VT_begin(2);#endif		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 		  Llu->bufmax[0]);*/		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 			 (4*k)%NTAGS, scp->comm, &status );*/		MPI_Wait( &recv_req[0], &status );		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 		  Llu->bufmax[1]);*/		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 			 (4*k+1)%NTAGS, scp->comm, &status );*/		MPI_Wait( &recv_req[1], &status );		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -