📄 pdgstrs_bglobal.c.bak
    t = SuperLU_timer_();
#endif

#if ( DEBUGlevel>=2 )
    printf("\n(%d) .. After L-solve: y =\n", iam);
    for (i = 0, k = 0; k < nsupers; ++k) {
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );
        if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
            knsupc = SuperSize( k );
            lk = LBi( k, grid );
            ii = X_BLK( lk );
            for (j = 0; j < knsupc; ++j)
                printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
        }
        MPI_Barrier( grid->comm );
    }
#endif

    SUPERLU_FREE(fmod);
    SUPERLU_FREE(frecv);
    SUPERLU_FREE(rtemp);

#ifdef ISEND_IRECV
    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
    Llu->SolveMsgSent = 0;
#endif

    /*---------------------------------------------------
     * Back solve Ux = y.
     *
     * The Y components from the forward solve are already
     * on the diagonal processes.
     *---------------------------------------------------*/

    /* Save the count to be altered so it can be used by
       subsequent call to PDGSTRS_BGLOBAL. */
    if ( !(bmod = intMalloc_dist(nlb)) )
        ABORT("Calloc fails for bmod[].");
    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
    if ( !(brecv = intMalloc_dist(nlb)) )
        ABORT("Malloc fails for brecv[].");
    Llu->brecv = brecv;

    /*
     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
     */
    {
        superlu_scope_t *scp = &grid->rscp;

        for (k = 0; k < nsupers; ++k) {
            krow = PROW( k, grid );
            if ( myrow == krow ) {
                lk = LBi( k, grid );    /* Local block number. */
                kcol = PCOL( k, grid ); /* Root process in this row scope. */
                if ( mycol != kcol && bmod[lk] )
                    i = 1;  /* Contribution from a non-diagonal process. */
                else i = 0;
                MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
                            MPI_SUM, kcol, scp->comm );
                if ( mycol == kcol ) { /* Diagonal process. */
                    nbrecvmod += brecv[lk];
                    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
#if ( DEBUGlevel>=2 )
                    printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]);
                    assert( brecv[lk] < Pc );
#endif
                }
            }
        }
    }

    /* Re-initialize lsum to zero. Each block header is already in place. */
    for (k = 0; k < nsupers; ++k) {
        krow = PROW( k, grid );
        if ( myrow == krow ) {
            knsupc = SuperSize( k );
            lk = LBi( k, grid );
            il = LSUM_BLK( lk );
            dest = &lsum[il];
            RHS_ITERATE(j)
                for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
        }
    }

    /* Set up additional pointers for the index and value arrays of U.
       nub is the number of local block columns. */
    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
        ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
                                             blocks in a block column. */
    Urbs1 = Urbs + nub;
    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
        ABORT("Malloc fails for Ucb_indptr[]");
    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
        ABORT("Malloc fails for Ucb_valptr[]");

    /* Count number of row blocks in a block column.
       One pass of the skeleton graph of U. */
    for (lk = 0; lk < nlb; ++lk) {
        usub = Ufstnz_br_ptr[lk];
        if ( usub ) { /* Not an empty block row. */
            /* usub[0] -- number of column blocks in this block row. */
#if ( DEBUGlevel>=2 )
            Ublocks += usub[0];
#endif
            i = BR_HEADER; /* Pointer in the index array. */
            for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
                k = usub[i];  /* Global block number. */
                ++Urbs[LBj(k,grid)];
                i += UB_DESCRIPTOR + SuperSize( k );
            }
        }
    }
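    /*
     * Note: U is stored by block rows (Ufstnz_br_ptr), but the back solve
     * visits it by block columns.  Urbs[j] counts the nonzero row blocks in
     * local block column j.  For its b-th block, Ucb_indptr[j][b] records the
     * local block row holding it (lbnum) and where its entry starts in that
     * row's index array (indpos), and Ucb_valptr[j][b] records the offset of
     * its numerical values in that row's nzval array.  Together they provide
     * column-wise access to U without reshuffling the U data itself.
     */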
    /* Set up the vertical linked lists for the row blocks.
       One pass of the skeleton graph of U. */
    for (lb = 0; lb < nub; ++lb) {
        if ( Urbs[lb] ) { /* Not an empty block column. */
            if ( !(Ucb_indptr[lb]
                   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
                ABORT("Malloc fails for Ucb_indptr[lb][]");
            if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
                ABORT("Malloc fails for Ucb_valptr[lb][]");
        }
    }

    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
        usub = Ufstnz_br_ptr[lk];
        if ( usub ) { /* Not an empty block row. */
            i = BR_HEADER; /* Pointer in the index array. */
            j = 0;         /* Pointer in the nzval array. */
            for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
                k = usub[i];          /* Global block number, column-wise. */
                ljb = LBj( k, grid ); /* Local block number, column-wise. */
                Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
                Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
                Ucb_valptr[ljb][Urbs1[ljb]] = j;
                ++Urbs1[ljb];
                j += usub[i+1];
                i += UB_DESCRIPTOR + SuperSize( k );
            }
        }
    }

#if ( DEBUGlevel>=2 )
    for (p = 0; p < Pr*Pc; ++p) {
        if ( iam == p ) {
            printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
            for (lb = 0; lb < nub; ++lb) {
                printf("(%2d) Local col %2d: # row blocks %2d\n",
                       iam, lb, Urbs[lb]);
                if ( Urbs[lb] ) {
                    for (i = 0; i < Urbs[lb]; ++i)
                        printf("(%2d) .. row blk %2d: lbnum %d, indpos %d, valpos %d\n",
                               iam, i, Ucb_indptr[lb][i].lbnum,
                               Ucb_indptr[lb][i].indpos, Ucb_valptr[lb][i]);
                }
            }
        }
        MPI_Barrier( grid->comm );
    }
    for (p = 0; p < Pr*Pc; ++p) {
        if ( iam == p ) {
            printf("\n(%d) bsendx_plist[][]", iam);
            for (lb = 0; lb < nub; ++lb) {
                printf("\n(%d) .. local col %2d: ", iam, lb);
                for (i = 0; i < Pr; ++i)
                    printf("%4d", bsendx_plist[lb][i]);
            }
            printf("\n");
        }
        MPI_Barrier( grid->comm );
    }
#endif /* DEBUGlevel */

#if ( PRNTlevel>=3 )
    t = SuperLU_timer_() - t;
    if ( !iam ) printf(".. Setup U-solve time\t%8.2f\n", t);
    t = SuperLU_timer_();
#endif

    /*
     * Solve the roots first by all the diagonal processes.
     */
#if ( DEBUGlevel>=1 )
    printf("(%2d) nroot %4d\n", iam, nroot);
#endif
    for (k = nsupers-1; k >= 0 && nroot; --k) {
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );
        if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
            knsupc = SuperSize( k );
            lk = LBi( k, grid ); /* Local block number, row-wise. */
            if ( brecv[lk]==0 && bmod[lk]==0 ) {
                bmod[lk] = -1;       /* Do not solve X[k] in the future. */
                ii = X_BLK( lk );
                lk = LBj( k, grid ); /* Local block number, column-wise. */
                lsub = Lrowind_bc_ptr[lk];
                lusup = Lnzval_bc_ptr[lk];
                nsupr = lsub[1];
#ifdef _CRAY
                STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
                      lusup, &nsupr, &x[ii], &knsupc);
#else
                dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
                       lusup, &nsupr, &x[ii], &knsupc);
#endif
                stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
                --nroot;
#if ( DEBUGlevel>=2 )
                printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
                /*
                 * Send Xk to process column Pc[k].
                 */
                for (p = 0; p < Pr; ++p) {
                    if ( bsendx_plist[lk][p] != EMPTY ) {
                        pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                        MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                   MPI_DOUBLE, pi, Xk, grid->comm,
                                   &send_req[Llu->SolveMsgSent++] );
#else
#ifdef BSEND
                        MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                   MPI_DOUBLE, pi, Xk, grid->comm );
#else
                        MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                  MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%2d) Sent X[%2.0f] to P %2d\n",
                               iam, x[ii-XK_H], pi);
#endif
                    }
                }
                /*
                 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
                 */
                if ( Urbs[lk] )
                    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
                               Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
                               send_req, stat);
            } /* if root ... */
        } /* if diagonal process ... */
    } /* for k ... */
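    /*
     * Note: the remaining blocks are driven entirely by incoming messages.
     * Two message kinds circulate: Xk messages carry a solved block X[k]
     * (preceded by an XK_H header) to the processes that still need it for
     * their U updates, and LSUM messages carry partial sums (preceded by an
     * LSUM_H header) to the diagonal process of block row k.  nbrecvx and
     * nbrecvmod count the messages of each kind still expected locally, so
     * the loop below terminates once both reach zero.
     */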
    /*
     * Compute the internal nodes asynchronously by all processes.
     */
    while ( nbrecvx || nbrecvmod ) { /* While not finished. */

        /* Receive a message. */
        MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  grid->comm, &status );

        k = *recvbuf;

#if ( DEBUGlevel>=2 )
        printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif

        switch ( status.MPI_TAG ) {
            case Xk:
                --nbrecvx;
                lk = LBj( k, grid ); /* Local block number, column-wise. */
                /*
                 * Perform local block modifications:
                 *         lsum[i] -= U_i,k * X[k]
                 */
                dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
                           Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
                           send_req, stat);
                break;

            case LSUM: /* Receiver must be a diagonal process. */
                --nbrecvmod;
                lk = LBi( k, grid ); /* Local block number, row-wise. */
                ii = X_BLK( lk );
                knsupc = SuperSize( k );
                tempv = &recvbuf[LSUM_H];
                RHS_ITERATE(j)
                    for (i = 0; i < knsupc; ++i)
                        x[i + ii + j*knsupc] += tempv[i + j*knsupc];

                if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
                    bmod[lk] = -1;       /* Do not solve X[k] in the future. */
                    lk = LBj( k, grid ); /* Local block number, column-wise. */
                    lsub = Lrowind_bc_ptr[lk];
                    lusup = Lnzval_bc_ptr[lk];
                    nsupr = lsub[1];
#ifdef _CRAY
                    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
                          lusup, &nsupr, &x[ii], &knsupc);
#else
                    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
                           lusup, &nsupr, &x[ii], &knsupc);
#endif
                    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
#if ( DEBUGlevel>=2 )
                    printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
                    /*
                     * Send Xk to process column Pc[k].
                     */
                    kcol = PCOL( k, grid );
                    for (p = 0; p < Pr; ++p) {
                        if ( bsendx_plist[lk][p] != EMPTY ) {
                            pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
                            MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                       MPI_DOUBLE, pi, Xk, grid->comm,
                                       &send_req[Llu->SolveMsgSent++] );
#else
#ifdef BSEND
                            MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                       MPI_DOUBLE, pi, Xk, grid->comm );
#else
                            MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
                                      MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
                            printf("(%2d) Sent X[%2.0f] to P %2d\n",
                                   iam, x[ii - XK_H], pi);
#endif
                        }
                    }
                    /*
                     * Perform local block modifications:
                     *         lsum[i] -= U_i,k * X[k]
                     */
                    if ( Urbs[lk] )
                        dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
                                   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
                                   send_req, stat);
                } /* if becomes solvable */
                break;

#if ( DEBUGlevel>=1 )
            default:
                printf("(%2d) Recv'd wrong message tag %4d\n",
                       iam, status.MPI_TAG);
                break;
#endif
        } /* switch */
    } /* while not finished ... */

#if ( PRNTlevel>=2 )
    t = SuperLU_timer_() - t;
    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
#endif

    /* Copy the solution X into B (on all processes). */
    {
        int_t num_diag_procs, *diag_procs, *diag_len;
        double *work;

        get_diag_procs(n, Glu_persist, grid,
                       &num_diag_procs, &diag_procs, &diag_len);
        jj = diag_len[0];
        for (j = 1; j < num_diag_procs; ++j)
            jj = SUPERLU_MAX(jj, diag_len[j]);
        if ( !(work = doubleMalloc_dist(((size_t)jj)*nrhs)) )
            ABORT("Malloc fails for work[]");
        gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, grid,
                           num_diag_procs, diag_procs, diag_len,
                           B, ldb, work);
        SUPERLU_FREE(diag_procs);
        SUPERLU_FREE(diag_len);
        SUPERLU_FREE(work);
    }

    /* Deallocate storage. */
    SUPERLU_FREE(lsum);
    SUPERLU_FREE(x);
    SUPERLU_FREE(recvbuf);
    for (i = 0; i < nub; ++i)
        if ( Urbs[i] ) {
            SUPERLU_FREE(Ucb_indptr[i]);
            SUPERLU_FREE(Ucb_valptr[i]);
        }
    SUPERLU_FREE(Ucb_indptr);
    SUPERLU_FREE(Ucb_valptr);
    SUPERLU_FREE(Urbs);
    SUPERLU_FREE(bmod);
    SUPERLU_FREE(brecv);
#ifdef ISEND_IRECV
    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
    SUPERLU_FREE(send_req);
#endif
#ifdef BSEND
    SUPERLU_FREE(send_req);
#endif

    stat->utime[SOLVE] = SuperLU_timer_() - t;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
#endif

} /* PDGSTRS_BGLOBAL */
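/*
 * Illustrative sketch (not part of SuperLU_DIST): the asynchronous phase of
 * the back solve above is an instance of a common MPI pattern -- receive from
 * MPI_ANY_SOURCE / MPI_ANY_TAG, dispatch on the tag, and count down the
 * expected messages of each kind until none remain.  The toy tags, counters,
 * and empty payload handling below are assumptions made for this example
 * only; they are not SuperLU's actual message protocol, and the function is
 * not referenced by the solver.
 */
#include <mpi.h>

enum { TOY_TAG_X = 101, TOY_TAG_LSUM = 102 };

static void toy_drain_messages(int expect_x, int expect_lsum,
                               double *recvbuf, int maxlen, MPI_Comm comm)
{
    MPI_Status status;

    while ( expect_x || expect_lsum ) { /* until all expected messages arrive */
        MPI_Recv( recvbuf, maxlen, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  comm, &status );
        switch ( status.MPI_TAG ) {
            case TOY_TAG_X:      /* a solved block arrived */
                --expect_x;
                /* ... fold it into the local partial sums here ... */
                break;
            case TOY_TAG_LSUM:   /* a partial sum arrived */
                --expect_lsum;
                /* ... accumulate it into the local solution block here ... */
                break;
            default:             /* unexpected tag; a real code would report it */
                break;
        }
    }
}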
/*
 * Gather the components of x vector on the diagonal processes
 * onto all processes, and combine them into the global vector y.
 */
static void
gather_diag_to_all(int_t n, int_t nrhs, double x[],
                   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
                   gridinfo_t *grid, int_t num_diag_procs,
                   int_t diag_procs[], int_t diag_len[],
                   double y[], int_t ldy, double work[])
{
    int_t i, ii, j, k, lk, lwork, nsupers, p;
    int_t *ilsum, *xsup;
    int iam, knsupc, pkk;
    double *x_col, *y_col;

    iam = grid->iam;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    ilsum = Llu->ilsum;

    for (p = 0; p < num_diag_procs; ++p) {
        pkk = diag_procs[p];
        if ( iam == pkk ) {
            /* Copy x vector into a buffer. */
            lwork = 0;
            for (k = p; k < nsupers; k += num_diag_procs) {
                knsupc = SuperSize( k );
                lk = LBi( k, grid );
                ii = X_BLK( lk ); /* ilsum[lk] + (lk+1)*XK_H; */
                x_col = &x[ii];
                for (j = 0; j < nrhs; ++j) {
                    for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i];
                    lwork += knsupc;
                    x_col += knsupc;
                }
            }
            MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
        } else {
            MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm );
        }

        /* Scatter work[] into the global y vector. */
        lwork = 0;
        for (k = p; k < nsupers; k += num_diag_procs) {
            knsupc = SuperSize( k );
            ii = FstBlockC( k );
            y_col = &y[ii];
            for (j = 0; j < nrhs; ++j) {
                for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork];
                lwork += knsupc;
                y_col += ldy;
            }
        }
    }
} /* GATHER_DIAG_TO_ALL */
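/*
 * Illustrative sketch (not part of SuperLU_DIST): gather_diag_to_all() above
 * follows an owner-broadcast pattern -- each owning process packs its pieces
 * of the distributed solution into a contiguous buffer and broadcasts it,
 * and every process then scatters the received pieces into its full-length
 * copy.  The toy layout below (equal-sized blocks dealt round-robin to the
 * first `nowners` ranks, single right-hand side) is an assumption made for
 * this example only and does not mirror SuperLU's block sizes or process
 * assignment.
 */
#include <stdlib.h>
#include <mpi.h>

static void toy_gather_to_all(int nblocks, int blksz, int nowners,
                              const double *my_packed, /* this rank's blocks,
                                                          packed; may be NULL
                                                          on non-owner ranks */
                              double *global,          /* length nblocks*blksz */
                              MPI_Comm comm)
{
    int iam, p, b, i, pos, nmine;
    double *buf = (double *) malloc((size_t)nblocks * blksz * sizeof(double));

    MPI_Comm_rank(comm, &iam);
    for (p = 0; p < nowners; ++p) {
        /* Blocks are dealt round-robin: owner p holds blocks p, p+nowners, ... */
        nmine = 0;
        for (b = p; b < nblocks; b += nowners) ++nmine;

        /* The owner fills the broadcast buffer from its packed copy. */
        if ( iam == p )
            for (i = 0; i < nmine * blksz; ++i) buf[i] = my_packed[i];
        MPI_Bcast( buf, nmine * blksz, MPI_DOUBLE, p, comm );

        /* Everyone scatters the received blocks into the global vector. */
        pos = 0;
        for (b = p; b < nblocks; b += nowners, pos += blksz)
            for (i = 0; i < blksz; ++i)
                global[b*blksz + i] = buf[pos + i];
    }
    free(buf);
}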