📄 pzsymbfact_distdata.c.old
字号:
xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; } for (i = 0, j = 0, p = 0; p < nprocs; p++) { if ( p != iam ) { ptrToSend[p] = i; i += nnzToSend[p]; } ptrToRecv[p] = j; j += nnzToRecv[p]; } nnzToRecv[iam] = 0; ind_loc = ptrToRecv[iam]; for (gb_n = 0; gb_n < nsupers; gb_n++) { nsend_2 = 0; i = xsup_n[gb_n]; if (iam == OWNER( globToLoc[i] )) { pc = PCOL( gb_n, grid ); pr = PROW( gb_n, grid ); p_diag = PNUM( pr, pc, grid ); i_loc = LOCAL_IND( globToLoc[i] ); gb_s = supno_s[i_loc]; fst_s = xsup_beg_s[gb_s]; lst_s = xsup_end_s[gb_s]; fst_s_l = LOCAL_IND( globToLoc[fst_s] ); if (sendL) { p = pc; np = grid->nprow; } else { p = pr * grid->npcol; np = grid->npcol; } for (j = 0; j < np; j++) { if (p == iam) { rcv_luind[ind_loc] = gb_n; rcv_luind[ind_loc+1] = 0; tmp_ptrToSend[p] = ind_loc + 1; ind_loc += 2; } else { snd_luind[ptrToSend[p]] = gb_n; snd_luind[ptrToSend[p]+1] = 0; tmp_ptrToSend[p] = ptrToSend[p] + 1; ptrToSend[p] += 2; } if (sendL) p += grid->npcol; if (sendU) p++; } for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { k = sub_s[j]; if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { gb = supno_n[k]; if (sendL) p = PNUM( PROW(gb, grid), pc, grid ); else p = PNUM( pr, PCOL(gb, grid), grid); if (send_1[p] == FALSE) { send_1[p] = TRUE; send_2[nsend_2] = k; nsend_2 ++; } if (p == iam) { rcv_luind[ind_loc] = k; ind_loc++; if (sendL) xsub_n[LBj( gb_n, grid )] ++; else xsub_n[LBi( gb_n, grid )] ++; } else { snd_luind[ptrToSend[p]] = k; ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; } } } if (sendL) for (p = pc; p < nprocs; p += grid->npcol) { for (k = 0; k < nsend_2; k++) { gb = supno_n[send_2[k]]; if (PNUM(PROW(gb, grid), pc, grid) != p) { if (p == iam) { rcv_luind[ind_loc] = send_2[k]; ind_loc++; xsub_n[LBj( gb_n, grid )] ++; } else { snd_luind[ptrToSend[p]] = send_2[k]; ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; } } } send_1[p] = FALSE; } if (sendU) for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { if (send_1[p] || p == p_diag) { for (k = 0; k < nsend_2; k++) { gb = supno_n[send_2[k]]; if(PNUM( pr, PCOL(gb, grid), grid) != p) { if (p == iam) { rcv_luind[ind_loc] = send_2[k]; ind_loc++; xsub_n[LBi( gb_n, grid )] ++; } else { snd_luind[ptrToSend[p]] = send_2[k]; ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; } } } send_1[p] = FALSE; } } } } /* reset ptrToSnd to point to the beginning of the data for each processor (structure needed in MPI_Alltoallv) */ for (i = 0, p = 0; p < nprocs; p++) { ptrToSend[p] = i; i += nnzToSend[p]; } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. Note: it uses MPI_Alltoallv. ------------------------------------------------------------*/ if (nprocs > 1) {#if defined (_LONGINT) for (p=0; p<nprocs; p++) { if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX || nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); intBuf1[p] = (int) nnzToSend[p]; intBuf2[p] = (int) ptrToSend[p]; intBuf3[p] = (int) nnzToRecv[p]; intBuf4[p] = (int) ptrToRecv[p]; }#else /* Default */ intBuf1 = nnzToSend; intBuf2 = ptrToSend; intBuf3 = nnzToRecv; intBuf4 = ptrToRecv;#endif MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, rcv_luind, intBuf3, intBuf4, mpi_int_t, grid->comm); } if (sendL) nnzToRecv[iam] = nnz_loc_l; else nnzToRecv[iam] = nnz_loc_u; /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE. -------------------------------------------------------------*/ if (sendU) if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { SUPERLU_FREE (snd_luind); memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; } /* ------------------------------------------------------------ CONVERT THE FORMAT. ------------------------------------------------------------*/ /* Initialize the array of column of L/ row of U pointers */ k = 0; for (p = 0; p < nprocs; p ++) { if (p != iam) { i = k; while (i < k + nnzToRecv[p]) { gb = rcv_luind[i]; nelts = rcv_luind[i+1]; if (sendL) xsub_n[LBj( gb, grid )] = nelts; else xsub_n[LBi( gb, grid )] = nelts; i += nelts + 2; } } k += nnzToRecv[p]; } if (sendL) j = nsupers_j; else j = nsupers_i; k = 0; isize = xsub_n[0]; xsub_n[0] = 0; for (gb_l = 1; gb_l < j; gb_l++) { k += isize; isize = xsub_n[gb_l]; xsub_n[gb_l] = k; } xsub_n[gb_l] = k + isize; nnz_loc = xsub_n[gb_l]; if (sendL) { lsub_n = NULL; if (nnz_loc) { if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { fprintf (stderr, "Malloc fails for lsub_n[]."); return (memAux + memRet); } memRet += (float) (nnz_loc * iword); } sub_n = lsub_n; } if (sendU) { usub_n = NULL; if (nnz_loc) { if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { fprintf (stderr, "Malloc fails for usub_n[]."); return (memAux + memRet); } memRet += (float) (nnz_loc * iword); } sub_n = usub_n; } /* Copy the data into the L column / U row oriented storage */ k = 0; for (p = 0; p < nprocs; p++) { i = k; while (i < k + nnzToRecv[p]) { gb = rcv_luind[i]; if (gb >= nsupers) printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", iam, p, gb, nsupers, i, i-k); i += 2; if (sendL) gb_l = LBj( gb, grid ); if (sendU) gb_l = LBi( gb, grid ); for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { sub_n[j] = rcv_luind[i]; } } k += nnzToRecv[p]; } if (sendL) { sendL = FALSE; sendU = TRUE; } else sendU = FALSE; } /* deallocate memory allocated during symbolic factorization routine */ if (rcv_luind != NULL) { SUPERLU_FREE (rcv_luind); memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; } SUPERLU_FREE (mem); memAux -= (float) (12 * nprocs * iword); SUPERLU_FREE(nvtcs); memAux -= (float) (5 * nprocs * sizeof(int)); if (xlsub_s != NULL) { SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); } if (xusub_s != NULL) { SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); } SUPERLU_FREE (globToLoc); if (supno_s != NULL) { SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); SUPERLU_FREE (supno_s); } Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; *p_xlsub = xlsub_n; *p_lsub = lsub_n; *p_xusub = xusub_n; *p_usub = usub_n;#if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dist_symbLU()");#endif return (-memRet);} static floatzdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val, int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val, int_t *ilsum_i, int_t *ilsum_j ){/* * * Purpose * ======= * Re-distribute A on the 2D process mesh. The lower part is * stored using a column format and the upper part * is stored using a row format. * * Arguments * ========= * * A (Input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. * * ScalePermstruct (Input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_persist (Input) Glu_persist_t * * Information on supernodes mapping. * * grid (Input) gridinfo_t* * The 2D process mesh. * * p_ainf_colptr (Output) int_t** * Pointer to the lower part of A distributed on a 2D grid * of processors, stored by columns. * * p_ainf_rowind (Output) int_t** * Structure of of the lower part of A distributed on a * 2D grid of processors, stored by columns. * * p_ainf_val (Output) doublecomplex** * Numerical values of the lower part of A, distributed on a * 2D grid of processors, stored by columns. * * p_asup_rowptr (Output) int_t** * Pointer to the upper part of A distributed on a 2D grid * of processors, stored by rows. * * p_asup_colind (Output) int_t** * Structure of of the upper part of A distributed on a * 2D grid of processors, stored by rows. * * p_asup_val (Output) doublecomplex** * Numerical values of the upper part of A, distributed on a * 2D grid of processors, stored by rows. * * ilsum_i (Input) int_t * * Starting position of each supernode in * the full array (local, block row wise). * * ilsum_j (Input) int_t * * Starting position of each supernode in * the full array (local, block column wise). * * Return value * ============ * < 0, number of bytes allocated on return from the dist_symbLU * > 0, number of bytes allocated when out of memory. * (an approximation). * */ int iam, p, procs; NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; int_t nsupers, nsupers_i, nsupers_j; int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; doublecomplex *asup_val, *ainf_val; int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; doublecomplex *aij, **aij_send, *nzval, *dtemp; doublecomplex *nzval_a; MPI_Request *send_req; MPI_Status status; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; float memAux; /* Memory used during this routine and freed on return */ float memRet; /* Memory allocated and not freed on return */ int_t iword, dword, szbuf; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam;#if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter zdist_A()");#endif iword = sizeof(int_t); dword = sizeof(double); perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; if (!(nnzToRecv = intCalloc_dist(2*procs))) { fprintf (stderr, "Malloc fails for nnzToRecv[]."); return (ERROR_RET); } memAux = (float) (2 * procs * iword);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -