psymbfact.c
 * =======
 *
 * Computes an estimation of the number of elements in columns of L
 * and rows of U.  Stores this information in cntelt_vtcs; it will
 * be used in the right-looking symbolic factorization.
 */
{
    int fstP, lstP, szSep, npNode, i, j;
    int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk;
    int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk;
    int_t nelts, nelts_new_blk;
    int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc;
    int_t *minElt_vtx, *cntelt_vtcs;

    /* Initialization */
    xlsub = Llu_symbfact->xlsub;  lsub = Llu_symbfact->lsub;
    xusub = Llu_symbfact->xusub;  usub = Llu_symbfact->usub;
    cntelt_vtcs = Llu_symbfact->cntelt_vtcs;
    globToLoc = Pslu_freeable->globToLoc;
    nvtcs_loc = VInfo->nvtcs_loc;
    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
    if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n)
        minElt_vtx = lsub;
    else {
        /* allocate memory for minElt_vtx */
        if (!(minElt_vtx = intMalloc_dist(n))) {
            fprintf(stderr, "Malloc fails for minElt_vtx[].");
            return (PS->allocMem);
        }
        PS->allocMem += n * sizeof(int_t);
    }

    for (ii = 0; ii < n; ii++)
        tempArray[ii] = n;
    for (ii = 0; ii < nvtcs_loc; ii++)
        cntelt_vtcs[ii] = 0;
    szSep = nprocs_symb;
    i = 0;
    cur_blk = 0;
    vtx_lid = 0;
    while (szSep >= 1) {
        /* for each level in the separator tree */
        npNode = nprocs_symb / szSep;
        fstP = 0;
        /* for each node in the level */
        for (j = i; j < i + szSep; j++) {
            fstVtx = fstVtxSep[j];
            lstVtx = fstVtx + sizes[j];
            lstP = fstP + npNode;
            if (fstP <= iam && iam < lstP) {
                ind_blk = cur_blk;
                ii = vtx_lid;
                while (VInfo->begEndBlks_loc[ind_blk] < lstVtx &&
                       ind_blk < 2 * VInfo->nblks_loc) {
                    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
                    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
                    ind_blk += 2;
                    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) {
                        for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) {
                            vtx_elt = lsub[jj];
                            if (tempArray[vtx_elt] == n)
                                tempArray[vtx_elt] = vtx;
                        }
                        for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) {
                            vtx_elt = usub[jj];
                            if (tempArray[vtx_elt] == n)
                                tempArray[vtx_elt] = vtx;
                        }
                    }
                }
                if (szSep == nprocs_symb)
                    vtx_lid = ii;
                else {
                    MPI_Allreduce(&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]),
                                  (int) (n - fstVtx), mpi_int_t, MPI_MIN,
                                  commLvls[j]);
#if ( PRNTlevel>=1 )
                    PS->no_msgsCol += (float) (2 * (int) log2((double) npNode));
                    PS->sz_msgsCol += (float) (n - fstVtx);
                    if (PS->maxsz_msgCol < n - fstVtx)
                        PS->maxsz_msgCol = n - fstVtx;
#endif
                    nelts = 0;
                    for (ii = fstVtx; ii < lstVtx; ii++)
                        tempArray[ii] = 0;
                    for (ii = fstVtx; ii < n; ii++) {
                        if (minElt_vtx[ii] != n) {
                            if (minElt_vtx[ii] < fstVtx)
                                nelts++;
                            else
                                tempArray[minElt_vtx[ii]]++;
                            if (ii > lstVtx)
                                tempArray[ii] = minElt_vtx[ii];
                        }
                    }
                    ind_blk = cur_blk;
                    lv = fstVtx;
                    while (VInfo->begEndBlks_loc[ind_blk] < lstVtx &&
                           ind_blk < 2 * VInfo->nblks_loc) {
                        fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
                        lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
                        ind_blk += 2;
                        for (ii = lv; ii < fstVtx_blk; ii++)
                            nelts += tempArray[ii];
                        lv = lstVtx_blk;
                        nelts_new_blk = 0;
                        for (vtx = fstVtx_blk; vtx < lstVtx_blk;
                             vtx++, vtx_lid++) {
                            nelts_new_blk += tempArray[vtx];
                            cntelt_vtcs[vtx_lid] = nelts;
                        }
                        nelts += nelts_new_blk;
                    }
                } /* if (szSep != nprocs_symb) */
                cur_blk = ind_blk;
            }
            fstP += npNode;
        }
        i += szSep;
        szSep = szSep / 2;
    }
    /* free memory */
    if (minElt_vtx != lsub) {
        SUPERLU_FREE(minElt_vtx);
        PS->allocMem -= n * sizeof(int_t);
    }
    return (SUCCES_RET);
}
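/*
 * Illustrative sketch, not part of the original file: the marking plus
 * MPI_MIN reduction idiom used above, in isolation.  For one adjacency
 * structure (standing in for the xlsub/lsub and xusub/usub pairs), each
 * process records in tempArr[] the smallest of its vertices referencing
 * each global vertex, with n meaning "not referenced"; the reduction
 * then leaves the global minimum on every process.  The function name
 * and the assumption that this process owns the contiguous vertex range
 * [fstVtx, fstVtx + nloc) are simplifications for illustration only.
 */
static void
sketch_first_references(int_t n, int_t fstVtx, int_t nloc,
                        int_t *xadj, int_t *adj, int_t *tempArr,
                        MPI_Comm comm)
{
    int_t v, j;

    for (v = 0; v < n; v++)
        tempArr[v] = n;                 /* n == not yet referenced */
    for (v = 0; v < nloc; v++)          /* local vertices, in order */
        for (j = xadj[v]; j < xadj[v+1]; j++)
            if (tempArr[adj[j]] == n)   /* keep only the first reference */
                tempArr[adj[j]] = fstVtx + v;
    /* minimum over every process that works on this separator node */
    MPI_Allreduce(MPI_IN_PLACE, tempArr, (int) n, mpi_int_t, MPI_MIN, comm);
}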
static float
symbfact_mapVtcs(
    int iam,             /* Input - process number */
    int nprocs_num,      /* Input - number of processors */
    int nprocs_symb,     /* Input - number of procs for symbolic
                            factorization */
    SuperMatrix *A,      /* Input - input distributed matrix A */
    int_t *fstVtxSep,    /* Input - first vertex in each separator */
    int_t *sizes,        /* Input - size of each separator in the
                            separator tree */
    Pslu_freeable_t *Pslu_freeable, /* Output - globToLoc and
                                       maxNvtcsPProc computed */
    vtcsInfo_symbfact_t *VInfo, /* Output - local info on vertices
                                   distribution */
    int_t *tempArray,    /* Input - temp array of size n = order of
                            the matrix */
    int_t maxSzBlk,      /* Input - maximum number of vertices in a
                            block */
    psymbfact_stat_t *PS /* Input/Output - statistics */
    )
{
/*
 * Purpose
 * =======
 *
 * symbfact_mapVtcs maps the vertices of the graph of the input
 * matrix A on nprocs_symb processors, using the separator tree
 * returned by a graph partitioning algorithm from the previous step
 * of the symbolic factorization.  The number of processors
 * nprocs_symb must be a power of 2.
 *
 * Description of the algorithm
 * ============================
 *
 * A subtree-to-subcube algorithm is used first to map the processors
 * on the nodes of the separator tree.
 *
 * For each node of the separator tree, its corresponding vertices
 * are distributed on the processors assigned to this node, using a
 * block cyclic distribution.
 *
 * After the distribution, fields of the VInfo structure are
 * computed.  The array globToLoc and maxNvtcsPProc of Pslu_freeable
 * are also computed.
 *
 */
    int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d;
    int_t k, n, kk;
    int_t fstVtx, lstVtx;
    int_t fstVtxBlk, ind_blk;
    int_t noVtcsProc, noBlk;
    int_t nvtcs_loc;   /* number of vertices owned by process iam */
    int_t nblks_loc;   /* number of blocks owned by process iam */
    int_t *globToLoc;  /* global indexing to local indexing */
    int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx;
    int_t *begEndBlks_loc; /* begin and end vertex of each local block */
    int_t *vtcs_pe;    /* number of vertices on each processor */
    int *avail_pes;    /* processors to be used at each level */

    n = A->ncol;
    /* allocate memory */
    if (!(globToLoc = intMalloc_dist(n + 1))) {
        fprintf(stderr, "Malloc fails for globToLoc[].");
        return (PS->allocMem);
    }
    PS->allocMem += (n+1) * sizeof(int_t);
    if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb * sizeof(int)))) {
        fprintf(stderr, "Malloc fails for avail_pes[].");
        return (PS->allocMem);
    }
    PS->allocMem += nprocs_symb * sizeof(int);
    if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb * sizeof(int_t)))) {
        fprintf(stderr, "Malloc fails for vtcs_pe[].");
        return (PS->allocMem);
    }
    PS->allocMem += nprocs_symb * sizeof(int_t);

    /* Initialization */
    globToLoc[n] = n;
    for (p = 0; p < nprocs_symb; p++) {
        vtcs_pe[p] = 0;
        avail_pes[p] = EMPTY;
    }
    nvtcs_loc = 0;
    nblks_loc = 0;
    maxNvtcsNds_loc = 0;
    maxNeltsVtx = 0;

    /* distribute data among processors */
    szSep = nprocs_symb;
    iSep = 0;
    while (szSep >= 1) {
        /* for each level in the separator tree */
        npNode = nprocs_symb / szSep;
        firstP = 0;
        nvtcsNds_loc = 0;
        for (jSep = iSep; jSep < iSep + szSep; jSep++) {
            /* for each node in the level */
            fstVtx = fstVtxSep[jSep];
            lstVtx = fstVtx + sizes[jSep];
            if (firstP <= iam && iam < firstP + npNode)
                maxNeltsVtx += lstVtx - fstVtx;
            if (szSep == nprocs_symb) {
                /* leaves of the separator tree */
                for (k = fstVtx; k < lstVtx; k++) {
                    globToLoc[k] = (int_t) firstP;
                    vtcs_pe[firstP]++;
                }
                if (firstP == iam) {
                    nvtcs_loc += lstVtx - fstVtx;
                    nblks_loc++;
                }
            }
            else {
                /* superior levels of the separator tree */
                k = fstVtx;
                noVtcsProc = maxSzBlk;
                fstVtxBlk = fstVtx;
                if ((jSep - iSep) % 2 == 0)
                    ind_ap_d = (jSep - iSep) * npNode;
                /* first allocate processors from previous levels */
                for (ind_ap_s = (jSep-iSep) * npNode;
                     ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s++) {
                    p = avail_pes[ind_ap_s];
                    if (p != EMPTY && k < lstVtx) {
                        /* for each column in the separator */
                        avail_pes[ind_ap_s] = EMPTY;
                        kk = 0;
                        while (kk < noVtcsProc && k < lstVtx) {
                            globToLoc[k] = p;
                            vtcs_pe[p]++;
                            k++;
                            kk++;
                        }
                        if (p == iam) {
                            nvtcs_loc += kk;
                            nblks_loc++;
                            nvtcsNds_loc += kk;
                        }
                    }
                    else {
                        if (p != EMPTY && k == lstVtx) {
                            avail_pes[ind_ap_s] = EMPTY;
                            avail_pes[ind_ap_d] = p;
                            ind_ap_d++;
                        }
                    }
                }
                noBlk = 0;
                p = firstP + npNode;
                while (k < lstVtx) {
                    /* for each column in the separator */
                    kk = 0;
                    p = (int) (noBlk % (int_t) npNode) + firstP;
                    while (kk < noVtcsProc && k < lstVtx) {
                        globToLoc[k] = p;
                        vtcs_pe[p]++;
                        k++;
                        kk++;
                    }
                    if (p == iam) {
                        nvtcs_loc += kk;
                        nblks_loc++;
                        nvtcsNds_loc += kk;
                    }
                    noBlk++;
                } /* while (k < lstVtx) */
                /* Add the unused processors to the avail_pes list of pes */
                for (p = p + 1; p < firstP + npNode; p++) {
                    avail_pes[ind_ap_d] = p;
                    ind_ap_d++;
                }
            }
            firstP += npNode;
        }
        if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb)
            maxNvtcsNds_loc = nvtcsNds_loc;
        iSep += szSep;
        szSep = szSep / 2;
    }
#if ( PRNTlevel>=2 )
    if (!iam)
        PrintInt10(" novtcs_pe", nprocs_symb, vtcs_pe);
#endif
    /* determine maximum number of vertices among processors */
    maxNvtcsPProc = vtcs_pe[0];
    vtcs_pe[0] = 0;
    for (p = 1; p < nprocs_symb; p++) {
        if (maxNvtcsPProc < vtcs_pe[p])
            maxNvtcsPProc = vtcs_pe[p];
        vtcs_pe[p] = 0;
    }
#if ( PRNTlevel>=2 )
    if (!iam)
        printf(" MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n",
               maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb) / (float) n);
#endif
    if (iam < nprocs_symb)
        if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1)))
            ABORT("Malloc fails for begEndBlks_loc[].");

    ind_blk = 0;
    k = 0;
    while (k < n) {
        p = globToLoc[k];
        if (p == iam)
            begEndBlks_loc[ind_blk] = k;
        while (globToLoc[k] == p && k < n) {
            globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p];
            vtcs_pe[p]++;
            k++;
        }
        if (p == iam) {
            begEndBlks_loc[ind_blk + 1] = k;
            ind_blk += 2;
        }
    }
    if (iam < nprocs_symb)
        begEndBlks_loc[2 * nblks_loc] = n;

    SUPERLU_FREE(avail_pes);
    SUPERLU_FREE(vtcs_pe);

    Pslu_freeable->maxNvtcsPProc = maxNvtcsPProc;
    Pslu_freeable->globToLoc = globToLoc;
    if (iam < nprocs_symb) {
        VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc;
        VInfo->nblks_loc = nblks_loc;
        VInfo->nvtcs_loc = nvtcs_loc;
        VInfo->curblk_loc = 0;
        VInfo->maxNeltsVtx = maxNeltsVtx;
        VInfo->filledSep = FALSE;
        VInfo->xlsub_nextLvl = 0;
        VInfo->xusub_nextLvl = 0;
        VInfo->begEndBlks_loc = begEndBlks_loc;
        VInfo->fstVtx_nextLvl = begEndBlks_loc[0];
    }

    return SUCCES_RET;
}
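/*
 * Illustrative sketch, not part of the original file: the block cyclic
 * rule that symbfact_mapVtcs applies inside one separator, ignoring the
 * reuse of processors left idle at deeper levels (the avail_pes
 * mechanism).  Vertex k of [fstVtx, lstVtx) falls in block
 * (k - fstVtx) / maxSzBlk, and blocks are dealt round-robin to the
 * npNode processors firstP .. firstP + npNode - 1.  The function name
 * is hypothetical.
 */
static int
sketch_block_cyclic_owner(int_t k, int_t fstVtx, int_t maxSzBlk,
                          int firstP, int npNode)
{
    int_t noBlk = (k - fstVtx) / maxSzBlk;    /* block holding vertex k */
    return firstP + (int) (noBlk % (int_t) npNode);
}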
static void
symbfact_distributeMatrix(
    int iam,             /* Input - my processor number */
    int nprocs_num,      /* Input - number of processors */
    int nprocs_symb,     /* Input - number of processors for the
                            symbolic factorization */
    SuperMatrix *A,      /* Input - input matrix A */
    int_t *perm_c,       /* Input - column permutation */
    int_t *perm_r,       /* Input - row permutation */
    matrix_symbfact_t *AS, /* Output - temporary storage for the
                              redistributed matrix */
    Pslu_freeable_t *Pslu_freeable, /* Input - global to local
                                       information */
    vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices
                                   distribution */
    int_t *tempArray,    /* Input/Output - temporary array of size n
                            (order of the matrix) */
    MPI_Comm *num_comm   /* Input - communicator for nprocs_num procs */
    )
{
/*
 * Purpose
 * =======
 *
 * Distribute input matrix A for the symbolic factorization routine.
 * Only structural information is distributed.  The redistributed
 * matrix has its rows and columns permuted according to perm_r and
 * perm_c.  A is not modified during this routine.
 *
 */
/* Notations:
 * Ainf : inferior part of A, including diagonal.
 * Asup : superior part of A.
 */
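/*
 * Illustrative sketch, not part of the original file: decoding the
 * combined entries globToLoc[k] = owner * maxNvtcsPProc + local_index
 * written by symbfact_mapVtcs, since a redistribution step needs both
 * the owning process and the local position of a global vertex.  The
 * helper names are hypothetical.
 */
static int
sketch_owner_of(int_t *globToLoc, int_t maxNvtcsPProc, int_t k)
{
    return (int) (globToLoc[k] / maxNvtcsPProc);  /* owning process */
}

static int_t
sketch_local_id(int_t *globToLoc, int_t maxNvtcsPProc, int_t k)
{
    return globToLoc[k] % maxNvtcsPProc;          /* index on the owner */
}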