📄 pzsymbfact_distdata.c
字号:
memAux = (float) (2 * procs * iword); memRet = 0.; nnzToSend = nnzToRecv + procs; nsupers = supno[n-1] + 1; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ szbuf = k; /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) { fprintf (stderr, "Malloc fails for ia[]."); return (memAux); } memAux += (float) (2*k*iword); ja = ia + k; if ( !(aij = doublecomplexMalloc_dist(k)) ) { fprintf (stderr, "Malloc fails for aij[]."); return (memAux); } memAux += (float) (k*dword); /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { fprintf (stderr, "Malloc fails for send_req[]."); return (memAux); } memAux += (float) (2*procs *sizeof(MPI_Request)); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { fprintf(stderr, "Malloc fails for ia_send[]."); return (memAux); } memAux += (float) (procs*sizeof(int_t*)); if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) { fprintf(stderr, "Malloc fails for aij_send[]."); return (memAux); } memAux += (float) (procs*sizeof(doublecomplex*)); if ( !(index = intMalloc_dist(2*SendCnt)) ) { fprintf(stderr, "Malloc fails for index[]."); return (memAux); } memAux += (float) (2*SendCnt*iword); if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) { fprintf(stderr, "Malloc fails for nzval[]."); return (memAux); } memAux += (float) (SendCnt * dword); if ( !(ptr_to_send = intCalloc_dist(procs)) ) { fprintf(stderr, "Malloc fails for ptr_to_send[]."); return (memAux); } memAux += (float) (procs * iword); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { fprintf(stderr, "Malloc fails for itemp[]."); return (memAux); } memAux += (float) (2*maxnnzToRecv*iword); if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) { fprintf(stderr, "Malloc fails for dtemp[]."); return (memAux); } memAux += (float) (maxnnzToRecv * dword); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { fprintf (stderr, "Malloc fails for *ainf_colptr[]."); return (memAux); } memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { fprintf (stderr, "Malloc fails for *asup_rowptr[]."); return (memAux+memRet); } memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. */ nnz_loc_ainf = nnz_loc_asup = 0; nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; /* Count nonzeros in each column of L / row of U */ if (gbi >= gbj) { ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; nnz_loc_ainf ++; } else { asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; nnz_loc_asup ++; } } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; irow = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /* assert(jcol<n); */ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); /* Count nonzeros in each column of L / row of U */ if (gbi >= gbj) { ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; nnz_loc_ainf ++; } else { asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; nnz_loc_asup ++; } } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); memAux -= 2 * procs * iword; if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + procs*sizeof(doublecomplex*) + 2*SendCnt * iword + SendCnt* dword + procs*iword + 2*maxnnzToRecv*iword + maxnnzToRecv*dword; } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT. ------------------------------------------------------------*/ if (nnz_loc_ainf != 0) { if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { fprintf (stderr, "Malloc fails for *ainf_rowind[]."); return (memAux+memRet); } memRet += (float) (nnz_loc_ainf * iword); if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) { fprintf (stderr, "Malloc fails for *ainf_val[]."); return (memAux+memRet); } memRet += (float) (nnz_loc_ainf * dword); } else { ainf_rowind = NULL; ainf_val = NULL; } if (nnz_loc_asup != 0) { if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { fprintf (stderr, "Malloc fails for *asup_colind[]."); return (memAux + memRet); } memRet += (float) (nnz_loc_asup * iword); if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) { fprintf (stderr, "Malloc fails for *asup_val[]."); return (memAux + memRet); } memRet += (float) (nnz_loc_asup * dword); } else { asup_colind = NULL; asup_val = NULL; } /* Initialize the array of column pointers */ k = 0; jsize = ainf_colptr[0]; ainf_colptr[0] = 0; for (j = 1; j < ilsum_j[nsupers_j]; j++) { k += jsize; jsize = ainf_colptr[j]; ainf_colptr[j] = k; } ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; i = 0; isize = asup_rowptr[0]; asup_rowptr[0] = 0; for (j = 1; j < ilsum_i[nsupers_i]; j++) { i += isize; isize = asup_rowptr[j]; asup_rowptr[j] = i; } asup_rowptr[ilsum_i[nsupers_i]] = i + isize; /* Copy the triplets into the column oriented storage */ for (i = 0; i < nnz_loc; ++i) { jcol = ja[i]; irow = ia[i]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); /* Count nonzeros in each column of L / row of U */ if (gbi >= gbj) { j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); k = ainf_colptr[j]; ainf_rowind[k] = irow; ainf_val[k] = aij[i]; ainf_colptr[j] ++; } else { j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); k = asup_rowptr[j]; asup_colind[k] = jcol; asup_val[k] = aij[i]; asup_rowptr[j] ++; } } /* Reset the column pointers to the beginning of each column */ for (j = ilsum_j[nsupers_j]; j > 0; j--) ainf_colptr[j] = ainf_colptr[j-1]; for (j = ilsum_i[nsupers_i]; j > 0; j--) asup_rowptr[j] = asup_rowptr[j-1]; ainf_colptr[0] = 0; asup_rowptr[0] = 0; SUPERLU_FREE(ia); SUPERLU_FREE(aij); memAux -= 2*szbuf*iword + szbuf*dword; *p_ainf_colptr = ainf_colptr; *p_ainf_rowind = ainf_rowind; *p_ainf_val = ainf_val; *p_asup_rowptr = asup_rowptr; *p_asup_colind = asup_colind; *p_asup_val = asup_val;#if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit zdist_A()"); fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);#endif return (-memRet);} /* dist_A */int_tzdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Pslu_freeable_t *Pslu_freeable, LUstruct_t *LUstruct, gridinfo_t *grid)/* * * * Purpose * ======= * Distribute the input matrix onto the 2D process mesh. * * Arguments * ========= * * fact (input) fact_t * Specifies whether or not the L and U structures will be re-used. * = SamePattern_SameRowPerm: L and U structures are input, and * unchanged on exit. * This routine should not be called for this case, an error * is generated. Instead, pddistribute routine should be called. * = DOFACT or SamePattern: L and U structures are computed and output. * * n (Input) int * Dimension of the matrix. * * A (Input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (Input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (Input) *Glu_freeable_t * The global structure describing the graph of L and U. * * LUstruct (Input) LUstruct_t* * Data structures for L and U factors. * * grid (Input) gridinfo_t* * The 2D process mesh. * * Return value * ============ * < 0, number of bytes allocated on return from the dist_symbLU * > 0, number of bytes allocated for performing the distribution * of the data, when out of memory. * (an approximation). *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -