
📄 psymbfact.c

📁 SuperLU version 2.2: direct solution of large, sparse, nonsymmetric linear systems
💻 C
📖 Page 1 of 5
 * =======
 *
 * Computes an estimate of the number of elements in columns of L
 * and rows of U.  Stores this information in cntelt_vtcs, and it will
 * be used in the right-looking symbolic factorization.
 */
{
  int   fstP, lstP, szSep, npNode, i, j;
  int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk;
  int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk;
  int_t nelts, nelts_new_blk;
  int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc;
  int_t *minElt_vtx, *cntelt_vtcs;

  /* Initialization */
  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
  cntelt_vtcs = Llu_symbfact->cntelt_vtcs;
  globToLoc = Pslu_freeable->globToLoc;
  nvtcs_loc = VInfo->nvtcs_loc;
  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
  if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n)
    minElt_vtx = lsub;
  else {
    /* allocate memory for minElt_vtx */
    if (!(minElt_vtx = intMalloc_dist(n))) {
      fprintf(stderr, "Malloc fails for minElt_vtx[].");
      return (PS->allocMem);
    }
    PS->allocMem += n * sizeof(int_t);
  }

  for (ii = 0; ii < n; ii++)
    tempArray[ii] = n;
  for (ii = 0; ii < nvtcs_loc; ii++)
    cntelt_vtcs[ii] = 0;
  szSep = nprocs_symb;
  i = 0;
  cur_blk = 0;
  vtx_lid = 0;
  while (szSep >= 1) {
    /* for each level in the separator tree */
    npNode = nprocs_symb / szSep;
    fstP = 0;
    /* for each node in the level */
    for (j = i; j < i + szSep; j++) {
      fstVtx = fstVtxSep[j];
      lstVtx = fstVtx + sizes[j];
      lstP = fstP + npNode;
      if (fstP <= iam && iam < lstP) {
        ind_blk = cur_blk;
        ii = vtx_lid;
        while (VInfo->begEndBlks_loc[ind_blk] < lstVtx &&
               ind_blk < 2 * VInfo->nblks_loc) {
          fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
          lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
          ind_blk += 2;
          for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) {
            for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) {
              vtx_elt = lsub[jj];
              if (tempArray[vtx_elt] == n) {
                tempArray[vtx_elt] = vtx;
              }
            }
            for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) {
              vtx_elt = usub[jj];
              if (tempArray[vtx_elt] == n) {
                tempArray[vtx_elt] = vtx;
              }
            }
          }
        }
        if (szSep == nprocs_symb)
          vtx_lid = ii;
        else {
          MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]),
                         (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]);
#if ( PRNTlevel>=1 )
          PS->no_msgsCol += (float) (2 * (int) log2( (double) npNode ));
          PS->sz_msgsCol += (float) (n - fstVtx);
          if (PS->maxsz_msgCol < n - fstVtx)
            PS->maxsz_msgCol = n - fstVtx;
#endif
          nelts = 0;
          for (ii = fstVtx; ii < lstVtx; ii++)
            tempArray[ii] = 0;
          for (ii = fstVtx; ii < n; ii++) {
            if (minElt_vtx[ii] != n) {
              if (minElt_vtx[ii] < fstVtx)
                nelts ++;
              else
                tempArray[minElt_vtx[ii]] ++;
              if (ii > lstVtx)
                tempArray[ii] = minElt_vtx[ii];
            }
          }

          ind_blk = cur_blk;
          lv = fstVtx;
          while (VInfo->begEndBlks_loc[ind_blk] < lstVtx &&
                 ind_blk < 2 * VInfo->nblks_loc) {
            fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
            lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
            ind_blk += 2;

            for (ii = lv; ii < fstVtx_blk; ii++)
              nelts += tempArray[ii];
            lv = lstVtx_blk;
            nelts_new_blk = 0;
            for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
              nelts_new_blk += tempArray[vtx];
              cntelt_vtcs[vtx_lid] = nelts;
            }
            nelts += nelts_new_blk;
          }
        } /* if (szSep != nprocs_symb) */
        cur_blk = ind_blk;
      }
      fstP += npNode;
    }
    i += szSep;
    szSep = szSep / 2;
  }

  /* free memory */
  if (minElt_vtx != lsub) {
    SUPERLU_FREE (minElt_vtx);
    PS->allocMem -= n * sizeof(int_t);
  }
  return (SUCCES_RET);
}
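The reduction step in the routine above is worth isolating: every process first fills tempArray with the sentinel n, overwrites entries with the first (smallest) vertex of the current node that reaches them, and MPI_Allreduce with MPI_MIN then merges these per-process views so all processes of the node agree on the minimum reaching vertex. Below is a minimal standalone sketch of just that communication pattern; the toy size N and the reachability rule are invented for illustration, plain int stands in for int_t, and none of it reproduces SuperLU's actual graph traversal.

#include <stdio.h>
#include <mpi.h>

#define N 8            /* order of the toy graph */
#define NOT_REACHED N  /* sentinel, same convention as tempArray[ii] = n */

int main(int argc, char **argv) {
    int iam, sendbuf[N], recvbuf[N];
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &iam);

    /* Each process marks the vertices it reaches with the id of the
       reaching vertex; untouched entries keep the sentinel.  Here we
       pretend vertex iam reaches vertex (iam + 2) % N. */
    for (int i = 0; i < N; i++) sendbuf[i] = NOT_REACHED;
    sendbuf[(iam + 2) % N] = iam;

    /* Keep, for every vertex, the minimum reaching vertex over all
       processes -- the reduction the routine performs per tree level. */
    MPI_Allreduce(sendbuf, recvbuf, N, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

    if (iam == 0)
        for (int i = 0; i < N; i++)
            printf("vertex %d: min reaching vertex %d\n", i, recvbuf[i]);
    MPI_Finalize();
    return 0;
}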
static float
symbfact_mapVtcs
(
 int iam,             /* Input - process number */
 int nprocs_num,      /* Input - number of processors */
 int nprocs_symb,     /* Input - number of procs for symbolic factorization */
 SuperMatrix *A,      /* Input - input distributed matrix A */
 int_t *fstVtxSep,    /* Input - first vertex in each separator */
 int_t *sizes,        /* Input - size of each separator in the separator tree */
 Pslu_freeable_t *Pslu_freeable, /* Output - globToLoc and maxNvtcsPProc
                                    computed */
 vtcsInfo_symbfact_t *VInfo, /* Output - local info on vertices distribution */
 int_t *tempArray,    /* Input - temp array of size n = order of the matrix */
 int_t  maxSzBlk,     /* Input - maximum number of vertices in a block */
 psymbfact_stat_t *PS /* Input/Output - statistics */
 )
{
/*
 * Purpose
 * =======
 *
 *  symbfact_mapVtcs maps the vertices of the graph of the input
 *  matrix A on nprocs_symb processors, using the separator tree
 *  returned by a graph partitioning algorithm from the previous step
 *  of the symbolic factorization.  The number of processors
 *  nprocs_symb must be a power of 2.
 *
 * Description of the algorithm
 * ============================
 *
 *  A subtree to subcube algorithm is used first to map the processors
 *  on the nodes of the separator tree.
 *
 *  For each node of the separator tree, its corresponding vertices
 *  are distributed on the processors assigned to this node, using a
 *  block cyclic distribution.
 *
 *  After the distribution, fields of the VInfo structure are
 *  computed.  The array globToLoc and the value maxNvtcsPProc of
 *  Pslu_freeable are also computed.
 */
  int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d;
  int_t k, n, kk;
  int_t fstVtx, lstVtx;
  int_t fstVtxBlk, ind_blk;
  int_t noVtcsProc, noBlk;
  int_t nvtcs_loc; /* number of vertices owned by process iam */
  int_t nblks_loc; /* no of blocks owned by process iam */
  int_t *globToLoc;    /* global indexing to local indexing */
  int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx;
  int_t *begEndBlks_loc; /* begin and end vertex of each local block */
  int_t *vtcs_pe;   /* contains the number of vertices on each processor */
  int   *avail_pes; /* contains the processors to be used at each level */

  n = A->ncol;
  /* allocate memory */
  if (!(globToLoc = intMalloc_dist(n + 1))) {
    fprintf (stderr, "Malloc fails for globToLoc[].");
    return (PS->allocMem);
  }
  PS->allocMem += (n+1) * sizeof(int_t);
  if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) {
    fprintf (stderr, "Malloc fails for avail_pes[].");
    return (PS->allocMem);
  }
  PS->allocMem += nprocs_symb*sizeof(int);
  if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) {
    fprintf (stderr, "Malloc fails for vtcs_pe[].");
    return (PS->allocMem);
  }
  PS->allocMem += nprocs_symb*sizeof(int_t);

  /* Initialization */
  globToLoc[n] = n;
  for (p = 0; p < nprocs_symb; p++) {
    vtcs_pe[p] = 0;
    avail_pes[p] = EMPTY;
  }
  nvtcs_loc = 0;
  nblks_loc = 0;
  maxNvtcsNds_loc = 0;
  maxNeltsVtx     = 0;

  /* distribute data among processors */
  szSep = nprocs_symb;
  iSep = 0;
  while (szSep >= 1) {
    /* for each level in the separator tree */
    npNode = nprocs_symb / szSep;
    firstP = 0;
    nvtcsNds_loc = 0;

    for (jSep = iSep; jSep < iSep + szSep; jSep++) {
      /* for each node in the level */
      fstVtx = fstVtxSep[jSep];
      lstVtx = fstVtx + sizes[jSep];
      if (firstP <= iam && iam < firstP + npNode)
        maxNeltsVtx += lstVtx - fstVtx;
      if (szSep == nprocs_symb) {
        /* leaves of the separator tree */
        for (k = fstVtx; k < lstVtx; k++) {
          globToLoc[k] = (int_t) firstP;
          vtcs_pe[firstP] ++;
        }
        if (firstP == iam) {
          nvtcs_loc += lstVtx - fstVtx;
          nblks_loc ++;
        }
      }
      else {
        /* superior levels of the separator tree */
        k = fstVtx;
        noVtcsProc = maxSzBlk;
        fstVtxBlk = fstVtx;
        if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode;
        /* first allocate processors from previous levels */
        for (ind_ap_s = (jSep-iSep) * npNode;
             ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) {
          p = avail_pes[ind_ap_s];
          if (p != EMPTY && k < lstVtx) {
            /* for each column in the separator */
            avail_pes[ind_ap_s] = EMPTY;
            kk = 0;
            while (kk < noVtcsProc && k < lstVtx) {
              globToLoc[k] = p;
              vtcs_pe[p] ++;
              k ++;
              kk ++;
            }
            if (p == iam) {
              nvtcs_loc += kk;
              nblks_loc ++;
              nvtcsNds_loc += kk;
            }
          }
          else {
            if (p != EMPTY && k == lstVtx) {
              avail_pes[ind_ap_s] = EMPTY;
              avail_pes[ind_ap_d] = p; ind_ap_d ++;
            }
          }
        }
        noBlk = 0;
        p = firstP + npNode;
        while (k < lstVtx) {
          /* for each column in the separator */
          kk = 0;
          p = (int) (noBlk % (int_t) npNode) + firstP;
          while (kk < noVtcsProc && k < lstVtx) {
            globToLoc[k] = p;
            vtcs_pe[p] ++;
            k ++;
            kk ++;
          }
          if (p == iam) {
            nvtcs_loc += kk;
            nblks_loc ++;
            nvtcsNds_loc += kk;
          }
          noBlk ++;
        } /* while (k < lstVtx) */
        /* Add the unused processors to the avail_pes list of pes */
        for (p = p + 1; p < firstP + npNode; p ++) {
          avail_pes[ind_ap_d] = p; ind_ap_d ++;
        }
      }
      firstP += npNode;
    }
    if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb)
      maxNvtcsNds_loc = nvtcsNds_loc;
    iSep += szSep;
    szSep = szSep / 2;
  }
#if ( PRNTlevel>=2 )
  if (!iam)
    PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe);
#endif
  /* determine maximum number of vertices among processors */
  maxNvtcsPProc = vtcs_pe[0];
  vtcs_pe[0] = 0;
  for (p = 1; p < nprocs_symb; p++) {
    if (maxNvtcsPProc < vtcs_pe[p])
      maxNvtcsPProc = vtcs_pe[p];
    vtcs_pe[p] = 0;
  }
#if ( PRNTlevel>=2 )
  if (!iam)
    printf ("  MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n",
            maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb)/(float)n);
#endif
  if (iam < nprocs_symb)
    if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1)))
      ABORT("Malloc fails for begEndBlks_loc[].");

  ind_blk = 0;
  k = 0;
  while (k < n) {
    p = globToLoc[k];
    if (p == iam)
      begEndBlks_loc[ind_blk] = k;
    while (globToLoc[k] == p && k < n) {
      globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p];
      vtcs_pe[p] ++;
      k ++;
    }
    if (p == iam) {
      begEndBlks_loc[ind_blk + 1] = k;
      ind_blk += 2;
    }
  }
  if (iam < nprocs_symb)
    begEndBlks_loc[2 * nblks_loc] = n;

  SUPERLU_FREE (avail_pes);
  SUPERLU_FREE (vtcs_pe);

  Pslu_freeable->maxNvtcsPProc   = maxNvtcsPProc;
  Pslu_freeable->globToLoc       = globToLoc;
  if (iam < nprocs_symb) {
    VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc;
    VInfo->nblks_loc       = nblks_loc;
    VInfo->nvtcs_loc       = nvtcs_loc;
    VInfo->curblk_loc      = 0;
    VInfo->maxNeltsVtx     = maxNeltsVtx;
    VInfo->filledSep       = FALSE;
    VInfo->xlsub_nextLvl   = 0;
    VInfo->xusub_nextLvl   = 0;
    VInfo->begEndBlks_loc  = begEndBlks_loc;
    VInfo->fstVtx_nextLvl  = begEndBlks_loc[0];
  }
  return SUCCES_RET;
}
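symbfact_mapVtcs finishes by packing two pieces of information into each globToLoc entry: the owner process p and the local index on that process, combined as p * maxNvtcsPProc + local index (the loop above that rewrites globToLoc[k]). The sketch below shows this encode/decode arithmetic in isolation. The helper names encode, owner_of, and local_of are hypothetical, not SuperLU API, and int_t is assumed here to be a wide signed integer, which need not match a given SuperLU build.

#include <stdio.h>

typedef long long int_t;   /* assumption: a wide signed integer */

/* Pack owner process and local index into one value, as at the
   end of symbfact_mapVtcs. */
static int_t encode(int owner, int_t local_ind, int_t maxNvtcsPProc) {
    return (int_t) owner * maxNvtcsPProc + local_ind;
}
/* Recover the owner process of a packed entry. */
static int owner_of(int_t g, int_t maxNvtcsPProc) {
    return (int) (g / maxNvtcsPProc);
}
/* Recover the local index of a packed entry. */
static int_t local_of(int_t g, int_t maxNvtcsPProc) {
    return g % maxNvtcsPProc;
}

int main(void) {
    int_t maxNvtcsPProc = 1000;   /* toy value */
    int_t g = encode(3, 42, maxNvtcsPProc);
    printf("owner %d, local index %lld\n",
           owner_of(g, maxNvtcsPProc), (long long) local_of(g, maxNvtcsPProc));
    return 0;   /* prints: owner 3, local index 42 */
}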
static void
symbfact_distributeMatrix
(
 int   iam,             /* Input - my processor number */
 int   nprocs_num,      /* Input - number of processors */
 int   nprocs_symb,     /* Input - number of processors for the
                           symbolic factorization */
 SuperMatrix *A,        /* Input - input matrix A */
 int_t *perm_c,         /* Input - column permutation */
 int_t *perm_r,         /* Input - row permutation */
 matrix_symbfact_t *AS, /* Output - temporary storage for the
                           redistributed matrix */
 Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */
 vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices
                                distribution */
 int_t  *tempArray,     /* Input/Output - temporary array of size n
                           (order of the matrix) */
 MPI_Comm    *num_comm  /* Input - communicator for nprocs_num procs */
 )
{
/*
 * Purpose
 * =======
 *
 * Distribute input matrix A for the symbolic factorization routine.
 * Only structural information is distributed.  The redistributed
 * matrix has its rows and columns permuted according to perm_r and
 * perm_c.  A is not modified during this routine.
 */
/* Notations:
 * Ainf : inferior part of A, including diagonal.
 * Asup : superior part of A.
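For reference, the block cyclic distribution that symbfact_mapVtcs applies to internal separator nodes reduces to a short loop: vertices [fstVtx, lstVtx) are handed out in blocks of maxSzBlk, and the owning process cycles through the npNode processes firstP .. firstP+npNode-1. A toy standalone version follows; all values in it are hypothetical and it ignores the avail_pes reuse of processors from lower levels.

#include <stdio.h>

int main(void) {
    int fstVtx = 0, lstVtx = 10, maxSzBlk = 2, firstP = 4, npNode = 2;
    int noBlk = 0, k = fstVtx;
    while (k < lstVtx) {
        int p = noBlk % npNode + firstP;   /* owner of this block */
        int kk = 0;
        while (kk < maxSzBlk && k < lstVtx) {
            printf("vertex %d -> process %d\n", k, p);
            k++; kk++;
        }
        noBlk++;
    }
    return 0;   /* vertices 0,1 -> p4; 2,3 -> p5; 4,5 -> p4; ... */
}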
