📄 conjugate_gradient.br

📁 用于GPU通用计算的编程语言BrookGPU 0.4
💻 BR
字号:
/*  Conjugate Gradient (CG) Solver for Ax = b.  The conjugate method is used to solve large systems of equations  where A is sparse and symmetric positive definite.  For a description  of the technique, see "Matrix Computations", by Golub and Van Loan.  Note: see the example app spMatrixVec for a description of how the  sparse matrix A is represented for streaming, and how the matrix-vector  product is performed.  The program takes two parameters 'it_max' and 'eps'.  it_max:  maximum number of iterations of solver  eps:     solution convergence threshold*/#include <stdio.h>#include <stdlib.h>#define MAX_NZ_PER_ROW  7#define NUM_ROWS        200#define MAX_NZ          (NUM_ROWS*MAX_NZ_PER_ROW)/*** Kernels needed for Sparse Matrix-Vector multiplication ****************/kernel void mult(float a<>, float b<>, out float c<>) {  c = a*b;}kernel void sparse_matmult_product( float index<>, float x[][], float A<>, out float result<> ){  result = x[index][0] * A;}/*** Kernels called in the CG iterations *************************************/kernel void square(float a<>, out float b<>) {  b = a*a;}kernel void scaleAdd(float a<>, float b<>, float scale, out float c<>) {  c = a + (scale*b);}kernel void subtract(float a<>, float b<>, out float c<>) {  c = a - b;}kernel void copy(float a<>, out float b<>) {  b = a;}reduce void sum(float a<>, reduce float result<>) {  result += a;}/**********************************************************************/// generates a matrix resulting from the discretation of the poisson operator// on a uniform grid of dimension 'dim' and grdi resolution 'size'.  This matrix// is very sparse and symmetric positive definite and thus can be solved efficiently// using the CG methodvoid createCSRFromPoisson(int size, int dim, float nz[MAX_NZ], int cols[MAX_NZ], int rowStart[NUM_ROWS+1]) {  int nnzPerRow = 0, matSize = 0;  int i=0,j=0;  int offsets[7][2];  int nnz = 0;  int idx = 0;  switch (dim) {    case 1:      nnzPerRow = 3;      matSize = size;      offsets[0][0] = -1; offsets[0][1] = -1;      offsets[1][0] = 0; offsets[1][1] = 2;      offsets[2][0] = 1; offsets[2][1] = -1;      break;                case 2:      nnzPerRow = 5;      matSize = size*size;      offsets[0][0] = -size; offsets[0][1] = -1;      offsets[1][0] = -1; offsets[1][1] = -1;      offsets[2][0] = 0; offsets[2][1] = 4;      offsets[3][0] = 1; offsets[3][1] = -1;      offsets[4][0] = size; offsets[4][1] = -1;      break;    case 3:      nnzPerRow = 7;      matSize = size*size*size;      offsets[0][0] = -size*size; offsets[0][1] = -1;      offsets[1][0] = -size; offsets[1][1] = -1;      offsets[2][0] = -1; offsets[2][1] = -1;      offsets[3][0] = 0; offsets[3][1] = 6;      offsets[4][0] = 1; offsets[4][1] = -1;      offsets[5][0] = size; offsets[5][1] = -1;      offsets[6][0] = size*size; offsets[6][1] = -1;      break;    default: printf("Error in createCSRFromPoisson()\n"); return;  }  for (i=0;i<matSize;i++) {    for (j=0;j<nnzPerRow;j++) {      int col = i+offsets[j][0];              if (col >= 0 && col < matSize)        nnz++;    }  }  rowStart[0] = 0;  for (i=0;i<matSize;i++) {    for (j=0;j<nnzPerRow;j++) {      int col = i+offsets[j][0];      if (col>= 0 && col < matSize) {        nz[idx] = (float)offsets[j][1];        cols[idx] = col;        idx++;      }    }    rowStart[i+1] = idx;  }}/// The data is generated in compressed sparse row (CSR) format.// This converts the CSR format into the padded ITPACK// representation that is easy to stream.void reshuffleData(float nz[MAX_NZ], int cols[MAX_NZ], int rowStart[NUM_ROWS+1],                   float Anz[MAX_NZ], float Acols[MAX_NZ]) {  int i,j;  for (i=0;i<NUM_ROWS;i++) {    int offset = 0;    for (j=rowStart[i];j<rowStart[i+1];j++) {      Anz[MAX_NZ_PER_ROW*i + offset] = nz[j];      Acols[MAX_NZ_PER_ROW*i + offset] = (float)cols[j];      offset++;    }      // must pad the rest of the row    while (offset < MAX_NZ_PER_ROW) {      Anz[MAX_NZ_PER_ROW*i + offset] = 0.0f;      Acols[MAX_NZ_PER_ROW*i + offset] = (float)0.0f;   // this should be an invalid index.... but doesn't have to be since x multiplied by a zero here      offset++;    }  }}int parseCmdLine(int argc, char** argv, char* filename, int* it_max, float* eps) {    if (argc < 3) {    printf("it_max and eps arguments.\n");    return 1;  }  *it_max = atoi(argv[1]);  *eps = (float)atof(argv[2]);  return 0;}int main(int argc, char** argv) {  int i;  char filename[80];  int it, it_max;  float eps;  float alpha, resNew, resOld, convergeThresh;  float  Anz[MAX_NZ];  float  cIdx[MAX_NZ];  float  x[NUM_ROWS];  float  b[NUM_ROWS];  float  AStrm<NUM_ROWS,MAX_NZ_PER_ROW>;  float  cIdxStrm<NUM_ROWS,MAX_NZ_PER_ROW>;  float  productsStrm<NUM_ROWS,MAX_NZ_PER_ROW>;  float  vecGatherStrm<NUM_ROWS,MAX_NZ_PER_ROW>;  float  xStrm<NUM_ROWS,1>; // solution vector  float  bStrm<NUM_ROWS,1>; // rhs vector  // required temporaries of the CG method  float  rStrm<NUM_ROWS,1>;  float  qStrm<NUM_ROWS,1>;  float  dStrm<NUM_ROWS,1>;  float  tmpStrm<NUM_ROWS,1>;    // compressed sparse column representation of A, will be converted to ITPACK format  // for streaming  float  csrNz[MAX_NZ];  int    csrCols[MAX_NZ];  int    csrRowStart[NUM_ROWS+1];  if (parseCmdLine(argc, argv, filename, &it_max, &eps))    return 1;  // generate a matrix  createCSRFromPoisson(NUM_ROWS, 1, csrNz, csrCols, csrRowStart);  // convert from pure CSR to a padded scheme better suited for streaming  reshuffleData(csrNz, csrCols, csrRowStart, Anz, cIdx);    // read in the matrix data  streamRead(AStrm, Anz);  streamRead(cIdxStrm, cIdx);    // Now do the CG solve...  printf("\n\n");  // initial guess for x is zero.  Hard coding some right hand side for b  for (i=0;i<NUM_ROWS;i++) {    x[i] = 0.0f;    b[i] = 50.0f;  }  streamRead(xStrm, x);  streamRead(bStrm, b);  // Ax_0  // these two lines constitute a sparse matrix-vector multiply  sparse_matmult_product( cIdxStrm, xStrm, AStrm, productsStrm );  sum( productsStrm, rStrm );  // r = b - Ax  subtract(bStrm, rStrm, rStrm);  // d = r  copy(rStrm, dStrm);  // resNormNew = r dot r  resNew = 0.0f;  square(rStrm, tmpStrm);  sum(tmpStrm, resNew);  convergeThresh = eps * eps * resNew;  printf("initial r norm = %f\n", resNew);  printf("max it = %d\n", it_max);  printf("eps = %f\n", eps);  it = 0;  // CG loop.  Note, for simplicty the convergence is checked after after  // each loop iteration.  However, in a practical implementation, multiple  // iterations should be run on the hardware before reading back the residual  // to check performance.  while (it < it_max && resNew > convergeThresh ) {    if (it > 0)	    scaleAdd(rStrm, dStrm, resNew / resOld, dStrm);	  // q = Ad    // these two lines constitute a sparse matrix-vector multiply    sparse_matmult_product( cIdxStrm, dStrm, AStrm, productsStrm );    sum( productsStrm, qStrm );    // alpha = d dot q	  mult(dStrm, qStrm, tmpStrm);    sum(tmpStrm, alpha);	  alpha = resNew / alpha;	  // x = x + alpha * d 	  scaleAdd(xStrm, dStrm, alpha, xStrm);	  // r = r - alpha * q	  scaleAdd(rStrm, qStrm, -1.0f * alpha, rStrm);    // resNew = r dot r    resOld = resNew;	  square(rStrm, tmpStrm);    sum(tmpStrm, resNew);	  it++;    printf("iteration %d.  Current residual norm: %f\n", it, resNew);  }        streamWrite(xStrm, x);  printf("\n");  printf("Solution x:\n");  for (i=0;i<NUM_ROWS;i++)    printf("%.3f  " ,x[i]);  printf("\n");    return 0;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -