📄 blas.br
字号:
sumReduceKernel(tmpStrm, resultStrm); denseMatVecScaleAddKernel(yStrm, resultStrm, zStrm, alpha, beta); } forceGPUFlush_float4(zStrm, flushStrm); streamWrite(flushStrm, flush);/* tmp = (float*)malloc(sizeof(float) * 4 * dim * dim); streamWrite(tmpStrm, tmp); for (i=0;i<dim;i++) for(j=0;j<dim;j++) printf("[%d,%d] %f %f %f %f\n", j, i, tmp[4*(i*dim + j)], tmp[4*(i*dim + j)+1], tmp[4*(i*dim + j)+2], tmp[4*(i*dim + j)+3]); printf("\n"); streamWrite(zStrm, tmp); for(i=0;i<dim;i++) printf("[%d] %f %f %f %f\n", i, tmp[i*4], tmp[4*i + 1], tmp[i*4+2], tmp[4*i+3]); printf("\n\n"); fflush(stdout); free(tmp); */ millisStop = GetTimeMillis(); *innerTime = (int)(millisStop - millisStart); streamWrite(zStrm, y);}void sgemv(int dim, float* x, float* y, float* A, float alpha, float beta, int num_iter, int* innerTime) { float fDim; float fWideDim; fDim = (float)dim; fWideDim = (float)(4*dim); // Hack, to get around no casts in initializers rule sgemv_inner(dim, 4*dim, fDim, fWideDim, x, y, A, alpha, beta, num_iter, innerTime);}void do_sgemv(int length, int num_iter, int* timing, float* flops) { float *x, *y, *A, *tmp; float alpha, beta; float val[4]; int i,j,k, base; int innerTime; int millisStart, millisStop; assert(length % 4 == 0); assert(length <= MAX_DIM); x = (float*)malloc(sizeof(float) * length); y = (float*)malloc(sizeof(float) * length); A = (float*)malloc(sizeof(float) * length * length); tmp = (float*)malloc(sizeof(float) * length); /* fill in matrix and x-vector values */ for (i=0;i<length;i++) { x[i] = (float)(1+(i % 3)); tmp[i] = y[i] = 0.0f; } for (i=0;i<length/4;i++) { for (j=0;j<length;j++) { A[ 4*(i*length + j) ] = (float)(i % 3); A[ 4*(i*length + j)+1] = (float)(j % 3); A[ 4*(i*length + j)+2] = (float)((i+1) % 3); A[ 4*(i*length + j)+3] = (float)((j+1) % 3); } } /* arbitrarily chosen constants */ alpha = 2.0f; beta = 1.5f; printf("SGEMV: using %d by %d texture for matrix, %d for vector.\n", length, length/4, length); millisStart = GetTimeMillis(); sgemv(length/4, x, y, A, alpha, beta, num_iter, &innerTime); millisStop = GetTimeMillis(); /* Computation of aAx + by for general dense matrix: // 2N + N^2 mults // N + N(N-1) adds // ---------------- // 2 * (N^2 + N) float operations */ timing[0] = (int)(millisStop-millisStart); timing[1] = innerTime; flops[0] = (2.0f * num_iter * (length*length + length)) / (float)(timing[0]) / 1000.0f; flops[1] = (2.0f * num_iter * (length*length + length)) / (float)(timing[1]) / 1000.0f; if (DO_VERIFY) { // printArray(y, length); for (i=0;i<length/4;i++) { for (k=0;k<4;k++) val[k] = 0.0f; for (j=0;j<length;j++) { base = 4*(i*length + j); for (k=0;k<4;k++) val[k] += A[base+k] * x[j]; } for (k=0;k<4;k++) { //printf("[%d] expected: %f got: %f\n", 4*i+k, alpha*val[k] + beta*tmp[4*i+k], y[4*i + k]); if (!fequals(alpha*val[k] + beta*tmp[4*i+k], y[4*i+k], EPS)) { printf("CHOKE!\n"); printf("[%d] expected: %f got: %f\n", 4*i+k, alpha*val[k] + beta*tmp[4*i+k], y[4*i + k]); printf("exiting...\n"); fflush(stdout); assert(0); } } } printf("Verified SGEMV results to be correct.\n"); } free(x); free(y); free(A);}// hackish cube root functionint intCubeRoot(int x) { int i = 0; assert(x >= 0 && x < 100*100*100); while (i*i*i <= x) i++; return i-1; }void createMatrix(int length, float* A, float* Aind) { int i,j, colIdx, nnz; int offset = intCubeRoot(length); int offsets[MAX_NZ_PER_ROW]; offsets[0] = -1 * offset * offset; offsets[1] = -1 * offset; offsets[2] = -1; offsets[3] = 0; offsets[4] = 1; offsets[5] = offset; offsets[6] = offset * offset; for (i=0;i<length;i++) { nnz = 0; for (j=0; j<MAX_NZ_PER_ROW; j++) { colIdx = i + offsets[j]; if (colIdx >= 0 && colIdx < length) { A[i*MAX_NZ_PER_ROW + nnz] = (j == 3) ? 6.0f : -1.0f; Aind[i*MAX_NZ_PER_ROW + nnz] = (float)colIdx; nnz++; } } while (nnz < MAX_NZ_PER_ROW) { A[i*MAX_NZ_PER_ROW + nnz] = 0.0f; Aind[i*MAX_NZ_PER_ROW + nnz] = 0.0f; nnz++; } }}void spMatVecf1(int strmDim, float* A, float* Aind, float* x, float* y, int num_iter, int* innerTime) { float flush[1]; float flushStrm<1>; float AStrm<strmDim, MAX_NZ_PER_ROW>; float AindStrm<strmDim, MAX_NZ_PER_ROW>; float productsStrm<strmDim, MAX_NZ_PER_ROW>; float xStrm<strmDim, 1>; float yStrm<strmDim, 1>; float forceStrm<1>; float force[1]; int millisStart, millisStop; int i; streamRead(AStrm, A); streamRead(AindStrm, Aind); streamRead(xStrm, x); forceGPUFlush_float1(AStrm, flushStrm); forceGPUFlush_float1(AindStrm, flushStrm); forceGPUFlush_float1(xStrm, flushStrm); streamRead(flushStrm, flush); millisStart = GetTimeMillis(); // (A^num_iter)x for (i=1;i<=num_iter;i++) { sparse_matmult_product( AindStrm, xStrm, AStrm, productsStrm ); sum( productsStrm, xStrm ); } forceGPUFlush_float1(xStrm, flushStrm); streamRead(flushStrm, force); millisStop = GetTimeMillis(); streamWrite(xStrm, y); *innerTime = millisStop - millisStart;}void do_spMatVec(int length, int num_iter, int* timing, float* flops) { int i,j, base; float val; float *x, *y, *A, *Aind; int millisStart, millisStop; int innerTime; assert(length <= MAX_DIM); assert(length % 4 == 0); x = (float*)malloc(sizeof(float)*length); y = (float*)malloc(sizeof(float)*length); A = (float*)malloc(sizeof(float)*length*MAX_NZ_PER_ROW); Aind = (float*)malloc(sizeof(float)*length*MAX_NZ_PER_ROW); createMatrix(length, A, Aind); for (i=0;i<length;i++) x[i] = 1.0f; millisStart = GetTimeMillis(); spMatVecf1(length, A, Aind, x, y, num_iter, &innerTime); millisStop = GetTimeMillis(); timing[0] = millisStop - millisStart; timing[1] = innerTime; flops[0] = (float)num_iter * (2.0f * MAX_NZ_PER_ROW - 1.0f) * length / (float)timing[0] / 1000.0f; flops[1] = (float)num_iter * (2.0f * MAX_NZ_PER_ROW - 1.0f) * length / (float)timing[1] / 1000.0f; if (DO_VERIFY) { //printArray(y, length); for (i=0;i<length;i++) { val = 0.0f; base = i*MAX_NZ_PER_ROW; for (j=0;j<MAX_NZ_PER_ROW;j++) val += A[base + j] * x[(int)Aind[base+j]]; //printf("%.2f ", val); assert(fequals(val, y[i], EPS)); } } free(x); free(y); free(A); free(Aind);}kernel void dummyKernel(float a<>, out float b<>) { b = a;}void startup() { float a[1]; float aStrm<1>; float bStrm<1>; streamRead(aStrm, a); dummyKernel(aStrm, bStrm); streamWrite(bStrm, a);}void printData(char* str, int length, int iterations, int ms[2], float flops[2]) { printf("%8d %4d %7d %7.4f %7d %7.4f %s\n", length, iterations, ms[0], flops[0], ms[1], flops[1], str);}// command line args: length iter skipvoid Blas_Time(int length) { int i; int skip, iterations, num_iter; int ms[2]; float flops[2]; num_iter = 1; skip = 2000; startup(); if (length < 2048) { for (i=0; i<num_iter;i++) { iterations = skip*(i+1); do_sdot(length*length, iterations, ms, flops); //printData("sdot ", length*length, iterations, ms, flops); printf("SDOT: (%d = %d^2 elements -- %d iters)\n", length * length, length, iterations); printf(" %.3f sec %.3f MFLOPS\n", ms[1] / 1000.0f, flops[1]); } } else { printf("Ignoring SDOT test, cannot reduce > 2048 dim textures (DX9 limitation).\n"); } printf("\n"); for (i=0; i<num_iter;i++) { iterations = skip*(i+1); do_saxpy(length*length, iterations, ms, flops); //printData("saxpy", length*length, iterations, ms, flops); printf("SAXPY: (%d = %d^2 elements -- %d iters)\n", length * length, length, iterations); printf(" %.3f sec %.3f MFLOPS\n", ms[1] / 1000.0f, flops[1]); } printf("\n"); for (i=0; i<num_iter;i++) { iterations = skip*(i+1); do_sgemv(length, iterations, ms, flops); //printData("sgemv", length, iterations, ms, flops); printf("SGEMV: (%dx%d matrix -- %d iters)\n", length, length, iterations); printf(" %.3f sec %.3f MFLOPS\n", ms[1] / 1000.0f, flops[1]); } printf("\n"); /* if (length < 2048) { for (i=0; i<num_iter;i++) { iterations = skip*(i+1); do_spMatVec(length, iterations, ms, flops); printData("spMat", length, iterations, ms, flops); } }*/ }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -