📄 blas.br
字号:
#include <stdio.h>#include <stdlib.h>#include "main.h"#include "blas.h"#define MAX_DIM 2048#define DO_VERIFY 1#define EPS 1e-3f#define MAX_NZ_PER_ROW 7// SDOT related kernels ////////////////////////////////////////////////////////////kernel void multKernel(float4 x<>, float4 y<>, out float4 result<>) { result = x * y;}// reduction: might be best to reduce mostly on GPU, then read back more // than a single float4. Need to play around with things.reduce void sumReduceKernel(float4 x<>, reduce float4 result<>) { result += x;}kernel void forceGPUFlush_float4(float4 input<>, out float4 result<>) { result = input;}kernel void forceGPUFlush_float1(float input<>, out float result<>) { result = input;}// SAXPY related kernels ////////////////////////////////////////////////////////////kernel void saxpyKernel(float4 x<>, float4 y<>, float alpha, out float4 result<>) { result = (alpha * x) + y;}// SGEMV related kernels ////////////////////////////////////////////////////////////// A is (m/4)-by-n// x is (n/4)-by-1// result is (m/4)-by-(n/4)kernel void denseMatVecKernel(iter float2 it1<>, iter float2 it2<>, iter float2 it3<>, iter float2 it4<>, float4 A[][], float4 x<>, out float4 result<>) { float4 data1 = A[it1.xy]; float4 data2 = A[it2.xy]; float4 data3 = A[it3.xy]; float4 data4 = A[it4.xy]; result = x.x * data1; result += x.y * data2; result += x.z * data3; result += x.w * data4;} kernel void denseMatVecScaleAddKernel(float4 y<>, float4 Ax<>, out float4 result<>, float alpha, float beta) { result = (alpha * Ax) + (beta * y); }/*** Kernels needed for Sparse Matrix-Dense Vector multiplication ****/kernel void sparse_matmult_product( float index<>, float x[][], float A<>, out float result<> ){ // if we wanted to go 4-wide here, we'd have to fetch 4 different x elements. We also must // then do math to determine the component in the float4 that is desired. The float4 packing // in the case of general-sparse matrices doesn't seem provide any computation speedup, just // mem savings. result = x[index][0] * A;}reduce void sum(float a<>, reduce float result<>) { result += a;}///////////////////////////////////////////////////////////////////////////////////static void fillArray(float *a, int n, float val) { int i; for (i=0;i<n;i++) a[i] = val;}// length is in stream elementsvoid convert1d_to_2d(int length, int* dim1, int* dim2) { if (length <= (MAX_DIM/2)) { *dim1 = 1; *dim2 = length; } else { *dim1 = length / (MAX_DIM/2); // may need to pad to fit in a 2d texture if (length % (MAX_DIM/2) > 0) (*dim1)++; *dim2 = (MAX_DIM/2); }}static int fequals(float a, float b, float tol) { return (fabs(a - b) < tol);}void saxpy_2d(int dim1, int dim2, float *x, float *y, float alpha, int num_iter, int* innerTime) { float4 flush[1]; float4 flushStrm<1,1>; float4 xStrm<dim1, dim2>; float4 yStrm<dim1, dim2>; float4 zStrm<dim1, dim2>; int i; int millisStart, millisStop; streamRead(xStrm, x); streamRead(yStrm, y); forceGPUFlush_float4(xStrm, flushStrm); streamWrite(flushStrm, flush); millisStart = GetTimeMillis(); /* result = alpha*x + y each iteration. Things get a little complicated since I ping-pong buffers when performing multiple iterations. */ for (i=1;i<=num_iter;i++) { if (i & 1) saxpyKernel(xStrm, yStrm, alpha, zStrm); else saxpyKernel(xStrm, zStrm, alpha, yStrm); } // make sure the computation has completed before stopping the timer if (i & 1) forceGPUFlush_float4(yStrm, flushStrm); else forceGPUFlush_float4(zStrm, flushStrm); streamWrite(flushStrm, flush); millisStop = GetTimeMillis(); *innerTime = (int)(millisStop-millisStart); if (i & 1) streamWrite(yStrm, y); else streamWrite(zStrm, y);}void do_saxpy(int length, int num_iter, int* timing, float* flops) { int i; float *x, *y, *tmp; float alpha; int paddedLength, dim1, dim2; int millisStart, millisStop; int innerTime; /* pack into a 2D texture, compute dims for texture */ assert(length % 4 == 0); assert(length/4 <= MAX_DIM * MAX_DIM / 2); convert1d_to_2d(length/4, &dim1, &dim2); paddedLength = 4 * dim1 * dim2; x = (float*)malloc(sizeof(float) * paddedLength); y = (float*)malloc(sizeof(float) * paddedLength); tmp = (float*)malloc(sizeof(float) * paddedLength); printf("SAXPY using %d by %d textures.\n", dim1, dim2); /* fill in array data, pad with zeros */ fillArray(x+length, paddedLength-length, 0.0f); fillArray(y+length, paddedLength-length, 0.0f); for (i=0;i<length;i++) { x[i] = (float)(i % 15); tmp[i] = y[i] = (float)((i + 5) % 15); } /* arbitrary constant chosen for alpha. Choosing const that driver will not be able to optimize */ alpha = 2.5f; millisStart = GetTimeMillis(); saxpy_2d(dim1, dim2, x, y, alpha, num_iter, &innerTime); millisStop = GetTimeMillis(); /* Computation of ax + y requires: // N mults // N adds // ---------------- // 2N float operations */ timing[0] = (int)(millisStop-millisStart); timing[1] = innerTime; flops[0] = (2.0f*num_iter*length)/(float)(timing[0]) / 1000.0f; flops[1] = (2.0f*num_iter*length)/(float)(timing[1]) / 1000.0f; if (DO_VERIFY) { // verify results for (i=0;i<length;i++) { if (!fequals(y[i], num_iter * alpha*x[i] + tmp[i], EPS)) { printf("[%d] expected: %f got: %f\n", i, num_iter * alpha*x[i] + tmp[i]); printf("Asserting...\n"); fflush(stdout); assert(0); } } printf("Verified SAXPY results to be correct!\n"); } free(x); free(y); free(tmp);}void sdot_2d(int dim1, int dim2, float* x, float* y, float* result, int num_iter, int* innerTime) { float4 flush[1]; float4 flushStrm<1,1>; float sums[4]; float4 sumsStrm<1,1>; float4 xStrm<dim1, dim2>; float4 yStrm<dim1, dim2>; float4 tmpStrm<dim1, dim2>; int i; int millisStart, millisStop; streamRead(xStrm, x); streamRead(yStrm, y); forceGPUFlush_float4(xStrm, flushStrm); forceGPUFlush_float4(yStrm, flushStrm); streamWrite(flushStrm, flush); millisStart = GetTimeMillis(); for (i=1;i<=num_iter;i++) { multKernel(xStrm, yStrm, tmpStrm); sumReduceKernel(tmpStrm, sumsStrm); } forceGPUFlush_float4(sumsStrm, flushStrm); streamWrite(flushStrm, flush); millisStop = GetTimeMillis(); streamWrite(sumsStrm, &sums); *result = sums[0] + sums[1] + sums[2] + sums[3]; *innerTime = (int)(millisStop - millisStart);}void do_sdot(int length, int num_iter, int* timing, float* flops) { float *x, *y; float result; float val; int i; int paddedLength, dim1, dim2; int millisStart, millisStop; int innerTime; /* compute dimensions to pack vector into a 2D texture */ assert(length % 4 == 0); assert(length/4 <= MAX_DIM * MAX_DIM/2); convert1d_to_2d(length/4, &dim1, &dim2); paddedLength = 4 * dim1 * dim2; x = (float*)malloc(sizeof(float) * paddedLength); y = (float*)malloc(sizeof(float) * paddedLength); /* fill in arrays */ fillArray(x+length, paddedLength-length, 0.0f); fillArray(y+length, paddedLength-length, 0.0f); for(i=0;i<length;i++) { x[i] = y[i] = 1.0f + (float)(i % 2); //x[i] = y[i] = 1.0f; } millisStart = GetTimeMillis(); sdot_2d( dim1, dim2, x, y, &result, num_iter, &innerTime); millisStop = GetTimeMillis(); /* Computation of xTy requires: // N mults // N-1 adds // ---------------- // 2N-1 float operations */ timing[0] = (int)(millisStop-millisStart); timing[1] = innerTime; flops[0] = (2.0f*num_iter*length - 1.0f)/(float)(timing[0]) / 1000.0f; flops[1] = (2.0f*num_iter*length - 1.0f)/(float)(timing[1]) / 1000.0f; if (DO_VERIFY) { // verify results val = 0.0f; for (i=0;i<length;i++) val += x[i] * y[i]; if (!fequals(val, result, EPS)) { printf("expected: %f got: %f\n", val, result); printf("exiting...\n"); fflush(stdout); assert(0); } //printf("%f %f\n", result, val); printf("Verified SDOT results to be correct.\n"); } free(x); free(y);}void sgemv_inner(int dim, int wideDim, float fDim, float fWideDim, float* x, float* y, float* A, float alpha, float beta, int num_iter, int* innerTime) { float* tmp = NULL; float4 flush[1]; float4 flushStrm<1,1>; int i, j = 0; int millisStart, millisStop; float4 xStrm<1, dim>; float4 yStrm<dim, 1>; float4 zStrm<dim, 1>; float4 AStrm<dim, wideDim>; float4 tmpStrm<dim, dim>; float4 resultStrm<dim, 1>; iter float2 it1<dim, dim> = iter( float2(0.0f, 0.0f), float2(fWideDim, fDim) ); iter float2 it2<dim, dim> = iter( float2(1.0f, 0.0f), float2(fWideDim+1.0f, fDim)); iter float2 it3<dim, dim> = iter( float2(2.0f, 0.0f), float2(fWideDim+2.0f, fDim)); iter float2 it4<dim, dim> = iter( float2(3.0f, 0.0f), float2(fWideDim+3.0f, fDim)); streamRead(xStrm, x); streamRead(yStrm, y); streamRead(AStrm, A); forceGPUFlush_float4(xStrm, flushStrm); forceGPUFlush_float4(yStrm, flushStrm); forceGPUFlush_float4(AStrm, flushStrm); streamWrite(flushStrm, flush); millisStart = GetTimeMillis(); // Computation here is sort of meaningless. Each iteration computes z = aAx + y // z is never used again... for (i=1;i<=num_iter;i++) { denseMatVecKernel(it1, it2, it3, it4, AStrm, xStrm, tmpStrm);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -