📄 blas.br

📁 用于GPU通用计算的编程语言BrookGPU 0.4
💻 BR
📖 第 1 页 / 共 2 页
字号:
12 下一页
#include <stdio.h>#include <stdlib.h>#include "main.h"#include "blas.h"#define MAX_DIM         2048#define DO_VERIFY       1#define EPS             1e-3f#define MAX_NZ_PER_ROW  7// SDOT related kernels ////////////////////////////////////////////////////////////kernel void multKernel(float4 x<>, float4 y<>, out float4 result<>) {  result = x * y;}// reduction: might be best to reduce mostly on GPU, then read back more // than a single float4.  Need to play around with things.reduce void sumReduceKernel(float4 x<>, reduce float4 result<>) {  result += x;}kernel void forceGPUFlush_float4(float4 input<>, out float4 result<>) {  result = input;}kernel void forceGPUFlush_float1(float input<>, out float result<>) {  result = input;}// SAXPY related kernels ////////////////////////////////////////////////////////////kernel void saxpyKernel(float4 x<>, float4 y<>, float alpha, out float4 result<>) {  result = (alpha * x) + y;}// SGEMV related kernels ////////////////////////////////////////////////////////////// A is (m/4)-by-n// x is (n/4)-by-1// result is (m/4)-by-(n/4)kernel void denseMatVecKernel(iter float2 it1<>, iter float2 it2<>,                              iter float2 it3<>, iter float2 it4<>,                              float4 A[][], float4 x<>, out float4 result<>) {  float4 data1 = A[it1.xy];  float4 data2 = A[it2.xy];  float4 data3 = A[it3.xy];  float4 data4 = A[it4.xy];      result  = x.x * data1;  result += x.y * data2;  result += x.z * data3;  result += x.w * data4;} kernel void denseMatVecScaleAddKernel(float4 y<>, float4 Ax<>, out float4 result<>,                                       float alpha, float beta) {  result = (alpha * Ax) + (beta * y); }/*** Kernels needed for Sparse Matrix-Dense Vector multiplication ****/kernel void sparse_matmult_product( float index<>, float x[][], float A<>, out float result<> ){  // if we wanted to go 4-wide here, we'd have to fetch 4 different x elements.  We also must  // then do math to determine the component in the float4 that is desired.  The float4 packing  // in the case of general-sparse matrices doesn't seem provide any computation speedup, just  // mem savings.  result = x[index][0] * A;}reduce void sum(float a<>, reduce float result<>) {  result += a;}///////////////////////////////////////////////////////////////////////////////////static void fillArray(float *a, int n, float val) {	int i;	for (i=0;i<n;i++)		a[i] = val;}// length is in stream elementsvoid convert1d_to_2d(int length, int* dim1, int* dim2) {  if (length <= (MAX_DIM/2)) {    *dim1 = 1;    *dim2 = length;  } else {    *dim1 = length / (MAX_DIM/2);    // may need to pad to fit in a 2d texture    if (length % (MAX_DIM/2) > 0)      (*dim1)++;    *dim2 = (MAX_DIM/2);  }}static int fequals(float a, float b, float tol) {  return (fabs(a - b) < tol);}void saxpy_2d(int dim1, int dim2, float *x, float *y, float alpha, int num_iter, int* innerTime) {  float4  flush[1];  float4  flushStrm<1,1>;  float4 xStrm<dim1, dim2>;  float4 yStrm<dim1, dim2>;  float4 zStrm<dim1, dim2>;  int i;  int millisStart, millisStop;  streamRead(xStrm, x);  streamRead(yStrm, y);  forceGPUFlush_float4(xStrm, flushStrm);  streamWrite(flushStrm, flush);  millisStart = GetTimeMillis();  /* result = alpha*x + y each iteration.     Things get a little complicated since I ping-pong buffers     when performing multiple iterations.  */  for (i=1;i<=num_iter;i++) {    if (i & 1)       saxpyKernel(xStrm, yStrm, alpha, zStrm);    else     saxpyKernel(xStrm, zStrm, alpha, yStrm);  }  // make sure the computation has completed before stopping the timer  if (i & 1)     forceGPUFlush_float4(yStrm, flushStrm);  else     forceGPUFlush_float4(zStrm, flushStrm);  streamWrite(flushStrm, flush);    millisStop = GetTimeMillis();  *innerTime = (int)(millisStop-millisStart);  if (i & 1)    streamWrite(yStrm, y);  else    streamWrite(zStrm, y);}void do_saxpy(int length, int num_iter, int* timing, float* flops) {  int i;  float *x, *y, *tmp;  float alpha;  int paddedLength, dim1, dim2;  int millisStart, millisStop;  int innerTime;  /* pack into a 2D texture, compute dims for texture */  assert(length % 4 == 0);  assert(length/4 <= MAX_DIM * MAX_DIM / 2);  convert1d_to_2d(length/4, &dim1, &dim2);  paddedLength = 4 * dim1 * dim2;  x = (float*)malloc(sizeof(float) * paddedLength);  y = (float*)malloc(sizeof(float) * paddedLength);  tmp = (float*)malloc(sizeof(float) * paddedLength);  printf("SAXPY using %d by %d textures.\n", dim1, dim2);  /* fill in array data, pad with zeros */  fillArray(x+length, paddedLength-length, 0.0f);  fillArray(y+length, paddedLength-length, 0.0f);  for (i=0;i<length;i++) {     x[i] = (float)(i % 15);     tmp[i] = y[i] = (float)((i + 5) % 15);  }  /* arbitrary constant chosen for alpha.  Choosing const that driver will     not be able to optimize */  alpha = 2.5f;  millisStart = GetTimeMillis();  saxpy_2d(dim1, dim2, x, y, alpha, num_iter, &innerTime);  millisStop = GetTimeMillis();  /* Computation of ax + y requires:  // N mults  // N adds  // ----------------  // 2N float operations  */  timing[0] = (int)(millisStop-millisStart);  timing[1] = innerTime;  flops[0] = (2.0f*num_iter*length)/(float)(timing[0]) / 1000.0f;  flops[1] = (2.0f*num_iter*length)/(float)(timing[1]) / 1000.0f;  if (DO_VERIFY) {     // verify results     for (i=0;i<length;i++) {       if (!fequals(y[i], num_iter * alpha*x[i] + tmp[i], EPS)) {          printf("[%d] expected: %f got: %f\n", i, num_iter * alpha*x[i] + tmp[i]);	  printf("Asserting...\n");	  fflush(stdout);	  assert(0);     }   }     printf("Verified SAXPY results to be correct!\n");   }  free(x);  free(y);  free(tmp);}void sdot_2d(int dim1, int dim2, float* x, float* y, float* result, int num_iter, int* innerTime) {  float4 flush[1];  float4 flushStrm<1,1>;  float sums[4];  float4 sumsStrm<1,1>;  float4 xStrm<dim1, dim2>;  float4 yStrm<dim1, dim2>;  float4 tmpStrm<dim1, dim2>;  int i;  int millisStart, millisStop;  streamRead(xStrm, x);  streamRead(yStrm, y);  forceGPUFlush_float4(xStrm, flushStrm);  forceGPUFlush_float4(yStrm, flushStrm);  streamWrite(flushStrm, flush);  millisStart = GetTimeMillis();  for (i=1;i<=num_iter;i++) {    multKernel(xStrm, yStrm, tmpStrm);    sumReduceKernel(tmpStrm, sumsStrm);  }  forceGPUFlush_float4(sumsStrm, flushStrm);  streamWrite(flushStrm, flush);  millisStop = GetTimeMillis();  streamWrite(sumsStrm, &sums);  *result = sums[0] + sums[1] + sums[2] + sums[3];  *innerTime = (int)(millisStop - millisStart);}void do_sdot(int length, int num_iter, int* timing, float* flops) {  float *x, *y;  float result;  float val;  int i;  int paddedLength, dim1, dim2;  int millisStart, millisStop;  int innerTime;  /* compute dimensions to pack vector into a 2D texture */  assert(length % 4 == 0);  assert(length/4 <= MAX_DIM * MAX_DIM/2);  convert1d_to_2d(length/4, &dim1, &dim2);  paddedLength = 4 * dim1 * dim2;  x = (float*)malloc(sizeof(float) * paddedLength);  y = (float*)malloc(sizeof(float) * paddedLength);  /* fill in arrays */  fillArray(x+length, paddedLength-length, 0.0f);  fillArray(y+length, paddedLength-length, 0.0f);  for(i=0;i<length;i++) {     x[i] = y[i] = 1.0f + (float)(i % 2);     //x[i] = y[i] = 1.0f;  }  millisStart = GetTimeMillis();  sdot_2d( dim1, dim2, x, y, &result, num_iter, &innerTime);  millisStop = GetTimeMillis();  /* Computation of xTy requires:  // N mults  // N-1 adds  // ----------------  // 2N-1 float operations  */  timing[0] = (int)(millisStop-millisStart);  timing[1] = innerTime;  flops[0] = (2.0f*num_iter*length - 1.0f)/(float)(timing[0]) / 1000.0f;  flops[1] = (2.0f*num_iter*length - 1.0f)/(float)(timing[1]) / 1000.0f;  if (DO_VERIFY) {    // verify results    val = 0.0f;    for (i=0;i<length;i++)      val += x[i] * y[i];    if (!fequals(val, result, EPS)) {       printf("expected: %f got: %f\n", val, result);       printf("exiting...\n");       fflush(stdout);       assert(0);    }      //printf("%f  %f\n", result, val);    printf("Verified SDOT results to be correct.\n");  }  free(x);  free(y);}void sgemv_inner(int dim, int wideDim, float fDim, float fWideDim,                 float* x, float* y, float* A,                 float alpha, float beta, int num_iter, int* innerTime) {  float* tmp = NULL;  float4 flush[1];  float4 flushStrm<1,1>;  int i, j = 0;  int millisStart, millisStop;  float4 xStrm<1, dim>;  float4 yStrm<dim, 1>;  float4 zStrm<dim, 1>;  float4 AStrm<dim, wideDim>;  float4 tmpStrm<dim, dim>;  float4 resultStrm<dim, 1>;  iter float2 it1<dim, dim> = iter( float2(0.0f, 0.0f), float2(fWideDim, fDim) );  iter float2 it2<dim, dim> = iter( float2(1.0f, 0.0f), float2(fWideDim+1.0f, fDim));  iter float2 it3<dim, dim> = iter( float2(2.0f, 0.0f), float2(fWideDim+2.0f, fDim));  iter float2 it4<dim, dim> = iter( float2(3.0f, 0.0f), float2(fWideDim+3.0f, fDim));  streamRead(xStrm, x);  streamRead(yStrm, y);  streamRead(AStrm, A);  forceGPUFlush_float4(xStrm, flushStrm);  forceGPUFlush_float4(yStrm, flushStrm);  forceGPUFlush_float4(AStrm, flushStrm);  streamWrite(flushStrm, flush);  millisStart = GetTimeMillis();  // Computation here is sort of meaningless. Each iteration computes z = aAx + y  // z is never used again...  for (i=1;i<=num_iter;i++) {    denseMatVecKernel(it1, it2, it3, it4, AStrm, xStrm, tmpStrm);
12 下一页
💿 文件大小 951 K
👤 上传用户 zjf646
📂 所属分类数学计算
🏷️ 相关标签

#BrookGPU #GPU #0.4 #计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -