📄 runkernel.br
}

/*
 * RunKernel1D --
 *
 *      Runs a simple kernel on a 1D stream and times it.
 */
static void
RunKernel1D(char *logName, int streamLength, int nRuns)
{
    float4 s<streamLength>, o<streamLength>;
    float4 *data;
    int i;

    RunKernelBuildData(&data, streamLength);

    if (curNEntries != streamLength) {
        UPDATE_CUR_OPS(streamLength, data, RunKernelHardWork, s, o);
    }

    start = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < nRuns; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    stop = GetTimeTSC();

    RunKernelProcessTiming(logName, data, streamLength, streamLength, nRuns);
    free(data);
}

/*
 * RunKernel2D --
 *
 *      Runs a simple kernel on a 2D stream and times it.
 */
static void
RunKernel2D(char *logName, int streamLength, int nRuns)
{
    float4 s<streamLength, streamLength>, o<streamLength, streamLength>;
    float4 *data, *data2;
    int i;

    RunKernelBuildData(&data, streamLength * streamLength);
    RunKernelBuildData(&data2, streamLength * streamLength);

    if (curNEntries != streamLength * streamLength) {
        RunKernelBuildSummary(streamLength, 0, 0);
        UPDATE_CUR_OPS(streamLength * streamLength, data,
                       RunKernelHardWork, s, o);
    }

    start = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < nRuns; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    stop = GetTimeTSC();

    RunKernelProcessTiming(logName, data, streamLength,
                           streamLength * streamLength, nRuns);
    free(data);
    free(data2);
}

/*
 * RunKernelDoRun --
 *
 *      More code factoring. A convenient function for regularizing the
 *      output of a run with a given length, to avoid copy-pasting.
 */
static void
RunKernelDoRun(RunKernelWrapperFn f, char *logName, int length)
{
    int i;

    printf("(* %s: length %d *)\n", logName, length);
    printf("(*   usecs   MFLOPS   # MFLOPs   runs *)\n");
    for (i = 0; i < numIterations; i++) {
        f(logName, length, iterations[i]);
    }
    printf("\n");
}

/*
 * RunKernel1D_Time --
 *
 *      Entry point for the 1D kernel overhead tests.
 */
void
RunKernel1D_Time(int maxLength)
{
    int i;

    RunKernelKickBRT();
    for (i = 0; i < numLengths && lengths[i] < maxLength; i++) {
        RunKernelDoRun(RunKernel1D, "RunK1D", lengths[i]);
    }
    RunKernelDoRun(RunKernel1D, "RunK1D", maxLength);
}

/*
 * RunKernel2D_Time --
 *
 *      Entry point for the 2D kernel overhead tests.
 */
void
RunKernel2D_Time(int maxLength)
{
    int i;

    RunKernelKickBRT();
    for (i = 0; i < numLengths && lengths[i] < maxLength; i++) {
        RunKernelDoRun(RunKernel2D, "RunK2D", lengths[i]);
    }
    RunKernelDoRun(RunKernel2D, "RunK2D", maxLength);
}
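/*
 * Illustrative sketch, not part of the original file: the measurement
 * idiom every Run* function above follows -- one untimed warm-up call to
 * absorb one-time costs (kernel compilation, cache warming), then a timed
 * loop whose total is divided by the repetition count. GetTimeTSC() and
 * CyclesToUsecs() are the harness timer used throughout this file; the
 * uint64 type and the TimeAveragedFn typedef are assumptions.
 */
#if 0   /* sketch only */
typedef void (*TimeAveragedFn)(void);

static float
TimeAveraged(TimeAveragedFn fn, int smoothing)
{
    uint64 t;
    int i;

    fn();                       /* warm up: keep one-time costs out of the average */
    t = GetTimeTSC();
    for (i = 0; i < smoothing; i++) {
        fn();
    }
    return CyclesToUsecs(GetTimeTSC() - t) / smoothing;
}
#endif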
/*
 * RunKernelFindIdealGPUvsCPUSlope --
 *
 *      At peak, we expect the GPU / CPU crossover point to be linear as a
 *      function of the streamLength. Specifically, we expect the CPU time
 *      to be:
 *
 *          T_cpu = len * (# iterations of k) * k_cpu
 *
 *      where k_cpu is the per-element time to execute the CPU
 *      implementation of the kernel. The GPU cost should be
 *
 *          T_gpu = (R + W) * len + len * (# iterations of k) * k_gpu
 *
 *      where R and W are the per-element read and write transfer costs
 *      (time per streamElement, i.e. the reciprocal of bandwidth).
 *
 *      Solving, we get T_gpu < T_cpu when
 *
 *          len * (# iterations) * (ops in k) >
 *              len * (R + W) * (ops in k) / (k_cpu - k_gpu)
 *
 *      which is just
 *
 *          # ops > [(R + W) * (ops in k) / (k_cpu - k_gpu)] * len
 *
 *      So we need to find k_gpu, k_cpu, and (R + W).
 */
static float
RunKernelFindIdealGPUvsCPUSlope(void)
{
    float4 s<BIG_LENGTH, BIG_LENGTH>, o<BIG_LENGTH, BIG_LENGTH>;
    float4 tiny<1>;
    float4 *data, *outData;
    float slope = 0.0f, cMFLOPS, gMFLOPS, readRate, writeRate;
    int cpuOps, i;

    RunKernelBuildData(&data, BIG_LENGTH * BIG_LENGTH);
    outData = (float4 *) malloc(BIG_LENGTH * BIG_LENGTH * sizeof *outData);
    RunKernelBuildSummary(BIG_LENGTH, 1, 1);

    RunKernelHardWorkCPU(BIG_LENGTH, float4(1.0f, 1.0f, 1.0f, 1.0f),
                         data, outData);
    t1 = GetTimeTSC();
    for (i = 0; i < SMOOTHING; i++) {
        RunKernelHardWorkCPU(BIG_LENGTH, float4(1.0f, 1.0f, 1.0f, 1.0f),
                             data, outData);
    }
    t1 = CyclesToUsecs(GetTimeTSC() - t1) / SMOOTHING;
    cpuOps = (int) outData[1].x - 1;
    cMFLOPS = (BIG_LENGTH * BIG_LENGTH * 4 * ((float) cpuOps) / t1);

    t2 = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < SMOOTHING; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    t2 = CyclesToUsecs(GetTimeTSC() - t2) / SMOOTHING;
    gMFLOPS = (BIG_LENGTH * BIG_LENGTH * 4 * (data[1].x - 1) / t2);

    if (t2 > t1) {
        printf("(* The CPU is _faster_ than the GPU by ");
        printf64(t2 - t1);
        printf(" usecs! *)\n");
    }
    if (((int) data[1].x - 1) != cpuOps) {
        printf("(* CPU is doing %4.1f ops and GPU is doing %4.1f! *)\n",
               (float) cpuOps, (float) (data[1].x - 1));
    }

    readRate = sizeof(float) * 4 * BIG_LENGTH * BIG_LENGTH / cur_R;
    writeRate = sizeof(float) * 4 * BIG_LENGTH * BIG_LENGTH / cur_W;
    printf("(* Read: %5.2f MB/s, Write: %5.2f MB/s, "
           "CPU MFLOPS: %4.0f GPU MFLOPS: %4.0f *)\n",
           readRate, writeRate, cMFLOPS, gMFLOPS);

    slope = (cur_R + cur_W) * cpuOps / (float) (t1 - t2);
    printf("(* Ideal slope: %5.2f Ideal iterations: %4.2f (%d ops)\tRunKVSIdeal *)\n",
           slope, slope / (float) cpuOps, cpuOps);

    free(data);
    free(outData);
    return slope;
}
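/*
 * Worked restatement of the model above, added for illustration. Dividing
 * T_gpu < T_cpu through by len * (k_cpu - k_gpu) gives the crossover
 * iteration count directly: i > (R + W) / (k_cpu - k_gpu), independent of
 * len -- which is exactly why RunKernel_GPUvsCPU below expects the
 * measured iteration counts to flatten out as length grows. The helper
 * and its per-element cost parameters are hypothetical, not part of the
 * harness.
 */
#if 0   /* sketch only */
static float
ModelCrossoverIterations(float rPlusW,  /* R + W: transfer cost per element, usecs */
                         float kCpu,    /* CPU kernel cost per element per iteration */
                         float kGpu)    /* GPU kernel cost per element per iteration */
{
    /* From (R + W) * len + len * i * k_gpu < len * i * k_cpu. */
    return rPlusW / (kCpu - kGpu);
}
#endif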
/*
 * RunKernelFindGPUvsCPUOne --
 *
 *      Finds the crossover point i at which
 *
 *          Read(N) + i * (GPU Version(N)) + Write(N) < i * (CPU Version(N))
 *
 *      Essentially, the plot is N floats vs. i*N*(# ops in the kernel),
 *      and this function finds i given N and a kernel length.
 */
static int
RunKernelFindGPUvsCPUOne(int length)
{
    float4 s<length, length>, o<length, length>;
    float4 *data, *outCPU, *outGPU;
    int i;

    if (length <= 1) {
        return -1;      /* We're never going to catch the CPU... */
    }

    RunKernelBuildData(&data, length * length);
    outCPU = (float4 *) malloc(length * length * sizeof *outCPU);
    outGPU = (float4 *) malloc(length * length * sizeof *outGPU);

    for (i = 1; i < MAX_TRIES; i++) {
        int j, k;

        RunKernelHardWorkCPU(length, float4(1.0f, 1.0f, 1.0f, 1.0f),
                             data, outCPU);
        t1 = GetTimeTSC();
        for (k = 0; k < SMOOTHING; k++) {
            for (j = 0; j < i; j++) {
                RunKernelHardWorkCPU(length, float4(1.0f, 1.0f, 1.0f, 1.0f),
                                     data, outCPU);
            }
        }
        t1 = CyclesToUsecs(GetTimeTSC() - t1);

        streamRead(s, data);
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        streamWrite(o, outGPU);
        t2 = GetTimeTSC();
        for (k = 0; k < SMOOTHING; k++) {
            streamRead(s, data);
            for (j = 0; j < i; j++) {
                RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
            }
            streamWrite(o, outGPU);
        }
        t2 = CyclesToUsecs(GetTimeTSC() - t2);

        if (t1 > t2) {
            printf("%9d %10d %8d\t\t\t(* RunKVS %d *)\n",
                   4 * length * length, i,
                   (int) (outGPU[1].x * length * length * i * 4), length);
            goto done;
        }
    }

    if (i == MAX_TRIES) {
        printf("(* GPU can't catch the CPU in %d iterations at length %d *)\n",
               MAX_TRIES, length);
        printf("%9d %10d %8d\t\t\t(* RunKVS %d *)\n",
               4 * length * length, -1, -1, length);
        i = -1;
    }

done:
    free(data);
    free(outGPU);
    free(outCPU);
    return i;
}

/*
 * RunKernel_GPUvsCPU --
 *
 *      Entry point for generating the list of GPU/CPU crossover points as
 *      a function of length.
 */
void
RunKernel_GPUvsCPU(int minLength)
{
    int lastIters = 0, count = 0;
    int iters, i;
    float idealSlope;

    RunKernelKickBRT();
    printf("(* RunKernel GPU vs. CPU: min length %d *)\n", minLength);
    for (i = 0; i < 1; i++) {
        idealSlope = RunKernelFindIdealGPUvsCPUSlope();
    }
    printf("(*  length iterations  # of ops *)\n");

    RunKernelFindGPUvsCPUOne(minLength);
    for (i = 0; i < numLengths; i++) {
        if (lengths[i] <= minLength) {
            continue;
        }
        iters = RunKernelFindGPUvsCPUOne(lengths[i]);

        /*
         * The way we time, we end up determining # iterations i as a
         * function of stream length beyond which the GPU is faster than
         * the CPU. The model predicts this to converge to be linear in
         * length, which means i should become constant. Once we're
         * satisfied we've hit that point, there's no reason to calculate
         * any more values (and it's quadratically expensive to keep
         * going). --Jeremy.
         */
        if (iters > 0 && iters == lastIters) {
            //if (count++ > 10) { i += (numLengths - i) / 2; count = 0; }
            if (count++ > 10) {
                i += 5;
                count = 0;
            }
        } else {
            count = 0;
            lastIters = iters;
        }
    }
}
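/*
 * Usage sketch, not part of the original file: how a benchmark driver
 * might invoke the entry points above. The main() below and its specific
 * lengths are hypothetical; numLengths, lengths[], and iterations[] are
 * assumed to be the file-scope tables the functions reference.
 */
#if 0   /* sketch only */
int
main(void)
{
    RunKernel1D_Time(1 << 20);      /* 1D overhead sweep up to 2^20 elements */
    RunKernel2D_Time(1024);         /* 2D overhead sweep up to 1024 x 1024 */
    RunKernel_GPUvsCPU(8);          /* crossover scan for lengths above 8 */
    return 0;
}
#endif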