/*
 * runkernel.br
 *
 * Simple tests to time how long it takes to invoke various kernels
 * (i.e. ratio of execution time to stream length).
 */

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include "main.h"
#include "runkernel.h"
#include "statrecord.h"

#if 1
#define CHECK_MISMATCH 1
#else
#define CHECK_MISMATCH 0
#endif

#define MAX_TRIES 100
#define SMOOTHING 100
#define BIG_LENGTH 512

/*
 * Streams can't be passed to functions, so use a macro.
 *
 * We run our kernel and, based on the knowledge that input[1] is 1.0
 * and that the calculation produces n*input, there end up being n - 1
 * instructions (cgc, fxc, and cl are all smart enough to merge the
 * assignment with the first math op).
 */
#define UPDATE_CUR_OPS(numEntries, data, k, s, o)                       \
    do {                                                                \
        int newOps;                                                     \
        float4 *scratch;                                                \
                                                                        \
        curNEntries = numEntries;                                       \
        scratch = (float4 *) malloc(numEntries * sizeof *scratch);      \
        streamRead(s, data);                                            \
        k(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);                        \
        streamWrite(o, scratch);                                        \
        newOps = numEntries > 1 ? (int) (scratch[1].x - 1) : 38;        \
        if (newOps > 10) curOps = newOps;                               \
        else printf("(* Length %d computed bogus Ops %d *)\n",          \
                    numEntries, newOps);                                \
        free(scratch);                                                  \
    } while (0)

typedef void (*RunKernelWrapperFn)(char *logName, int length, int nRuns);

static const int lengths[] = {
    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 40,
    41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
    59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
    76, 77, 78, 79, 80, 90, 96, 100, 120, 140, 160, 180, 200, 220, 240,
    256, 300, 350, 400, 450, 512, 550, 600, 650, 700, 750, 800, 850,
    900, 950, 1024, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800,
    1900, 2048
};

static const int iterations[] = {
    1, 2, 4, 8, 16, /*20, 40, 50, 75, 150,*/ 1000, /* 200, 300, 500 */
};

static const int numLengths = sizeof lengths / sizeof lengths[0];
static const int numIterations = sizeof iterations / sizeof iterations[0];

/*
 * Latched values (each time we see a request to run a kernel on a
 * stream whose length differs from curNEntries, we calculate R: the
 * length of time it takes to do a streamRead() on a stream of the new
 * length).
 */
static int curNEntries;
static int curOps = 0;
static float cur_R, cur_W;
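/*
 * Illustrative sketch, not part of the original file: how UPDATE_CUR_OPS
 * arrives at curOps.  RunKernelBuildData() (below) fills entry i of the
 * test pattern with the value i, so input[1] is 1.0; with a constant of
 * (1,1,1,1) the HardWork kernel produces n*input using n - 1 fused
 * statements, which means the value read back at index 1 is n and the op
 * count is that value minus one.  SketchDeduceOpCount is a hypothetical
 * helper, guarded out of the real build.
 */
#if 0
static int
SketchDeduceOpCount(const float4 *scratch)
{
    /* scratch[1].x == n whenever input[1] == 1.0 and output == n * input */
    return (int) (scratch[1].x - 1.0f);
}
#endif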
/*
 * RunKernelMemcpyKernel --
 *
 *      Just copy.  This kernel is separate from the test kernels just so
 *      nothing related to it gets cached.  It exists only for priming the
 *      BRT so we can ignore cold start timing.
 */
kernel void
RunKernelMemcpyKernel(float s<>, out float o<>)
{
    o = s;
}

/*
 * RunKernelHardWork --
 *
 *      Try for a kernel that will actually exercise the GPU.
 */
kernel void
RunKernelHardWork(float4 c, float4 s<>, out float4 o<>)
{
    o = c*s + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    /*
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s; o = c*o + s; o = c*o + s;
    o = c*o + s; o = c*o + s;
    */
}

/*
 * RunKernelHardWorkCPU --
 *
 *      Simple C code to mimic the HardWork kernel.  Makes a plausible
 *      baseline for a naive native implementation.
 */
#define I(o, _c, s) o = o + s
#define DO_MADs(o, _c, s)                                       \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); /*I(o, _c, s); I(o, _c, s); I(o, _c, s);       \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s); I(o, _c, s);         \
    I(o, _c, s); I(o, _c, s); I(o, _c, s);                      \
    */

/* static (Leave extern so it's easy to find with dumpbin) */
void
RunKernelHardWorkCPU(int streamLength, float4 c, float4 *input, float4 *output)
{
    int i;
    float ox, oy, oz, ow;

    for (i = 0; i < streamLength * streamLength; i++) {
        ox = input[i].x;
        DO_MADs(ox, c.x, input[i].x);
        output[i].x = ox;
        oy = input[i].y;
        DO_MADs(oy, c.y, input[i].y);
        output[i].y = oy;
        oz = input[i].z;
        DO_MADs(oz, c.z, input[i].z);
        output[i].z = oz;
        ow = input[i].w;
        DO_MADs(ow, c.w, input[i].w);
        output[i].w = ow;
    }
}
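/*
 * Illustrative usage sketch, not part of the original file: with the test
 * pattern used elsewhere in this file (entry i holds the value i in every
 * component), the CPU baseline above turns each entry into a fixed
 * multiple of its input -- one load plus the accumulate statements in
 * DO_MADs.  Because in[1] holds 1.0, that multiple can be read straight
 * out of out[1].x, mirroring the trick UPDATE_CUR_OPS plays on the GPU
 * side.  SketchCheckCPUBaseline is a hypothetical harness, guarded out of
 * the real build.
 */
#if 0
static void
SketchCheckCPUBaseline(void)
{
    enum { N = 4 };                     /* tiny 4x4 "stream" for the sketch */
    float4 in[N * N], out[N * N];
    float multiple;
    int i;

    for (i = 0; i < N * N; i++) {
        in[i].x = in[i].y = in[i].z = in[i].w = (float) i;
    }
    RunKernelHardWorkCPU(N, float4(1.0f, 1.0f, 1.0f, 1.0f), in, out);

    multiple = out[1].x;                /* 1 + number of adds in DO_MADs */
    for (i = 0; i < N * N; i++) {
        assert(out[i].x == multiple * (float) i);
        assert(out[i].y == out[i].x && out[i].z == out[i].x &&
               out[i].w == out[i].x);
    }
}
#endif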
/*
 * RunKernelKickBRT --
 *
 *      Running the first kernel is very slow because a lot of one-time
 *      initialization happens in BRT itself.  We use the same kernel as
 *      the tests to ignore the one-time overhead of downloading the
 *      kernel to the GPU.
 */
static void
RunKernelKickBRT(void)
{
    float4 d[1] = { float4(12.34f, 12.34f, 12.34f, 12.34f) };
    float4 s<1>, o<1>;

    streamRead(s, d);
    RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    streamWrite(o, d);
}

/*
 * RunKernelBuildSummary --
 *
 *      Generates a bunch of summary timing numbers.
 */
static void
RunKernelBuildSummary(int length, int verbose, int fast)
{
    struct StatRecord kW, W, RkW, R100kW;
    float4 s<length, length>, o<length, length>, tiny<1, 1>;
    float4 *outData;
    float timeR, timeK;
    int j, i, n;

    outData = (float4 *) malloc(length * length * sizeof *outData);

    StatRecord_Clear(&kW);
    for (i = 0; i < SMOOTHING; i++) {
        t1 = GetTimeTSC();
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        streamWrite(o, outData);
        StatRecord_Record(&kW, GetTimeTSC() - t1);
    }
    if (verbose) StatRecord_Print(&kW, "kW");

    StatRecord_Clear(&RkW);
    for (i = 0; i < SMOOTHING; i++) {
        t1 = GetTimeTSC();
        streamRead(s, outData);
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        streamWrite(o, outData);
        StatRecord_Record(&RkW, GetTimeTSC() - t1);
    }
    if (verbose) StatRecord_Print(&RkW, "RkW");

    StatRecord_Clear(&W);
    for (i = 0; i < SMOOTHING; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, tiny);
        streamWrite(tiny, outData);
        t1 = GetTimeTSC();
        streamWrite(o, outData);
        StatRecord_Record(&W, GetTimeTSC() - t1);
    }
    if (verbose) StatRecord_Print(&W, "W");

    StatRecord_Clear(&R100kW);
    n = fast ? 1 : 10;
    for (i = 0; i < n; i++) {
        t1 = GetTimeTSC();
        streamRead(s, outData);
        for (j = 0; j < SMOOTHING; j++) {
            RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        }
        streamWrite(o, outData);
        StatRecord_Record(&R100kW, GetTimeTSC() - t1);
    }
    if (verbose) StatRecord_Print(&R100kW, "R100kW");

    free(outData);

    timeR = (RkW.total - kW.total) / SMOOTHING;
    timeK = R100kW.total / R100kW.n / SMOOTHING;
    if (verbose) {
        printf("(* Summary: R: %5.2f, k: %5.2f, W %5.2f (timed) / %5.2f (computed) *)\n",
               timeR, timeK, W.total / W.n,
               RkW.total / RkW.n - timeR - timeK);
    }
    cur_R = timeR;
    cur_W = W.total / W.n;
}
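/*
 * Worked example, not part of the original file (numbers are made up):
 * over SMOOTHING = 100 samples, suppose kW.total (kernel + write) comes
 * to 90,000 us and RkW.total (read + kernel + write) to 130,000 us.
 * Then timeR = (130,000 - 90,000) / 100 = 400 us isolates the cost of
 * the streamRead().  R100kW times one read, SMOOTHING kernel launches,
 * and one write per sample, so dividing its per-sample mean by SMOOTHING
 * amortizes the read and write away and leaves roughly the cost of a
 * single kernel launch, timeK.
 */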
/*
 * RunKernelBuildData --
 *
 *      Helper function that allocates memory and fills it with the stream
 *      test pattern.
 */
static void
RunKernelBuildData(float4 **data, int numEntries)
{
    int i;

    *data = (float4 *) malloc(numEntries * sizeof **data);
    assert(*data);

    for (i = 0; i < numEntries; i++) {
        (*data)[i].x = (*data)[i].y = (*data)[i].z = (*data)[i].w = (float) i;
    }
}

/*
 * RunKernelProcessTiming --
 *
 *      Interpret the numbers, calculate FLOPS, and check to make certain
 *      the output is the expected transformation on the input.
 *
 *      NOTE: stop and start are tunnelled in as globals for the same
 *      ease-of-implementation reason that they're declared in main.h
 *      instead of here.
 */
static void
RunKernelProcessTiming(char *name, float4 *data, int length,
                       int numEntries, int nRuns)
{
    float elapsed, numFLOPs;
    int i;

    elapsed = (float) CyclesToUsecs(stop - start);
    /* Promote early so large length/nRuns products don't overflow int. */
    numFLOPs = (float) numEntries * nRuns * 4.0f * curOps;

    /*
     * 'MFLOPS' calculation:
     * - numEntries * nRuns entries processed total
     * - calculations took elapsed usecs = elapsed/10^6 secs
     * - curOps instructions
     * - 4 floats per entry
     */
    printf64(CyclesToUsecs(stop - start));
    printf(" %8.2f %8.0f %4d\t(* %s %d R%d %d %5.2f %5.2f *)\n",
           numFLOPs / elapsed, numFLOPs / 1000000, nRuns,
           name, length, nRuns, curOps, cur_R, cur_W);

    for (i = 0; CHECK_MISMATCH && curOps > 0.0f && i < numEntries; i++) {
        float expected;
        float cur = data[i].x;

        /*
         * Grr.  Tolerate precision errors so long as the result isn't
         * wrong by more than 10%.  Since you start to see skipping around
         * 2^24 on nv30 and around 2^17 on ATI, once we're dealing with
         * reasonable scales and 1024x1024 streams, there are precision
         * artifacts.  Sad. --Jeremy.
         */
        expected = (float) curOps * i;
        if ((cur != expected &&
             (cur - expected > 0.1f * expected ||
              expected - cur > 0.1f * cur)) ||
            data[i].y != cur || data[i].z != cur || data[i].w != cur) {
            printf("(* Mismatch %d,%d/%d: *)\n"
                   "(* Expected %.2f, Got: %.2f %.2f %.2f %.2f *)\n",
                   i / length, i % length, length,
                   expected, data[i].x, data[i].y, data[i].z, data[i].w);
            return;
        }
    }
}
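/*
 * Worked example of the MFLOPS figure above, not part of the original
 * file (numbers are made up): a 512x512 stream has numEntries = 262,144;
 * with nRuns = 16 and curOps = 45 that is 262,144 * 16 * 4 * 45 ~= 7.5e8
 * floating point operations.  If the timed region took 20,000 us, then
 * numFLOPs / elapsed ~= 37,700 -- FLOPs per microsecond, which is the
 * same thing as MFLOPS.
 */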