📄 runkernel.br
}

/*
 * RunKernel1D --
 *
 *      Runs a simple kernel on a 1D stream and times it.
 */
static void
RunKernel1D(char *logName, int streamLength, int nRuns)
{
    float4 s<streamLength>, o<streamLength>;
    float4 *data;
    int i;

    RunKernelBuildData(&data, streamLength);

    if (curNEntries != streamLength) {
        UPDATE_CUR_OPS(streamLength, data, RunKernelHardWork, s, o);
    }

    start = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < nRuns; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    stop = GetTimeTSC();

    RunKernelProcessTiming(logName, data, streamLength, streamLength, nRuns);
    free(data);
}

/*
 * RunKernel2D --
 *
 *      Runs a simple kernel on a 2D stream and times it.
 */
static void
RunKernel2D(char *logName, int streamLength, int nRuns)
{
    float4 s<streamLength, streamLength>, o<streamLength, streamLength>;
    float4 *data, *data2;
    int i;

    RunKernelBuildData(&data, streamLength * streamLength);
    RunKernelBuildData(&data2, streamLength * streamLength);

    if (curNEntries != streamLength * streamLength) {
        RunKernelBuildSummary(streamLength, 0, 0);
        UPDATE_CUR_OPS(streamLength * streamLength, data,
                       RunKernelHardWork, s, o);
    }

    start = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < nRuns; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    stop = GetTimeTSC();

    RunKernelProcessTiming(logName, data, streamLength,
                           streamLength * streamLength, nRuns);
    free(data);
    free(data2);
}

/*
 * RunKernelDoRun --
 *
 *      More code factoring. A convenient function for regularizing the
 *      output of a run with a given length, to avoid copy-pasting.
 */
static void
RunKernelDoRun(RunKernelWrapperFn f, char *logName, int length)
{
    int i;

    printf("(* %s: length %d *)\n", logName, length);
    printf("(*   usecs   MFLOPS   # MFLOPs   runs *)\n");
    for (i = 0; i < numIterations; i++) {
        f(logName, length, iterations[i]);
    }
    printf("\n");
}

/*
 * RunKernel1D_Time --
 *
 *      Entry point for the 1D kernel overhead tests.
 */
void
RunKernel1D_Time(int maxLength)
{
    int i;

    RunKernelKickBRT();
    for (i = 0; i < numLengths && lengths[i] < maxLength; i++) {
        RunKernelDoRun(RunKernel1D, "RunK1D", lengths[i]);
    }
    RunKernelDoRun(RunKernel1D, "RunK1D", maxLength);
}

/*
 * RunKernel2D_Time --
 *
 *      Entry point for the 2D kernel overhead tests.
 */
void
RunKernel2D_Time(int maxLength)
{
    int i;

    RunKernelKickBRT();
    for (i = 0; i < numLengths && lengths[i] < maxLength; i++) {
        RunKernelDoRun(RunKernel2D, "RunK2D", lengths[i]);
    }
    RunKernelDoRun(RunKernel2D, "RunK2D", maxLength);
}
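/*
 * Illustrative sketch, not part of the original file: the measurement
 * idiom every Run* function above follows -- one untimed warm-up call to
 * absorb one-time costs (kernel compilation, cache warming), then a timed
 * loop whose total is divided by the repetition count. GetTimeTSC() and
 * CyclesToUsecs() are the harness timer used throughout this file; the
 * uint64 type and the TimeAveragedFn typedef are assumptions.
 */
#if 0   /* sketch only */
typedef void (*TimeAveragedFn)(void);

static float
TimeAveraged(TimeAveragedFn fn, int smoothing)
{
    uint64 t;
    int i;

    fn();                       /* warm up: keep one-time costs out of the average */
    t = GetTimeTSC();
    for (i = 0; i < smoothing; i++) {
        fn();
    }
    return CyclesToUsecs(GetTimeTSC() - t) / smoothing;
}
#endif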
/*
 * RunKernelFindIdealGPUvsCPUSlope --
 *
 *      At peak, we expect the GPU / CPU crossover point to be linear as a
 *      function of the streamLength. Specifically, we expect the CPU time
 *      to be:
 *
 *          T_cpu = len * (# iterations of k) * k_cpu
 *
 *      where k_cpu is the per-element time to execute the CPU
 *      implementation of the kernel. The GPU cost should be
 *
 *          T_gpu = (R + W) * len + len * (# iterations of k) * k_gpu
 *
 *      where R and W are the per-element read and write transfer costs
 *      (time per streamElement, i.e. the reciprocal of bandwidth).
 *
 *      Solving, we get T_gpu < T_cpu when
 *
 *          len * (# iterations) * (ops in k) >
 *              len * (R + W) * (ops in k) / (k_cpu - k_gpu)
 *
 *      which is just
 *
 *          # ops > [(R + W) * (ops in k) / (k_cpu - k_gpu)] * len
 *
 *      So we need to find k_gpu, k_cpu, and (R + W).
 */
static float
RunKernelFindIdealGPUvsCPUSlope(void)
{
    float4 s<BIG_LENGTH, BIG_LENGTH>, o<BIG_LENGTH, BIG_LENGTH>;
    float4 tiny<1>;
    float4 *data, *outData;
    float slope = 0.0f, cMFLOPS, gMFLOPS, readRate, writeRate;
    int cpuOps, i;

    RunKernelBuildData(&data, BIG_LENGTH * BIG_LENGTH);
    outData = (float4 *) malloc(BIG_LENGTH * BIG_LENGTH * sizeof *outData);
    RunKernelBuildSummary(BIG_LENGTH, 1, 1);

    RunKernelHardWorkCPU(BIG_LENGTH, float4(1.0f, 1.0f, 1.0f, 1.0f),
                         data, outData);
    t1 = GetTimeTSC();
    for (i = 0; i < SMOOTHING; i++) {
        RunKernelHardWorkCPU(BIG_LENGTH, float4(1.0f, 1.0f, 1.0f, 1.0f),
                             data, outData);
    }
    t1 = CyclesToUsecs(GetTimeTSC() - t1) / SMOOTHING;
    cpuOps = (int) outData[1].x - 1;
    cMFLOPS = (BIG_LENGTH * BIG_LENGTH * 4 * ((float) cpuOps) / t1);

    t2 = GetTimeTSC();
    streamRead(s, data);
    for (i = 0; i < SMOOTHING; i++) {
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
    }
    streamWrite(o, data);
    t2 = CyclesToUsecs(GetTimeTSC() - t2) / SMOOTHING;
    gMFLOPS = (BIG_LENGTH * BIG_LENGTH * 4 * (data[1].x - 1) / t2);

    if (t2 > t1) {
        printf("(* The CPU is _faster_ than the GPU by ");
        printf64(t2 - t1);
        printf(" usecs! *)\n");
    }
    if (((int) data[1].x - 1) != cpuOps) {
        printf("(* CPU is doing %4.1f ops and GPU is doing %4.1f! *)\n",
               (float) cpuOps, (float) (data[1].x - 1));
    }

    readRate = sizeof(float) * 4 * BIG_LENGTH * BIG_LENGTH / cur_R;
    writeRate = sizeof(float) * 4 * BIG_LENGTH * BIG_LENGTH / cur_W;
    printf("(* Read: %5.2f MB/s, Write: %5.2f MB/s, "
           "CPU MFLOPS: %4.0f GPU MFLOPS: %4.0f *)\n",
           readRate, writeRate, cMFLOPS, gMFLOPS);

    slope = (cur_R + cur_W) * cpuOps / (float) (t1 - t2);
    printf("(* Ideal slope: %5.2f Ideal iterations: %4.2f (%d ops)\tRunKVSIdeal *)\n",
           slope, slope / (float) cpuOps, cpuOps);

    free(data);
    free(outData);
    return slope;
}
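/*
 * Worked restatement of the model above, added for illustration. Dividing
 * T_gpu < T_cpu through by len * (k_cpu - k_gpu) gives the crossover
 * iteration count directly: i > (R + W) / (k_cpu - k_gpu), independent of
 * len -- which is exactly why RunKernel_GPUvsCPU below expects the
 * measured iteration counts to flatten out as length grows. The helper
 * and its per-element cost parameters are hypothetical, not part of the
 * harness.
 */
#if 0   /* sketch only */
static float
ModelCrossoverIterations(float rPlusW,  /* R + W: transfer cost per element, usecs */
                         float kCpu,    /* CPU kernel cost per element per iteration */
                         float kGpu)    /* GPU kernel cost per element per iteration */
{
    /* From (R + W) * len + len * i * k_gpu < len * i * k_cpu. */
    return rPlusW / (kCpu - kGpu);
}
#endif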
/*
 * RunKernelFindGPUvsCPUOne --
 *
 *      Finds the crossover point i at which
 *
 *          Read(N) + i * (GPU Version(N)) + Write(N) < i * (CPU Version(N))
 *
 *      Essentially, the plot is N floats vs. i*N*(# ops in the kernel),
 *      and this function finds i given N and a kernel length.
 */
static int
RunKernelFindGPUvsCPUOne(int length)
{
    float4 s<length, length>, o<length, length>;
    float4 *data, *outCPU, *outGPU;
    int i;

    if (length <= 1) {
        return -1;      /* We're never going to catch the CPU... */
    }

    RunKernelBuildData(&data, length * length);
    outCPU = (float4 *) malloc(length * length * sizeof *outCPU);
    outGPU = (float4 *) malloc(length * length * sizeof *outGPU);

    for (i = 1; i < MAX_TRIES; i++) {
        int j, k;

        RunKernelHardWorkCPU(length, float4(1.0f, 1.0f, 1.0f, 1.0f),
                             data, outCPU);
        t1 = GetTimeTSC();
        for (k = 0; k < SMOOTHING; k++) {
            for (j = 0; j < i; j++) {
                RunKernelHardWorkCPU(length, float4(1.0f, 1.0f, 1.0f, 1.0f),
                                     data, outCPU);
            }
        }
        t1 = CyclesToUsecs(GetTimeTSC() - t1);

        streamRead(s, data);
        RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
        streamWrite(o, outGPU);
        t2 = GetTimeTSC();
        for (k = 0; k < SMOOTHING; k++) {
            streamRead(s, data);
            for (j = 0; j < i; j++) {
                RunKernelHardWork(float4(1.0f, 1.0f, 1.0f, 1.0f), s, o);
            }
            streamWrite(o, outGPU);
        }
        t2 = CyclesToUsecs(GetTimeTSC() - t2);

        if (t1 > t2) {
            printf("%9d %10d %8d\t\t\t(* RunKVS %d *)\n",
                   4 * length * length, i,
                   (int) (outGPU[1].x * length * length * i * 4), length);
            goto done;
        }
    }

    if (i == MAX_TRIES) {
        printf("(* GPU can't catch the CPU in %d iterations at length %d *)\n",
               MAX_TRIES, length);
        printf("%9d %10d %8d\t\t\t(* RunKVS %d *)\n",
               4 * length * length, -1, -1, length);
        i = -1;
    }

done:
    free(data);
    free(outGPU);
    free(outCPU);
    return i;
}

/*
 * RunKernel_GPUvsCPU --
 *
 *      Entry point for generating the list of GPU/CPU crossover points as
 *      a function of length.
 */
void
RunKernel_GPUvsCPU(int minLength)
{
    int lastIters = 0, count = 0;
    int iters, i;
    float idealSlope;

    RunKernelKickBRT();
    printf("(* RunKernel GPU vs. CPU: min length %d *)\n", minLength);
    for (i = 0; i < 1; i++) {
        idealSlope = RunKernelFindIdealGPUvsCPUSlope();
    }
    printf("(*  length iterations  # of ops *)\n");

    RunKernelFindGPUvsCPUOne(minLength);
    for (i = 0; i < numLengths; i++) {
        if (lengths[i] <= minLength) {
            continue;
        }
        iters = RunKernelFindGPUvsCPUOne(lengths[i]);

        /*
         * The way we time, we end up determining # iterations i as a
         * function of stream length beyond which the GPU is faster than
         * the CPU. The model predicts this to converge to be linear in
         * length, which means i should become constant. Once we're
         * satisfied we've hit that point, there's no reason to calculate
         * any more values (and it's quadratically expensive to keep
         * going). --Jeremy.
         */
        if (iters > 0 && iters == lastIters) {
            //if (count++ > 10) { i += (numLengths - i) / 2; count = 0; }
            if (count++ > 10) {
                i += 5;
                count = 0;
            }
        } else {
            count = 0;
            lastIters = iters;
        }
    }
}
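/*
 * Usage sketch, not part of the original file: how a benchmark driver
 * might invoke the entry points above. The main() below and its specific
 * lengths are hypothetical; numLengths, lengths[], and iterations[] are
 * assumed to be the file-scope tables the functions reference.
 */
#if 0   /* sketch only */
int
main(void)
{
    RunKernel1D_Time(1 << 20);      /* 1D overhead sweep up to 2^20 elements */
    RunKernel2D_Time(1024);         /* 2D overhead sweep up to 1024 x 1024 */
    RunKernel_GPUvsCPU(8);          /* crossover scan for lengths above 8 */
    return 0;
}
#endif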