📄 chunky.cpp
字号:
#include <blitz/timer.h>using namespace blitz;void report(const char* name, Timer& timer, int N, long int iterations);void unfused(int N);void fused(int N);void chunky(int N);float* _bz_restrict a;float* _bz_restrict b;float* _bz_restrict c;float* _bz_restrict d;float* _bz_restrict e;float* _bz_restrict f;/* * Code to support the "fuse" macro */int _chunk;bool _done_chunks;int _chunk_size = 512;#define fuse _chunk = 0; _done_chunks = false; \ for (; !_done_chunks; ++_chunk)int main(int argc, char** argv){ if (argc == 2) _chunk_size = atoi(argv[1]); cout << "Using chunk size " << _chunk_size << endl; const int N = 100000; a = new float[N]; b = new float[N]; c = new float[N]; d = new float[N]; e = new float[N]; f = new float[N]; for (int i=0; i < N; ++i) { a[i] = i; b[i] = i; c[i] = i; d[i] = i; } Timer timer; long int iterations = 100; timer.start(); for (long i=0; i < iterations; ++i) unfused(N); timer.stop(); report("Unfused", timer, N, iterations); timer.start(); for (long i=0; i < iterations; ++i) fused(N); timer.stop(); report("Fused", timer, N, iterations); timer.start(); for (long i=0; i < iterations; ++i) chunky(N); timer.stop(); report("Chunky", timer, N, iterations); return 0;}void report(const char* name, Timer& timer, int N, long int iterations){ float flops = float(N) * iterations * 2; float Mflops = flops / timer.elapsedSeconds() / 1e+6; cout << setw(20) << name << " " << Mflops << " Mflops/s" << endl;}void __sink() { }void unfused(int N){ for (int i=0; i < N; ++i) e[i] = a[i] * b[i] + c[i] * d[i]; __sink(); for (int i=0; i < N; ++i) f[i] = c[i] * b[i] + a[i] * d[i];}void fused(int N){ for (int i=0; i < N; ++i) { e[i] = a[i] * b[i] + c[i] * d[i]; f[i] = c[i] * b[i] + a[i] * d[i]; }}// This "chunky" routine is a simulated implementation of// expression templates with tiling across multiple statements// (the "chunky fusion" approach). This code would be// generated by://// fuse {// E = A*B + C*D;// F = C*B + A*D;// }void chunky(int N){ fuse { { // Code generated by E = A*B + C*D; int lbound = _chunk * _chunk_size; int uboundp1 = lbound + _chunk_size; if (uboundp1 > N) { _done_chunks = true; uboundp1 = N; } for (int i=lbound; i < uboundp1; ++i) e[i] = a[i] * b[i] + c[i] * d[i]; } __sink(); { // Code generated by F = C*B + A*D; int lbound = _chunk * _chunk_size; int uboundp1 = lbound + _chunk_size; if (uboundp1 > N) { _done_chunks = true; uboundp1 = N; } for (int i=lbound; i < uboundp1; ++i) f[i] = c[i] * b[i] + a[i] * d[i]; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -